diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..88ae4659c8f2ebb10c3bdb1146d2355665484259
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,46 @@
+---
+name: Bug report
+about: Something isn't working as expected
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+
+## Prerequisites
+Please make sure to check off these prerequisites before submitting a bug report.
+- [ ] Test that the bug appears on the current version of the dev branch. Make sure to include the commit hash of the commit you checked out.
+- [ ] Check the currently open issues to make sure the problem hasn't already been reported.
+- [ ] If there are steps to reproduce the problem, make sure to write them down below.
+- [ ] If relevant, please include the ONNX files that were created directly before and/or after the bug.
+
+## Quick summary
+Please give a brief description of the bug.
+
+## Details
+Please fill in the following sections to describe the bug as accurately as possible.
+
+### Steps to Reproduce
+Describe what needs to be done to reproduce the bug. Add code examples where useful,
+and make sure to include the resulting ONNX files and the commit hash you are working on.
+
+1. Clone the FINN repository
+2. Check out the dev branch, with commit hash: [...]
+3. Start the Docker container with the command: [...]
+4. Run transformation [...] on ONNX file [...] or run the dataflow builder with the following settings: [...]
+5. [Further steps ...]
+
+### Expected behavior
+Please add a brief description of what you expected to happen.
+
+### Actual behavior
+Describe what actually happens instead.
+
+## Optional
+
+### Possible fix
+If you already know where the issue stems from, or if you have a hint, please let us know.
+
+### Additional context
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..57e3d5495215bf04e7863729a3b8de1d331a1b2f
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,8 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Getting started with FINN
+    url: https://finn.readthedocs.io/en/latest/getting_started.html
+    about: Documentation about how to get up and running with FINN.
+  - name: Ask for help and get in touch with the community
+    url: https://gitter.im/xilinx-finn/community
+    about: Check out our Gitter channel if you have a question about FINN or a general problem that is likely not a bug.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000000000000000000000000000000000..dfd71f43087473c7972b972c5b547b10b51fc496
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,27 @@
+---
+name: Feature request
+about: Suggest an idea for FINN
+title: ''
+labels: enhancement
+assignees: ''
+
+---
+
+## Prerequisites
+Please make sure to check that the idea is not already being worked on
+by looking at the currently open issues and the [project Kanbans](https://github.com/Xilinx/finn/projects).
+
+Even if an idea is already being worked on, you can still create a feature request
+if you would like to open a discussion about the feature or want to contribute to it.
+
+## Details
+Please fill in the following sections to describe the feature as accurately as possible.
+
+### New behavior
+Please give a brief description of what you would like to happen in FINN in the future.
+
+### Motivation
+Please tell us why this feature is important to the FINN community.
+
+### Parts of FINN affected
+Please describe which parts of FINN would be affected by this feature.
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4374111f22a12e586c5c5233a7eee096b848b86e
--- /dev/null
+++ b/.github/workflows/docker-image.yml
@@ -0,0 +1,33 @@
+name: DockerImage
+
+on:
+  push:
+    branches:
+      - 'dev'
+
+jobs:
+  docker:
+    runs-on: ubuntu-18.04
+    steps:
+      -
+        name: checkout
+        uses: actions/checkout@v2
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v2
+        with:
+          file: docker/Dockerfile.finn
+          context: .
+          push: true
+          tags: maltanar/finn:dev_latest
+      -
+        name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2fbb9265beb49644f08a2c6e916ab9c23d4bd339
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,22 @@
+name: Pre-commit
+
+on:
+  pull_request:
+    branches: [ main, dev ]
+  push:
+    branches: [ main, dev ]
+
+jobs:
+  lint:
+    name: Lint PR or Push to DEV
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+
+      - name: Run Lint
+        uses: pre-commit/action@v2.0.0
diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
index 051fd506cab6ae2da0cbea8badff342361734562..80ac0b61e6d1a6f469f7d5bb0f1d50ce9c56565b 100644
--- a/.github/workflows/quicktest-dev-pr.yml
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -17,7 +17,37 @@ jobs:
       - name: checkout
         uses: actions/checkout@v2
 
+      - name: set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: cache Docker layers
+        uses: actions/cache@v2
+        with:
+          path: /tmp/.buildx-cache
+          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-buildx-
+
+      - name: Build Docker image
+        uses: docker/build-push-action@v2
+        with:
+          file: docker/Dockerfile.finn
+          context: .
+          push: false
+          load: true
+          tags: finn_gha
+          cache-from: type=local,src=/tmp/.buildx-cache
+          cache-to: type=local,dest=/tmp/.buildx-cache-new
+      -
+        # Temp fix
+        # https://github.com/docker/build-push-action/issues/252
+        # https://github.com/moby/buildkit/issues/1896
+        name: Move cache
+        run: |
+          rm -rf /tmp/.buildx-cache
+          mv /tmp/.buildx-cache-new /tmp/.buildx-cache
+
+
       - name: DockerRunQuicktest
         run: |
-          docker build -t finn_gha -f docker/Dockerfile.finn_ci --build-arg BUILD_PATH=/tmp/finn_gha .
           docker run --init --hostname finn_gha -v $(pwd):/workspace/finn -e FINN_BUILD_DIR=/tmp/finn_gha -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh
diff --git a/.isort.cfg b/.isort.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..6cfe1c8919fd8488c5e6f00a816e500b1f88a784
--- /dev/null
+++ b/.isort.cfg
@@ -0,0 +1,11 @@
+[settings]
+line_length=88
+indent='    '
+skip=.tox,.venv,build,dist
+known_standard_library=setuptools,pkg_resources
+known_test=pytest
+known_first_party=finn
+sections=FUTURE,STDLIB,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
+default_section=THIRDPARTY
+multi_line_output=3
+profile=black
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c513c5493d674b067b82fdae9e675d7f9b6eb024..143514b36ba31cb2b292f3a1961187709798efbf 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,28 +27,44 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 exclude: '^docs/conf.py'
+
 default_language_version:
     python: python3
+
 repos:
-# black
-- repo: https://github.com/ambv/black
-  rev: stable
-  hooks:
-  - id: black
-    language_version: python3
 - repo: git://github.com/pre-commit/pre-commit-hooks
-  rev: v2.2.3
+  rev: v3.2.0
   hooks:
   - id: trailing-whitespace
+    exclude: '\.dat$'
+  - id: check-added-large-files
   - id: check-ast
   - id: check-json
   - id: check-merge-conflict
   - id: check-xml
   - id: check-yaml
   - id: debug-statements
+    exclude: '^src/finn/builder/build_dataflow.py$'
   - id: end-of-file-fixer
   - id: requirements-txt-fixer
   - id: mixed-line-ending
     args: ['--fix=no']
+
+- repo: git://github.com/PyCQA/isort
+  rev: 5.5.3
+  hooks:
+  - id: isort
+
+- repo: git://github.com/psf/black
+  rev: stable
+  hooks:
+  - id: black
+    language_version: python3
+
+- repo: https://gitlab.com/pycqa/flake8
+  rev: 3.8.3
+  hooks:
   - id: flake8
-    args: ['--max-line-length=88']  # default of Black
+    # black-compatible flake-8 config
+    args: ['--max-line-length=88',  # black default
+           '--extend-ignore=E203']  # E203 is not PEP8 compliant
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3601fcdccff675e6f850d4636ebbfc0726f7cd4d
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,43 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+sphinx:
+   configuration: docs/finn/conf.py
+
+python:
+   version: 3.7
+   install:
+    - method: pip
+      path: .
+      extra_requirements:
+        - docs
diff --git a/AUTHORS.rst b/AUTHORS.rst
index 533ed62e1dbda2799f74805f2100769f9c4fecfc..1d42d35a3b269176fcab79d8239b84ac8442fa43 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -13,3 +13,12 @@ Contributors
 * Suranga Mahesh (@surangamh)
 * Peter Lehnhardt (@pete-lennart)
 * Neil Kim Nielsen (@neilkimn)
+* Jon Ander Lezeta (@jalezeta)
+* John Terry (@jterry-x)
+* Alina Vasilciuc (@alinavalinav)
+* Alessandro Pappalardo (@volcacius)
+* Giuseppe Franco (@Giuseppe5)
+* Syed Asad Alam (@asadalam)
+* Javier Duarte (@jmduarte)
+* Uma Maheshwari (@umav1511)
+* José Rosa (@pinxau1000)
diff --git a/README.md b/README.md
index 10ac25cb8f9e23520830efa4f2f7a58a21370e29..f36eac3a911315c260f1849a0406a9a467f0d53f 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,9 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s
 
 ## What's New in FINN?
 
+* **2021-11-05:** v0.7 is released, introducing QONNX support, three new example networks and many other improvements. Read more on the [v0.7 release blog post](https://xilinx.github.io/finn//2021/11/05/finn-v07-is-released.html).
 * **2021-06-15:** v0.6 is released, with ResNet-50 on U250 and ZCU104 MobileNet-v1 in finn-examples showcasing new features plus a lot more. Read more on the [v0.6 release blog post](https://xilinx.github.io/finn//2021/06/15/finn-v06-is-released.html).
 * **2020-12-17:** v0.5b (beta) is released, with a new [examples repo](https://github.com/Xilinx/finn-examples) including MobileNet-v1. Read more on the <a href="https://xilinx.github.io/finn/2020/12/17/finn-v05b-beta-is-released.html">release blog post</a>.
-* **2020-09-21:** v0.4b (beta) is released. Read more on the <a href="https://xilinx.github.io/finn/2020/09/21/finn-v04b-beta-is-released.html">release blog post</a>.
 
 ## Documentation
 
diff --git a/custom_hls/lookup.hpp b/custom_hls/lookup.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3001f6613ec6ed9a9e5f47d9be356d4b032f7192
--- /dev/null
+++ b/custom_hls/lookup.hpp
@@ -0,0 +1,60 @@
+/******************************************************************************
+* Copyright (c) 2021, Xilinx
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*
+* * Redistributions of source code must retain the above copyright notice, this
+*   list of conditions and the following disclaimer.
+*
+* * Redistributions in binary form must reproduce the above copyright notice,
+*   this list of conditions and the following disclaimer in the documentation
+*   and/or other materials provided with the distribution.
+*
+* * Neither the name of FINN nor the names of its
+*   contributors may be used to endorse or promote products derived from
+*   this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *******************************************************************************/
+
+#include <ap_int.h>
+#include <hls_stream.h>
+
+#ifndef LOOKUP_HPP
+#define LOOKUP_HPP
+
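+// Streams NumInputs packed index values from `in`; each index selects one
+// pre-packed embedding row from the `embeddings` table, which is written to `out`.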
+template <
+    unsigned NumEmbeddings,
+    unsigned EmbeddingDim,
+    unsigned NumInputs,
+    typename InputType,
+    typename EmbeddingType,
+    typename InputPackedType = ap_uint<InputType::width>,
+    typename OutputPackedType = ap_uint<EmbeddingDim*EmbeddingType::width>>
+void StreamingLookup(
+    hls::stream<InputPackedType> &in,
+    hls::stream<OutputPackedType> &out,
+    OutputPackedType const embeddings[NumEmbeddings]
+) {
+    for(unsigned i = 0; i < NumInputs; i++) {
+#pragma HLS PIPELINE II=1
+        InputPackedType inPackedElem = in.read();
+        InputType inElem = *(reinterpret_cast<InputType*>(&inPackedElem));
+        OutputPackedType outElem = embeddings[inElem];
+        out.write(outElem);
+    }
+}
+
+#endif
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn
similarity index 55%
rename from docker/Dockerfile.finn_dev
rename to docker/Dockerfile.finn
index 1feeb51a5308638b404cd0a8c70550f95ccbc450..4d03e2fbb5c4cce7dbda6a757aea8dce3e15e569 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2021, Xilinx
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,9 @@
 
 FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime
 LABEL maintainer="Yaman Umuroglu <yamanu@xilinx.com>"
-ARG GID
-ARG GNAME
-ARG UNAME
-ARG UID
-ARG PASSWD
+
+# XRT version to be installed
+ARG XRT_DEB_VERSION="xrt_202010.2.7.766_18.04-amd64-xrt"
 
 WORKDIR /workspace
 
@@ -41,7 +39,6 @@ WORKDIR /workspace
 ENV TZ="Europe/Dublin"
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
-
 RUN apt-get update
 RUN apt-get -y upgrade
 RUN apt-get install -y build-essential
@@ -56,14 +53,24 @@ RUN apt-get install -y rsync
 RUN apt-get install -y git
 RUN apt-get install -y sshpass
 RUN apt-get install -y wget
+RUN apt-get install -y sudo
 RUN apt-get install -y unzip
 RUN apt-get install -y zip
 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 
+# install XRT
+RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb
+RUN apt install -y /tmp/$XRT_DEB_VERSION.deb
+RUN rm /tmp/$XRT_DEB_VERSION.deb
+
+# versioned Python package requirements for FINN compiler
+# these are given in requirements.txt
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 RUN rm requirements.txt
+# extra Python package dependencies (for testing and interaction)
 RUN pip install pygments==2.4.1
+RUN pip install ipykernel==5.5.5
 RUN pip install jupyter==1.0.0
 RUN pip install matplotlib==3.3.1 --ignore-installed
 RUN pip install pytest-dependency==0.5.1
@@ -71,67 +78,80 @@ RUN pip install sphinx==3.1.2
 RUN pip install sphinx_rtd_theme==0.5.0
 RUN pip install pytest-xdist==2.0.0
 RUN pip install pytest-parallel==0.1.0
-RUN pip install netron>=4.7.9
+RUN pip install "netron>=5.0.0"
 RUN pip install pandas==1.1.5
 RUN pip install scikit-learn==0.24.1
 RUN pip install tqdm==4.31.1
 RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading
 
+# git-based Python repo dependencies
+# these are installed in editable mode for easier co-development
+ARG FINN_BASE_COMMIT="e8facdd719b55839cca46da2cc4f4a4a372afb41"
+ARG QONNX_COMMIT="9f9eff95227cc57aadc6eafcbd44b7acda89f067"
+ARG FINN_EXP_COMMIT="af6102769226b82b639f243dc36f065340991513"
+ARG BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
+ARG PYVERILATOR_COMMIT="0c3eb9343500fc1352a02c020a736c8c2db47e8e"
+ARG CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
+ARG HLSLIB_COMMIT="966d17d3fddd801927b2167627d23a9a15ed1461"
+ARG OMX_COMMIT="1dfc4aa2f2895632742cd5751520c6b472feb74e"
+ARG AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 
-# switch user
-RUN groupadd -g $GID $GNAME
-RUN useradd -M -u $UID $UNAME -g $GNAME
-RUN usermod -aG sudo $UNAME
-RUN echo "$UNAME:$PASSWD" | chpasswd
-RUN echo "root:$PASSWD" | chpasswd
-RUN chown -R $UNAME:$GNAME /workspace
-RUN ln -s /workspace /home/$UNAME
-USER $UNAME
-
-
-# cloning dependency repos (as user)
 # finn-base
 RUN git clone https://github.com/Xilinx/finn-base.git /workspace/finn-base
+RUN git -C /workspace/finn-base checkout $FINN_BASE_COMMIT
+RUN pip install -e /workspace/finn-base
+# install qonnx without dependencies; currently its only dependency is finn-base
+RUN git clone https://github.com/fastmachinelearning/qonnx.git /workspace/qonnx
+RUN git -C /workspace/qonnx checkout $QONNX_COMMIT
+RUN pip install --no-dependencies -e /workspace/qonnx
 # finn-experimental
 RUN git clone https://github.com/Xilinx/finn-experimental.git /workspace/finn-experimental
-# Brevitas
+RUN git -C /workspace/finn-experimental checkout $FINN_EXP_COMMIT
+RUN pip install -e /workspace/finn-experimental
+# brevitas
 RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
-# CNPY
+RUN git -C /workspace/brevitas checkout $BREVITAS_COMMIT
+RUN pip install -e /workspace/brevitas
+# pyverilator
+RUN git clone https://github.com/maltanar/pyverilator.git /workspace/pyverilator
+RUN git -C /workspace/pyverilator checkout $PYVERILATOR_COMMIT
+RUN pip install -e /workspace/pyverilator
+# other git-based dependencies (non-Python)
+# cnpy
 RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
-# FINN hlslib
+RUN git -C /workspace/cnpy checkout $CNPY_COMMIT
+# finn-hlslib
 RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
-# PyVerilator
-RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
+RUN git -C /workspace/finn-hlslib checkout $HLSLIB_COMMIT
 # oh-my-xilinx
 RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx
+RUN git -C /workspace/oh-my-xilinx checkout $OMX_COMMIT
+# board files
+RUN cd /tmp; \
+    wget -q https://github.com/cathalmccabe/pynq-z1_board_files/raw/master/pynq-z1.zip; \
+    wget -q https://dpoauwgwqsy2x.cloudfront.net/Download/pynq-z2.zip; \
+    unzip -q pynq-z1.zip; \
+    unzip -q pynq-z2.zip; \
+    mkdir /workspace/board_files; \
+    mv pynq-z1/ /workspace/board_files/; \
+    mv pynq-z2/ /workspace/board_files/; \
+    rm pynq-z1.zip; \
+    rm pynq-z2.zip; \
+    git clone https://github.com/Avnet/bdf.git /workspace/avnet-bdf; \
+    git -C /workspace/avnet-bdf checkout $AVNET_BDF_COMMIT; \
+    mv /workspace/avnet-bdf/* /workspace/board_files/;
+
 
-# for this developer-oriented Docker container we assume the FINN repo is cloned and mounted from the host
-# at /workspace/finn -- see run-docker.sh for an example of how to do this.
-ENV PATH "${PATH}:/workspace/oh-my-xilinx:/home/$UNAME/.local/bin"
+# extra environment variables for FINN compiler
+ENV VIVADO_IP_CACHE "/tmp/vivado_ip_cache"
+ENV PATH "${PATH}:/workspace/oh-my-xilinx"
 ENV OHMYXILINX "/workspace/oh-my-xilinx"
 
-WORKDIR /home/$UNAME/finn
-RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >>  /home/$UNAME/.bashrc
-RUN echo "source \$VIVADO_PATH/settings64.sh" >> /home/$UNAME/.bashrc
+WORKDIR /workspace/finn
 
-# copy entrypoint script
-USER root
 COPY docker/finn_entrypoint.sh /usr/local/bin/
 COPY docker/quicktest.sh /usr/local/bin/
 RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
 RUN chmod 755 /usr/local/bin/quicktest.sh
-# install vitis deps if required
-ARG INSTALL_XRT_DEPS
-ARG XRT_DEB_VERSION
-RUN if [ "$INSTALL_XRT_DEPS" = "1" ] ; then \
-    echo "Installing XRT: $XRT_DEB_VERSION"; \
-    wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb; \
-    apt install -y /tmp/$XRT_DEB_VERSION.deb; \
-  else \
-    echo "Skipping installation of XRT dependencies"; \
-  fi
-
-USER $UNAME
-
 ENTRYPOINT ["finn_entrypoint.sh"]
 CMD ["bash"]
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
deleted file mode 100644
index c4248919696a1bbff0a1fd5a2264f4edadd4b112..0000000000000000000000000000000000000000
--- a/docker/Dockerfile.finn_ci
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel
-LABEL maintainer="Yaman Umuroglu <yamanu@xilinx.com>"
-
-WORKDIR /workspace
-
-# some Vitis deps require a timezone to be specified, which hangs in Docker
-# use workaround from https://grigorkh.medium.com/fix-tzdata-hangs-docker-image-build-cdb52cc3360d
-ENV TZ="Europe/Dublin"
-RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-
-RUN apt-get update
-RUN apt-get -y upgrade
-RUN apt-get install -y build-essential
-RUN apt-get install -y libglib2.0-0
-RUN apt-get install -y libsm6
-RUN apt-get install -y libxext6
-RUN apt-get install -y libxrender-dev
-RUN apt-get install -y verilator
-RUN apt-get install -y nano
-RUN apt-get install -y zsh
-RUN apt-get install -y rsync
-RUN apt-get install -y git
-RUN apt-get install -y sshpass
-RUN apt-get install -y wget
-RUN apt-get install -y unzip
-RUN apt-get install -y zip
-RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
-
-# XRT deps
-# install vitis deps if required
-ARG INSTALL_XRT_DEPS="0"
-ARG XRT_DEB_VERSION="xrt_202010.2.7.766_18.04-amd64-xrt"
-RUN if [ "$INSTALL_XRT_DEPS" = "1" ] ; then \
-    echo "Installing XRT: $XRT_DEB_VERSION"; \
-    wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb; \
-    apt install -y /tmp/$XRT_DEB_VERSION.deb; \
-  else \
-    echo "Skipping installation of XRT dependencies"; \
-  fi
-
-# cloning dependency repos
-# finn-base
-RUN git clone https://github.com/Xilinx/finn-base.git /workspace/finn-base
-# finn-experimental
-RUN git clone https://github.com/Xilinx/finn-experimental.git /workspace/finn-experimental
-# Brevitas
-RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
-# CNPY
-RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
-# FINN hlslib
-RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
-# PyVerilator
-RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
-# oh-my-xilinx
-RUN git clone https://bitbucket.org/maltanar/oh-my-xilinx.git /workspace/oh-my-xilinx
-
-COPY requirements.txt .
-RUN pip install -r requirements.txt
-RUN rm requirements.txt
-RUN pip install pytest-dependency
-RUN pip install pytest-xdist
-RUN pip install pytest-parallel
-RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading
-
-ENV VIVADO_IP_CACHE "/tmp/vivado_ip_cache"
-ENV PATH "${PATH}:/workspace/oh-my-xilinx"
-ENV OHMYXILINX "/workspace/oh-my-xilinx"
-
-# colorful terminal output
-RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >>  /root/.bashrc
-
-WORKDIR /workspace/finn
-
-COPY docker/finn_entrypoint.sh /usr/local/bin/
-COPY docker/quicktest.sh /usr/local/bin/
-RUN chmod 755 /usr/local/bin/finn_entrypoint.sh
-RUN chmod 755 /usr/local/bin/quicktest.sh
-ENTRYPOINT ["finn_entrypoint.sh"]
-CMD ["bash"]
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 7edb3f9a9e831e005440dd69b36998272d727da5..a2312d025b616acd285b94f1b56b83f0c35cc0ae 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -1,12 +1,48 @@
 #!/bin/bash
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 
-export SHELL=/bin/bash
 export FINN_ROOT=/workspace/finn
+export HOME=/tmp/home_dir
+export SHELL=/bin/bash
+# colorful terminal output
+export PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '
 
+YELLOW='\033[0;33m'
 GREEN='\033[0;32m'
 RED='\033[0;31m'
 NC='\033[0m' # No Color
 
+yecho () {
+  echo -e "${YELLOW}WARNING: $1${NC}"
+}
+
 gecho () {
   echo -e "${GREEN}$1${NC}"
 }
@@ -15,97 +51,44 @@ recho () {
   echo -e "${RED}ERROR: $1${NC}"
 }
 
-# checkout the correct dependency repo commits
-# the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=ac0b86a63eb937b869bfa453a996a8a8b8506546
-FINN_EXP_COMMIT=f82c0d9868bb88ea045dfadb28508d327d287221
-BREVITAS_COMMIT=d7ded80fa9557da2998ea310669edee7fb2d9526
-CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=4d74baefa79df48b5a0348d63f39a26df075de51
-PYVERILATOR_COMMIT=e2ff74030de3992dcac54bf1b6aad2915946e8cb
-OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
-AVNET_BDF_COMMIT=2d49cfc25766f07792c0b314489f21fe916b639b
-
-gecho "Setting up known-good commit versions for FINN dependencies"
-# finn-base
-gecho "finn-base @ $FINN_BASE_COMMIT"
-git -C /workspace/finn-base pull --quiet
-git -C /workspace/finn-base checkout $FINN_BASE_COMMIT --quiet
-pip install --user -e /workspace/finn-base
-# finn-experimental
-gecho "finn-experimental @ $FINN_EXP_COMMIT"
-git -C /workspace/finn-experimental pull --quiet
-git -C /workspace/finn-experimental checkout $FINN_EXP_COMMIT --quiet
-pip install --user -e /workspace/finn-experimental
-# Brevitas
-gecho "brevitas @ $BREVITAS_COMMIT"
-git -C /workspace/brevitas pull --quiet
-git -C /workspace/brevitas checkout $BREVITAS_COMMIT --quiet
-pip install --user -e /workspace/brevitas
-# CNPY
-gecho "cnpy @ $CNPY_COMMIT"
-git -C /workspace/cnpy pull --quiet
-git -C /workspace/cnpy checkout $CNPY_COMMIT --quiet
-# FINN hlslib
-gecho "finn-hlslib @ $HLSLIB_COMMIT"
-git -C /workspace/finn-hlslib pull --quiet
-git -C /workspace/finn-hlslib checkout $HLSLIB_COMMIT --quiet
-# PyVerilator
-gecho "PyVerilator @ $PYVERILATOR_COMMIT"
-git -C /workspace/pyverilator pull --quiet
-git -C /workspace/pyverilator checkout $PYVERILATOR_COMMIT --quiet
-pip install --user -e /workspace/pyverilator
-# oh-my-xilinx
-gecho "oh-my-xilinx @ $OMX_COMMIT"
-git -C /workspace/oh-my-xilinx pull --quiet
-git -C /workspace/oh-my-xilinx checkout $OMX_COMMIT --quiet
-# remove old version egg-info, if any
-rm -rf $FINN_ROOT/src/FINN.egg-info
-# run pip install for finn
-pip install --user -e $FINN_ROOT
-
-if [ ! -z "$VIVADO_PATH" ];then
-  # source Vivado env.vars
-  export XILINX_VIVADO=$VIVADO_PATH
-  source $VIVADO_PATH/settings64.sh
+if [ -f "$FINN_ROOT/setup.py" ];then
+  # run pip install for finn
+  pip install --user -e $FINN_ROOT
+else
+  recho "Unable to find FINN source code in /workspace/finn"
+  recho "Ensure you have passed -v <path-to-finn-repo>:/workspace/finn to the docker run command"
+  exit -1
 fi
 
-# download PYNQ board files if not already there
-if [ ! -d "/workspace/finn/board_files" ]; then
-    gecho "Downloading PYNQ board files for Vivado"
-    OLD_PWD=$(pwd)
-    cd /workspace/finn
-    wget -q https://github.com/cathalmccabe/pynq-z1_board_files/raw/master/pynq-z1.zip
-    wget -q https://d2m32eurp10079.cloudfront.net/Download/pynq-z2.zip
-    unzip -q pynq-z1.zip
-    unzip -q pynq-z2.zip
-    mkdir /workspace/finn/board_files
-    mv pynq-z1/ board_files/
-    mv pynq-z2/ board_files/
-    rm pynq-z1.zip
-    rm pynq-z2.zip
-    cd $OLD_PWD
-fi
-if [ ! -d "/workspace/finn/board_files/ultra96v2" ]; then
-    gecho "Downloading Avnet BDF files from known-good commit into board_files"
-    OLD_PWD=$(pwd)
-    cd /workspace/finn
-    git clone https://github.com/Avnet/bdf.git
-    git -C /workspace/finn/bdf checkout $AVNET_BDF_COMMIT --quiet
-    mv /workspace/finn/bdf/* /workspace/finn/board_files/
-    rm -rf /workspace/finn/bdf
-    cd $OLD_PWD
-fi
-if [ ! -z "$VITIS_PATH" ];then
+if [ -f "$VITIS_PATH/settings64.sh" ];then
   # source Vitis env.vars
   export XILINX_VITIS=$VITIS_PATH
   export XILINX_XRT=/opt/xilinx/xrt
   source $VITIS_PATH/settings64.sh
-  if [ ! -z "$XILINX_XRT" ];then
+  gecho "Found Vitis at $VITIS_PATH"
+  if [ -f "$XILINX_XRT/setup.sh" ];then
     # source XRT
     source $XILINX_XRT/setup.sh
+    gecho "Found XRT at $XILINX_XRT"
   else
     recho "XRT not found on $XILINX_XRT, did the installation fail?"
+    exit -1
+  fi
+else
+  yecho "Unable to find $VITIS_PATH/settings64.sh"
+  yecho "Functionality dependent on Vitis will not be available."
+  yecho "If you need Vitis, ensure VITIS_PATH is set correctly and mounted into the Docker container."
+  if [ -f "$VIVADO_PATH/settings64.sh" ];then
+    # source Vivado env.vars
+    export XILINX_VIVADO=$VIVADO_PATH
+    source $VIVADO_PATH/settings64.sh
+    gecho "Found Vivado at $VIVADO_PATH"
+  else
+    yecho "Unable to find $VIVADO_PATH/settings64.sh"
+    yecho "Functionality dependent on Vivado will not be available."
+    yecho "If you need Vivado, ensure VIVADO_PATH is set correctly and mounted into the Docker container."
   fi
 fi
+
+# execute the provided command(s) as root
 exec "$@"
diff --git a/docker/Dockerfile.jenkins b/docker/jenkins/Dockerfile.jenkins
similarity index 100%
rename from docker/Dockerfile.jenkins
rename to docker/jenkins/Dockerfile.jenkins
diff --git a/docker/Jenkinsfile b/docker/jenkins/Jenkinsfile
similarity index 85%
rename from docker/Jenkinsfile
rename to docker/jenkins/Jenkinsfile
index b2d3102bd4aa3c00620f41c102af5a8b385cede7..f3211941890d634b12142ed13c0f0cf49a9003d8 100644
--- a/docker/Jenkinsfile
+++ b/docker/jenkins/Jenkinsfile
@@ -2,7 +2,8 @@ pipeline {
     agent any
     parameters {
         string(name: 'FINN_CI_BRANCH', defaultValue: '', description: 'FINN branch to build')
-        string(name: 'VIVADO_PATH', defaultValue: '', description: 'Path to Vivado installation')
+        string(name: 'FINN_XILINX_PATH', defaultValue: '', description: 'Path to Xilinx tool installation')
+        string(name: 'FINN_XILINX_VERSION', defaultValue: '2020.1', description: 'Xilinx tool version')
         string(name: 'PYNQ_BOARD', defaultValue: 'Pynq-Z1', description: 'PYNQ board type')
         string(name: 'PYNQ_IP', defaultValue: '', description: 'PYNQ board IP address')
         string(name: 'PYNQ_USERNAME', defaultValue: 'xilinx', description: 'PYNQ board username')
@@ -22,6 +23,8 @@ pipeline {
         DOCKER_TAG='finn_ci:$BUILD_ID'
         DOCKER_INST_NAME='finn_ci'
         BUILD_PATH='/tmp/finn_ci'
+        VIVADO_PATH="${params.FINN_XILINX_PATH}/Vivado/${params.FINN_XILINX_VERSION}"
+        VITIS_PATH="${params.FINN_XILINX_PATH}/Vitis/${params.FINN_XILINX_VERSION}"
     }
     stages {
         stage("Clone") {
@@ -45,10 +48,11 @@ pipeline {
                 docker run --init \
                 --hostname $DOCKER_INST_NAME \
                 -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
-                -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
+                -v ${params.FINN_XILINX_PATH}:${params.FINN_XILINX_PATH}:ro \
                 -e NUM_DEFAULT_WORKERS=1 \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
-                -e VIVADO_PATH=${params.VIVADO_PATH} \
+                -e VIVADO_PATH=$VIVADO_PATH \
+                -e VITIS_PATH=$VITIS_PATH \
                 -e PYNQ_BOARD=${params.PYNQ_BOARD} \
                 -e PYNQ_IP=${params.PYNQ_IP} \
                 -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
@@ -65,10 +69,11 @@ pipeline {
                 docker run --init \
                 --hostname $DOCKER_INST_NAME \
                 -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
-                -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
+                -v $VIVADO_PATH:$VIVADO_PATH:ro \
                 -e NUM_DEFAULT_WORKERS=1 \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
-                -e VIVADO_PATH=${params.VIVADO_PATH} \
+                -e VIVADO_PATH=$VIVADO_PATH \
+                -e VITIS_PATH=$VITIS_PATH \
                 -e PYNQ_BOARD=${params.PYNQ_BOARD} \
                 -e PYNQ_IP=${params.PYNQ_IP} \
                 -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
@@ -85,10 +90,11 @@ pipeline {
                 docker run --init \
                 --hostname $DOCKER_INST_NAME \
                 -v ${params.WORKSPACE_MOUNT}:/workspace/finn \
-                -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
+                -v $VIVADO_PATH:$VIVADO_PATH:ro \
                 -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
-                -e VIVADO_PATH=${params.VIVADO_PATH} \
+                -e VIVADO_PATH=$VIVADO_PATH \
+                -e VITIS_PATH=$VITIS_PATH \
                 -e PYNQ_BOARD=${params.PYNQ_BOARD} \
                 -e PYNQ_IP=${params.PYNQ_IP} \
                 -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
diff --git a/docker/launch-jenkins.sh b/docker/jenkins/launch-jenkins.sh
similarity index 100%
rename from docker/launch-jenkins.sh
rename to docker/jenkins/launch-jenkins.sh
diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst
index 65f6ab6b3053d9f11239b3c048143b3d2f346808..408b14fd2b6c99ce3ec128a0361a25b3f2c193a5 100644
--- a/docs/finn/brevitas_export.rst
+++ b/docs/finn/brevitas_export.rst
@@ -8,7 +8,13 @@ Brevitas Export
    :scale: 70%
    :align: center
 
-FINN expects an ONNX model as input. This can be a model trained with `Brevitas <https://github.com/Xilinx/brevitas>`_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks <https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq>`_. Brevitas provides an export of a quantized network in ONNX representation. The resulting model consists only of `ONNX standard nodes <https://github.com/onnx/onnx/blob/master/docs/Operators.md>`_, but also contains additional attributes for the ONNX nodes to represent low precision datatypes. To work with the model it is wrapped into :ref:`modelwrapper` provided by FINN.
+FINN expects an ONNX model as input. This can be a model trained with `Brevitas <https://github.com/Xilinx/brevitas>`_. Brevitas is a PyTorch library for quantization-aware training, and the FINN Docker image comes with several `example Brevitas networks <https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq>`_. Brevitas can export a quantized network to ONNX representation in several flavors.
+Two of the Brevitas-exported ONNX variants can be ingested by FINN:
+
+   * FINN-ONNX: Quantized weights are exported as tensors with additional attributes to mark low-precision datatypes, and quantized activations are exported as MultiThreshold nodes.
+   * QONNX: All quantization is represented using Quant, BinaryQuant or Trunc nodes. QONNX models must be converted into FINN-ONNX by :py:mod:`finn.transformation.qonnx.convert_qonnx_to_finn`.
+
+To work with either type of ONNX model, it is loaded into a :ref:`modelwrapper` provided by FINN.
 
 At this stage we can already use the functional verification flow to simulate the model using Python, this is marked in the graphic with the dotted arrow. For more details please have look at :ref:`verification`.
 
diff --git a/docs/finn/conf.py b/docs/finn/conf.py
index 1bd179c3f7904ba102f7a9b4f2edc2739ba58183..47ba99fb5fca55e99b3da6403532f145ac8ebbca 100644
--- a/docs/finn/conf.py
+++ b/docs/finn/conf.py
@@ -12,14 +12,15 @@
 #
 import os
 import sys
-sys.path.insert(0, os.path.abspath('../../src/'))
+
+sys.path.insert(0, os.path.abspath("../../src/"))
 
 
 # -- Project information -----------------------------------------------------
 
-project = 'FINN'
-copyright = '2020, Xilinx'
-author = 'Y. Umuroglu and J. Petri-Koenig'
+project = "FINN"
+copyright = "2020, Xilinx"
+author = "Y. Umuroglu and J. Petri-Koenig"
 
 
 # -- General configuration ---------------------------------------------------
@@ -27,17 +28,17 @@ author = 'Y. Umuroglu and J. Petri-Koenig'
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = [
-]
-extensions.append('sphinx.ext.autodoc')
+extensions = []
+extensions.append("sphinx.ext.autodoc")
+extensions.append("sphinx.ext.autosectionlabel")
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 
 # -- Options for HTML output -------------------------------------------------
@@ -45,11 +46,11 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
-master_doc = 'index'
+master_doc = "index"
diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst
index 6e7fa0d920a943e468fd70464b050ab74cf8ec7d..508cd86a31b6284e072499987ae45864d3942e16 100644
--- a/docs/finn/developers.rst
+++ b/docs/finn/developers.rst
@@ -7,7 +7,7 @@ Developer documentation
 This page is intended to serve as a starting point for new FINN developers.
 Power users may also find this information useful.
 
-Getting started
+Prerequisites
 ================
 
 Before starting to do development on FINN it's a good idea to start
diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst
index 093344e70331572a425a09d34c5a68d7313bc521..e426bdb4e28dd02c83b47d59b59c318840815f78 100644
--- a/docs/finn/faq.rst
+++ b/docs/finn/faq.rst
@@ -4,68 +4,109 @@
 Frequently Asked Questions
 **************************
 
-.. note:: **This page is under construction.**
+Can't find the answer to your question here? Check `FINN GitHub Discussions <https://github.com/Xilinx/finn/discussions>`_.
 
-Can I install FINN out of the Docker container?
-===============================================
 
-We do not support out of the Docker implementations at the moment. This is due 
-to the high complexity of the FINN project dependencies.
+Can I install FINN out of the Docker container?
+    We do not support installation outside of the Docker container at the moment,
+    due to the high complexity of the FINN project dependencies.
 
 Since FINN uses ONNX, can I compile any model from the ONNX Model Zoo to an FPGA accelerator?
-=============================================================================================
+    The short answer is no. FINN uses ONNX in a specific (non-standard) way, including custom layer
+    types and quantization annotations. Networks must be first quantized using Brevitas and exported
+    to FINN-ONNX to be converted to FPGA accelerators.
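+
+    As a rough sketch, the Brevitas export step can look like this (the trained ``model``
+    object, input shape and file name here are placeholders)::
+
+        import brevitas.onnx as bo
+
+        bo.export_finn_onnx(model, input_shape=(1, 1, 28, 28), export_path="model.onnx")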
 
-The short answer is no. FINN uses ONNX in a specific (non-standard) way, including custom layer 
-types and quantization annotations. Networks must be first quantized using Brevitas and exported
-to FINN-ONNX to be converted to FPGA accelerators.
 
-Can I deploy custom NNs with arbitrary precisions and layers using FINN? 
-=========================================================================
 
-Yes, though the effort required and quality of results will vary.
-Although we do support arbitrary 
-precision, the way we create the hardware isn't typically practical for more than 
-4 bits, or very large networks for a single FPGA. 
-In terms of layers, only a subset of quantized layers covered by the various FINN examples 
-are currently supported.
-It is possible to add support for new layers, though we don't have tutorials for this in place
-just yet.
 
-Does FINN only work with the example networks?
-==============================================
+Can I deploy custom NNs with arbitrary precisions and layers using FINN?
+    Yes, though the effort required and quality of results will vary.
+    Although we do support arbitrary
+    precision, the way we create the hardware isn't typically practical for more than
+    4 bits, or very large networks for a single FPGA.
+    In terms of layers, only a subset of quantized layers covered by the various FINN examples
+    are currently supported.
+    It is possible to add support for new layers, though we don't have tutorials for this in place
+    just yet.
 
-FINN isn't restricted to the example networks; 
-rather, it's restricted to certain patterns (e.g. certain layer types and their combinations). 
-The current best practice for custom networks is to take a working network and gradually modify it. 
+Does FINN only work with the example networks?
+    FINN isn't restricted to the example networks;
+    rather, it's restricted to certain patterns (e.g. certain layer types and their combinations).
+    The current best practice for custom networks is to take a working network and gradually modify it.
 
 What is the expected background for using FINN?
-===============================================
-
-Some general knowledge of Python, Docker, machine learning with neural networks and Jupyter notebooks
-is expected.
-Our goal is to make the tool in a shape and form so that no hardware/FPGA background 
-should be necessary, although having some knowledge would give better results.
+    Some general knowledge of Python, Docker, machine learning with neural networks and Jupyter notebooks
+    is expected.
+    Our goal is to shape the tool so that no hardware/FPGA background
+    is necessary, although having some knowledge will give better results.
 
 What operating systems are supported by FINN?
-=============================================
-
-FINN should work fine under any Linux-based OS capable of running Vivado/Vitis, as long
-as you install Docker (``docker-ce``) on your machine .
+    FINN should work fine under any Linux-based OS capable of running Vivado/Vitis, as long
+    as you install Docker (``docker-ce``) on your machine.
 
 
 I am getting DocNav and Model_Composer errors when launching the Docker image.
-==============================================================================
-
-We do not mount those particular directories into the Docker container because they are not
-used. The errors are Vivado related but you can safely ignore them.
+    We do not mount those particular directories into the Docker container because they are not
+    used. The errors are Vivado related but you can safely ignore them.
 
 What board do you recommend to start working with FINN?
-=======================================================
-
-Our preferred target platforms are those supported by  `PYNQ <http://www.pynq.io/board.html>`_.
-For those boards we can offer end-to-end (DNN-to-bitstream) deployment,
-see the `finn-examples <https://github.com/Xilinx/finn-examples>`_ repository for some examples.
-However, FINN also supports Vivado IP Integrator designs. The IPs connect using AXI stream (FIFO) 
-in-and-out interfaces. This means that it can be integrated onto any Xilinx FPGA board,
-though you will have to do the system integration manually.
+    Our preferred target platforms are those supported by `PYNQ <http://www.pynq.io/board.html>`_.
+    For those boards we can offer end-to-end (DNN-to-bitstream) deployment,
+    see the `finn-examples <https://github.com/Xilinx/finn-examples>`_ repository for some examples.
+    However, FINN also supports Vivado IP Integrator designs. The IPs connect using AXI stream (FIFO)
+    in-and-out interfaces. This means that it can be integrated onto any Xilinx FPGA board,
+    though you will have to do the system integration manually.
+
+FINN-generated builds break after I restart my computer, because ``/tmp`` gets wiped.
+    See https://github.com/Xilinx/finn/discussions/404
+
+How can I target an arbitrary Xilinx FPGA without PYNQ support?
+    See https://github.com/Xilinx/finn/discussions/387
+
+Why do FINN-generated architectures need FIFOs between layers?
+    See https://github.com/Xilinx/finn/discussions/383
+
+How do I tell FINN to utilize DSPs instead of LUTs for MAC operations in particular layers?
+    This is done with the ``resType="dsp"`` attribute on ``StreamingFCLayer`` and ``Vector_Vector_Activate`` instances.
+    When using the ``build_dataflow`` system, this can be set on a per-layer basis as part of one or more layers’
+    folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`).
+    This is a good idea for layers with wider weight/input activation bitwidths and high PE*SIMD.
+    See the `MobileNet-v1 build config for ZCU104 in finn-examples <https://github.com/Xilinx/finn-examples/blob/main/build/mobilenet-v1/folding_config/ZCU104_folding_config.json#L15>`_ for reference.
+
+
+How do I tell FINN to utilize a particular type of memory resource in particular layers?
+    This is done with the ``ram_style`` attribute. Check the particular ``HLSCustomOp`` attribute definition to see
+    which modes are supported (`example for StreamingFCLayer <https://github.com/Xilinx/finn/blob/dev/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py#L95>`_).
+    When using the ``build_dataflow`` system, this can be set on a per-layer basis as part of one or more layers’
+    folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`).
+    See the `MobileNet-v1 build config for ZCU104 in finn-examples <https://github.com/Xilinx/finn-examples/blob/main/build/mobilenet-v1/folding_config/ZCU104_folding_config.json#L15>`_ for reference.
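+
+    As an illustration for this and the previous question, a minimal sketch of setting such
+    attributes directly on a graph (the file and layer names here are placeholders)::
+
+        from finn.core.modelwrapper import ModelWrapper
+        from finn.custom_op.registry import getCustomOp
+
+        model = ModelWrapper("intermediate_model.onnx")
+        # wrap the first StreamingFCLayer node and set its resource attributes
+        fc0 = getCustomOp(model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0])
+        fc0.set_nodeattr("resType", "dsp")
+        fc0.set_nodeattr("ram_style", "ultra")
+        model.save("intermediate_model_tuned.onnx")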
+
+Which data layout do FINN-generated accelerators use? Big-endian? Little-endian?
+    The data layout used by FINN does not correspond to system-level big or little endian, due to difficulties in defining what
+    the “word size” is and the bit packing used for smaller datatypes. FINN’s “word size” is dependent on the parallelization of the
+    first/last layers. For instance, if the first HLS layer is using SIMD=3 this means the “innermost dimension” in the
+    data packing functions will be of size 3.
+    When you use the verification infrastructure or the generated PYNQ Python drivers that FINN provides, the tool normally
+    takes care of any required data layout conversion on standard numpy arrays before presenting the data to the accelerator,
+    and vice versa on the output side. Doing this data packing and layout conversion manually can be messy at the moment.
+    If you need to do this manually, first examine how the `FINN PYNQ Python drivers <https://github.com/Xilinx/finn-examples/blob/main/finn_examples/driver.py#L379>`_ do this – notice how the input data is
+    first reshaped to create the “folded input shape” that reflects the word size of the first layer based on how much it
+    was parallelized, then data packing is applied to obtain a raw byte array (with some reversals going on) that can be
+    fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input <https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/data_packing.py#L289>`_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for RTL simulation.
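+
+    As a very rough sketch of that manual flow (the shapes, datatype and file names here are
+    placeholders, and ``ishape_folded`` would come from your driver/model)::
+
+        import numpy as np
+        from finn.core.datatype import DataType
+        from finn.util.data_packing import finnpy_to_packed_bytearray
+
+        ibuf = np.load("input.npy")                # input in normal numpy layout
+        ibuf_folded = ibuf.reshape(ishape_folded)  # folded shape of the first layer
+        raw = finnpy_to_packed_bytearray(ibuf_folded, DataType.UINT8)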
+
+Why does FIFO sizing take so long for my network? Is something wrong?
+    The automatic FIFO sizing in FINN can take quite long. It unfortunately doesn’t really parallelize over multiple cores, since
+    it’s based on running an RTL simulation with lots of inputs and very large FIFOs, then observing the max occupancy/count
+    in each FIFO.
+
+What's a good starting point for the folding configuration if I want to make manual changes?
+    First, enable automatic folding options in ``build_dataflow`` such as ``target_fps``. This should find a decent set of
+    folding factors and save them to ``output_folder/auto_folding_config.json``, which you can use as a basis for creating the desired config.
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index f7ca6af31c7660e52eca4b2a8f6dc1be482e3606..af7a05751b0b2f9c991849e2c808e089aaac68d9 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -4,90 +4,65 @@
 Getting Started
 ***************
 
-How to use the FINN compiler
-============================
-Currently, it's best to think of the FINN compiler as *compiler infrastructure*
-instead of a full *compiler* like `gcc` (although the aim is to get there).
-Although we provide a :ref:`command_line` entry for building dataflow
-accelerators, this only exposes a basic flow that works for simpler networks.
-A better way of looking at the FINN compiler is as a collection of scripts/tools that will help
-you convert a QNN into a custom FPGA accelerator that performs high-performance inference.
-
-**So where do I get started?** The best way of getting started with the FINN
-compiler is to follow the existing
-`Jupyter notebooks <tutorials>`_ and check out the prebuilt
-`examples <https://github.com/Xilinx/finn-examples>`_.
-
-**How do I compile my custom network?**
-This depends on how similar your custom network is to the examples we provide.
+Quickstart
+==========
+
+1. Install Docker and configure it to run `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_
+2. Set up the ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables, pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2020.1``)
+3. Clone the FINN compiler repo: ``git clone https://github.com/Xilinx/finn/`` and change into the cloned directory
+4. Execute ``./run-docker.sh quicktest`` to verify your installation.
+5. Optionally, follow the instructions on :ref:`PYNQ board first-time setup` or :ref:`Alveo first-time setup` for board setup.
+6. Optionally, set up a `Vivado/Vitis license`_.
+7. All done! See :ref:`Running FINN in Docker` for the various options on how to run the FINN compiler.
+
+
+How do I use FINN?
+==================
+
+We strongly recommend that you first watch one of the pre-recorded `FINN tutorial <https://www.youtube.com/watch?v=zw2aG4PhzmA&index=2>`_
+videos, then follow the Jupyter notebook tutorials for `training and deploying an MLP for network intrusion detection <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/cybersecurity>`_.
+You may also want to check out the other :ref:`tutorials`, and the `FINN examples repository <https://github.com/Xilinx/finn-examples>`_.
+
+Our aim in FINN is *not* to accelerate common off-the-shelf neural networks, but instead to provide you with a set of tools
+to train *customized* networks and create highly-efficient FPGA implementations from them.
+In general, the approach for using the FINN framework is as follows:
+
+1. Train your own quantized neural network (QNN) in `Brevitas <https://github.com/Xilinx/brevitas>`_. We have some `guidelines <https://bit.ly/finn-hls4ml-qat-guidelines>`_ on quantization-aware training (QAT).
+2. Export to FINN-ONNX by following `this tutorial <https://github.com/Xilinx/finn/blob/master/notebooks/basics/1_brevitas_network_import.ipynb>`_ .
+3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial <https://github.com/Xilinx/finn/blob/master/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb>`_ (see the sketch below).
+4. Adjust your QNN topology, quantization settings and ``build_dataflow`` configuration to get the desired results.
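+
+As a rough sketch, steps 2 and 3 can look like the following in Python. Here ``model`` stands in for your trained Brevitas network, and the input shape, board and build settings are placeholders; the exact APIs are covered in the linked tutorials and may change between releases:
+
+::
+
+  import brevitas.onnx as bo
+  import finn.builder.build_dataflow as build
+  import finn.builder.build_dataflow_config as build_cfg
+
+  # step 2: export the trained Brevitas model to FINN-ONNX
+  bo.export_finn_onnx(model, (1, 600), "model.onnx")
+
+  # step 3: run the dataflow builder on the exported model
+  cfg = build_cfg.DataflowBuildConfig(
+      output_dir="build_output",
+      target_fps=100000,
+      synth_clk_period_ns=10.0,
+      board="Pynq-Z1",
+      shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+      generate_outputs=[build_cfg.DataflowOutputType.BITFILE],
+  )
+  build.build_dataflow_cfg("model.onnx", cfg)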
+
+Please note that the framework is still under development, and how well this works will depend on how similar your custom network is to the examples we provide.
 If there are substantial differences, you will most likely have to write your own
 Python scripts that call the appropriate FINN compiler
 functions that process your design correctly, or add new functions (including
 Vivado HLS layers)
 as required.
-For custom networks, we recommend making a copy of the end-to-end
-Jupyter notebook as a starting point, visualizing the model at intermediate
+The `advanced FINN tutorials <https://github.com/Xilinx/finn/tree/master/notebooks/advanced>`_ can be useful here.
+For custom networks, we recommend making a copy of the `BNN-PYNQ end-to-end
+Jupyter notebook tutorials <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/bnn-pynq>`_ as a starting point, visualizing the model at intermediate
 steps and adding calls to new transformations as needed.
 Once you have a working flow, you can implement a command line entry for this
 by using the "advanced mode" described in the :ref:`command_line` section.
 
-
-
-
-System Requirements
-====================
-
-* Ubuntu 18.04 with ``bash`` installed
-* Docker `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_
-* A working Vivado 2019.1 or 2020.1 installation
-* A ``VIVADO_PATH`` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located)
-* *(optional)* A PYNQ board with a network connection, see `PYNQ board first-time setup`_ below
-* *(optional)* An Alveo board, and a working Vitis 2020.1 installation if you want to use Vitis and Alveo (see `Alveo first-time setup`_ below)
-
-We also recommend running the FINN compiler on a system with sufficiently
-strong hardware:
-
-* **RAM.** Depending on your target FPGA platform, your system must have sufficient RAM to be
-  able to run Vivado/Vitis synthesis for that part. See `this page <https://www.xilinx.com/products/design-tools/vivado/memory.html>`_
-  for more information. For targeting Zynq and Zynq UltraScale+ parts, at least 8 GB is recommended. Larger parts may require up to 16 GB.
-  For targeting Alveo parts with Vitis, at least 64 GB RAM is recommended.
-
-* **CPU.** FINN can parallelize HLS synthesis and several other operations for different
-  layers, so using a multi-core CPU is recommended. However, this should be balanced
-  against the memory usage as a high degree of parallelization will require more
-  memory. See the ``NUM_DEFAULT_WORKERS`` environment variable below for more on
-  how to control the degree of parallelization.
-
-* **Storage.** While going through the build steps, FINN will generate many files as part of
-  the process. For larger networks, you may need 10s of GB of space for the temporary
-  files generated during the build.
-  By default, these generated files will be placed under ``/tmp/finn_dev_<username>``.
-  You can override this location by using the ``FINN_HOST_BUILD_DIR`` environment
-  variable.
-  Mapping the generated file dir to a fast SSD will result in quicker builds.
-
-
 Running FINN in Docker
 ======================
-We use Docker extensively for developing and deploying FINN. If you are not familiar with Docker, there are many excellent `online resources <https://docker-curriculum.com/>`_ to get started. There is a Dockerfile in the root of the repository, as well as a `run-docker.sh` script that can be launched in the following modes:
+FINN runs only inside a Docker container, and comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources <https://docker-curriculum.com/>`_ to get started.
+You may want to review the :ref:`General FINN Docker tips` and :ref:`Environment variables` as well.
+If you want to use prebuilt images, read :ref:`Using a prebuilt image`.
+The ``run-docker.sh`` script can be launched in the following modes:
 
-Getting an interactive shell for development or experimentation
-***************************************************************
-.. warning:: Do not use ``sudo`` to launch the FINN Docker. Instead, setup Docker to run `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_
+Launch interactive shell
+************************
+Simply running ``bash ./run-docker.sh`` without any additional arguments will create a Docker container with all dependencies and give you a terminal which you can use for development and experimentation:
 
 ::
 
  bash ./run-docker.sh
 
-Simply running sh run-docker.sh without any additional arguments will clone the dependency repos, create a Docker container and give you a terminal with you can use for development for experimentation.
-If you want a new terminal on an already-running container, you can do this with `docker exec -it finn_dev_<username> bash`.
-
-.. warning:: The Docker container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the /workspace/finn folder (which is mounted from the host computer) or otherwise backed up.
-
-.. note:: **Develop from host, run inside container:** The FINN repository directory will be mounted from the host, so that you can use a text editor on your host computer to develop and the changes will be reflected directly inside the container.
 
-Command Line Entry
-*******************
+Launch a Build with ``build_dataflow``
+**************************************
 FINN is currently more compiler infrastructure than compiler, but we do offer
 a :ref:`command_line` entry for certain use-cases. These run a predefined flow
 or a user-defined flow from the command line as follows:
@@ -98,16 +73,17 @@ or a user-defined flow from the command line as follows:
  bash ./run-docker.sh build_custom <path/to/custom_build_dir/>
 
 
-Running the Jupyter notebooks
-*****************************
+Launch Jupyter notebooks
+************************
+FINN comes with numerous Jupyter notebook tutorials, which you can launch with:
+
 ::
 
   bash ./run-docker.sh notebook
 
 This will launch the `Jupyter notebook <https://jupyter.org/>`_ server inside a Docker container, and print a link on the terminal that you can open in your browser to run the FINN notebooks or create new ones.
 .. note:: The link will look something like this (the token you get will be different):
-http://127.0.0.1:8888/?token=f5c6bd32ae93ec103a88152214baedff4ce1850d81065bfc
-
+   http://127.0.0.1:8888/?token=f5c6bd32ae93ec103a88152214baedff4ce1850d81065bfc
+
 The ``run-docker.sh`` script forwards ports 8888 for Jupyter and 8081 for Netron, and launches the notebook server with appropriate arguments.
 
 
@@ -117,23 +93,50 @@ Environment variables
 Prior to running the `run-docker.sh` script, there are several environment variables you can set to configure certain aspects of FINN.
 These are summarized below:
 
-* ``VIVADO_PATH`` points to your Vivado installation on the host
-* (optional, for Vitis & Alveo only) ``VITIS_PATH``, and ``PLATFORM_REPO_PATHS`` respectively point to your Vitis installation, and the Vitis platform files.
-* (optional, for Vitis & Alveo only) ``XRT_DEB_VERSION`` specifies the .deb to be installed for XRT inside the container (see default value in ``run-docker.sh``).
+* (required) ``FINN_XILINX_PATH`` points to your Xilinx tools installation on the host (e.g. ``/opt/Xilinx``)
+* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2020.1``)
+* (required for Alveo) ``PLATFORM_REPO_PATHS`` points to the Vitis platform files (DSA).
+* (required for Alveo) ``XRT_DEB_VERSION`` specifies the .deb to be installed for XRT inside the container (see default value in ``run-docker.sh``).
+* (optional) ``NUM_DEFAULT_WORKERS`` (default 4) specifies the degree of parallelization for the transformations that can be run in parallel, potentially reducing build time
+* (optional) ``FINN_HOST_BUILD_DIR`` specifies which directory on the host will be used as the build directory. Defaults to ``/tmp/finn_dev_<username>``
 * (optional) ``JUPYTER_PORT`` (default 8888) changes the port for Jupyter inside Docker
 * (optional) ``JUPYTER_PASSWD_HASH`` (default "") Set the Jupyter notebook password hash. If set to empty string, token authentication will be used (token printed in terminal on launch).
 * (optional) ``LOCALHOST_URL`` (default localhost) sets the base URL for accessing e.g. Netron from inside the container. Useful when running FINN remotely.
 * (optional) ``NETRON_PORT`` (default 8081) changes the port for Netron inside Docker
-* (optional) ``NUM_DEFAULT_WORKERS`` (default 1) specifies the degree of parallelization for the transformations that can be run in parallel
 * (optional) ``PYNQ_BOARD`` or ``ALVEO_BOARD`` specifies the type of PYNQ/Alveo board used (see "supported hardware" below) for the test suite
 * (optional) ``PYNQ_IP`` and ``PYNQ_PORT`` (or ``ALVEO_IP`` and ``ALVEO_PORT``) specify ip address and port number to access the PYNQ board / Alveo target
 * (optional) ``PYNQ_USERNAME`` and ``PYNQ_PASSWORD`` (or ``ALVEO_USERNAME`` and ``ALVEO_PASSWORD``) specify the PYNQ board / Alveo host access credentials for the test suite. For PYNQ, password is always needed to run as sudo. For Alveo, you can leave the password empty and place your ssh private key in the ``finn/ssh_keys`` folder to use keypair authentication.
 * (optional) ``PYNQ_TARGET_DIR`` (or ``ALVEO_TARGET_DIR``) specifies the target dir on the PYNQ board / Alveo host for the test suite
-* (optional) ``FINN_HOST_BUILD_DIR`` specifies which directory on the host will be used as the build directory. Defaults to ``/tmp/finn_dev_<username>``
 * (optional) ``IMAGENET_VAL_PATH`` specifies the path to the ImageNet validation directory for tests.
+* (optional) ``FINN_DOCKER_PREBUILT`` (default 0) if set to 1, skips building the Docker image and instead uses the image tagged with ``FINN_DOCKER_TAG``.
+* (optional) ``FINN_DOCKER_TAG`` (autogenerated) specifies the Docker image tag to use.
+* (optional) ``FINN_DOCKER_RUN_AS_ROOT`` (default 0) if set to 1, runs the Docker container as root; by default it runs as the current user.
+* (optional) ``FINN_DOCKER_GPU`` (autodetected) if not 0 then expose all Nvidia GPUs or those selected by ``NVIDIA_VISIBLE_DEVICES`` to Docker container for accelerated DNN training. Requires `Nvidia Container Toolkit <https://github.com/NVIDIA/nvidia-docker>`_
+* (optional) ``FINN_DOCKER_EXTRA`` (default "") pass extra arguments to the ``docker run`` command when executing ``./run-docker.sh``
+* (optional) ``NVIDIA_VISIBLE_DEVICES`` (default "") specifies specific Nvidia GPUs to use in Docker container. Possible values are a comma-separated list of GPU UUID(s) or index(es) e.g. ``0,1,2``, ``all``, ``none``, or void/empty/unset.
+* (optional) ``DOCKER_BUILDKIT`` (default "1") enables `Docker BuildKit <https://docs.docker.com/develop/develop-images/build_enhancements/>`_ for faster Docker image rebuilding (recommended).
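+
+For example, to increase build parallelism and redirect the build directory to a fast local disk (the path here is only an example), set the variables before launching the script:
+
+::
+
+  export NUM_DEFAULT_WORKERS=8
+  export FINN_HOST_BUILD_DIR=/scratch/finn_dev
+  bash ./run-docker.sh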
+
+General FINN Docker tips
+************************
+* Several folders, including the root directory of the FINN compiler and the ``FINN_HOST_BUILD_DIR``, will be mounted into the Docker container and can be used to exchange files.
+* Do not use ``sudo`` to launch the FINN Docker container. Instead, set up Docker to run `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_.
+* If you want a new terminal on an already-running container, you can do this with `docker exec -it <name_of_container> bash`.
+* The container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the /workspace/finn folder (which is mounted from the host computer) or otherwise backed up.
+
+Using a prebuilt image
+**********************
+
+By default the ``run-docker.sh`` script tries to re-build the Docker image with each run. After the first run this should go quite fast thanks to Docker caching.
+If you are having trouble building the Docker image or need offline access, you can use prebuilt images by following these steps:
 
-Supported Hardware
-===================
+1. Pull a prebuilt Docker image with ``docker pull maltanar/finn:<tag>`` where ``<tag>`` can be ``dev_latest`` or ``main_latest``
+2. Set ``FINN_DOCKER_TAG`` to the name of the image you just pulled, e.g. ``FINN_DOCKER_TAG=maltanar/finn:dev_latest``
+3. Set ``FINN_DOCKER_PREBUILT=1``
+4. You can now launch the Docker image in all modes without re-building the image or requiring internet access, as in the example below.
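+
+For example, to pull and use the prebuilt ``dev_latest`` image:
+
+::
+
+  docker pull maltanar/finn:dev_latest
+  export FINN_DOCKER_TAG=maltanar/finn:dev_latest
+  export FINN_DOCKER_PREBUILT=1
+  bash ./run-docker.sh quicktest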
+
+
+Supported FPGA Hardware
+=======================
 **Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by  `PYNQ <https://pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards.
 As of FINN v0.4b we also have preliminary support for `Xilinx Alveo boards <https://www.xilinx.com/products/boards-and-kits/alveo.html>`_ using PYNQ and Vitis, see instructions below for Alveo setup.
 
@@ -178,4 +181,49 @@ On the host side:
 3. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation. *This must be the same path as the target's platform files (target step 2)*
 4. Set up the ``ALVEO_*`` environment variables accordingly for your target, see description of environment variables above.
 5. `Set up public key authentication <https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server>`_. Copy your private key to the ``finn/ssh_keys`` folder on the host to get password-less deployment and remote execution.
-5. Done! You can try the ``test_end2end_vitis`` tests in the FINN Docker to verify your setup, although this will take some time.
+6. Done! You can try the ``test_end2end_vitis`` tests in the FINN Docker to verify your setup, although this will take some time.
+
+Vivado/Vitis license
+*********************
+If you are targeting Xilinx FPGA parts that need specific licenses (non-WebPack), you can make these available to the
+FINN Docker container by passing extra arguments. To do this, you can use the ``FINN_DOCKER_EXTRA`` environment variable as follows:
+
+::
+
+  export FINN_DOCKER_EXTRA=" -v /path/to/licenses:/path/to/licenses -e XILINXD_LICENSE_FILE=/path/to/licenses "
+
+The above example mounts ``/path/to/licenses`` from the host into the same path on the Docker container, and sets the
+value of the ``XILINXD_LICENSE_FILE`` environment variable.
+
+System Requirements
+====================
+
+* Ubuntu 18.04 with ``bash`` installed
+* Docker `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_
+* A working Vivado 2020.1 installation
+* ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables correctly set, see `Quickstart`_
+* *(optional)* `Vivado/Vitis license`_ if targeting non-WebPack FPGA parts.
+* *(optional)* A PYNQ board with a network connection, see `PYNQ board first-time setup`_
+* *(optional)* An Alveo board, and a working Vitis 2020.1 installation if you want to use Vitis and Alveo (see `Alveo first-time setup`_ )
+
+We also recommend running the FINN compiler on a system with sufficiently
+strong hardware:
+
+* **RAM.** Depending on your target FPGA platform, your system must have sufficient RAM to be
+  able to run Vivado/Vitis synthesis for that part. See `this page <https://www.xilinx.com/products/design-tools/vivado/memory.html>`_
+  for more information. For targeting Zynq and Zynq UltraScale+ parts, at least 8 GB is recommended. Larger parts may require up to 16 GB.
+  For targeting Alveo parts with Vitis, at least 64 GB RAM is recommended.
+
+* **CPU.** FINN can parallelize HLS synthesis and several other operations for different
+  layers, so using a multi-core CPU is recommended. However, this should be balanced
+  against the memory usage as a high degree of parallelization will require more
+  memory. See the ``NUM_DEFAULT_WORKERS`` environment variable below for more on
+  how to control the degree of parallelization.
+
+* **Storage.** While going through the build steps, FINN will generate many files as part of
+  the process. For larger networks, you may need 10s of GB of space for the temporary
+  files generated during the build.
+  By default, these generated files will be placed under ``/tmp/finn_dev_<username>``.
+  You can override this location by using the ``FINN_HOST_BUILD_DIR`` environment
+  variable.
+  Mapping the generated file dir to a fast SSD will result in quicker builds.
diff --git a/docs/finn/index.rst b/docs/finn/index.rst
index 320cd88fe91af857c5a3948ef36a587ea305040f..751b105bb4ec35c880664e85a9550207e8a1f076 100644
--- a/docs/finn/index.rst
+++ b/docs/finn/index.rst
@@ -12,20 +12,20 @@ What is FINN?
 
 'FINN' is colloquially used to refer to two separate but highly related things:
 
-* The FINN **project**, which is an experimental framework from Xilinx Research Labs
-to explore deep neural network inference on FPGAs. It specifically targets
-quantized neural networks (QNNs), with emphasis on generating dataflow-style
-architectures customized for each network.
-The key components are illustrated in the figure above;
-including tools for training
-quantized neural networks (Brevitas), the FINN compiler, and the finn-hlslib
-Vivado HLS library of FPGA components for QNNs.
-Read more on the `FINN project homepage <https://xilinx.github.io/finn/>`_.
-
-* The FINN **compiler**, which this Read the Docs website is the documentation for.
-The compiler is a central part of the FINN project (above) that maps QNNs to
-dataflow-style FPGA architectures.
-You can find the FINN compiler in this `GitHub repository <https://github.com/Xilinx/finn>`_.
+*  The FINN **project**, which is an experimental framework from Xilinx Research Labs
+   to explore deep neural network inference on FPGAs. It specifically targets
+   quantized neural networks (QNNs), with emphasis on generating dataflow-style
+   architectures customized for each network.
+   The key components are illustrated in the figure above;
+   including tools for training
+   quantized neural networks (Brevitas), the FINN compiler, and the finn-hlslib
+   Vivado HLS library of FPGA components for QNNs.
+   Read more on the `FINN project homepage <https://xilinx.github.io/finn/>`_.
+
+*  The FINN **compiler**, which this Read the Docs website is the documentation for.
+   The compiler is a central part of the FINN project (above) that maps QNNs to
+   dataflow-style FPGA architectures.
+   You can find the FINN compiler in this `GitHub repository <https://github.com/Xilinx/finn>`_.
 
 
 More FINN Resources
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index 0fbc3cf72795005591994ddca0fa0d58b72622a8..9305f7840216f6d076a11337ddb3cfa588f1a062 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -4,12 +4,12 @@
 Internals
 *********
 
-Intermediate Representation: FINN-ONNX
-======================================
+Intermediate Representation: QONNX and FINN-ONNX
+================================================
 
 FINN uses `ONNX <https://github.com/onnx/onnx>`_ as an intermediate representation (IR) for neural networks. As such, almost every component inside FINN uses ONNX and its `Python API <https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md>`_, so you may want to familiarize yourself with how ONNX represents DNNs. Specifically, the `ONNX protobuf description <https://github.com/onnx/onnx/blob/master/onnx/onnx.proto>`_ (or its `human-readable documentation <https://github.com/onnx/onnx/blob/master/docs/IR.md>`_ and the `operator schemas <https://github.com/onnx/onnx/blob/master/docs/Operators.md>`_) are useful as reference documents. We also provide a Jupyter notebook that can help to get familiar with ONNX by showing how to work with a simple ONNX model in FINN, see chapter :ref:`tutorials` for details.
 
-.. note:: FINN uses ONNX is a specific way that we refer to as FINN-ONNX, and not all ONNX graphs are supported by FINN (and vice versa).
+.. note:: FINN supports two specialized variants of ONNX, called QONNX and FINN-ONNX; not all ONNX graphs are supported by FINN (and vice versa).
 
 Custom Quantization Annotations
 ===============================
diff --git a/docs/finn/source_code/finn.analysis.rst b/docs/finn/source_code/finn.analysis.rst
index 7312150657c86976638e73fdf2c0450160989a6a..1de42ac32bc62ce71e039f63168302b22711f454 100644
--- a/docs/finn/source_code/finn.analysis.rst
+++ b/docs/finn/source_code/finn.analysis.rst
@@ -23,6 +23,13 @@ finn.analysis.base
    :undoc-members:
    :show-inheritance:
 
+finn.analysis.inference\_cost
+-----------------------------
+
+.. automodule:: finn.analysis.inference_cost
+   :members:
+   :undoc-members:
+   :show-inheritance:
 
 finn.analysis.topology
 -----------------------------
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
index 7b4e7bfa05f895cd03aed2859576e07db28bd9f9..34a6285f227690c87c568855e7ca70ddb9b2764c 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
@@ -13,6 +13,23 @@ Base Class
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.addstreams\_batch
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.addstreams_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.channelwise\_op\_batch
+--------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.channelwise_op_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.custom\_op.fpgadataflow.convolutioninputgenerator
 -------------------------------------------------------------
 
@@ -21,6 +38,87 @@ finn.custom\_op.fpgadataflow.convolutioninputgenerator
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.convolutioninputgenerator1d
+-------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator1d
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.downsampler
+-----------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.downsampler
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.duplicatestreams\_batch
+--------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.duplicatestreams_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.fmpadding\_batch
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.fmpadding_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.globalaccpool\_batch
+--------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.globalaccpool_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.iodma
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.iodma
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.labelselect\_batch
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.labelselect_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.lookup
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.lookup
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.pool\_batch
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.pool_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.streamingdataflowpartition
+--------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.streamingdataflowpartition
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch
 ----------------------------------------------------------------------
 
@@ -61,6 +159,15 @@ finn.custom\_op.fpgadataflow.templates
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.thresholding\_batch
+--------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.thresholding_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.custom\_op.fpgadataflow.tlastmarker
 -----------------------------------------------
 
@@ -68,3 +175,19 @@ finn.custom\_op.fpgadataflow.tlastmarker
    :members:
    :undoc-members:
    :show-inheritance:
+
+finn.custom\_op.fpgadataflow.upsampler
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.upsampler
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.vector\_vector\_activate\_batch
+--------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.vector_vector_activate_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/finn/source_code/finn.custom_op.general.rst b/docs/finn/source_code/finn.custom_op.general.rst
index e86774a48e22b5af9e4d2995a4287a740b1c08e5..87749fd69e541e628436aa904c180338418addc1 100644
--- a/docs/finn/source_code/finn.custom_op.general.rst
+++ b/docs/finn/source_code/finn.custom_op.general.rst
@@ -5,6 +5,14 @@ Custom Op - General
 General Custom Ops
 ===================
 
+finn.custom\_op.general.bipolar_quant
+--------------------------------------
+
+.. automodule:: finn.custom_op.general.bipolar_quant
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.custom\_op.general.debugmarker
 -----------------------------------
 
@@ -13,6 +21,14 @@ finn.custom\_op.general.debugmarker
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.general.genericpartition
+-----------------------------------------
+
+.. automodule:: finn.custom_op.general.genericpartition
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.custom\_op.general.im2col
 ------------------------------
 
@@ -37,6 +53,14 @@ finn.custom\_op.general.multithreshold
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.general.quant
+------------------------------
+
+.. automodule:: finn.custom_op.general.quant
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
 finn.custom\_op.general.quantavgpool2d
 --------------------------------------
 
@@ -45,13 +69,13 @@ finn.custom\_op.general.quantavgpool2d
   :undoc-members:
   :show-inheritance:
 
-finn.custom\_op.general.streamingdataflowpartition
----------------------------------------------------
+finn.custom\_op.general.trunc
+------------------------------
 
-.. automodule:: finn.custom_op.general.streamingdataflowpartition
-   :members:
-   :undoc-members:
-   :show-inheritance:
+.. automodule:: finn.custom_op.general.trunc
+  :members:
+  :undoc-members:
+  :show-inheritance:
 
 finn.custom\_op.general.xnorpopcount
 -------------------------------------
diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
index 42bc7fb5315756b924e0d1cce58ca4e110bda824..b1e7075bdcfb675a894f3e66b61d59117e4f078d 100644
--- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
@@ -62,6 +62,14 @@ finn.transformation.fpgadataflow.create\_stitched\_ip
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.fpgadataflow.externalize\_params
+------------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.externalize_params
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.transformation.fpgadataflow.floorplan
 ----------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.transformation.qonnx.rst b/docs/finn/source_code/finn.transformation.qonnx.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8320e19efb81dd5a52f750e22e280f41070bf48c
--- /dev/null
+++ b/docs/finn/source_code/finn.transformation.qonnx.rst
@@ -0,0 +1,51 @@
+************************
+Transformation - QONNX
+************************
+
+Transformation (QONNX)
+===========================
+
+.. automodule:: finn.transformation.qonnx
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.transformation.qonnx.convert\_qonnx\_to\_finn
+---------------------------------------------------
+
+.. automodule:: finn.transformation.qonnx.convert_qonnx_to_finn
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.transformation.qonnx.fold\_quant\_weights
+-----------------------------------------------
+
+.. automodule:: finn.transformation.qonnx.fold_quant_weights
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.transformation.qonnx.infer\_quant\_avg\_pool\_2d
+------------------------------------------------------
+
+.. automodule:: finn.transformation.qonnx.infer_quant_avg_pool_2d
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.transformation.qonnx.qonnx\_activation\_handlers
+-------------------------------------------------------
+
+.. automodule:: finn.transformation.qonnx.qonnx_activation_handlers
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.transformation.qonnx.quant\_act\_to\_multithreshold
+---------------------------------------------------------
+
+.. automodule:: finn.transformation.qonnx.quant_act_to_multithreshold
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst
index aeb0d7614222740315633f7658cab9cc7e75490b..cffb0fd0f9e963c02a6986e47e1654951ad3bab0 100644
--- a/docs/finn/source_code/finn.transformation.rst
+++ b/docs/finn/source_code/finn.transformation.rst
@@ -11,6 +11,7 @@ Submodules
    :maxdepth: 2
 
    finn.transformation.fpgadataflow
+   finn.transformation.qonnx
    finn.transformation.streamline
 
 Transformation Passes
@@ -40,6 +41,14 @@ finn.transformation.bipolar\_to\_xnor
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.change\_3d\_tensors\_to\_4d
+------------------------------------------------
+
+.. automodule:: finn.transformation.change_3d_tensors_to_4d
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
 finn.transformation.change\_datalayout
 --------------------------------------------
 
@@ -48,6 +57,13 @@ finn.transformation.change\_datalayout
   :undoc-members:
   :show-inheritance:
 
+finn.transformation.create\_generic\_partitions
+------------------------------------------------
+
+.. automodule:: finn.transformation.create_generic_partitions
+  :members:
+  :undoc-members:
+  :show-inheritance:
 
 finn.transformation.double\_to\_single\_float
 ----------------------------------------------------
@@ -57,6 +73,23 @@ finn.transformation.double\_to\_single\_float
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.extend\_partition
+------------------------------------------
+
+.. automodule:: finn.transformation.extend_partition
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.transformation.extract\_conv\_bias
+------------------------------------------
+
+.. automodule:: finn.transformation.extract_conv_bias
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.transformation.fold\_constants
 ------------------------------------------
 
@@ -65,6 +98,14 @@ finn.transformation.fold\_constants
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.gemm\_to\_matmul
+------------------------------------------
+
+.. automodule:: finn.transformation.gemm_to_matmul
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.transformation.general
 ----------------------------------
 
@@ -113,6 +154,13 @@ finn.transformation.lower\_convs\_to\_matmul
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.make\_input\_chanlast
+------------------------------------------
+
+.. automodule:: finn.transformation.make_input_chanlast
+  :members:
+  :undoc-members:
+  :show-inheritance:
 
 finn.transformation.merge\_onnx\_models
 ----------------------------------------
@@ -130,3 +178,11 @@ finn.transformation.move\_reshape
    :members:
    :undoc-members:
    :show-inheritance:
+
+finn.transformation.remove
+-------------------------------------
+
+.. automodule:: finn.transformation.remove
+  :members:
+  :undoc-members:
+  :show-inheritance:
diff --git a/docs/finn/source_code/finn.transformation.streamline.rst b/docs/finn/source_code/finn.transformation.streamline.rst
index f43d6d12314d3bad38f189d2831e21447f10cf10..9ed4bbe1d8c6b12c67e1c0c2927e8b7067410a9c 100644
--- a/docs/finn/source_code/finn.transformation.streamline.rst
+++ b/docs/finn/source_code/finn.transformation.streamline.rst
@@ -26,13 +26,6 @@ finn.transformation.streamline.collapse\_repeated
    :undoc-members:
    :show-inheritance:
 
-finn.transformation.streamline.remove
--------------------------------------
-
-.. automodule:: finn.transformation.streamline.remove
-  :members:
-  :undoc-members:
-  :show-inheritance:
 
 finn.transformation.streamline.reorder
 ---------------------------------------------
diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst
index 82e4bf3261582c9be622cbe3f15af38ba5e3fa41..62b72c2ac84567b20fee73a16e82b5857d698c9d 100644
--- a/docs/finn/source_code/finn.util.rst
+++ b/docs/finn/source_code/finn.util.rst
@@ -72,6 +72,15 @@ finn.util.onnx
    :undoc-members:
    :show-inheritance:
 
+finn.util.platforms
+--------------------
+
+.. automodule:: finn.util.platforms
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.util.pytorch
 ------------------
 
diff --git a/finn-rtllib/memstream/component.xml b/finn-rtllib/memstream/component.xml
index 3d6767abfc11eb114ddb084f1f7275f7a93d0607..1e5b710dc86bde4d442ce9e83b188aeed24388c5 100644
--- a/finn-rtllib/memstream/component.xml
+++ b/finn-rtllib/memstream/component.xml
@@ -1662,9 +1662,27 @@
   <spirit:vendorExtensions>
     <xilinx:coreExtensions>
       <xilinx:supportedFamilies>
-        <xilinx:family xilinx:lifeCycle="Production">zynq</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">virtexuplusHBM</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Beta">aartix7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Beta">akintex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Beta">artix7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Beta">artix7l</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Beta">azynq</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">kintex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">kintex7l</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">kintexu</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">kintexuplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qkintex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qkintex7l</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qvirtex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qzynq</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qzynqplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">versal</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtexu</xilinx:family>
         <xilinx:family xilinx:lifeCycle="Production">virtexuplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtexuplusHBM</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtexupluse58g</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">zynq</xilinx:family>
         <xilinx:family xilinx:lifeCycle="Production">zynquplus</xilinx:family>
       </xilinx:supportedFamilies>
       <xilinx:taxonomies>
diff --git a/finn-rtllib/memstream/hdl/mux.v b/finn-rtllib/memstream/hdl/mux.v
index c5b89aeb4e7eb7b2858d062c18b693d9bd685fb2..f7087f9735771a73aa532ae19baf18569e9de663 100644
--- a/finn-rtllib/memstream/hdl/mux.v
+++ b/finn-rtllib/memstream/hdl/mux.v
@@ -1,44 +1,44 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module mux
-#(
-    parameter NINPUTS = 1,
-	parameter WIDTH = 16
-)
-(
-	input [NINPUTS*WIDTH-1:0] in,
-	output [WIDTH-1:0] out,
-	input [$clog2(NINPUTS)-1:0] sel
-);
-
-assign out = in >> (sel*WIDTH);
-
-endmodule
\ No newline at end of file
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+module mux
+#(
+    parameter NINPUTS = 1,
+	parameter WIDTH = 16
+)
+(
+	input [NINPUTS*WIDTH-1:0] in,
+	output [WIDTH-1:0] out,
+	input [$clog2(NINPUTS)-1:0] sel
+);
+
+assign out = in >> (sel*WIDTH);
+
+endmodule
diff --git a/finn-rtllib/memstream/sim/gen_memblocks.sh b/finn-rtllib/memstream/sim/gen_memblocks.sh
index 05962f7be8fe4afd7790640cf4d280600dcf43d1..b6e6b656ad1ea7846666108a1d4b79eae295490f 100644
--- a/finn-rtllib/memstream/sim/gen_memblocks.sh
+++ b/finn-rtllib/memstream/sim/gen_memblocks.sh
@@ -36,4 +36,4 @@ for (( i=0; i<$NBLOCKS; i++ ))
 do
     START=$(( 1 + $i * 1024 ))
     tail -n +$START $1 | head -n 1024 >> memblock_$i.dat
-done
\ No newline at end of file
+done
diff --git a/finn-rtllib/memstream/sim/tb_memstream.v b/finn-rtllib/memstream/sim/tb_memstream.v
index d63fa30046d7c5f2c50f509174b4937374e70c13..ad3efad5bd70c37a860ddb0ec5bff1c2e72c15f0 100644
--- a/finn-rtllib/memstream/sim/tb_memstream.v
+++ b/finn-rtllib/memstream/sim/tb_memstream.v
@@ -1,369 +1,369 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-`timescale 1ns/10ps
-
-module tb_memstream;
-
-//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
-parameter CONFIG_EN = 1;
-parameter NSTREAMS = 4;//1 up to 6
-
-parameter MEM_DEPTH = 9216;
-parameter MEM_WIDTH = 32;
-parameter MEM_INIT = "./";
-parameter MEM_CHECK = "golden.dat";
-
-//widths per stream
-parameter STRM0_WIDTH = 32;
-parameter STRM1_WIDTH = 32;
-parameter STRM2_WIDTH = 32;
-parameter STRM3_WIDTH = 32;
-parameter STRM4_WIDTH = 1;
-parameter STRM5_WIDTH = 1;
-
-//depths per stream
-parameter STRM0_DEPTH = 2304;
-parameter STRM1_DEPTH = 2304;
-parameter STRM2_DEPTH = 2304;
-parameter STRM3_DEPTH = 2304;
-parameter STRM4_DEPTH = 1;
-parameter STRM5_DEPTH = 1;
-
-//offsets for each stream
-parameter STRM0_OFFSET = 0;
-parameter STRM1_OFFSET = 2304;
-parameter STRM2_OFFSET = 4608;
-parameter STRM3_OFFSET = 6912;
-parameter STRM4_OFFSET = 0;
-parameter STRM5_OFFSET = 0;
-
-
-reg clk;
-reg rst;
-
-reg [31:0] config_address = 0;
-reg config_ce = 0;
-reg config_we = 0;
-reg [31:0] config_d0 = 0;
-wire [31:0] config_q0;
-
-//multiple wire AXI Streams
-reg m_axis_0_afull;
-reg m_axis_0_tready;
-wire m_axis_0_tvalid;
-wire [STRM0_WIDTH-1:0] m_axis_0_tdata;
-
-reg m_axis_1_afull;
-reg m_axis_1_tready;
-wire m_axis_1_tvalid;
-wire [STRM1_WIDTH-1:0] m_axis_1_tdata;
-
-reg m_axis_2_afull;
-reg m_axis_2_tready;
-wire m_axis_2_tvalid;
-wire [STRM2_WIDTH-1:0] m_axis_2_tdata;
-
-reg m_axis_3_afull;
-reg m_axis_3_tready;
-wire m_axis_3_tvalid;
-wire [STRM3_WIDTH-1:0] m_axis_3_tdata;
-
-reg m_axis_4_afull;
-reg m_axis_4_tready;
-wire m_axis_4_tvalid;
-wire [STRM4_WIDTH-1:0] m_axis_4_tdata;
-
-reg m_axis_5_afull;
-reg m_axis_5_tready;
-wire m_axis_5_tvalid;
-wire [STRM5_WIDTH-1:0] m_axis_5_tdata;
-
-reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0];
-integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5;
-integer done = 0;
-reg [5:0] rng;
-
-//clock
-initial begin
-    clk = 0;
-    forever #5 clk = ~clk;
-end
-
-initial begin
-    rst = 1;
-	config_ce = 0;
-    m_axis_0_afull = 0;
-    m_axis_1_afull = 0;
-    m_axis_2_afull = 0;
-    m_axis_3_afull = 0;
-    m_axis_4_afull = 0;
-    m_axis_5_afull = 0;
-    m_axis_0_tready = 1;
-    m_axis_1_tready = 1;
-    m_axis_2_tready = 1;
-    m_axis_3_tready = 1;
-    m_axis_4_tready = 1;
-    m_axis_5_tready = 1;
-    repeat(100) @(negedge clk);
-    rst = 0;
-    #100
-    fork
-	    begin
-		    $display("Starting to generate random AFULL");
-			while(~done) begin
-			    rng = $random;
-				m_axis_0_afull = rng[0];
-				m_axis_1_afull = rng[1];
-				m_axis_2_afull = rng[2];
-				m_axis_3_afull = rng[3];
-				m_axis_4_afull = rng[4];
-				m_axis_5_afull = rng[5];
-				@(negedge clk);
-			end
-		end
-	join
-end
-
-
-//DUT
-memstream
-#(
-    CONFIG_EN,
-    NSTREAMS,
-    MEM_DEPTH,
-    MEM_WIDTH,
-    MEM_INIT,
-    
-    //widths per stream
-    STRM0_WIDTH,
-    STRM1_WIDTH,
-    STRM2_WIDTH,
-    STRM3_WIDTH,
-    STRM4_WIDTH,
-    STRM5_WIDTH,
-    
-    //depths per stream
-    STRM0_DEPTH,
-    STRM1_DEPTH,
-    STRM2_DEPTH,
-    STRM3_DEPTH,
-    STRM4_DEPTH,
-    STRM5_DEPTH,
-    
-    //offsets for each stream
-    STRM0_OFFSET,
-    STRM1_OFFSET,
-    STRM2_OFFSET,
-    STRM3_OFFSET,
-    STRM4_OFFSET,
-    STRM5_OFFSET
-)
-dut
-(
-    clk,
-    ~rst,
-
-    //optional AXI-Lite interface
-    config_address,
-    config_ce,
-    config_we,
-    config_d0,
-    config_q0,
-
-    //multiple output AXI Streams
-    m_axis_0_afull,
-    m_axis_0_tready,
-    m_axis_0_tvalid,
-    m_axis_0_tdata,
-    
-    m_axis_1_afull,
-    m_axis_1_tready,
-    m_axis_1_tvalid,
-    m_axis_1_tdata,
-    
-    m_axis_2_afull,
-    m_axis_2_tready,
-    m_axis_2_tvalid,
-    m_axis_2_tdata,
-    
-    m_axis_3_afull,
-    m_axis_3_tready,
-    m_axis_3_tvalid,
-    m_axis_3_tdata,
-    
-    m_axis_4_afull,
-    m_axis_4_tready,
-    m_axis_4_tvalid,
-    m_axis_4_tdata,
-    
-    m_axis_5_afull,
-    m_axis_5_tready,
-    m_axis_5_tvalid,
-    m_axis_5_tdata
-    
-
-);
-
-//stream checkers
-initial begin
-    ptr0 = STRM0_OFFSET;
-	ptr1 = STRM1_OFFSET;
-	ptr2 = STRM2_OFFSET;
-	ptr3 = STRM3_OFFSET;
-	ptr4 = STRM4_OFFSET;
-	ptr5 = STRM5_OFFSET;
-    fork
-		//check stream 0
-	    begin
-		    $display("Starting stream 0 checker");
-		    while(~done & (NSTREAMS > 0)) begin
-				@(negedge clk);
-				if(m_axis_0_tvalid) begin
-					if(m_axis_0_tdata != golden[ptr0]) begin
-						$display("Mismatch on stream 0");
-						$stop();
-					end
-					//increment pointer
-					ptr0 = ptr0 + 1;
-					//rewind pointer if it's reached end
-					if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH))
-				    ptr0 = STRM0_OFFSET;
-				end
-			end
-		end
-		//check stream 1
-	    begin
-		    $display("Starting stream 1 checker");
-		    while(~done & (NSTREAMS > 1)) begin
-				@(negedge clk);
-				if(m_axis_1_tvalid) begin
-					if(m_axis_1_tdata != golden[ptr1]) begin
-						$display("Mismatch on stream 1");
-						$stop();
-					end
-					//increment pointer
-					ptr1 = ptr1 + 1;
-					//rewind pointer if it's reached end
-					if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH))
-						ptr1 = STRM1_OFFSET;
-				end
-			end
-		end
-		
-		//check stream 2
-	    begin
-		    $display("Starting stream 2 checker");
-		    while(~done & (NSTREAMS > 2)) begin
-				@(negedge clk);
-				if(m_axis_2_tvalid) begin
-					if(m_axis_2_tdata != golden[ptr2]) begin
-						$display("Mismatch on stream 2");
-						$stop();
-					end
-					//increment pointer
-					ptr2 = ptr2 + 1;
-					//rewind pointer if it's reached end
-					if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH))
-						ptr2 = STRM2_OFFSET;
-				end
-			end
-		end
-		//check stream 3
-	    begin
-		    $display("Starting stream 3 checker");
-		    while(~done & (NSTREAMS > 3)) begin
-				@(negedge clk);
-				if(m_axis_3_tvalid) begin
-					if(m_axis_3_tdata != golden[ptr3]) begin
-						$display("Mismatch on stream 3");
-						$stop();
-					end
-					//increment pointer
-					ptr3 = ptr3 + 1;
-					//rewind pointer if it's reached end
-					if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH))
-						ptr3 = STRM3_OFFSET;
-				end
-			end
-		end
-		//check stream 4
-	    begin
-		    $display("Starting stream 4 checker");
-		    while(~done & (NSTREAMS > 4)) begin
-				@(negedge clk);
-				if(m_axis_4_tvalid) begin
-					if(m_axis_4_tdata != golden[ptr4]) begin
-						$display("Mismatch on stream 4");
-						$stop();
-					end
-					//increment pointer
-					ptr4 = ptr4 + 1;
-					//rewind pointer if it's reached end
-					if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH))
-						ptr4 = STRM4_OFFSET;
-				end
-			end
-		end
-		//check stream 5
-	    begin
-		    $display("Starting stream 5 checker");
-		    while(~done & (NSTREAMS > 5)) begin
-				@(negedge clk);
-				if(m_axis_5_tvalid) begin
-					if(m_axis_5_tdata != golden[ptr5]) begin
-						$display("Mismatch on stream 5");
-						$stop();
-					end
-					//increment pointer
-					ptr5 = ptr5 + 1;
-					//rewind pointer if it's reached end
-					if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH))
-						ptr5 = STRM5_OFFSET;
-				end
-			end
-		end
-	join
-end
-
-initial begin
-    done = 0;
-	$readmemh(MEM_CHECK,golden);
-//    $dumpfile("wave.vcd");
-//    $dumpvars(0,tb_memstream);
-    @(negedge rst);
-    #10000000
-	$display("Test done!");
-	done = 1;
-	#1000
-    $finish();
-end
-
-endmodule
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+`timescale 1ns/10ps
+
+module tb_memstream;
+
+//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
+parameter CONFIG_EN = 1;
+parameter NSTREAMS = 4;//1 up to 6
+
+parameter MEM_DEPTH = 9216;
+parameter MEM_WIDTH = 32;
+parameter MEM_INIT = "./";
+parameter MEM_CHECK = "golden.dat";
+
+//widths per stream
+parameter STRM0_WIDTH = 32;
+parameter STRM1_WIDTH = 32;
+parameter STRM2_WIDTH = 32;
+parameter STRM3_WIDTH = 32;
+parameter STRM4_WIDTH = 1;
+parameter STRM5_WIDTH = 1;
+
+//depths per stream
+parameter STRM0_DEPTH = 2304;
+parameter STRM1_DEPTH = 2304;
+parameter STRM2_DEPTH = 2304;
+parameter STRM3_DEPTH = 2304;
+parameter STRM4_DEPTH = 1;
+parameter STRM5_DEPTH = 1;
+
+//offsets for each stream
+parameter STRM0_OFFSET = 0;
+parameter STRM1_OFFSET = 2304;
+parameter STRM2_OFFSET = 4608;
+parameter STRM3_OFFSET = 6912;
+parameter STRM4_OFFSET = 0;
+parameter STRM5_OFFSET = 0;
+
+
+reg clk;
+reg rst;
+
+reg [31:0] config_address = 0;
+reg config_ce = 0;
+reg config_we = 0;
+reg [31:0] config_d0 = 0;
+wire [31:0] config_q0;
+
+//multiple wire AXI Streams
+reg m_axis_0_afull;
+reg m_axis_0_tready;
+wire m_axis_0_tvalid;
+wire [STRM0_WIDTH-1:0] m_axis_0_tdata;
+
+reg m_axis_1_afull;
+reg m_axis_1_tready;
+wire m_axis_1_tvalid;
+wire [STRM1_WIDTH-1:0] m_axis_1_tdata;
+
+reg m_axis_2_afull;
+reg m_axis_2_tready;
+wire m_axis_2_tvalid;
+wire [STRM2_WIDTH-1:0] m_axis_2_tdata;
+
+reg m_axis_3_afull;
+reg m_axis_3_tready;
+wire m_axis_3_tvalid;
+wire [STRM3_WIDTH-1:0] m_axis_3_tdata;
+
+reg m_axis_4_afull;
+reg m_axis_4_tready;
+wire m_axis_4_tvalid;
+wire [STRM4_WIDTH-1:0] m_axis_4_tdata;
+
+reg m_axis_5_afull;
+reg m_axis_5_tready;
+wire m_axis_5_tvalid;
+wire [STRM5_WIDTH-1:0] m_axis_5_tdata;
+
+reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0];
+integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5;
+integer done = 0;
+reg [5:0] rng;
+
+//clock
+initial begin
+    clk = 0;
+    forever #5 clk = ~clk;
+end
+
+initial begin
+    rst = 1;
+	config_ce = 0;
+    m_axis_0_afull = 0;
+    m_axis_1_afull = 0;
+    m_axis_2_afull = 0;
+    m_axis_3_afull = 0;
+    m_axis_4_afull = 0;
+    m_axis_5_afull = 0;
+    m_axis_0_tready = 1;
+    m_axis_1_tready = 1;
+    m_axis_2_tready = 1;
+    m_axis_3_tready = 1;
+    m_axis_4_tready = 1;
+    m_axis_5_tready = 1;
+    repeat(100) @(negedge clk);
+    rst = 0;
+    #100
+    fork
+	    begin
+		    $display("Starting to generate random AFULL");
+			while(~done) begin
+			    rng = $random;
+				m_axis_0_afull = rng[0];
+				m_axis_1_afull = rng[1];
+				m_axis_2_afull = rng[2];
+				m_axis_3_afull = rng[3];
+				m_axis_4_afull = rng[4];
+				m_axis_5_afull = rng[5];
+				@(negedge clk);
+			end
+		end
+	join
+end
+
+
+//DUT
+memstream
+#(
+    CONFIG_EN,
+    NSTREAMS,
+    MEM_DEPTH,
+    MEM_WIDTH,
+    MEM_INIT,
+
+    //widths per stream
+    STRM0_WIDTH,
+    STRM1_WIDTH,
+    STRM2_WIDTH,
+    STRM3_WIDTH,
+    STRM4_WIDTH,
+    STRM5_WIDTH,
+
+    //depths per stream
+    STRM0_DEPTH,
+    STRM1_DEPTH,
+    STRM2_DEPTH,
+    STRM3_DEPTH,
+    STRM4_DEPTH,
+    STRM5_DEPTH,
+
+    //offsets for each stream
+    STRM0_OFFSET,
+    STRM1_OFFSET,
+    STRM2_OFFSET,
+    STRM3_OFFSET,
+    STRM4_OFFSET,
+    STRM5_OFFSET
+)
+dut
+(
+    clk,
+    ~rst,
+
+    //optional AXI-Lite interface
+    config_address,
+    config_ce,
+    config_we,
+    config_d0,
+    config_q0,
+
+    //multiple output AXI Streams
+    m_axis_0_afull,
+    m_axis_0_tready,
+    m_axis_0_tvalid,
+    m_axis_0_tdata,
+
+    m_axis_1_afull,
+    m_axis_1_tready,
+    m_axis_1_tvalid,
+    m_axis_1_tdata,
+
+    m_axis_2_afull,
+    m_axis_2_tready,
+    m_axis_2_tvalid,
+    m_axis_2_tdata,
+
+    m_axis_3_afull,
+    m_axis_3_tready,
+    m_axis_3_tvalid,
+    m_axis_3_tdata,
+
+    m_axis_4_afull,
+    m_axis_4_tready,
+    m_axis_4_tvalid,
+    m_axis_4_tdata,
+
+    m_axis_5_afull,
+    m_axis_5_tready,
+    m_axis_5_tvalid,
+    m_axis_5_tdata
+
+);
+
+//stream checkers
+initial begin
+    ptr0 = STRM0_OFFSET;
+    ptr1 = STRM1_OFFSET;
+    ptr2 = STRM2_OFFSET;
+    ptr3 = STRM3_OFFSET;
+    ptr4 = STRM4_OFFSET;
+    ptr5 = STRM5_OFFSET;
+    fork
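+        // six concurrent checkers run until 'done'; tready is held high throughout, so data is sampled whenever tvalid is asserted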
+		//check stream 0
+	    begin
+		    $display("Starting stream 0 checker");
+            while(!done && (NSTREAMS > 0)) begin
+				@(negedge clk);
+				if(m_axis_0_tvalid) begin
+					if(m_axis_0_tdata != golden[ptr0]) begin
+						$display("Mismatch on stream 0");
+						$stop();
+					end
+					//increment pointer
+					ptr0 = ptr0 + 1;
+					//rewind pointer if it's reached end
+					if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH))
+				    ptr0 = STRM0_OFFSET;
+				end
+			end
+		end
+		//check stream 1
+	    begin
+		    $display("Starting stream 1 checker");
+            while(!done && (NSTREAMS > 1)) begin
+				@(negedge clk);
+				if(m_axis_1_tvalid) begin
+					if(m_axis_1_tdata != golden[ptr1]) begin
+						$display("Mismatch on stream 1");
+						$stop();
+					end
+					//increment pointer
+					ptr1 = ptr1 + 1;
+					//rewind pointer if it's reached end
+					if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH))
+						ptr1 = STRM1_OFFSET;
+				end
+			end
+		end
+
+		//check stream 2
+	    begin
+		    $display("Starting stream 2 checker");
+            while(!done && (NSTREAMS > 2)) begin
+				@(negedge clk);
+				if(m_axis_2_tvalid) begin
+					if(m_axis_2_tdata != golden[ptr2]) begin
+						$display("Mismatch on stream 2");
+						$stop();
+					end
+					//increment pointer
+					ptr2 = ptr2 + 1;
+					//rewind pointer if it's reached end
+					if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH))
+						ptr2 = STRM2_OFFSET;
+				end
+			end
+		end
+		//check stream 3
+	    begin
+		    $display("Starting stream 3 checker");
+            while(!done && (NSTREAMS > 3)) begin
+				@(negedge clk);
+				if(m_axis_3_tvalid) begin
+					if(m_axis_3_tdata != golden[ptr3]) begin
+						$display("Mismatch on stream 3");
+						$stop();
+					end
+					//increment pointer
+					ptr3 = ptr3 + 1;
+					//rewind pointer if it's reached end
+					if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH))
+						ptr3 = STRM3_OFFSET;
+				end
+			end
+		end
+		//check stream 4
+	    begin
+		    $display("Starting stream 4 checker");
+            while(!done && (NSTREAMS > 4)) begin
+				@(negedge clk);
+				if(m_axis_4_tvalid) begin
+					if(m_axis_4_tdata != golden[ptr4]) begin
+						$display("Mismatch on stream 4");
+						$stop();
+					end
+					//increment pointer
+					ptr4 = ptr4 + 1;
+					//rewind pointer if it's reached end
+					if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH))
+						ptr4 = STRM4_OFFSET;
+				end
+			end
+		end
+		//check stream 5
+	    begin
+		    $display("Starting stream 5 checker");
+            while(!done && (NSTREAMS > 5)) begin
+				@(negedge clk);
+				if(m_axis_5_tvalid) begin
+					if(m_axis_5_tdata != golden[ptr5]) begin
+						$display("Mismatch on stream 5");
+						$stop();
+					end
+					//increment pointer
+					ptr5 = ptr5 + 1;
+					//rewind pointer if it's reached end
+					if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH))
+						ptr5 = STRM5_OFFSET;
+				end
+			end
+		end
+	join
+end
+
+initial begin
+    done = 0;
+    $readmemh(MEM_CHECK, golden);
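+    // golden reference: load the same image the DUT memory was initialized with (MEM_CHECK should mirror MEM_INIT)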
+//    $dumpfile("wave.vcd");
+//    $dumpvars(0,tb_memstream);
+    @(negedge rst);
+    #10000000
+    $display("Test done!");
+    done = 1;
+    #1000
+    $finish();
+end
+
+endmodule
diff --git a/finn-rtllib/memstream/sim/tb_memstream_writes.v b/finn-rtllib/memstream/sim/tb_memstream_writes.v
index a6ac747e967e594ac010f25a2827ebf7a7fcaa0f..c66807454b9a7f8ff7ab7008a504938740fb03a0 100644
--- a/finn-rtllib/memstream/sim/tb_memstream_writes.v
+++ b/finn-rtllib/memstream/sim/tb_memstream_writes.v
@@ -1,486 +1,486 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-`timescale 1ns/10ps
-
-module tb_memstream_writes;
-
-//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
-parameter CONFIG_EN = 1;
-parameter NSTREAMS = 2;//1 up to 6
-
-parameter MEM_DEPTH = 40;
-parameter MEM_WIDTH = 70;
-
-//widths per stream
-parameter STRM0_WIDTH = 70;
-parameter STRM1_WIDTH = 32;
-parameter STRM2_WIDTH = 32;
-parameter STRM3_WIDTH = 32;
-parameter STRM4_WIDTH = 1;
-parameter STRM5_WIDTH = 1;
-
-//depths per stream
-parameter STRM0_DEPTH = 20;
-parameter STRM1_DEPTH = 20;
-parameter STRM2_DEPTH = 2304;
-parameter STRM3_DEPTH = 2304;
-parameter STRM4_DEPTH = 1;
-parameter STRM5_DEPTH = 1;
-
-//offsets for each stream
-parameter STRM0_OFFSET = 0;
-parameter STRM1_OFFSET = 20;
-parameter STRM2_OFFSET = 4608;
-parameter STRM3_OFFSET = 6912;
-parameter STRM4_OFFSET = 0;
-parameter STRM5_OFFSET = 0;
-
-
-reg clk;
-reg rst;
-
-wire        awready;
-reg         awvalid;
-reg [31:0]  awaddr;
-reg [2:0]   awprot;
-//write data
-wire        wready;
-reg         wvalid;
-reg [31:0]  wdata;
-reg [3:0]   wstrb;
-//burst response
-reg         bready;
-wire        bvalid;
-wire [1:0]  bresp;
-
-//Read channels
-//read address
-wire        arready;
-reg         arvalid;
-reg [31:0]  araddr;
-reg [2:0]   arprot;
-//read data
-reg         rready;
-wire        rvalid;
-wire [1:0]  rresp;
-wire [31:0] rdata;
-
-//multiple wire AXI Streams
-reg m_axis_0_afull;
-reg m_axis_0_tready;
-wire m_axis_0_tvalid;
-wire [STRM0_WIDTH-1:0] m_axis_0_tdata;
-
-reg m_axis_1_afull;
-reg m_axis_1_tready;
-wire m_axis_1_tvalid;
-wire [STRM1_WIDTH-1:0] m_axis_1_tdata;
-
-reg m_axis_2_afull;
-reg m_axis_2_tready;
-wire m_axis_2_tvalid;
-wire [STRM2_WIDTH-1:0] m_axis_2_tdata;
-
-reg m_axis_3_afull;
-reg m_axis_3_tready;
-wire m_axis_3_tvalid;
-wire [STRM3_WIDTH-1:0] m_axis_3_tdata;
-
-reg m_axis_4_afull;
-reg m_axis_4_tready;
-wire m_axis_4_tvalid;
-wire [STRM4_WIDTH-1:0] m_axis_4_tdata;
-
-reg m_axis_5_afull;
-reg m_axis_5_tready;
-wire m_axis_5_tvalid;
-wire [STRM5_WIDTH-1:0] m_axis_5_tdata;
-
-reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0];
-reg [MEM_WIDTH-1:0] gword;
-integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5;
-integer done = 0;
-integer i, j;
-reg [5:0] rng;
-
-parameter NFOLDS_PER_WORD = (MEM_WIDTH+31)/32;
-
-task axi_write;
-    input [MEM_WIDTH-1:0] data;
-    input [31:0] adr;
-    begin
-        for(j=0; j<(1<<$clog2(NFOLDS_PER_WORD)); j=j+1) begin
-            @(negedge clk);
-            awvalid = 1;
-            wvalid = 1;
-            wdata = data>>(j*32);
-            awaddr = (adr*(1<<$clog2(NFOLDS_PER_WORD))+j)*4;
-            fork
-                begin
-                    @(posedge awready);
-                    @(posedge clk) awvalid = 0;
-                end
-                begin
-                    @(posedge wready);
-                    @(posedge clk) wvalid = 0;
-                end
-            join
-            @(posedge clk);
-        end
-    end
-endtask
-
-task axi_read;
-    input [31:0] adr;
-    output [MEM_WIDTH-1:0] data;
-    begin
-        data = 0;
-        for(j=0; j<NFOLDS_PER_WORD; j=j+1) begin
-            @(negedge clk);
-            arvalid = 1;
-            araddr = (adr*(1<<$clog2(NFOLDS_PER_WORD))+j)*4;
-            rready = 1;
-            fork
-                begin
-                    @(posedge arready);
-                    @(posedge clk) arvalid = 0;
-                end
-                begin
-                    @(posedge rvalid);
-                    @(posedge clk) rready = 0;
-                    data = data | (rdata<<(32*j));
-                end
-            join
-            @(posedge clk);
-        end
-    end
-endtask
-
-//clock
-initial begin
-    clk = 0;
-    forever #5 clk = ~clk;
-end
-
-initial begin
-    rst = 1;
-    awvalid = 0;
-    arvalid = 0;
-    wvalid = 0;
-    rready = 1;
-    bready = 1;
-    m_axis_0_afull = 1;
-    m_axis_1_afull = 1;
-    m_axis_2_afull = 1;
-    m_axis_3_afull = 1;
-    m_axis_4_afull = 1;
-    m_axis_5_afull = 1;
-    m_axis_0_tready = 0;
-    m_axis_1_tready = 0;
-    m_axis_2_tready = 0;
-    m_axis_3_tready = 0;
-    m_axis_4_tready = 0;
-    m_axis_5_tready = 0;
-    repeat(100) @(negedge clk);
-    rst = 0;
-    #100
-    //random initialization of golden data
-    for(i=0; i<MEM_DEPTH; i=i+1) begin
-        gword = 0;
-        repeat(NFOLDS_PER_WORD)
-            gword = (gword << 32) | $random;
-        golden[i] = gword;
-        axi_write(golden[i],i);
-        axi_read(i,gword);
-    end
-    //re-reset
-    repeat(100) @(negedge clk);
-    rst = 1;
-    #100
-    repeat(100) @(negedge clk);
-    rst = 0;
-    #100
-    @(negedge clk);
-    //start reads
-    m_axis_0_afull = 0;
-    m_axis_1_afull = 0;
-    m_axis_2_afull = 0;
-    m_axis_3_afull = 0;
-    m_axis_4_afull = 0;
-    m_axis_5_afull = 0;
-    m_axis_0_tready = 1;
-    m_axis_1_tready = 1;
-    m_axis_2_tready = 1;
-    m_axis_3_tready = 1;
-    m_axis_4_tready = 1;
-    m_axis_5_tready = 1;
-    fork
-	    begin
-		    $display("Starting to generate random AFULL");
-			while(~done) begin
-			    rng = $random;
-				m_axis_0_afull = rng[0];
-				m_axis_1_afull = rng[1];
-				m_axis_2_afull = rng[2];
-				m_axis_3_afull = rng[3];
-				m_axis_4_afull = rng[4];
-				m_axis_5_afull = rng[5];
-				@(negedge clk);
-			end
-		end
-	join
-end
-
-
-//DUT
-memstream
-#(
-    CONFIG_EN,
-    NSTREAMS,
-    MEM_DEPTH,
-    MEM_WIDTH,
-    ".",
-    "auto",
-    //widths per stream
-    STRM0_WIDTH,
-    STRM1_WIDTH,
-    STRM2_WIDTH,
-    STRM3_WIDTH,
-    STRM4_WIDTH,
-    STRM5_WIDTH,
-    //depths per stream
-    STRM0_DEPTH,
-    STRM1_DEPTH,
-    STRM2_DEPTH,
-    STRM3_DEPTH,
-    STRM4_DEPTH,
-    STRM5_DEPTH,
-    //offsets for each stream
-    STRM0_OFFSET,
-    STRM1_OFFSET,
-    STRM2_OFFSET,
-    STRM3_OFFSET,
-    STRM4_OFFSET,
-    STRM5_OFFSET
-)
-dut
-(
-    clk,
-    ~rst,
-
-    //optional AXI-Lite interface
-    awready,
-    awvalid,
-    awaddr,
-    awprot,
-    //write data
-    wready,
-    wvalid,
-    wdata,
-    wstrb,
-    //burst response
-    bready,
-    bvalid,
-    bresp,
-
-    //Read channels
-    //read address
-    arready,
-    arvalid,
-    araddr,
-    arprot,
-    //read data
-    rready,
-    rvalid,
-    rresp,
-    rdata,
-
-    //multiple output AXI Streams
-    m_axis_0_afull,
-    m_axis_0_tready,
-    m_axis_0_tvalid,
-    m_axis_0_tdata,
-    m_axis_1_afull,
-    m_axis_1_tready,
-    m_axis_1_tvalid,
-    m_axis_1_tdata,
-    m_axis_2_afull,
-    m_axis_2_tready,
-    m_axis_2_tvalid,
-    m_axis_2_tdata,
-    m_axis_3_afull,
-    m_axis_3_tready,
-    m_axis_3_tvalid,
-    m_axis_3_tdata,
-    m_axis_4_afull,
-    m_axis_4_tready,
-    m_axis_4_tvalid,
-    m_axis_4_tdata,
-    m_axis_5_afull,
-    m_axis_5_tready,
-    m_axis_5_tvalid,
-    m_axis_5_tdata
-
-);
-
-//stream checkers
-initial begin
-    ptr0 = STRM0_OFFSET;
-	ptr1 = STRM1_OFFSET;
-	ptr2 = STRM2_OFFSET;
-	ptr3 = STRM3_OFFSET;
-	ptr4 = STRM4_OFFSET;
-	ptr5 = STRM5_OFFSET;
-    fork
-		//check stream 0
-	    begin
-		    $display("Starting stream 0 checker");
-		    while(~done & (NSTREAMS > 0)) begin
-				@(negedge clk);
-				if(m_axis_0_tvalid & m_axis_0_tready) begin
-					if(m_axis_0_tdata != golden[ptr0]) begin
-						$display("Mismatch on stream 0");
-						$stop();
-					end
-					//increment pointer
-					ptr0 = ptr0 + 1;
-					//rewind pointer if it's reached end
-					if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH))
-				        ptr0 = STRM0_OFFSET;
-				end
-			end
-		end
-		//check stream 1
-	    begin
-		    $display("Starting stream 1 checker");
-		    while(~done & (NSTREAMS > 1)) begin
-				@(negedge clk);
-				if(m_axis_1_tvalid & m_axis_1_tready) begin
-					if(m_axis_1_tdata != golden[ptr1]) begin
-						$display("Mismatch on stream 1");
-						$stop();
-					end
-					//increment pointer
-					ptr1 = ptr1 + 1;
-					//rewind pointer if it's reached end
-					if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH))
-						ptr1 = STRM1_OFFSET;
-				end
-			end
-		end
-		//check stream 2
-	    begin
-		    $display("Starting stream 2 checker");
-		    while(~done & (NSTREAMS > 2)) begin
-				@(negedge clk);
-				if(m_axis_2_tvalid & m_axis_2_tready) begin
-					if(m_axis_2_tdata != golden[ptr2]) begin
-						$display("Mismatch on stream 2");
-						$stop();
-					end
-					//increment pointer
-					ptr2 = ptr2 + 1;
-					//rewind pointer if it's reached end
-					if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH))
-						ptr2 = STRM2_OFFSET;
-				end
-			end
-		end
-		//check stream 3
-	    begin
-		    $display("Starting stream 3 checker");
-		    while(~done & (NSTREAMS > 3)) begin
-				@(negedge clk);
-				if(m_axis_3_tvalid & m_axis_3_tready) begin
-					if(m_axis_3_tdata != golden[ptr3]) begin
-						$display("Mismatch on stream 3");
-						$stop();
-					end
-					//increment pointer
-					ptr3 = ptr3 + 1;
-					//rewind pointer if it's reached end
-					if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH))
-						ptr3 = STRM3_OFFSET;
-				end
-			end
-		end
-		//check stream 4
-	    begin
-		    $display("Starting stream 4 checker");
-		    while(~done & (NSTREAMS > 4)) begin
-				@(negedge clk);
-				if(m_axis_4_tvalid & m_axis_4_tready) begin
-					if(m_axis_4_tdata != golden[ptr4]) begin
-						$display("Mismatch on stream 4");
-						$stop();
-					end
-					//increment pointer
-					ptr4 = ptr4 + 1;
-					//rewind pointer if it's reached end
-					if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH))
-						ptr4 = STRM4_OFFSET;
-				end
-			end
-		end
-		//check stream 5
-	    begin
-		    $display("Starting stream 5 checker");
-		    while(~done & (NSTREAMS > 5)) begin
-				@(negedge clk);
-				if(m_axis_5_tvalid & m_axis_5_tready) begin
-					if(m_axis_5_tdata != golden[ptr5]) begin
-						$display("Mismatch on stream 5");
-						$stop();
-					end
-					//increment pointer
-					ptr5 = ptr5 + 1;
-					//rewind pointer if it's reached end
-					if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH))
-						ptr5 = STRM5_OFFSET;
-				end
-			end
-		end
-	join
-end
-
-initial begin
-    done = 0;
-    @(negedge rst);
-    $dumpfile("wave.vcd");
-    $dumpvars(0,tb_memstream_writes);
-    #50000
-	$display("Test done!");
-	done = 1;
-	#1000
-    $finish();
-end
-
-endmodule
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+`timescale 1ns/10ps
+
+module tb_memstream_writes;
+
+//parameters: enable/disable the AXI-MM config interface, set the number of streams, point $readmemh at the memory init file, and set the per-stream widths, depths and offsets
+parameter CONFIG_EN = 1;
+parameter NSTREAMS = 2; // 1 up to 6
+
+parameter MEM_DEPTH = 40;
+parameter MEM_WIDTH = 70;
+
+//widths per stream
+parameter STRM0_WIDTH = 70;
+parameter STRM1_WIDTH = 32;
+parameter STRM2_WIDTH = 32;
+parameter STRM3_WIDTH = 32;
+parameter STRM4_WIDTH = 1;
+parameter STRM5_WIDTH = 1;
+
+//depths per stream
+parameter STRM0_DEPTH = 20;
+parameter STRM1_DEPTH = 20;
+parameter STRM2_DEPTH = 2304;
+parameter STRM3_DEPTH = 2304;
+parameter STRM4_DEPTH = 1;
+parameter STRM5_DEPTH = 1;
+
+//offsets for each stream
+parameter STRM0_OFFSET = 0;
+parameter STRM1_OFFSET = 20;
+parameter STRM2_OFFSET = 4608;
+parameter STRM3_OFFSET = 6912;
+parameter STRM4_OFFSET = 0;
+parameter STRM5_OFFSET = 0;
+
+
+reg clk;
+reg rst;
+
+wire        awready;
+reg         awvalid;
+reg [31:0]  awaddr;
+reg [2:0]   awprot;
+//write data
+wire        wready;
+reg         wvalid;
+reg [31:0]  wdata;
+reg [3:0]   wstrb;
+//burst response
+reg         bready;
+wire        bvalid;
+wire [1:0]  bresp;
+
+//Read channels
+//read address
+wire        arready;
+reg         arvalid;
+reg [31:0]  araddr;
+reg [2:0]   arprot;
+//read data
+reg         rready;
+wire        rvalid;
+wire [1:0]  rresp;
+wire [31:0] rdata;
+
+//signals for the multiple output AXI Streams
+reg m_axis_0_afull;
+reg m_axis_0_tready;
+wire m_axis_0_tvalid;
+wire [STRM0_WIDTH-1:0] m_axis_0_tdata;
+
+reg m_axis_1_afull;
+reg m_axis_1_tready;
+wire m_axis_1_tvalid;
+wire [STRM1_WIDTH-1:0] m_axis_1_tdata;
+
+reg m_axis_2_afull;
+reg m_axis_2_tready;
+wire m_axis_2_tvalid;
+wire [STRM2_WIDTH-1:0] m_axis_2_tdata;
+
+reg m_axis_3_afull;
+reg m_axis_3_tready;
+wire m_axis_3_tvalid;
+wire [STRM3_WIDTH-1:0] m_axis_3_tdata;
+
+reg m_axis_4_afull;
+reg m_axis_4_tready;
+wire m_axis_4_tvalid;
+wire [STRM4_WIDTH-1:0] m_axis_4_tdata;
+
+reg m_axis_5_afull;
+reg m_axis_5_tready;
+wire m_axis_5_tvalid;
+wire [STRM5_WIDTH-1:0] m_axis_5_tdata;
+
+reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0];
+reg [MEM_WIDTH-1:0] gword;
+integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5;
+integer done = 0;
+integer i, j;
+reg [5:0] rng;
+
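+// number of 32-bit AXI-Lite beats needed per MEM_WIDTH-bit memory word, i.e. ceil(MEM_WIDTH/32)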
+parameter NFOLDS_PER_WORD = (MEM_WIDTH+31)/32;
+
+task axi_write;
+    input [MEM_WIDTH-1:0] data;
+    input [31:0] adr;
+    begin
+        for(j=0; j<(1<<$clog2(NFOLDS_PER_WORD)); j=j+1) begin
+            @(negedge clk);
+            awvalid = 1;
+            wvalid = 1;
+            wdata = data>>(j*32);
+            awaddr = (adr*(1<<$clog2(NFOLDS_PER_WORD))+j)*4;
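+            // address map: each wide word occupies a power-of-two-aligned group of 32-bit registers,
+            // so word 'adr', fold 'j' lives at byte address (adr*2^clog2(NFOLDS_PER_WORD) + j)*4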
+            fork
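+                // AXI-Lite address and data handshakes may complete in either order, so wait for both in parallel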
+                begin
+                    @(posedge awready);
+                    @(posedge clk) awvalid = 0;
+                end
+                begin
+                    @(posedge wready);
+                    @(posedge clk) wvalid = 0;
+                end
+            join
+            @(posedge clk);
+        end
+    end
+endtask
+
+task axi_read;
+    input [31:0] adr;
+    output [MEM_WIDTH-1:0] data;
+    begin
+        data = 0;
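+        // reassemble one MEM_WIDTH-bit word from NFOLDS_PER_WORD 32-bit read beats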
+        for(j=0; j<NFOLDS_PER_WORD; j=j+1) begin
+            @(negedge clk);
+            arvalid = 1;
+            araddr = (adr*(1<<$clog2(NFOLDS_PER_WORD))+j)*4;
+            rready = 1;
+            fork
+                begin
+                    @(posedge arready);
+                    @(posedge clk) arvalid = 0;
+                end
+                begin
+                    @(posedge rvalid);
+                    @(posedge clk) rready = 0;
+                    data = data | (rdata<<(32*j));
+                end
+            join
+            @(posedge clk);
+        end
+    end
+endtask
+
+//clock
+initial begin
+    clk = 0;
+    forever #5 clk = ~clk;
+end
+
+initial begin
+    rst = 1;
+    awvalid = 0;
+    arvalid = 0;
+    wvalid = 0;
+    rready = 1;
+    bready = 1;
+    m_axis_0_afull = 1;
+    m_axis_1_afull = 1;
+    m_axis_2_afull = 1;
+    m_axis_3_afull = 1;
+    m_axis_4_afull = 1;
+    m_axis_5_afull = 1;
+    m_axis_0_tready = 0;
+    m_axis_1_tready = 0;
+    m_axis_2_tready = 0;
+    m_axis_3_tready = 0;
+    m_axis_4_tready = 0;
+    m_axis_5_tready = 0;
+    repeat(100) @(negedge clk);
+    rst = 0;
+    #100
+    //random initialization of golden data
+    for(i=0; i<MEM_DEPTH; i=i+1) begin
+        gword = 0;
+        repeat(NFOLDS_PER_WORD)
+            gword = (gword << 32) | $random;
+        golden[i] = gword;
+        axi_write(golden[i],i);
+        axi_read(i,gword);
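+        // the readback exercises the read channel; note the returned word is not compared against golden here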
+    end
+    //re-reset
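+    // re-assert reset so the stream read pointers restart from their offsets; the memory contents written above are expected to persist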
+    repeat(100) @(negedge clk);
+    rst = 1;
+    #100
+    repeat(100) @(negedge clk);
+    rst = 0;
+    #100
+    @(negedge clk);
+    //start reads
+    m_axis_0_afull = 0;
+    m_axis_1_afull = 0;
+    m_axis_2_afull = 0;
+    m_axis_3_afull = 0;
+    m_axis_4_afull = 0;
+    m_axis_5_afull = 0;
+    m_axis_0_tready = 1;
+    m_axis_1_tready = 1;
+    m_axis_2_tready = 1;
+    m_axis_3_tready = 1;
+    m_axis_4_tready = 1;
+    m_axis_5_tready = 1;
+    fork
+	    begin
+		    $display("Starting to generate random AFULL");
+            while(!done) begin
+			    rng = $random;
+				m_axis_0_afull = rng[0];
+				m_axis_1_afull = rng[1];
+				m_axis_2_afull = rng[2];
+				m_axis_3_afull = rng[3];
+				m_axis_4_afull = rng[4];
+				m_axis_5_afull = rng[5];
+				@(negedge clk);
+			end
+		end
+	join
+end
+
+
+//DUT
+memstream
+#(
+    CONFIG_EN,
+    NSTREAMS,
+    MEM_DEPTH,
+    MEM_WIDTH,
+    ".",
+    "auto",
+    //widths per stream
+    STRM0_WIDTH,
+    STRM1_WIDTH,
+    STRM2_WIDTH,
+    STRM3_WIDTH,
+    STRM4_WIDTH,
+    STRM5_WIDTH,
+    //depths per stream
+    STRM0_DEPTH,
+    STRM1_DEPTH,
+    STRM2_DEPTH,
+    STRM3_DEPTH,
+    STRM4_DEPTH,
+    STRM5_DEPTH,
+    //offsets for each stream
+    STRM0_OFFSET,
+    STRM1_OFFSET,
+    STRM2_OFFSET,
+    STRM3_OFFSET,
+    STRM4_OFFSET,
+    STRM5_OFFSET
+)
+dut
+(
+    clk,
+    ~rst,
+
+    //optional AXI-Lite interface
+    awready,
+    awvalid,
+    awaddr,
+    awprot,
+    //write data
+    wready,
+    wvalid,
+    wdata,
+    wstrb,
+    //burst response
+    bready,
+    bvalid,
+    bresp,
+
+    //Read channels
+    //read address
+    arready,
+    arvalid,
+    araddr,
+    arprot,
+    //read data
+    rready,
+    rvalid,
+    rresp,
+    rdata,
+
+    //multiple output AXI Streams
+    m_axis_0_afull,
+    m_axis_0_tready,
+    m_axis_0_tvalid,
+    m_axis_0_tdata,
+    m_axis_1_afull,
+    m_axis_1_tready,
+    m_axis_1_tvalid,
+    m_axis_1_tdata,
+    m_axis_2_afull,
+    m_axis_2_tready,
+    m_axis_2_tvalid,
+    m_axis_2_tdata,
+    m_axis_3_afull,
+    m_axis_3_tready,
+    m_axis_3_tvalid,
+    m_axis_3_tdata,
+    m_axis_4_afull,
+    m_axis_4_tready,
+    m_axis_4_tvalid,
+    m_axis_4_tdata,
+    m_axis_5_afull,
+    m_axis_5_tready,
+    m_axis_5_tvalid,
+    m_axis_5_tdata
+
+);
+
+//stream checkers
+initial begin
+    ptr0 = STRM0_OFFSET;
+    ptr1 = STRM1_OFFSET;
+    ptr2 = STRM2_OFFSET;
+    ptr3 = STRM3_OFFSET;
+    ptr4 = STRM4_OFFSET;
+    ptr5 = STRM5_OFFSET;
+    fork
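+        // unlike tb_memstream, each checker here advances only on a completed tvalid & tready handshake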
+		//check stream 0
+	    begin
+		    $display("Starting stream 0 checker");
+            while(!done && (NSTREAMS > 0)) begin
+				@(negedge clk);
+				if(m_axis_0_tvalid & m_axis_0_tready) begin
+					if(m_axis_0_tdata != golden[ptr0]) begin
+						$display("Mismatch on stream 0");
+						$stop();
+					end
+					//increment pointer
+					ptr0 = ptr0 + 1;
+					//rewind pointer if it's reached end
+					if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH))
+				        ptr0 = STRM0_OFFSET;
+				end
+			end
+		end
+		//check stream 1
+	    begin
+		    $display("Starting stream 1 checker");
+            while(!done && (NSTREAMS > 1)) begin
+				@(negedge clk);
+				if(m_axis_1_tvalid & m_axis_1_tready) begin
+					if(m_axis_1_tdata != golden[ptr1]) begin
+						$display("Mismatch on stream 1");
+						$stop();
+					end
+					//increment pointer
+					ptr1 = ptr1 + 1;
+					//rewind pointer if it's reached end
+					if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH))
+						ptr1 = STRM1_OFFSET;
+				end
+			end
+		end
+		//check stream 2
+	    begin
+		    $display("Starting stream 2 checker");
+            while(!done && (NSTREAMS > 2)) begin
+				@(negedge clk);
+				if(m_axis_2_tvalid & m_axis_2_tready) begin
+					if(m_axis_2_tdata != golden[ptr2]) begin
+						$display("Mismatch on stream 2");
+						$stop();
+					end
+					//increment pointer
+					ptr2 = ptr2 + 1;
+					//rewind pointer if it's reached end
+					if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH))
+						ptr2 = STRM2_OFFSET;
+				end
+			end
+		end
+		//check stream 3
+	    begin
+		    $display("Starting stream 3 checker");
+            while(!done && (NSTREAMS > 3)) begin
+				@(negedge clk);
+				if(m_axis_3_tvalid & m_axis_3_tready) begin
+					if(m_axis_3_tdata != golden[ptr3]) begin
+						$display("Mismatch on stream 3");
+						$stop();
+					end
+					//increment pointer
+					ptr3 = ptr3 + 1;
+					//rewind pointer if it's reached end
+					if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH))
+						ptr3 = STRM3_OFFSET;
+				end
+			end
+		end
+		//check stream 4
+	    begin
+		    $display("Starting stream 4 checker");
+            while(!done && (NSTREAMS > 4)) begin
+				@(negedge clk);
+				if(m_axis_4_tvalid & m_axis_4_tready) begin
+					if(m_axis_4_tdata != golden[ptr4]) begin
+						$display("Mismatch on stream 4");
+						$stop();
+					end
+					//increment pointer
+					ptr4 = ptr4 + 1;
+					//rewind pointer if it's reached end
+					if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH))
+						ptr4 = STRM4_OFFSET;
+				end
+			end
+		end
+		//check stream 5
+	    begin
+		    $display("Starting stream 5 checker");
+            while(!done && (NSTREAMS > 5)) begin
+				@(negedge clk);
+				if(m_axis_5_tvalid & m_axis_5_tready) begin
+					if(m_axis_5_tdata != golden[ptr5]) begin
+						$display("Mismatch on stream 5");
+						$stop();
+					end
+					//increment pointer
+					ptr5 = ptr5 + 1;
+					//rewind pointer if it's reached end
+					if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH))
+						ptr5 = STRM5_OFFSET;
+				end
+			end
+		end
+	join
+end
+
+initial begin
+    done = 0;
+    @(negedge rst);
+    $dumpfile("wave.vcd");
+    $dumpvars(0,tb_memstream_writes);
+    #50000
+    $display("Test done!");
+    done = 1;
+    #1000
+    $finish();
+end
+
+endmodule
diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb
index 7d7bc5c50b25e5b270fadc641bc2fa40d6dadddd..57f2601c73853ba9acc5f8ff85e0a18efc7e9a17 100644
--- a/notebooks/advanced/2_custom_op.ipynb
+++ b/notebooks/advanced/2_custom_op.ipynb
@@ -58,13 +58,11 @@
        " '__repr__',\n",
        " '__setattr__',\n",
        " '__sizeof__',\n",
+       " '__slots__',\n",
        " '__str__',\n",
        " '__subclasshook__',\n",
        " '__weakref__',\n",
-       " '_abc_cache',\n",
-       " '_abc_negative_cache',\n",
-       " '_abc_negative_cache_version',\n",
-       " '_abc_registry',\n",
+       " '_abc_impl',\n",
        " 'execute_node',\n",
        " 'get_nodeattr',\n",
        " 'get_nodeattr_allowed_values',\n",
@@ -211,7 +209,7 @@
        "{'DebugMarker': finn.custom_op.general.debugmarker.DebugMarker,\n",
        " 'QuantAvgPool2d': finn.custom_op.general.quantavgpool2d.QuantAvgPool2d,\n",
        " 'MaxPoolNHWC': finn.custom_op.general.maxpoolnhwc.MaxPoolNHWC,\n",
-       " 'StreamingDataflowPartition': finn.custom_op.general.streamingdataflowpartition.StreamingDataflowPartition,\n",
+       " 'GenericPartition': finn.custom_op.general.genericpartition.GenericPartition,\n",
        " 'MultiThreshold': finn.custom_op.general.multithreshold.MultiThreshold,\n",
        " 'XnorPopcountMatMul': finn.custom_op.general.xnorpopcount.XnorPopcountMatMul,\n",
        " 'Im2Col': finn.custom_op.general.im2col.Im2Col,\n",
@@ -335,8 +333,8 @@
     {
      "data": {
       "text/plain": [
-       "array([[[-6.,  2., -3., -6.],\n",
-       "        [-6.,  0.,  1., -2.]]], dtype=float32)"
+       "array([[[ 0., -3.,  1., -8.],\n",
+       "        [ 2., -2., -4., -8.]]], dtype=float32)"
       ]
      },
      "execution_count": 7,
@@ -349,7 +347,7 @@
     "from finn.util.basic import gen_finn_dt_tensor\n",
     "\n",
     "# generate a random input of e.g signed 4-bit values\n",
-    "random_input = gen_finn_dt_tensor(DataType.INT4, input_shape)\n",
+    "random_input = gen_finn_dt_tensor(DataType[\"INT4\"], input_shape)\n",
     "random_input\n"
    ]
   },
@@ -368,8 +366,8 @@
     {
      "data": {
       "text/plain": [
-       "{'outp': array([[[36.,  4.,  9., 36.],\n",
-       "         [36.,  0.,  1.,  4.]]], dtype=float32)}"
+       "{'outp': array([[[ 0.,  9.,  1., 64.],\n",
+       "         [ 4.,  4., 16., 64.]]], dtype=float32)}"
       ]
      },
      "execution_count": 8,
@@ -576,7 +574,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Available functions: ['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', 'execute_node', 'get_nodeattr', 'get_nodeattr_allowed_values', 'get_nodeattr_def', 'get_nodeattr_types', 'infer_node_datatype', 'make_shape_compatible_op', 'my_custom_cpp_gen', 'onnx_node', 'set_nodeattr', 'verify_node']\n",
+      "Available functions: ['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', 'execute_node', 'get_nodeattr', 'get_nodeattr_allowed_values', 'get_nodeattr_def', 'get_nodeattr_types', 'infer_node_datatype', 'make_shape_compatible_op', 'my_custom_cpp_gen', 'onnx_node', 'set_nodeattr', 'verify_node']\n",
       "codegen_dir: \n",
       "exec_mode: python\n"
      ]
@@ -666,7 +664,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/tmp/finn_dev_jalezeta/my_custom_oppaxpincq\n"
+      "/tmp/finn_dev_maltanar/my_custom_oppswiou3i\n"
      ]
     }
    ],
@@ -820,8 +818,8 @@
     {
      "data": {
       "text/plain": [
-       "array([[[-8.,  4.,  7.,  2.],\n",
-       "        [-5., -1.,  2.,  0.]]], dtype=float32)"
+       "array([[[-6.,  3.,  2., -5.],\n",
+       "        [ 5.,  2.,  0., -2.]]], dtype=float32)"
       ]
      },
      "execution_count": 21,
@@ -831,7 +829,7 @@
    ],
    "source": [
     "# generate a random input of e.g signed 4-bit values\n",
-    "random_input = gen_finn_dt_tensor(DataType.INT4, input_shape)\n",
+    "random_input = gen_finn_dt_tensor(DataType[\"INT4\"], input_shape)\n",
     "random_input"
    ]
   },
@@ -850,8 +848,8 @@
     {
      "data": {
       "text/plain": [
-       "{'outp': array([[[64., 16., 49.,  4.],\n",
-       "         [25.,  1.,  4.,  0.]]], dtype=float32)}"
+       "{'outp': array([[[36.,  9.,  4., 25.],\n",
+       "         [25.,  4.,  0.,  4.]]], dtype=float32)}"
       ]
      },
      "execution_count": 22,
@@ -882,8 +880,8 @@
     {
      "data": {
       "text/plain": [
-       "{'outp': array([[[64., 16., 49.,  4.],\n",
-       "         [25.,  1.,  4.,  0.]]])}"
+       "{'outp': array([[[36.,  9.,  4., 25.],\n",
+       "         [25.,  4.,  0.,  4.]]])}"
       ]
      },
      "execution_count": 23,
@@ -897,6 +895,13 @@
     "ret = execute_onnx(mixedop_graph_new, inp_dict)\n",
     "ret"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -915,7 +920,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/basics/1_brevitas_network_import.ipynb b/notebooks/basics/1_brevitas_network_import.ipynb
index 8ba7d00a171f68577b28c5897f0106ea4207a6ef..b6d6c3bdfd2962987a63c62a5532e7969a33982f 100644
--- a/notebooks/basics/1_brevitas_network_import.ipynb
+++ b/notebooks/basics/1_brevitas_network_import.ipynb
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -36,121 +36,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "# MIT License\n",
-      "#\n",
-      "# Copyright (c) 2019 Xilinx\n",
-      "#\n",
-      "# Permission is hereby granted, free of charge, to any person obtaining a copy\n",
-      "# of this software and associated documentation files (the \"Software\"), to deal\n",
-      "# in the Software without restriction, including without limitation the rights\n",
-      "# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n",
-      "# copies of the Software, and to permit persons to whom the Software is\n",
-      "# furnished to do so, subject to the following conditions:\n",
-      "#\n",
-      "# The above copyright notice and this permission notice shall be included in all\n",
-      "# copies or substantial portions of the Software.\n",
-      "#\n",
-      "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n",
-      "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n",
-      "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n",
-      "# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n",
-      "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n",
-      "# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n",
-      "# SOFTWARE.\n",
-      "\n",
-      "import ast\n",
-      "from functools import reduce\n",
-      "from operator import mul\n",
-      "\n",
-      "from torch.nn import Module, ModuleList, BatchNorm1d, Dropout\n",
-      "import torch\n",
-      "\n",
-      "from brevitas.nn import QuantIdentity, QuantLinear\n",
-      "from .common import CommonWeightQuant, CommonActQuant\n",
-      "from .tensor_norm import TensorNorm\n",
-      "\n",
-      "DROPOUT = 0.2\n",
-      "\n",
-      "\n",
-      "class FC(Module):\n",
-      "\n",
-      "    def __init__(\n",
-      "            self,\n",
-      "            num_classes,\n",
-      "            weight_bit_width,\n",
-      "            act_bit_width,\n",
-      "            in_bit_width,\n",
-      "            in_channels,\n",
-      "            out_features,\n",
-      "            in_features=(28, 28)):\n",
-      "        super(FC, self).__init__()\n",
-      "\n",
-      "        self.features = ModuleList()\n",
-      "        self.features.append(QuantIdentity(act_quant=CommonActQuant, bit_width=in_bit_width))\n",
-      "        self.features.append(Dropout(p=DROPOUT))\n",
-      "        in_features = reduce(mul, in_features)\n",
-      "        for out_features in out_features:\n",
-      "            self.features.append(QuantLinear(\n",
-      "                in_features=in_features,\n",
-      "                out_features=out_features,\n",
-      "                bias=False,\n",
-      "                weight_bit_width=weight_bit_width,\n",
-      "                weight_quant=CommonWeightQuant))\n",
-      "            in_features = out_features\n",
-      "            self.features.append(BatchNorm1d(num_features=in_features))\n",
-      "            self.features.append(QuantIdentity(act_quant=CommonActQuant, bit_width=act_bit_width))\n",
-      "            self.features.append(Dropout(p=DROPOUT))\n",
-      "        self.features.append(QuantLinear(\n",
-      "                in_features=in_features,\n",
-      "                out_features=num_classes,\n",
-      "                bias=False,\n",
-      "                weight_bit_width=weight_bit_width,\n",
-      "                weight_quant=CommonWeightQuant))\n",
-      "        self.features.append(TensorNorm())\n",
-      "\n",
-      "        for m in self.modules():\n",
-      "          if isinstance(m, QuantLinear):\n",
-      "            torch.nn.init.uniform_(m.weight.data, -1, 1)\n",
-      "\n",
-      "    def clip_weights(self, min_val, max_val):\n",
-      "        for mod in self.features:\n",
-      "            if isinstance(mod, QuantLinear):\n",
-      "                mod.weight.data.clamp_(min_val, max_val)\n",
-      "    \n",
-      "    def forward(self, x):\n",
-      "        x = x.view(x.shape[0], -1)\n",
-      "        x = 2.0 * x - torch.tensor([1.0], device=x.device)\n",
-      "        for mod in self.features:\n",
-      "            x = mod(x)\n",
-      "        return x\n",
-      "\n",
-      "\n",
-      "def fc(cfg):\n",
-      "    weight_bit_width = cfg.getint('QUANT', 'WEIGHT_BIT_WIDTH')\n",
-      "    act_bit_width = cfg.getint('QUANT', 'ACT_BIT_WIDTH')\n",
-      "    in_bit_width = cfg.getint('QUANT', 'IN_BIT_WIDTH')\n",
-      "    num_classes = cfg.getint('MODEL', 'NUM_CLASSES')\n",
-      "    in_channels = cfg.getint('MODEL', 'IN_CHANNELS')\n",
-      "    out_features = ast.literal_eval(cfg.get('MODEL', 'OUT_FEATURES'))\n",
-      "    net = FC(\n",
-      "        weight_bit_width=weight_bit_width,\n",
-      "        act_bit_width=act_bit_width,\n",
-      "        in_bit_width=in_bit_width,\n",
-      "        in_channels=in_channels,\n",
-      "        out_features=out_features,\n",
-      "        num_classes=num_classes)\n",
-      "    return net\n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from brevitas_examples import bnn_pynq\n",
     "showSrc(bnn_pynq.models.FC)"
@@ -165,255 +53,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "FC(\n",
-       "  (features): ModuleList(\n",
-       "    (0): QuantIdentity(\n",
-       "      (input_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (act_quant): ActQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "        (fused_activation_quant_proxy): FusedActivationQuantProxy(\n",
-       "          (activation_impl): Identity()\n",
-       "          (tensor_quant): ClampedBinaryQuant(\n",
-       "            (scaling_impl): ConstScaling(\n",
-       "              (restrict_clamp_scaling): _RestrictClampValue(\n",
-       "                (restrict_value_impl): FloatRestrictValue()\n",
-       "                (clamp_min_ste): Identity()\n",
-       "              )\n",
-       "              (value): StatelessBuffer()\n",
-       "            )\n",
-       "            (bit_width): BitWidthConst(\n",
-       "              (bit_width): StatelessBuffer()\n",
-       "            )\n",
-       "            (delay_wrapper): DelayWrapper(\n",
-       "              (delay_impl): _NoDelay()\n",
-       "            )\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "    (1): Dropout(p=0.2)\n",
-       "    (2): QuantLinear(\n",
-       "      in_features=784, out_features=1024, bias=False\n",
-       "      (input_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (output_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (weight_quant): WeightQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "        (tensor_quant): BinaryQuant(\n",
-       "          (scaling_impl): ConstScaling(\n",
-       "            (restrict_clamp_scaling): _RestrictClampValue(\n",
-       "              (restrict_value_impl): FloatRestrictValue()\n",
-       "              (clamp_min_ste): Identity()\n",
-       "            )\n",
-       "            (value): StatelessBuffer()\n",
-       "          )\n",
-       "          (bit_width): BitWidthConst(\n",
-       "            (bit_width): StatelessBuffer()\n",
-       "          )\n",
-       "          (delay_wrapper): DelayWrapper(\n",
-       "            (delay_impl): _NoDelay()\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "      (bias_quant): BiasQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "    )\n",
-       "    (3): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "    (4): QuantIdentity(\n",
-       "      (input_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (act_quant): ActQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "        (fused_activation_quant_proxy): FusedActivationQuantProxy(\n",
-       "          (activation_impl): Identity()\n",
-       "          (tensor_quant): ClampedBinaryQuant(\n",
-       "            (scaling_impl): ConstScaling(\n",
-       "              (restrict_clamp_scaling): _RestrictClampValue(\n",
-       "                (restrict_value_impl): FloatRestrictValue()\n",
-       "                (clamp_min_ste): Identity()\n",
-       "              )\n",
-       "              (value): StatelessBuffer()\n",
-       "            )\n",
-       "            (bit_width): BitWidthConst(\n",
-       "              (bit_width): StatelessBuffer()\n",
-       "            )\n",
-       "            (delay_wrapper): DelayWrapper(\n",
-       "              (delay_impl): _NoDelay()\n",
-       "            )\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "    (5): Dropout(p=0.2)\n",
-       "    (6): QuantLinear(\n",
-       "      in_features=1024, out_features=1024, bias=False\n",
-       "      (input_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (output_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (weight_quant): WeightQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "        (tensor_quant): BinaryQuant(\n",
-       "          (scaling_impl): ConstScaling(\n",
-       "            (restrict_clamp_scaling): _RestrictClampValue(\n",
-       "              (restrict_value_impl): FloatRestrictValue()\n",
-       "              (clamp_min_ste): Identity()\n",
-       "            )\n",
-       "            (value): StatelessBuffer()\n",
-       "          )\n",
-       "          (bit_width): BitWidthConst(\n",
-       "            (bit_width): StatelessBuffer()\n",
-       "          )\n",
-       "          (delay_wrapper): DelayWrapper(\n",
-       "            (delay_impl): _NoDelay()\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "      (bias_quant): BiasQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "    )\n",
-       "    (7): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "    (8): QuantIdentity(\n",
-       "      (input_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (act_quant): ActQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "        (fused_activation_quant_proxy): FusedActivationQuantProxy(\n",
-       "          (activation_impl): Identity()\n",
-       "          (tensor_quant): ClampedBinaryQuant(\n",
-       "            (scaling_impl): ConstScaling(\n",
-       "              (restrict_clamp_scaling): _RestrictClampValue(\n",
-       "                (restrict_value_impl): FloatRestrictValue()\n",
-       "                (clamp_min_ste): Identity()\n",
-       "              )\n",
-       "              (value): StatelessBuffer()\n",
-       "            )\n",
-       "            (bit_width): BitWidthConst(\n",
-       "              (bit_width): StatelessBuffer()\n",
-       "            )\n",
-       "            (delay_wrapper): DelayWrapper(\n",
-       "              (delay_impl): _NoDelay()\n",
-       "            )\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "    (9): Dropout(p=0.2)\n",
-       "    (10): QuantLinear(\n",
-       "      in_features=1024, out_features=1024, bias=False\n",
-       "      (input_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (output_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (weight_quant): WeightQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "        (tensor_quant): BinaryQuant(\n",
-       "          (scaling_impl): ConstScaling(\n",
-       "            (restrict_clamp_scaling): _RestrictClampValue(\n",
-       "              (restrict_value_impl): FloatRestrictValue()\n",
-       "              (clamp_min_ste): Identity()\n",
-       "            )\n",
-       "            (value): StatelessBuffer()\n",
-       "          )\n",
-       "          (bit_width): BitWidthConst(\n",
-       "            (bit_width): StatelessBuffer()\n",
-       "          )\n",
-       "          (delay_wrapper): DelayWrapper(\n",
-       "            (delay_impl): _NoDelay()\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "      (bias_quant): BiasQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "    )\n",
-       "    (11): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "    (12): QuantIdentity(\n",
-       "      (input_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (act_quant): ActQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "        (fused_activation_quant_proxy): FusedActivationQuantProxy(\n",
-       "          (activation_impl): Identity()\n",
-       "          (tensor_quant): ClampedBinaryQuant(\n",
-       "            (scaling_impl): ConstScaling(\n",
-       "              (restrict_clamp_scaling): _RestrictClampValue(\n",
-       "                (restrict_value_impl): FloatRestrictValue()\n",
-       "                (clamp_min_ste): Identity()\n",
-       "              )\n",
-       "              (value): StatelessBuffer()\n",
-       "            )\n",
-       "            (bit_width): BitWidthConst(\n",
-       "              (bit_width): StatelessBuffer()\n",
-       "            )\n",
-       "            (delay_wrapper): DelayWrapper(\n",
-       "              (delay_impl): _NoDelay()\n",
-       "            )\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "    (13): Dropout(p=0.2)\n",
-       "    (14): QuantLinear(\n",
-       "      in_features=1024, out_features=10, bias=False\n",
-       "      (input_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (output_quant): IdentityQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "      (weight_quant): WeightQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "        (tensor_quant): BinaryQuant(\n",
-       "          (scaling_impl): ConstScaling(\n",
-       "            (restrict_clamp_scaling): _RestrictClampValue(\n",
-       "              (restrict_value_impl): FloatRestrictValue()\n",
-       "              (clamp_min_ste): Identity()\n",
-       "            )\n",
-       "            (value): StatelessBuffer()\n",
-       "          )\n",
-       "          (bit_width): BitWidthConst(\n",
-       "            (bit_width): StatelessBuffer()\n",
-       "          )\n",
-       "          (delay_wrapper): DelayWrapper(\n",
-       "            (delay_impl): _NoDelay()\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "      (bias_quant): BiasQuantProxyFromInjector(\n",
-       "        (_zero_hw_sentinel): StatelessBuffer()\n",
-       "      )\n",
-       "    )\n",
-       "    (15): TensorNorm()\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from finn.util.test import get_test_model\n",
     "lfc = get_test_model(netname = \"LFC\", wbits = 1, abits = 1, pretrained = True)\n",
@@ -429,22 +71,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD4CAYAAAAq5pAIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAARYElEQVR4nO3dfYyVZXrH8d/FoDAw8iYRCaisG/5QqmUbgk1KyOKmxlUMbKJm/aPauAmarMmqTVqz/UOSaqJVa/pH3YStL9CsmiWoq0a7a82mWo1GNFQQW1CULGR4E5H3t+HqH/NgZ3We6549z3nOc9z7+0kmM3Ouec65OTM/zsv13Pdt7i4Af/xGNT0AAJ1B2IFMEHYgE4QdyARhBzIxupM3Zma89Z+ZUaPKH09OnTpV23VXvf6enp6wPjAw0PJ1183dbbjLK4XdzK6U9M+SeiT9q7vfV+X6cmU27O/mS6k/6ip/eKNHx38CqcCk6r29vaW1Q4cOhcem9PX1hfUDBw6U1lIt50mTJoX1zz77LKx3o5afxptZj6R/kfR9SRdLusHMLm7XwAC0V5XX7PMlfeTuW9z9uKSnJS1pz7AAtFuVsM+Q9Lsh328rLvs9ZrbMzNaa2doKtwWgotrfoHP3FZJWSLxBBzSpyiP7dknnDfl+ZnEZgC5UJezvSJptZt8yszMl/VDS8+0ZFoB2a/lpvLufNLPbJP1ag623x9z9g7aNLCPjx48P6wcPHmz5useMGRPWjx07FtZTbcFx48aF9ai9lmoppqSOj9prqT76vn37WhlSV6v0mt3dX5L0UpvGAqBGnC4LZIKwA5kg7EAmCDuQCcIOZIKwA5mwTq4um+vpsqled6qXffTo0bA+duzYlo9Nia676vWfffbZYb3qNNLofp06dWp47O7du8N6amrwyZMnw3qdyuaz88gOZIKwA5kg7EAmCDuQCcIOZIKwA5mg9fYNkGrNVfkd1nnddUtNDa6yem1q6m5qanCTS03TegMyR9iBTBB2IBOEHcgEYQcyQdiBTBB2IBP02TvgrLPOCuvRbqOSNHHixLB+4sSJ0lpqN9LUFNbPP/88rC9YsCCs33rrraW1VC/6jjvuCOtbt24N601OM20SfXYgc4QdyARhBzJB2IFMEHYgE4QdyARhBzJBn/0b4JFHHgnrUS871Wuuuox1b29vWI+ktk2+5JJLwvqmTZvC+vHjx0trZ5xxRnhsdO6ClP53HzlyJKzXqazPXmnLZjP7VNIBSQOSTrr7vCrXB6A+lcJeWOTue9pwPQBqxGt2IBNVw+6SfmNm75rZsuF+wMyWmdlaM1tb8bYAVFD1afwCd99uZudIesXM/sfdXxv6A+6+QtIKiTfogCZVemR39+3F512SnpU0vx2DAtB+LYfdzMab2Vmnv5Z0haQN7RoYgPaq8jR+mqRniz7taElPuvu/t2VUf2RSWzYvWrQorF922WVhPeqVHzx4MDw21W/u6+sL66nzNKI566m11x999NGWr1uS7rzzztLaW2+9FR5b93bSTWg57O6+RdKftnEsAGpE6w3IBGEHMkHYgUwQdiAThB3IBFNcu0Bqqubs2bPD+v79+0trEyZMCI+NpoFK6SmwVbZ8TrX9UlJLcO/du7e0tnTp0vDYdevWhfVUSzLV8qwTS0kDmSPsQCYIO5AJwg5kgrADmSDsQCYIO5CJdiw42TFRT7fOfnBK6thU/ZZbbgnrq1atCuszZ85s+bZTffZ77rknrK9evTqsn3nmmaW1K664Ijz2wQcfDOuprbCj2168eHF47LZt28L6nj3fvDVWeWQHMkHYgUwQdiAThB3IBGEHMkHYgUwQdiATHZ/Pnup3Rzo51naqOvd54cKFYf2iiy4qrY0bNy48dvTo+FSLNWvWhPUtW7aE9SpSyz3PmTMnrKfu90jq75T57AC6FmEHMkHYgUwQdiAThB3IBGEHMkHYgUx0vM8+alT5/y9V54XXqcpc+lOnTlW67eg+S9VPnjwZHjt+/PiwfujQobCe2o46+p2l5tJfffXVYf3pp58O61X67Kk17VP3a5Na7rOb2WNmtsvMNgy5bIqZvWJmm4vPk9s5WADtN5Kn8U9IuvIrl90l6VV3ny3p1eJ7AF0sGXZ3f03SV/fRWSJpZfH1SklL2zssAO3W6hp009y9v/h6h6RpZT9oZsskLWvxdgC0SeUFJ93dow0b3X2FpBUSGzsCTWq19bbTzKZLUvF5V/uGBKAOrYb9eUk3FV/fJOlX7RkOgLok++xm9pSk70qaKmmnpLslPSfpl5LOl7RV0vXuXr4Z9v9fV21P46uuG1+1Hkn1ZFN7qEf7r1fV29sb1o8cORLWU+cAVDnH4MILLwzrH3/8ccvXnRpXak36lMOHD1c6voqyPnvyNbu731BS+l6lEQHoKE6XBTJB2IFMEHYgE4QdyARhBzLBls2FVAtyYGAgrEd6enrCetVlh6M2UarFlJrCmpK6/mjb5KgmSYsWLWppTKdFv9MTJ06Ex6amuFb5e2gKj+xAJgg7kAnCDmSCsAOZIOxAJgg7kAnCDmSiq/rsdW7nXHU55yrqvu0DBw6U1lL94lSvO3V8qk8fLRedWsb6uuuuC+tHjx4N62PHji2tpfrsqd9Zk1syt4pHdiAThB3IBGEHMkHYgUwQdiAThB3IBGEHMtHxPns0t7ube+XRksmp5ZRT6txW+dJLLw2PnTNnTlhPLSX93HPPhfVI1AeXpIULF4b1Klt4p5ahjs5dkKovwd0EHtmBTBB2IBOEHcgEYQcyQdiBTBB2IBOEHchEx/vs0Zz1OvvoqbnyqXndUU949Oj4bly6dGlYTx2/ZMmSsD5mzJjS2ty5c8NjJ02aFNZTvezXX3+95eNnz54dHptamz3V616/fn1p7fLLLw+Pje5TqTv76CnJR3Yze8zMdpnZhiGXLTez7Wa2rvi4qt5hAqhqJE/jn5B05TCXP+zuc4uPl9o7LADtlgy7u78maW8HxgKgRlXeoLvNzN4vnuZPLvshM1tmZmvNbG2F2wJQUath/5mkb0uaK6lf0kNlP+juK9x9nrvPa/G2ALRBS2F3953uPuDupyT9XNL89g4LQLu1FHYzmz7k2x9I2lD2swC6g6X6qGb2lKTvSpoqaaeku4vv50pySZ9KusXd+5M3ZhbeWKrfnJr3HZk1a1ZYv+aaa8L64sWLS2upedepedupudPR/utSvIZ5X19feGxK1Xnd0e/0iy++CI+dOHFiWE/ZvHlzaW3VqlXhsQ89VPrKVFJ399ndfdiTSpIn1bj7DcNc/GjlEQHoKE6XBTJB2IFMEHYgE4QdyARhBzKRbL219cbMPFp2uc4prnfffXdYX758eVjfs2dPaW3q1KmtDOlLqa2H9+6NpyZE9QsuuCA8NtUWTG3ZnHLs2LHSWmoaaervIdWKjaYtp7Zcfvnll8P6zTffHNab3NK5rPXGIzuQCcIOZIKwA5kg7EAmCDuQCcIOZIKwA5noeJ89qlfZmjg11TLV96yy7fKuXbvC+tatW8P6Aw88ENZXr1
4d1ufNK18E6OGHHw6PTW3ZPHly6YpjkqRt27aF9eh3+sQTT4THfvLJJ2H92muvDevR1OOq02tffPHFsJ6aMl0n+uxA5gg7kAnCDmSCsAOZIOxAJgg7kAnCDmSio332UaNGeTQ/+vjx4+Hx55xzTmlt9+7d4bGpPntq7nTUL05tB71p06awPmXKlLCeWrY4Wu75/PPPD49NzWdPLe+9b9++sH7jjTeW1l544YXw2JTUOgLRctGLFi0Kj02tMZC6X1LLf9eJPjuQOcIOZIKwA5kg7EAmCDuQCcIOZIKwA5noqvnsVaT6nitXrgzr119/fcvXf/jw4fDYcePGhfXUtsipef4DAwOltdS672+++WZYf/LJJ8P6unXrwvobb7xRWkudX5Dq4ad+59F5G/Pnzw+Pffvtt8P6448/HtZT68rXqeU+u5mdZ2a/NbONZvaBmf2kuHyKmb1iZpuLz/EqBwAaNZKn8Scl/Y27XyzpzyX92MwulnSXpFfdfbakV4vvAXSpZNjdvd/d3yu+PiDpQ0kzJC2RdPq58UpJS2saI4A2iF/0fIWZzZL0HUlvS5rm7v1FaYekaSXHLJO0rMIYAbTBiN+NN7M+SWsk3e7u+4fWfPBdvmHffHP3Fe4+z93LV0UEULsRhd3MztBg0H/h7s8UF+80s+lFfbqkeIlVAI1Ktt5scP7mSkl73f32IZc/IOkzd7/PzO6SNMXd/zZxXeGNnXvuueFYduzYEdYj0fa9kjRz5sywfu+995bWZsyYER6b2nI5tXVxtF20JN1///2ltY0bN4bHpqa4prZFTklNW46k2oYnTpwI69HU49Tf/YQJE8J61SnTdSprvY3kNftfSPorSevNbF1x2U8l3Sfpl2b2I0lbJcWNagCNSobd3f9LUtl/kd9r73AA1IXTZYFMEHYgE4QdyARhBzJB2IFMdHSKa09Pj0d93dRU0aj3uX///tKaJPX19YX1VN806vlW6fdK6Z5v6hyBqJed6uEfO3YsrFcV/b5TyzWnpgan/l6q/M5Sqo6tTiwlDWSOsAOZIOxAJgg7kAnCDmSCsAOZIOxAJrpqKenUHOKol55aVrjqvOzp06eX1vr7+0trI9Hb2xvWU1s213ndqWWsDx06FNarzClPGTUqfqyqMqe86fMTqqDPDmSOsAOZIOxAJgg7kAnCDmSCsAOZIOxAJrqqzw6gOvrsQOYIO5AJwg5kgrADmSDsQCYIO5AJwg5kIhl2MzvPzH5rZhvN7AMz+0lx+XIz225m64qPq+ofLoBWJU+qMbPpkqa7+3tmdpakdyUt1eB+7Afd/cER3xgn1QC1KzupZiT7s/dL6i++PmBmH0qa0d7hAajbH/Sa3cxmSfqOpLeLi24zs/fN7DEzm1xyzDIzW2tma6sNFUAVIz433sz6JP2npHvd/RkzmyZpjySX9A8afKp/c+I6eBoP1KzsafyIwm5mZ0h6UdKv3f2fhqnPkvSiu/9J4noIO1CzlifC2ODyoI9K+nBo0Is37k77gaQNVQcJoD4jeTd+gaTXJa2XdHpt3p9KukHSXA0+jf9U0i3Fm3nRdfHIDtSs0tP4diHsQP2Yzw5kjrADmSDsQCYIO5AJwg5kgrADmSDsQCYIO5AJwg5kgrADmSDsQCYIO5AJwg5kgrADmUguONlmeyRtHfL91OKybtStY+vWcUmMrVXtHNsFZYWOzmf/2o2brXX3eY0NINCtY+vWcUmMrVWdGhtP44FMEHYgE02HfUXDtx/p1rF167gkxtaqjoyt0dfsADqn6Ud2AB1C2IFMNBJ2M7vSzP7XzD4ys7uaGEMZM/vUzNYX21A3uj9dsYfeLjPbMOSyKWb2ipltLj4Pu8deQ2Prim28g23GG73vmt7+vOOv2c2sR9ImSX8paZukdyTd4O4bOzqQEmb2qaR57t74CRhmtlDSQUmrTm+tZWb/KGmvu99X/Ec52d3/rkvGtlx/4DbeNY2tbJvxv1aD9107tz9vRROP7PMlfeTuW9z9uKSnJS1pYBxdz91fk7T3KxcvkbSy+HqlBv9YOq5kbF3B3fvd/b3i6wOSTm8z3uh9F4yrI5oI+wxJvxvy/TZ1137vLuk3ZvaumS1rejDDmDZkm60dkqY1OZhhJLfx7qSvbDPeNfddK9ufV8UbdF+3wN3/TNL3Jf24eLralXzwNVg39U5/JunbGtwDsF/SQ00OpthmfI2k2919/9Bak/fdMOPqyP3WRNi3SzpvyPczi8u6grtvLz7vkvSsBl92dJOdp3fQLT7vang8X3L3ne4+4O6nJP1cDd53xTbjayT9wt2fKS5u/L4bblydut+aCPs7kmab2bfM7ExJP5T0fAPj+BozG1+8cSIzGy/pCnXfVtTPS7qp+PomSb9qcCy/p1u28S7bZlwN33eNb3/u7h3/kHSVBt+R/1jS3zcxhpJxXSjpv4uPD5oem6SnNPi07oQG39v4kaSzJb0qabOk/5A0pYvG9m8a3Nr7fQ0Ga3pDY1ugwafo70taV3xc1fR9F4yrI/cbp8sCmeANOiAThB3IBGEHMkHYgUwQdiAThB3IBGEHMvF/rSIwqVQD1iIAAAAASUVORK5CYII=\n",
-      "text/plain": [
-       "<Figure size 432x288 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import torch\n",
     "import matplotlib.pyplot as plt\n",
@@ -460,21 +89,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([0.1020, 0.0113, 0.4806, 0.0571, 0.0482, 0.0079, 0.0450, 0.0076, 0.1851,\n",
-       "        0.0552])"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from torch.nn.functional import softmax\n",
     "# do forward pass in PyTorch/Brevitas\n",
@@ -485,22 +102,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEICAYAAABS0fM3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAbi0lEQVR4nO3debxdZXn28d9FIDKFQRIVEiDMNk6IERAVZWpDq2AREV4nrEwtsSi+VlTUSp3qhFWxCgRBKfACgo0WZShKHYGAKIRBwhzGMAmiLxC4+sd6Dm6O++yzMqx1yFnX9/PZn6z5vvc+sO+9nmetZ8k2ERHRXSuNdQIRETG2UggiIjouhSAiouNSCCIiOi6FICKi41IIIiI6LoUgoiZJlrR5mf66pI8s5XF+L2nT5ZvdwHiS9E1JD0i6pK24seJIIRiHJN0sadc+y18r6cnyRTT0+l7P+i0lnSHpXkm/k/QbSYdLmrCM+cyWNE/So5JOXMJ995H0c0l/kPTjUbbtfX8PS7pO0juXJfeR2D7E9r+Mtp2kH0s6YNi+a9q+sYm8RvAqYDdgmu1tl/VgkqaXorjysqcWzwQpBN1zR/kiGnq9HkDSZsDFwG3Ai2yvDbwJmAlMWtaYwCeAE5Zi3/uBLwGfqRvL9prAWsAHgOMkzRi+Uce+xDYGbrb9yJLu2LHPqbNSCGLIx4Gf2z7c9p0Atq+z/X9sPzh8Y0k7SbqyZ/58SZf2zP9E0hvKcc6y/V3gvj7HWVfS9yUtKk0X35c0bWi97Qtsn05VTGpz5bvAA8AMSftL+pmkoyXdB/yzpGdJ+rykWyXdXZp7VuvJ7f2S7pR0h6S/G5b3iZI+0TO/p6QrJD0k6QZJsyR9Eng18NVylvLVsm1vE9Pakr5V3v8tko6UtFJZt7+kn5YcH5B0k6Tde2LuL+nGcvZzk6S39Pl83wUcD7yi5PDxsvxASQsk3S9prqQNevaxpEMlXQ9cP9pnXT6Lr0n6QYnxM0nPk/Slkve1kl7as/0R5TN6WNLVkv62Z90ESV8oZ6U3lbPJp84+yuc1p/xdbpf0iWU9Y40UgviTXYEzl2D7XwJbSJosaRXgxcAGkiaVL9OZwE9qHGcl4JtUv1o3Av4IfHWJMu9D0krlC2YdYKhgbQfcCDwX+CTVWcaWwNbA5sBU4KNl/1nA/6VqUtmC6vMZKda2wLeA95d4O1L9Av8w1Wcwu5x9ze6z+1eAtYFNgdcAbwd6m7O2A64DJgOfBeaosgbwZWB325OAHYArhh/c9hzgEOAXJYePSdoZ+DSwD7A+cAtw2rBd31Bi/9nZ1Aj2AY4seT4K/AK4vMyfCXyxZ9sbqArk2lQ/QE6WtH5ZdyCwO9XfZJuSR68TgcVUf6+XAn8JHEAsG9t5jbMXcDOwa5/lrwWeBB7see1T1j0OzFrCOD8B9gK2B84DTgdmATsBv+mz/SeAE0c55tbAA32WHwD8eJR9e9/f/VRfjPuWdfsDt/ZsK+ARYLOeZa8AbirTJwCf6Vm3JWBg8zJ/IvCJMv0N4OgRcvoxcMCwZab6IpsAPAbM6Fl38ND7LDkv6Fm3etn3ecAa5X2+EVhtlM9lf+CnPfNzgM/2zK9Z/v7Te/LbecDxppdtVu75LI7rWf9u4Jqe+RcBDw443hXAnmX6QuDgnnW7DsWiKuCP9r5fYD/gR23/PzbeXmn/6547bE/rs/w+ql+HfUn6OvDWMvsp258CLqL68l1Yph+g+lX7aJkflaTVgaOpCsi6ZfEkSRNsP1HnGMOM9P6g6v8YMoXqi/UySU+lQ/XlDLABcFnP9rcMiLkhcM6Sp8pkYJVhx76F6sxkyF1DE7b/UHJd0/Zdkt5MddYyR9LPgPfZvrZG3A2ofq0PHff3pblsKtWPCHj6Z1XH3T3Tf+wzv+bQjKS3A4dTFRTKusk9ufXG7p3emOrzurPnb7bSUuQaw6RpKIZcQPXrsi9XV8kMdTB/qiweKgQ7lumLqArBa6hZCID3AVsB29leqxwLqi/l5a13qN17qb6gXmB7nfJa21VHM8CdVF/wQzYacNzbgM1qxBzuXqpf4hsPi3P7gH3+dGD7XNu7URXwa4Hj6uxH1d/yVMzSzLTesLiNDEssaWOqPGcD69leB7iKP/297wR6C3nv3+A2qh8Zk3v+ZmvZfkETuXZJCsH4tYqkVXteo539fQzYQdLnJD0PQNLmkk6WtM4I+/yc6kt8W+AS2/OpvmC2A/5naCNJK0talerX9oRh+Uyi+kJ+UNKzSx707Duh7LsysFLZd5X6H0N/tp+k+kI6WtJzSqypkv6qbHI6sL+kGeWs5WMjHAqqppZ3Stql9E1MlfT8su5uqvb/fjk8UeJ8svStbEz1S/nk0fKX9NzSQb0G1Zfj76maxeo4teS7taRnAZ8CLrZ9c839l8UaVEVmEYCqy3tf2LP+dOCw8hmuQ3XlFwCuLmI4D/iCpLXKZ72ZpNe0kPe4lkIwfp1D9QU79PrnQRvbvoGqjXw6MF/S74DvAPOAh0fY5xGqJob5th8ri38B3GL7np5Njyw5HEHVvPTHsgyqS0NXo/p1/Evgh8PCvK1s/+9UHYx/pP4v39F8AFgA/FLSQ1RnRVuV9/aDktuFZZsLRzqI7UuoOniPBn5HdTY09Iv734C9y9UzX+6z+7up+ipuBH4KnEK9y2xXoioad1D1h7wG+Psa+2H7AuAjVH/fO6nOZvats++ysn018AWq/07upuo/+FnPJsdRfdn/BvgV1X/Hi4GhZsK3AxOBq6maIs9kQJNm1KPS4RIR8YxTLpf9uu2NR904llrOCCLiGUPSapL+ujQnTqVqkjt7rPMa73JGEBHPGKU/5iLg+VTNgP8FHGb7oTFNbJxLIYiI6LhGm4ZU3WZ/XbmV/Yg+6/dXdWv9FeWVOwQjIlrW2A1lZfyPY6hu0V8IXCppbrlqoNf/c/9b7/uaPHmyp0+fvvwSjYjogMsuu+xe21P6rWvyzuJtqW6PvxFA0mnAnlSXfS216dOnM2/evOWQXkREd0ga8e74JpuGpvL0W78X8vRb54e8UdW492dK2rDPeiQdpGo8+3mLFi1qIteIiM4a68tHv0c10NWLgfOBk/ptZPtY2zNtz5wype+ZTURELKUmC8HtPH2ckGkMG0PF9n22Hy2zxwMvazCfiIjoo8lCcCnVePWbSJpIdQv73N4NesYgB9gDuKbBfCIioo/GOottL5Y0GziXarCxE2zPl3QUMM/2XOAfJe1BNZbI/VTjpkdERItWuBvKZs6c6Vw1FBGxZCRdZntmv3Vj3VkcERFjLIUgIqLjUggiIjouzyzugKPP/23jMd6725aNx4iIZuSMICKi41IIIiI6LoUgIqLjUggiIjouhSAiouNSCCIiOi6FICKi41IIIiI6LoUgIqLjUggiIjouhSAiouNSCCIiOi6FICKi41IIIiI6LoUgIqLjUggiIjouhSAiouNSCCIiOi6FICKi41IIIiI6LoUgIqLjUggiIjouhSAiouNSCCIiOi6FICKi41IIIiI6LoUgIq
LjUggiIjouhSAiouNSCCIiOq7RQiBplqTrJC2QdMSA7d4oyZJmNplPRET8ucYKgaQJwDHA7sAMYD9JM/psNwk4DLi4qVwiImJkTZ4RbAsssH2j7ceA04A9+2z3L8C/Av+/wVwiImIETRaCqcBtPfMLy7KnSNoG2ND2fw06kKSDJM2TNG/RokXLP9OIiA4bs85iSSsBXwTeN9q2to+1PdP2zClTpjSfXEREhzRZCG4HNuyZn1aWDZkEvBD4saSbge2BuekwjohoV5OF4FJgC0mbSJoI7AvMHVpp+3e2J9uebns68EtgD9vzGswpIiKGaawQ2F4MzAbOBa4BTrc9X9JRkvZoKm5ERCyZlZs8uO1zgHOGLfvoCNu+tslcIiKiv9xZHBHRcSkEEREdl0IQEdFxKQQRER2XQhAR0XEpBBERHZdCEBHRcSkEEREdN2ohkPRuSeu2kUxERLSvzhnBc4FLJZ1enjimppOKiIj2jFoIbB8JbAHMAfYHrpf0KUmbNZxbRES0oFYfgW0Dd5XXYmBd4ExJn20wt4iIaMGog85JOgx4O3AvcDzwftuPlwfLXA/8U7MpRkREk+qMPvpsYC/bt/QutP2kpNc1k1ZERLSlTtPQpsOLgKRvA9i+ppGsIiKiNXUKwQt6ZyRNAF7WTDoREdG2EQuBpA9Kehh4saSHyuth4B7gP1vLMCIiGjViIbD9aduTgM/ZXqu8Jtlez/YHW8wxIiIaNGJnsaTn274WOEPSNsPX27680cwiIqIVg64aeh9wIPCFPusM7NxIRhER0aoRC4HtA8u/O7WXTkREtG1Q09Beg3a0fdbyTyciIto2qGno9QPWGUghiIgYBwY1Db2zzUQiImJsDGoaeqvtkyUd3m+97S82l1ZERLRlUNPQGuXfSW0kEhERY2NQ09A3yr8fby+diIhoW51HVW4q6XuSFkm6R9J/Stq0jeQiIqJ5dQadOwU4HVgf2AA4Azi1yaQiIqI9dQrB6ra/bXtxeZ0MrNp0YhER0Y5BVw09u0z+QNIRwGlU9w+8GTinhdwiIqIFg64auozqi19l/uCedQYyAmlExDgw6KqhTdpMJCIixkadZxYj6YXADHr6Bmx/q6mkIiKiPXUuH/0Y8JXy2gn4LLBHnYNLmiXpOkkLSj/D8PWHSLpS0hWSfippxhLmHxERy6jOVUN7A7sAd5Xxh14CrD3aTuXZxscAu1OdTezX54v+FNsvsr01VYHJsBURES2rUwj+aPtJYLGktaieWbxhjf22BRbYvtH2Y1RXHe3Zu4Hth3pm16DqhI6IiBbV6SOYJ2kd4DiqK4l+D/yixn5Tgdt65hcC2w3fSNKhwOHARPLUs4iI1o16RmD7H2w/aPvrwG7AO5bnENW2j7G9GfAB4Mh+20g6SNI8SfMWLVq0vEJHRAT1moaQtJekLwLvBjareezbeXoT0rSybCSnAW/ot8L2sbZn2p45ZcqUmuEjIqKOOlcNfQ04BLgSuAo4WNIxNY59KbCFpE0kTQT2BeYOO/YWPbN/A1xfN/GIiFg+6vQR7Az8hW0DSDoJmD/aTrYXS5oNnAtMAE6wPV/SUcA823OB2ZJ2BR4HHgDesZTvIyIillKdQrAA2Ai4pcxvWJaNyvY5DBuXyPZHe6YPq5dmREQ0ZdCgc9+jupxzEnCNpEvKqm2BS0baLyIiViyDzgg+31oWERExZgYNOnfR0LSk5wIvL7OX2L6n6cQiIqIdda4a2oeqKehNwD7AxZL2bjqxiIhoR53O4g8DLx86C5A0BbgAOLPJxCIioh11bihbaVhT0H0194uIiBVAnTOCH0o6lz89sD6PqoyIGEcGFgJJAr5M1VH8qrL4WNtnN51YRES0Y2AhsG1J59h+EXBWSzlFRESL6rT1Xy7p5aNvFhERK6I6fQTbAW+VdDPwCCCqk4UXN5lYRES0o04h+KvGs4iIiDEzaKyh5wAfAjanGoL608MeLRkREePAoD6Cb1E1BX0FWJPq6qGIiBhnBjUNrW/7w2X6XEmXt5FQRES0a7T7CNal6hwGmNA7b/v+hnOLiIgWDCoEawOX8adCADB0VmBg06aSioiI9gwahnp6i3lERMQYyeBxEREdl0IQEdFxKQQRER036IayZw/aMVcNRUSMD4OuGrqM6uogARsBD5TpdYBbgU2aTi4iIpo3YtOQ7U1sb0r1WMrX255sez3gdcB5bSUYERHNqtNHsL3tp55IZvsHwA7NpRQREW2qM/roHZKOBE4u828B7mgupYiIaFOdM4L9gCnA2VRPKZtSlkVExDgw6hlBuTroMElr2H6khZwiIqJFo54RSNpB0tXANWX+JZK+1nhmERHRijpNQ0dTPaXsPgDbvwZ2bDKpiIhoT607i23fNmzREw3kEhERY6DOVUO3SdoBsKRVgMMozUQREbHiq3NGcAhwKDAVuB3YGviHBnOKiIgW1Tkj2Mr2W3oXSHol8LNmUoqIiDbVOSP4Ss1lf0bSLEnXSVog6Yg+6w+XdLWk30j6b0kb1zluREQsP4NGH30F1VASUyQd3rNqLWDCaAeWNAE4BtgNWAhcKmmu7at7NvsVMNP2HyT9PfBZ4M1L/jYiImJpDTojmAisSVUsJvW8HgL2rnHsbYEFtm+0/RhwGrBn7wa2f2T7D2X2l8C0JUs/IiKW1aBnFl8EXCTpRNu3LMWxpwK9l50uBLYbsP27gB8sRZyIiFgGdfoIjpe0ztCMpHUlnbs8k5D0VmAm8LkR1h8kaZ6keYsWLVqeoSMiOq9OIZhs+8GhGdsPAM+psd/twIY989PKsqeRtCvwYWAP24/2O5DtY23PtD1zypQpNUJHRERddQrBk5I2GpopV/a4xn6XAltI2kTSRGBfYG7vBpJeCnyDqgjcUz/tiIhYXurcR/Bh4KeSLqJ6VOWrgYNG28n2YkmzgXOprjI6wfZ8SUcB82zPpWoKWhM4QxLArbb3WLq3EhERS6POMNQ/lLQNsH1Z9B7b99Y5eHmy2TnDln20Z3rXJcg1IiIaMGLTkKTnl3+3oXp4/R3ltVFZFhER48CgM4L3AQcCX+izzsDOjWQUEbGUjj7/t43HeO9uWzYeo22D7iM4sPy7U3vpRERE2wYNMbHXoB1tn7X804mIiLYNahp6ffn3OVRjDl1Y5ncCfk71IPuIiFjBDWoaeieApPOAGbbvLPPrAye2kl1ERDSuzg1lGw4VgeJuqquIIiJiHKhzQ9l/l7GFTi3zbwYuaC6liIhoU50bymZL+ltgx7LoWNtnN5tWRES0pc4ZAcDlwMO2L5C0uqRJth9uMrGIiGjHqH0Ekg4EzqQaHA6q5wx8t8GcIiKiRXU6iw8FXkn1ZDJsX0+9YagjImIFUKcQPFoeNQmApJWpNwx1RESsAOoUgoskfQhYTdJuwBnA95pNKyIi2lKnEHwAWARcCRxMNaz0kU0mFRER7Rl41ZCkCcB8288HjmsnpYiIaNPAMwLbTwDX9T6qMiIixpc69xGsC8yXdAnwyNDCPFIyImJ8qFMIPtJ4FhERMWYGPY9gVeAQYHOqjuI5the3lVhERLRjU
B/BScBMqiKwO/0fWRkRESu4QU1DM2y/CEDSHOCSdlKKiIg2DTojeHxoIk1CERHj16AzgpdIeqhMi+rO4ofKtG2v1Xh2ERHRuEGPqpzQZiIRETE26gwxERER41gKQUREx6UQRER0XApBRETHpRBERHRcCkFERMelEEREdFwKQUREx6UQRER0XApBRETHNVoIJM2SdJ2kBZKO6LN+R0mXS1osae8mc4mIiP4aKwTlwffHUD3LYAawn6QZwza7FdgfOKWpPCIiYrA6j6pcWtsCC2zfCCDpNGBP4OqhDWzfXNY92WAeTzn6/N82HuO9u23ZeIyIiOWpyaahqcBtPfMLy7IlJukgSfMkzVu0aNFySS4iIiorRGex7WNtz7Q9c8qUKWOdTkTEuNJkIbgd2LBnflpZFhERzyBNFoJLgS0kbSJpIrAvMLfBeBERsRQaKwTlOcezgXOBa4DTbc+XdJSkPQAkvVzSQuBNwDckzW8qn4iI6K/Jq4awfQ5wzrBlH+2ZvpSqySgiIsbICtFZHBERzUkhiIjouBSCiIiOSyGIiOi4FIKIiI5LIYiI6LgUgoiIjkshiIjouBSCiIiOSyGIiOi4FIKIiI5LIYiI6LhGB52LyONBI575UggiIpaDFflHT5qGIiI6LoUgIqLj0jQU41rTp+vpn4jxIIUgYhxKAYwlkaahiIiOSyGIiOi4FIKIiI5LIYiI6LgUgoiIjkshiIjouBSCiIiOSyGIiOi4FIKIiI5LIYiI6LgUgoiIjkshiIjouBSCiIiOSyGIiOi4DEPdkhX5MXYRMb6lEEQ0JM8EiBVFo4VA0izg34AJwPG2PzNs/bOAbwEvA+4D3mz75iZziohm5ex3xdNYH4GkCcAxwO7ADGA/STOGbfYu4AHbmwNHA//aVD4REdFfk53F2wILbN9o+zHgNGDPYdvsCZxUps8EdpGkBnOKiIhhZLuZA0t7A7NsH1Dm3wZsZ3t2zzZXlW0Wlvkbyjb3DjvWQcBBZXYr4LpGku5vMnDvqFsldmIndmI/s2NvbHtKvxUrRGex7WOBY8citqR5tmcmdmIndmKPl9jDNdk0dDuwYc/8tLKs7zaSVgbWpuo0joiIljRZCC4FtpC0iaSJwL7A3GHbzAXeUab3Bi50U21VERHRV2NNQ7YXS5oNnEt1+egJtudLOgqYZ3suMAf4tqQFwP1UxeKZZkyapBI7sRM7sdvSWGdxRESsGDLWUEREx6UQRER0XArBCCTNknSdpAWSjmg59gmS7in3WbQZd0NJP5J0taT5kg5rMfaqki6R9OsS++Ntxe7JYYKkX0n6/hjEvlnSlZKukDSv5djrSDpT0rWSrpH0ipbiblXe79DrIUnvaSN2if/e8t/aVZJOlbRqi7EPK3Hnt/meR2Q7r2Evqs7tG4BNgYnAr4EZLcbfEdgGuKrl970+sE2ZngT8tq33DQhYs0yvAlwMbN/y+z8cOAX4fptxS+ybgcltxy2xTwIOKNMTgXXGIIcJwF1UNz21EW8qcBOwWpk/Hdi/pdgvBK4CVqe6YOcCYPOx+NsPvXJG0F+d4TEaY/t/qK6iapXtO21fXqYfBq6h+h+mjdi2/fsyu0p5tXYlg6RpwN8Ax7cV85lA0tpUPzzmANh+zPaDY5DKLsANtm9pMebKwGrlHqbVgTtaivsXwMW2/2B7MXARsFdLsftKIehvKnBbz/xCWvpCfKaQNB14KdUv87ZiTpB0BXAPcL7t1mIDXwL+CXiyxZi9DJwn6bIypEpbNgEWAd8szWLHS1qjxfhD9gVObSuY7duBzwO3AncCv7N9XkvhrwJeLWk9SasDf83Tb75tXQpB/BlJawLfAd5j+6G24tp+wvbWVHehbyvphW3ElfQ64B7bl7URbwSvsr0N1Wi9h0rasaW4K1M1Q/677ZcCjwBt94lNBPYAzmgx5rpUZ/mbABsAa0h6axuxbV9DNdLyecAPgSuAJ9qIPZIUgv7qDI8xLklahaoI/Ifts8Yih9I08SNgVkshXwnsIelmqmbAnSWd3FJs4KlfqNi+BzibqnmyDQuBhT1nX2dSFYY27Q5cbvvuFmPuCtxke5Htx4GzgB3aCm57ju2X2d4ReICqP27MpBD0V2d4jHGnDAE+B7jG9hdbjj1F0jplejVgN+DaNmLb/qDtabanU/2tL7Tdyq9DAElrSJo0NA38JVXzQeNs3wXcJmmrsmgX4Oo2YvfYjxabhYpbge0lrV7+u9+Fqk+sFZKeU/7diKp/4JS2YvezQow+2jaPMDxGW/ElnQq8FpgsaSHwMdtzWgj9SuBtwJWlrR7gQ7bPaSH2+sBJ5YFGKwGn2279Ms4x8lzg7PIojpWBU2z/sMX47wb+o/zouRF4Z1uBS+HbDTi4rZgAti+WdCZwObAY+BXtDvnwHUnrAY8Dh45RB/1TMsRERETHpWkoIqLjUggiIjouhSAiouNSCCIiOi6FICKi41IIIiI6LoUgIqLj/hdRB2LXFx7MKAAAAABJRU5ErkJggg==\n",
-      "text/plain": [
-       "<Figure size 432x288 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import numpy as np\n",
     "objects = [str(x) for x in range(10)]\n",
@@ -529,7 +133,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -548,39 +152,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Stopping http://0.0.0.0:8081\n",
-      "Serving '/tmp/LFCW1A1.onnx' at http://0.0.0.0:8081\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "        <iframe\n",
-       "            width=\"100%\"\n",
-       "            height=\"400\"\n",
-       "            src=\"http://0.0.0.0:8081/\"\n",
-       "            frameborder=\"0\"\n",
-       "            allowfullscreen\n",
-       "        ></iframe>\n",
-       "        "
-      ],
-      "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f3a27be9ac8>"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "showInNetron('/tmp/LFCW1A1.onnx')"
    ]
@@ -603,27 +177,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "input: \"37\"\n",
-       "input: \"38\"\n",
-       "output: \"40\"\n",
-       "op_type: \"MatMul\""
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from finn.core.modelwrapper import ModelWrapper\n",
     "model = ModelWrapper(export_onnx_path)\n",
-    "model.graph.node[9]"
+    "model.graph.node[8]"
    ]
   },
   {
@@ -635,28 +195,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[-1., -1.,  1., ..., -1.,  1., -1.],\n",
-       "       [ 1.,  1., -1., ...,  1., -1.,  1.],\n",
-       "       [-1., -1., -1., ...,  1., -1.,  1.],\n",
-       "       ...,\n",
-       "       [ 1., -1., -1., ..., -1., -1.,  1.],\n",
-       "       [ 1., -1., -1., ...,  1.,  1.,  1.],\n",
-       "       [ 1., -1.,  1., ...,  1., -1.,  1.]], dtype=float32)"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "model.get_initializer(model.graph.node[9].input[1])"
+    "model.get_initializer(model.graph.node[8].input[1])"
    ]
   },
   {
@@ -668,42 +211,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<DataType.BIPOLAR: 34>"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "model.get_tensor_datatype(model.graph.node[9].input[1])"
+    "model.get_tensor_datatype(model.graph.node[8].input[1]).name"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[784, 1024]"
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "model.get_tensor_shape(model.graph.node[9].input[1])"
+    "model.get_tensor_shape(model.graph.node[8].input[1])"
    ]
   },
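> The three cells above inspect the weight input of the same MatMul node one property at a time; the checks bundle naturally into one helper. A sketch using only `ModelWrapper` calls that already appear in this notebook:

```python
from finn.core.modelwrapper import ModelWrapper

def describe_weight(model: ModelWrapper, node_idx: int):
    # Print op type, FINN datatype, shape and values of a node's weight input.
    node = model.graph.node[node_idx]
    w_name = node.input[1]
    print("op_type: ", node.op_type)
    print("datatype:", model.get_tensor_datatype(w_name).name)
    print("shape:   ", model.get_tensor_shape(w_name))
    print("values:\n", model.get_initializer(w_name))

describe_weight(model, 8)  # the MatMul inspected above
```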
   {
@@ -715,7 +236,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -729,39 +250,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Stopping http://0.0.0.0:8081\n",
-      "Serving '/tmp/LFCW1A1-clean.onnx' at http://0.0.0.0:8081\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "        <iframe\n",
-       "            width=\"100%\"\n",
-       "            height=\"400\"\n",
-       "            src=\"http://0.0.0.0:8081/\"\n",
-       "            frameborder=\"0\"\n",
-       "            allowfullscreen\n",
-       "        ></iframe>\n",
-       "        "
-      ],
-      "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f3a27b49e10>"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "showInNetron('/tmp/LFCW1A1-clean.onnx')"
    ]
@@ -775,22 +266,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[-1.3736125, -3.5715756,  0.1768887, -1.9529207, -2.1233053,\n",
-       "        -3.9293835, -2.1914592, -3.9634604, -0.7772659, -1.9869976]],\n",
-       "      dtype=float32)"
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import finn.core.onnx_exec as oxe\n",
     "input_dict = {\"0\": nph.to_array(input_tensor)}\n",
@@ -802,20 +280,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "np.isclose(produced, produced_finn).all()"
    ]
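> The glue between building `input_dict` and the `np.isclose` check is elided by the diff. A sketch of the usual pattern with FINN's ONNX executor (`oxe.execute_onnx` is the call this notebook's import provides; `produced` is the Brevitas output from earlier):

```python
import numpy as np
import onnx.numpy_helper as nph
import finn.core.onnx_exec as oxe

input_dict = {"0": nph.to_array(input_tensor)}
output_dict = oxe.execute_onnx(model, input_dict)
# take the (single) graph output, whatever its tensor name is
produced_finn = output_dict[list(output_dict.keys())[0]]
assert np.isclose(produced, produced_finn).all()
```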
@@ -844,7 +311,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
index a141caf423f5238245b509e077d32c6bd1a85fcd..2d668f3e041e54bd82e79a79efca8a82210bfcbc 100644
--- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
@@ -71,14 +71,25 @@
    "source": [
     "## 1. Brevitas Export, FINN Import and Tidy-Up\n",
     "\n",
-    "Similar to what we did in the TFC-w1a1 end-to-end notebook, we will start by exporting the [pretrained CNV-w1a1 network](https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq) to ONNX, importing that into FINN and running the \"tidy-up\" transformations to have a first look at the topology."
+    "Similar to what we did in the TFC-w1a1 end-to-end notebook, we will start by exporting the [pretrained CNV-w1a1 network](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq) to ONNX, importing that into FINN and running the \"tidy-up\" transformations to have a first look at the topology."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/brevitas/src/brevitas_examples/bnn_pynq/models/CNV.py:106: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
+      "  x = 2.0 * x - torch.tensor([1.0], device=x.device)\n",
+      "/workspace/brevitas/src/brevitas/quant_tensor/__init__.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
+      "  training = torch.tensor(training, dtype=torch.bool)\n"
+     ]
+    }
+   ],
    "source": [
     "import onnx\n",
     "from finn.util.test import get_test_model_trained\n",
@@ -108,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -125,17 +136,17 @@
        "        <iframe\n",
        "            width=\"100%\"\n",
        "            height=\"400\"\n",
-       "            src=\"http://0.0.0.0:8081/\"\n",
+       "            src=\"http://localhost:8081/\"\n",
        "            frameborder=\"0\"\n",
        "            allowfullscreen\n",
        "        ></iframe>\n",
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f25b19194a8>"
+       "<IPython.lib.display.IFrame at 0x7f912af76550>"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -157,19 +168,19 @@
    "source": [
     "### Adding Pre- and Postprocessing <a id='prepost'></a>\n",
     "\n",
-    "TODO"
+    "Preprocessing and postprocessing steps can be added directly in the ONNX graph. In this case, the preprocessing step divides the input `uint8` data by 255 so the inputs to the CNV-w1a1 network are bounded between [0, 1]. The postprocessing step takes the output of the network and returns the index (0-9) of the image category with the highest probability (top-1). "
    ]
   },
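> The cells below merge a `pre_model` in front of the network, but the diff context never shows how that preprocessing model is produced. A sketch of the usual construction for this FINN era (module paths and the exporter name are assumptions and may differ between releases):

```python
# Export a tiny "divide by 255" graph and merge it in front of the CNV.
import brevitas.onnx as bo
from finn.util.pytorch import ToTensor
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.merge_onnx_models import MergeONNXModels

ishape = (1, 3, 32, 32)  # CIFAR-10 input for CNV-w1a1
preproc_onnx = build_dir + "/end2end_cnv_w1a1_preproc.onnx"
bo.export_finn_onnx(ToTensor(), ishape, preproc_onnx)
pre_model = ModelWrapper(preproc_onnx)
model = model.transform(MergeONNXModels(pre_model))
```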
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/workspace/finn/src/finn/transformation/infer_data_layouts.py:113: UserWarning: Assuming 4D input is NCHW\n",
+      "/workspace/finn-base/src/finn/transformation/infer_data_layouts.py:114: UserWarning: Assuming 4D input is NCHW\n",
       "  warnings.warn(\"Assuming 4D input is NCHW\")\n"
      ]
     }
@@ -192,19 +203,40 @@
     "model = model.transform(MergeONNXModels(pre_model))\n",
     "# add input quantization annotation: UINT8 for all BNN-PYNQ models\n",
     "global_inp_name = model.graph.input[0].name\n",
-    "model.set_tensor_datatype(global_inp_name, DataType.UINT8)"
+    "model.set_tensor_datatype(global_inp_name, DataType[\"UINT8\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.transformation.insert_topk import InsertTopK\n",
+    "from finn.transformation.infer_datatypes import InferDataTypes\n",
+    "\n",
+    "# postprocessing: insert Top-1 node at the end\n",
+    "model = model.transform(InsertTopK(k=1))\n",
+    "chkpt_name = build_dir+\"/end2end_cnv_w1a1_pre_post.onnx\"\n",
+    "# tidy-up again\n",
+    "model = model.transform(InferShapes())\n",
+    "model = model.transform(FoldConstants())\n",
+    "model = model.transform(GiveUniqueNodeNames())\n",
+    "model = model.transform(GiveReadableTensorNames())\n",
+    "model = model.transform(InferDataTypes())\n",
+    "model = model.transform(RemoveStaticGraphInputs())\n",
+    "model.save(chkpt_name)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
       "Stopping http://0.0.0.0:8081\n",
       "Serving '/workspace/finn/end2end_cnv_w1a1_pre_post.onnx' at http://0.0.0.0:8081\n"
      ]
@@ -216,37 +248,22 @@
        "        <iframe\n",
        "            width=\"100%\"\n",
        "            height=\"400\"\n",
-       "            src=\"http://0.0.0.0:8081/\"\n",
+       "            src=\"http://localhost:8081/\"\n",
        "            frameborder=\"0\"\n",
        "            allowfullscreen\n",
        "        ></iframe>\n",
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f25b1919518>"
+       "<IPython.lib.display.IFrame at 0x7f8ffd85a760>"
       ]
      },
-     "execution_count": 29,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "from finn.transformation.insert_topk import InsertTopK\n",
-    "from finn.transformation.infer_datatypes import InferDataTypes\n",
-    "\n",
-    "# postprocessing: insert Top-1 node at the end\n",
-    "model = model.transform(InsertTopK(k=1))\n",
-    "chkpt_name = build_dir+\"/end2end_cnv_w1a1_pre_post.onnx\"\n",
-    "# tidy-up again\n",
-    "model = model.transform(InferShapes())\n",
-    "model = model.transform(FoldConstants())\n",
-    "model = model.transform(GiveUniqueNodeNames())\n",
-    "model = model.transform(GiveReadableTensorNames())\n",
-    "model = model.transform(InferDataTypes())\n",
-    "model = model.transform(RemoveStaticGraphInputs())\n",
-    "model.save(chkpt_name)\n",
-    "\n",
     "showInNetron(build_dir+\"/end2end_cnv_w1a1_pre_post.onnx\")"
    ]
   },
@@ -268,7 +285,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -311,14 +328,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
       "Stopping http://0.0.0.0:8081\n",
       "Serving '/workspace/finn/end2end_cnv_w1a1_streamlined.onnx' at http://0.0.0.0:8081\n"
      ]
@@ -330,17 +346,17 @@
        "        <iframe\n",
        "            width=\"100%\"\n",
        "            height=\"400\"\n",
-       "            src=\"http://0.0.0.0:8081/\"\n",
+       "            src=\"http://localhost:8081/\"\n",
        "            frameborder=\"0\"\n",
        "            allowfullscreen\n",
        "        ></iframe>\n",
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f25b19a9470>"
+       "<IPython.lib.display.IFrame at 0x7f91ac6e6f70>"
       ]
      },
-     "execution_count": 31,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -360,9 +376,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/finn/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py:591: UserWarning: Clipping some thresholds in \n",
+      "  warnings.warn(\"Clipping some thresholds in %s\" % self.onnx_node.name)\n"
+     ]
+    }
+   ],
    "source": [
     "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n",
     "from finn.transformation.fpgadataflow.create_dataflow_partition import (\n",
@@ -409,7 +434,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 10,
    "metadata": {
     "scrolled": false
    },
@@ -418,7 +443,6 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
       "Stopping http://0.0.0.0:8081\n",
       "Serving '/workspace/finn/end2end_cnv_w1a1_dataflow_parent.onnx' at http://0.0.0.0:8081\n"
      ]
@@ -430,17 +454,17 @@
        "        <iframe\n",
        "            width=\"100%\"\n",
        "            height=\"400\"\n",
-       "            src=\"http://0.0.0.0:8081/\"\n",
+       "            src=\"http://localhost:8081/\"\n",
        "            frameborder=\"0\"\n",
        "            allowfullscreen\n",
        "        ></iframe>\n",
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f25b18b7668>"
+       "<IPython.lib.display.IFrame at 0x7f8ffd85ae20>"
       ]
      },
-     "execution_count": 36,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -458,14 +482,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
       "Stopping http://0.0.0.0:8081\n",
       "Serving '/workspace/finn/end2end_cnv_w1a1_dataflow_model.onnx' at http://0.0.0.0:8081\n"
      ]
@@ -477,17 +500,17 @@
        "        <iframe\n",
        "            width=\"100%\"\n",
        "            height=\"400\"\n",
-       "            src=\"http://0.0.0.0:8081/\"\n",
+       "            src=\"http://localhost:8081/\"\n",
        "            frameborder=\"0\"\n",
        "            allowfullscreen\n",
        "        ></iframe>\n",
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f25b18fe860>"
+       "<IPython.lib.display.IFrame at 0x7f8ffd832280>"
       ]
      },
-     "execution_count": 33,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -505,7 +528,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -549,14 +572,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
       "Stopping http://0.0.0.0:8081\n",
       "Serving '/workspace/finn/end2end_cnv_w1a1_folded.onnx' at http://0.0.0.0:8081\n"
      ]
@@ -568,17 +590,17 @@
        "        <iframe\n",
        "            width=\"100%\"\n",
        "            height=\"400\"\n",
-       "            src=\"http://0.0.0.0:8081/\"\n",
+       "            src=\"http://localhost:8081/\"\n",
        "            frameborder=\"0\"\n",
        "            allowfullscreen\n",
        "        ></iframe>\n",
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f252e5a6278>"
+       "<IPython.lib.display.IFrame at 0x7f8ff1243af0>"
       ]
      },
-     "execution_count": 35,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -605,11 +627,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/finn/src/finn/transformation/fpgadataflow/floorplan.py:107: UserWarning: 32 nodes have no entry in the provided floorplan, SLR was set to -1\n",
+      "  warnings.warn(\n",
+      "/workspace/finn/src/finn/transformation/fpgadataflow/insert_fifo.py:154: UserWarning: Overriding input FIFO depth to 32\n",
+      "  warnings.warn(\"Overriding input FIFO depth to 32\")\n",
+      "/workspace/finn/src/finn/transformation/fpgadataflow/insert_fifo.py:200: UserWarning: Overriding output FIFO depth to 32\n",
+      "  warnings.warn(\"Overriding output FIFO depth to 32\")\n"
+     ]
+    }
+   ],
    "source": [
-    "test_pynq_board = \"Pynq-Z1\"\n",
+    "test_pynq_board = \"Pynq-Z2\"\n",
     "target_clk_ns = 10\n",
     "\n",
     "from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild\n",
@@ -631,18 +666,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Welcome to PYNQ Linux, based on Ubuntu 18.04 (GNU/Linux 5.4.0-xilinx-v2020.1 armv7l)\r\n",
+      "Welcome to PYNQ Linux, based on Ubuntu 18.04 (GNU/Linux 4.19.0-xilinx-v2019.1 armv7l)\r\n",
       "\r\n",
-      " * Pure upstream Kubernetes 1.21, smallest, simplest cluster ops!\r\n",
+      " * Super-optimized for small spaces - read how we shrank the memory\r\n",
+      "   footprint of MicroK8s to make it the smallest full K8s around.\r\n",
       "\r\n",
-      "     https://microk8s.io/\r\n"
+      "   https://ubuntu.com/blog/microk8s-memory-optimisation\r\n"
      ]
     }
    ],
@@ -665,7 +701,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -678,16 +714,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'/home/xilinx/finn_dev_maltanar/pynq_deployment_obskagv5'"
+       "'/home/xilinx/finn_dev_jduarte/pynq_deployment_yrxnwrak'"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -699,19 +735,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "total 4216\r\n",
-      "-rw-r--r-- 1 xilinx xilinx    8508 Sep 21 13:19 driver.py\r\n",
-      "drwxr-xr-x 4 xilinx xilinx    4096 Sep 21 13:19 finn\r\n",
-      "-rw-r--r-- 1 xilinx xilinx 4045671 Sep 21 13:19 resizer.bit\r\n",
-      "-rw-r--r-- 1 xilinx xilinx  246205 Sep 21 13:19 resizer.hwh\r\n",
-      "-rw-r--r-- 1 xilinx xilinx    1727 Sep 21 13:19 validate.py\r\n"
+      "total 4240\r\n",
+      "-rw-rw-r-- 1 xilinx xilinx   18616 Jun 28 20:42 driver_base.py\r\n",
+      "-rw-r--r-- 1 xilinx xilinx    4868 Jun 28 20:42 driver.py\r\n",
+      "drwxr-xr-x 4 xilinx xilinx    4096 Jun 28 20:42 finn\r\n",
+      "-rw-r--r-- 1 xilinx xilinx 4045671 Jun 28 20:42 resizer.bit\r\n",
+      "-rw-r--r-- 1 xilinx xilinx  247083 Jun 28 20:42 resizer.hwh\r\n",
+      "drwxr-xr-x 2 xilinx xilinx    4096 Jun 28 20:42 runtime_weights\r\n",
+      "-rw-rw-r-- 1 xilinx xilinx    4107 Jun 28 20:42 validate.py\r\n"
      ]
     }
    ],
@@ -728,16 +766,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<matplotlib.image.AxesImage at 0x7f89a07e6eb8>"
+       "<matplotlib.image.AxesImage at 0x7f917faeb6d0>"
       ]
      },
-     "execution_count": 2,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -774,7 +812,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -791,7 +829,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -800,7 +838,7 @@
        "array([[3.]], dtype=float32)"
       ]
      },
-     "execution_count": 44,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -836,7 +874,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -846,7 +884,7 @@
       "[sudo] password for xilinx: Requirement already satisfied: dataset_loading from git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading in /usr/local/lib/python3.6/dist-packages\n",
       "Requirement already satisfied: Pillow in /usr/lib/python3/dist-packages (from dataset_loading)\n",
       "Requirement already satisfied: scipy in /usr/lib/python3/dist-packages (from dataset_loading)\n",
-      "Connection to 192.168.2.99 closed.\n"
+      "Connection to 99.121.248.96 closed.\n"
      ]
     }
    ],
@@ -867,7 +905,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -877,18 +915,18 @@
       "[sudo] password for xilinx: Tar File found in dest_dir. Not Downloading again\n",
       "Extracting Python CIFAR10 data.\n",
       "Files extracted\n",
-      "batch 0 / 10 : total OK 851 NOK 149\n",
-      "batch 1 / 10 : total OK 1683 NOK 317\n",
-      "batch 2 / 10 : total OK 2522 NOK 478\n",
-      "batch 3 / 10 : total OK 3370 NOK 630\n",
-      "batch 4 / 10 : total OK 4207 NOK 793\n",
-      "batch 5 / 10 : total OK 5044 NOK 956\n",
-      "batch 6 / 10 : total OK 5887 NOK 1113\n",
-      "batch 7 / 10 : total OK 6728 NOK 1272\n",
-      "batch 8 / 10 : total OK 7570 NOK 1430\n",
-      "batch 9 / 10 : total OK 8419 NOK 1581\n",
+      "batch 1 / 10 : total OK 851 NOK 149\n",
+      "batch 2 / 10 : total OK 1683 NOK 317\n",
+      "batch 3 / 10 : total OK 2522 NOK 478\n",
+      "batch 4 / 10 : total OK 3370 NOK 630\n",
+      "batch 5 / 10 : total OK 4207 NOK 793\n",
+      "batch 6 / 10 : total OK 5044 NOK 956\n",
+      "batch 7 / 10 : total OK 5887 NOK 1113\n",
+      "batch 8 / 10 : total OK 6728 NOK 1272\n",
+      "batch 9 / 10 : total OK 7570 NOK 1430\n",
+      "batch 10 / 10 : total OK 8419 NOK 1581\n",
       "Final accuracy: 84.190000\n",
-      "Connection to 192.168.2.99 closed.\n"
+      "Connection to 99.121.248.96 closed.\n"
      ]
     }
    ],
@@ -900,7 +938,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We see that the final top-1 accuracy is 84.19%, which is very close to the 84.22% reported on the [BNN-PYNQ accuracy table in Brevitas](https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq). "
+    "We see that the final top-1 accuracy is 84.19%, which is very close to the 84.22% reported on the [BNN-PYNQ accuracy table in Brevitas](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq). "
    ]
   },
   {
@@ -927,7 +965,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
index 5ed4b170b4eeee4b438d9539d2317a7d5eab5df2..a1a8450225f6bd375443a10a66739ca9dd00017e 100644
--- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
@@ -384,7 +384,7 @@
     "model = model.transform(MergeONNXModels(pre_model))\n",
     "# add input quantization annotation: UINT8 for all BNN-PYNQ models\n",
     "global_inp_name = model.graph.input[0].name\n",
-    "model.set_tensor_datatype(global_inp_name, DataType.UINT8)\n",
+    "model.set_tensor_datatype(global_inp_name, DataType[\"UINT8\"])\n",
     "\n",
     "model.save(build_dir+\"/tfc_w1_a1_with_preproc.onnx\")\n",
     "showInNetron(build_dir+\"/tfc_w1_a1_with_preproc.onnx\")"
@@ -1799,7 +1799,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
index e0ce00c1beefe8172ac5fd2aeaaa076b9bb574c1..2c9f4a99ed3edd05a8e8d32db2fe6bcdad204716 100644
--- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
+++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
@@ -98,7 +98,7 @@
     "\n",
     "Following Murovic and Trost's open-source implementation provided as a Matlab script [here](https://github.com/TadejMurovic/BNN_Deployment/blob/master/cybersecurity_dataset_unswb15.m), we've created a [Python version](dataloader_quantized.py).\n",
     "\n",
-    "<font color=\"red\">**FPGA'21 tutorial:** Downloading the original dataset and quantizing it can take some time, so we provide a download link to the pre-quantized version for your convenience. </font>"
+    "<font color=\"red\">**Live FINN tutorial:** Downloading the original dataset and quantizing it can take some time, so we provide a download link to the pre-quantized version for your convenience. </font>"
    ]
   },
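> Once the pre-quantized archive has been downloaded (next cell), a quick inspection confirms what it contains. A sketch; the array names inside the `.npz` are not shown in this diff, so we only list them:

```python
import numpy as np

data = np.load("unsw_nb15_binarized.npz")
print(list(data.keys()))                        # e.g. train/test splits
print({k: data[k].shape for k in data.keys()})  # array shapes per split
```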
   {
@@ -110,16 +110,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "--2021-05-10 18:14:00--  https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1\n",
+      "--2021-10-12 15:49:17--  https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1\n",
       "Resolving zenodo.org (zenodo.org)... 137.138.76.77\n",
       "Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.\n",
       "HTTP request sent, awaiting response... 200 OK\n",
       "Length: 13391907 (13M) [application/octet-stream]\n",
       "Saving to: ‘unsw_nb15_binarized.npz’\n",
       "\n",
-      "unsw_nb15_binarized 100%[===================>]  12.77M  3.96MB/s    in 3.4s    \n",
+      "unsw_nb15_binarized 100%[===================>]  12.77M  3.56MB/s    in 3.7s    \n",
       "\n",
-      "2021-05-10 18:14:04 (3.77 MB/s) - ‘unsw_nb15_binarized.npz’ saved [13391907/13391907]\n",
+      "2021-10-12 15:49:22 (3.44 MB/s) - ‘unsw_nb15_binarized.npz’ saved [13391907/13391907]\n",
       "\n"
      ]
     }
@@ -422,9 +422,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Training loss:   0%|          | 0/10 [00:00<?, ?it/s]/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py:130: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at  /opt/conda/conda-bld/pytorch_1607370172916/work/c10/cuda/CUDAFunctions.cpp:100.)\n",
-      "  Variable._execution_engine.run_backward(\n",
-      "Training loss = 0.131708 test accuracy = 0.805398: 100%|██████████| 10/10 [01:04<00:00,  6.42s/it]\n"
+      "Training loss = 0.132918 test accuracy = 0.798341: 100%|██████████| 10/10 [00:44<00:00,  4.45s/it]\n"
      ]
     }
    ],
@@ -459,7 +457,7 @@
    "outputs": [
     {
      "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEWCAYAAABxMXBSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAApFUlEQVR4nO3df5xddX3n8dd77vzK/LyTzJCfMyRAAAPi3O6AP6i0KrbAtsB2qYK/oGtLbYvVRa2ou9pl5bFWWrXuUhdWxVqpqKglVRCsAtJWMIGEQIKBGAJJSMjk9ySTzM/P/nHPTO5MJsncZG7unZn38/G4j3vP9/y4n3Mheeec7znfo4jAzMxsvMqKXYCZmU0uDg4zM8uLg8PMzPLi4DAzs7w4OMzMLC8ODjMzy4uDw+w4SLpf0rUTvWyeNfympE0TvV2zYykvdgFmJ4ukfTmTNUAPMJBM/3FE3DXebUXEpYVY1mwycHDYtBERdUOfJW0A/jAi/mX0cpLKI6L/ZNZmNpn4VJVNe0OnfCR9VNJW4E5JTZJ+IKlT0q7k84KcdR6W9IfJ5+sk/aukv06WfUHSpce57CJJP5PUJelfJN0m6Rvj3I9XJd+1W9JqSZfnzLtM0ppku5slfThpb072bbeknZIeleS/F+yo/D+IWdYcYCZwKnA92T8bdybTbcAB4P8cZf3XAmuBZuCzwFck6TiW/UfgF8As4C+Bd4+neEkVwD8DDwKnAO8H7pJ0VrLIV8iejqsHzgV+mrR/CNgEtACzgY8DHofIjsrBYZY1CHwqInoi4kBE7IiI70ZEd0R0AbcAv3GU9V+MiP8XEQPA3wNzyf5FPO5lJbUB5wOfjIjeiPhXYOk4638dUAd8Jln3p8APgGuS+X3AEkkNEbErIp7MaZ8LnBoRfRHxaHgAOzsGB4dZVmdEHByakFQj6XZJL0raC/wMSEtKHWH9rUMfIqI7+ViX57LzgJ05bQAbx1n/PGBjRAzmtL0IzE8+/2fgMuBFSY9Ien3SfiuwDnhQ0npJN43z+2wac3CYZY3+V/aHgLOA10ZEA3BR0n6k008TYQswU1JNTlvrONd9GWgd1T/RBmwGiIhlEXEF2dNY/wR8O2nviogPRcRpwOXAjZLecmK7YVOdg8NsbPVk+zV2S5oJfKrQXxgRLwLLgb+UVJkcFfzuOFd/HOgG/kJShaTfTNa9O9nWOyU1RkQfsJfsqTkk/Y6kM5I+lj1kL08eHPMbzBIODrOxfQGYAWwHHgN+dJK+953A64EdwKeBb5G93+SoIqKXbFBcSrbmvwPeExG/TBZ5N7AhOe32vuR7ABYD/wLsA34O/F1EPDRhe2NTktwPZla6JH0L+GVEFPyIx2y8fMRhVkIknS/pdEllki4BriDbJ2FWMnznuFlpmQN8j+x9HJuAP4mIFcUtyWwkn6oyM7O8+FSVmZnlZVqcqmpubo6FCxcWuwwzs0nliSee2B4RLaPbp0VwLFy4kOXLlxe7DDOzSUXSi2O1+1SVmZnlxcFhZmZ5cXCYmVleHBxmZpYXB4eZmeXFwWFmZnlxcJiZWV4cHEdx78rNfOOxMS9jNjObthwcR/HA6q3c/rNfFbsMM7OS4uA4ivbWNBt3HmD7vmM+R8fMbNooaHBIukTSWknrJN00xvz3SXpa0kpJ/yppSc68jyXrrZX02+Pd5kTKtDUBsPKl3YX8GjOzSaVgwSEpBdxG9lGWS4BrcoMh8Y8R8eqIaAc+C3wuWXcJcDVwDnAJ8HeSUuPc5oQ5d14j5WVixcZdhfoKM7NJp5BHHBcA6yJiffI85LvJPs1sWETszZmsBYYeDnIFcHdE9ETEC8C6ZHvH3OZEmlGZ4lVzG1jhIw4zs2GFDI75wMac6U1J2wiS/kzSr8gecfz5MdYd1zaT7V4vabmk5Z2dnce9E5m2NE9t3M3AoB94ZWYGJdA5HhG3RcTpwEeB/zaB270jIjoioqOl5bDh5Mct05Zmf+8Az2/rmqjSzMwmtUIGx2agNWd6QdJ2JHcDVx5j3Xy3ecLaW91BbmaWq5DBsQxYLGmRpEqynd1LcxeQtDhn8j8CzyeflwJXS6qStAhYDPxiPNucaAtn1ZCuqXA/h5lZomBPAIyIfkk3AA8AKeCrEbFa0s3A8ohYCtwg6WKgD9gFXJusu1rSt4E1QD/wZxExADDWNgu1D8n3kWlN+8oqM7NEQR8dGxH3AfeNavtkzucPHGXdW4BbxrPNQsu0NfHwc53sPdhHQ3XFyfxqM7OSU/TO8ckg05YmAlZt3FPsUszMis7BMQ6vaU0jwYqXfLrKzMzBMQ4N1RWc3lLHyo27i12KmVnROTjGKdtBvpsI3whoZtObg2OcMm1N7Nzfy0s7u4tdiplZUTk4xinTlgbw/RxmNu05OMbpzNn11FSm3EFuZtOeg2OcUmXivAWNrHAHuZlNcw6OPGTamljz8l4O9g0UuxQzs6JxcOQh05qmfzBY/bJvBDSz6cvBkYd2d5CbmTk48nFKfTULmmY4OMxsWnNw5CnT1uQrq8xsWnNw5Km9Nc3Lew7yyt6DxS7FzKwoHBx58o2AZjbdOTjydM68BipTZX6wk5lNWw6OPFWVp1gyr8FHHGY2bTk4jkOmLc2qTbvpHxgsdilmZiedg+M4ZNqaONg3yC+3dhW7FDOzk87BcRwyrWkAP9jJzKalggaHpEskrZW0TtJNY8y/UdIaSask/UTSqUn7myStzHkdlHRlMu9rkl7ImddeyH0Yy4KmGTTXVbqfw8ympfJCbVhSCrgNeCuwCVgmaWlErMlZbAXQERHdkv4E+Czw9oh4CGhPtjMTWAc8mLPeRyLinkLVfiySaG9t8pVVZjYtFfKI4wJgXUSsj4he4G7gitwFIuKhiBh6pN5jwIIxtnMVcH/OciUh05Zmfed+dnf3FrsUM7OTqpDBMR/YmDO9KWk7kvcC94/RfjXwzVFttySntz4vqWqsjUm6XtJyScs7OzvzqXtchm4EdD+HmU03JdE5LuldQAdw66j2ucCrgQdymj8GnA2cD8wEPjrWNiPijojoiIiOlpaWCa/5vAVpJN9BbmbTTyGDYzPQmjO9IGkbQdLFwCeAyyOiZ9TstwHfj4i+oYaI2BJZPcCdZE+JnXR1VeWcNbveRxxmNu0UMjiWAYslLZJUSfaU09LcBSRlgNvJhsa2MbZxDaNOUyVHIUgScCXwzMSXPj6ZtjQrN+5mcDCKVYKZ2UlXsOCIiH7gBrKnmZ4Fvh0RqyXdLOnyZLFbgTrgO8mltcPBImkh2SOWR0Zt+i5JTwNPA83Apwu1D8eSaW1iz4E+Xtixv1glmJmddAW7HBcgIu4D7hvV9smczxcfZd0NjNGZHhFvnsAST0juSLmnt9QVtxgzs5OkJDrHJ6vTW+qoryr3g53MbFpxcJyAsjLxmta0r6wys2nFwXGCMm1p1r7SRXdvf7FLMTM7KRwcJyjTlmZgMHh6055il2JmdlI4OE5Qe2sTACt8P4eZTRMOjhM0s7aShbNq3EFuZtOGg2MCZNqaWPHSbiJ8I6
CZTX0OjgnQ3ppmW1cPW/YcLHYpZmYF5+CYALk3ApqZTXUOjglw9pwGqsrL3M9hZtOCg2MCVJaX8er5jb6yysymBQfHBMm0pXl68x56+weLXYqZWUE5OCZIe2sTvf2DPLtlb7FLMTMrKAfHBPGjZM1sunBwTJC5jdXMbqhyB7mZTXkOjgkiiUxrkzvIzWzKc3BMoExbmhd3dLNj3+hHp5uZTR0OjgmUacsOeOh+DjObyhwcE+jV8xtJlcl3kJvZlFbQ4JB0iaS1ktZJummM+TdKWiNplaSfSDo1Z96ApJXJa2lO+yJJjyfb/JakykLuQz5mVKY4e069jzjMbEorWHBISgG3AZcCS4BrJC0ZtdgKoCMizgPuAT6bM+9ARLQnr8tz2v8K+HxEnAHsAt5bqH04Hpm2NCs37mZg0CPlmtnUVMgjjguAdRGxPiJ6gbuBK3IXiIiHIqI7mXwMWHC0DUoS8GayIQPw98CVE1n0icq0NrGvp59fde4rdilmZgVRyOCYD2zMmd6UtB3Je4H7c6arJS2X9JikK5O2WcDuiBh6wPextnnSHRop1/dzmNnUVBKd45LeBXQAt+Y0nxoRHcA7gC9IOj3PbV6fBM/yzs7OCaz26BY119I4o8Id5GY2ZRUyODYDrTnTC5K2ESRdDHwCuDwihm+AiIjNyft64GEgA+wA0pLKj7bNZL07IqIjIjpaWlpOfG/GSRLtrWl3kJvZlFXI4FgGLE6ugqoErgaW5i4gKQPcTjY0tuW0N0mqSj43AxcCayL7bNaHgKuSRa8F7i3gPhyXTFuata90sa+n/9gLm5lNMgULjqQf4gbgAeBZ4NsRsVrSzZKGrpK6FagDvjPqsttXAcslPUU2KD4TEWuSeR8FbpS0jmyfx1cKtQ/HK9PWRASs8lGHmU1B5cde5PhFxH3AfaPaPpnz+eIjrPfvwKuPMG892Su2Slb7gjQAKzbu5g1nNBe3GDOzCVYSneNTTWNNBae31PrKKjObkhwcBdLe2sSKl3aT7ZYxM5s6HBwFkmlLs2N/L5t2HSh2KWZmE8rBUSBDNwI+6dNVZjbFODgK5KzZ9cyoSPlGQDObchwcBVKeKuO8BY1+IqCZTTkOjgLKtDWx5uU9HOwbKHYpZmYTxsFRQO2tafoGgtUv7y12KWZmE8bBUUBDHeQet8rMphIHRwHNbqhmfnqGbwQ0synFwVFg7W1pX1llZlOKg6PAMq1pNu8+wLa9B4tdipnZhHBwFFimrQnAl+Wa2ZTh4Ciwc+Y1UJGST1eZ2ZTh4Ciw6ooUS+Y2sHKjO8jNbGpwcJwEmbYmVm3aQ//AYLFLMTM7YQ6OkyDTlqa7d4DnXtlX7FLMzE6Yg+MkyLQOdZD7dJWZTX4OjpOgdeYMZtZWuoPczKYEB8dJIIlMa9pDj5jZlFDQ4JB0iaS1ktZJummM+TdKWiNplaSfSDo1aW+X9HNJq5N5b89Z52uSXpC0Mnm1F3IfJkqmLc26bfvYc6Cv2KWYmZ2QggWHpBRwG3ApsAS4RtKSUYutADoi4jzgHuCzSXs38J6IOAe4BPiCpHTOeh+JiPbktbJQ+zCRhm4EfMpHHWY2yRXyiOMCYF1ErI+IXuBu4IrcBSLioYjoTiYfAxYk7c9FxPPJ55eBbUBLAWstuPMWNCLhfg4zm/TGFRySaiWVJZ/PlHS5pIpjrDYf2JgzvSlpO5L3AveP8d0XAJXAr3Kab0lOYX1eUtURar5e0nJJyzs7O49RauHVV1dw5in1vrLKzCa98R5x/AyoljQfeBB4N/C1iSpC0ruADuDWUe1zgX8A/iAihu6e+xhwNnA+MBP46FjbjIg7IqIjIjpaWkrjYKU96SCPiGKXYmZ23MYbHEpOKf0e8HcR8fvAOcdYZzPQmjO9IGkbuWHpYuATwOUR0ZPT3gD8EPhERDw21B4RWyKrB7iT7CmxSSHTlmZ3dx8bdnQfe2EzsxI17uCQ9HrgnWT/MgdIHWOdZcBiSYskVQJXA0tHbTQD3E42NLbltFcC3we+HhH3jFpn7lBBwJXAM+Pch6IbHinXD3Yys0lsvMHxQbKniL4fEaslnQY8dLQVIqIfuAF4AHgW+Hay7s2SLk8WuxWoA76TXFo7FCxvAy4Crhvjstu7JD0NPA00A58e5z4U3Rmn1FFXVe4OcjOb1JTv+fakk7wuIvYWpqSJ19HREcuXLy92GQC888uPsedAHz94/xuLXYqZ2VFJeiIiOka3j/eqqn+U1CCpluypoTWSPjLRRU4HmdYmnt3SxYHegWKXYmZ2XMZ7qmpJcoRxJdlLZheRvbLK8tTemmZgMHh6855il2JmdlzGGxwVyX0bVwJLI6IP8DWlx6G9LQ3gBzuZ2aQ13uC4HdgA1AI/S8aUmjR9HKWkua6Ktpk17iA3s0lrXMEREV+MiPkRcVlyD8WLwJsKXNuUlWlLOzjMbNIab+d4o6TPDQ3hIelvyB592HHItKbZuvcgW/YcKHYpZmZ5G++pqq8CXWTvr3gb2dNUdxaqqKmuffhGwN3FLcTM7DiMNzhOj4hPJSPdro+I/wGcVsjCprIlcxuoLC/zg53MbFIab3AckPTrQxOSLgR8nuU4VZaXce68Bg89YmaTUvk4l3sf8HVJjcn0LuDawpQ0PWTamvjGYy/SNzBIRcpP8DWzyWO8V1U9FRGvAc4DzouIDPDmglY2xWXa0vT0D/LLLV3FLsXMLC95/VM3IvbmjFF1YwHqmTaGR8r1jYBmNsmcyDkSTVgV09C8xmpa6qt8ZZWZTTonEhwecuQESCKTPBHQzGwyOWrnuKQuxg4IATMKUtE0kmlr4sE1r7Brfy9NtZXFLsfMbFyOesQREfUR0TDGqz4ixntFlh1BZnjAw91FrcPMLB++DrSIzlvQSJn8KFkzm1wcHEVUU1nO2XMaWOEjDjObRBwcRdbelmblS7sZHPS1BmY2OTg4iizTmqarp5/12/cVuxQzs3EpaHBIukTSWknrJN00xvwbJa2RtErST5IHRA3Nu1bS88nr2pz2/yDp6WSbX5Q0qe8nGboR8Enfz2Fmk0TBgkNSCrgNuBRYAlwjacmoxVYAHRFxHnAP8Nlk3ZnAp4DXAhcAn5LUlKzzJeCPgMXJ65JC7cPJcFpzLQ3V5b4R0MwmjUIecVwArEuGYe8F7gauyF0gIh6KiO5k8jFgQfL5t4EfR8TOiNgF/Bi4RNJcoCEiHouIAL5O9jnok1ZZmWhva/KVVWY2aRQyOOYDG3OmNyVtR/Je4P5jrDs/+XzMbUq6fuiJhZ2dnXmWfnK1t6Z57pUu9vX0F7sUM7NjKonOcUnvAjqAWydqmxFxR0R0RERHS0vLRG22IDJtaQYDVm3aXexSzMyOqZDBsRlozZlekLSNIOli4BPA5RHRc4x1N3PodNYRtznZtC9IA76D3Mwmh0IGxzJgsaRFkiqBq4GluQtIygC3kw2NbTmzHgB+S1JT0in+W8ADEbEF2CvpdcnVVO8B7i3gPpwUTbWVnNZc6w5yM5sUCjbeVET0S7qBbAikgK9GxGpJNwPLI2Ip2VNTdcB3kqtqX
4qIyyNip6T/STZ8AG6OiJ3J5z8FvkZ2kMX7OdQvMqm1t6X52XPbiQgm+RXGZjbFFXSgwoi4D7hvVNsncz5ffJR1vwp8dYz25cC5E1hmSci0NfG9JzezadcBWmfWFLscM7MjKonOccveQQ543CozK3kOjhJx1px6qivKWOl+DjMrcQ6OElGRKuO8+Wk/g9zMSp6Do4Rk2tKs3ryXnv6BYpdiZnZEDo4SkmlL0zswyJqX9xa7FDOzI3JwlJChkXJ9P4eZlTIHRwmZ3VDN3MZqX1llZiXNwVFiMm1pVrqD3MxKmIOjxGRam9i48wCdXT3HXtjMrAgcHCUm05YGPOChmZUuB0eJOXd+I+Vl8oOdzKxkOThKTHVFilfNbfCVVWZWshwcJSjTlmbVpt0MDEaxSzEzO4yDowRl2tLs7x3g+W1dxS7FzOwwDo4SlGn1jYBmVrocHCXo1Fk1NNVUuIPczEqSg6MESSLT1uQjDjMrSQ6OEtXemmZd5z72HuwrdilmZiM4OEpUpi1NBKzauKfYpZiZjVDQ4JB0iaS1ktZJummM+RdJelJSv6SrctrfJGllzuugpCuTeV+T9ELOvPZC7kOxvKY1jYT7Ocys5JQXasOSUsBtwFuBTcAySUsjYk3OYi8B1wEfzl03Ih4C2pPtzATWAQ/mLPKRiLinULWXgobqCs5oqfNIuWZWcgp5xHEBsC4i1kdEL3A3cEXuAhGxISJWAYNH2c5VwP0R0V24UktTpi3Nipd2EeEbAc2sdBQyOOYDG3OmNyVt+boa+OaotlskrZL0eUlVY60k6XpJyyUt7+zsPI6vLb5MWxO7uvt4cce0y0wzK2El3TkuaS7wauCBnOaPAWcD5wMzgY+OtW5E3BERHRHR0dLSUvBaC6G9NQ3ACj+fw8xKSCGDYzPQmjO9IGnLx9uA70fE8DWpEbElsnqAO8meEpuSzpxdT01lipW+n8PMSkghg2MZsFjSIkmVZE85Lc1zG9cw6jRVchSCJAFXAs+ceKmlKVUmXrMg7Q5yMyspBQuOiOgHbiB7mulZ4NsRsVrSzZIuB5B0vqRNwO8Dt0taPbS+pIVkj1geGbXpuyQ9DTwNNAOfLtQ+lIJMW5o1L+/lYN9AsUsxMwMKeDkuQETcB9w3qu2TOZ+XkT2FNda6GxijMz0i3jyxVZa2TFsT/YPBM5v30LFwZrHLMTMr7c5xO9RBvmyDO8jNrDQ4OEpcS30V58xr4K8fXMv/uv9ZDvT6lJWZFZeDYxK46w9fy1W/toDbH1nPWz//CA+v3VbsksxsGnNwTALpmkr+6qrz+Nb1r6OyvIzr7lzG+7+5gs6unmKXZmbTkINjEnntabO4/wNv5IMXL+aBZ7bylr95mG/+4iUG/WxyMzuJHByTTFV5ig9efCb3f/CNvGpuAx/73tO8/Y6f8/wrfj65mZ0cDo5J6vSWOu6+/nV89qrzeH7bPi774qP8zYNrfb+HmRWcg2MSk8TbOlr5yY2/we+eN4///dN1XPq3j/Lv67YXuzQzm8IcHFPArLoqPvf2dr7x3tcyGME7vvw4N357JTv39xa7NDObghwcU8ivL27mgQ9exJ+96XSWrnyZt/zNw9zzxCY/z8PMJpSDY4qprkjxkd8+mx/++Rs5raWOD3/nKd7x/x5nfee+YpdmZlOEg2OKOmtOPd/549dzy386l2de3sMlf/soX/zJ8/T2H+1hi2Zmx+bgmMLKysQ7X3sqP7nxN3jrktl87sfPcdkXH2XZhp3FLs3MJjEHxzRwSkM1t73j17jzuvM50DvA7//fn3PTd1exp7vv2CubmY3i4JhG3nT2Kfz4xou4/qLT+M4Tm3jL5x7m3pWb3XluZnlxcEwzNZXlfPyyV7H0hguZl57BB+5eybV3LmPjzu5il2Zmk4SDY5o6Z14j3//TC/nU7y7hiQ07eevnH+FLD/+KvgF3npvZ0Tk4prFUmfiDCxfxLx/6DS5a3MJf/eiX/O7//ldWvOSHRpnZkTk4jLmNM7jjPR3c/u7/wO7uPn7vS//OJ+99hr0H3XluZodzcNiw3z5nDj++8SKuff1C/uGxF3nr5x7h/qe3uPPczEYoaHBIukTSWknrJN00xvyLJD0pqV/SVaPmDUhambyW5rQvkvR4ss1vSaos5D5MN/XVFfzl5efwT396ITNrq/iTu57kj76+nM27DxS7NDMrEQULDkkp4DbgUmAJcI2kJaMWewm4DvjHMTZxICLak9flOe1/BXw+Is4AdgHvnfDijde0pvnnGy7k45edzb+t28FbP/cIX350PXsO+PSV2XRXXsBtXwCsi4j1AJLuBq4A1gwtEBEbknnjupRHkoA3A+9Imv4e+EvgSxNVtB1Snirj+otO59Jz5/Lf732GT//wWT79w2eZ21jNmbPrOWtOffZ9dj1nnFLHjMpUsUs2s5OgkMExH9iYM70JeG0e61dLWg70A5+JiH8CZgG7I6I/Z5vzx1pZ0vXA9QBtbW35VW4jtM6s4c7rzufn63ewatMe1m7tYu3WLn6+fsfw2FcSLJxVy5mz6zhrdj1nzskGysLmWipS7kozm0oKGRwn6tSI2CzpNOCnkp4G9ox35Yi4A7gDoKOjw727J0gSbzi9mTec3jzc1j8wyIs7u3luaxdrX+niuVeygfLjNa8w9Bj0ipQ4vaVu+AjlrOR9fnoGZWUq0t6Y2YkoZHBsBlpzphckbeMSEZuT9/WSHgYywHeBtKTy5Kgjr23axCpPlXF6Sx2nt9Rx6avnDrcf7BvgV537kiDJvj/x4i6WPvXy8DI1lSkWz67nrNkjQ6WlvorsGUkzK1WFDI5lwGJJi8j+5X41h/omjkpSE9AdET2SmoELgc9GREh6CLgKuBu4Fri3INXbcauuSHHOvEbOmdc4or3rYB/Pb9s3fISydmsXP/3lNr69fNPwMk01FSP7T+bUc+Yp9TTWVJzs3TCzI1Ahr9GXdBnwBSAFfDUibpF0M7A8IpZKOh/4PtAEHAS2RsQ5kt4A3A4Mkr3y6wsR8ZVkm6eRDY2ZwArgXRHRc7Q6Ojo6Yvny5QXZRztx2/f18NwrXUmg7Bv+3NXTP7zMnIbqpN8ke4RyWkstp86qZVZtpY9QzApE0hMR0XFY+3S4ucvBMflEBFv2HBw+Mhk6Snl+274RD6OqrypnYXNt9jWrhoWzDn2e6VAxOyFHCo5S7hy3aUwS89IzmJeewZvOOmW4fWAweGlnNxu27+eF7ft5ccd+XtjRzVMbd/PDVS8Pd8oD1FeXs6g5e2SyaFYNC4c+N9fSVFPhUDE7Tg4Om1RSZWJRc/Yv/zeNmtfbP8imXd1s2LGfF7Z3Z0Nl+35Wbtx1WKg0VCdHKskRyqLmmiRgammq9WAEZkfj4LApo7K8jNNa6jitpe6web39g2zclXukkg2YJ1/axQ9GhUrjjIrsaa/hYMmeAlvUXEu6xqFi5uCwaaGy/NClw6P19A+wcecBNmzfz4YdyWt79/AlxLndgOmaiuFTX22zapnXWM3sxmrmNlYzt2EGDTPKfQrMpjwHh017VeUpzjiljjNOOVKo
dI849bVhx36WbdjFvaNCBWBGRYo5jdXMaciGyZyhV0M1cxtnMKexmlm1lb750SY1B4fZUWRDpZ4zTqk/bF7fwCDbunrYuucAW/YcZGvy2rI3+/74Czt5Ze9B+gdHpktFSpxSnw2W2Y3VzG3Ihks2WKqY0ziDU+qrPFSLlSwHh9lxqkiVMT89g/npGUdcZnAw2L6/ZzhUtu49yJY9B3llT/Z9zct7+cmzr3Cwb+Q4nxK01FWNOnpJgqVhxvDRTHWFB5a0k8/BYVZAZWXZo4tT6qs5b8HYy0QEew/0s2XvyCOXoaOXDTv289j6Hew92H/YuumaCprrqqitTFFbVU5NZTm1Vanse9I2PF2VorayPFkuNfxel6xXWe4jHBsfB4dZkUmisaaCxpoKzp7TcMTl9vf0s3XvwVFHLwfYub+X/T0DdPf28/LuA+zv7R+e7u4dGHcdFSmNCJyaquznmspy6qpSI6Zrq5JQqjwUQo0zKjilvoqZtZWU+zTblObgMJskaqvKj3hl2JEMDgbdfQN09/Szv3eA/T3ZMNnf08/+3n66eway770D7OvpH16uOwmf/T397Oo+MDw9njCSYGZNJS31VbTUV9Fcl31vqauiub6SlrrqpL2SphpfKDAZOTjMprCyMlFXVU5d1cT9UR8YDA6MCqN9Pf3s7u6jc18P27t66NzXQ2dXD9v39fDC9v10dvXQ03/489pSZaK5rnJUuGTfRwdPQ7UvdS4VDg4zy0vqOMIoItjX009n11Cg9NLZdTAJmt7hoPnlli627+s57Eo0yN6LMzJYKocDpjnnvb66nBmVKarLUz6aKRAHh5kVnCTqqyuor64Y887+XIODwZ4Dhx+9DL939bB59wFWbtzNjv09h91Lk6uqvGw4RGZUpqiuSDGjIts2oyI7nW3LnX9omdHzD61TdmidaRhQDg4zKyllZaKptpKm2krOnH34/TO5BgaDnft7h4Nle1cP+3v7OdA7wIG+7Otg7wAH+wYPTfcNcKB3gN3dfRzsy5mXrHM8hgIqN4yqysuoLC+jKnlVlpdRmSqjqjyV/ZzTPrqtamjZijIqU6kRy45YL5VKlik7qeHl4DCzSStVpuG+kIkQEfT0D3Kgd4CD/YfCJBs2g4c+5wTQUCD19A2OnO4fpLc/2we0c/8gvf2DSdsgvQOD9PQN0DswSN/AxDzaorxMo8IlGzhffk8HC5trJ+Q7hr9rQrdmZjaJSRo+YjhZBgcjGyT9g/T0D2SD5bCQGaR3YCB5H1p2MGfZgZHrDLUNDDKjcuL3xcFhZlZEZWWiumworCbHI5J9l46ZmeXFwWFmZnkpaHBIukTSWknrJN00xvyLJD0pqV/SVTnt7ZJ+Lmm1pFWS3p4z72uSXpC0Mnm1F3IfzMxspIL1cUhKAbcBbwU2AcskLY2INTmLvQRcB3x41OrdwHsi4nlJ84AnJD0QEbuT+R+JiHsKVbuZmR1ZITvHLwDWRcR6AEl3A1cAw8ERERuSeSPGIoiI53I+vyxpG9AC7C5gvWZmNg6FPFU1H9iYM70pacuLpAuASuBXOc23JKewPi9pYi7gNjOzcSnpznFJc4F/AP4gIoaOSj4GnA2cD8wEPnqEda+XtFzS8s7OzpNSr5nZdFDI4NgMtOZML0jaxkVSA/BD4BMR8dhQe0Rsiawe4E6yp8QOExF3RERHRHS0tLQc1w6YmdnhCtnHsQxYLGkR2cC4GnjHeFaUVAl8H/j66E5wSXMjYouy4ytfCTxzrO098cQT2yW9mGf9Q5qB7ce57lTk3+MQ/xYj+fcYaSr8HqeO1ag42tCSJ0jSZcAXgBTw1Yi4RdLNwPKIWCrpfLIB0QQcBLZGxDmS3kX2aGJ1zuaui4iVkn5KtqNcwErgfRGxr4D7sDwiOgq1/cnGv8ch/i1G8u8x0lT+PQoaHFPBVP6Pfzz8exzi32Ik/x4jTeXfo6Q7x83MrPQ4OI7tjmIXUGL8exzi32Ik/x4jTdnfw6eqzMwsLz7iMDOzvDg4zMwsLw6OozjW6L7ThaRWSQ9JWpOMWPyBYtdUCiSlJK2Q9INi11JsktKS7pH0S0nPSnp9sWsqFkn/Nflz8oykb0qqLnZNE83BcQQ5o/teCiwBrpG0pLhVFU0/8KGIWAK8Dvizafxb5PoA8GyxiygRfwv8KCLOBl7DNP1dJM0H/hzoiIhzyd7DdnVxq5p4Do4jGx7dNyJ6gaHRfaedZJiXJ5PPXWT/Ush7wMqpRNIC4D8CXy52LcUmqRG4CPgKQET05jwCYToqB2ZIKgdqgJeLXM+Ec3Ac2YSM7jvVSFoIZIDHi1xKsX0B+Atg8BjLTQeLgE7gzuTU3Zcl1Ra7qGKIiM3AX5N91tAWYE9EPFjcqiaeg8PGTVId8F3ggxGxt9j1FIuk3wG2RcQTxa6lRJQDvwZ8KSIywH5gWvYJSmoie2ZiETAPqE2GUJpSHBxHdkKj+041kirIhsZdEfG9YtdTZBcCl0vaQPYU5pslfaO4JRXVJmBTRAwdhd5DNkimo4uBFyKiMyL6gO8BbyhyTRPOwXFkw6P7JqP1Xg0sLXJNRZGMRPwV4NmI+Fyx6ym2iPhYRCyIiIVk/7/4aURMuX9VjldEbAU2SjoraXoLOU/6nGZeAl4nqSb5c/MWpuCFAoUcVn1Si4h+STcAD3BodN/Vx1htqroQeDfwtKSVSdvHI+K+4pVkJeb9wF3JP7LWA39Q5HqKIiIel3QP8CTZqxFXMAWHHvGQI2ZmlhefqjIzs7w4OMzMLC8ODjMzy4uDw8zM8uLgMDOzvDg4zI5B0r7kfaGkd0zwtj8+avrfJ3L7ZoXg4DAbv4VAXsGRDHR3NCOCIyKm3F3GNvU4OMzG7zPAGyWtTJ65kJJ0q6RlklZJ+mMASb8p6VFJS0nuoJb0T5KeSJ7TcH3S9hmyo6iulHRX0jZ0dKNk289IelrS23O2/XDOsy/uSu5QRtJnkmemrJL01yf917Fpw3eOm43fTcCHI+J3AJIA2BMR50uqAv5N0tBIqL8GnBsRLyTT/yUidkqaASyT9N2IuEnSDRHRPsZ3/R7QTvbZFs3JOj9L5mWAc8gO1/1vwIWSngX+E3B2RISk9MTuutkhPuIwO36/BbwnGYblcWAWsDiZ94uc0AD4c0lPAY+RHTxzMUf368A3I2IgIl4BHgHOz9n2pogYBFaSPYW2BzgIfEXS7wHdJ7hvZkfk4DA7fgLeHxHtyWtRzrMX9g8vJP0m2VFTXx8RryE7ftGJPE60J+fzAFAeEf1kHz52D/A7wI9OYPtmR+XgMBu/LqA+Z/oB4E+SIeeRdOYRHmDUCOyKiG5JZ5N9/O6QvqH1R3kUeHvSj9JC9gl7vzhSYcmzUhqTgSf/K9lTXGYF4T4Os/FbBQwkp5y+RvY52wuBJ5MO6k7gyjHW+xHwvqQfYi3Z01VD7gBWSXoyIt6Z0/594PXAU0AAfxERW5PgGUs9cK+karJHQjce1x6ajYNHxzUzs7z4VJWZmeXFwWFmZnlxcJiZWV4cHGZmlhcHh5mZ5cXBYWZmeXFwmJlZXv4//lzH8IM
QHB8AAAAASUVORK5CYII=\n",
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEWCAYAAABxMXBSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAofElEQVR4nO3de3Rd5X3m8e+jo5slSzq2JRMsHWMbTIi5SLSGXEsTSlpIO8C0uUCbhLRpmXRKmpY2DWlmpR2mWSuFTpJ2SjowJSSZkFJCksaTQsiNQNKEBBOMb9yM8N1g+Spbsu6/+eNsiSMhyTq2js+R9HzW0tLe776c3z4herz3u/e7FRGYmZlNVVmxCzAzs5nFwWFmZnlxcJiZWV4cHGZmlhcHh5mZ5cXBYWZmeXFwmJ0ASQ9Ium66182zhjdL2jnd+zU7nvJiF2B2qkg6mjNbA/QCg8n8f4mIu6e6r4i4ohDrms0EDg6bMyJi/vC0pK3A70fEd8euJ6k8IgZOZW1mM4kvVdmcN3zJR9JHJL0I3CVpgaRvSuqQdDCZbsnZ5geSfj+Zfp+kH0n6u2TdFyRdcYLrLpf0iKQjkr4r6TZJX5ricbwm+axDkjZJujJn2dskbU72u0vSnyftjcmxHZJ0QNIPJfnvgk3K/4GYZb0KWAicAVxP9v8bdyXzS4FjwD9Osv1rgWeARuAW4E5JOoF1vwz8DFgE/DXwnqkUL6kC+H/At4HFwAeBuyW9OlnlTrKX4+qA84DvJ+1/BuwEmoDTgL8EPA6RTcrBYZY1BPxVRPRGxLGI2B8RX42I7og4AnwC+OVJtt8WEf8nIgaBLwCnk/1DPOV1JS0FLgI+HhF9EfEjYM0U638dMB/4ZLLt94FvAtcmy/uBVZLqI+JgRPw8p/104IyI6I+IH4YHsLPjcHCYZXVERM/wjKQaSbdL2iapE3gESEtKTbD9i8MTEdGdTM7Pc90lwIGcNoAdU6x/CbAjIoZy2rYBzcn0bwFvA7ZJeljS65P2W4EtwLcltUu6aYqfZ3OYg8Msa+y/sv8MeDXw2oioBy5J2ie6/DQd9gALJdXktGWmuO1uIDOmf2IpsAsgIh6LiKvIXsb6N+DepP1IRPxZRKwArgRulPQrJ3cYNts5OMzGV0e2X+OQpIXAXxX6AyNiG7AW+GtJlclZwX+a4uY/BbqBv5BUIenNybb3JPv6HUkNEdEPdJK9NIek35B0VtLHcpjs7clD436CWcLBYTa+zwDzgH3Ao8C3TtHn/g7wemA/8DfAv5J93mRSEdFHNiiuIFvzZ4H3RsTTySrvAbYml90+kHwOwErgu8BR4CfAZyPioWk7GpuV5H4ws9Il6V+BpyOi4Gc8ZlPlMw6zEiLpIklnSiqTdDlwFdk+CbOS4SfHzUrLq4CvkX2OYyfwhxHxRHFLMhvNl6rMzCwvvlRlZmZ5mROXqhobG2PZsmXFLsPMbEZ5/PHH90VE09j2OREcy5YtY+3atcUuw8xsRpG0bbx2X6oyM7O8ODjMzCwvDg4zM8uLg8PMzPLi4DAzs7w4OMzMLC8ODjMzy4uDYxLfWLeLLz067m3MZmZzloNjEt/a+CJ3PNJe7DLMzEqKg2MSrZk02w90c6Crr9ilmJmVDAfHJFpb0gA8ueNQUeswMyslDo5JXNDSQJlgnYPDzGyEg2MStVXlrFxcx5M7DxW7FDOzkuHgOI62TJondxzCL7wyM8tycBxHaybNwe5+th/oLnYpZmYloaDBIelySc9I2iLppnGWf0DSBknrJP1I0qqcZR9NtntG0q9NdZ/TrTXTALifw8xsWMGCQ1IKuA24AlgFXJsbDIkvR8T5EdEG3AJ8Ktl2FXANcC5wOfBZSakp7nNavfq0OqoryhwcZmaJQp5xXAxsiYj2iOgD7gGuyl0hIjpzZmuB4Y6Eq4B7IqI3Il4AtiT7O+4+p1t5qozzmxt8S66ZWaKQwdEM7MiZ35m0jSLpjyQ9T/aM44+Ps+2U9pns93pJayWt7ejoOOGDgOzzHBt3d9I/OHRS+zEzmw2K3jkeEbdFxJnAR4D/No37vSMiVkfE6qamV7xrPS9tS9P0DQzx9J4j01SdmdnMVcjg2AVkcuZbkraJ3ANcfZxt893ntBh+gnydn+cwMytocDwGrJS0XFIl2c7uNbkrSFqZM/vrwHPJ9BrgGklVkpYDK4GfTWWfhdCyYB6LaitZt/1QoT/KzKzklRdqxxExIOkG4EEgBXwuIjZJuhlYGxFrgBskXQb0AweB65JtN0m6F9gMDAB/FBGDAOPts1DHMExS9kFAn3GYmRUuOAAi4n7g/jFtH8+Z/tAk234C+MRU9nkqtGbSfP+ZvXT29FNfXXGqP97MrGQUvXN8pmjLpImADTsPF7sUM7OicnBM0QUtfoLczAwcHFOWrqlkeWOtHwQ0sznPwZGHtkyadR4p18zmOAdHHlpbGth7pJcXO3uKXYqZWdE4OPLQmkkDfpWsmc1tDo48rFpST0VKPOHgMLM5zMGRh6ryFKtOr/cZh5nNaQ6OPLVm0mzYeZjBIXeQm9nc5ODIU1smTVffIFv2Hi12KWZmReHgyJM7yM1srnNw5Gn5olrqq8s9xLqZzVkOjjyVlYnWTNpDrJvZnOXgOAGtLWmeeekIx/oGi12Kmdkp5+A4AW2ZNINDwcbdHinXzOYeB8cJuCCTHSnXHeRmNhc5OE7A4rpqmtPzPMS6mc1JDo4TNDxSrpnZXOPgOEGtmQZ2HjzGvqO9xS7FzOyUKmhwSLpc0jOStki6aZzlN0raLGm9pO9JOiNpf4ukdTk/PZKuTpZ9XtILOcvaCnkME2ltSQOw3s9zmNkcU7DgkJQCbgOuAFYB10paNWa1J4DVEXEBcB9wC0BEPBQRbRHRBlwKdAPfztnuw8PLI2JdoY5hMue3NFAm/DyHmc05hTzjuBjYEhHtEdEH3ANclbtCEhDdyeyjQMs4+3k78EDOeiWhprKcs0+rY91O35JrZnNLIYOjGdiRM78zaZvI+4EHxmm/BviXMW2fSC5vfVpS1Xg7k3S9pLWS1nZ0dORT95S1ZdI86VfJmtkcUxKd45LeDawGbh3TfjpwPvBgTvNHgXOAi4CFwEfG22dE3BERqyNidVNTU0HqbsukOXysn637S+pkyMysoAoZHLuATM58S9I2iqTLgI8BV0bE2FuU3gl8PSL6hxsiYk9k9QJ3kb0kVhQeKdfM5qJCBsdjwEpJyyVVkr3ktCZ3BUkXAreTDY294+zjWsZcpkrOQpAk4Gpg4/SXPjVnn1ZHTWXKz3OY2ZxSXqgdR8SApBvIXmZKAZ+LiE2SbgbWRsQaspem5gNfyeYA2yPiSgBJy8iesTw8Ztd3S2oCBKwDPlCoYzieVJk4r7nBwWFmc0rBggMgIu4H7h/T9vGc6csm2XYr43SmR8Sl01jiSWvLpPn8f2ylb2CIyvKS6DIyMyso/6U7SW2ZNH2DQzy1p7PYpZiZnRIOjpM00kHuJ8jNbI5wcJykJQ3VNM6vcj+Hmc0ZDo6TJMkj5ZrZnOLgmA
ZtmQbaO7o4fKz/+Cubmc1wDo5pMNzPscHjVpnZHODgmAYXJEOsr9txsLiFmJmdAg6OadAwr4IVTbWs2+EzDjOb/Rwc02S4g9wj5ZrZbOfgmCZtmTT7jvay+3BPsUsxMysoB8c0GX6VrEfKNbPZzsExTV5zej2VqTIHh5nNeg6OaVJZXsaqJfU84eAws1nOwTGN2jJpNuw8zMDgULFLMTMrGAfHNGrLpDnWP8hze48WuxQzs4JxcEwjv0rWzOYCB8c0WraohoZ5FR5i3cxmNQfHNJJEaybNE9sPFbsUM7OCcXBMs7aWBp596QjdfQPFLsXMrCAKGhySLpf0jKQtkm4aZ/mNkjZLWi/pe5LOyFk2KGld8rMmp325pJ8m+/xXSZWFPIZ8tS1NMxSwcZdfJWtms1PBgkNSCrgNuAJYBVwradWY1Z4AVkfEBcB9wC05y45FRFvyc2VO+98Cn46Is4CDwPsLdQwnwiPlmtlsV8gzjouBLRHRHhF9wD3AVbkrRMRDEdGdzD4KtEy2Q0kCLiUbMgBfAK6ezqJPVuP8KloWzONJj5RrZrNUIYOjGdiRM78zaZvI+4EHcuarJa2V9Kikq5O2RcChiBjuQJhwn5KuT7Zf29HRcUIHcKL8Klkzm81KonNc0ruB1cCtOc1nRMRq4LeBz0g6M599RsQdEbE6IlY3NTVNY7XH15ZJs+vQMTqO9J7SzzUzOxUKGRy7gEzOfEvSNoqky4CPAVdGxMhf2ojYlfxuB34AXAjsB9KSyifbZ7H5QUAzm80KGRyPASuTu6AqgWuANbkrSLoQuJ1saOzNaV8gqSqZbgTeCGyO7FuSHgLenqx6HfCNAh7DCTlvSQOpMvlBQDOblQoWHEk/xA3Ag8BTwL0RsUnSzZKG75K6FZgPfGXMbbevAdZKepJsUHwyIjYnyz4C3ChpC9k+jzsLdQwnal5lilefVud+DjOblcqPv8qJi4j7gfvHtH08Z/qyCbb7MXD+BMvayd6xVdJaM2n+ff1uhoaCsjIVuxwzs2lTEp3js9GFmTSdPQNs3d9V7FLMzKaVg6NAhjvIfbnKzGYbB0eBnLV4PrWVKd9ZZWazjoOjQFJl4vyWBp9xmNms4+AooNZMms17OukdGCx2KWZm08bBUUAXZtL0DwZP7TlS7FLMzKaNg6OARjrIt3ukXDObPRwcBfSq+moW11Xx5E6PlGtms4eDo4Ak0ZZJ+84qM5tVHBwF1ppJ076vi8Pd/cUuxcxsWjg4CqxteKRcD3hoZrOEg6PAzm9pQPIQ62Y2ezg4Cqy+uoIzm+b7QUAzmzUcHKdAa0uaJ3ceIvs6ETOzmc3BcQq0LU2z72gfuw4dK3YpZmYnzcFxCrS1pAGPlGtms4OD4xQ45/Q6KsvL3EFuZrOCg+MUqEiVcd6Sep9xmNms4OA4RVozaTbsOszA4FCxSzEzOylTCg5JtZLKkumzJV0pqWIK210u6RlJWyTdNM7yGyVtlrRe0vcknZG0t0n6iaRNybJ35WzzeUkvSFqX/LRN+WiLqC2Tpqd/iGdfOlrsUszMTspUzzgeAaolNQPfBt4DfH6yDSSlgNuAK4BVwLWSVo1Z7QlgdURcANwH3JK0dwPvjYhzgcuBz0hK52z34YhoS37WTfEYiqrNr5I1s1liqsGhiOgGfhP4bES8Azj3ONtcDGyJiPaI6APuAa7KXSEiHkr2C/Ao0JK0PxsRzyXTu4G9QNMUay1JSxfWsKCmwh3kZjbjTTk4JL0e+B3g35O21HG2aQZ25MzvTNom8n7ggXE++GKgEng+p/kTySWsT0uqmqDg6yWtlbS2o6PjOKUWniRaM2mPWWVmM95Ug+NPgI8CX4+ITZJWAA9NVxGS3g2sBm4d03468H+B342I4V7ljwLnABcBC4GPjLfPiLgjIlZHxOqmptI4WWltSfPsS0fo6h0odilmZidsSsEREQ9HxJUR8bdJJ/m+iPjj42y2C8jkzLckbaNIugz4GHBlRPTmtNeTPbv5WEQ8mlPLnsjqBe4ie0lsRmjLpBkK2LDLL3Yys5lrqndVfVlSvaRaYCOwWdKHj7PZY8BKScslVQLXAGvG7PdC4HayobE3p70S+DrwxYi4b8w2pye/BVyd1DMjDL9K1v0cZjaTTfVS1aqI6CT7h/oBYDnZO6smFBEDwA3Ag8BTwL3JZa6bJV2ZrHYrMB/4SnJr7XCwvBO4BHjfOLfd3i1pA7ABaAT+ZorHUHQLaytZurDGd1aZ2YxWPsX1KpLnNq4G/jEi+iUdd6jXiLgfuH9M28dzpi+bYLsvAV+aYNmlU6y5JLVl0qzdeqDYZZiZnbCpnnHcDmwFaoFHkgf1OgtV1GzWmkmz+3APezt7il2KmdkJmWrn+D9ERHNEvC3pmN4GvKXAtc1KbZkGwA8CmtnMNdXO8QZJnxp+LkLS/yR79mF5OndJA+Vl8vMcZjZjTfVS1eeAI2Q7rd9J9jLVXYUqajarrkhxzul1PuMwsxlrqp3jZ0bEb+XM/3dJ6wpQz5zQ2pJmzbrdDA0FZWUqdjlmZnmZ6hnHMUlvGp6R9EbA70E9QW2ZNEd6B2jf11XsUszM8jbVM44PAF+U1JDMHwSuK0xJs1/uSLlnLZ5f3GLMzPI01buqnoyIVuAC4IKIuBCY0c9TFNOKpvnMryr3E+RmNiPl9QbAiOhMniAHuLEA9cwJqTJxQUuD76wysxnpZF4d617dk9CaSfPUnk56+geLXYqZWV5OJjiOO+SITay1JU3/YLB5jx/AN7OZZdLOcUlHGD8gBMwrSEVzxIVL00B2pNxfWLqguMWYmeVh0uCIiLpTVchcc1p9Na+qr/aDgGY245zMpSo7SW2ZtO+sMrMZx8FRRK2ZNFv3d3Oou6/YpZiZTZmDo4haPVKumc1ADo4iuqAljQRP7vA7yM1s5nBwFNH8qnJWLp7vBwHNbEZxcBRZa0uadTsOEeHHYsxsZihocEi6XNIzkrZIummc5TdK2ixpvaTvJa+kHV52naTnkp/rctp/UdKGZJ//IGlGP8HetjTNga4+dh70YMNmNjMULDgkpYDbgCuAVcC1klaNWe0JYHVEXADcB9ySbLsQ+CvgtcDFwF9JGn5K7p+APwBWJj+XF+oYToXWljQAT7iD3MxmiEKecVwMbImI9ojoA+4BrspdISIeiojuZPZRoCWZ/jXgOxFxICIOAt8BLpd0OlAfEY9G9trOF4GrC3gMBffqV9VRVV7m5znMbMYoZHA0Azty5ncmbRN5P/DAcbZtTqaPu09J1w+/I72joyPP0k+dilQZ5zc3ODjMbMYoic5xSe8GVgO3Ttc+I+KOiFgdEaubmpqma7cF0ZpJs2HXYfoHh4pdipnZcRUyOHYBmZz5lqRtFEmXAR8DroyI3uNsu4uXL2dNuM+Zpi2TpndgiGdePFLsUszMjquQwfEYsFLSckmVwDXAmtwVJF0I3E42NPbmLHoQ+FVJC5JO8V8FHoyIPUCnpNcld1O9F/hGAY/hlBh+layf5zCzmaBgwRERA8ANZEPgKeDeiNgk6WZJVyar3QrMB74iaZ2kN
cm2B4D/QTZ8HgNuTtoA/ivwz8AW4Hle7heZsVoWzGNhbSXrth8qdilmZsc16bDqJysi7gfuH9P28ZzpyybZ9nPA58ZpXwucN41lFp2k7Ei5PuMwsxmgJDrHLfs8x3N7j3K0d6DYpZiZTcrBUSJaMw1EwHqfdZhZiXNwlIiRDnKPlGtmJc7BUSLSNZUsW1TjBwHNrOQ5OEpIaybtlzqZWclzcJSQtkyaFzt7ePFwT7FLMTObkIOjhLQm/Rw+6zCzUubgKCGrTq+nIiU/z2FmJc3BUUKqK1K85vR6d5CbWUlzcJSY1pY063ceZnDIr5I1s9Lk4CgxbZk0R3sHaO84WuxSzMzG5eAoMe4gN7NS5+AoMSsaa6mrLndwmFnJcnCUmLIy0drikXLNrHQ5OEpQa6aBp/ccoad/sNilmJm9goOjBLW2pBkYCjbt9oCHZlZ6HBwlqG2kg9zBYWalx8FRghbXV7OkodoPAppZSXJwlCiPlGtmpaqgwSHpcknPSNoi6aZxll8i6eeSBiS9Paf9LZLW5fz0SLo6WfZ5SS/kLGsr5DEUS1smzfYD3Rzo6it2KWZmoxQsOCSlgNuAK4BVwLWSVo1ZbTvwPuDLuY0R8VBEtEVEG3Ap0A18O2eVDw8vj4h1hTmC4modeSPgoaLWYWY2ViHPOC4GtkREe0T0AfcAV+WuEBFbI2I9MDTJft4OPBAR3YUrtfSc39xAmfwEuZmVnkIGRzOwI2d+Z9KWr2uAfxnT9glJ6yV9WlLViRZYymqryjn7tDo/CGhmJaekO8clnQ6cDzyY0/xR4BzgImAh8JEJtr1e0lpJazs6OgpeayG0tqR5cschIjxSrpmVjkIGxy4gkzPfkrTl453A1yOif7ghIvZEVi9wF9lLYq8QEXdExOqIWN3U1JTnx5aGtqVpDnb3s/3AnLpKZ2YlrpDB8RiwUtJySZVkLzmtyXMf1zLmMlVyFoIkAVcDG0++1NLU2pIG3M9hZqWlYMEREQPADWQvMz0F3BsRmyTdLOlKAEkXSdoJvAO4XdKm4e0lLSN7xvLwmF3fLWkDsAFoBP6mUMdQbGefNp95FSkHh5mVlPJC7jwi7gfuH9P28Zzpx8hewhpv262M05keEZdOb5WlqzxVxvnNDb4l18xKSkl3jlt2pNyNuzvpH5zsjmUzs1PHwVHiWjNp+gaGeHrPkWKXYmYGODhK3shIuX6ew8xKhIOjxDWn59Gcnsct33qaf/jecxzp6T/+RmZmBeTgKHGS+MLvXcwbzlzEp77zLJfc8hC3P/w8x/r8dkAzKw7NhaeSV69eHWvXri12GSdt/c5DfOo7z/KDZzponF/FDW85k2tfu5Sq8lSxSzOzWUjS4xGx+hXtDo6ZZ+3WA/zdt5/h0fYDLGmo5oZLV/KO1S1UpHwCaWbTx8Exi4Jj2I+37OPWbz/DE9sPsXRhDX9y2UquamsmVaZil2Zms8BEweF/os5gbzirka/94Ru4630XUVddzo33Psmvfvphvrl+N0NDs/8fBGZWHA6OGU4SbzlnMd/84Jv43+/+BVJl4oYvP8Hb/uGHfGfzSx5Z18ymnYNjlpDE5eedzgMfuoS/v6aNnv5B/uCLa7n6tv/gkWc7HCBmNm0cHLNMqkxc1dbMd2/8ZW75rQvYd7SP937uZ7zr9kf5afv+YpdnZrOAO8dnud6BQe59bAf/6/tb2Hukl19a2ciNbz2bC5cuKHZpZlbifFfVHA2OYT39g3zp0W189gfPc6Crj8tes5g/fevZnLukodilmVmJcnDM8eAY1tU7wOd/vJXbH36ezp4Bfv380/nTt67krMV1xS7NzEqMg8PBMcrhY/3c+cN27vzRCxzrH+TqtmY+dNlKzlhUW+zSzKxEODgcHOM60NXH7Q8/zxd+spX+weAdv9jCB39lJc3pecUuzcyKzMHh4JjU3s4ePvuD5/nyT7cDcO3FGf7oLWexuL66yJWZWbE4OBwcU7Lr0DH+8fvP8ZW1O0mVievesIwP/PKZLKytLHZpZnaKOTgcHHnZtr+Lv//uc3x93S5qKlL83puW8/u/tIKGeRXFLs3MTpGiBIeky4G/B1LAP0fEJ8csvwT4DHABcE1E3JezbBDYkMxuj4grk/blwD3AIuBx4D0R0TdZHQ6OE/fcS0f4zHef49837KGuupxLVjZxbnM95zc3cN6SBhb4TMRs1jrlwSEpBTwLvBXYCTwGXBsRm3PWWQbUA38OrBkTHEcjYv44+70X+FpE3CPpfwNPRsQ/TVaLg+Pkbdp9mP/zSDuPbz/IjgPHRtqb0/M4LwmSc5MwaaqrKmKlZjZdJgqO8gJ+5sXAlohoTwq4B7gKGAmOiNiaLBuayg4lCbgU+O2k6QvAXwOTBoedvHOXNPCZay4E4FB3H5t2d7Jh12E27jrMpt2dPLjppZF1X1VfzXnN9Zy7pCF7ZtLcwGn1VWT/5zOzma6QwdEM7MiZ3wm8No/tqyWtBQaAT0bEv5G9PHUoIgZy9tk83saSrgeuB1i6dGl+lduk0jWVvPGsRt54VuNIW2dPP5t3d7IxCZONuzv53tN7GT6hbZxfxXnN9Zy3JBsk5zXX05ye5zAxm4EKGRwn64yI2CVpBfB9SRuAw1PdOCLuAO6A7KWqAtVoifrqCl63YhGvW7FopK2rd4Cn9mTDZMOuTjbtPswPn9vHYPKukAU1FZzX3JBzZlLP0oU1DhOzElfI4NgFZHLmW5K2KYmIXcnvdkk/AC4EvgqkJZUnZx157dNOrdqqclYvW8jqZQtH2nr6B7NhsruTjTsPs3H3Ye78UTv9g9kwqasuT85K6pMzkwaWL6qlzG81NCsZhQyOx4CVyV1Qu4BreLlvYlKSFgDdEdErqRF4I3BLRISkh4C3k72z6jrgGwWp3gqiuiLFhUsXjBqdt3dgkGdfPMrG3YfZsOswm3Yd5gs/2UbfQLbrq7YyxblLGkbu5jp3SQPLGmuoKk8V6zDM5rRC3477NrK326aAz0XEJyTdDKyNiDWSLgK+DiwAeoAXI+JcSW8AbgeGyL4z5DMRcWeyzxVkQ2Mh8ATw7ojonawO31U18/QPDvHcS9kwGe432bynk57+bJiUCTILa1jeWMuKxvmsaKplRWMtK5rmuyPebJr4AUAHx4w3MDhE+74uNu0+zAsdXTy/r4v2ji5e2Hd0JFAAaipT2UBpms/yxlrObMqGy/KmWuZXlXK3nllpKcbtuGbTqjxVxtmn1XH2aaOHgB8aCl7s7BkJkec7umjf18W6HQf55vrd5P7baHFdFSuaalneOD8bKMl0ZsE8ylN+IabZVDg4bMYrKxNL0vNYkp7Hm1Y2jlrW0z/I9gPdtHdkA+WFfV20dxzlgY17ONTdP7JeRUosXVgzEijDZywrmmpZVFvpS19mORwcNqtVV6TGPUsBONjVR/u+o7QnZyjtHUd5YV8XjzzbQd/gy5e+6qrLWdE0nzMba0ddAlveWMu8SnfQ29zj4LA5a0FtJb9Yu5BfPGPhqPbBoWDXwWMjofLCvi7a9x3lJ+37+doTo+/+flV9NWcsqmHZolrO
aEx+L6rhjEXuT7HZy/9lm42RKhNLF9WwdFENb3716GXdfQPJ5a5soGzb3822/V187+m97Ds6+ua+xvlVLEtCZFmyv2WLalm2qJaGGo8ybDOXg8MsDzWV5dlnSpY0vGLZ0d4Btu3PhsnW/V1s29fNtgNd/Pj5fXz15z2j1k3XVIwEytjfC92nYiXOwWE2TeZXTRwqw530W/flBMv+bh7fdpD/9+RuhnLu/KqrKueMxrGBkr0EtrjOz6hY8Tk4zE6ByTrpewcG2XnwGNv2d7F1X3c2YPZ3sXl3Jw9ufJGBnFSZV5FK+lCG+1NqWTS/kqGhYGAoGBz5PUT/4Oj5gaFgcPDl9fqHhkbNj1ovmR8YHMpZFgyM7DPb3p8zX1NZzuK6KhbXV3NafRWnJb8X11WzuL6KRbVVpDx0zKzg4DArsqryFGc2zefMple8foaBwSF2H+pJzlC62Jr0qTzf0cVDT4+++ysfqTJRnvykykR5qmykLfd3xSvas/NVFeU5+8i2He0dYPfhHtbtOMT+rle+Wy1VJprmV2XDpL6axXU54VJfzWlJwCysqfTYZCXOwWFWwspTZSMd9dA0atlg8uDjwa4+ylPDf9zLcsLg5T/qYwOh0Je7+gaG6Djay0udPezt7GXvkR5e6uzhpc5s244D3azdeoCDOc/SjBxzmUaduSyuywmXnLYFNRW+bFckDg6zGSpVJprT82hOzyt2Ka9QWV42pdp6+gfpOJINlr1JqLx05OXAeWFfF4+2H+DwsVcGTGWqjKa6qpHLYsNhU1ddTlV5GVXlqezvipenqytSLy+rKBuZriwv82W0PDg4zKxoqitSZBbWkFlYM+l6Pf2D2WAZFTAvTz+39yg/2rKPIz0Dk+5nMhUpvRw25WVUDYdMRU7bmMAZG0zD4VRZXkZFaswZYHJJcOzZ38hZYWr89orUmPXKVPRLeQ4OMyt51RWpnEt2EzvWN0hX3wC9A0P09g9mf4+dHhiktz873TPSnvzuz5ketd0gR3sH2H+0b2R5T866w68AOFUkXhk8EwTUnddddNzvLV8ODjObNeZVpooyDMzQUNA3ODqYcu9G6x97d9pQzvzgOO2D49/l9vKdba9sf+VnZNerqpj+wTsdHGZmJ6msTFSXpaiuSAGzf1QAjyNtZmZ5cXCYmVleHBxmZpYXB4eZmeWloMEh6XJJz0jaIummcZZfIunnkgYkvT2nvU3STyRtkrRe0rtyln1e0guS1iU/bYU8BjMzG61gd1VJSgG3AW8FdgKPSVoTEZtzVtsOvA/48zGbdwPvjYjnJC0BHpf0YEQcSpZ/OCLuK1TtZmY2sULejnsxsCUi2gEk3QNcBYwER0RsTZaNenomIp7Nmd4taS/ZgXoOFbBeMzObgkJeqmoGduTM70za8iLpYqASeD6n+RPJJaxPS6qaYLvrJa2VtLajoyPfjzUzswmU9AOAkk4H/i9wXUQMn5V8FHiRbJjcAXwEuHnsthFxR7IcSR2Stp1gGY3AvhPcdjby9/Eyfxej+fsYbTZ8H2eM11jI4NgFZHLmW5K2KZFUD/w78LGIeHS4PSL2JJO9ku7ilf0jrxARTcdbZ5I61kbE6hPdfrbx9/Eyfxej+fsYbTZ/H4W8VPUYsFLSckmVwDXAmqlsmKz/deCLYzvBk7MQlB2I/2pg43QWbWZmkytYcETEAHAD8CDwFHBvRGySdLOkKwEkXSRpJ/AO4HZJm5LN3wlcArxvnNtu75a0AdhA9lTwbwp1DGZm9kqKiOOvNYdJuj7pLzH8feTydzGav4/RZvP34eAwM7O8eMgRMzPLi4PDzMzy4uCYxPHG2porJGUkPSRpczJ+2IeKXVMpkJSS9ISkbxa7lmKTlJZ0n6SnJT0l6fXFrqlYJP1p8v+TjZL+RVJ1sWuabg6OCeSMtXUFsAq4VtKq4lZVNAPAn0XEKuB1wB/N4e8i14fI3jFo8PfAtyLiHKCVOfq9SGoG/hhYHRHnASmyjyLMKg6OiY2MtRURfcDwWFtzTkTsiYifJ9NHyP5RyHv4mNlEUgvw68A/F7uWYpPUQPb2+TsBIqIvZ0DSuagcmCepHKgBdhe5nmnn4JjYtIy1NdtIWgZcCPy0yKUU22eAvwCGjrPeXLAc6ADuSi7d/bOk2mIXVQwRsQv4O7Ijf+8BDkfEt4tb1fRzcNiUSZoPfBX4k4joLHY9xSLpN4C9EfF4sWspEeXALwD/FBEXAl3AnOwTlLSA7JWJ5cASoFbSu4tb1fRzcEzspMbamm0kVZANjbsj4mvFrqfI3ghcKWkr2UuYl0r6UnFLKqqdwM6IGD4LvY9skMxFlwEvRERHRPQDXwPeUOSapp2DY2InPNbWbJOMC3Yn8FREfKrY9RRbRHw0IloiYhnZ/y6+HxGz7l+VUxURLwI7JL06afoVct67M8dsB14nqSb5/82vMAtvFCjpYdWLKSIGJA2PtZUCPhcRm46z2Wz1RuA9wAZJ65K2v4yI+4tXkpWYD5IdR64SaAd+t8j1FEVE/FTSfcDPyd6N+ATJ6x1mEw85YmZmefGlKjMzy4uDw8zM8uLgMDOzvDg4zMwsLw4OMzPLi4PD7DgkHU1+L5P029O8778cM//j6dy/WSE4OMymbhmQV3AkA91NZlRwRMSse8rYZh8Hh9nUfRL4JUnrkncupCTdKukxSesl/RcASW+W9ENJa0ieoJb0b5IeT97TcH3S9kmyo6iuk3R30jZ8dqNk3xslbZD0rpx9/yDn3Rd3J08oI+mTyTtT1kv6u1P+7dic4SfHzabuJuDPI+I3AJIAOBwRF0mqAv5D0vBIqL8AnBcRLyTzvxcRByTNAx6T9NWIuEnSDRHRNs5n/SbQRvbdFo3JNo8kyy4EziU7XPd/AG+U9BTwn4FzIiIkpaf30M1e5jMOsxP3q8B7k2FYfgosAlYmy36WExoAfyzpSeBRsoNnrmRybwL+JSIGI+Il4GHgopx974yIIWAd2Utoh4Ee4E5Jvwl0n+SxmU3IwWF24gR8MCLakp/lOe9e6BpZSXoz2VFTXx8RrWTHLzqZ14n25kwPAuURMUD25WP3Ab8BfOsk9m82KQeH2dQdAepy5h8E/jAZch5JZ0/wAqMG4GBEdEs6h+zrd4f1D28/xg+BdyX9KE1k37D3s4kKS96V0pAMPPmnZC9xmRWE+zjMpm49MJhccvo82fdsLwN+nnRQdwBXj7Pdt4APJP0Qz5C9XDXsDmC9pJ9HxO/ktH8deD3wJBDAX0TEi0nwjKcO+IakarJnQjee0BGaTYFHxzUzs7z4UpWZmeXFwWFmZnlxcJiZWV4cHGZmlhcHh5mZ5cXBYWZmeXFwmJlZXv4/QAgzW/yBXxUAAAAASUVORK5CYII=\n",
       "text/plain": [
        "<Figure size 432x288 with 1 Axes>"
       ]
@@ -485,7 +483,7 @@
    "outputs": [
     {
      "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAn/klEQVR4nO3de5xdZ13v8c937pOZyaXJ5J4maZs2DRRSGCtYxQtQq2KL4MEUEVCx4KGAqHCKx4NYjp56AeFgX0qBKsqlQFEMnkoFBURuJoVy6eyWpmmhSXaSyXXPJHOf3/ljrZnsTPbM7DTZWXvv+b5fr51Z61nrWfs3+5VZv72eZ63nUURgZmY2XUPWAZiZWXVygjAzs5KcIMzMrCQnCDMzK8kJwszMSnKCMDOzkpwgzMysJCcIq3mSBopeE5IGi9Z/+Ukc7wuSXlWJWM1qSVPWAZidq4jonFyW9Djwqoj4XHYRVZakpogYyzoOq3++grC6JalB0q2SHpV0WNLHJV2UbmuT9KG0/JikHZJWSPoj4MeAv0yvQP5yhmN/QtJ+Sccl/YekpxRta5f0DknfT7f/p6T2dNuPSvpK+p5PSHplWn7aVYukV0r6z6L1kPRaSY8Aj6Rl706PUZB0v6QfK9q/UdLvpb97f7p9naQ7JL1j2u+yXdIbz/0Tt3rjBGH17HXAC4EfB1YDR4E70m2vABYB64ClwGuAwYj4n8CXgFsiojMibpnh2P8CbAKWA98APly07c+BZwI/AlwEvBmYkLQ+rfceoBvYCjxwFr/PC4EfBrak6zvSY1wEfAT4hKS2dNtvAzcBPwssBH4NOAl8ELhJUgOApGXA89L6ZqdxE5PVs9eQnOj3AEh6G/ADSb8CjJIkhssi4tvA/Wdz4Ii4a3I5Pe5RSYuAfpKT8bMiYm+6y1fS/V4KfC4iPpqWH05f5fo/EXGkKIYPFW17h6TfB64AvgW8CnhzRDycbv/W5HtKOg48F/gssA34QkQcOIs4bJ7wFYTVs/XAP6bNOceAHDAOrAD+HrgPuFvSPkl/Kqm5nIOmzTe3p803BeDxdNOy9NUGPFqi6roZysv1xLQ4fldSLm3GOkZyRbSsjPf6IPCydPllJJ+F2RmcIKyePQH8TEQsLnq1RcTeiBiNiD+MiC0kTUEvAF6e1ptriOOXAjeSNM0sAjak5QIOAUPApTPEU6oc4ASwoGh9ZYl9puJK+xveDLwEWBIRi4HjaQxzvdeHgBslPR24EvjUDPvZPOcEYfXsr4E/Stv+kdQt6cZ0+SclXSWpESiQNDlNpPUOAJfMctwuYJikeWgB8MeTGyJiArgLeKek1enVxrMltZL0UzxP0kskNUlaKmlrWvUB4EWSFki6DPj1OX63LmAM6AOaJL2VpK9h0vuBt0vapMTTJC1NY9xD0n/x98AnI2JwjveyecoJwurZu4HtwL9K6ge+RtLJC8k39HtIkkMO+CKnmlreDfyipKOS/m+J4/4d8H1gL9CbHrfY7wLfITkJHwH+BGiIiB+QdBr/Tlr+APD0tM5fACMkyemDnN7pXcp9wGeA76WxDHF6E9Q7gY8D/5r+jh8A2ou2fxC4Cjcv2SzkCYPM5h9JzyFpalofPgnYDHwFYTbPpJ3xbwDe7+Rgs3GCMJtHJF0JHANWAe/KNBirem5iMjOzknwFYWZmJdXNk9TLli2LDRs2ZB2GmVlNuf/++w9FRHepbXWTIDZs2MDOnTuzDsPMrKZI+v5M29zEZGZmJTlBmJlZSU4QZmZWkhOEmZmV5ARhZmYlOUGYmVlJThBmZlZS3TwHYWb1ISIYGp2gf2iUwtAYhaFR+ofGkvXB5OfI2ATdXa2sWtzOqkVtrFzUxsK2siYErBsTE8GhE8McLAwzEcHT1i4+7+9R0QQh6XqSsfUbSUaOvH3a9otJxqVfnO5za0Tcm257C8mkKePA6yPivkrGambnx8jYqZN7/7ST++TJvvikf/r6GIXBUcYmzn6MuM7WJlYuamNV+lq5qH1qedWi9jSJNCFp7oNlKCIoDI1xsDDE/sIQBwrDHCgMTb32F4Y5WBiir3946nN6+rrF/NNrrz3vsVQsQaQzdd0BPB/YA+yQtD0ieot2+33g4xHxV5K2APcCG9LlbcBTgNXA5yRdHhHjlYrXzE6JCAaGxzg8MMLhE8McGhhJlgeGTz+ZT0sEhcFRhscm5jx+Z2sTC9ua6GprpqutieVdbVza3URXWrYwLe9qa2Jhe/Np+y5sa6a5sYGD/UPsPz5E/vgQ+eOD5I8n6/uOD/G9A30c7B9m+likHS2NaRJpPz2RLE4TycJ2FrZXLokMjY6nJ/ph9heGOFh00j+Qru8vDDE0euZnuKi9mRULW1mxsI1Ny5dNLa9Y2MbaJe0l3u3cVfIK4hpgV0TsBpB0N8k8vsUJIjg1TeIiYF+6fCNwd0QMA49J2pUe76sVjNesrg2PjXPkRHKiPzQwPHXyT9ZPLR8eGObQiRFGZjjRtzU3FJ3Am1nU3szaJe1TJ/HpJ/Opk3578rOztYnGhnM/Aa9dsoC1SxbMuH10fIKD/cPsT5NH/tjpyeRLjxziYP8Q0y9WFkwlkVOJZOWiNlanVyGrFrWxqL35tCQyNj7BoYGR9Bv/0Azf/oc5Pjh6RpytTQ2sXJSc6K9au5jndbWyclEbyxe2sWJyuauN9pbGc/7MzlYlE8QaTp8CcQ+npnuc9DaS6SBfB3SQTAI/Wbd4Gsc9adlpJN0M3Axw8cUXn5egzWrFxERwbHA0OaGXOMEfnkoCSULoHxoreZyWpgaWdbSwtLOVpZ0tXL6ii2WdLSztbGFpR1K2rLOVZZ2tLOloprXpwp+onozmxgbWLG5nzeKZv12PpUlkMnFMvyL58q5DHCicmUTamxtZtaiNBa2NHCgMc2jgzKuVxgaxvKuV5Qvb2LC0g2ddsnTqG3/xt/9qbvbKupP6JuBvI+Idkp4N/L2kp5ZbOSLuBO4E6Onp8cQWVhcmOx/3HRsif2yQvceSk9WhgeGpb/6HBkY4enKE8RJt9RJctODUCf4pqxeyrLOVpUVJYFnRyb+ztXpPUJXW1NjA6sXtrF7cDiwpuc/Y+AR9A8NFVyGnEsnJkTGesmoRKxalJ/2utvTbfytLO1rPy5VSliqZIPYC64rW16ZlxX4duB4gIr4qqQ1YVmZds5pUGBpl37FB8seG0pP/IPuODSVl6bfX0fHTT/xtzQ0s72pjaWcLa5csYOu6xSW/4S/tbGHJgpaaPzFVk6bGhrSpqR3mWUNFJRPEDmCTpI0kJ/dtwEun7fMD4LnA36ZTIbYBfcB24COS3knSSb0J+K8Kxmp2XgyPjbP/eHriT0/6+45PnvyTRDAwfHpTT2ODWLkwadveum4xP3PVStYsbp9q/16zuJ3FC5rn7bd8y07FEkREjEm6BbiP5BbWuyLiQUm3ATsjYjvwO8D7JL2RpMP6lekk6g9K+jhJh/YY8FrfwWRZm5gI+gaGk5N+2tQwlQjSk/+hgeEz6i3taGHV4qQd+kcuXcbqxUnn5+rFba
xe3M7yrjZ/47eqVDdzUvf09IQnDLJzNTY+wQ+OnOTRvhPsOjjAroMDPHH0JPuODXKgMHRG009HSyOr0jbs1YuSE/7kt/7Jh7jammujU9fmJ0n3R0RPqW1Zd1KbZWJwZJxH+wZ4tC9JApM/Hz90kpHxU7d3rljYyvqLOuhZvyQ5+S9uZ83kFcCiyt4zb5Y1Jwira0dOjEyd/Cdfj/YNsPfY4NRtiQ2C9Us7uLS7k5/avILLlndyaXcHly7vnHfDN5gVc4KwmjcxEew7Ppie/JOmoUcPDrCrb4AjJ0am9mtrbuCSZZ084+IlvKRnHZct7+Sy5Z2sX7qgZu7tN7uQnCCsZoyMTfD9wydOuxLY1TfAowdPMDh66h6GJQuauWx5J9dtSa8GlndyWXcnaxa30+DOYLOyOUFYVTpyYoTPP3SQXZN9BAcH+P6Rk6c9GLZmcTuXLu/kh665KLka6E6uCJZ2tmYYuVn9cIKwqvS//7mXf/jmXpoaxIZlHVy+ooufvWoVly7v4LLuLi7p7qCj1f99zSrJf2FWlb699zjPubybD7yih+ZGz2tllgX/5VnVGRodZ3ffAFvXLnJyMMuQ//qs6nzvQD8TAVeuWjj3zmZWMU4QVnVy+QLgBGGWNScIqzq5fD8dLY1cfNHMk8GYWeU5QVjV6d1X4IqVXX5mwSxjThBWVSKC3P4CW1a7ecksa04QVlX2HB2kf2jM/Q9mVcAJwqqKO6jNqocThFWV3nwBCTav7Mo6FLN5zwnCqkouX2Dj0g4WtPghf7OsOUFYVcnl+928ZFYlnCCsavQPjfKDIye5cpWbl8yqQUUThKTrJT0saZekW0ts/wtJD6Sv70k6VrRtvGjb9krGadXh4f39gDuozapFxRp6JTUCdwDPB/YAOyRtj4jeyX0i4o1F+78OuLroEIMRsbVS8Vn16fUdTGZVpZJXENcAuyJid0SMAHcDN86y/03ARysYj1W5XL7A4gXNrFrUlnUoZkZlE8Qa4Imi9T1p2RkkrQc2Av9eVNwmaaekr0l64Qz1bk732dnX13eewras9Ob7uXLlQiQPsWFWDaqlk3obcE9EjBeVrY+IHuClwLskXTq9UkTcGRE9EdHT3d19oWK1ChifCB7eX3DzklkVqWSC2AusK1pfm5aVso1pzUsRsTf9uRv4Aqf3T1idefzwCYZGJ3wHk1kVqWSC2AFskrRRUgtJEjjjbiRJm4ElwFeLypZIak2XlwHXAr3T61r96N3nDmqzalOxu5giYkzSLcB9QCNwV0Q8KOk2YGdETCaLbcDdERFF1a8E3itpgiSJ3V5895PVn1y+QFOD2LSiM+tQzCxV0fEMIuJe4N5pZW+dtv62EvW+AlxVydisuuTyBS5b3klrU2PWoZhZqlo6qW2e8xAbZtXHCcIyd+TECPsLQ+6gNqsyThCWuck5ILasWpRxJGZWzAnCMndqkiBfQZhVEycIy1xvvsDyrlaWdrZmHYqZFXGCsMy5g9qsOjlBWKZGxibYddAJwqwaOUFYpnYdHGB0PNiy2gnCrNo4QVimTt3B5A5qs2rjBGGZyuULtDY1sGFpR9ahmNk0ThCWqd58gStWdtHU6P+KZtXGf5WWmYggly9w5Ur3P5hVIycIy8yBwjBHT466g9qsSjlBWGZOPUHtBGFWjZwgLDO9aYLY7DuYzKqSE4RlpjdfYO2Sdha2NWcdipmV4ARhmcnlC2xx85JZ1XKCsEwMjozz+KET7n8wq2JOEJaJhw/0MxHuoDarZhVNEJKul/SwpF2Sbi2x/S8kPZC+vifpWNG2V0h6JH29opJx2oV3aogNJwizatVUqQNLagTuAJ4P7AF2SNoeEb2T+0TEG4v2fx1wdbp8EfAHQA8QwP1p3aOVitcurN59BTpbm1i7pD3rUMxsBpW8grgG2BURuyNiBLgbuHGW/W8CPpou/zTw2Yg4kiaFzwLXVzBWu8By+QJXruqioUFZh2JmM6hkglgDPFG0victO4Ok9cBG4N/Ppq6kmyXtlLSzr6/vvARtlTcxETy033NAmFW7aumk3gbcExHjZ1MpIu6MiJ6I6Onu7q5QaHa+7Tk6yMDwmBOEWZWrZILYC6wrWl+blpWyjVPNS2db12pMb/444DuYzKpdJRPEDmCTpI2SWkiSwPbpO0naDCwBvlpUfB9wnaQlkpYA16VlVgd68/00CK5Y4SE2zKpZxe5iiogxSbeQnNgbgbsi4kFJtwE7I2IyWWwD7o6IKKp7RNLbSZIMwG0RcaRSsdqFlcsX2Lisg/aWxqxDMbNZVCxBAETEvcC908reOm39bTPUvQu4q2LBWWZy+QJb1y3OOgwzm0O1dFLbPHF8cJQ9Rwfd/2BWA5wg7IJ6yE9Qm9UMJwi7oKaG2PAscmZVb8Y+CEkvKqP+UNrPYFaWXL6fizpaWN7VmnUoZjaH2Tqp3wf8EzDbWAjPYVontNlscvuTITYkD7FhVu1mSxD/EhG/NltlSR86z/FYHRsbn+Ch/f28/Fnrsw7FzMowYx9ERLxsrsrl7GM26bFDJxgZm/AdTGY1ouxOakmXSfqQpE9KenYlg7L61OsOarOaMlsndVtEDBUVvR14c7r8aWBrBeOyOpTL99PcKC7t7sw6FDMrw2xXEJ+W9PKi9VFgA7AeOKtRV80gucX1suVdtDT57mqzWjDbX+r1wEJJn5H0HOB3SSby+QXgly9EcFZfetNJgsysNszYxJTOzfCXkv4e+F/AbwK/HxGPXqjgrH4cGhimr3/YT1Cb1ZDZ+iB+GHgTMAL8MTAI/JGkvcDbI+LYBYnQ6kLOQ2yY1ZzZnoN4L/CzQCfwNxFxLbBN0o8DHyNpbjIry2SC8C2uZrVjtgQxRtIp3UFyFQFARHwR+GJlw7J6k8v3s3JhG0s6WrIOxczKNFuCeCnwapLk8PJZ9jObU+8+d1Cb1ZrZOqm/B/zOBYzF6tTw2DiP9g3wvC3Lsw7FzM7CjLe5SvrnuSqXs4/ZIwcGGJsI9z+Y1ZjZmph+VNL2WbYL2HKe47E65A5qs9o0W4K4sYz6I7NtlHQ98G6gEXh/RNxeYp+XAG8DAvhWRLw0LR8HvpPu9oOIuKGMeKwK9eYLtDU3sGFpR9ahmNlZmK0P4pzuVJLUCNwBPB/YA+yQtD0ieov22QS8Bbg2Io5KKm6kHoyIrecSg1WHXL7AFSsX0tjgOSDMakklB8W5BtgVEbsjYgS4mzOvSn4DuCMijgJExMEKxmMZiAhy+X4/IGdWgyqZINYATxSt70nLil0OXC7py5K+ljZJTWqTtDMtf2GpN5B0c7rPzr6+vvMavJ0f+eNDHB8cZYtvcTWrOXMmCEk/L6lSiaQJ2AT8BHAT8D5Ji9Nt6yOih+R5jHdJunR65Yi4MyJ6IqKnu7u7QiHauXAHtVntKufE/0vAI5L+VNLmszj2XmBd0fratKzYHmB7RIxGxGPA90gSBhGxN/25G/gCcPVZvLdVid59SYLY7ARhVnPmTBDptKJXA48Cfyvpq2nTzlxtBjuATZI2SmoBtgHTb5v9FMnVA5KWkTQ57Za0RFJrUfm1Q
C9Wc3L7C6xfuoDO1tlumDOzalRW01FEFIB7SDqaV5HMCfENSa+bpc4YcAtwH5ADPh4RD0q6TdLkLav3AYcl9QKfB94UEYeBK4Gdkr6Vlt9efPeT1Y5cvp8rV/rqwawWzfm1Lj2Z/ypwGfB3wDURcVDSApJv9e+ZqW5E3AvcO63srUXLAfx2+ire5yvAVeX/GlaNTo6M8fjhE7xw6/R7E8ysFpRz3f9i4C8i4j+KCyPipKRfr0xYVg8e2t9PBB6kz6xGlZMg3gbkJ1cktQMrIuLxiPi3SgVmtW+yg9p3MJnVpnL6ID4BTBStj6dlZrPK5Qt0tTWxdkl71qGY2ZNQToJoSp+EBiBd9qwvNqdcvsCVqxYieYgNs1pUToLoK7rrCEk3AocqF5LVg4mJ4KH9HmLDrJaV0wfxGuDDkv6SZIjvJ/AMczaH7x85ycmRcXdQm9WwORNERDwKPEtSZ7o+UPGorOZ5iA2z2lfW462Sfg54CskAegBExG0VjMtqXC5foLFBXL7CVxBmtaqcwfr+mmQ8pteRNDH9N2B9heOyGpfLF7hkWQdtzY1Zh2JmT1I5ndQ/EhEvB45GxB8CzyYZM8lsRrl8v5uXzGpcOQliKP15UtJqYJRkPCazko6dHGHvsUEnCLMaV04fxKfTORr+DPgGydzR76tkUFbbcvl+ALasdoIwq2WzJoh0oqB/i4hjwCcl/TPQFhHHL0RwVptO3cHkDmqzWjZrE1NETAB3FK0POznYXHL5Ass6W1je1ZZ1KGZ2Dsrpg/g3SS+Wx0uwMvWmQ2yYWW0rJ0G8mmRwvmFJBUn9kgoVjstq1Oj4BI8cGHCCMKsD5TxJ7YZkK9vuvhOMjE94DCazOlDOjHLPKVU+fQIhM/AQG2b1pJwmpjcVvf4X8GmSSYTmJOl6SQ9L2iXp1hn2eYmkXkkPSvpIUfkrJD2Svl5RzvtZ9nL5Ai2NDVzS3ZF1KGZ2jsppYvr54nVJ64B3zVVPUiPJHVDPB/YAOyRtj4jeon02AW8Bro2Io5KWp+UXAX8A9JA8d3F/Wvdoub+YZaM3X2DTik6aG8v57mFm1ezJ/BXvAa4sY79rgF0RsTudZOhu4MZp+/wGcMfkiT8iDqblPw18NiKOpNs+C1z/JGK1CyznO5jM6kY5fRDvIfkWD0lC2UryRPVc1pDMHTFpD/DD0/a5PH2PLwONwNsi4jMz1F1TIrabgZsBLr744jJCsko62D/EoYERd1Cb1YlyhtrYWbQ8Bnw0Ir58Ht9/E/ATwFrgPyRdVW7liLgTuBOgp6cn5tjdKmxyiA1fQZjVh3ISxD3AUESMQ9K3IGlBRJyco95eYF3R+tq0rNge4OsRMQo8Jul7JAljL0nSKK77hTJitQxN3sHkKwiz+lDWk9RAe9F6O/C5MurtADZJ2iipBdgGbJ+2z6dIE4GkZSRNTruB+4DrJC2RtAS4Li2zKta7r8DqRW0sWtCcdShmdh6UcwXRVjzNaEQMSFowV6WIGJN0C8mJvRG4KyIelHQbsDMitnMqEfQC48CbIuIwgKS3kyQZgNsi4shZ/WZ2weXyBY/galZHykkQJyQ9IyK+ASDpmcBgOQePiHuBe6eVvbVoOYDfTl/T694F3FXO+1j2hkbH2X3oBNc/dWXWoZjZeVJOgvgt4BOS9pFMObqSZApSsymPHBhgfCLcQW1WR8p5UG6HpM3AFWnRw2mnstmU3nwyCrwThFn9mLOTWtJrgY6I+G5EfBfolPTfKx+a1ZJcvp8FLY2sv2jO7ikzqxHl3MX0G+mMcgCkTzb/RsUisprUmy+weWUXDQ2eNsSsXpSTIBqLJwtKx1hqqVxIVmsiwkNsmNWhcjqpPwN8TNJ70/VXp2VmAOw9Nkj/0JgThFmdKSdB/A+S8Y5+M13/LPC+ikVkNad3n+eAMKtHczYxRcRERPx1RPxiRPwi0Au8p/KhWa3I5fuRYPNKTz5oVk/KuYJA0tXATcBLgMeAf6hkUFZbcvkCG5Z20NFa1n8nM6sRM/5FS7qcJCncBBwCPgYoIn7yAsVmNSK3v8BTPMSGWd2ZrYnpIeCngBdExI9GxHtIxksymzIwPMb3D5/kypVOEGb1ZrYE8SIgD3xe0vskPZdkqA2zKQ/l3UFtVq9mTBAR8amI2AZsBj5PMibTckl/Jem6CxSfVbmpOSDcxGRWd8q5i+lERHwkIn6eZOKeb5Lc+mpGb76fRe3NrFrUlnUoZnaelfMk9ZSIOBoRd0bEcysVkNWW5AnqLooetjezOnFWCcKs2PhE8NB+D7FhVq+cIOxJe/zwCYZGJ5wgzOqUE4Q9aVMd1E4QZnXJCcKetFy+QFOD2LSiM+tQzKwCKpogJF0v6WFJuyTdWmL7KyX1SXogfb2qaNt4Ufn2SsZpT04u38+l3Z20NjVmHYqZVUDFBs9J5424A3g+sAfYIWl7RPRO2/VjEXFLiUMMRsTWSsVn5653X4FnXXJR1mGYWYVU8griGmBXROyOiBHgbuDGCr6fXUBHT4ywvzDkDmqzOlbJBLEGeKJofU9aNt2LJX1b0j2S1hWVt0naKelrkl5Y6g0k3Zzus7Ovr+/8RW5z8hPUZvUv607qTwMbIuJpJBMRfbBo2/qI6AFeCrxL0qXTK6cP7fVERE93d/eFidiAZA5q8BhMZvWskgliL1B8RbA2LZsSEYcjYjhdfT/wzKJte9Ofu4EvAFdXMFY7S735At1drSzrbM06FDOrkEomiB3AJkkbJbUA24DT7kaStKpo9QYgl5YvkdSaLi8DriWZyc6qRC7f76sHszpXsbuYImJM0i3AfUAjcFdEPCjpNmBnRGwHXi/pBmAMOAK8Mq1+JfBeSRMkSez2Enc/WUZGxibYdbCfH7/czXpm9ayic0RGxL3AvdPK3lq0/BbgLSXqfQW4qpKx2ZP3aN8Ao+PBlas8B7VZPcu6k9pqkIfYMJsfnCDsrPXuK9DS1MDGZR1Zh2JmFeQEYWctt7/AFSu6aGr0fx+zeua/cDsrEUEu3+/mJbN5wAnCzsrB/mGOnBhxB7XZPOAEYWfFT1CbzR9OEHZWevclCWKzE4RZ3XOCsLOSyxdYs7idRe3NWYdiZhXmBGFnJZcveARXs3nCCcLKNjgyzmOHTrj/wWyecIKwsj18oJ+JgC2+g8lsXnCCsLLlfAeT2bziBGFly+ULdLY2sW7JgqxDMbMLwAnCypbLF9i8souGBmUdipldAE4QVpaJifAkQWbzjBOElWXP0UEGhsecIMzmEScIK8upITZ8B5PZfOEEYWXJ5Qs0CDav9BWE2XzhBGFlyeULbFjWQXtLY9ahmNkFUtEEIel6SQ9L2iXp1hLbXympT9ID6etVRdteIemR9PWKSsZpc+vNF9z/YDbPNFXqwJIagTuA5wN7gB2StkdE77RdPxYRt0yrexHwB0APEMD9ad2jlYrXZlYYGmXP0UFuuubirEMxswuoklcQ1wC7ImJ3RIwAdwM3lln3p4HPRsSRNCl8Fri+QnHaHB7K9wN4FjmzeaaSCWIN8ETR+p60bLoXS/q2pHskrTubupJulrRT0s6+
vr7zFbdN4yE2zOanrDupPw1siIinkVwlfPBsKkfEnRHRExE93d3dFQnQkgSxZEEzKxa2Zh2KmV1AlUwQe4F1Retr07IpEXE4IobT1fcDzyy3rl04kx3UkofYMJtPKpkgdgCbJG2U1AJsA7YX7yBpVdHqDUAuXb4PuE7SEklLgOvSMrvAxsYneHi/h9gwm48qdhdTRIxJuoXkxN4I3BURD0q6DdgZEduB10u6ARgDjgCvTOsekfR2kiQDcFtEHKlUrDazxw+fYHhswh3UZvNQxRIEQETcC9w7reytRctvAd4yQ927gLsqGZ/NrTe9g8lXEGbzT9ad1FblcvkCzY3isuWdWYdiZheYE4TNqndfgUu7O2lp8n8Vs/nGf/U2q1y+4P4Hs3nKCcJmdHhgmIP9w2xZ7QRhNh85QdiMcu6gNpvXnCBsRr3544AThNl85QRhM8rl+1mxsJWLOlqyDsXMMuAEYTNyB7XZ/OYEYSUNj42z6+CAm5fM5jEnCCtp18EBxibCCcJsHnOCsJJ693kOCLP5zgnCSsrl+2lrbmDjso6sQzGzjDhBWEm5fIErVi6kscFzQJjNV04QdoaIILe/wJZVXVmHYmYZcoKwM+wvDHHs5Kj7H8zmOScIO4M7qM0MnCCshFw+SRCbV7qJyWw+c4KwM+Ty/Vx80QK62pqzDsXMMlTRBCHpekkPS9ol6dZZ9nuxpJDUk65vkDQo6YH09deVjNNOl8sXuNId1GbzXsXmpJbUCNwBPB/YA+yQtD0ieqft1wW8Afj6tEM8GhFbKxWflXZyZIzHDp/ghq2rsw7FzDJWySuIa4BdEbE7IkaAu4EbS+z3duBPgKEKxmJlemh/PxHuoDazCl5BAGuAJ4rW9wA/XLyDpGcA6yLi/0l607T6GyV9EygAvx8RX6pEkEdPjPCC9/znaWXSDMtohvLi/VWyfHrBTHUiIvk59U/yIyImV4mAIIg4tV5cP4rKJvcrrju5Nlle/J6jYxMAHsXVzCqaIGYlqQF4J/DKEpvzwMURcVjSM4FPSXpKRBSmHeNm4GaAiy+++EnF0dQonnXJ0qn1U6dhmGFx6oR6Znnp/Werc/p7RJKE0nwhkuQxmT6kU2WT20/tq6ntp/bVVCJTurNOO/appFd87NWL21i7pB0zm98qmSD2AuuK1temZZO6gKcCX0hPeCuB7ZJuiIidwDBARNwv6VHgcmBn8RtExJ3AnQA9PT3Tz8ll6Wpr5h0vefqTqWpmVtcq2QexA9gkaaOkFmAbsH1yY0Qcj4hlEbEhIjYAXwNuiIidkrrTTm4kXQJsAnZXMFYzM5umYlcQETEm6RbgPqARuCsiHpR0G7AzIrbPUv05wG2SRoEJ4DURcaRSsZqZ2ZlU3DZey3p6emLnzp1z72hmZlMk3R8RPaW2+UlqMzMryQnCzMxKcoIwM7OSnCDMzKwkJwgzMyupbu5iktQHfP8cDrEMOHSewql1/ixO58/jdP48TqmHz2J9RHSX2lA3CeJcSdo5061e840/i9P58zidP49T6v2zcBOTmZmV5ARhZmYlOUGccmfWAVQRfxan8+dxOn8ep9T1Z+E+CDMzK8lXEGZmVpIThJmZlTTvE4Sk6yU9LGmXpFuzjidLktZJ+rykXkkPSnpD1jFlTVKjpG9K+uesY8mapMWS7pH0kKScpGdnHVOWJL0x/Tv5rqSPSmrLOqbzbV4niHRSojuAnwG2ADdJ2pJtVJkaA34nIrYAzwJeO88/D4A3ALmsg6gS7wY+ExGbgaczjz8XSWuA1wM9EfFUkjlvtmUb1fk3rxMEcA2wKyJ2R8QIcDdwY8YxZSYi8hHxjXS5n+QEsCbbqLIjaS3wc8D7s44la5IWkUzk9QGAiBiJiGOZBpW9JqBdUhOwANiXcTzn3XxPEGuAJ4rW9zCPT4jFJG0Arga+nnEoWXoX8GaSWQ3nu41AH/A3aZPb+yV1ZB1UViJiL/DnwA+APHA8Iv4126jOv/meIKwESZ3AJ4HfiohC1vFkQdILgIMRcX/WsVSJJuAZwF9FxNXACWDe9tlJWkLS2rARWA10SHpZtlGdf/M9QewF1hWtr03L5i1JzSTJ4cMR8Q9Zx5Oha4EbJD1O0vT4U5I+lG1ImdoD7ImIySvKe0gSxnz1POCxiOiLiFHgH4AfyTim826+J4gdwCZJGyW1kHQybc84psxIEkkbcy4i3pl1PFmKiLdExNqI2EDy/+LfI6LuviGWKyL2A09IuiItei7Qm2FIWfsB8CxJC9K/m+dSh532TVkHkKWIGJN0C3AfyV0Id0XEgxmHlaVrgV8BviPpgbTs9yLi3uxCsiryOuDD6Zep3cCvZhxPZiLi65LuAb5BcvffN6nDYTc81IaZmZU035uYzMxsBk4QZmZWkhOEmZmV5ARhZmYlOUGYmVlJThBmKUkD6c8Nkl56no/9e9PWv3I+j29WCU4QZmfaAJxVgkgHbJvNaQkiIuruqVurP04QZme6HfgxSQ+kY/43SvozSTskfVvSqwEk/YSkL0naTvpUsaRPSbo/nSfg5rTsdpJRPx+Q9OG0bPJqRemxvyvpO5J+qejYXyiaf+HD6RO7SLo9nbPj25L+/IJ/OjZvzOsnqc1mcCvwuxHxAoD0RH88In5IUivwZUmTI3c+A3hqRDyWrv9aRByR1A7skPTJiLhV0i0RsbXEe70I2Eoyv8KytM5/pNuuBp5CMoz0l4FrJeWAXwA2R0RIWnx+f3WzU3wFYTa364CXp8OPfB1YCmxKt/1XUXIAeL2kbwFfIxkIchOz+1HgoxExHhEHgC8CP1R07D0RMQE8QNL0dRwYAj4g6UXAyXP83cxm5ARhNjcBr4uIrelrY9HY/yemdpJ+gmSUz2dHxNNJxuc5l2koh4uWx4GmiBgjmejqHuAFwGfO4fhms3KCMDtTP9BVtH4f8JvpUOhIunyGyXIWAUcj4qSkzSTTtk4anaw/zZeAX0r7ObpJZm37r5kCS+fqWJQOoPhGkqYps4pwH4TZmb4NjKdNRX9LMhfzBuAbaUdxH/DCEvU+A7wm7Sd4mKSZadKdwLclfSMifrmo/B+BZwPfAgJ4c0TsTxNMKV3AP0lqI7my+e0n9RualcGjuZqZWUluYjIzs5KcIMzMrCQnCDMzK8kJwszMSnKCMDOzkpwgzMysJCcIMzMr6f8DtnEhqj6H3isAAAAASUVORK5CYII=\n",
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAot0lEQVR4nO3deXxcZ33v8c9Xuyx5X+LEexI7djYSUMOSsrQ0kFJIKFDqpBRyWxrgEkqhhZv09lIaXuXSjcItebUNNMAtS6CBUtObktKyFgq1Q0xCrHHiOIttNIls2dZYsvbf/eMcyWN5JI1tjWak+b5fr3nNOc9Z5qeJc37znOc5z6OIwMzMbLyacgdgZmaVyQnCzMwKcoIwM7OCnCDMzKwgJwgzMyvICcLMzApygjAzs4KcIGzWk3Qs7zUi6Xje+q+dwfm+JenNpYjVbDapK3cAZmcrIlpHlyU9Abw5Iv6tfBGVlqS6iBgqdxw297kGYXOWpBpJt0p6TNIhSV+UtCTd1iTpM2n5EUnbJZ0j6Y+BFwIfS2sgH5vg3P8gKSvpqKTvSLokb1uzpL+Q9GS6/T8kNafbflbS99PP3CfpprT8pFqLpJsk/Ufeekh6u6RHgUfTso+m5+iWdL+kF+btXyvp99O/PZduXyPpDkl/Me5v2SbpXWf/jdtc4wRhc9k7gFcDLwbOAw4Dd6Tb3gQsBNYAS4G3Ascj4n8C3wVuiYjWiLhlgnP/C7ARWAH8CPhs3rY/B54DvABYArwXGJG0Lj3ur4DlwBXAztP4e14NPBe4OF3fnp5jCfA54B8kNaXb3g3cALwCWAD8BtALfBq4QVINgKRlwC+kx5udxLeYbC57K8mFfj+ApPcDT0n6dWCQJDFcGBEPAvefzokj4q7R5fS8hyUtBHIkF+PnRcSBdJfvp/vdCPxbRHw+LT+Uvor1vyOiKy+Gz+Rt+wtJfwBcBPwYeDPw3ojYnW7/8ehnSjoKvBT4OrAV+FZEPH0acViVcA3C5rJ1wD+mt3OOAO3AMHAO8PfAfcDdkn4q6U8l1Rdz0vT2zYfS2zfdwBPppmXpqwl4rMChayYoL9a+cXH8nqT29DbWEZIa0bIiPuvTwBvS5TeQfBdmp3CCsLlsH/CLEbEo79UUEQciYjAi/igiLia5FfRK4I3pcVMNcXwjcD3JrZmFwPq0XMBBoA+4YIJ4CpUD9ADz8tZXFthnLK60veG9wOuBxRGxCDiaxjDVZ30GuF7Ss4AtwFcm2M+qnBOEzWV/A/xxeu8fScslXZ8u/5ykyyTVAt0kt5xG0uOeBs6f5LzzgX6S20PzgA+OboiIEeAu4MOSzktrG8+X1EjSTvELkl4vqU7SUklXpIfuBF4jaZ6kC4HfnOJvmw8MAZ1AnaT3kbQ1jPoE8AFJG5W4XNLSNMb9JO0Xfw98KSKOT/FZVqWcIGwu+yiwDfhXSTngBySNvJD8Qr+HJDm0A9/mxK2WjwKvk3RY0v8pcN7/CzwJHAB2pefN93vAQyQX4S7gT4CaiHiKpNH4d9PyncCz0mP+EhggSU6f5uRG70LuA74GPJLG0sfJt6A+DHwR+Nf0b/w7oDlv+6eBy/DtJZuEPGGQWfWR9CKSW03rwhcBm4BrEGZVJm2MfyfwCScHm4wThFkVkbQFOAKcC3ykrMFYxfMtJjMzK8g1CDMzK2jOPEm9bNmyWL9+fbnDMDObVe6///6DEbG80LY5kyDWr1/Pjh07yh2GmdmsIunJibb5FpOZmRXkBGFmZgU5QZiZWUFOEGZmVpAThJmZFeQEYWZmBZU0QUi6VtJuSXsk3Vpg+1pJ35T0gKQHJb0ib9tt6XG7Jb28lHGamdmpSvYcRDrO/h3ANcB+YLukbRGxK2+3PwC+GBF/Leli4F5gfbq8FbiEZC7hf5O0KSKGSxWvmRUWETzd3c+D+4/w6DPHaKyrYdG8BhbPqx97XzyvgYXN9dTUaOoT2qxRygflrgL2RMReAEl3k8zClZ8gghOTnCwEfpouXw/cHRH9wOOS9qTn+88SxmtmwDPdfTy4/ygPHTjx6sz1T3mcBAubk2SxaN7J7yeSSd5yS7Ktqb52Bv4qOxOlTBCrOHkCk/2cmKxl1PtJJnN5B9BCMoXj6LH5k7DsT8tOIulm4GaAtWvXTkvQZtWkM9fPTw4cTRPCER46cJSnu5NkUCO4YHkrL9y4jMtXLeSy1QvZvHIBQ8PB4d4BjhwfTN57BzjcM5i8946WDfJ0dx+7szkO9w7QOzBx5b+pviZNJg0saq5nccvJNZNCtZUFzfXUVkFtpW9wmEM9A3QdG+BQTz+Hjg3Q1TPAwZ5+usaWB1i/dB4f3XrltH9+uYfauAH4VET8haTnA38v6dJiD46IO4E7Adra2jwsrdkkDh3rT2oEebWDjqN9QPLr//xlLbzggmVcliaDi89dQEtj4UvEwnn1p/XZ/UPDHEmTx6nJJFk+kiaW3dkcR3oHOXJ8kOGRwv9bS9DaUMf8pjrmN9WzoDl5T9ZPLC8Y9z5W3lxPS0Mt0swmmb7BYbp6BjiUXvBPLA/QlSaAQz3ptmMD9EyQWOtrxdKWRpa0NLC0tYFzFzYX3O9slTJBHADW5K2vTsvy/SZwLUBE/KekJmBZkcea2QQO9wycuEWUJoQDR05MPX3+shau2rAkSQarFnLJqoW0TpAMpkNjXS3nLKjlnAVNRR8zMhLk+odOTSY9SfLI9Q3SfXyIXN8gub4hnsn18VjnELm+IbqPDzI0QXIZVSNobTw5mUyVaOY31bMgr7y2RnT1JL/kD/UMcOhY/ynLB9Nf+l09AxzrHyoYS32tkot9SyNLWxtYt3Te2PLSloaxRLC0pZElrQ3Mb6ybkeRWygSxHdgoaQPJxX0rcOO4fZ4CXgp8Kp3IpIlkEvZtwOckfZikkXoj8F8ljNVszOGeAXbuP8IDTx1h574j9PYPsaA5uTAk78mFJHk/dX1+Ux31tTPXg/xo7yA/+enJt4n2dZ1IBuuXzuPZ6xbzphes47JVi7hk1QIWNJ1eDaAcamrEwuZ6FjbXs27p6R0bEfQNjiRJpO9EEulO33Nj70nZaKL56ZE+cv25sW0T1WCmUlcjlrY2sKSlkaUtyQV/SUsDy1qTX/3JcrJ9SUsDC5pm5oJ/ukqWICJiSNItJJOr1wJ3RcTDkm4HdkTENpLJ2z8u6V0kDdY3pVMgPizpiyQN2kPA292DyUphcHiE3dkcDzx1mAeeOsID+47w+MEeIPmFuemc+SxpaeCZXB97nhm9mAwy1XVjXkNtgURy9gmmu2+Qn4y7TfTkod6x7WuXzOPyVYv4teeu4/K0ZrCwufKTwXSTRHNDLc0NtaxYMPX+hUQEvQPDYwmlUKIZHgkWz2sY+6W/tLWyL/ina87MKNfW1hYe7tumkj3alySDfUd44KnDPHTgKH2DIwAsa23kyrWLkteaxVy+emHBe/CjF47RX56jSWNsPX+5b7DAflP/Mh2fYFoa63iqq3cseQGsXtw81l5w+apFXLpqAYvmNUzvF2ZznqT7I6Kt0LZyN1
KblUzf4DAPHTjKA08dZue+5JbRaKNsQ20Nl6xawI1XrRtLCqsWNRf1q08SLY11tDTWce7C04/rTBLM4d4BNp3Tyuues5pL03aDJS1OBlZaThB2kogg291Ha2MdrTPUEDYdIoInDvWyc196q+ipI7R3dI81VK5Z0szPrF+SJoPFbDl3Po115el/f7YJxmymOEHYSe7evo/bvvwQkPzKXtxSnzakpe/z0vfWBpbMaxhrcFvSkvRRr5uhxtnuvkF+nNYKRmsIh3sHAWhpqOVZaxbxlhefz5VrFnPF2kUsa22ckbjM5hInCDvJ/U8eZvG8et72kgvo6hmkK+2r3dUzwEOHj9DVM0B3X+GuepA8STvaLW9xS9JwN/q+ZNzykpYG5jVM/U9weCR45OncWDJ4YN8RHus8RkTSH37jilZedvFKrkhvFW1cMb8qHqIyKzUnCDtJJtvNpasWcvOLLphwn8HhEQ73DNDVmzzh2dU7MPbAz+HepA/44Z4B9nX1Jr/sewYm7JPeVF+T1ERGu/zNO1Fj6RkYZudTR/jx/iNjT+IuaWngyjWLuP5Z53Hl2sVcvmbhrOiyaTYbOUHYmKHhER55+hg3vWD9pPvV19awYkETK4p86Cki6O4bGquJdKUJZPTp0bGaSu8gjx88NvYEaV2NuOS8BfzKc1Zz5drFXLl2EWuXzJs17SJms50ThI154lAPA0MjbF45f1rPK5144GnDspaijukbHEaibA3JZuYEYXnaO3IAbF55hk8WTSOP8GlWfp5RzsZkst3U1YgLVhT3K9/M5jYnCBuT6chxwfJW39YxM8AJwvJksjk2nzu97Q9mNns5QRgAR48PcuDI8YpofzCzyuAEYQDszqYN1K5BmFnKCcIA2J3tBmCLaxBmlnKCMADaszkWzavnnAUes8jMEk4QBkCmo5vNK+f7KWUzG+MEYYyMBLuzOTdQm9lJnCCM/YeP0zMwPO1DbJjZ7OYEYbSnDdSbz3UNwsxOKGmCkHStpN2S9ki6tcD2v5S0M309IulI3rbhvG3bShlntct05JBg0zmt5Q7FzCpIyQbrk1QL3AFcA+wHtkvaFhG7RveJiHfl7f8O4Mq8UxyPiCtKFZ+dkMl2s35pS1GT95hZ9ShlDeIqYE9E7I2IAeBu4PpJ9r8B+HwJ47EJZLI5tz+Y2SlKmSBWAfvy1venZaeQtA7YAHwjr7hJ0g5JP5D06gmOuzndZ0dnZ+c0hV1degeGeOJQj3swmdkpKqWReitwT0QM55Wti4g24EbgI5JOmQMzIu6MiLaIaFu+fPlMxTqnPPJ0Mrezh9gws/FKmSAOAGvy1lenZYVsZdztpYg4kL7vBb7Fye0TNk0yHR5iw8wKK2WC2A5slLRBUgNJEjilN5KkzcBi4D/zyhZLakyXlwFXA7vGH2tnL5PN0dJQy+rFzeUOxcwqTMm6rUTEkKRbgPuAWuCuiHhY0u3AjogYTRZbgbsjIvIO3wL8raQRkiT2ofzeTzZ92ju6uWjlfGpqPMSGmZ2spP0aI+Je4N5xZe8bt/7+Asd9H7islLEZRASZbI5fuvzccodiZhWoUhqprQyy3X0cPT7IFndxNbMCnCCqWKZjdJIgN1Cb2amcIKrY6BhMF7kGYWYFOEFUsUxHjlWLmlnQVF/uUMysAjlBVLFMtpstfkDOzCbgBFGl+oeGeazTQ2yY2cScIKrUnmeOMTwSHmLDzCbkBFGlxnowuQZhZhNwgqhSmWw3jXU1rF86r9yhmFmFcoKoUplsjk3nzKeu1v8EzKwwXx2qVHuHJwkys8k5QVShzlw/B4/1+wlqM5uUE0QV2p1NGqg9BpOZTcYJogplPMSGmRXBCaIKtXfkWDG/kaWtjeUOxcwqmBNEFcpku93+YGZTcoKoMkPDIzz69DG3P5jZlJwgqszjB3sYGB7xEBtmNiUniCrTnvUQG2ZWnJImCEnXStotaY+kWwts/0tJO9PXI5KO5G17k6RH09ebShlnNcl0dFNXIy5Y3lruUMyswtWV6sSSaoE7gGuA/cB2SdsiYtfoPhHxrrz93wFcmS4vAf4QaAMCuD899nCp4q0WmWyOC1e00lDnyqOZTa6UV4mrgD0RsTciBoC7gesn2f8G4PPp8suBr0dEV5oUvg5cW8JYq0amo9tDbJhZUUqZIFYB+/LW96dlp5C0DtgAfON0jpV0s6QdknZ0dnZOS9Bz2dHeQX56tM9dXM2sKJVyn2ErcE9EDJ/OQRFxZ0S0RUTb8uXLSxTa3DH6BLVrEGZWjFImiAPAmrz11WlZIVs5cXvpdI+1ImVGx2ByDcLMilDKBLEd2Chpg6QGkiSwbfxOkjYDi4H/zCu+D3iZpMWSFgMvS8vsLGSy3SyeV8+K+R5iw8ymVrJeTBExJOkWkgt7LXBXRDws6XZgR0SMJoutwN0REXnHdkn6AEmSAbg9IrpKFWu1yGRzbF65AEnlDsXMZoGSJQiAiLgXuHdc2fvGrb9/gmPvAu4qWXBVZmQk2J3N8as/s2bqnc3MqJxGaiuxfYd76R0YdgO1mRXNCaJKtHd4iA0zOz1OEFUik+1Ggk3nuAZhZsVxgqgSmY4cG5a20NxQW+5QzGyWcIKoEskkQa49mFnxJuzFJOk1RRzfl/ZUsgrW0z/Ek129vObZq8sdipnNIpN1c/048E/AZJ3mX8S4bqxWeR55OkeEh9gws9MzWYL4l4j4jckOlvSZaY7HSsBDbJjZmZiwDSIi3jDVwcXsY+WX6eimtbGOVYuayx2Kmc0iRTdSS7pQ0mckfUnS80sZlE2v9myOi1bOp6bGQ2yYWfEma6Ruioi+vKIPAO9Nl78KXFHCuGyaRASZjm5e9azzyh2Kmc0yk9UgvirpjXnrg8B6YB1wWvM2WPl0HO2ju2/IkwSZ2WmbLEFcCyyQ9DVJLwJ+j2Qq0F8Gfm0mgrOzNzpJ0Bb3YDKz0zThLaZ0drePSfp74H8BbwP+ICIem6ng7OyNjsG0yQnCzE7TZG0QzwXeAwwAHwSOA38s6QDwgYg4MiMR2lnJZHOsXtzMgqb6codiZrPMZM9B/C3wCqAV+GREXA1slfRi4Askt5uswmU6uj2Cq5mdkcnaIIY40Sg9MFoYEd+OCCeHWaBvcJi9B3vY4jGYzOwMTFaDuBF4C0lyeOMk+1mF2vPMMYZHwjUIMzsjkzVSPwL87gzGYtNsdIgNj+JqZmdiwltMkv55qoOn2kfStZJ2S9oj6dYJ9nm9pF2SHpb0ubzyYUk709e2qWKxU2U6ummsq2H90pZyh2Jms9Bkt5h+dooLs4CLJ9wo1QJ3ANcA+4HtkrZFxK68fTYCtwFXR8RhSSvyTnE8Iq4o4m+wCWTSITZqPcSGmZ2ByRLE9UUcPzDJtquAPRGxF0DS3ek5d+Xt81vAHRFxGCAininiM61ImWw3P795xdQ7mpkVMFkbxLfP8tyrgH156/uB547bZxOApO8BtcD7I+Jr6bYmSTtIelN9KCK+Mv4DJN0M3Aywdu3aswx3bunM9XPw2IAbqM3sjE1Wg5ipz98IvARYDXxH0mXpQ3jrIuKAp
POBb0h6aPxT3BFxJ3AnQFtbW8xo5BVudIgNN1Cb2Zkq5ZzUB4A1eeur07J8+4FtETEYEY8Dj5AkDCLiQPq+F/gWcGUJY51zMukQG65BmNmZmjJBSHqVpDNJJNuBjZI2SGoAtgLjG72/QlJ7QNIykltOeyUtltSYV341J7dd2BTas92cs6CRJS0N5Q7FzGapYi78vwo8KulPJW0u9sQRMQTcAtwHtANfjIiHJd0u6bp0t/uAQ5J2Ad8E3hMRh4AtwA5JP07LP5Tf+8mmlunIufZgZmdlyjaIiHiDpAXADcCnJAXwSeDzEZGb4th7gXvHlb0vbzmAd6ev/H2+D1xW7B9hJxscHmHPM8d44aZl5Q7FzGaxom4dRUQ3cA9wN3AuyZwQP5L0jhLGZmfo8YM9DAyPsMU1CDM7C8W0QVwn6R9JGorrgasi4heBZ+GhOCpSe4d7MJnZ2Summ+trgb+MiO/kF0ZEr6TfLE1YdjYy2Rz1teL8Za3lDsXMZrFiEsT7gY7RFUnNwDkR8URE/HupArMzl+no5oLlrTTUlbIXs5nNdcVcQf4BGMlbH07LrEJlsjm2nOv2BzM7O8UkiLqIyJ8waABw5/oKdaR3gI6jfWz2HNRmdpaKSRCdec8tIOl64GDpQrKzcWIOCNcgzOzsFNMG8Vbgs5I+RjLE9z48w1zFyqQ9mLa4BmFmZ6mYB+UeA54nqTVdP1byqOyMZbI5lrQ0sHx+Y7lDMbNZrqjRXCX9EnAJyRDcAETE7SWMy85QezbH5pXzGf3vZGZ2pop5UO5vSMZjegfJLaZfAdaVOC47AyMjwSNZj8FkZtOjmEbqF0TEG4HDEfFHwPNJJ/qxyvJUVy/HB4f9BLWZTYtiEkRf+t4r6TxgkGQ8JqswY5MEuYHazKZBMW0QX5W0CPgz4EdAAB8vZVB2Zto7ctQINq5wgjCzszdpgkgnCvr3dArQL0n6Z6ApIo7ORHB2ejLZbtYva6G5obbcoZjZHDDpLaaIGAHuyFvvd3KoXJlszkN8m9m0KaYN4t8lvVbuN1nRevqHePJQr9sfzGzaFJMg3kIyOF+/pG5JOUndJY7LTtPupz3EhplNr2KepPZP0lkg05EmCNcgzGyaFPOg3IsKvYo5uaRrJe2WtEfSrRPs83pJuyQ9LOlzeeVvkvRo+npT8X9Sdcpku2ltrGP14uZyh2Jmc0Qx3Vzfk7fcBFwF3A/8/GQHSaolaeC+BtgPbJe0LSJ25e2zEbgNuDoiDktakZYvAf4QaCPpVnt/euzhov+yKpPp8BAbZja9pqxBRMSr8l7XAJcCxVyorwL2RMTedA6Ju4Hrx+3zW8Adoxf+iHgmLX858PWI6Eq3fR24trg/qfpEBO3Zbj9BbWbT6kzmpNwPbCliv1UkQ4PnH7dq3D6bgE2SvifpB5KuPY1jkXSzpB2SdnR2dhb9B8w1Pz3aR65vyGMwmdm0mvIWk6S/IrnNA0lCuYLkierp+vyNwEuA1cB3JF1W7MERcSdwJ0BbW1tMsfucNTYHhGsQZjaNimmD2JG3PAR8PiK+V8RxB4A1eeur07J8+4EfRsQg8LikR0gSxgGSpJF/7LeK+MyqNDqL3KZznCDMbPoUkyDuAfoiYhiSxmdJ8yKid4rjtgMbJW0gueBvBW4ct89XgBuAT0paRnLLaS/wGPBBSYvT/V5G0phtBbR3dLNmSTPzm+rLHYqZzSFFPUkN5PedbAb+baqDImIIuAW4D2gHvhgRD0u6PW+O6/uAQ5J2Ad8E3hMRhyKiC/gASZLZDtyellkBGc8BYWYlUEwNoil/mtGIOCZpXjEnj4h7gXvHlb0vbzmAd6ev8cfeBdxVzOdUs77BYfZ2HuMVl64sdyhmNscUU4PokfTs0RVJzwGOly4kOx17njnGSHiIDTObfsXUIH4H+AdJPyWZcnQlyRSkVgHaOzxJkJmVRjFjMW2XtBm4KC3anfY6sgqQyeZoqq9h3dKWcodiZnNMMWMxvR1oiYifRMRPgFZJ/730oVkxMtluLjpnPrU1HmLDzKZXMW0Qv5XOKAdAOvTFb5UsIitaRNDe4R5MZlYaxSSI2vzJgtJB+BpKF5IVq/NYP109Ax6DycxKophG6q8BX5D0t+n6W9IyK7MTc0C4BmFm06+YBPE/gJuBt6XrXwc+XrKIrGiZrHswmVnpFDPc90hE/E1EvC4iXgfsAv6q9KHZVDIdOVYuaGJxi+/4mdn0K6YGgaQrScZMej3wOPDlUgZlxWnP5tz+YGYlM2GCkLSJJCncABwEvgAoIn5uhmKzSQwOj7DnmRwv3rS83KGY2Rw1WQ0iA3wXeGVE7AGQ9K4ZicqmtLezh8Hh8BwQZlYyk7VBvAboAL4p6eOSXkoy1IZVgBMN1O7BZGalMWGCiIivRMRWYDPJUNy/A6yQ9NeSXjZD8dkE2jty1NeK85d7iA0zK41iejH1RMTnIuJVJDO7PUDS9dXKKJPt5sIV86mvPZNpxc3MpnZaV5eIOBwRd0bES0sVkBUn05Fji59/MLMS8s/PWehwzwDZ7j53cTWzknKCmIUyWQ+xYWal5wQxC431YHINwsxKqKQJQtK1knZL2iPp1gLbb5LUKWln+npz3rbhvPJtpYxztsl05Fja0sDy1sZyh2Jmc1hRQ22ciXRY8DuAa4D9wHZJ2yJi17hdvxARtxQ4xfGIuKJU8c1mmWw3m8+dT94o7GZm066UNYirgD0RsTciBoC7getL+HlVYXgk2P20Jwkys9IrZYJYBezLW9+flo33WkkPSrpH0pq88iZJOyT9QNKrC32ApJvTfXZ0dnZOX+QV7MlDPfQNjniIbzMruXI3Un8VWB8Rl5PMM/HpvG3rIqINuBH4iKQLxh+cPpPRFhFty5dXx6B1u9MeTFvOdQ3CzEqrlAniAJBfI1idlo2JiEMR0Z+ufgJ4Tt62A+n7XuBbwJUljHXWaM/mqBFcuKK13KGY2RxXygSxHdgoaYOkBmArcFJvJEnn5q1eB7Sn5YslNabLy4CrSSYqqnqZjm42LGuhqb623KGY2RxXsl5METEk6RbgPqAWuCsiHpZ0O7AjIrYBvy3pOmAI6AJuSg/fAvytpBGSJPahAr2fqlImm+Oy1QvLHYaZVYGSJQiAiLgXuHdc2fvylm8Dbitw3PeBy0oZ22x0rH+Ip7p6eX3b6nKHYmZVoNyN1HYadnuIDTObQU4Qs4iH2DCzmeQEMYtkOnLMb6xj1aLmcodiZlXACWIW8RAbZjaTnCBmiYgg0+EhNsxs5jhBzBIHjhwn1z/k9gczmzFOELNEpsM9mMxsZjlBzBKjPZgu8iB9ZjZDnCBmifZsjrVL5tHaWNJnG83MxjhBzBKZjm4P8W1mM8oJYhboGxzm8YM9bPYQ32Y2g5wgZoFHnz7GSMAW1yDMbAY5QcwC7WNDbLgGYWYzxwliFsh05Giur2XtknnlDsXMqogTxCyQyXazaeV8ams8xIaZzRwniAoXEbR3dLv9wcxmnBNEhevM9XO4d9BdXM1sxjlBVLj20UmC3EBtZjPMCaLCZTrSHkyuQZjZDCtpgpB0
raTdkvZIurXA9pskdUramb7enLftTZIeTV9vKmWclSyTzXHuwiYWzWsodyhmVmVKNrCPpFrgDuAaYD+wXdK2iNg1btcvRMQt445dAvwh0AYEcH967OFSxVup2j3EhpmVSSlrEFcBeyJib0QMAHcD1xd57MuBr0dEV5oUvg5cW6I4K9bA0AiPdR5z+4OZlUUpE8QqYF/e+v60bLzXSnpQ0j2S1pzOsZJulrRD0o7Ozs7pirti7D14jMHhcA3CzMqi3I3UXwXWR8TlJLWET5/OwRFxZ0S0RUTb8uXLSxJgOY1OErTFNQgzK4NSJogDwJq89dVp2ZiIOBQR/enqJ4DnFHtsNWjPdtNQW8OGZS3lDsXMqlApE8R2YKOkDZIagK3AtvwdJJ2bt3od0J4u3we8TNJiSYuBl6VlVSXTkePCFa3U15a7omdm1ahkvZgiYkjSLSQX9lrgroh4WNLtwI6I2Ab8tqTrgCGgC7gpPbZL0gdIkgzA7RHRVapYK1Um283VFy4rdxhmVqVKOn9lRNwL3Duu7H15y7cBt01w7F3AXaWMr5J19QzwdHc/W1a6/cHMysP3LipUZmwOCPdgMrPycIKoUKM9mDa7BmFmZeIEUaEy2W6WtTawfH5juUMxsyrlBFGhMtmcaw9mVlZOEBVoeCTYnc35CWozKysniAr0xKEe+odGPAaTmZWVE0QFOtFA7RqEmZWPE0QFymS7qa0RF65oLXcoZlbFnCAqUCab4/xlLTTV15Y7FDOrYk4QFSiT7eYi314yszJzgqgwub5B9nUd9xDfZlZ2ThAV5pGn3UBtZpXBCaLCtI/2YHINwszKzAmiwmSy3cxvquO8hU3lDsXMqpwTRIXJdOTYsnIBksodiplVOSeIChIRyRhMHuLbzCqAE0QF2X/4OMf6hzxIn5lVBCeICpLJjjZQuwZhZuXnBFFBMh3JLHIXneMEYWblV9IEIelaSbsl7ZF06yT7vVZSSGpL19dLOi5pZ/r6m1LGWSky2Rzrls6jpbGkU4WbmRWlZFciSbXAHcA1wH5gu6RtEbFr3H7zgXcCPxx3isci4opSxVeJ2rPdfkDOzCpGKX+qXgXsiYi9AJLuBq4Hdo3b7wPAnwDvKWEsExoeCQ4e60eCGil9gdL30bIT2xlbn86uqMcHhnniYA+vuvy8aTunmdnZKGWCWAXsy1vfDzw3fwdJzwbWRMT/kzQ+QWyQ9ADQDfxBRHy3FEEe6R3guR/89zM6Nj9pTJ1Q8ren+9ec2H9oZISRgC1uoDazClG2m92SaoAPAzcV2NwBrI2IQ5KeA3xF0iUR0T3uHDcDNwOsXbv2jOJoaazjg798GSMRRAQjASPpe7J+oiwCRkby14vYP68sIhgZOXX/4XS/521YytUXLjujv8PMbLqVMkEcANbkra9Oy0bNBy4FvpXeqlkJbJN0XUTsAPoBIuJ+SY8Bm4Ad+R8QEXcCdwK0tbXFmQTZVF/Ljc89s+RiZjaXlbIX03Zgo6QNkhqArcC20Y0RcTQilkXE+ohYD/wAuC4idkhanjZyI+l8YCOwt4SxmpnZOCWrQUTEkKRbgPuAWuCuiHhY0u3AjojYNsnhLwJulzQIjABvjYiuUsVqZmanUsQZ3ZmpOG1tbbFjx46pdzQzszGS7o+ItkLb/CS1mZkV5ARhZmYFOUGYmVlBThBmZlaQE4SZmRU0Z3oxSeoEnjyLUywDDk5TOLOdv4uT+fs4mb+PE+bCd7EuIpYX2jBnEsTZkrRjoq5e1cbfxcn8fZzM38cJc/278C0mMzMryAnCzMwKcoI44c5yB1BB/F2czN/Hyfx9nDCnvwu3QZiZWUGuQZiZWUFOEGZmVlDVJwhJ10raLWmPpFvLHU85SVoj6ZuSdkl6WNI7yx1TuUmqlfSApH8udyzlJmmRpHskZSS1S3p+uWMqJ0nvSv8/+Ymkz0tqKndM062qE0Q6KdEdwC8CFwM3SLq4vFGV1RDwuxFxMfA84O1V/n0AvBNoL3cQFeKjwNciYjPwLKr4e5G0CvhtoC0iLiWZ82ZreaOaflWdIICrgD0RsTciBoC7gevLHFPZRERHRPwoXc6RXABWlTeq8pG0Gvgl4BPljqXcJC0kmcjr7wAiYiAijpQ1qPKrA5ol1QHzgJ+WOZ5pV+0JYhWwL299P1V8QcwnaT1wJfDDModSTh8B3ksyq2G12wB0Ap9Mb7l9QlJLuYMql4g4APw58BTQARyNiH8tb1TTr9oThBUgqRX4EvA7EdFd7njKQdIrgWci4v5yx1Ih6oBnA38dEVcCPUDVttlJWkxyt2EDcB7QIukN5Y1q+lV7gjgArMlbX52WVS1J9STJ4bMR8eVyx1NGVwPXSXqC5Nbjz0v6THlDKqv9wP6IGK1R3kOSMKrVLwCPR0RnRAwCXwZeUOaYpl21J4jtwEZJGyQ1kDQybStzTGUjSST3mNsj4sPljqecIuK2iFgdEetJ/l18IyLm3C/EYkVEFtgn6aK06KXArjKGVG5PAc+TNC/9/+alzMFG+7pyB1BOETEk6RbgPpJeCHdFxMNlDqucrgZ+HXhI0s607Pcj4t7yhWQV5B3AZ9MfU3uB/1bmeMomIn4o6R7gRyS9/x5gDg674aE2zMysoGq/xWRmZhNwgjAzs4KcIMzMrCAnCDMzK8gJwszMCnKCMEtJOpa+r5d04zSf+/fHrX9/Os9vVgpOEGanWg+cVoJIB2ybzEkJIiLm3FO3Nvc4QZid6kPACyXtTMf8r5X0Z5K2S3pQ0lsAJL1E0nclbSN9qljSVyTdn84TcHNa9iGSUT93SvpsWjZaW1F67p9IekjSr+ad+1t58y98Nn1iF0kfSufseFDSn8/4t2NVo6qfpDabwK3A70XEKwHSC/3RiPgZSY3A9ySNjtz5bODSiHg8Xf+NiOiS1Axsl/SliLhV0i0RcUWBz3oNcAXJ/ArL0mO+k267EriEZBjp7wFXS2oHfhnYHBEhadH0/ulmJ7gGYTa1lwFvTIcf+SGwFNiYbvuvvOQA8NuSfgz8gGQgyI1M7meBz0fEcEQ8DXwb+Jm8c++PiBFgJ8mtr6NAH/B3kl4D9J7l32Y2IScIs6kJeEdEXJG+NuSN/d8ztpP0EpJRPp8fEc8iGZ/nbKah7M9bHgbqImKIZKKre4BXAl87i/ObTcoJwuxUOWB+3vp9wNvSodCRtGmCyXIWAocjolfSZpJpW0cNjh4/zneBX03bOZaTzNr2XxMFls7VsTAdQPFdJLemzErCbRBmp3oQGE5vFX2KZC7m9cCP0obiTuDVBY77GvDWtJ1gN8ltplF3Ag9K+lFE/Fpe+T8Czwd+DATw3ojIpgmmkPnAP0lqIqnZvPuM/kKzIng0VzMzK8i3mMzMrCAnCDMzK8gJwszMCnKCMDOzgpwgzMysICcIMzMryAnCzMwK+v9dS7Ovcb84WwAAAABJRU5ErkJggg==\n",
       "text/plain": [
        "<Figure size 432x288 with 1 Axes>"
       ]
@@ -509,7 +507,7 @@
     {
      "data": {
       "text/plain": [
-       "0.8053976582616722"
+       "0.798340863819657"
       ]
      },
      "execution_count": 15,
@@ -782,7 +780,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 25,
    "metadata": {
     "scrolled": true
    },
@@ -799,7 +797,9 @@
      "output_type": "stream",
      "text": [
       "<ipython-input-22-78c27bb59095>:15: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
-      "  x = (x + torch.tensor([1.0])) / 2.0\n"
+      "  x = (x + torch.tensor([1.0])) / 2.0\n",
+      "/workspace/brevitas/src/brevitas/quant_tensor/__init__.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
+      "  training = torch.tensor(training, dtype=torch.bool)\n"
      ]
     }
    ],
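Note on the new warning recorded above: the second TracerWarning comes from Brevitas' QuantTensor wrapping its `training` flag in `torch.tensor(...)`; during `torch.jit.trace`, any tensor created this way is baked into the traced graph as a constant. A minimal sketch reproducing the pattern from the notebook cell quoted in this hunk (the input shape is illustrative):

    import torch

    def scale(x):
        # torch.tensor(...) built inside a traced function is captured as a
        # constant, which is exactly what the TracerWarning points out
        return (x + torch.tensor([1.0])) / 2.0

    traced = torch.jit.trace(scale, torch.zeros(4))  # emits the TracerWarning

Both warnings are benign here, since the wrapped values are the same on every call.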
@@ -843,7 +843,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -867,10 +867,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f49738bffa0>"
+       "<IPython.lib.display.IFrame at 0x7fb36398c3a0>"
       ]
      },
-     "execution_count": 28,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb
index 6ac4e52072d71f527e4ec5d923a76851b77dc247..a0fef1ab6112e734abfc5d5a22a526b41f5503a5 100644
--- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb
+++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb
@@ -169,7 +169,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -177,13 +177,13 @@
      "output_type": "stream",
      "text": [
       "Input tensor name: 0\n",
-      "Output tensor name: 78\n",
+      "Output tensor name: 73\n",
       "Input tensor shape: [1, 600]\n",
       "Output tensor shape: [1, 1]\n",
-      "Input tensor datatype: DataType.BIPOLAR\n",
-      "Output tensor datatype: DataType.FLOAT32\n",
+      "Input tensor datatype: BIPOLAR\n",
+      "Output tensor datatype: FLOAT32\n",
       "List of node operator types in the graph: \n",
-      "['Add', 'Div', 'MatMul', 'Add', 'Mul', 'Unsqueeze', 'BatchNormalization', 'Squeeze', 'MultiThreshold', 'Mul', 'MatMul', 'Add', 'Mul', 'Unsqueeze', 'BatchNormalization', 'Squeeze', 'MultiThreshold', 'Mul', 'MatMul', 'Add', 'Mul', 'Unsqueeze', 'BatchNormalization', 'Squeeze', 'MultiThreshold', 'Mul', 'MatMul', 'Add', 'Mul', 'MultiThreshold']\n"
+      "['Mul', 'Add', 'Div', 'MatMul', 'Mul', 'Add', 'BatchNormalization', 'MultiThreshold', 'Mul', 'MatMul', 'Mul', 'Add', 'BatchNormalization', 'MultiThreshold', 'Mul', 'MatMul', 'Mul', 'Add', 'BatchNormalization', 'MultiThreshold', 'Mul', 'MatMul', 'Mul', 'Add', 'MultiThreshold']\n"
      ]
     }
    ],
@@ -200,8 +200,8 @@
     "print(\"Output tensor shape: %s\" % str(finnonnx_model_out_shape))\n",
     "finnonnx_model_in_dt = model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)\n",
     "finnonnx_model_out_dt = model_for_sim.get_tensor_datatype(finnonnx_out_tensor_name)\n",
-    "print(\"Input tensor datatype: %s\" % str(model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)))\n",
-    "print(\"Output tensor datatype: %s\" % str(model_for_sim.get_tensor_datatype(finnonnx_out_tensor_name)))\n",
+    "print(\"Input tensor datatype: %s\" % str(finnonnx_model_in_dt.name))\n",
+    "print(\"Output tensor datatype: %s\" % str(finnonnx_model_out_dt.name))\n",
     "print(\"List of node operator types in the graph: \")\n",
     "print([x.op_type for x in model_for_sim.graph.node])"
    ]
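The datatype lines change from `DataType.BIPOLAR` to `BIPOLAR` because the updated cell prints the datatype's `.name` attribute instead of `str()` of the member itself. A minimal sketch of the distinction using Python's standard `Enum` (the real FINN `DataType` may be implemented differently, but the str-vs-name behaviour is the same idea):

    from enum import Enum

    class DataType(Enum):
        BIPOLAR = 0
        FLOAT32 = 1

    dt = DataType.BIPOLAR
    print(str(dt))  # "DataType.BIPOLAR" - class name included
    print(dt.name)  # "BIPOLAR" - member name only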
@@ -226,7 +226,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -262,7 +262,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -286,10 +286,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f388298b470>"
+       "<IPython.lib.display.IFrame at 0x7f3be619b2b0>"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -311,7 +311,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -320,7 +320,7 @@
        "torch.Size([100, 593])"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -356,16 +356,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "IncompatibleKeys(missing_keys=[], unexpected_keys=[])"
+       "<All keys matched successfully>"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
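The changed output in this hunk likely reflects the newer PyTorch inside the updated container rather than any change to the notebook logic: `load_state_dict` still returns a named tuple of `missing_keys` and `unexpected_keys`, but recent PyTorch versions give it a friendlier repr when both lists are empty. A minimal sketch (the exact string depends on the installed PyTorch version):

    import torch.nn as nn

    model = nn.Linear(4, 2)
    # loading a state dict that matches the model exactly
    result = model.load_state_dict(model.state_dict())
    print(result)  # recent PyTorch prints "<All keys matched successfully>"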
@@ -409,7 +409,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -441,7 +441,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -476,14 +476,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "ok 100 nok 0: 100%|██████████| 100/100 [00:47<00:00,  2.09it/s]\n"
+      "ok 100 nok 0: 100%|██████████| 100/100 [00:21<00:00,  4.72it/s]\n"
      ]
     }
    ],
@@ -511,7 +511,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -560,7 +560,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py
index 45651faa5a9a57e9a1d0d784b15ebe8945d9ddd7..738811fa72754f059ce7e62196deae8a824f03f6 100644
--- a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py
+++ b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py
@@ -26,12 +26,12 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import torch
-import pandas as pd
+import math
 import numpy as np
+import pandas as pd
+import torch
 from sklearn import preprocessing
 from sklearn.preprocessing import OneHotEncoder
-import math
 
 # quantize the UNSW_NB15 dataset and convert it to binary vectors
 # reimplementation
@@ -112,7 +112,7 @@ class UNSW_NB15_quantized(torch.utils.data.Dataset):
 
     def round_like_matlab_number(self, n: np.float64) -> int:
         """Round the input "n" like matlab uint32(n) cast (which also rounds) e.g.
-        0.5->1;  1.5->2; 2.3->2;   2.45->2 """
+        0.5->1;  1.5->2; 2.3->2;   2.45->2"""
         if n - math.floor(n) < 0.5:
             return math.floor(n)
         return math.ceil(n)
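For reference, the function touched above implements round-half-up (as MATLAB's `uint32()` cast does), which differs from Python's built-in `round()` that rounds halves to the nearest even integer. A quick check against the values in the docstring:

    import math

    def round_like_matlab_number(n):
        # round .5 up instead of to the nearest even integer
        if n - math.floor(n) < 0.5:
            return math.floor(n)
        return math.ceil(n)

    assert round_like_matlab_number(0.5) == 1   # built-in round(0.5) == 0
    assert round_like_matlab_number(1.5) == 2
    assert round_like_matlab_number(2.3) == 2
    assert round_like_matlab_number(2.45) == 2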
diff --git a/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py
index 622c69c8d0abdf8025b0486c63bf336e4f8675f5..0ffb525544845757c90f30799fbad472d389348f 100644
--- a/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py
+++ b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py
@@ -27,9 +27,9 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
+import numpy as np
 from driver import io_shape_dict
 from driver_base import FINNExampleOverlay
-import numpy as np
 
 
 def make_unsw_nb15_test_batches(bsize, dataset_root):
diff --git a/requirements.txt b/requirements.txt
index de007ace503de9f110027b4190d5db6188633575..da0ec0b63092f0618bb7c9982b95fa90e8f91118 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ onnx==1.7.0
 onnxoptimizer
 onnxruntime==1.4.0
 pre-commit==2.6.0
+pyscaffold==3.2.1
 scipy==1.5.2
 setupext-janitor>=1.1.2
 toposort==1.5
diff --git a/run-docker.sh b/run-docker.sh
index 6e8439810c40da4d78229300b20d4728119a92ab..2abd67f0679b32a09e51d03efe548bdc095c11a0 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -41,24 +41,19 @@ recho () {
   echo -e "${RED}$1${NC}"
 }
 
-if [ -z "$VIVADO_PATH" ];then
-  recho "Please set the VIVADO_PATH that contains the path to your Vivado installation directory."
-  recho "FINN functionality depending on Vivado or Vivado HLS will not be available."
+if [ -z "$FINN_XILINX_PATH" ];then
+  recho "Please set the FINN_XILINX_PATH environment variable to the path to your Xilinx tools installation directory (e.g. /opt/Xilinx)."
+  recho "FINN functionality depending on Vivado, Vitis or HLS will not be available."
 fi
 
-if [ -z "$PYNQ_IP" ];then
-  recho "Please set the PYNQ_IP env.var. to enable PYNQ deployment tests."
+if [ -z "$FINN_XILINX_VERSION" ];then
+  recho "Please set the FINN_XILINX_VERSION to the version of the Xilinx tools to use (e.g. 2020.1)"
+  recho "FINN functionality depending on Vivado, Vitis or HLS will not be available."
 fi
 
-if [ -z "$VITIS_PATH" ];then
-  recho "Please set the VITIS_PATH that contains the path to your Vitis installation directory."
-  recho "FINN functionality depending on Vitis will not be available."
-else
-  if [ -z "$PLATFORM_REPO_PATHS" ];then
-    recho "Please set PLATFORM_REPO_PATHS pointing to Vitis platform files (DSAs)."
-    recho "This is required to be able to use Vitis."
-    exit -1
-  fi
+if [ -z "$PLATFORM_REPO_PATHS" ];then
+  recho "Please set PLATFORM_REPO_PATHS pointing to Vitis platform files (DSAs)."
+  recho "This is required to be able to use Alveo PCIe cards."
 fi
 
 DOCKER_GID=$(id -g)
@@ -66,21 +61,8 @@ DOCKER_GNAME=$(id -gn)
 DOCKER_UNAME=$(id -un)
 DOCKER_UID=$(id -u)
 DOCKER_PASSWD="finn"
-# generate a random number per-run to allow multiple
-# containers from the same user
-DOCKER_RND=$(shuf -i0-32768 -n1)
-# create a separate tag when Vitis enabled, since Docker image needs
-# additional dependencies installed
-if [ ! -z "$VITIS_PATH" ];then
-  DOCKER_TAG="finn_dev_vitis_${DOCKER_UNAME}"
-else
-  DOCKER_TAG="finn_dev_${DOCKER_UNAME}"
-fi
-# uncomment to run multiple instances with different names
-# DOCKER_INST_NAME="finn_${DOCKER_UNAME}_${DOCKER_RND}"
 DOCKER_INST_NAME="finn_dev_${DOCKER_UNAME}"
-# ensure Docker tag and inst. name are all lowercase
-DOCKER_TAG=$(echo "$DOCKER_TAG" | tr '[:upper:]' '[:lower:]')
+# ensure Docker inst. name is all lowercase
 DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]')
 # Absolute path to this script, e.g. /home/user/bin/foo.sh
 SCRIPT=$(readlink -f "$0")
@@ -97,7 +79,7 @@ SCRIPTPATH=$(dirname "$SCRIPT")
 : ${PYNQ_PASSWORD="xilinx"}
 : ${PYNQ_BOARD="Pynq-Z1"}
 : ${PYNQ_TARGET_DIR="/home/xilinx/$DOCKER_INST_NAME"}
-: ${NUM_DEFAULT_WORKERS=1}
+: ${NUM_DEFAULT_WORKERS=4}
 : ${FINN_SSH_KEY_DIR="$SCRIPTPATH/ssh_keys"}
 : ${ALVEO_USERNAME="alveo_user"}
 : ${ALVEO_PASSWORD=""}
@@ -106,9 +88,15 @@ SCRIPTPATH=$(dirname "$SCRIPT")
 : ${PLATFORM_REPO_PATHS="/opt/xilinx/platforms"}
 : ${XRT_DEB_VERSION="xrt_202010.2.7.766_18.04-amd64-xrt"}
 : ${FINN_HOST_BUILD_DIR="/tmp/$DOCKER_INST_NAME"}
+: ${FINN_DOCKER_TAG="xilinx/finn:$(git describe --tags --dirty).$XRT_DEB_VERSION"}
+: ${FINN_DOCKER_PREBUILT="0"}
+: ${FINN_DOCKER_RUN_AS_ROOT="0"}
+: ${FINN_DOCKER_GPU="$(docker info | grep nvidia | wc -m)"}
+: ${FINN_DOCKER_EXTRA=""}
+: ${NVIDIA_VISIBLE_DEVICES=""}
+: ${DOCKER_BUILDKIT="1"}
 
 DOCKER_INTERACTIVE=""
-DOCKER_EXTRA=""
 
 if [ "$1" = "test" ]; then
   gecho "Running test suite (all tests)"
@@ -123,21 +111,21 @@ elif [ "$1" = "notebook" ]; then
   else
     JUPYTER_PASSWD_ARG="--NotebookApp.password='$JUPYTER_PASSWD_HASH'"
   fi
-  DOCKER_CMD="jupyter notebook --no-browser --ip=0.0.0.0 --port $JUPYTER_PORT $JUPYTER_PASSWD_ARG notebooks"
-  DOCKER_EXTRA+="-e JUPYTER_PORT=$JUPYTER_PORT "
-  DOCKER_EXTRA+="-e NETRON_PORT=$NETRON_PORT "
-  DOCKER_EXTRA+="-p $JUPYTER_PORT:$JUPYTER_PORT "
-  DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT "
+  DOCKER_CMD="jupyter notebook --allow-root --no-browser --ip=0.0.0.0 --port $JUPYTER_PORT $JUPYTER_PASSWD_ARG notebooks"
+  FINN_DOCKER_EXTRA+="-e JUPYTER_PORT=$JUPYTER_PORT "
+  FINN_DOCKER_EXTRA+="-e NETRON_PORT=$NETRON_PORT "
+  FINN_DOCKER_EXTRA+="-p $JUPYTER_PORT:$JUPYTER_PORT "
+  FINN_DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT "
 elif [ "$1" = "build_dataflow" ]; then
   BUILD_DATAFLOW_DIR=$(readlink -f "$2")
-  DOCKER_EXTRA="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR"
+  FINN_DOCKER_EXTRA="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR "
   DOCKER_INTERACTIVE="-it"
   #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build
   gecho "Running build_dataflow for folder $BUILD_DATAFLOW_DIR"
   DOCKER_CMD="build_dataflow $BUILD_DATAFLOW_DIR"
 elif [ "$1" = "build_custom" ]; then
   BUILD_CUSTOM_DIR=$(readlink -f "$2")
-  DOCKER_EXTRA="-v $BUILD_CUSTOM_DIR:$BUILD_CUSTOM_DIR -w $BUILD_CUSTOM_DIR"
+  FINN_DOCKER_EXTRA="-v $BUILD_CUSTOM_DIR:$BUILD_CUSTOM_DIR -w $BUILD_CUSTOM_DIR "
   DOCKER_INTERACTIVE="-it"
   #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build
   gecho "Running build_custom: $BUILD_CUSTOM_DIR/build.py"
@@ -148,49 +136,47 @@ else
   DOCKER_INTERACTIVE="-it"
 fi
 
+if [ "$FINN_DOCKER_GPU" != 0 ];then
+  gecho "nvidia-docker detected, enabling GPUs"
+  if [ ! -z "$NVIDIA_VISIBLE_DEVICES" ];then
+    FINN_DOCKER_EXTRA+="--runtime nvidia -e NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES "
+  else
+    FINN_DOCKER_EXTRA+="--gpus all "
+  fi
+fi
+
 VIVADO_HLS_LOCAL=$VIVADO_PATH
 VIVADO_IP_CACHE=$FINN_HOST_BUILD_DIR/vivado_ip_cache
-INSTALL_XRT_DEPS=0
 
 # ensure build dir exists locally
 mkdir -p $FINN_HOST_BUILD_DIR
 mkdir -p $FINN_SSH_KEY_DIR
 
 gecho "Docker container is named $DOCKER_INST_NAME"
+gecho "Docker tag is named $FINN_DOCKER_TAG"
 gecho "Mounting $FINN_HOST_BUILD_DIR into $FINN_HOST_BUILD_DIR"
-gecho "Mounting $VIVADO_PATH into $VIVADO_PATH"
-if [ ! -z "$VITIS_PATH" ];then
-  gecho "Mounting $VITIS_PATH into $VITIS_PATH"
-  INSTALL_XRT_DEPS=1
-fi
+gecho "Mounting $FINN_XILINX_PATH into $FINN_XILINX_PATH"
 gecho "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
 gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE"
 gecho "Using default PYNQ board $PYNQ_BOARD"
 
 # Build the FINN Docker image
-# Need to ensure this is done within the finn/ root folder:
-OLD_PWD=$(pwd)
-cd $SCRIPTPATH
-docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \
-             --build-arg GID=$DOCKER_GID \
-             --build-arg GNAME=$DOCKER_GNAME \
-             --build-arg UNAME=$DOCKER_UNAME \
-             --build-arg UID=$DOCKER_UID \
-             --build-arg PASSWD=$DOCKER_PASSWD \
-             --build-arg INSTALL_XRT_DEPS=$INSTALL_XRT_DEPS \
-             --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION \
-             .
-cd $OLD_PWD
+if [ "$FINN_DOCKER_PREBUILT" = "0" ]; then
+  # Need to ensure this is done within the finn/ root folder:
+  OLD_PWD=$(pwd)
+  cd $SCRIPTPATH
+  docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --tag=$FINN_DOCKER_TAG .
+  cd $OLD_PWD
+fi
 # Launch container with current directory mounted
 # important to pass the --init flag here for correct Vivado operation, see:
 # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins
-DOCKER_EXEC="docker run -t --rm $DOCKER_INTERACTIVE --init "
+DOCKER_EXEC="docker run -t --rm $DOCKER_INTERACTIVE --tty --init "
 DOCKER_EXEC+="--hostname $DOCKER_INST_NAME "
 DOCKER_EXEC+="-e SHELL=/bin/bash "
 DOCKER_EXEC+="-v $SCRIPTPATH:/workspace/finn "
 DOCKER_EXEC+="-v $FINN_HOST_BUILD_DIR:$FINN_HOST_BUILD_DIR "
-DOCKER_EXEC+="-v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh "
 DOCKER_EXEC+="-e FINN_BUILD_DIR=$FINN_HOST_BUILD_DIR "
 DOCKER_EXEC+="-e FINN_ROOT="/workspace/finn" "
 DOCKER_EXEC+="-e LOCALHOST_URL=$LOCALHOST_URL "
@@ -201,31 +187,42 @@ DOCKER_EXEC+="-e PYNQ_USERNAME=$PYNQ_USERNAME "
 DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD "
 DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR "
 DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS "
+if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ];then
+  DOCKER_EXEC+="-v /etc/group:/etc/group:ro "
+  DOCKER_EXEC+="-v /etc/passwd:/etc/passwd:ro "
+  DOCKER_EXEC+="-v /etc/shadow:/etc/shadow:ro "
+  DOCKER_EXEC+="-v /etc/sudoers.d:/etc/sudoers.d:ro "
+  DOCKER_EXEC+="-v $FINN_SSH_KEY_DIR:$HOME/.ssh "
+  DOCKER_EXEC+="--user $DOCKER_UID:$DOCKER_GID "
+else
+  DOCKER_EXEC+="-v $FINN_SSH_KEY_DIR:/root/.ssh "
+fi
 if [ ! -z "$IMAGENET_VAL_PATH" ];then
   DOCKER_EXEC+="-v $IMAGENET_VAL_PATH:$IMAGENET_VAL_PATH "
   DOCKER_EXEC+="-e IMAGENET_VAL_PATH=$IMAGENET_VAL_PATH "
 fi
-if [ ! -z "$VIVADO_PATH" ];then
-  DOCKER_EXEC+="-e "XILINX_VIVADO=$VIVADO_PATH" "
-  DOCKER_EXEC+="-v $VIVADO_PATH:$VIVADO_PATH "
-  DOCKER_EXEC+="-e VIVADO_PATH=$VIVADO_PATH "
-fi
-if [ ! -z "$VITIS_PATH" ];then
-  if [ -z "$PLATFORM_REPO_PATHS" ];then
-    recho "PLATFORM_REPO_PATHS must be set for Vitis/Alveo flows"
-    exit -1
+if [ ! -z "$FINN_XILINX_PATH" ];then
+  VIVADO_PATH="$FINN_XILINX_PATH/Vivado/$FINN_XILINX_VERSION"
+  VITIS_PATH="$FINN_XILINX_PATH/Vitis/$FINN_XILINX_VERSION"
+  DOCKER_EXEC+="-v $FINN_XILINX_PATH:$FINN_XILINX_PATH "
+  if [ -d "$VIVADO_PATH" ];then
+    DOCKER_EXEC+="-e "XILINX_VIVADO=$VIVADO_PATH" "
+    DOCKER_EXEC+="-e VIVADO_PATH=$VIVADO_PATH "
+  fi
+  if [ -d "$VITIS_PATH" ];then
+    DOCKER_EXEC+="-e VITIS_PATH=$VITIS_PATH "
+  fi
+  if [ -d "$PLATFORM_REPO_PATHS" ];then
+    DOCKER_EXEC+="-v $PLATFORM_REPO_PATHS:$PLATFORM_REPO_PATHS "
+    DOCKER_EXEC+="-e PLATFORM_REPO_PATHS=$PLATFORM_REPO_PATHS "
+    DOCKER_EXEC+="-e ALVEO_IP=$ALVEO_IP "
+    DOCKER_EXEC+="-e ALVEO_USERNAME=$ALVEO_USERNAME "
+    DOCKER_EXEC+="-e ALVEO_PASSWORD=$ALVEO_PASSWORD "
+    DOCKER_EXEC+="-e ALVEO_BOARD=$ALVEO_BOARD "
+    DOCKER_EXEC+="-e ALVEO_TARGET_DIR=$ALVEO_TARGET_DIR "
   fi
-  DOCKER_EXEC+="-v $VITIS_PATH:$VITIS_PATH "
-  DOCKER_EXEC+="-v $PLATFORM_REPO_PATHS:$PLATFORM_REPO_PATHS "
-  DOCKER_EXEC+="-e VITIS_PATH=$VITIS_PATH "
-  DOCKER_EXEC+="-e PLATFORM_REPO_PATHS=$PLATFORM_REPO_PATHS "
-  DOCKER_EXEC+="-e ALVEO_IP=$ALVEO_IP "
-  DOCKER_EXEC+="-e ALVEO_USERNAME=$ALVEO_USERNAME "
-  DOCKER_EXEC+="-e ALVEO_PASSWORD=$ALVEO_PASSWORD "
-  DOCKER_EXEC+="-e ALVEO_BOARD=$ALVEO_BOARD "
-  DOCKER_EXEC+="-e ALVEO_TARGET_DIR=$ALVEO_TARGET_DIR "
 fi
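+# Example (values assumed): with FINN_XILINX_PATH=/opt/Xilinx and
+# FINN_XILINX_VERSION=2020.1, Vivado is expected at /opt/Xilinx/Vivado/2020.1
+# and Vitis at /opt/Xilinx/Vitis/2020.1; only directories that exist are wired up.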
-DOCKER_EXEC+="$DOCKER_EXTRA "
-DOCKER_EXEC+="$DOCKER_TAG $DOCKER_CMD"
+DOCKER_EXEC+="$FINN_DOCKER_EXTRA "
+DOCKER_EXEC+="$FINN_DOCKER_TAG $DOCKER_CMD"
 
 $DOCKER_EXEC
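+# Illustrative (assumed workflow): extra docker flags can be injected through
+# FINN_DOCKER_EXTRA, e.g. FINN_DOCKER_EXTRA="-v /scratch:/scratch " (note the
+# trailing space) to expose an additional host directory in the container.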
diff --git a/setup.cfg b/setup.cfg
index e98077ddf1e60f69513d85193374414636a0f355..96618e0ffcb8dcb217185c67948a71a132a7b45a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -58,9 +58,6 @@ package_dir =
     =src
 # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD!
 setup_requires = pyscaffold>=3.2a0,<3.3a0
-# finn-base is added specifically to be able to build on readthedocs
-install_requires =
-    finn-base @ git+https://github.com/Xilinx/finn-base#egg=finn-base
 # The usage of test_requires is discouraged, see `Dependency Management` docs
 # tests_require = pytest; pytest-cov
 # Require a specific Python version, e.g. Python 2.7 or >= 3.4
@@ -75,6 +72,19 @@ exclude =
 # Add here additional requirements for extra features, to install with:
 # `pip install FINN[PDF]` like:
 # PDF = ReportLab; RXP
+# finn-base is needed to build the full set of docs
+docs =
+    finn-base==0.0.3
+    docutils==0.17.1
+    dataclasses-json==0.5.2
+    gspread==3.6.0
+    pytest
+    netron
+    vcdvcd
+    torchvision
+    torch
+    qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx
+
 # Add here test requirements (semicolon/line-separated)
 testing =
     pytest
diff --git a/setup.py b/setup.py
index d7e158b56010fbc9ba2fb9f143ea2fc8d8a901d9..8fd781462c22f029071ac441f2be33e2f525bd47 100644
--- a/setup.py
+++ b/setup.py
@@ -35,10 +35,11 @@
     PyScaffold helps you to put up the scaffold of your new Python project.
     Learn more under: https://pyscaffold.org/
 """
-import sys
 from pkg_resources import VersionConflict, require
 from setuptools import setup
 
+import sys
+
 try:
     require("setuptools>=38.3")
 except VersionConflict:
diff --git a/src/finn/analysis/fpgadataflow/floorplan_params.py b/src/finn/analysis/fpgadataflow/floorplan_params.py
index 4c8cbf53de1ae7dc951911678a3f118bd3506dfe..9ba99fb546587ba3c2f385c958ecb172f8903bf7 100644
--- a/src/finn/analysis/fpgadataflow/floorplan_params.py
+++ b/src/finn/analysis/fpgadataflow/floorplan_params.py
@@ -26,8 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.custom_op.registry import getCustomOp
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 def floorplan_params(model):
diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index 39d6332aa42594528fbd5a04dd5efad2c3237e77..aff99efd807d8b04dc6490b299d66c0be8d8fc44 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -25,8 +25,8 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import warnings
 import os
+import warnings
 import xml.etree.ElementTree as ET
 
 import finn.custom_op.registry as registry
diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py
index 79204c54cdb8233fd7b65968c25af819fce91959..4b817910949fa750f34a53592413bb38c7557c08 100644
--- a/src/finn/analysis/fpgadataflow/post_synth_res.py
+++ b/src/finn/analysis/fpgadataflow/post_synth_res.py
@@ -29,9 +29,9 @@
 import os
 import xml.etree.ElementTree as ET
 
-from finn.transformation.move_reshape import _is_fpgadataflow_node
 from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
+from finn.transformation.move_reshape import _is_fpgadataflow_node
 
 
 def post_synth_res(model, override_synth_report_filename=None):
diff --git a/src/finn/analysis/verify_custom_nodes.py b/src/finn/analysis/verify_custom_nodes.py
index 9af1e9a4fe83de24f64a7e9df535bcf78f5fc234..62dac2827f11d290c5a50137e12684eb93326297 100644
--- a/src/finn/analysis/verify_custom_nodes.py
+++ b/src/finn/analysis/verify_custom_nodes.py
@@ -32,7 +32,8 @@ from finn.util.basic import is_finn_op
 
 def verify_nodes(model):
     """Checks if custom ops in graph are correctly built, with all attributes
-    and inputs.
+    and inputs. Please note that many FINN CustomOps don't yet implement the
+    verify_node function required for this analysis pass to work correctly.
 
     Returns {node op_type : info_messages}
 
diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py
index c46bfa48dff289d37f2cb2a89cdbef8e2789317f..c4664a5471984e1f88a70f1d9bb6ce674e38c782 100644
--- a/src/finn/builder/build_dataflow.py
+++ b/src/finn/builder/build_dataflow.py
@@ -26,20 +26,21 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from finn.core.modelwrapper import ModelWrapper
-import os
-import json
-import time
 import clize
-import sys
+import json
 import logging
+import os
 import pdb  # NOQA
+import sys
+import time
 import traceback
-from finn.builder.build_dataflow_steps import build_dataflow_step_lookup
+
 from finn.builder.build_dataflow_config import (
     DataflowBuildConfig,
     default_build_dataflow_steps,
 )
+from finn.builder.build_dataflow_steps import build_dataflow_step_lookup
+from finn.core.modelwrapper import ModelWrapper
 
 
 # adapted from https://stackoverflow.com/a/39215961
@@ -61,7 +62,7 @@ class StreamToLogger(object):
         pass
 
 
-def resolve_build_steps(cfg: DataflowBuildConfig):
+def resolve_build_steps(cfg: DataflowBuildConfig, partial: bool = True):
     steps = cfg.steps
     if steps is None:
         steps = default_build_dataflow_steps
@@ -75,19 +76,56 @@ def resolve_build_steps(cfg: DataflowBuildConfig):
             steps_as_fxns.append(transform_step)
         else:
             raise Exception("Could not resolve build step: " + str(transform_step))
+    if partial:
+        step_names = list(map(lambda x: x.__name__, steps_as_fxns))
+        if cfg.start_step is None:
+            start_ind = 0
+        else:
+            start_ind = step_names.index(cfg.start_step)
+        if cfg.stop_step is None:
+            stop_ind = len(step_names) - 1
+        else:
+            stop_ind = step_names.index(cfg.stop_step)
+        steps_as_fxns = steps_as_fxns[start_ind : (stop_ind + 1)]
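+        # e.g. start_step="step_streamline", stop_step="step_convert_to_hls"
+        # (step names from the default list) keeps only that contiguous slice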
+
     return steps_as_fxns
 
 
+def resolve_step_filename(
+    step_name: str, cfg: DataflowBuildConfig, step_delta: int = 0
+):
+    step_names = list(
+        map(lambda x: x.__name__, resolve_build_steps(cfg, partial=False))
+    )
+    assert step_name in step_names, "start_step %s not found" % step_name
+    step_no = step_names.index(step_name) + step_delta
+    assert step_no >= 0, "Invalid step+delta combination"
+    assert step_no < len(step_names), "Invalid step+delta combination"
+    filename = cfg.output_dir + "/intermediate_models/"
+    filename += "%s.onnx" % (step_names[step_no])
+    return filename
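+
+# Hypothetical usage sketch: locate the checkpoint written by the step that
+# precedes a given step (requires a prior run with save_intermediate_models):
+#   fn = resolve_step_filename("step_create_dataflow_partition", cfg, step_delta=-1)
+#   model = ModelWrapper(fn)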
+
+
 def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig):
     """Best-effort build a dataflow accelerator using the given configuration.
 
     :param model_filename: ONNX model filename to build
     :param cfg: Build configuration
     """
-    model = ModelWrapper(model_filename)
+    # if start_step is specified, override the input model
+    if cfg.start_step is None:
+        print("Building dataflow accelerator from " + model_filename)
+        model = ModelWrapper(model_filename)
+    else:
+        intermediate_model_filename = resolve_step_filename(cfg.start_step, cfg, -1)
+        print(
+            "Building dataflow accelerator from intermediate checkpoint"
+            + intermediate_model_filename
+        )
+        model = ModelWrapper(intermediate_model_filename)
     assert type(model) is ModelWrapper
     finn_build_dir = os.environ["FINN_BUILD_DIR"]
-    print("Building dataflow accelerator from " + model_filename)
+
     print("Intermediate outputs will be generated in " + finn_build_dir)
     print("Final outputs will be generated in " + cfg.output_dir)
     print("Build log is at " + cfg.output_dir + "/build_dataflow.log")
@@ -131,7 +169,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig):
             sys.stdout = stdout_orig
             sys.stderr = stderr_orig
             time_per_step[step_name] = step_end - step_start
-            chkpt_name = "%d_%s.onnx" % (step_num, step_name)
+            chkpt_name = "%s.onnx" % (step_name)
             if cfg.save_intermediate_models:
                 intermediate_model_dir = cfg.output_dir + "/intermediate_models"
                 if not os.path.exists(intermediate_model_dir):
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index bd938f17411ee42e94e95e02776ad8e973ea10fa..807fd706860d7e4667107ddd2ed46ea2b123c3ec 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -26,14 +26,15 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from typing import List, Optional, Any
-from finn.util.basic import pynq_part_map, alveo_part_map
-from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy
-from enum import Enum
+import numpy as np
+import os
 from dataclasses import dataclass
 from dataclasses_json import dataclass_json
-import os
-import numpy as np
+from enum import Enum
+from typing import Any, List, Optional
+
+from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy
+from finn.util.basic import alveo_part_map, pynq_part_map
 
 
 class ShellFlowType(str, Enum):
@@ -88,6 +89,8 @@ class LargeFIFOMemStyle(str, Enum):
 class VerificationStepType(str, Enum):
     "Steps at which FINN ONNX execution can be launched for verification."
 
+    #: verify after step_qonnx_to_finn, using Python execution
+    QONNX_TO_FINN_PYTHON = "finn_onnx_python"
     #: verify after step_tidy_up, using Python execution
     TIDY_UP_PYTHON = "initial_python"
     #: verify after step_streamline , using Python execution
@@ -102,6 +105,7 @@ class VerificationStepType(str, Enum):
 #: specified order. Use the `steps` as part of build config to restrict which
 #: steps will be run.
 default_build_dataflow_steps = [
+    "step_qonnx_to_finn",
     "step_tidy_up",
     "step_streamline",
     "step_convert_to_hls",
@@ -122,6 +126,7 @@ default_build_dataflow_steps = [
 
 #: List of steps to run for an estimate-only (no synthesis) dataflow build
 estimate_only_dataflow_steps = [
+    "step_qonnx_to_finn",
     "step_tidy_up",
     "step_streamline",
     "step_convert_to_hls",
@@ -171,6 +176,13 @@ class DataflowBuildConfig:
     #: that will override the target_fps setting here.
     target_fps: Optional[int] = None
 
+    #: (Optional) Use two-pass relaxation for folding, only relevant if target_fps
+    #: is set. If enabled, parallelization will internally run a second time if the
+    #: target cycles from the first pass could not be achieved, instead using the
+    #: achievable target to obtain a balanced pipeline. Disabling this can be
+    #: useful for decreasing latency (even though throughput won't increase).
+    folding_two_pass_relaxation: Optional[bool] = True
+
     #: (Optional) At which steps the generated intermediate output model
     #: will be verified. See documentation of VerificationStepType for
     #: available options.
@@ -184,6 +196,19 @@ class DataflowBuildConfig:
     #: verification. Only required if verify_steps is not empty.
     verify_expected_output_npy: Optional[str] = "expected_output.npy"
 
+    #: (Optional) Save full execution context for each of the verify_steps.
+    #: By default, only the top-level graph output is saved.
+    verify_save_full_context: Optional[bool] = False
+
+    #: (Optional) Save .vcd waveforms from rtlsim under the report output directory.
+    #: By default, waveforms won't be saved.
+    verify_save_rtlsim_waveforms: Optional[bool] = False
+
+    #: (Optional) Run synthesis to generate a .dcp for the stitched-IP output product.
+    #: This can make it easier to treat the stitched IP as a standalone artifact
+    #: without requiring the full list of layer IP build directories. By default,
+    #: synthesis will not run.
+    stitched_ip_gen_dcp: Optional[bool] = False
+
     #: (Optional) Control the maximum width of the per-PE MVAU stream while
     #: exploring the parallelization attributes to reach target_fps
     #: Only relevant if target_fps is specified.
@@ -263,6 +288,24 @@ class DataflowBuildConfig:
     #: - functions are called with (model, DataflowBuildConfig) as args
     steps: Optional[List[Any]] = None
 
+    #: If given, start from this step, loading the intermediate model generated
+    #: from the previous step (save_intermediate_models must be enabled)
+    start_step: Optional[str] = None
+
+    #: If given, stop at this step.
+    stop_step: Optional[str] = None
+
+    #: The optional argument `max_multithreshold_bit_width` affects which Quant
+    #: nodes of the QONNX format get converted to FINN MultiThreshold nodes; it
+    #: only affects Quant nodes in the activation path. Quant nodes that define
+    #: a bit width larger than `max_multithreshold_bit_width` are not converted
+    #: to MultiThreshold nodes; a warning is raised instead.
+    #: If not given, `max_multithreshold_bit_width` defaults to 8.
+    max_multithreshold_bit_width: Optional[int] = 8
+
+    #: Override the number of inputs for rtlsim performance measurement; a batch
+    #: roughly as large as the number of layers fills the whole pipeline and
+    #: yields a steady-state throughput figure.
+    rtlsim_batch_size: Optional[int] = 1
+
     def _resolve_hls_clk_period(self):
         if self.hls_clk_period_ns is None:
             # use same clk for synth and hls if not explicitly specified
@@ -332,4 +375,7 @@ class DataflowBuildConfig:
                 + self.verify_expected_output_npy
             )
             verify_expected_output_npy = np.load(self.verify_expected_output_npy)
-            return (verify_input_npy, verify_expected_output_npy)
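+            # cast to float32, as FINN's ONNX execution expects float32 tensors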
+            return (
+                verify_input_npy.astype(np.float32),
+                verify_expected_output_npy.astype(np.float32),
+            )
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 50cd9ed4ff663044433c5d8997f2fcd9337a0dd6..c977f15e7090f5cae633a013f5eb9e6b3dd34dd2 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -26,78 +26,85 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from finn.core.modelwrapper import ModelWrapper
-import os
 import json
+import numpy as np
+import os
+from copy import deepcopy
+from distutils.dir_util import copy_tree
+from qonnx.util.cleanup import cleanup_model
+from shutil import copy
+
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
+from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
+from finn.analysis.fpgadataflow.op_and_param_counts import (
+    aggregate_dict_keys,
+    op_and_param_counts,
+)
+from finn.analysis.fpgadataflow.res_estimation import (
+    res_estimation,
+    res_estimation_complete,
+)
+from finn.builder.build_dataflow_config import (
+    DataflowBuildConfig,
+    DataflowOutputType,
+    ShellFlowType,
+    VerificationStepType,
+)
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.onnx_exec import execute_onnx
+from finn.core.throughput_test import throughput_test_rtlsim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.general import (
-    ApplyConfig,
-    GiveReadableTensorNames,
-    GiveUniqueNodeNames,
-    RemoveUnusedTensors,
-    RemoveStaticGraphInputs,
+from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
 )
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.streamline import Streamline
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
-
-from shutil import copy, copytree
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.set_fifo_depths import (
     InsertAndSetFIFODepths,
     RemoveShallowFIFOs,
 )
-from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
-from finn.transformation.fpgadataflow.vitis_build import VitisBuild
-from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.set_folding import SetFolding
-from finn.transformation.fpgadataflow.create_dataflow_partition import (
-    CreateDataflowPartition,
-)
-from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
-    ReplaceVerilogRelPaths,
-)
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-from finn.analysis.fpgadataflow.res_estimation import (
-    res_estimation,
-    res_estimation_complete,
+from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
+from finn.transformation.fpgadataflow.vitis_build import VitisBuild
+from finn.transformation.general import (
+    ApplyConfig,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    RemoveStaticGraphInputs,
+    RemoveUnusedTensors,
 )
-from finn.analysis.fpgadataflow.op_and_param_counts import (
-    aggregate_dict_keys,
-    op_and_param_counts,
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
+from finn.transformation.qonnx.quant_act_to_multithreshold import (
+    default_filter_function_generator,
 )
-from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
-from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
+from finn.transformation.streamline import Streamline
+from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
+from finn.util.basic import get_rtlsim_trace_depth
 from finn.util.config import extract_model_config_to_json
-from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
-from finn.builder.build_dataflow_config import (
-    DataflowBuildConfig,
-    DataflowOutputType,
-    ShellFlowType,
-    VerificationStepType,
-)
-from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
-from finn.core.onnx_exec import execute_onnx
-import numpy as np
+from finn.util.pyverilator import pyverilate_get_liveness_threshold_cycles
 from finn.util.test import execute_parent
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.core.throughput_test import throughput_test_rtlsim
-from copy import deepcopy
 
 
 def verify_step(
@@ -115,21 +122,108 @@ def verify_step(
         parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx"
         child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name
         model.save(child_model_fn)
-        out_npy = execute_parent(parent_model_fn, child_model_fn, in_npy)
+        out_tensor_name = ModelWrapper(parent_model_fn).graph.output[0].name
+        out_dict = execute_parent(
+            parent_model_fn, child_model_fn, in_npy, return_full_ctx=True
+        )
+        out_npy = out_dict[out_tensor_name]
     else:
         inp_tensor_name = model.graph.input[0].name
         out_tensor_name = model.graph.output[0].name
         inp_dict = {inp_tensor_name: in_npy}
-        out_dict = execute_onnx(model, inp_dict)
+        out_dict = execute_onnx(model, inp_dict, True)
         out_npy = out_dict[out_tensor_name]
     res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all()
     res_to_str = {True: "SUCCESS", False: "FAIL"}
     res_str = res_to_str[res]
-    verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % (step_name, res_str)
-    np.save(verification_output_fn, out_npy)
+    if cfg.verify_save_full_context:
+        verification_output_fn = verify_out_dir + "/verify_%s_%s.npz" % (
+            step_name,
+            res_str,
+        )
+        np.savez(verification_output_fn, **out_dict)
+    else:
+        verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % (
+            step_name,
+            res_str,
+        )
+        np.save(verification_output_fn, out_npy)
     print("Verification for %s : %s" % (step_name, res_str))
 
 
+def prepare_for_stitched_ip_rtlsim(verify_model, cfg):
+    need_restitch = False
+    # rtlsim only supports certain impl_style for some nodes
+    # StreamingFIFO must have impl_style=rtl
+    for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
+        inst = getCustomOp(fifo_layer)
+        if inst.get_nodeattr("impl_style") != "rtl":
+            inst.set_nodeattr("impl_style", "rtl")
+            inst.set_nodeattr("code_gen_dir_ipgen", "")
+            inst.set_nodeattr("ipgen_path", "")
+            need_restitch = True
+    # StreamingDataWidthConverter must have impl_style=hls
+    for dwc_layer in verify_model.get_nodes_by_op_type(
+        "StreamingDataWidthConverter_Batch"
+    ):
+        inst = getCustomOp(dwc_layer)
+        if inst.get_nodeattr("impl_style") != "hls":
+            inst.set_nodeattr("impl_style", "hls")
+            inst.set_nodeattr("code_gen_dir_ipgen", "")
+            inst.set_nodeattr("ipgen_path", "")
+            need_restitch = True
+    # if we've made alterations to the model, need to do some re-prep
+    if need_restitch:
+        print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM")
+        verify_model = verify_model.transform(
+            PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
+        )
+        verify_model = verify_model.transform(HLSSynthIP())
+        verify_model = verify_model.transform(
+            CreateStitchedIP(
+                cfg._resolve_fpga_part(),
+                cfg.synth_clk_period_ns,
+                vitis=False,
+            )
+        )
+    # set top-level prop for stitched-ip rtlsim and launch
+    verify_model.set_metadata_prop("exec_mode", "rtlsim")
+    # TODO make configurable
+    # verify_model.set_metadata_prop("rtlsim_trace", "trace.vcd")
+    return verify_model
+
+
+def step_qonnx_to_finn(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """
+    This step will only execute if QONNX nodes are found.
+    These include the following op_types: "Quant", "Trunc" and "BinaryQuant".
+    If such nodes are found, the step will run the tidy-up step from QONNX
+    and then convert the QONNX model to the FINN-ONNX dialect.
+    """
+    # Check if any QONNX nodes exist, i.e. BinaryQuant, Quant or Trunc
+    q_count = 0
+    for op_type in ["BinaryQuant", "Quant", "Trunc"]:
+        q_count += len(model.get_nodes_by_op_type(op_type))
+    if q_count == 0:
+        return model
+
+    # QONNX cleanup
+    model = cleanup_model(model)
+    # QONNX to FINN-ONNX
+    model = model.transform(
+        ConvertQONNXtoFINN(
+            filter_function=default_filter_function_generator(
+                max_multithreshold_bit_width=cfg.max_multithreshold_bit_width
+            )
+        )
+    )
+
+    if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps():
+        verify_step(model, cfg, "qonnx_to_finn_python", need_parent=False)
+
+    return model
+
+
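+# Minimal standalone sketch (hypothetical; the builder normally invokes this):
+#   model = ModelWrapper("qonnx_exported_model.onnx")
+#   model = step_qonnx_to_finn(model, build_cfg)
+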
 def step_tidy_up(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Run the tidy-up step on given model. This includes shape and datatype
     inference, constant folding, and giving nodes and tensors better names.
@@ -164,6 +258,7 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(MakeMaxPoolNHWC())
         model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
         model = model.transform(MakeMaxPoolNHWC())
+        model = model.transform(absorb.AbsorbConsecutiveTransposes())
     model = model.transform(ConvertBipolarMatMulToXnorPopcount())
     model = model.transform(Streamline())
     # absorb final add-mul nodes into TopK
@@ -212,7 +307,12 @@ def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig
     nodes, which point to a separate ONNX file. Dataflow accelerator synthesis
     can only be performed on those HLSCustomOp sub-graphs."""
 
-    parent_model = model.transform(CreateDataflowPartition())
+    parent_model = model.transform(
+        CreateDataflowPartition(
+            partition_model_dir=cfg.output_dir
+            + "/intermediate_models/supported_op_partitions"
+        )
+    )
     sdp_nodes = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")
     assert len(sdp_nodes) == 1, "Only a single StreamingDataflowPartition supported."
     sdp_node = sdp_nodes[0]
@@ -226,13 +326,32 @@ def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig
 
 def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfig):
     """If target_fps was specified, use the SetFolding transformation to determine
-    parallelization attributes."""
+    parallelization attributes. The auto-generated config will be saved as
+    auto_folding_config.json in the output directory and can serve as a basis
+    for customizing the folding factors further."""
 
     target_cycles_per_frame = cfg._resolve_cycles_per_frame()
     if target_cycles_per_frame is not None:
         model = model.transform(
-            SetFolding(target_cycles_per_frame, mvau_wwidth_max=cfg.mvau_wwidth_max)
+            SetFolding(
+                target_cycles_per_frame,
+                mvau_wwidth_max=cfg.mvau_wwidth_max,
+                two_pass_relaxation=cfg.folding_two_pass_relaxation,
+            )
         )
+        # extract the suggested configuration and save it as json
+        hw_attrs = [
+            "PE",
+            "SIMD",
+            "ram_style",
+            "resType",
+            "mem_mode",
+            "runtime_writeable_weights",
+        ]
+        extract_model_config_to_json(
+            model, cfg.output_dir + "/auto_folding_config.json", hw_attrs
+        )
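+        # the saved JSON can be hand-tuned and applied again in a later run,
+        # e.g. (assumed workflow) via the ApplyConfig transformation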
+
     return model
 
 
@@ -380,25 +499,35 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig):
     if DataflowOutputType.STITCHED_IP in cfg.generate_outputs:
         stitched_ip_dir = cfg.output_dir + "/stitched_ip"
         model = model.transform(
-            CreateStitchedIP(cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)
+            CreateStitchedIP(
+                cfg._resolve_fpga_part(),
+                cfg.synth_clk_period_ns,
+                vitis=cfg.stitched_ip_gen_dcp,
+            )
         )
         # TODO copy all ip sources into output dir? as zip?
-        copytree(model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir)
+        copy_tree(model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir)
         print("Vivado stitched IP written into " + stitched_ip_dir)
     if VerificationStepType.STITCHED_IP_RTLSIM in cfg._resolve_verification_steps():
         # prepare ip-stitched rtlsim
         verify_model = deepcopy(model)
-        # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that
-        for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
-            getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl")
-        # similarly for StreamingDataWidthConverter with impl_style=hls
-        for dwc_layer in verify_model.get_nodes_by_op_type(
-            "StreamingDataWidthConverter_Batch"
-        ):
-            getCustomOp(dwc_layer).set_nodeattr("impl_style", "hls")
-        verify_model = verify_model.transform(PrepareRTLSim())
-        verify_model.set_metadata_prop("exec_mode", "rtlsim")
+        verify_model = prepare_for_stitched_ip_rtlsim(verify_model, cfg)
+        # use critical path estimate to set rtlsim liveness threshold
+        # (very conservative)
+        verify_model = verify_model.transform(AnnotateCycles())
+        estimate_network_performance = verify_model.analysis(dataflow_performance)
+        prev_liveness = pyverilate_get_liveness_threshold_cycles()
+        os.environ["LIVENESS_THRESHOLD"] = str(
+            int(estimate_network_performance["critical_path_cycles"])
+        )
+        if cfg.verify_save_rtlsim_waveforms:
+            report_dir = cfg.output_dir + "/report"
+            os.makedirs(report_dir, exist_ok=True)
+            verify_model.set_metadata_prop(
+                "rtlsim_trace", "%s/verify_rtlsim.vcd" % (report_dir)
+            )
         verify_step(verify_model, cfg, "stitched_ip_rtlsim", need_parent=True)
+        os.environ["LIVENESS_THRESHOLD"] = str(prev_liveness)
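+        # (assumption: LIVENESS_THRESHOLD is read by the pyverilator-based
+        # rtlsim backend, which aborts if no output appears within that budget)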
     return model
 
 
@@ -411,30 +540,30 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
         assert (
             DataflowOutputType.STITCHED_IP in cfg.generate_outputs
         ), "rtlsim_perf needs stitched IP"
+        report_dir = cfg.output_dir + "/report"
+        os.makedirs(report_dir, exist_ok=True)
         # prepare ip-stitched rtlsim
         rtlsim_model = deepcopy(model)
-        # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that
-        for fifo_layer in rtlsim_model.get_nodes_by_op_type("StreamingFIFO"):
-            getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl")
-        # similarly for StreamingDataWidthConverter with impl_style=hls
-        for dwc_layer in rtlsim_model.get_nodes_by_op_type(
-            "StreamingDataWidthConverter_Batch"
-        ):
-            getCustomOp(dwc_layer).set_nodeattr("impl_style", "hls")
-        rtlsim_model = rtlsim_model.transform(PrepareRTLSim())
-        rtlsim_model.set_metadata_prop("exec_mode", "rtlsim")
+        rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg)
-        # run with single input to get latency
+        # run rtlsim with rtlsim_batch_size inputs (default 1, i.e. latency measurement)
-        rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, 1)
-        rtlsim_latency = rtlsim_perf_dict["cycles"]
-        # run with num inputs equal to layers to fill the whole pipeline
-        # to get the steady-state throughput
-        rtlsim_bs = len(rtlsim_model.graph.node)
+        orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
+        rtlsim_bs = int(cfg.rtlsim_batch_size)
+        assert rtlsim_bs > 0, "rtlsim batch size must be >0"
+        if cfg.verify_save_rtlsim_waveforms:
+            # set depth to 3 for layer-by-layer visibility
+            os.environ["RTLSIM_TRACE_DEPTH"] = "3"
+            rtlsim_model.set_metadata_prop(
+                "rtlsim_trace", "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs)
+            )
+        rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"]))
         rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
+        rtlsim_latency = rtlsim_perf_dict["cycles"]
         rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
-        report_dir = cfg.output_dir + "/report"
-        os.makedirs(report_dir, exist_ok=True)
         with open(report_dir + "/rtlsim_performance.json", "w") as f:
             json.dump(rtlsim_perf_dict, f, indent=2)
+        if cfg.verify_save_rtlsim_waveforms:
+            # restore original trace depth
+            os.environ["RTLSIM_TRACE_DEPTH"] = str(orig_rtlsim_trace_depth)
 
     return model
 
@@ -446,7 +575,7 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig):
     if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs:
         driver_dir = cfg.output_dir + "/driver"
         model = model.transform(MakePYNQDriver(cfg._resolve_driver_platform()))
-        copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir)
+        copy_tree(model.get_metadata_prop("pynq_driver_dir"), driver_dir)
         print("PYNQ Python driver written into " + driver_dir)
     return model
 
@@ -487,9 +616,15 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig):
         os.makedirs(bitfile_dir, exist_ok=True)
         report_dir = cfg.output_dir + "/report"
         os.makedirs(report_dir, exist_ok=True)
+        partition_model_dir = cfg.output_dir + "/intermediate_models/kernel_partitions"
         if cfg.shell_flow_type == ShellFlowType.VIVADO_ZYNQ:
             model = model.transform(
-                ZynqBuild(cfg.board, cfg.synth_clk_period_ns, cfg.enable_hw_debug)
+                ZynqBuild(
+                    cfg.board,
+                    cfg.synth_clk_period_ns,
+                    cfg.enable_hw_debug,
+                    partition_model_dir=partition_model_dir,
+                )
             )
             copy(model.get_metadata_prop("bitfile"), bitfile_dir + "/finn-accel.bit")
             copy(model.get_metadata_prop("hw_handoff"), bitfile_dir + "/finn-accel.hwh")
@@ -513,6 +648,7 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig):
                     strategy=cfg._resolve_vitis_opt_strategy(),
                     enable_debug=cfg.enable_hw_debug,
                     floorplan_file=cfg.vitis_floorplan_file,
+                    partition_model_dir=partition_model_dir,
                 )
             )
             copy(model.get_metadata_prop("bitfile"), bitfile_dir + "/finn-accel.xclbin")
@@ -535,13 +671,14 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
         bitfile_dir = cfg.output_dir + "/bitfile"
         driver_dir = cfg.output_dir + "/driver"
         os.makedirs(deploy_dir, exist_ok=True)
-        copytree(bitfile_dir, deploy_dir + "/bitfile")
-        copytree(driver_dir, deploy_dir + "/driver")
+        copy_tree(bitfile_dir, deploy_dir + "/bitfile")
+        copy_tree(driver_dir, deploy_dir + "/driver")
     return model
 
 
 #: map step name strings to step functions
 build_dataflow_step_lookup = {
+    "step_qonnx_to_finn": step_qonnx_to_finn,
     "step_tidy_up": step_tidy_up,
     "step_streamline": step_streamline,
     "step_convert_to_hls": step_convert_to_hls,
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index b20b652254028c4ee5dc2edd1f1302ea3359019b..417a505898fb1aba751e4b44db336b8cf313cb6a 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -26,6 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
+from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
 from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
     ConvolutionInputGenerator,
 )
@@ -33,28 +35,28 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import (
     ConvolutionInputGenerator1D,
 )
 from finn.custom_op.fpgadataflow.downsampler import DownSampler
-from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
-from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
-from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
-from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
+from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
+from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
+from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
+from finn.custom_op.fpgadataflow.iodma import IODMA
+from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
+from finn.custom_op.fpgadataflow.lookup import Lookup
+from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
+from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
+    StreamingDataflowPartition,
+)
 from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
-from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
-from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
-from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
+from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
+from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
+from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
-from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
-from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
-from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
+from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
+from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch
 from finn.custom_op.fpgadataflow.vector_vector_activate_batch import (
     Vector_Vector_Activate_Batch,
 )
-from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
-from finn.custom_op.fpgadataflow.iodma import IODMA
-from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
-    StreamingDataflowPartition,
-)
 
 custom_op = dict()
 
@@ -79,3 +81,5 @@ custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch
 custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
 custom_op["IODMA"] = IODMA
 custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
+custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch
+custom_op["Lookup"] = Lookup
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 38940ccb94f11fe49af5f49ee020f150326a026c..fa80e47485eef4f289b0272fd73ac185bd1c2c5e 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -26,13 +26,12 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-
 import numpy as np
+import os
 import warnings
+
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -84,19 +83,7 @@ class AddStreams_Batch(HLSCustomOp):
         assert ishape == exp_ishape, "Unexpected input1 shape."
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1]))
         assert ishape == exp_ishape, "Unexpected input2 shape."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index 097ec336ff24cd826e6530c42b7cdb1108971fa1..4961f6148231252d255c1830ced418308032ce41 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -26,12 +26,11 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from math import ceil
-import os
-
 import numpy as np
+import os
+import warnings
+from math import ceil
 
-from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import (
@@ -39,9 +38,8 @@ from finn.util.data_packing import (
     numpy_to_hls_code,
     rtlsim_output_to_npy,
 )
-from . import templates
 
-import warnings
+from . import templates
 
 # ONNX i/o tensor shape assumptions for channelwise ops:
 # input 0 is the input tensor, shape (..., NumChannels)
@@ -57,10 +55,10 @@ def get_smallest_possible(vals):
     for v in vals:
         assert int(v) == v, "Error float value"
 
-    for k in DataType.__members__:
+    for k in DataType.get_accumulator_dt_cands():
         dt = DataType[k]
 
-        if dt in [DataType.BIPOLAR, DataType.TERNARY, DataType.FLOAT32]:
+        if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]:
             # not currently supported
             continue
 
@@ -76,9 +74,9 @@ def get_smallest_possible(vals):
     )
 
     if (0 <= vals).all():
-        return DataType.UINT64
+        return DataType["UINT64"]
     else:
-        return DataType.INT64
+        return DataType["INT64"]
 
 
 class ChannelwiseOp_Batch(HLSCustomOp):
@@ -126,18 +124,7 @@ class ChannelwiseOp_Batch(HLSCustomOp):
     def make_shape_compatible_op(self, model):
         oshape = self.get_normal_output_shape()
         # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -217,7 +204,7 @@ class ChannelwiseOp_Batch(HLSCustomOp):
             return 0
 
     def lut_estimation(self):
-        """Calculates LUT cost, taking memory resource type into account """
+        """Calculates LUT cost, taking memory resource type into account"""
         # TODO add in/out FIFO contributions
         style = self.get_nodeattr("ram_style")
         P = self.get_nodeattr("PE")
@@ -348,8 +335,8 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         )
         # get input data type
         export_idt = self.get_input_datatype()
-        if self.get_input_datatype() == DataType.BIPOLAR:
-            export_idt = DataType.BINARY
+        if self.get_input_datatype() == DataType["BIPOLAR"]:
+            export_idt = DataType["BINARY"]
         idt_hls = export_idt.get_hls_datatype_str()
 
         # write parameters into params.h
@@ -357,8 +344,8 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         pdt_hls = pdt.get_hls_datatype_str()
         # use binary to export bipolar activations
         export_odt = self.get_output_datatype()
-        if self.get_output_datatype() == DataType.BIPOLAR:
-            export_odt = DataType.BINARY
+        if self.get_output_datatype() == DataType["BIPOLAR"]:
+            export_odt = DataType["BINARY"]
         odt_hls = export_odt.get_hls_datatype_str()
         # get desired function
         func = self.get_nodeattr("Func")
@@ -439,7 +426,7 @@ class ChannelwiseOp_Batch(HLSCustomOp):
             # load output npy file
             super().npy_to_dynamic_output(context)
             # reinterpret binary output as bipolar where needed
-            if self.get_output_datatype() == DataType.BIPOLAR:
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
                 out = context[node.output[0]]
                 out = 2 * out - 1
                 context[node.output[0]] = out
@@ -490,7 +477,9 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         numReps = numInputVectors[0]
         self.code_gen_dict["$DEFINES$"] = [
             """#define NumChannels1 {}\n#define PE1 {}\n#define numReps {}""".format(
-                self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps,
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("PE"),
+                numReps,
             )
         ]
 
@@ -525,24 +514,29 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         # should ImgDim be defined or just filled in here like we do now?
         ishape = self.get_folded_input_shape()
         if len(ishape) == 3:
-            imgdim = 1
+            imgdim_h = 1
+            imgdim_w = 1
         elif len(ishape) == 5:
-            imgdim = ishape[1]
+            imgdim_h = ishape[1]
+            imgdim_w = ishape[2]
         else:
             raise Exception("""Unexpeted input shape""")
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}>
+            """Thresholding_Batch<{}, {}, NumChannels1, PE1, {}, {}>
             (in0, out, threshs, numReps);""".format(
-                imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"],
+                imgdim_h,
+                imgdim_w,
+                tmpl_args["TSrcI"],
+                tmpl_args["TDstI"],
             )
         ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 6e77cd3da7328fd81dccc2ff171a9ae84723d165..a4018836846257c15ad203b1cef54c03cd081e45 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -26,15 +26,13 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-
 import math
 import numpy as np
+import os
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.custom_op.general.im2col import compute_conv_output_dim
-from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator:
@@ -149,18 +147,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
-        assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen."
+        assert ishape == exp_ishape, "Unexpected input shape for ConvInpGen."
         # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -320,10 +307,10 @@ class ConvolutionInputGenerator(HLSCustomOp):
             inp.shape == exp_ishape
         ), """Input shape doesn't
         match expected shape (1, ifm_dim_h, ifm_dim_w, ifm_ch)."""
-        if self.get_input_datatype() == DataType.BIPOLAR:
+        if self.get_input_datatype() == DataType["BIPOLAR"]:
             # store bipolar activations as binary
             inp = (inp + 1) / 2
-            export_idt = DataType.BINARY
+            export_idt = DataType["BINARY"]
         else:
             export_idt = self.get_input_datatype()
         # reshape input into folded form
@@ -371,7 +358,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
                 )
             )
         # binary -> bipolar if needed
-        if self.get_output_datatype() == DataType.BIPOLAR:
+        if self.get_output_datatype() == DataType["BIPOLAR"]:
             out = context[node.output[0]]
             out = 2 * out - 1
             context[node.output[0]] = out
@@ -405,9 +392,9 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -466,9 +453,9 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
index 782655b31b7f4add4c886a46845506af875190bc..e43d73b1cd3ec7902fc743bfdf4d2fcad1c01dfe 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -26,15 +26,13 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-
 import math
 import numpy as np
+import os
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.custom_op.general.im2col import compute_conv_output_dim
-from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 # This operation should only be used for 1D convolutions. Either the
@@ -129,8 +127,12 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
         ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        wf = int((k_h * k_w * ifm_ch) // simd)
-        folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+        if self.use_parallel_window_output():
+            wf = int((ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+        else:
+            wf = int((k_h * k_w * ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
         return folded_oshape
 
     def make_shape_compatible_op(self, model):
@@ -138,19 +140,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
-        assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen."
+        assert ishape == exp_ishape, "Unexpected input shape for ConvInpGen."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -170,8 +160,6 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return DataType[self.get_nodeattr("outputDataType")]
 
     def get_instream_width(self):
-        """Returns stream width, input and output stream width are equal for
-        the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -180,10 +168,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return in_width
 
     def get_outstream_width(self):
-        """Returns stream width, input and output stream width are equal for
-        the sliding window function, so the function to determine the input
-        stream width can be reused."""
-        return self.get_instream_width()
+        if self.use_parallel_window_output():
+            # feed all window pixels in parallel
+            k_h, k_w = self.get_nodeattr("ConvKernelDim")
+            return self.get_instream_width() * k_h * k_w
+        else:
+            # if the parallel variant is not in use, output and input stream widths match
+            return self.get_instream_width()
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
@@ -219,6 +210,22 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
 
         return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
 
+    def use_parallel_window_output(self):
+        # Check whether the simpler "ConvolutionInputGenerator_1D_parallel" variant
+        # can be used to feed the window in parallel to the following layer,
+        # enabling full SIMD unfolding.
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+
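+        # e.g. SIMD=IFMChannels=16 with unit stride and dilation and depthwise=0
+        # qualifies; any other combination falls back to the sequential variant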
+        if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
+            if self.get_nodeattr("depthwise") == 0:
+                if stride_h == 1 and stride_w == 1:
+                    if dilation_h == 1 and dilation_w == 1:
+                        return True
+
+        return False
+
     def get_exp_cycles(self):
         simd = self.get_nodeattr("SIMD")
         (
@@ -238,12 +245,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         # since mmv != 1 is not supported yet, we set mmv to 1 for now
         mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
-        cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
-        cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
-        max_cycles = max(cycles_write_block, cycles_read_block)
-        exp_cycles = (
-            ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
-        )
+        if self.use_parallel_window_output():
+            exp_cycles = k_w + ofm_dim_w
+        else:
+            cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+            cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
+            max_cycles = max(cycles_write_block, cycles_read_block)
+            exp_cycles = (
+                ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
+            )
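+        # illustrative example: the parallel variant with k_w=4 and ofm_dim_w=125
+        # gives exp_cycles = 4 + 125 = 129, roughly one output pixel per cycle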
 
         return int(exp_cycles)
 
@@ -346,10 +356,10 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             inp.shape == exp_ishape
         ), """Input shape doesn't
         match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
-        if self.get_input_datatype() == DataType.BIPOLAR:
+        if self.get_input_datatype() == DataType["BIPOLAR"]:
             # store bipolar activations as binary
             inp = (inp + 1) / 2
-            export_idt = DataType.BINARY
+            export_idt = DataType["BINARY"]
         else:
             export_idt = self.get_input_datatype()
         # reshape input into folded form
@@ -397,7 +407,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
                 )
             )
         # binary -> bipolar if needed
-        if self.get_output_datatype() == DataType.BIPOLAR:
+        if self.get_output_datatype() == DataType["BIPOLAR"]:
             out = context[node.output[0]]
             out = 2 * out - 1
             context[node.output[0]] = out
@@ -503,9 +513,9 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -536,46 +546,56 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             "ultra": "ap_resource_uram()",
         }
         hls_ram_style = map_to_hls_ram_style[ram_style]
-        hls_call = "ConvolutionInputGenerator"
-        # check which ConvolutionInputGenerator is needed
-        dilation_h, dilation_w = self.get_nodeattr("Dilation")
 
-        hls_call += "_NonSquare"
-        if dilation_h > 1 or dilation_w > 1:
-            hls_call += "_Dilated"
-            if self.get_nodeattr("depthwise") == 1:
-                hls_call += "_dws"
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y,
-                Dilation1_x, Dilation1_y> (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
-        elif self.get_nodeattr("depthwise") == 1:
-            hls_call += "_dws"
+        # check which ConvolutionInputGenerator is needed
+        if self.use_parallel_window_output():
+            hls_call = "ConvolutionInputGenerator_1D_parallel"
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
+                """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
+                IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
                 (in0, out, numReps, {});""".format(
                     hls_call, hls_ram_style
                 )
             ]
         else:
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
-                (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
+            hls_call = "ConvolutionInputGenerator_NonSquare"
+            dilation_h, dilation_w = self.get_nodeattr("Dilation")
+            if dilation_h > 1 or dilation_w > 1:
+                hls_call += "_Dilated"
+                if self.get_nodeattr("depthwise") == 1:
+                    hls_call += "_dws"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y, Dilation1_x, Dilation1_y>
+                    (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+            elif self.get_nodeattr("depthwise") == 1:
+                hls_call += "_dws"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+            else:
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -584,9 +604,16 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         npy_out = "%s/output.npy" % code_gen_dir
         oshape = self.get_folded_output_shape()
         oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+        if self.use_parallel_window_output():
+            # pass the number of pixels in the folded output to apintstream2npy,
+            # needed to unpack the output correctly and reverse only the inner
+            # SIMD dimension
+            k_h, k_w = self.get_nodeattr("ConvKernelDim")
+            multi_pixel_out = k_h * k_w
+        else:
+            multi_pixel_out = 1
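+        # e.g. a 1x4 kernel in the parallel variant gives multi_pixel_out = 4, so
+        # each packed output word is unpacked into 4 window pixels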
 
         self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", true, 1, %d);'
             % (
                 packed_hls_type,
                 elem_hls_type,
@@ -594,6 +621,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
                 npy_type,
                 oshape_cpp_str,
                 npy_out,
+                multi_pixel_out,
             )
         ]
 
@@ -601,12 +629,21 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
-                hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
-                self.onnx_node.name
-            )
-        ]
+        if self.use_parallel_window_output():
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
+                    hls::stream<ap_uint<ConvKernelDim1_x*SIMD1*Input_precision1>>
+                    &out)""".format(
+                    self.onnx_node.name
+                )
+            ]
+        else:
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
+                    hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
+                    self.onnx_node.name
+                )
+            ]
 
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py
index 002f71aa30b4cf94c63d572e536999327eb2a527..124b3e4645caa63a2590d91c58f430f8d56bb6a0 100644
--- a/src/finn/custom_op/fpgadataflow/downsampler.py
+++ b/src/finn/custom_op/fpgadataflow/downsampler.py
@@ -1,10 +1,38 @@
-import os
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import numpy as np
-from onnx import TensorProto, helper
+import os
+import warnings
+
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-import warnings
 
 
 class DownSampler(HLSCustomOp):
@@ -82,19 +110,7 @@ class DownSampler(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape for DownSampler."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -163,9 +179,9 @@ class DownSampler(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -196,9 +212,9 @@ class DownSampler(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 73da77bd3f940cee5ffd10fcfc43571f1a612eb4..3b0fa55b0065e6ceeb8ad2eb7282a413adf443d7 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -26,13 +26,13 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-
 import numpy as np
+import os
 import warnings
+from onnx import TensorProto, helper
+
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from onnx import helper, TensorProto
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index 99f959bf59f06d6ab5b71dd7245d657f4964cca4..8ac30524ebee6f503e34f6d92408f3f137a59c72 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -1,10 +1,38 @@
-import os
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import numpy as np
-from onnx import TensorProto, helper
+import os
+import warnings
+
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-import warnings
 
 
 class FMPadding_Batch(HLSCustomOp):
@@ -98,19 +126,7 @@ class FMPadding_Batch(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape for SameResize."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -209,9 +225,9 @@ class FMPadding_Batch(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -260,9 +276,9 @@ class FMPadding_Batch(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index 8cc71ce9eb57c2dcf1f743a7b96e501ab833f6cd..6d4a55ee5c86b68776f4c7c2e58930034bb0be02 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -26,13 +26,12 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-
 import numpy as np
+import os
 import warnings
+
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -95,19 +94,7 @@ class GlobalAccPool_Batch(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten(),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index c07188430244b635ab6b1ec192337da74550d57d..3aac7f6b451ed12ab265a20a7df1bfa6c1d7b4c7 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -27,22 +27,24 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # namespace package, extend path
 
-from abc import abstractmethod
 import numpy as np
 import os
 import subprocess
+from abc import abstractmethod
+
 from finn.custom_op.base import CustomOp
 from finn.util.basic import (
     CppBuilder,
+    get_rtlsim_trace_depth,
     make_build_dir,
     roundup_to_integer_multiple,
-    get_rtlsim_trace_depth,
 )
+from finn.util.hls import CallHLS
 from finn.util.pyverilator import (
     pyverilate_get_liveness_threshold_cycles,
     rtlsim_multi_io,
 )
-from finn.util.hls import CallHLS
+
 from . import templates
 
 try:
@@ -289,6 +291,7 @@ class HLSCustomOp(CustomOp):
         self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir]
         self.code_gen_dict["$FPGAPART$"] = [fpgapart]
         self.code_gen_dict["$FINNHLSLIBDIR$"] = ["/workspace/finn-hlslib"]
+        self.code_gen_dict["$FINNHLSCUSTOMDIR$"] = ["/workspace/finn/custom_hls"]
         self.code_gen_dict["$TOPFXN$"] = [node.name]
         self.code_gen_dict["$CLKPERIOD$"] = [str(clk)]
         self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives()
@@ -372,6 +375,7 @@ class HLSCustomOp(CustomOp):
         builder.append_includes("-I/workspace/finn/src/finn/qnn-data/cpp")
         builder.append_includes("-I/workspace/cnpy/")
         builder.append_includes("-I/workspace/finn-hlslib")
+        builder.append_includes("-I/workspace/finn/custom_hls")
         builder.append_includes("-I{}/include".format(os.environ["VIVADO_PATH"]))
         builder.append_includes("--std=c++11")
         builder.append_includes("-O3")
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index 857496a2614894588ebf065db3e384cf2cecf106..802c7e78515336ef884e5ff09356085b5cc6069f 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -26,12 +26,12 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import numpy as np
 import math
-from onnx import TensorProto, helper
+import numpy as np
+import warnings
+
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-import warnings
 
 # the IODMA interfaces a memory-mapped AXI interface and an AXI stream
 # direction "in": pulls data from AXI-MM to AXI stream
@@ -145,19 +145,7 @@ class IODMA(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index 1640e2f27c4672449775fa1c6f2d9b9745e305c4..1eb5962fdbc54092eaeb4796806b3a623c65aea8 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -26,15 +26,14 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-
 import numpy as np
+import os
+from onnx import TensorProto, helper
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from onnx import TensorProto, helper
-from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 from finn.util.basic import roundup_to_integer_multiple
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
 class LabelSelect_Batch(HLSCustomOp):
@@ -103,18 +102,14 @@ class LabelSelect_Batch(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.int64)
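+        # shape inference only needs a node with the right output shape and
+        # dtype; a RandomNormal placeholder provides this without embedding
+        # a constant tensor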
         return helper.make_node(
-            "Constant",
+            "RandomNormal",
             inputs=[],
             outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.INT64,
-                dims=values.shape,
-                vals=values.flatten(),
-            ),
+            mean=0.0,
+            scale=1.0,
+            dtype=TensorProto.INT64,
+            shape=list(oshape),
         )
 
     def infer_node_datatype(self, model):
diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py
new file mode 100644
index 0000000000000000000000000000000000000000..27be06bdfa3ce3d980a139ec91385c7fe85afab3
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/lookup.py
@@ -0,0 +1,338 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+import warnings
+from math import ceil
+
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    rtlsim_output_to_npy,
+)
+
+
+class Lookup(HLSCustomOp):
+    "Streaming elementwise HLS lookup, mapping indices to values."
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # Number of embeddings ("memory depth")
+            "NumEmbeddings": ("i", True, 0),
+            # Dimensionality of each embedding (part of "memory width")
+            "EmbeddingDim": ("i", True, 0),
+            # Datatype for embeddings (part of "memory width")
+            "EmbeddingType": ("s", True, ""),
+            # Datatype for inputs
+            "InputType": ("s", True, ""),
+            # Input shape
+            "InputShape": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_exp_cycles(self):
+        n_inputs = np.prod(self.get_nodeattr("InputShape"))
+        exp_cycles = int(n_inputs)
+        return exp_cycles
+
+    def get_normal_input_shape(self):
+        return self.get_nodeattr("InputShape")
+
+    def get_normal_output_shape(self):
+        ishape = self.get_normal_input_shape()
+        oshape = list(ishape) + [self.get_nodeattr("EmbeddingDim")]
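+        # e.g. InputShape=[1, 10] with EmbeddingDim=64 yields an output
+        # shape of (1, 10, 64): one embedding vector per input index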
+        return tuple(oshape)
+
+    def get_folded_input_shape(self):
+        ishape = self.get_normal_input_shape()
+        folded_ishape = list(ishape) + [1]
+        return tuple(folded_ishape)
+
+    def get_folded_output_shape(self):
+        return self.get_normal_output_shape()
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = tuple(self.get_normal_input_shape())
+        oshape = tuple(self.get_normal_output_shape())
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for Lookup: %s vs %s" % (
+            str(exp_ishape),
+            str(ishape),
+        )
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "InputType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("InputType", idt.name)
+        odt = DataType[self.get_nodeattr("EmbeddingType")]
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        ret = DataType[self.get_nodeattr("InputType")]
+        return ret
+
+    def get_output_datatype(self):
+        ret = DataType[self.get_nodeattr("EmbeddingType")]
+        return ret
+
+    def get_instream_width(self):
+        ibits = self.get_input_datatype().bitwidth()
+        return ibits
+
+    def get_outstream_width(self):
+        obits = self.get_output_datatype().bitwidth()
+        ofm_ch = self.get_nodeattr("EmbeddingDim")
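+        # the full embedding vector is streamed out at once, e.g. 64 INT8
+        # elements give a 64*8 = 512-bit output stream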
+        return obits * ofm_ch
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def global_includes(self):
+        global_incls = ['#include "lookup.hpp"']
+        global_incls.append('#include "embeddings.hpp"')
+        self.code_gen_dict["$GLOBALS$"] = global_incls
+
+    def defines(self, var):
+        n_inputs = np.prod(self.get_folded_input_shape()[:-1])
+        dtype = self.get_input_datatype()
+        elem_hls_type = dtype.get_hls_datatype_str()
+        emb_type = DataType[self.get_nodeattr("EmbeddingType")]
+        emb_hls_type = emb_type.get_hls_datatype_str()
+        my_defines = []
+        my_defines.append(
+            "#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings")
+        )
+        my_defines.append("#define EmbeddingDim %d" % self.get_nodeattr("EmbeddingDim"))
+        my_defines.append("#define NumInputs %d" % n_inputs)
+        my_defines.append("#define InputType %s" % elem_hls_type)
+        my_defines.append("#define EmbeddingType %s" % emb_hls_type)
+        self.code_gen_dict["$DEFINES$"] = my_defines
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "int64_t"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """StreamingLookup<NumEmbeddings,  EmbeddingDim, NumInputs,
+            InputType, EmbeddingType >(in0, out, embeddings);"""
+        ]
+
+    def blackboxfunction(self):
+        ibits = self.get_instream_width()
+        packed_input_hls_type = "ap_uint<%d>" % ibits
+        obits = self.get_outstream_width()
+        packed_output_hls_type = "ap_uint<%d>" % obits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_input_hls_type, packed_output_hls_type)
+        ]
+
+    def pragmas(self):
+        my_pragmas = ["#pragma HLS INTERFACE axis port=in0"]
+        my_pragmas.append("#pragma HLS INTERFACE axis port=out")
+        my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+        self.code_gen_dict["$PRAGMAS$"] = my_pragmas
+
+    def generate_params(self, model, path):
+        code_gen_dir = path
+        embeddings = model.get_initializer(self.onnx_node.input[1])
+        weight_filename = "{}/embeddings.hpp".format(code_gen_dir)
+        edt = DataType[self.get_nodeattr("EmbeddingType")]
+        assert np.vectorize(edt.allowed)(
+            embeddings
+        ).all(), "Embeddings can't be expressed with type %s" % str(edt)
+        embeddings_hls_code = numpy_to_hls_code(
+            embeddings, edt, "embeddings", True, False
+        )
+        with open(weight_filename, "w") as f_embed:
+            f_embed.write(embeddings_hls_code)
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = tuple(self.get_normal_input_shape())
+        exp_oshape = tuple(self.get_normal_output_shape())
+        folded_ishape = tuple(self.get_folded_input_shape())
+        folded_oshape = tuple(self.get_folded_output_shape())
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert inp.dtype == np.int64, "Inputs must be contained in an int64 ndarray"
+        assert inp.shape == exp_ishape, """Input shape doesn't match expected shape."""
+        export_idt = self.get_input_datatype()
+        odt = self.get_output_datatype()
+
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output,
+                out_npy_path,
+                odt,
+                out_shape,
+                packed_bits,
+                target_bits,
+                reverse_inner=False,
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def bram_estimation(self):
+        # the current calculation assumes embeddings are always stored in BRAM_18Ks
+        width_factor = ceil(self.get_outstream_width() / 16)
+        depth_factor = ceil(self.get_nodeattr("NumEmbeddings") / 1024)
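+        # e.g. a 512-bit output stream and 1280 embeddings need
+        # ceil(512/16) * ceil(1280/1024) = 32 * 2 = 64 BRAM_18Ks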
+        return width_factor * depth_factor
+
+    def bram_efficiency_estimation(self):
+        bram16_est = self.bram_estimation()
+        if bram16_est == 0:
+            return 1
+        ebits = self.get_outstream_width() * self.get_nodeattr("NumEmbeddings")
+        bram16_est_capacity = bram16_est * 18 * 1024
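+        # continuing the example above: ebits = 512 * 1280 against a capacity
+        # of 64 * 18 * 1024 bits gives an efficiency of roughly 0.56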
+        return ebits / bram16_est_capacity
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
index edba084b5258de37198520257e438f90f8cc65e3..ba8a446f2cf7541c0bd2e1dff731afe2397942ef 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -26,12 +26,11 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
 import numpy as np
+import os
 
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.core.datatype import DataType
-from onnx import TensorProto, helper
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -163,19 +162,7 @@ class Pool_Batch(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -235,9 +222,9 @@ class Pool_Batch(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -296,9 +283,9 @@ class Pool_Batch(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
diff --git a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py
index 53446ff1f2aba30e69bf188c1673c738440567fb..cf065cf156abed591e579b3f257e8f442eb3a976 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py
@@ -47,6 +47,7 @@ class StreamingDataflowPartition(CustomOp):
             "partition_id": ("i", False, 0),
             "device_id": ("i", False, 0),
             "mem_port": ("s", False, ""),
+            "instance_name": ("s", False, ""),
         }
 
     def make_shape_compatible_op(self, model):
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 4d84b74dce001fca769ed2850a8f718ac942f14c..1791706afa217d5eb453064547c1ea66b306d227 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -26,13 +26,13 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import numpy as np
 import math
+import numpy as np
+import os
 import warnings
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+
 from finn.core.datatype import DataType
-from onnx import TensorProto, helper
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 # does not do anything at the ONNX node-by-node level, and input-output
@@ -164,19 +164,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == tuple(exp_ishape), "Unexpected input shape for StreamingDWC."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -227,9 +215,9 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -261,9 +249,9 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -330,10 +318,10 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             exp_shape
         ), "Input shape does not match expected shape."
 
-        if self.get_input_datatype() == DataType.BIPOLAR:
+        if self.get_input_datatype() == DataType["BIPOLAR"]:
             # store bipolar activations as binary
             inp = (inp + 1) / 2
-            export_idt = DataType.BINARY
+            export_idt = DataType["BINARY"]
         else:
             export_idt = self.get_input_datatype()
         # reshape input into folded shape
@@ -376,7 +364,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
                 )
             )
         # binary -> bipolar if needed
-        if self.get_output_datatype() == DataType.BIPOLAR:
+        if self.get_output_datatype() == DataType["BIPOLAR"]:
             out = context[node.output[0]]
             out = 2 * out - 1
             context[node.output[0]] = out
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 48b40f105c8cbcba1d82990d91c23e4e6614fc4d..68cd1ff9ea680e157f59353d0c9d05afc3d9d6d7 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -26,27 +26,27 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import warnings
 import math
-import os
 import numpy as np
+import os
+import textwrap
+import warnings
 
-from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.basic import (
+    calculate_matvec_accumulator_range,
     interleave_matrix_outer_dim_from_partitions,
     roundup_to_integer_multiple,
-    calculate_matvec_accumulator_range,
 )
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
-    rtlsim_output_to_npy,
     pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
 )
+
 from . import templates
-import textwrap
 
 # ONNX i/o tensor shape assumptions for StreamingFCLayer:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
@@ -104,6 +104,16 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 "auto",
                 {"auto", "block", "distributed", "ultra"},
             ),
+            # FPGA resource type for threshold memories (if noActivation is False)
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            "ram_style_thresholds": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed"},
+            ),
             # (mem_mode = decoupled only) whether weights will be writable through
             # an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -140,19 +150,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def make_shape_compatible_op(self, model):
         oshape = self.get_normal_output_shape()
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -238,9 +236,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         mem_width = Q * W * P
         mmode = self.get_nodeattr("mem_mode")
         mstyle = self.get_nodeattr("ram_style")
-        if (mmode == "decoupled" and mstyle != "ultra") or (
-            mmode == "const" and self.calc_wmem() <= 128) or (
-            mmode == "external"
+        if (
+            (mmode == "decoupled" and mstyle != "ultra")
+            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "external")
         ):
             return 0
         width_multiplier = math.ceil(mem_width / 72)
@@ -266,9 +265,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         mem_width = Q * W * P
         mmode = self.get_nodeattr("mem_mode")
         mstyle = self.get_nodeattr("ram_style")
-        if (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) or (
-            mmode == "const" and self.calc_wmem() <= 128) or (
-            mmode == "external"
+        if (
+            (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
+            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "external")
         ):
             return 0
         # assuming SDP mode RAMB18s (see UG573 Table 1-10)
@@ -496,15 +496,15 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         ret = dict()
         inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
         out_hls_str = self.get_output_datatype().get_hls_datatype_str()
-        inp_is_binary = self.get_input_datatype() == DataType.BINARY
-        # out_is_binary = self.get_output_datatype() == DataType.BINARY
-        wt_is_binary = self.get_weight_datatype() == DataType.BINARY
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
         bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
         if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
             raise Exception("True binary (non-bipolar) inputs not yet supported")
-        inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
-        # out_is_bipolar = self.get_output_datatype() == DataType.BIPOLAR
-        wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
         # reinterpret inp/wt as bipolar if bin_xnor_mode is set
         inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
         wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
@@ -554,7 +554,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # ONNX uses (in_features, out_features) and matmul(x, W)
         # finn-hlslib uses (out_features, in_features) and matmul(W, x)
         ret = orig_weight_matrix.T
-        if self.get_weight_datatype() == DataType.BIPOLAR:
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
             # convert bipolar to binary
             ret = (ret + 1) / 2
         # interleave rows between PEs and reshape
@@ -601,12 +601,14 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 if abs(tdt_min) > tdt_max:
                     tdt = DataType.get_smallest_possible(tdt_min)
                 else:
-                    tdt = DataType.get_smallest_possible(0 - tdt_max)
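+                    # use -tdt_max - 1 so the signed type's positive range covers
+                    # tdt_max: e.g. tdt_max=128 -> 0 - tdt_max = -128 fits INT8,
+                    # whose max of 127 is too small; -129 selects the next type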
+                    tdt = DataType.get_smallest_possible(-tdt_max - 1)
             else:
                 tdt = DataType.get_smallest_possible(tdt_max)
-            assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
-                "Thresholds in %s can't be expressed with type %s"
-                % (self.onnx_node.name, str(tdt))
+            assert np.vectorize(tdt.allowed)(
+                threshold_tensor
+            ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                self.onnx_node.name,
+                str(tdt),
             )
             self.set_nodeattr("accDataType", tdt.name)
         else:
@@ -614,7 +616,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 if abs(acc_min) > acc_max:
                     adt = DataType.get_smallest_possible(acc_min)
                 else:
-                    adt = DataType.get_smallest_possible(0 - acc_max)
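+                    # same asymmetric two's-complement fix as for the threshold
+                    # datatype above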
+                    adt = DataType.get_smallest_possible(-acc_max - 1)
             else:
                 adt = DataType.get_smallest_possible(acc_max)
             # ensure a datatype divisible by 8-bits in case this is the last node
@@ -643,11 +645,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         ), """Threshold matrix dimension is
         not as expected (2)."""
         n_thres_steps = orig_thres_matrix.shape[1]
-        inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
-        wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
         # reinterpret inp/wt as bipolar if bin_xnor_mode is set
-        inp_is_binary = self.get_input_datatype() == DataType.BINARY
-        wt_is_binary = self.get_weight_datatype() == DataType.BINARY
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
         bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
         inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
         wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
@@ -658,7 +660,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
         ret = orig_thres_matrix
         # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0:
+        if ret[0][0] == 0 and n_thres_steps == 1:
             ret = np.copy(ret)
             ret[0][0] = 1
             warnings.warn(
@@ -702,8 +704,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         export_wdt = self.get_weight_datatype()
         # we have converted bipolar weights to binary for export,
         # so use it as such for weight generation
-        if self.get_weight_datatype() == DataType.BIPOLAR:
-            export_wdt = DataType.BINARY
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            export_wdt = DataType["BINARY"]
         if weight_file_mode == "hls_header":
             weight_hls_code = numpy_to_hls_code(
                 weight_tensor, export_wdt, "weights", True, True
@@ -832,20 +834,22 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             if thresholds is not None:
                 threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
                 # use UINT32 threshold export for bipolar times bipolar
-                inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
-                wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
+                inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+                wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
                 # reinterpret inp/wt as bipolar if bin_xnor_mode is set
-                inp_is_binary = self.get_input_datatype() == DataType.BINARY
-                wt_is_binary = self.get_weight_datatype() == DataType.BINARY
+                inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+                wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
                 bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
                 inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
                 wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
                 # get computed threshold datatype from attribute
                 tdt = DataType[self.get_nodeattr("accDataType")]
 
-                assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
-                    "Thresholds in %s can't be expressed with type %s"
-                    % (self.onnx_node.name, str(tdt))
+                assert np.vectorize(tdt.allowed)(
+                    threshold_tensor
+                ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                    self.onnx_node.name,
+                    str(tdt),
                 )
                 thresholds_hls_code = numpy_to_hls_code(
                     threshold_tensor, tdt, "thresholds", False, True
@@ -855,8 +859,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 tdt_hls = tdt.get_hls_datatype_str()
                 # use binary to export bipolar activations
                 export_odt = self.get_output_datatype()
-                if self.get_output_datatype() == DataType.BIPOLAR:
-                    export_odt = DataType.BINARY
+                if self.get_output_datatype() == DataType["BIPOLAR"]:
+                    export_odt = DataType["BINARY"]
                 odt_hls = export_odt.get_hls_datatype_str()
                 f_thresh.write(
                     "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
@@ -904,10 +908,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 not float32 as expected."""
                 expected_inp_shape = self.get_folded_input_shape()
                 reshaped_input = context[inputs].reshape(expected_inp_shape)
-                if self.get_input_datatype() == DataType.BIPOLAR:
+                if self.get_input_datatype() == DataType["BIPOLAR"]:
                     # store bipolar activations as binary
                     reshaped_input = (reshaped_input + 1) / 2
-                    export_idt = DataType.BINARY
+                    export_idt = DataType["BINARY"]
                 else:
                     export_idt = self.get_input_datatype()
                 # make copy before saving the array
@@ -926,7 +930,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # load output npy file
             super().npy_to_dynamic_output(context)
             # reinterpret binary output as bipolar where needed
-            if self.get_output_datatype() == DataType.BIPOLAR:
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
                 out = context[node.output[0]]
                 out = 2 * out - 1
                 context[node.output[0]] = out
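The cppsim path stores bipolar activations in binary form for the npy interface and, as in the hunk above, maps binary outputs back to bipolar afterwards; the two affine maps are exact inverses:

```python
import numpy as np

# The affine round trip used for cppsim I/O above: bipolar {-1, +1} is stored
# as binary {0, 1} on export and mapped back after simulation.
x = np.array([-1.0, 1.0, -1.0])
stored = (x + 1) / 2       # bipolar -> binary: [0., 1., 0.]
restored = 2 * stored - 1  # binary -> bipolar
assert (restored == x).all()
```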
@@ -949,8 +953,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 export_wdt = self.get_weight_datatype()
                 # we have converted bipolar weights to binary for export,
                 # so use it as such for weight generation
-                if self.get_weight_datatype() == DataType.BIPOLAR:
-                    export_wdt = DataType.BINARY
+                if self.get_weight_datatype() == DataType["BIPOLAR"]:
+                    export_wdt = DataType["BINARY"]
                 wei = npy_to_rtlsim_input(
                     "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
                 )
@@ -1005,6 +1009,17 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
 
     def defines(self, var):
+        # Only in ipgen mode: make sure the SIMD parameter satisfies the minimum requirement.
+        if var == "ipgen":
+            SIMD = self.get_nodeattr("SIMD")
+            MW = self.get_nodeattr("MW")
+            condition = SIMD >= (MW / 1024)
+            msg = (
+                f"HLS synthesis of StreamingFCLayer_Batch requires: "
+                f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} "
+                f"and MW={MW} for node: {self.onnx_node.name}."
+            )
+            assert condition, msg
         mem_mode = self.get_nodeattr("mem_mode")
         numInputVectors = list(self.get_nodeattr("numInputVectors"))
         numReps = np.prod(numInputVectors)
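The new ipgen-mode guard enforces the finn-hlslib requirement SIMD >= MW / 1024. A quick helper (hypothetical, for illustration only) computing the smallest SIMD value a given matrix width admits:

```python
import math

# Hypothetical helper mirroring the guard above: the smallest SIMD value
# satisfying SIMD >= MW / 1024 for a given matrix width MW.
def min_simd(mw):
    return max(1, math.ceil(mw / 1024))

assert min_simd(600) == 1   # small layers are unconstrained
assert min_simd(4096) == 4  # MW = 4096 needs SIMD >= 4
```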
@@ -1030,9 +1045,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -1106,8 +1121,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             ]
         elif mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
-            if wdt == DataType.BIPOLAR:
-                export_wdt = DataType.BINARY
+            if wdt == DataType["BIPOLAR"]:
+                export_wdt = DataType["BINARY"]
             else:
                 export_wdt = wdt
             wdtype_hls_str = export_wdt.get_hls_datatype_str()
@@ -1132,9 +1147,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -1194,6 +1209,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def pragmas(self):
         mem_mode = self.get_nodeattr("mem_mode")
+        ram_style_thresholds = self.get_nodeattr("ram_style_thresholds")
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
         in_fifo_depth = self.get_nodeattr("inFIFODepth")
@@ -1252,6 +1268,28 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     "complete dim=3"
                 )
             )
+            # add resource pragma for thresholds if set
+            if ram_style_thresholds == "distributed":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                        "core=ROM_2P_LUTRAM"
+                    )
+                )
+            elif ram_style_thresholds == "block":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                        "core=ROM_2P_BRAM"
+                    )
+                )
+            elif ram_style_thresholds == "auto":
+                # no pragma needed
+                pass
+            else:
+                raise Exception(
+                    "Unrecognized ram_style_thresholds value:" + ram_style_thresholds
+                )
 
     def code_generation_ipi(self):
         cmd = []
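The ram_style_thresholds handling added in the hunk above selects the memory primitive for the threshold ROM via an HLS RESOURCE pragma. A standalone sketch of that attribute-to-pragma mapping:

```python
# Standalone sketch of the attribute-to-pragma mapping in pragmas() above.
def ram_style_thresholds_pragma(ram_style):
    core = {"distributed": "ROM_2P_LUTRAM", "block": "ROM_2P_BRAM"}
    if ram_style == "auto":
        return None  # no pragma: let Vivado HLS decide
    if ram_style not in core:
        raise Exception("Unrecognized ram_style_thresholds value: " + ram_style)
    return (
        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
        "core=%s" % core[ram_style]
    )
```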
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 133a869b28cf9968a719e243a3266dfb25b637ba..91f6ed5b8d29fd72ea1fbb8a3da94cfc103af88e 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -25,16 +25,15 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import os
+import math
 import numpy as np
-from shutil import copy
+import os
 import subprocess
-import math
 import warnings
+from shutil import copy
 
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.core.datatype import DataType
-from onnx import TensorProto, helper
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 from . import templates
@@ -78,19 +77,7 @@ class StreamingFIFO(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == tuple(exp_ishape), "Unexpected input shape for StreamingFIFO."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -261,10 +248,10 @@ class StreamingFIFO(HLSCustomOp):
                 not float32 as expected."""
             expected_inp_shape = self.get_folded_input_shape()
             reshaped_input = inp.reshape(expected_inp_shape)
-            if DataType[self.get_nodeattr("dataType")] == DataType.BIPOLAR:
+            if DataType[self.get_nodeattr("dataType")] == DataType["BIPOLAR"]:
                 # store bipolar activations as binary
                 reshaped_input = (reshaped_input + 1) / 2
-                export_idt = DataType.BINARY
+                export_idt = DataType["BINARY"]
             else:
                 export_idt = DataType[self.get_nodeattr("dataType")]
             # make copy before saving the array
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 07e1197af54fe5267995cf15424a02df8a5e1500..1e66a5c204cc62bb7620907f82fcd5b2072bc184 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -26,13 +26,13 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
 import numpy as np
+import os
 import warnings
+
+from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.custom_op.general.im2col import compute_conv_output_dim
-from finn.core.datatype import DataType
-from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -41,8 +41,8 @@ class StreamingMaxPool_Batch(HLSCustomOp):
 
     def get_nodeattr_types(self):
         my_attrs = {
-            "ImgDim": ("i", True, 0),
-            "PoolDim": ("i", True, 0),
+            "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "PoolDim": ("ints", True, []),  # [H, W] = [Y, X]
             "NumChannels": ("i", True, 0),
             # FINN DataTypes for inputs/outputs
             "dataType": ("s", True, ""),
@@ -58,10 +58,27 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_normal_input_shape(self):
+    def get_1d_attrs_normalized(self):
+        # support both (1, D) and (D, 1) cases transparently:
+        # assume the dummy ('1') dimension is the Y-dimension, i.e.
+        # images and kernels (and their attributes) of dimension
+        # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D]
         ifm_dim = self.get_nodeattr("ImgDim")
+        k = self.get_nodeattr("PoolDim")
+        ifm_ch = self.get_nodeattr("NumChannels")
+        if ifm_dim[1] == 1:
+            ifm_dim = ifm_dim[::-1]
+            k = k[::-1]
+        return (ifm_dim, k, ifm_ch)
+
+    def is_1d(self):
+        ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
+        return (ifm_dim[0] == 1) and (k[0] == 1)
+
+    def get_normal_input_shape(self):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         ifm_ch = self.get_nodeattr("NumChannels")
-        ishape = (1, ifm_dim, ifm_dim, ifm_ch)
+        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
     def get_folded_input_shape(self):
@@ -73,14 +90,17 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         return tuple(ret)
 
     def get_normal_output_shape(self):
-        k = self.get_nodeattr("PoolDim")
-        ifm_dim = self.get_nodeattr("ImgDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
+        k_h, k_w = tuple(self.get_nodeattr("PoolDim"))
         ifm_ch = self.get_nodeattr("NumChannels")
-        stride = k
+        stride_h = k_h
+        stride_w = k_w
         pad = 0
-        assert ifm_dim % k == 0, "StreamingMaxPool needs ImgDim % PoolDim == 0"
-        ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
-        oshape = (1, ofm_dim, ofm_dim, ifm_ch)
+        assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0"
+        assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0"
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad)
+        oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch)
         return oshape
 
     def get_folded_output_shape(self):
@@ -97,9 +117,12 @@ class StreamingMaxPool_Batch(HLSCustomOp):
 
     def get_exp_cycles(self):
         # derived from StreamingMaxPool_Batch loop nest
-        k = self.get_nodeattr("PoolDim")
-        ifm_dim = self.get_nodeattr("ImgDim")
-        return int(ifm_dim * (ifm_dim + (ifm_dim / k)))
+        ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
+        if self.is_1d():
+            return int(ifm_dim[1] + k[1])
+        else:
+            # TODO: adjust inaccurate formula
+            return int(ifm_dim[1] * (ifm_dim[1] + (ifm_dim[1] / k[1])))
 
     def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
@@ -116,19 +139,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         oshape = self.get_normal_output_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape for StreamingMaxPool."
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -166,11 +177,13 @@ class StreamingMaxPool_Batch(HLSCustomOp):
 
     def defines(self, var):
         numReps = 2
+        ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
+
         self.code_gen_dict["$DEFINES$"] = [
             """#define ImgDim {}\n #define PoolDim {}\n
             #define NumChannels {}\n #define numReps {}""".format(
-                self.get_nodeattr("ImgDim"),
-                self.get_nodeattr("PoolDim"),
+                ifm_dim[1],
+                k[1],
                 self.get_nodeattr("NumChannels"),
                 numReps,
             )
@@ -179,9 +192,9 @@ class StreamingMaxPool_Batch(HLSCustomOp):
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_input_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -206,12 +219,18 @@ class StreamingMaxPool_Batch(HLSCustomOp):
     def docompute(self):
         dtype = self.get_input_datatype()
         if dtype.bitwidth() == 1:
-            op = "StreamingMaxPool_Batch"
+            if self.is_1d():
+                raise Exception("Binary 1d MaxPool not implemented on HLS backend")
+            else:
+                op = "StreamingMaxPool_Batch"
             self.code_gen_dict["$DOCOMPUTE$"] = [
                 "%s<ImgDim, PoolDim, NumChannels>(in0, out, numReps);" % (op)
             ]
         else:
-            op = "StreamingMaxPool_Precision_Batch"
+            if self.is_1d():
+                op = "StreamingMaxPool_Precision_Batch_1d"
+            else:
+                op = "StreamingMaxPool_Precision_Batch"
             dtype = self.get_input_datatype()
             dtype_hls = dtype.get_hls_datatype_str()
             minval_str = str(int(dtype.min()))
@@ -223,9 +242,9 @@ class StreamingMaxPool_Batch(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
@@ -291,10 +310,10 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             inp.shape == exp_ishape
         ), """Input shape doesn't
         match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
-        if self.get_input_datatype() == DataType.BIPOLAR:
+        if self.get_input_datatype() == DataType["BIPOLAR"]:
             # store bipolar activations as binary
             inp = (inp + 1) / 2
-            export_idt = DataType.BINARY
+            export_idt = DataType["BINARY"]
         else:
             export_idt = self.get_input_datatype()
         # no reshaping for input since assuming no folding on input
@@ -341,7 +360,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
                 )
             )
         # binary -> bipolar if needed
-        if self.get_output_datatype() == DataType.BIPOLAR:
+        if self.get_output_datatype() == DataType["BIPOLAR"]:
             out = context[node.output[0]]
             out = 2 * out - 1
             context[node.output[0]] = out
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 40221ce3b303fc9c1ec1851c7260c82dc3f0b40a..e253348598d72897c2a8f83f5bee04351eb43d32 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -88,12 +88,13 @@ puts "HW source dir: $config_hwsrcdir"
 set config_proj_part "$FPGAPART$"
 
 set config_bnnlibdir "$FINNHLSLIBDIR$"
+set config_customhlsdir "$FINNHLSCUSTOMDIR$"
 
 set config_toplevelfxn "$TOPFXN$"
 set config_clkperiod $CLKPERIOD$
 
 open_project $config_proj_name
-add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++0x -I$config_bnnlibdir"
+add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++0x -I$config_bnnlibdir -I$config_customhlsdir"
 
 set_top $config_toplevelfxn
 open_solution sol1
@@ -355,56 +356,3 @@ $LAYER_NAME$
 
 endmodule
 """
-
-decoupled_thresholding_template = """
-template <
-    unsigned ImgDim, unsigned NumChannels, unsigned PE,
-    typename TSrcI = Identity, typename TDstI = Identity,
-    int ActVal=0, typename TT, unsigned int NumSteps,
-    typename TI, typename TO>
-void Thresholding_Stream_Batch(hls::stream<TI> &in,
-                        hls::stream<TO> &out,
-                        hls::stream<ap_uint<PE*NumSteps*TT::width>> &weight,
-                        int const reps)
-{
-
-  // how many different rows each neuron will compute
-  // alternatively: number of vertical matrix chunks
-  unsigned const NF = NumChannels / PE;
-
-  ThresholdsActivation<1, PE, NumSteps, TT, TO, ActVal, comp::less_equal<TT>> internal_thr;
-  #pragma HLS ARRAY_PARTITION variable=internal_thr.m_thresholds complete dim=0
-
-  // everything merged into a common iteration space (one "big" loop instead
-  // of smaller nested loops) to get the pipelinening the way we want
-  for (unsigned i = 0; i < reps * ImgDim * ImgDim * NF; i++)
-  {
-    #pragma HLS PIPELINE II=1
-
-    ap_uint<PE*NumSteps*TT::width> packed_thr;
-    packed_thr = weight.read();
-    // slicer to get 1 PE's worth of thresholds
-    auto const pe_slicer = Slice<ap_uint<NumSteps*TT::width>>()(packed_thr);
-
-    TI inElem;
-    inElem = in.read();
-    auto outElem = TDstI().template operator()<TO>();
-
-    for (unsigned pe = 0; pe < PE; pe++)
-    {
-#pragma HLS UNROLL
-      // slicer to get individual thresholds
-      auto const thr_slicer = Slice<TT>()(pe_slicer(pe, 0));
-      for (unsigned nt = 0; nt < NumSteps; nt++)
-      {
-      #pragma HLS UNROLL
-        internal_thr.m_thresholds[pe][0][nt] = thr_slicer(nt, 0);
-      }
-
-      auto const act = TSrcI()(inElem);
-      outElem(pe,0,1) = internal_thr.activate(0, pe, act(pe,0));
-    }
-    out.write(outElem);
-  }
-}
-"""
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 0b248c15035a2b685ebfb024c8a944a6ea6c65bf..610139f44ee7e8be1320b47c99222667fa6ed850 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -26,13 +26,12 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from math import ceil, log2
-import textwrap
+import numpy as np
 import os
+import textwrap
 import warnings
-import numpy as np
+from math import ceil, log2
 
-from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.basic import (
@@ -42,9 +41,10 @@ from finn.util.basic import (
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
-    rtlsim_output_to_npy,
     pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
 )
+
 from . import templates
 
 # ONNX i/o tensor shape assumptions for Thresholding:
@@ -111,19 +111,7 @@ class Thresholding_Batch(HLSCustomOp):
 
     def make_shape_compatible_op(self, model):
         oshape = self.get_normal_output_shape()
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -131,8 +119,8 @@ class Thresholding_Batch(HLSCustomOp):
         if idt != self.get_input_datatype():
             warn_str = "inputDataType changing for %s: %s -> %s " % (
                 node.name,
-                str(self.get_input_datatype()),
-                str(idt),
+                str(self.get_input_datatype().name),
+                str(idt.name),
             )
             warnings.warn(warn_str)
         self.set_nodeattr("inputDataType", idt.name)
@@ -180,7 +168,7 @@ class Thresholding_Batch(HLSCustomOp):
             return 0
 
     def lut_estimation(self):
-        """Calculates LUT cost, taking memory resource type into account """
+        """Calculates LUT cost, taking memory resource type into account"""
         # TODO add in/out FIFO contributions
         style = self.get_nodeattr("ram_style")
         P = self.get_nodeattr("PE")
@@ -224,7 +212,7 @@ class Thresholding_Batch(HLSCustomOp):
             if abs(tdt_min) > tdt_max:
                 tdt = DataType.get_smallest_possible(tdt_min)
             else:
-                tdt = DataType.get_smallest_possible(0 - tdt_max - 1)
+                tdt = DataType.get_smallest_possible(-tdt_max - 1)
         else:
             tdt = DataType.get_smallest_possible(tdt_max)
         assert np.vectorize(tdt.allowed)(
@@ -335,7 +323,7 @@ class Thresholding_Batch(HLSCustomOp):
         ).all(), "Need int threshold tensor"
         ret = orig_thres_matrix
         # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0:
+        if ret[0][0] == 0 and n_thres_steps == 1:
             ret = np.copy(ret)
             ret[0][0] = 1
             warnings.warn(
@@ -389,8 +377,8 @@ class Thresholding_Batch(HLSCustomOp):
             tdt_hls = tdt.get_hls_datatype_str()
             # use binary to export bipolar activations
             export_odt = self.get_output_datatype()
-            if self.get_output_datatype() == DataType.BIPOLAR:
-                export_odt = DataType.BINARY
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
+                export_odt = DataType["BINARY"]
             odt_hls = export_odt.get_hls_datatype_str()
             f_thresh.write(
                 "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
@@ -514,10 +502,10 @@ class Thresholding_Batch(HLSCustomOp):
                 not float32 as expected."""
                 expected_inp_shape = self.get_folded_input_shape()
                 reshaped_input = context[inputs].reshape(expected_inp_shape)
-                if self.get_input_datatype() == DataType.BIPOLAR:
+                if self.get_input_datatype() == DataType["BIPOLAR"]:
                     # store bipolar activations as binary
                     reshaped_input = (reshaped_input + 1) / 2
-                    export_idt = DataType.BINARY
+                    export_idt = DataType["BINARY"]
                 else:
                     export_idt = self.get_input_datatype()
                 # make copy before saving the array
@@ -536,7 +524,7 @@ class Thresholding_Batch(HLSCustomOp):
             # load output npy file
             super().npy_to_dynamic_output(context)
             # reinterpret binary output as bipolar where needed
-            if self.get_output_datatype() == DataType.BIPOLAR:
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
                 out = context[node.output[0]]
                 out = 2 * out - 1
                 context[node.output[0]] = out
@@ -604,7 +592,9 @@ class Thresholding_Batch(HLSCustomOp):
         numReps = numInputVectors[0]
         self.code_gen_dict["$DEFINES$"] = [
             """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}""".format(
-                self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps,
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("PE"),
+                numReps,
             )
         ]
         if self.get_nodeattr("mem_mode") == "decoupled":
@@ -618,10 +608,6 @@ class Thresholding_Batch(HLSCustomOp):
             self.code_gen_dict["$DEFINES$"].append(
                 "#define NumSteps1 %d" % self.get_nodeattr("numSteps")
             )
-            # TODO remove once Thresholding_Stream_Batch is in hlslib:
-            self.code_gen_dict["$DEFINES$"].append(
-                templates.decoupled_thresholding_template
-            )
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -676,25 +662,32 @@ class Thresholding_Batch(HLSCustomOp):
         node = self.onnx_node
         ishape = self.get_folded_input_shape()
         if len(ishape) == 3:
-            imgdim = 1
+            imgdimh = 1
+            imgdimw = 1
         elif len(ishape) == 5:
-            imgdim = ishape[1]
+            imgdimh = ishape[1]
+            imgdimw = ishape[2]
         else:
-            raise Exception("""Unexpeted input shape""")
+            raise Exception("""Unexpected input shape""")
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "const":
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<{}, NumChannels1, PE1, {}, {}>
+                """{}<{}, {}, NumChannels1, PE1, {}, {}>
                 (in0, out, threshs, numReps);""".format(
-                    node.op_type, imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"],
+                    node.op_type,
+                    imgdimh,
+                    imgdimw,
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
                 )
             ]
         elif mem_mode == "decoupled":
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
+                """{}<{}, {}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
                 (in0, out, weights, numReps);""".format(
                     "Thresholding_Stream_Batch",
-                    imgdim,
+                    imgdimh,
+                    imgdimw,
                     tmpl_args["TSrcI"],
                     tmpl_args["TDstI"],
                 )
@@ -705,9 +698,9 @@ class Thresholding_Batch(HLSCustomOp):
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         dtype = self.get_output_datatype()
-        if dtype == DataType.BIPOLAR:
+        if dtype == DataType["BIPOLAR"]:
             # use binary for bipolar storage
-            dtype = DataType.BINARY
+            dtype = DataType["BINARY"]
         elem_bits = dtype.bitwidth()
         packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..7114cd83ed08b53eab2cfe38d98d84944d537168
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/upsampler.py
@@ -0,0 +1,311 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+import warnings
+
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class UpsampleNearestNeighbour_Batch(HLSCustomOp):
+    """
+    Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function.
+    Upsampling is done with the Nearest Neighbour algorithm.
+    The layer expects square feature maps for the input and output.
+    """
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # Size of the output feature map
+            "OFMDim": ("i", True, 0),
+            # Size of the input feature map
+            "IFMDim": ("i", True, 0),
+            # Amount of channels of the input feature map
+            "NumChannels": ("i", True, 0),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # Batch size
+            "numInputVectors": ("i", False, 1),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_exp_cycles(self):
+        OFMDim = self.get_nodeattr("OFMDim")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = OFMDim * OFMDim * batch_size
+        return int(exp_cycles)
+
+    def get_normal_input_shape(self):
+        IFMDim = self.get_nodeattr("IFMDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        batch = self.get_nodeattr("numInputVectors")
+        ishape = (batch, IFMDim, IFMDim, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self):
+        OFMDim = self.get_nodeattr("OFMDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        batch = self.get_nodeattr("numInputVectors")
+        oshape = (batch, OFMDim, OFMDim, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self):
+        normal_ishape = list(self.get_normal_input_shape())
+        return tuple(normal_ishape)
+
+    def get_folded_output_shape(self):
+        normal_oshape = list(self.get_normal_output_shape())
+        return tuple(normal_oshape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert (
+            ishape == exp_ishape
+        ), "Unexpect input shape for UpsampleNearestNeighbour_Batch."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        return ret
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self):
+        ibits = self.get_input_datatype().bitwidth()
+        ifm_ch = self.get_nodeattr("NumChannels")
+        return ibits * ifm_ch
+
+    def get_outstream_width(self):
+        obits = self.get_output_datatype().bitwidth()
+        ifm_ch = self.get_nodeattr("NumChannels")
+        return obits * ifm_ch
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+        ifm_ch = self.get_nodeattr("NumChannels")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
+
+        ibits = self.get_input_datatype().bitwidth()
+        self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
+
+        idim = self.get_nodeattr("IFMDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
+
+        odim = self.get_nodeattr("OFMDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)]
+
+        batch_size = self.get_nodeattr("numInputVectors")
+        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """UpsampleNearestNeighbour_Batch<OFMDim, IFMDim, IFMChannels,
+            ap_uint<Input_precision> > (in0, out, numReps);"""
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_hls_type, packed_hls_type)
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (numInputVectors, IFMDim, IFMDim, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+            (numInputVectors, OFMDim, OFMDim, NumChannels)."""
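To make the new node's bookkeeping concrete, here is the shape and cycle arithmetic of UpsampleNearestNeighbour_Batch under hypothetical attribute values:

```python
# Shape/cycle bookkeeping of the new node under hypothetical attribute values,
# mirroring get_normal_*_shape and get_exp_cycles above.
IFMDim, OFMDim, NumChannels, numInputVectors = 8, 16, 3, 1

ishape = (numInputVectors, IFMDim, IFMDim, NumChannels)  # (1, 8, 8, 3)
oshape = (numInputVectors, OFMDim, OFMDim, NumChannels)  # (1, 16, 16, 3)
exp_cycles = OFMDim * OFMDim * numInputVectors           # 256 output pixels, one per cycle
```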
diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
index fead30650c60d38f9cd70de8f1515f847e15276f..e0f789a8883aad83ed8c8b37a16392308bc720cc 100644
--- a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
+++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
@@ -1,20 +1,48 @@
-import os
-import numpy as np
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import math
-from onnx import TensorProto, helper
+import numpy as np
+import os
+import warnings
+
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.basic import (
+    calculate_matvec_accumulator_range,
     interleave_matrix_outer_dim_from_partitions,
     roundup_to_integer_multiple,
-    calculate_matvec_accumulator_range,
 )
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
     rtlsim_output_to_npy,
 )
-import warnings
 
 
 class Vector_Vector_Activate_Batch(HLSCustomOp):
@@ -82,7 +110,7 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
                 if abs(tdt_min) > tdt_max:
                     tdt = DataType.get_smallest_possible(tdt_min)
                 else:
-                    tdt = DataType.get_smallest_possible(0 - tdt_max)
+                    tdt = DataType.get_smallest_possible(-tdt_max - 1)
             else:
                 tdt = DataType.get_smallest_possible(tdt_max)
             assert np.vectorize(tdt.allowed)(
@@ -97,7 +125,7 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
                 if abs(acc_min) > acc_max:
                     adt = DataType.get_smallest_possible(acc_min)
                 else:
-                    adt = DataType.get_smallest_possible(0 - acc_max)
+                    adt = DataType.get_smallest_possible(-acc_max - 1)
             else:
                 adt = DataType.get_smallest_possible(acc_max)
             # ensure a datatype divisible by 8-bits in case this is the last node
@@ -128,19 +156,7 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
 
     def make_shape_compatible_op(self, model):
         oshape = self.get_normal_output_shape()
-        # implement tensor with correct shape
-        values = np.random.randn(*oshape).astype(np.float32)
-        return helper.make_node(
-            "Constant",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            value=helper.make_tensor(
-                name="const_tensor",
-                data_type=TensorProto.FLOAT,
-                dims=values.shape,
-                vals=values.flatten().astype(float),
-            ),
-        )
+        return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -235,8 +251,8 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         ret = dict()
         inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
         out_hls_str = self.get_output_datatype().get_hls_datatype_str()
-        inp_is_bipolar = self.get_input_datatype() == DataType.BIPOLAR
-        wt_is_bipolar = self.get_weight_datatype() == DataType.BIPOLAR
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
         # fill in TSrcI and TWeightI
         # TODO handle bipolar inputs
         if inp_is_bipolar or wt_is_bipolar:
diff --git a/src/finn/qnn-data/build_dataflow/dataflow_build_config.json b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json
index 5e4cf2d3028fb48cbc768b744bb0144d4f0d5fda..27ec38f6a4eb55c99dc4805f91d6e388e735308c 100644
--- a/src/finn/qnn-data/build_dataflow/dataflow_build_config.json
+++ b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json
@@ -6,6 +6,7 @@
   "board": "Pynq-Z1",
   "standalone_thresholds": true,
   "shell_flow_type": "vivado_zynq",
+  "verify_save_rtlsim_waveforms": true,
   "verify_steps": [
     "initial_python",
     "streamlined_python",
diff --git a/src/finn/qnn-data/cpp/npy2apintstream.hpp b/src/finn/qnn-data/cpp/npy2apintstream.hpp
index f3afbc5bfb16e2423184e334e78b96a8cdeef45c..6aade3a2bbe2ba9914728802a8a6a448ef2d9fb2 100644
--- a/src/finn/qnn-data/cpp/npy2apintstream.hpp
+++ b/src/finn/qnn-data/cpp/npy2apintstream.hpp
@@ -3,6 +3,7 @@
 #include "hls_stream.h"
 #include "ap_int.h"
 #include <vector>
+#include <stdio.h>
 
 #ifdef DEBUG
 #define DEBUG_NPY2APINTSTREAM(x) std::cout << "[npy2apintstream] " << x << std::endl;
@@ -34,7 +35,7 @@ void npy2apintstream(const char * npy_path, hls::stream<PackedT> & out_stream, b
         NpyT loaded_elem_npyt = *loaded_data;
         ElemT loaded_elem = (ElemT) loaded_elem_npyt;
         DEBUG_NPY2APINTSTREAM("NpyT " << loaded_elem_npyt << " elem " << loaded_elem)
-        packed_elem((i+1)*ElemBits-1, i*ElemBits) = loaded_elem;
+        packed_elem((i+1)*ElemBits-1, i*ElemBits) = *reinterpret_cast<ap_uint<ElemBits>*>(&loaded_elem);
         loaded_data++;
       }
       DEBUG_NPY2APINTSTREAM("packed hls elem " << std::hex << packed_elem << std::dec)
@@ -44,25 +45,34 @@ void npy2apintstream(const char * npy_path, hls::stream<PackedT> & out_stream, b
 }
 
 template <typename PackedT, typename ElemT, int ElemBits, typename NpyT>
-void apintstream2npy(hls::stream<PackedT> & in_stream, const std::vector<size_t> & shape, const char * npy_path, bool reverse_inner = true, size_t numReps = 1) {
+void apintstream2npy(hls::stream<PackedT> & in_stream, const std::vector<size_t> & shape, const char * npy_path, bool reverse_inner = true, size_t numReps = 1, size_t multi_pixel_out = 1) {
   for(size_t rep = 0; rep < numReps; rep++) {
     std::vector<NpyT> data_to_save;
     size_t outer_dim_elems = 1;
     for(size_t dim = 0; dim < shape.size()-1; dim++) {
       outer_dim_elems *= shape[dim];
     }
-    size_t inner_dim_elems = shape[shape.size()-1];
-    DEBUG_APINTSTREAM2NPY("n_outer " << outer_dim_elems << " n_inner " << inner_dim_elems)
+    size_t inner_dim_elems = shape[shape.size()-1] / multi_pixel_out;
+    DEBUG_APINTSTREAM2NPY("n_outer " << outer_dim_elems << " n_inner " << inner_dim_elems << " n_multi_pixel_out " << multi_pixel_out)
     for(size_t outer_elem = 0; outer_elem < outer_dim_elems; outer_elem++) {
       PackedT packed_elem;
       in_stream >> packed_elem;
       DEBUG_APINTSTREAM2NPY("packed hls elem " << std::hex << packed_elem << std::dec)
-      for(size_t ii = 0; ii < inner_dim_elems; ii++) {
-        size_t i = reverse_inner ? inner_dim_elems-ii-1 : ii;
-        ElemT elem = packed_elem((i+1)*ElemBits-1, i*ElemBits);
-        NpyT npyt = (NpyT) elem;
-        DEBUG_APINTSTREAM2NPY("elem " << elem << " NpyT " << npyt)
-        data_to_save.push_back(npyt);
+      for(size_t ii_multi_pixel_out = 0; ii_multi_pixel_out < multi_pixel_out; ii_multi_pixel_out++) {
+        // loop over multi_pixel_out blocks of inner_dim_elems separately,
+        // so that reverse_inner is not applied across multiple pixels
+        for(size_t ii = 0; ii < inner_dim_elems; ii++) {
+          size_t i = ii_multi_pixel_out*inner_dim_elems;
+          i += reverse_inner ? inner_dim_elems-ii-1 : ii;
+          ap_uint<ElemBits> tmp_elem = packed_elem((i+1)*ElemBits-1, i*ElemBits);
+          // important: don't init elem = reinterpret_cast.. directly here
+          // this causes weird behavior for conversion to NpyT afterwards
+          ElemT elem;
+          elem = reinterpret_cast<ElemT&>(tmp_elem);
+          NpyT npyt = (NpyT) elem;
+          DEBUG_APINTSTREAM2NPY("elem " << elem << " NpyT " << npyt)
+          data_to_save.push_back(npyt);
+        }
       }
     }
     cnpy::npy_save(npy_path, &data_to_save[0], shape, "w");
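The apintstream2npy change applies reverse_inner per multi_pixel_out block instead of across the whole packed word. A Python sketch of the new index order, using the same arithmetic as the C++ loop above:

```python
# Python sketch of the new unpack order in apintstream2npy: reverse_inner is
# applied within each multi_pixel_out block, never across pixel boundaries.
def unpack_order(inner_dim_elems, multi_pixel_out, reverse_inner):
    order = []
    for blk in range(multi_pixel_out):
        for ii in range(inner_dim_elems):
            i = blk * inner_dim_elems
            i += inner_dim_elems - ii - 1 if reverse_inner else ii
            order.append(i)
    return order

# two pixels of three elements each, reversed independently per pixel:
assert unpack_order(3, 2, True) == [2, 1, 0, 5, 4, 3]
```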
diff --git a/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py b/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py
index 2fabc716a66a3cc24697e49aa26ec3bbbb231b43..be09abad9c10b8b5e9a32e21233107421fdef95e 100644
--- a/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py
+++ b/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py
@@ -27,9 +27,9 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
+import numpy as np
 from driver import io_shape_dict
 from driver_base import FINNExampleOverlay
-import numpy as np
 
 
 def make_unsw_nb15_test_batches(bsize, dataset_root, limit_batches):
diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py
index df3c9881372659a4d8f6fceb8a385e6055c161e1..b6dd8350809a33ab5dad3e21b0f52f41cbe872ec 100644
--- a/src/finn/qnn-data/templates/driver/driver_base.py
+++ b/src/finn/qnn-data/templates/driver/driver_base.py
@@ -27,19 +27,18 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
-import time
 import os
+import time
 from pynq import Overlay, allocate
 from pynq.ps import Clocks
 
+from finn.core.datatype import DataType
+from finn.util.basic import gen_finn_dt_tensor
 from finn.util.data_packing import (
     finnpy_to_packed_bytearray,
     packed_bytearray_to_finnpy,
 )
 
-from finn.util.basic import gen_finn_dt_tensor
-from finn.core.datatype import DataType
-
 # Driver base class for FINN-generated dataflow accelerators.
 # The particulars of the generated accelerator are specified via the
 # io_shape_dict (generated by the MakePYNQDriver transformation).
@@ -86,24 +85,27 @@ class FINNExampleOverlay(Overlay):
         self.platform = platform
         self.batch_size = batch_size
         self.fclk_mhz = fclk_mhz
-        if self.platform == "alveo":
-            if "input_dma_name" in io_shape_dict.keys():
-                self.idma = getattr(self, io_shape_dict["input_dma_name"])
-            else:
-                self.idma = self.idma0
-            self.odma = self.odma0
-            self.odma_handle = None
-        elif self.platform == "zynq-iodma":
-            if "input_dma_name" in io_shape_dict.keys():
-                self.idma = getattr(self, io_shape_dict["input_dma_name"])
-            else:
-                self.idma = self.idma0
-            self.odma = self.odma0
+        self.idma = []
+        self.odma = []
+        self.odma_handle = []
+        if "input_dma_name" in io_shape_dict.keys():
+            for idma_name in io_shape_dict["input_dma_name"]:
+                self.idma.append(getattr(self, idma_name))
+        else:
+            self.idma = [self.idma0]
+        if "output_dma_name" in io_shape_dict.keys():
+            for odma_name in io_shape_dict["output_dma_name"]:
+                self.odma.append(getattr(self, odma_name))
+                if self.platform == "alveo":
+                    self.odma_handle.append(None)
+        else:
+            self.odma = [self.odma0]
+            if self.platform == "alveo":
+                self.odma_handle.append(None)
+        if self.platform == "zynq-iodma":
             # set the clock frequency as specified by user during transformations
             if self.fclk_mhz > 0:
                 Clocks.fclk0_mhz = self.fclk_mhz
-        else:
-            raise ValueError("Supported platforms are zynq-iodma alveo")
         # load any external + runtime weights
         self.load_external_weights()
         self.load_runtime_weights()
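
To illustrate what the constructor above consumes, a hypothetical `io_shape_dict` fragment for a two-input, one-output accelerator could look as follows (all names and shapes are made up; the real dict is generated by the MakePYNQDriver transformation):

```python
# Hypothetical io_shape_dict fragment (illustrative values only).
from finn.core.datatype import DataType

io_shape_dict = {
    "input_dma_name": ["idma0", "idma1"],  # one DMA per input stream
    "output_dma_name": ["odma0"],          # one DMA per output stream
    "num_inputs": 2,
    "num_outputs": 1,
    # per-stream entries, matched by position to the DMA name lists:
    "idt": [DataType["UINT8"], DataType["UINT8"]],
    "odt": [DataType["INT8"]],
    "ishape_normal": [(1, 32, 32, 3), (1, 10)],
    "oshape_normal": [(1, 10)],
    # folded/packed shape lists omitted for brevity
}
```
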
@@ -205,50 +207,50 @@ class FINNExampleOverlay(Overlay):
             # run accelerator to flush any stale weights from weight streamer FIFOs
             self.execute_on_buffers()
 
-    @property
-    def idt(self):
-        return self._io_shape_dict["idt"]
+    def idt(self, ind=0):
+        return self._io_shape_dict["idt"][ind]
 
-    @property
-    def odt(self):
-        return self._io_shape_dict["odt"]
+    def odt(self, ind=0):
+        return self._io_shape_dict["odt"][ind]
 
-    @property
-    def ishape_normal(self):
-        ret = list(self._io_shape_dict["ishape_normal"])
+    def ishape_normal(self, ind=0):
+        ret = list(self._io_shape_dict["ishape_normal"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def oshape_normal(self):
-        ret = list(self._io_shape_dict["oshape_normal"])
+    def oshape_normal(self, ind=0):
+        ret = list(self._io_shape_dict["oshape_normal"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def ishape_folded(self):
-        ret = list(self._io_shape_dict["ishape_folded"])
+    def ishape_folded(self, ind=0):
+        ret = list(self._io_shape_dict["ishape_folded"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def oshape_folded(self):
-        ret = list(self._io_shape_dict["oshape_folded"])
+    def oshape_folded(self, ind=0):
+        ret = list(self._io_shape_dict["oshape_folded"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def ishape_packed(self):
-        ret = list(self._io_shape_dict["ishape_packed"])
+    def ishape_packed(self, ind=0):
+        ret = list(self._io_shape_dict["ishape_packed"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
-    @property
-    def oshape_packed(self):
-        ret = list(self._io_shape_dict["oshape_packed"])
+    def oshape_packed(self, ind=0):
+        ret = list(self._io_shape_dict["oshape_packed"][ind])
         ret[0] = self.batch_size
         return tuple(ret)
 
+    @property
+    def num_inputs(self):
+        return self._io_shape_dict["num_inputs"]
+
+    @property
+    def num_outputs(self):
+        return self._io_shape_dict["num_outputs"]
+
     @property
     def batch_size(self):
         return self._batch_size
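
Since the shape/datatype accessors change from properties to index-taking methods here, a toy stand-in shows the new call pattern and the batch-size substitution (illustration only, not the real driver class):

```python
# Toy stand-in mirroring the indexed accessor API above.
class FakeOverlay:
    def __init__(self, io_shape_dict, batch_size=1):
        self._io_shape_dict = io_shape_dict
        self.batch_size = batch_size

    def ishape_normal(self, ind=0):
        ret = list(self._io_shape_dict["ishape_normal"][ind])
        ret[0] = self.batch_size  # first dim is replaced by the batch size
        return tuple(ret)

accel = FakeOverlay({"ishape_normal": [(1, 32, 32, 3), (1, 10)]}, batch_size=4)
print(accel.ishape_normal())   # (4, 32, 32, 3) -- index defaults to 0
print(accel.ishape_normal(1))  # (4, 10)
```
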
@@ -262,68 +264,72 @@ class FINNExampleOverlay(Overlay):
             self.ibuf_packed_device = None
         if self.obuf_packed_device is not None:
             self.obuf_packed_device = None
-        if self.platform == "alveo":
-            self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
-            self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8)
-        else:
-            self.ibuf_packed_device = allocate(
-                shape=self.ishape_packed, dtype=np.uint8, cacheable=True
+        cacheable = {"alveo": False, "zynq-iodma": True}[self.platform]
+        self.ibuf_packed_device = []
+        self.obuf_packed_device = []
+        self.obuf_packed = []
+        for i in range(self.num_inputs):
+            new_packed_ibuf = allocate(
+                shape=self.ishape_packed(i), dtype=np.uint8, cacheable=cacheable
             )
-            self.obuf_packed_device = allocate(
-                shape=self.oshape_packed, dtype=np.uint8, cacheable=True
+            self.ibuf_packed_device.append(new_packed_ibuf)
+        for o in range(self.num_outputs):
+            new_packed_obuf = allocate(
+                shape=self.oshape_packed(o), dtype=np.uint8, cacheable=cacheable
             )
-        self.obuf_packed = np.empty_like(self.obuf_packed_device)
+            self.obuf_packed_device.append(new_packed_obuf)
+            self.obuf_packed.append(np.empty_like(new_packed_obuf))
 
-    def fold_input(self, ibuf_normal):
+    def fold_input(self, ibuf_normal, ind=0):
         """Reshapes input in desired shape.
         Gets input data (ibuf_normal), checks if data is in expected normal shape.
         Returns folded input."""
         # ensure that shape is as expected
-        assert ibuf_normal.shape == self.ishape_normal
+        assert ibuf_normal.shape == self.ishape_normal(ind)
         # convert to folded form
-        ibuf_folded = ibuf_normal.reshape(self.ishape_folded)
+        ibuf_folded = ibuf_normal.reshape(self.ishape_folded(ind))
         return ibuf_folded
 
-    def pack_input(self, ibuf_folded):
+    def pack_input(self, ibuf_folded, ind=0):
         """Packs folded input and reverses both SIMD dim and endianness.
         Gets input data in folded shape and returns packed input data."""
         ibuf_packed = finnpy_to_packed_bytearray(
             ibuf_folded,
-            self.idt,
+            self.idt(ind),
             reverse_endian=True,
             reverse_inner=True,
             fast_mode=True,
         )
         return ibuf_packed
 
-    def unpack_output(self, obuf_packed):
+    def unpack_output(self, obuf_packed, ind=0):
         """Unpacks the packed output buffer from accelerator.
         Gets packed output and returns output data in folded shape."""
         obuf_folded = packed_bytearray_to_finnpy(
             obuf_packed,
-            self.odt,
-            self.oshape_folded,
+            self.odt(ind),
+            self.oshape_folded(ind),
             reverse_endian=True,
             reverse_inner=True,
             fast_mode=True,
         )
         return obuf_folded
 
-    def unfold_output(self, obuf_folded):
+    def unfold_output(self, obuf_folded, ind=0):
         """Unfolds output data to normal shape.
         Gets folded output data and returns output data in normal shape."""
-        obuf_normal = obuf_folded.reshape(self.oshape_normal)
+        obuf_normal = obuf_folded.reshape(self.oshape_normal(ind))
         return obuf_normal
 
-    def copy_input_data_to_device(self, data):
+    def copy_input_data_to_device(self, data, ind=0):
         """Copies given input data to PYNQ buffer."""
-        np.copyto(self.ibuf_packed_device, data)
-        self.ibuf_packed_device.flush()
+        np.copyto(self.ibuf_packed_device[ind], data)
+        self.ibuf_packed_device[ind].flush()
 
-    def copy_output_data_from_device(self, data):
+    def copy_output_data_from_device(self, data, ind=0):
         """Copies PYNQ output buffer from device."""
-        self.obuf_packed_device.invalidate()
-        np.copyto(data, self.obuf_packed_device)
+        self.obuf_packed_device[ind].invalidate()
+        np.copyto(data, self.obuf_packed_device[ind])
 
     def execute_on_buffers(self, asynch=False, batch_size=None):
         """Executes accelerator by setting up the DMA(s) on pre-allocated buffers.
@@ -339,24 +345,36 @@ class FINNExampleOverlay(Overlay):
             batch_size = self.batch_size
         assert batch_size <= self.batch_size, "Specified batch_size is too large."
         if self.platform == "zynq-iodma":
-            assert self.odma.read(0x00) & 0x4 != 0, "Output DMA is not idle"
+            for o in range(self.num_outputs):
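+                # status/control reg at 0x00: bit 0x2 = done, bit 0x4 = idle
+                # (same bits polled in wait_until_finished below)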
+                assert (
+                    self.odma[o].read(0x00) & 0x4 != 0
+                ), "Output DMA %d is not idle" % (o)
             # manually launch IODMAs since signatures are missing
             for iwdma, iwbuf, iwdma_name in self.external_weights:
                 iwdma.write(0x10, iwbuf.device_address)
                 iwdma.write(0x1C, batch_size)
                 iwdma.write(0x00, 1)
-            self.idma.write(0x10, self.ibuf_packed_device.device_address)
-            self.idma.write(0x1C, batch_size)
-            self.odma.write(0x10, self.obuf_packed_device.device_address)
-            self.odma.write(0x1C, batch_size)
-            self.idma.write(0x00, 1)
-            self.odma.write(0x00, 1)
+            for o in range(self.num_outputs):
+                self.odma[o].write(0x10, self.obuf_packed_device[o].device_address)
+                self.odma[o].write(0x1C, batch_size)
+                self.odma[o].write(0x00, 1)
+            for i in range(self.num_inputs):
+                self.idma[i].write(0x10, self.ibuf_packed_device[i].device_address)
+                self.idma[i].write(0x1C, batch_size)
+                self.idma[i].write(0x00, 1)
         elif self.platform == "alveo":
-            assert self.odma_handle is None, "Output DMA is already running"
-            self.idma.start(self.ibuf_packed_device, batch_size)
+            for o in range(self.num_outputs):
+                assert self.odma_handle[o] is None, (
+                    "Output DMA %d is already running" % o
+                )
+            for i in range(self.num_inputs):
+                self.idma[i].start(self.ibuf_packed_device[i], batch_size)
             for iwdma, iwbuf, iwdma_name in self.external_weights:
                 iwdma.start(iwbuf, batch_size)
-            self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size)
+            for o in range(self.num_outputs):
+                self.odma_handle[o] = self.odma[o].start(
+                    self.obuf_packed_device[o], batch_size
+                )
         else:
             raise Exception("Unrecognized platform: %s" % self.platform)
         # blocking behavior depends on asynch parameter
@@ -364,31 +382,48 @@ class FINNExampleOverlay(Overlay):
             self.wait_until_finished()
 
     def wait_until_finished(self):
-        "Block until the output DMA has finished writing."
+        "Block until all output DMAs have finished writing."
         if self.platform == "zynq-iodma":
             # check if output IODMA is finished via register reads
-            status = self.odma.read(0x00)
-            while status & 0x2 == 0:
-                status = self.odma.read(0x00)
+            for o in range(self.num_outputs):
+                status = self.odma[o].read(0x00)
+                while status & 0x2 == 0:
+                    status = self.odma[o].read(0x00)
         elif self.platform == "alveo":
-            assert self.odma_handle is not None, "No odma_handle to wait on"
-            self.odma_handle.wait()
-            self.odma_handle = None
+            assert all(
+                [x is not None for x in self.odma_handle]
+            ), "No odma_handle to wait on"
+            for o in range(self.num_outputs):
+                self.odma_handle[o].wait()
+                self.odma_handle[o] = None
         else:
             raise Exception("Unrecognized platform: %s" % self.platform)
 
     def execute(self, input_npy):
-        """Given input numpy array, first perform necessary packing and copying
-        to device buffers, execute on accelerator, then unpack output and return
-        output numpy array from accelerator."""
-        ibuf_folded = self.fold_input(input_npy)
-        ibuf_packed = self.pack_input(ibuf_folded)
-        self.copy_input_data_to_device(ibuf_packed)
+        """Given a single or a list of input numpy array, first perform necessary
+        packing and copying to device buffers, execute on accelerator, then unpack
+        output and return output numpy array from accelerator."""
+        # if single input, convert to list to normalize how we process the input
+        if not isinstance(input_npy, list):
+            input_npy = [input_npy]
+        assert self.num_inputs == len(
+            input_npy
+        ), "Not all accelerator inputs are specified."
+        for i in range(self.num_inputs):
+            ibuf_folded = self.fold_input(input_npy[i], ind=i)
+            ibuf_packed = self.pack_input(ibuf_folded, ind=i)
+            self.copy_input_data_to_device(ibuf_packed, ind=i)
         self.execute_on_buffers()
-        self.copy_output_data_from_device(self.obuf_packed)
-        obuf_folded = self.unpack_output(self.obuf_packed)
-        obuf_normal = self.unfold_output(obuf_folded)
-        return obuf_normal
+        outputs = []
+        for o in range(self.num_outputs):
+            self.copy_output_data_from_device(self.obuf_packed[o], ind=o)
+            obuf_folded = self.unpack_output(self.obuf_packed[o], ind=o)
+            obuf_normal = self.unfold_output(obuf_folded, ind=o)
+            outputs.append(obuf_normal)
+        if self.num_outputs == 1:
+            return outputs[0]
+        else:
+            return outputs
 
     def throughput_test(self):
         """Run accelerator with empty inputs to measure throughput and other metrics.
@@ -401,12 +436,14 @@ class FINNExampleOverlay(Overlay):
         runtime = end - start
         res["runtime[ms]"] = runtime * 1000
         res["throughput[images/s]"] = self.batch_size / runtime
-        res["DRAM_in_bandwidth[Mb/s]"] = (
-            np.prod(self.ishape_packed) * 0.000001 / runtime
-        )
-        res["DRAM_out_bandwidth[Mb/s]"] = (
-            np.prod(self.oshape_packed) * 0.000001 / runtime
-        )
+        total_in = 0
+        for i in range(self.num_inputs):
+            total_in += np.prod(self.ishape_packed(i))
+        res["DRAM_in_bandwidth[Mb/s]"] = total_in * 0.000001 / runtime
+        total_out = 0
+        for o in range(self.num_outputs):
+            total_out += np.prod(self.oshape_packed(o))
+        res["DRAM_out_bandwidth[Mb/s]"] = total_out * 0.000001 / runtime
         for iwdma, iwbuf, iwdma_name in self.external_weights:
             res["DRAM_extw_%s_bandwidth[Mb/s]" % iwdma_name] = (
                 self.batch_size * np.prod(iwbuf.shape) * 0.000001 / runtime
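
The aggregated bandwidth figures above simply sum the packed tensor sizes over all input/output streams; a toy recomputation (made-up shapes and runtime):

```python
import numpy as np

# made-up packed shapes for two inputs and one output, plus a made-up runtime
ishapes_packed = [(4, 16, 16, 8), (4, 32)]
oshapes_packed = [(4, 64)]
runtime = 0.002  # seconds

total_in = sum(np.prod(s) for s in ishapes_packed)
total_out = sum(np.prod(s) for s in oshapes_packed)
print("DRAM_in_bandwidth[Mb/s]:", total_in * 0.000001 / runtime)
print("DRAM_out_bandwidth[Mb/s]:", total_out * 0.000001 / runtime)
```
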
@@ -417,11 +454,11 @@ class FINNExampleOverlay(Overlay):
             res["fclk[mhz]"] = self.clock_dict["clock0"]["frequency"]
         res["batch_size"] = self.batch_size
         # also benchmark driver-related overheads
-        input_npy = gen_finn_dt_tensor(self.idt, self.ishape_normal)
+        input_npy = gen_finn_dt_tensor(self.idt(), self.ishape_normal())
         # provide as int8/uint8 to support fast packing path where possible
-        if self.idt == DataType.UINT8:
+        if self.idt() == DataType["UINT8"]:
             input_npy = input_npy.astype(np.uint8)
-        elif self.idt == DataType.INT8:
+        elif self.idt() == DataType["INT8"]:
             input_npy = input_npy.astype(np.int8)
         start = time.time()
         ibuf_folded = self.fold_input(input_npy)
@@ -442,13 +479,13 @@ class FINNExampleOverlay(Overlay):
         res["copy_input_data_to_device[ms]"] = runtime * 1000
 
         start = time.time()
-        self.copy_output_data_from_device(self.obuf_packed)
+        self.copy_output_data_from_device(self.obuf_packed[0])
         end = time.time()
         runtime = end - start
         res["copy_output_data_from_device[ms]"] = runtime * 1000
 
         start = time.time()
-        obuf_folded = self.unpack_output(self.obuf_packed)
+        obuf_folded = self.unpack_output(self.obuf_packed[0])
         end = time.time()
         runtime = end - start
         res["unpack_output[ms]"] = runtime * 1000
diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py
index 4aa7d67aa162e91b878d387bee1457e4b477e635..1b29d4342c830ae896e580f602e810ee25ed234d 100644
--- a/src/finn/qnn-data/templates/driver/validate.py
+++ b/src/finn/qnn-data/templates/driver/validate.py
@@ -27,9 +27,9 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
+import numpy as np
 from driver import io_shape_dict
 from driver_base import FINNExampleOverlay
-import numpy as np
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -94,11 +94,11 @@ if __name__ == "__main__":
     test_labels = test_labels.reshape(n_batches, bsize)
 
     for i in range(n_batches):
-        ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
+        ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device[0].shape)
         exp = test_labels[i]
         driver.copy_input_data_to_device(ibuf_normal)
         driver.execute_on_buffers()
-        obuf_normal = np.empty_like(driver.obuf_packed_device)
+        obuf_normal = np.empty_like(driver.obuf_packed_device[0])
         driver.copy_output_data_from_device(obuf_normal)
         ret = np.bincount(obuf_normal.flatten() == exp.flatten())
         nok += ret[0]
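
Counting hits and misses via `np.bincount` works because the boolean comparison array is tallied as 0s and 1s (toy data):

```python
import numpy as np

pred = np.array([1, 2, 3, 3])
exp = np.array([1, 2, 2, 3])
ret = np.bincount(pred == exp)  # index 0 counts mismatches, index 1 matches
print(ret)  # [1 3] -> nok += 1, ok += 3
```
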
diff --git a/src/finn/transformation/fpgadataflow/annotate_cycles.py b/src/finn/transformation/fpgadataflow/annotate_cycles.py
index 2c547203df94308b929a7989f1a9102bd3002fed..5ab491dd1031cfec64308aee678edc9c94aa6da2 100644
--- a/src/finn/transformation/fpgadataflow/annotate_cycles.py
+++ b/src/finn/transformation/fpgadataflow/annotate_cycles.py
@@ -27,10 +27,10 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import finn.custom_op.registry as registry
-from finn.transformation.base import Transformation
-from finn.transformation.move_reshape import _is_fpgadataflow_node
 from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
+from finn.transformation.base import Transformation
+from finn.transformation.move_reshape import _is_fpgadataflow_node
 
 
 class AnnotateCycles(Transformation):
diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py
index 4e501510110ef724c6a67f6214654b5454b30a77..d9089cbeba6e0791f6d8375e28b2c2d99b506eda 100644
--- a/src/finn/transformation/fpgadataflow/annotate_resources.py
+++ b/src/finn/transformation/fpgadataflow/annotate_resources.py
@@ -27,13 +27,13 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import finn.custom_op.registry as registry
-from finn.transformation.base import Transformation
-from finn.transformation.move_reshape import _is_fpgadataflow_node
-from finn.analysis.fpgadataflow.res_estimation import res_estimation
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.analysis.fpgadataflow.post_synth_res import post_synth_res
+from finn.analysis.fpgadataflow.res_estimation import res_estimation
 from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
+from finn.transformation.base import Transformation
+from finn.transformation.move_reshape import _is_fpgadataflow_node
 
 
 class AnnotateResources(Transformation):
diff --git a/src/finn/transformation/fpgadataflow/cleanup.py b/src/finn/transformation/fpgadataflow/cleanup.py
index 5dbe5f0517d07bef07e5ecff6e4c7afff0293d86..f59f4bdeab72a5af9615ecf308306e4fb4b69fb5 100644
--- a/src/finn/transformation/fpgadataflow/cleanup.py
+++ b/src/finn/transformation/fpgadataflow/cleanup.py
@@ -30,8 +30,8 @@ import os
 import shutil
 
 import finn.custom_op.registry as registry
-from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.transformation.base import Transformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 class CleanUp(Transformation):
diff --git a/src/finn/transformation/fpgadataflow/compile_cppsim.py b/src/finn/transformation/fpgadataflow/compile_cppsim.py
index 6321b3335907948fb49de966c80eb21637e0a6ec..5f7c534b4561ffc0fac0c8c2b6160279f4e34fbc 100644
--- a/src/finn/transformation/fpgadataflow/compile_cppsim.py
+++ b/src/finn/transformation/fpgadataflow/compile_cppsim.py
@@ -27,8 +27,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import finn.custom_op.registry as registry
-from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.transformation.base import NodeLocalTransformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 class CompileCppSim(NodeLocalTransformation):
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 1f3d40e929e29d16790a491bbfd7a4a5033f866f..113ccb93b839d6a3bd67e3bf8f23e477e86822c6 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -27,22 +27,22 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
-from onnx import helper, TensorProto
 import numpy as np
 import warnings
+from onnx import TensorProto, helper
 
+import finn.core.data_layout as DataLayout
 from finn.core.datatype import DataType
-from finn.transformation.base import Transformation
 from finn.custom_op.registry import getCustomOp
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.general import SortGraph
-import finn.core.data_layout as DataLayout
-from finn.util.onnx import nchw_to_nhwc
-from finn.util.basic import get_by_name
+from finn.transformation.base import Transformation
 from finn.transformation.fpgadataflow.minimize_accumulator_width import (
     MinimizeAccumulatorWidth,
 )
+from finn.transformation.general import SortGraph
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import get_by_name
+from finn.util.onnx import nchw_to_nhwc
 
 
 class InferConvInpGen(Transformation):
@@ -61,7 +61,9 @@ class InferConvInpGen(Transformation):
                 i2c_out_shape = model.get_tensor_shape(i2c_output)
                 dt = model.get_tensor_datatype(i2c_input)
                 if not dt.is_integer():
-                    warnings.warn("Input is not int. Can't infer ConvInpGen")
+                    warnings.warn(
+                        "%s : Input is not int. Can't infer ConvInpGen." % n.name
+                    )
                     continue
                 i2c_inst = getCustomOp(n)
                 stride_h, stride_w = i2c_inst.get_nodeattr("stride")
@@ -89,9 +91,10 @@ class InferConvInpGen(Transformation):
                     # if padding enabled, ensure pad_val supported by DataType
                     # assert dt.allowed(pad_val),"""FMPadding_Batch DataType
                     # must support pad_val"""
-                    assert (
-                        pad_val == 0
-                    ), "FMPadding_Batch doesn't currently support pad_val!= 0"
+                    assert pad_val == 0, (
+                        "%s : FMPadding_Batch doesn't currently support pad_val!= 0"
+                        % n.name
+                    )
 
                     odim_padding_h = ifm_dim_h + pad_h
                     odim_padding_w = ifm_dim_w + pad_w
@@ -121,6 +124,7 @@ class InferConvInpGen(Transformation):
                         NumChannels=ifm_ch,
                         inputDataType=dt.name,
                         SIMD=ifm_ch,
+                        name="FMPadding_Batch_" + n.name,
                     )
                     graph.node.insert(node_ind, padding_node)
 
@@ -134,11 +138,15 @@ class InferConvInpGen(Transformation):
                 )
 
                 if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise:
-                    assert (
-                        is_square_image
-                    ), "DownSampler currently only supports square input images."
-                    assert is_equal_stride, """DownSampler currently only supports equal stride value
+                    assert is_square_image, (
+                        "%s : DownSampler currently only supports square input images."
+                        % n.name
+                    )
+                    assert is_equal_stride, (
+                        """%s : DownSampler currently only supports equal stride value
                         along different axes."""
+                        % n.name
+                    )
                     ConvInpGen_idim = ConvInpGen_idim_h
                     stride = stride_h
                     # create DownSampler node
@@ -153,6 +161,7 @@ class InferConvInpGen(Transformation):
                         SIMD=ifm_ch,
                         Stride=stride,
                         inputDataType=dt.name,
+                        name="DownSampler_" + n.name,
                     )
                     graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 else:
@@ -160,12 +169,16 @@ class InferConvInpGen(Transformation):
                     if (
                         is_square_image and is_square_kernel
                     ):  # square images and square kernels
-                        assert is_equal_stride, """Non-equal strides along different axes is not supported
+                        assert is_equal_stride, (
+                            """%s: Non-equal strides along different axes is not supported
                             for (non-)square convolutions"""
-                        assert (
-                            dilation_h == 1 and dilation_w == 1
-                        ), """Dilation value != 1 is not supported
+                            % n.name
+                        )
+                        assert dilation_h == 1 and dilation_w == 1, (
+                            """%s: Dilation value != 1 is not supported
                             for square convolutions"""
+                            % n.name
+                        )
                         ConvInpGen_node = helper.make_node(
                             "ConvolutionInputGenerator",
                             [ConvInpGen_input],
@@ -182,16 +195,19 @@ class InferConvInpGen(Transformation):
                             inputDataType=dt.name,
                             outputDataType=dt.name,
                             depthwise=depthwise,
+                            name="ConvolutionInputGenerator_" + n.name,
                         )
                     else:  # non-square images and/or kernels
-                        assert (
-                            is_1d_convolution
-                        ), "ConvultionInputGenerator1D works only for 1D convolutions"
+                        assert is_1d_convolution, (
+                            "%s: ConvolutionInputGenerator1D works only for 1D convs"
+                            % n.name
+                        )
                         if dilation_h > 1 or dilation_w > 1:
-                            assert (
-                                stride_h == 1 and stride_w == 1
-                            ), """Stride value of greater than 1 is not supported for convolutions
+                            assert stride_h == 1 and stride_w == 1, (
+                                """%s: Stride value of greater than 1 is not supported for convolutions
                                 with dilation value greater than 1"""
+                                % n.name
+                            )
                         ConvInpGen_node = helper.make_node(
                             "ConvolutionInputGenerator1D",
                             [ConvInpGen_input],
@@ -208,6 +224,7 @@ class InferConvInpGen(Transformation):
                             inputDataType=dt.name,
                             outputDataType=dt.name,
                             depthwise=depthwise,
+                            name="ConvolutionInputGenerator1D_" + n.name,
                         )
                     graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
@@ -219,6 +236,102 @@ class InferConvInpGen(Transformation):
         return (model, graph_modified)
 
 
+class InferUpsample(Transformation):
+    """
+    Convert Upsample and Resize nodes to UpsampleNearestNeighbour_Batch nodes.
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                # Extract mode and scales and input shape
+                mode = get_by_name(n.attribute, "mode").s.decode("ascii")
+                if n.op_type == "Upsample":
+                    scales = model.get_initializer(n.input[1])
+                else:
+                    scales = model.get_initializer(n.input[2])
+                in_shape = model.get_tensor_shape(n.input[0])
+
+                dt = model.get_tensor_datatype(n.input[0])
+                if not dt.is_integer():
+                    warnings.warn(
+                        "%s: Input not int. Can't infer UpsampleNearestNeighbour."
+                        % n.name
+                    )
+                    continue
+
+                if model.get_tensor_layout(n.input[0]) != DataLayout.NHWC:
+                    warnings.warn(
+                        "%s: Input not NHWC. Can't infer UpsampleNearestNeighbour."
+                        % n.name
+                    )
+                    continue
+
+                # Check that the parameters are okay
+                assert mode == "nearest", (
+                    "%s: Upsampling is only supported for the mode nearest." % n.name
+                )
+                assert len(in_shape) == 4, "Upsampling is only supported for 4D inputs."
+                assert scales.shape == (4,), (
+                    "%s: Upsampling is only supported for 4D scales." % n.name
+                )
+                assert (scales >= 1).all(), (
+                    n.name + ": Upsampling is only supported for scales "
+                    "which are larger or equal 1 in all dimensions."
+                )
+
+                # Assumes nhwc layout for scales and input
+                assert scales[1] == scales[2], (
+                    "%s: Upsampling is only supported for quadratic scales." % n.name
+                )
+                assert scales[0] == scales[3] == 1, (
+                    n.name + ": Upsampling is only supported for scales with "
+                    "the first and last dimensions being 1."
+                )
+                spatial_scale = scales[1]
+                assert spatial_scale == int(spatial_scale), (
+                    "%s: Upsampling is only supported for integer scales." % n.name
+                )
+
+                assert in_shape[1] == in_shape[2], (
+                    "%s: Upsampling is only supported for quadratic input shapes."
+                    % n.name
+                )
+
+                # Extract information for HLS node
+                IFMDim = in_shape[1]
+                OFMDim = int(round(in_shape[1] * spatial_scale))
+                NumChannels = in_shape[-1]
+                numInputVectors = in_shape[0]
+                inputDataType = dt.name
+
+                # Insert the HLSCustomOp node
+                Upsample_HLS_node = helper.make_node(
+                    "UpsampleNearestNeighbour_Batch",
+                    [n.input[0]],
+                    [n.output[0]],
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    OFMDim=OFMDim,
+                    IFMDim=IFMDim,
+                    NumChannels=NumChannels,
+                    inputDataType=inputDataType,
+                    numInputVectors=numInputVectors,
+                    name="UpsampleNearestNeighbour_Batch_" + n.name,
+                )
+
+                # Remove the old node
+                graph.node.insert(node_ind, Upsample_HLS_node)
+                # remove old nodes
+                graph.node.remove(n)
+                graph_modified = True
+        return (model, graph_modified)
+
+
 class InferStreamingMaxPool(Transformation):
     """Convert MaxPoolNHWC layers to StreamingMaxPool layers."""
 
@@ -235,25 +348,23 @@ class InferStreamingMaxPool(Transformation):
                 # mp_out_shape = model.get_tensor_shape(mp_output)
                 dt = model.get_tensor_datatype(mp_input)
                 mp_inst = getCustomOp(n)
-                # stride = mp_inst.get_nodeattr("strides")[0]
-                k = mp_inst.get_nodeattr("kernel_shape")[0]
-                # pad = mp_inst.get_nodeattr("pads")[0]
+                k_h, k_w = mp_inst.get_nodeattr("kernel_shape")
                 ifm_ch = mp_in_shape[-1]
-                ifm_dim = mp_in_shape[1]
-                # ofm_dim = mp_out_shape[1]
-                if ifm_dim % k == 0:
+                ifm_dim_h = mp_in_shape[1]
+                ifm_dim_w = mp_in_shape[2]
+                if ifm_dim_h % k_h == 0 and ifm_dim_w % k_w == 0:
                     # create equivalent StreamingMaxPool_Batch node
-                    # TODO support non-k strides
                     new_node = helper.make_node(
                         "StreamingMaxPool_Batch",
                         [mp_input],
                         [mp_output],
                         domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
-                        PoolDim=k,
+                        PoolDim=(k_h, k_w),
                         NumChannels=ifm_ch,
-                        ImgDim=ifm_dim,
+                        ImgDim=(ifm_dim_h, ifm_dim_w),
                         dataType=dt.name,
+                        name="StreamingMaxPool_Batch_" + n.name,
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old nodes
@@ -276,7 +387,7 @@ class InferPool_Batch(Transformation):
         graph_modified = False
         for n in graph.node:
             node_ind += 1
-            if n.op_type in ["MaxPool", "QuantAvgPool2d"]:
+            if n.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]:
                 # extract pool parameters
 
                 if n.op_type == "MaxPool":
@@ -289,6 +400,15 @@ class InferPool_Batch(Transformation):
                     k = inst.get_nodeattr("kernel")
                     stride = inst.get_nodeattr("stride")
                     dlayout = inst.get_nodeattr("data_layout")
+                elif n.op_type == "MaxPoolNHWC":
+                    inst = getCustomOp(n)
+                    k_shape = inst.get_nodeattr("kernel_shape")
+                    strides = inst.get_nodeattr("strides")
+                    assert k_shape[0] == k_shape[1]
+                    assert strides[0] == strides[1]
+                    k = k_shape[0]
+                    stride = strides[0]
+                    dlayout = "NHWC"
                 try:
                     pad = get_by_name(n.attribute, "pads").ints[-1]
                 except AttributeError:
@@ -305,7 +425,8 @@ class InferPool_Batch(Transformation):
                     continue
                 elif k == stride:
                     warnings.warn(
-                        """Inferring Pool_Batch node for k == stride.
+                        n.name
+                        + """: Inferring Pool_Batch node for k == stride.
                         This case can be optimized.
                         For example, for MaxPool run InferStreamingMaxPool before
                         InferPool_Batch """
@@ -366,7 +487,7 @@ class InferPool_Batch(Transformation):
                 accum_bits = 0
                 pool_size_param = k
                 pad_value = 0
-                if n.op_type == "MaxPool":
+                if n.op_type in ["MaxPool", "MaxPoolNHWC"]:
                     pool_fxn = "MaxPool"
                     odt = idt
                     pad_value = idt.min()
@@ -396,6 +517,7 @@ class InferPool_Batch(Transformation):
                     pad_value=pad_value,
                     depthwise=1,
                     input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
+                    name="Im2Col_" + n.name,
                 )
 
                 # Warning PE has to be equal to ifm_ch until Im2Col is replaced by
@@ -418,6 +540,7 @@ class InferPool_Batch(Transformation):
                     AccumBits=accum_bits,
                     Size=pool_size_param,
                     BatchSize=1,
+                    name="Pool_Batch_" + n.name,
                 )
 
                 if dlayout == "NCHW":
@@ -466,16 +589,18 @@ class InferBinaryStreamingFCLayer(Transformation):
                 mm_output = n.output[0]
                 mm_in_shape = model.get_tensor_shape(mm_input)
                 mm_out_shape = model.get_tensor_shape(mm_output)
-                assert (
-                    model.get_tensor_datatype(mm_input) == DataType.BINARY
-                ), """First
+                assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], (
+                    n.name
+                    + """: First
                 input for xnorpopcount is not set to FINN DataType BINARY."""
-                assert (
-                    model.get_tensor_datatype(mm_weight) == DataType.BINARY
-                ), """Second
+                )
+                assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], (
+                    n.name
+                    + """: Second
                 input (weights) for xnorpopcount is not set to FINN DataType BINARY."""
-                idt = DataType.BINARY
-                wdt = DataType.BINARY
+                )
+                idt = DataType["BINARY"]
+                wdt = DataType["BINARY"]
                 mm_output = n.output[0]
                 W = model.get_initializer(mm_weight)
                 # extract weight shape, note that ONNX and finn-hlslib
@@ -487,13 +612,12 @@ class InferBinaryStreamingFCLayer(Transformation):
                 # create node with no parallelization first
                 pe = 1
                 simd = 1
-                assert mh % pe == 0, "Requirement MH divisable by PE is violated."
-                assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
                 wmem = mw * mh // (pe * simd)
-                assert (
-                    mw * mh == wmem * pe * simd
-                ), """Requirement (MW * MH) divisiable by
+                assert mw * mh == wmem * pe * simd, (
+                    n.name
+                    + """: Requirement (MW * MH) divisiable by
                 (WMEM * PE * SIMD) is violated."""
+                )
                 # see if we have any following thresholds
                 consumer = model.find_consumer(mm_output)
                 if consumer is not None and consumer.op_type == "MultiThreshold":
@@ -503,10 +627,11 @@ class InferBinaryStreamingFCLayer(Transformation):
                     mt_out_shape = model.get_tensor_shape(mt_output)
                     mt_thres = consumer.input[1]
                     T = model.get_initializer(mt_thres)
-                    assert (
-                        T.shape[0] == 1 or T.shape[0] == mh
-                    ), """First dimension of
+                    assert T.shape[0] == 1 or T.shape[0] == mh, (
+                        consumer.name
+                        + """: First dimension of
                     thresholds neither 1 nor MH."""
+                    )
                     odt = model.get_tensor_datatype(mt_output)
                     if odt.bitwidth() == 1:
                         # covers both bipolar and binary
@@ -534,6 +659,7 @@ class InferBinaryStreamingFCLayer(Transformation):
                         noActivation=0,
                         numInputVectors=list(mm_in_shape[:-1]),
                         mem_mode=self.mem_mode,
+                        name=n.name,
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old nodes
@@ -564,6 +690,7 @@ class InferBinaryStreamingFCLayer(Transformation):
                         noActivation=1,
                         numInputVectors=list(mm_in_shape[:-1]),
                         mem_mode=self.mem_mode,
+                        name=n.name,
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old node
@@ -611,15 +738,12 @@ class InferQuantizedStreamingFCLayer(Transformation):
                     # create node with no parallelization first
                     pe = 1
                     simd = 1
-                    assert mh % pe == 0, "Requirement MH divisable by PE is violated."
-                    assert (
-                        mw % simd == 0
-                    ), "Requirement MW divisable by SIMD is violated."
                     wmem = mw * mh // (pe * simd)
-                    assert (
-                        mw * mh == wmem * pe * simd
-                    ), """Requirement (MW * MH) divisible by
+                    assert mw * mh == wmem * pe * simd, (
+                        n.name
+                        + """: Requirement (MW * MH) divisible by
                     (WMEM * PE * SIMD) is violated."""
+                    )
                     # see if we have any following thresholds
                     consumer = model.find_consumer(mm_output)
                     if consumer is not None and consumer.op_type == "MultiThreshold":
@@ -629,27 +753,30 @@ class InferQuantizedStreamingFCLayer(Transformation):
                         mt_out_shape = model.get_tensor_shape(mt_output)
                         mt_thres = consumer.input[1]
                         T = model.get_initializer(mt_thres)
-                        assert (
-                            T.shape[0] == 1 or T.shape[0] == mh
-                        ), """First dimension of
+                        assert T.shape[0] == 1 or T.shape[0] == mh, (
+                            consumer.name
+                            + """: First dimension of
                         thresholds neither 1 nor MH."""
+                        )
                         odt = model.get_tensor_datatype(mt_output)
                         scale = getCustomOp(consumer).get_nodeattr("out_scale")
                         actval = getCustomOp(consumer).get_nodeattr("out_bias")
-                        assert (
-                            int(actval) == actval
-                        ), "out_bias must be integer for HLS conversion."
+                        assert int(actval) == actval, (
+                            consumer.name
+                            + ": out_bias must be integer for HLS conversion."
+                        )
                         actval = int(actval)
-                        odt_is_bipolar = odt == DataType.BIPOLAR
+                        odt_is_bipolar = odt == DataType["BIPOLAR"]
                         bipolar_ok = (
                             odt_is_bipolar and (scale == 2.0) and (actval == -1)
                         )
-                        assert (
-                            scale == 1.0 or bipolar_ok
-                        ), "out_scale = 1.0 or bipolar output needed for conversion."
-                        assert (not odt.signed()) or (
-                            actval < 0
-                        ), "Signed output requres actval < 0"
+                        assert scale == 1.0 or bipolar_ok, (
+                            consumer.name
+                            + ": out_scale=1 or bipolar output needed for conversion."
+                        )
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requres actval < 0"
+                        )
                         model.set_tensor_shape(mm_input, mm_in_shape)
                         model.set_tensor_shape(mt_output, mt_out_shape)
                         if bipolar_ok:
@@ -675,6 +802,7 @@ class InferQuantizedStreamingFCLayer(Transformation):
                             noActivation=0,
                             numInputVectors=list(mm_in_shape[:-1]),
                             mem_mode=self.mem_mode,
+                            name="StreamingFCLayer_Batch_" + n.name,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old nodes
@@ -705,6 +833,7 @@ class InferQuantizedStreamingFCLayer(Transformation):
                             noActivation=1,
                             numInputVectors=list(mm_in_shape[:-1]),
                             mem_mode=self.mem_mode,
+                            name="StreamingFCLayer_Batch_" + n.name,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old node
@@ -739,7 +868,8 @@ class InferVVAU(Transformation):
                     k_h, k_w = sparsity["dw"]["kernel_shape"]
                 except KeyError:
                     raise Exception(
-                        """Sparsity doesn't indicate that MatMul
+                        n.name
+                        + """: sparsity annotation doesn't indicate that MatMul
                         belongs to a depthwise convolution."""
                     )
 
@@ -775,9 +905,6 @@ class InferVVAU(Transformation):
                     model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w))
                     # create node with pe=channels as default
                     pe = channels
-                    assert (
-                        channels % pe == 0
-                    ), "Requirement Channels divisable by PE is violated."
                     # see if we have any following thresholds
                     consumer = model.find_consumer(mm_output)
                     if consumer is not None and consumer.op_type == "MultiThreshold":
@@ -786,23 +913,26 @@ class InferVVAU(Transformation):
                         mt_out_shape = model.get_tensor_shape(mt_output)
                         mt_thres = consumer.input[1]
                         T = model.get_initializer(mt_thres)
-                        assert (
-                            T.shape[0] == 1 or T.shape[0] == channels
-                        ), """First dimension of
+                        assert T.shape[0] == 1 or T.shape[0] == channels, (
+                            consumer.name
+                            + """: First dimension of
                         thresholds neither 1 nor Channels."""
+                        )
                         odt = model.get_tensor_datatype(mt_output)
                         scale = getCustomOp(consumer).get_nodeattr("out_scale")
-                        assert (
-                            scale == 1.0
-                        ), "out_scale must be equal to 1.0 for HLS conversion."
+                        assert scale == 1.0, (
+                            consumer.name
+                            + ": out_scale must be equal to 1.0 for HLS conversion."
+                        )
                         actval = getCustomOp(consumer).get_nodeattr("out_bias")
-                        assert (
-                            int(actval) == actval
-                        ), "out_bias must be integer for HLS conversion."
+                        assert int(actval) == actval, (
+                            consumer.name
+                            + ": out_bias must be integer for HLS conversion."
+                        )
                         actval = int(actval)
-                        assert (not odt.signed()) or (
-                            actval < 0
-                        ), "Signed output requres actval < 0"
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requres actval < 0"
+                        )
                         model.set_tensor_shape(mm_input, mm_in_shape)
                         model.set_tensor_shape(mt_output, mt_out_shape)
                         # create and insert new Vector_Vector_Activate_Batch node
@@ -822,6 +952,7 @@ class InferVVAU(Transformation):
                             outputDataType=odt.name,
                             ActVal=actval,
                             noActivation=0,
+                            name="Vector_Vector_Activate_Batch_" + n.name,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old nodes
@@ -850,6 +981,7 @@ class InferVVAU(Transformation):
                             outputDataType=odt.name,
                             ActVal=0,
                             noActivation=1,
+                            name="Vector_Vector_Activate_Batch_" + n.name,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old node
@@ -907,21 +1039,22 @@ class InferThresholdingLayer(Transformation):
                 ifc = int(thl_in_shape[-1])
                 # create node with no parallelization first
                 pe = 1
-                assert ifc % pe == 0, "Requirement IFC divisable by PE is violated."
 
                 odt = model.get_tensor_datatype(thl_output)
                 scale = getCustomOp(node).get_nodeattr("out_scale")
-                assert (
-                    scale == 1.0
-                ), "MultiThreshold out_scale must be equal to 1.0 for HLS conversion."
+                assert scale == 1.0, (
+                    node.name
+                    + ": MultiThreshold out_scale must be 1 for HLS conversion."
+                )
                 actval = getCustomOp(node).get_nodeattr("out_bias")
-                assert (
-                    int(actval) == actval
-                ), "MultiThreshold out_bias must be integer for HLS conversion."
+                assert int(actval) == actval, (
+                    node.name
+                    + ": MultiThreshold out_bias must be integer for HLS conversion."
+                )
                 actval = int(actval)
-                assert (not odt.signed()) or (
-                    actval < 0
-                ), "Signed output requres actval < 0"
+                assert (not odt.signed()) or (actval < 0), (
+                    node.name + ": Signed output requres actval < 0"
+                )
                 # create and insert new Thresholding_Batch node
                 new_node = helper.make_node(
                     "Thresholding_Batch",
@@ -938,6 +1071,7 @@ class InferThresholdingLayer(Transformation):
                     numInputVectors=list(thl_in_shape[:-1]),
                     ActVal=actval,
                     mem_mode=self.mem_mode,
+                    name="Thresholding_Batch_" + node.name,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
@@ -1011,9 +1145,6 @@ class InferAddStreamsLayer(Transformation):
                 num_channels = int(in0_shape[-1])
                 # create node with no parallelization first
                 pe = 1
-                assert (
-                    num_channels % pe == 0
-                ), "Requirement Channels divisable by PE is violated."
 
                 # create and insert new StreamingFCLayer node
                 new_node = helper.make_node(
@@ -1026,6 +1157,7 @@ class InferAddStreamsLayer(Transformation):
                     PE=pe,
                     inputDataType=idt.name,
                     numInputVectors=in0_shape[:-1],
+                    name="AddStreams_Batch_" + node.name,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
@@ -1039,7 +1171,7 @@ class InferAddStreamsLayer(Transformation):
 
 
 class InferDuplicateStreamsLayer(Transformation):
-    """Insert a DuplicateStreams HLS layer for any tensor with fanout == 2 """
+    """Insert a DuplicateStreams HLS layer for any tensor with fanout == 2"""
 
     def apply(self, model):
         graph = model.graph
@@ -1072,9 +1204,6 @@ class InferDuplicateStreamsLayer(Transformation):
 
                 # create node with no parallelization first
                 pe = 1
-                assert (
-                    num_ch % pe == 0
-                ), "Requirement channels divisable by PE is violated."
 
                 dup_node = helper.make_node(
                     "DuplicateStreams_Batch",
@@ -1086,6 +1215,7 @@ class InferDuplicateStreamsLayer(Transformation):
                     PE=pe,
                     inputDataType=dt.name,
                     numInputVectors=vecs,
+                    name="DuplicateStreams_Batch_" + node.name,
                 )
 
                 graph.node.insert(node_ind, dup_node)
@@ -1121,10 +1251,10 @@ class InferChannelwiseLinearLayer(Transformation):
         for v in vals:
             assert int(v) == v, "Error float value"
 
-        for k in DataType.__members__:
+        for k in DataType.get_accumulator_dt_cands():
             dt = DataType[k]
 
-            if dt in [DataType.BIPOLAR, DataType.TERNARY, DataType.FLOAT32]:
+            if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]:
                 # not currently supported
                 continue
 
@@ -1140,9 +1270,9 @@ class InferChannelwiseLinearLayer(Transformation):
         )
 
         if (0 <= vals).all():
-            return DataType.UINT64
+            return DataType["UINT64"]
         else:
-            return DataType.INT64
+            return DataType["INT64"]
 
     def apply(self, model):
         graph = model.graph
@@ -1254,6 +1384,7 @@ class InferChannelwiseLinearLayer(Transformation):
                     paramDataType=pdt.name,
                     outputDataType=odt.name,
                     numInputVectors=list(ll_in_shape[:-1]),
+                    name="ChannelwiseOp_Batch_" + node.name,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
@@ -1296,9 +1427,6 @@ class InferLabelSelectLayer(Transformation):
                 num_inp_vecs = list(fc_in_shape[:-1])
                 # create node with no parallelization first
                 pe = 1
-                assert (
-                    num_labels % pe == 0
-                ), "Requirement Labels divisable by PE is violated."
 
                 k = model.get_initializer(k_input)[0]
 
@@ -1314,6 +1442,7 @@ class InferLabelSelectLayer(Transformation):
                     K=k,
                     inputDataType=idt.name,
                     numInputVectors=num_inp_vecs,
+                    name="LabelSelect_Batch_" + node.name,
                 )
                 graph.node.insert(node_ind, new_node)
                 # remove old node
@@ -1367,9 +1496,6 @@ class InferGlobalAccPoolLayer(Transformation):
                 vecs = in0_shape[:-1]
                 # create node with no parallelization first
                 pe = 1
-                assert (
-                    num_ch % pe == 0
-                ), "Requirement Labels divisable by PE is violated."
 
                 # create an additional tensor of the same shape and layout as result
                 out_shape = model.get_tensor_shape(result)
@@ -1390,6 +1516,7 @@ class InferGlobalAccPoolLayer(Transformation):
                     PE=pe,
                     inputDataType=idt.name,
                     numInputVectors=vecs,
+                    name="GlobalAccPool_Batch_" + node.name,
                 )
 
                 mul_value = helper.make_tensor_value_info(
@@ -1413,3 +1540,56 @@ class InferGlobalAccPoolLayer(Transformation):
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferLookupLayer(Transformation):
+    """Convert Gather nodes with constant op0 into Lookup HLS layers."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Gather":
+                emb_name = node.input[0]
+                embs = model.get_initializer(emb_name)
+                axis = get_by_name(node.attribute, "axis")
+                # skip conversion if input0 is not constant
+                if embs is None:
+                    continue
+                # skip conversion if axis != 0
+                if axis is not None and axis.i != 0:
+                    continue
+                ind_name = node.input[1]
+                ind_dtype = model.get_tensor_datatype(ind_name)
+                emb_dtype = model.get_tensor_datatype(emb_name)
+                # skip conversion if the indices are not unsigned integers
+                if (not ind_dtype.is_integer()) or ind_dtype.signed():
+                    continue
+                num_embs, emb_dim = embs.shape
+                out_name = node.output[0]
+                ishape = model.get_tensor_shape(node.input[1])
+                # create and insert new Lookup node
+                new_node = helper.make_node(
+                    "Lookup",
+                    [ind_name, emb_name],
+                    [out_name],
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    name="Lookup_" + node.name,
+                    NumEmbeddings=num_embs,
+                    EmbeddingDim=emb_dim,
+                    EmbeddingType=emb_dtype.name,
+                    InputType=ind_dtype.name,
+                    InputShape=list(ishape),
+                )
+                graph.node.insert(node_ind, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index 419a6d8c494651862f55e63e6829a61fe8040599..9b2577bc2b863e1075fc3252412ff1001b955cda 100644
--- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
+++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
@@ -26,11 +26,11 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import copy
-
-from onnx import helper
+from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
+from finn.transformation.create_generic_partitions import PartitionFromLambda
+from finn.transformation.fpgadataflow.externalize_params import ExternalizeParams
 from finn.util.basic import get_by_name, make_build_dir
 
 
@@ -41,120 +41,76 @@ class CreateDataflowPartition(Transformation):
     that indicates the filename for the second graph that only contains
     dataflow nodes. No action is taken if there are no dataflow nodes."""
 
-    def __init__(self):
+    def __init__(self, partition_model_dir=None):
         super().__init__()
+        if partition_model_dir is None:
+            self.partition_model_dir = make_build_dir("dataflow_partition_")
+        else:
+            self.partition_model_dir = partition_model_dir
 
     def apply(self, model):
-        target_partition_id = 0
-        # we currently assume that all dataflow nodes belonging to the same partition
-        # are connected to each other and there is a single input/output to/from each.
-        # NOTE: all dataflow nodes with no partition_id set are moved to partition 0
-        # TODO: check the assumption and/or improve this.
-        while True:
-            all_nodes = list(model.graph.node)
-            df_nodes = filter(
-                lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes
-            )
-            df_nodes = filter(
-                lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8")
-                == "fpgadataflow"
-                and (
-                    get_by_name(x.attribute, "partition_id") is None
-                    or get_by_name(x.attribute, "partition_id").i == target_partition_id
-                )
-                and x.op_type != "StreamingDataflowPartition",
-                df_nodes,
-            )
-            df_nodes = list(df_nodes)
-            non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes)
-            non_df_nodes = list(non_df_nodes)
-
-            if len(df_nodes) == 0:
-                # no changes if no dataflow nodes are present
-                break
-            else:
-                # partition the model into two models
-                df_model = copy.deepcopy(model)
-                non_df_model = model
-                # remove all non-dataflow nodes from the dataflow model
-                for node_to_remove in non_df_nodes:
-                    df_model.graph.node.remove(node_to_remove)
-                # identify the entry and exit points for the dataflow part
-                df_in = df_model.graph.node[0].input[0]
-                df_out = df_model.graph.node[-1].output[0]
-                df_in_vi = df_model.get_tensor_valueinfo(df_in)
-                df_out_vi = df_model.get_tensor_valueinfo(df_out)
-                # set df graph in/out to be df_in/df_out
-                df_model.graph.input.remove(df_model.graph.input[0])
-                df_model.graph.input.insert(0, df_in_vi)
-                df_model.graph.output.remove(df_model.graph.output[0])
-                df_model.graph.output.insert(0, df_out_vi)
-                # parse StreamingFCLayers looking for external weight memories
-                fc_extw_nodes = filter(
-                    lambda x: x.op_type == "StreamingFCLayer_Batch"
-                    and get_by_name(x.attribute, "mem_mode") is not None
-                    and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8")
-                    == "external",
-                    df_nodes,
-                )
-                fc_extw_nodes = list(fc_extw_nodes)
-                extra_df_inputs = []
+        def filter_fc_extw(x):
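+            # returns True only for IODMA nodes with burstMode "wrap", i.e.
+            # the weight-streaming DMAs created for external-weight layers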
+            if x.op_type == "IODMA":
+                burst_mode = get_by_name(x.attribute, "burstMode")
+                if burst_mode is not None:
+                    burst_mode = burst_mode.s.decode("UTF-8")
+                    return burst_mode == "wrap"
 
-                for i in range(len(fc_extw_nodes)):
-                    fc_weight_vi = df_model.get_tensor_valueinfo(
-                        fc_extw_nodes[i].input[1]
-                    )
-                    df_model.graph.input.insert(i + 1, fc_weight_vi)
-                    extra_df_inputs.append(fc_extw_nodes[i].input[1])
+        extw_dma_nodes = list(filter(filter_fc_extw, model.graph.node))
+        if len(extw_dma_nodes) > 0:
+            model = model.transform(ExternalizeParams())
 
-                # save model
-                df_model_dir = make_build_dir(
-                    "dataflow_partition" + str(target_partition_id) + "_"
-                )
-                df_model_filename = df_model_dir + "/df_model.onnx"
-                df_model.cleanup()
-                df_model.save(df_model_filename)
-                # remove all dataflow nodes from the non-dataflow model
-                # keep track of where the dataflow part starts
-                df_start_ind = all_nodes.index(df_nodes[0])
-
-                # get and check floorplan
-                inst = getCustomOp(df_nodes[0])
-                slr = inst.get_nodeattr("slr")
-                for node in df_nodes[1:]:
-                    inst = getCustomOp(node)
-                    assert slr == inst.get_nodeattr(
-                        "slr"
-                    ), """all nodes with
-                same partition_id must have the same slr id"""
-
-                # check that there is only one non-null mem_port per partition
-                nmemports = 0
-                mem_port = ""
-                for node in df_nodes:
-                    inst = getCustomOp(node)
-                    port = inst.get_nodeattr("mem_port")
-                    if port is not None and port != "":
-                        nmemports += 1
-                        mem_port = port
-                assert nmemports <= 1, """too many memory ports per partition"""
+        def assign_partition_id(node):
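+            # a negative return value tells PartitionFromLambda to leave the
+            # node outside of any partition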
+            if node.op_type in ["GenericPartition", "StreamingDataflowPartition"]:
+                return -1
+            else:
+                backend = get_by_name(node.attribute, "backend")
+                if backend is not None and backend.s.decode("UTF-8") == "fpgadataflow":
+                    assigned_partition = get_by_name(node.attribute, "partition_id")
+                    if assigned_partition is not None:
+                        return assigned_partition.i
+                    else:
+                        return 0
+                else:
+                    return -1
 
-                for node_to_remove in df_nodes:
-                    non_df_model.graph.node.remove(node_to_remove)
-                # create StreamingDataflow node with df_in/df_out io
-                df_node = helper.make_node(
-                    "StreamingDataflowPartition",
-                    [df_in] + extra_df_inputs,
-                    [df_out],
-                    # use the model attribute to mark the df model
-                    model=df_model_filename,
-                    domain="finn.custom_op.fpgadataflow",
-                    partition_id=target_partition_id,
-                    slr=slr,
-                    mem_port=mem_port,
-                )
-                non_df_model.graph.node.insert(df_start_ind, df_node)
-                model = non_df_model
-                target_partition_id += 1
+        # first, use the generic partitioning functionality to split up the graph
+        parent_model = model.transform(
+            PartitionFromLambda(
+                partitioning=assign_partition_id, partition_dir=self.partition_model_dir
+            )
+        )
+        # change node types to StreamingDataflowPartition
+        p_nodes = parent_model.get_nodes_by_op_type("GenericPartition")
+        for partition_ind, p_node in enumerate(p_nodes):
+            # go into partition to extract some info
+            p_node_inst = getCustomOp(p_node)
+            node_model_filename = p_node_inst.get_nodeattr("model")
+            p_model = ModelWrapper(node_model_filename)
+            # check floorplan (SLR assignment per node)
+            inst = getCustomOp(p_model.graph.node[0])
+            slr = inst.get_nodeattr("slr")
+            for node in p_model.graph.node:
+                inst = getCustomOp(node)
+                assert slr == inst.get_nodeattr(
+                    "slr"
+                ), """All nodes within the same partition must have the same SLR id"""
+            # check that there is only one non-null mem_port per partition
+            nmemports = 0
+            mem_port = ""
+            for node in p_model.graph.node:
+                inst = getCustomOp(node)
+                port = inst.get_nodeattr("mem_port")
+                if port is not None and port != "":
+                    nmemports += 1
+                    mem_port = port
+            assert nmemports <= 1, """Too many memory ports per partition"""
+            # done, change node type and add info in parent graph
+            p_node.op_type = "StreamingDataflowPartition"
+            p_node.domain = "finn.custom_op.fpgadataflow"
+            new_p_node_inst = getCustomOp(p_node)
+            new_p_node_inst.set_nodeattr("partition_id", partition_ind)
+            new_p_node_inst.set_nodeattr("slr", slr)
+            new_p_node_inst.set_nodeattr("mem_port", mem_port)
 
-        return (model, False)
+        return (parent_model, False)
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 738f2000a1929024d3808dd7bad0267338b51659..327c7867fe30485f6df51d5e98dcbbaceea04cd8 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -26,19 +26,19 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import json
+import multiprocessing as mp
 import os
-import warnings
 import subprocess
-import json
+import warnings
 
-from finn.transformation.base import Transformation
-from finn.util.basic import make_build_dir, get_num_default_workers
-from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.custom_op.registry import getCustomOp
-import multiprocessing as mp
+from finn.transformation.base import Transformation
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.util.basic import get_num_default_workers, make_build_dir
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 def is_external_input(model, node, i):
@@ -86,11 +86,6 @@ class CreateStitchedIP(Transformation):
         self.clk_ns = clk_ns
         self.ip_name = ip_name
         self.vitis = vitis
-        if float(clk_ns) not in [5.0, 10.0, 20.0]:
-            warnings.warn(
-                """The chosen frequency may lead to failure due to clock divider
-                constraints."""
-            )
         self.has_aximm = False
         self.has_m_axis = False
         self.m_axis_idx = 0
@@ -221,6 +216,13 @@ class CreateStitchedIP(Transformation):
         ip_dirs = ["list"]
         # add RTL streamer IP
         ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
+        if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]:
+            warnings.warn(
+                """First node is not StreamingFIFO or IODMA.
+                You may experience incorrect stitched-IP rtlsim or hardware
+                behavior. It is strongly recommended to insert FIFOs prior to
+                calling CreateStitchedIP."""
+            )
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
             assert is_fpgadataflow_node(
@@ -330,12 +332,13 @@ class CreateStitchedIP(Transformation):
         )
         tcl.append("set_property core_revision 2 [ipx::find_open_core %s]" % block_vlnv)
         tcl.append("ipx::create_xgui_files [ipx::find_open_core %s]" % block_vlnv)
+        # mark bus interface params as user-resolvable to avoid FREQ_HZ mismatches
+        tcl.append(
+            "set_property value_resolve_type user [ipx::get_bus_parameters "
+            "-of [ipx::get_bus_interfaces -of [ipx::current_core ]]]"
+        )
         # if targeting Vitis, add some properties to the IP
         if self.vitis:
-            tcl.append(
-                "ipx::remove_bus_parameter FREQ_HZ "
-                "[ipx::get_bus_interfaces CLK.AP_CLK -of_objects [ipx::current_core]]"
-            )
             # replace source code with dcp
             tcl.append(
                 "set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv
diff --git a/src/finn/transformation/fpgadataflow/externalize_params.py b/src/finn/transformation/fpgadataflow/externalize_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcb66a8538fdff46214c23491f48a59459625082
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/externalize_params.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+from finn.transformation.base import Transformation
+from finn.util.basic import get_by_name
+
+
+class ExternalizeParams(Transformation):
+    """Create top-level graph inputs for IODMAs serving layers where weights are
+    marked as external using mem_mode="external"."""
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        graph_modified = False
+
+        def filter_fc_extw(x):
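+            # returns True only for IODMA nodes with burstMode "wrap", i.e.
+            # the weight-streaming DMAs created for external-weight layers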
+            if x.op_type == "IODMA":
+                burst_mode = get_by_name(x.attribute, "burstMode")
+                if burst_mode is not None:
+                    burst_mode = burst_mode.s.decode("UTF-8")
+                    return burst_mode == "wrap"
+
+        dma_extw_nodes = list(filter(filter_fc_extw, model.graph.node))
+
+        for dma_extw in dma_extw_nodes:
+            extw_tensor_name = dma_extw.input[0]
+            extw_tensor_name_out = dma_extw.output[0]
+            if extw_tensor_name in [x.name for x in model.graph.input]:
+                continue
+            else:
+                extw_vi = model.get_tensor_valueinfo(extw_tensor_name)
+                assert extw_vi is not None
+                model.graph.value_info.remove(extw_vi)
+                model.graph.input.append(extw_vi)
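+                # the tensor's initializer stays in place, so the weights can
+                # still be exported later as runtime weight files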
+                iodma_init = model.get_initializer(extw_vi.name)
+                assert iodma_init is not None
+                # remove output-side initializer to get correct dataflow partitioning
+                model.graph.initializer.remove(
+                    [
+                        x
+                        for x in model.graph.initializer
+                        if x.name == extw_tensor_name_out
+                    ][0]
+                )
+                graph_modified = True
+
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py
index 3434183b1480eb38ee267328042aec33e87e0446..2bda7883130d0863b7f67943d19caa00b7290de5 100644
--- a/src/finn/transformation/fpgadataflow/floorplan.py
+++ b/src/finn/transformation/fpgadataflow/floorplan.py
@@ -26,14 +26,14 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import json
+import warnings
+
+from finn.analysis.fpgadataflow.floorplan_params import floorplan_params
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
-from finn.util.basic import get_by_name
-from finn.analysis.fpgadataflow.floorplan_params import floorplan_params
-from finn.util.basic import make_build_dir
 from finn.transformation.general import ApplyConfig
-import warnings
-import json
+from finn.util.basic import get_by_name, make_build_dir
 
 
 class Floorplan(Transformation):
@@ -70,7 +70,7 @@ class Floorplan(Transformation):
 
         try:
             default_slr = self.user_floorplan["Defaults"]["slr"][0]
-        except:
+        except Exception:
             default_slr = -1
 
         # perform DWC and FIFO specific adjustments
@@ -107,7 +107,8 @@ class Floorplan(Transformation):
             warnings.warn(
                 str(unassigned_nodes)
                 + " nodes have no entry in the provided floorplan,"
-                + " SLR was set to " + str(default_slr)
+                + " SLR was set to "
+                + str(default_slr)
             )
 
         # partition id generation
diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py
index bbd012a715e49b61c19daad65f8de889112f92a7..2a7d9e9066836ea0d4af004f01d88953e4adaeb7 100644
--- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py
+++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py
@@ -27,10 +27,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
+import warnings
+
 import finn.custom_op.registry as registry
-from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.transformation.base import NodeLocalTransformation
-import warnings
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 class HLSSynthIP(NodeLocalTransformation):
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index c8df80659d30e1855fc658bad83c3fe9bccb9bf9..58efe65eb5f9d96d74cdf40672703fabe76afb0d 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -1,10 +1,10 @@
+import warnings
 from onnx import TensorProto
 from onnx import helper as oh
 
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.util.fpgadataflow import is_fpgadataflow_node
-import warnings
 
 
 def _is_dwc_node(node):
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 1ce936cd79c2257897e74430d00e5082c51c9320..c8bb716922823876f5f16ffe62f17c425d49aa74 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -1,11 +1,11 @@
+import numpy as np
+import warnings
 from onnx import TensorProto
 from onnx import helper as oh
 
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.util.fpgadataflow import is_fpgadataflow_node
-import warnings
-import numpy as np
 
 
 def _is_fifo_node(node):
@@ -180,49 +180,50 @@ class InsertFIFO(Transformation):
                 n.input[0] = fifo_output_tensor.name
 
             # insert FIFO as last node, except when last node is DMA
-            if (
-                graph.node[-1].op_type != "StreamingFIFO"
-                and graph.node[-1].op_type != "IODMA"
-            ):
-                n = graph.node[-1]
-                assert (
-                    n.op_type != "TLastMarker"
-                ), """Insert tlast marker should be done
-                    after inserting the FIFOs"""
-                graph_out_name = graph.output[0].name
-                n0 = getCustomOp(n)
-                # determine fifo node attributes
-                fld_shape = n0.get_folded_output_shape()
-                dtype = n0.get_output_datatype()
-                fifo_depth = n0.get_nodeattr("outFIFODepth")
-
-                if fifo_depth <= 2:
-                    warnings.warn("Overriding output FIFO depth to 32")
-                    fifo_depth = 32
-
-                # create fifo node
-                fifo_input_tensor = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    n0.get_normal_output_shape(),
-                )
-                graph.value_info.append(fifo_input_tensor)
-                model.set_tensor_datatype(fifo_input_tensor.name, dtype)
-
-                fifo_node = oh.make_node(
-                    "StreamingFIFO",
-                    [fifo_input_tensor.name],
-                    [graph_out_name],
-                    domain="finn.custom_op.fpgadataflow",
-                    backend="fpgadataflow",
-                    depth=fifo_depth,
-                    folded_shape=fld_shape,
-                    dataType=str(dtype.name),
-                )
-                # insert fifo
-                graph.node.append(fifo_node)
-
-                # set fifo output tensor as new input tensor of second node
-                n.output[0] = fifo_input_tensor.name
+            graph_out_names = [x.name for x in model.graph.output]
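+            # handle each graph output separately to support multi-output graphs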
+            for graph_out_name in graph_out_names:
+                final_node = model.find_producer(graph_out_name)
+                if (
+                    final_node.op_type != "StreamingFIFO"
+                    and final_node.op_type != "IODMA"
+                ):
+                    assert (
+                        final_node.op_type != "TLastMarker"
+                    ), """InsertTLastMarker should be applied after
+                        inserting the FIFOs"""
+                    n0 = getCustomOp(final_node)
+                    # determine fifo node attributes
+                    fld_shape = n0.get_folded_output_shape()
+                    dtype = n0.get_output_datatype()
+                    fifo_depth = n0.get_nodeattr("outFIFODepth")
+
+                    if fifo_depth <= 2:
+                        warnings.warn("Overriding output FIFO depth to 32")
+                        fifo_depth = 32
+
+                    # create fifo node
+                    fifo_input_tensor = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        n0.get_normal_output_shape(),
+                    )
+                    graph.value_info.append(fifo_input_tensor)
+                    model.set_tensor_datatype(fifo_input_tensor.name, dtype)
+
+                    fifo_node = oh.make_node(
+                        "StreamingFIFO",
+                        [fifo_input_tensor.name],
+                        [graph_out_name],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        depth=fifo_depth,
+                        folded_shape=fld_shape,
+                        dataType=str(dtype.name),
+                    )
+                    # insert fifo
+                    graph.node.append(fifo_node)
+
+                    # reroute the final node's output into the FIFO's input tensor
+                    final_node.output[0] = fifo_input_tensor.name
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 27055a4fd29dba3849c0e4a889f27802f8c36081..d0ef270816c362af730a75b59be71d0457e0b8e2 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -26,15 +26,15 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import math
+import numpy as np
 from onnx import TensorProto
 from onnx import helper as oh
 
-from finn.util.basic import get_by_name
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.transformation.general import SortGraph
-import math
-import numpy as np
+from finn.util.basic import get_by_name
 
 
 class InsertIODMA(Transformation):
@@ -87,6 +87,7 @@ class InsertIODMA(Transformation):
         return reshaped_w
 
     def apply(self, model):
+        modified = False
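+        # tracks whether any IODMA was inserted; SortGraph runs only if so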
         # only makes sense for a pure fpgadataflow graph -- so we check!
         all_nodes = list(model.graph.node)
         assert all(
@@ -102,59 +103,14 @@ class InsertIODMA(Transformation):
                 all_nodes,
             )
         )
-        graph_in_name = model.graph.input[0].name
-        first_node = model.find_consumer(graph_in_name)
-        graph_out_name = model.graph.output[0].name
-        final_node = model.find_producer(graph_out_name)
-        if (
-            final_node.op_type == "IODMA"
-            and first_node.op_type == "IODMA"
-            and len(fc_extw_nodes) == 0
-        ):
-            # TODO maybe check the correctness of properties
-            return (model, False)
-        else:
-            if final_node.op_type != "IODMA":
-                out_shape = model.get_tensor_shape(graph_out_name)
-                out_dtype = model.get_tensor_datatype(graph_out_name)
-                final_node_inst = getCustomOp(final_node)
-                out_folded_shape = final_node_inst.get_folded_output_shape()
-                # take advantage of AXI stream width padding for DMA alignment
-                # (AXI streams are always padded to 8 bits)
-                # this is the width of stream input to DMA
-                padded_outstream_width = final_node_inst.get_outstream_width_padded()
-                padded_outstream_bytes = padded_outstream_width // 8
-                # determine the feasible interface width
-                transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
-                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
-                assert (
-                    intfwidth % 8 == 0
-                ), "No feasible interface width for transfer size"
-                # make new buffer
-                final_node_out = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
-                )
-                model.graph.value_info.append(final_node_out)
-                model.set_tensor_datatype(final_node_out.name, out_dtype)
-                # reroute final node output to final_node_out_name
-                final_node.output[0] = final_node_out.name
-                # FIXME: currently always using 8-bit dtypes to work around the
-                # padding problems for i/o DMA
-                dma_node = oh.make_node(
-                    "IODMA",
-                    [final_node_out.name],
-                    [graph_out_name],
-                    numInputVectors=out_folded_shape[:-1],
-                    NumChannels=padded_outstream_bytes,
-                    dataType="UINT8",
-                    intfWidth=intfwidth,
-                    streamWidth=padded_outstream_width,
-                    direction="out",
-                    domain="finn.custom_op.fpgadataflow",
-                    backend="fpgadataflow",
-                )
-                model.graph.node.append(dma_node)
-            if first_node.op_type != "IODMA":
+        # insert IODMAs for graph inputs
+        graph_in_names = [x.name for x in model.graph.input]
+        for graph_in_name in graph_in_names:
+            first_node = model.find_consumer(graph_in_name)
+            if first_node.op_type == "IODMA":
+                # IODMA already inserted for this input
+                continue
+            else:
                 in_shape = model.get_tensor_shape(graph_in_name)
                 in_dtype = model.get_tensor_datatype(graph_in_name)
                 first_node_inst = getCustomOp(first_node)
@@ -194,47 +150,96 @@ class InsertIODMA(Transformation):
                     backend="fpgadataflow",
                 )
                 model.graph.node.insert(0, dma_node)
-            for fc_node in fc_extw_nodes:
-                fc_inst = getCustomOp(fc_node)
-                fc_w_name = fc_node.input[1]
-                w_shape = model.get_tensor_shape(fc_w_name)
-                w_dtype = model.get_tensor_datatype(fc_w_name)
+                modified = True
+        # insert IODMAs for graph outputs
+        graph_out_names = [x.name for x in model.graph.output]
+        for graph_out_name in graph_out_names:
+            final_node = model.find_producer(graph_out_name)
+            if final_node.op_type == "IODMA":
+                continue
+            else:
+                out_shape = model.get_tensor_shape(graph_out_name)
+                out_dtype = model.get_tensor_datatype(graph_out_name)
+                final_node_inst = getCustomOp(final_node)
+                out_folded_shape = final_node_inst.get_folded_output_shape()
+                # take advantage of AXI stream width padding for DMA alignment
+                # (AXI streams are always padded to 8 bits)
+                # this is the width of stream input to DMA
+                padded_outstream_width = final_node_inst.get_outstream_width_padded()
+                padded_outstream_bytes = padded_outstream_width // 8
                 # determine the feasible interface width
-                transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
+                transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
                 intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
                 assert (
                     intfwidth % 8 == 0
                 ), "No feasible interface width for transfer size"
-                # calculate width of stream output from DMA
-                pe = get_by_name(fc_node.attribute, "PE").i
-                simd = get_by_name(fc_node.attribute, "SIMD").i
-                streamWidth = fc_inst.get_weightstream_width_padded()
                 # make new buffer
-                W = model.get_initializer(fc_w_name)
-                iodma_mem = self.get_mem_init(W, pe, simd)
-                model.set_initializer(fc_w_name, iodma_mem)
-
-                fc_node_in = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
+                final_node_out = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
                 )
-                model.graph.value_info.append(fc_node_in)
-                model.set_tensor_datatype(fc_node_in.name, w_dtype)
-                model.set_initializer(fc_node_in.name, W)
+                model.graph.value_info.append(final_node_out)
+                model.set_tensor_datatype(final_node_out.name, out_dtype)
+                # reroute final node output to final_node_out_name
+                final_node.output[0] = final_node_out.name
+                # FIXME: currently always using 8-bit dtypes to work around the
+                # padding problems for i/o DMA
                 dma_node = oh.make_node(
                     "IODMA",
-                    [fc_w_name],
-                    [fc_node_in.name],
-                    numInputVectors=[iodma_mem.shape[0]],
-                    NumChannels=pe * simd,
-                    dataType=str(w_dtype.name),
+                    [final_node_out.name],
+                    [graph_out_name],
+                    numInputVectors=out_folded_shape[:-1],
+                    NumChannels=padded_outstream_bytes,
+                    dataType="UINT8",
                     intfWidth=intfwidth,
-                    streamWidth=streamWidth,
-                    direction="in",
-                    burstMode="wrap",
+                    streamWidth=padded_outstream_width,
+                    direction="out",
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                 )
-                fc_node.input[1] = fc_node_in.name
-                model.graph.node.insert(0, dma_node)
+                model.graph.node.append(dma_node)
+                modified = True
+
+        for fc_node in fc_extw_nodes:
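+            # stream weights in through a dedicated IODMA; wrap burst mode
+            # marks this DMA as a repeating (weight) transfer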
+            fc_inst = getCustomOp(fc_node)
+            fc_w_name = fc_node.input[1]
+            w_shape = model.get_tensor_shape(fc_w_name)
+            w_dtype = model.get_tensor_datatype(fc_w_name)
+            # determine the feasible interface width
+            transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
+            intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
+            assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
+            # calculate width of stream output from DMA
+            pe = get_by_name(fc_node.attribute, "PE").i
+            simd = get_by_name(fc_node.attribute, "SIMD").i
+            streamWidth = fc_inst.get_weightstream_width_padded()
+            # make new buffer
+            W = model.get_initializer(fc_w_name)
+            iodma_mem = self.get_mem_init(W, pe, simd)
+            model.set_initializer(fc_w_name, iodma_mem)
+
+            fc_node_in = oh.make_tensor_value_info(
+                model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
+            )
+            model.graph.value_info.append(fc_node_in)
+            model.set_tensor_datatype(fc_node_in.name, w_dtype)
+            model.set_initializer(fc_node_in.name, W)
+            dma_node = oh.make_node(
+                "IODMA",
+                [fc_w_name],
+                [fc_node_in.name],
+                numInputVectors=[iodma_mem.shape[0]],
+                NumChannels=pe * simd,
+                dataType=str(w_dtype.name),
+                intfWidth=intfwidth,
+                streamWidth=streamWidth,
+                direction="in",
+                burstMode="wrap",
+                domain="finn.custom_op.fpgadataflow",
+                backend="fpgadataflow",
+            )
+            fc_node.input[1] = fc_node_in.name
+            model.graph.node.insert(0, dma_node)
+            modified = True
+        if modified:
             model = model.transform(SortGraph())
-            return (model, True)
+        return (model, modified)
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 3ce9824b14a54f502c90650e7b3b75e9cdaab77f..34cb61346dcd5bcd6f41a4272748764cf385a524 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import numpy as np
 from onnx import TensorProto
 from onnx import helper as oh
 
@@ -33,8 +34,6 @@ from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.util.basic import get_by_name
 
-import numpy as np
-
 
 class InsertTLastMarker(Transformation):
     """Ensure that the graph is started/terminated with a TLastMarker node, inserting
diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py
index 84d3f4cd94c9e0870b90941f7c46447da4cee631..d43d81716ac7a8b097fc7ec9e38bf5bcb954c7fb 100644
--- a/src/finn/transformation/fpgadataflow/make_deployment.py
+++ b/src/finn/transformation/fpgadataflow/make_deployment.py
@@ -31,9 +31,9 @@ import subprocess
 from distutils.dir_util import copy_tree
 from shutil import copy
 
+import finn.transformation.fpgadataflow.templates as templates
 from finn.transformation.base import Transformation
 from finn.util.basic import make_build_dir
-import finn.transformation.fpgadataflow.templates as templates
 
 
 class DeployToPYNQ(Transformation):
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 6ab12548abbcbe00496101bd146b2c9b873204c8..2c3bd7ee59e23566bbd0acf2241ca67ed2beb3ea 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -27,24 +27,29 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
-import shutil
-from finn.transformation.base import Transformation
-from finn.util.basic import gen_finn_dt_tensor, make_build_dir
-import finn.util.data_packing as dpk
-import finn.core.datatype as dtp
-from finn.custom_op.registry import getCustomOp
-import os
-import warnings
 import pkg_resources as pk
-from . import template_driver
-from finn.core.modelwrapper import ModelWrapper
+
 import numpy as np
+import os
+import shutil
+import warnings
 
+import finn.core.datatype as dtp
+import finn.util.data_packing as dpk
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.base import Transformation
+from finn.util.basic import (
+    gen_finn_dt_tensor,
+    make_build_dir,
+    roundup_to_integer_multiple,
+)
 from finn.util.data_packing import (
-    pack_innermost_dim_as_hex_string,
     hexstring2npbytearray,
+    pack_innermost_dim_as_hex_string,
 )
-from finn.util.basic import roundup_to_integer_multiple
+
+from . import template_driver
 
 
 def to_external_tensor(init, w_dtype):
@@ -85,6 +90,7 @@ class MakePYNQDriver(Transformation):
         self.platform = platform
 
     def apply(self, model):
+
         # create a temporary folder for the generated driver
         pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
         model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
@@ -95,59 +101,100 @@ class MakePYNQDriver(Transformation):
         )
         driver_base_py = pynq_driver_dir + "/driver_base.py"
         shutil.copy(driver_base_template, driver_base_py)
-
         # extract input-output shapes from the graph
         # TODO convert this to an analysis pass?
-        i_tensor_name = model.graph.input[0].name
-        o_tensor_name = model.graph.output[0].name
-        i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
-        o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
-        i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
-        o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
-
-        first_node = model.find_consumer(i_tensor_name)
-        last_node = model.find_producer(o_tensor_name)
-        if first_node.op_type == "StreamingDataflowPartition":
-            # IODMAs and dataflow partitions have already been created
-            # extract folded i/o shapes from IODMA consumer/producer
-            first_df_model = ModelWrapper(getCustomOp(first_node).get_nodeattr("model"))
+        idt = []
+        idma_names = []
+        ishape_normal = []
+        ishape_folded = []
+        ishape_packed = []
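+        # collect per-input metadata in parallel lists; these are substituted
+        # verbatim into the driver template below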
+        for graph_in in model.graph.input:
+            i_tensor_name = graph_in.name
+            # get inp tensor properties
+            i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
+            i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
+            # go down into the dataflow partition to get folded shape info etc.
+            # TODO consider setting these as attributes during dataflow partitioning
+            i_consumer = model.find_consumer(i_tensor_name)
+            assert (
+                i_consumer.op_type == "StreamingDataflowPartition"
+            ), """
+                Ensure CreateDataflowPartition is called before driver creation."""
+            first_df_model = ModelWrapper(getCustomOp(i_consumer).get_nodeattr("model"))
             assert (
                 first_df_model.graph.node[0].op_type == "IODMA"
             ), "First partition must hold input IODMA"
-            successors = model.find_direct_successors(first_node)
+            successors = model.find_direct_successors(i_consumer)
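+            # find which input of the successor partition this IODMA feeds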
+            successor_input_num = list(successors[0].input).index(i_consumer.output[0])
             successor_sdp = getCustomOp(successors[0])
             successor_df_model = ModelWrapper(successor_sdp.get_nodeattr("model"))
             first_node = successor_df_model.find_consumer(
-                successor_df_model.graph.input[0].name
+                successor_df_model.graph.input[successor_input_num].name
             )
-
-            last_df_model = ModelWrapper(getCustomOp(last_node).get_nodeattr("model"))
+            i_tensor_shape_folded = tuple(
+                getCustomOp(first_node).get_folded_input_shape()
+            )
+            # generate dummy folded i/o tensors and their packed versions
+            i_tensor_dummy_folded = gen_finn_dt_tensor(
+                i_tensor_dt, i_tensor_shape_folded
+            )
+            i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
+                i_tensor_dummy_folded, i_tensor_dt
+            )
+            i_tensor_shape_packed = i_tensor_dummy_packed.shape
+            # append all input tensor info to relevant lists
+            idt.append("DataType['%s']" % i_tensor_dt.name)
+            ishape_normal.append(i_tensor_shape_normal)
+            ishape_folded.append(i_tensor_shape_folded)
+            ishape_packed.append(i_tensor_shape_packed)
+            idma_names.append(getCustomOp(i_consumer).get_nodeattr("instance_name"))
+
+        odt = []
+        odma_names = []
+        oshape_normal = []
+        oshape_folded = []
+        oshape_packed = []
+        for graph_out in model.graph.output:
+            o_tensor_name = graph_out.name
+            # get out tensor properties
+            o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
+            o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
+            # go down into the IODMA partition to get folded shape info etc.
+            # TODO consider setting these as attributes during dataflow partitioning
+            o_producer = model.find_producer(o_tensor_name)
+            assert (
+                o_producer.op_type == "StreamingDataflowPartition"
+            ), """
+                Ensure CreateDataflowPartition is called before driver creation."""
+            df_model = ModelWrapper(getCustomOp(o_producer).get_nodeattr("model"))
             assert (
-                last_df_model.graph.node[0].op_type == "IODMA"
-            ), "Last partition must hold output IODMA"
-            predecessors = model.find_direct_predecessors(last_node)
+                df_model.graph.node[-1].op_type == "IODMA"
+            ), "Partition must hold output IODMA"
+            predecessors = model.find_direct_predecessors(o_producer)
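+            # find which output of the predecessor partition feeds this IODMA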
+            predecessor_output_num = list(predecessors[0].output).index(
+                o_producer.input[0]
+            )
             predecessor_sdp = getCustomOp(predecessors[0])
             predecessor_df_model = ModelWrapper(predecessor_sdp.get_nodeattr("model"))
             last_node = predecessor_df_model.find_producer(
-                predecessor_df_model.graph.output[0].name
+                predecessor_df_model.graph.output[predecessor_output_num].name
             )
-
-        # else: transformation called before IODMA/SDP creation (legacy flow)
-        # can access folded i/o shapes directly
-        i_tensor_shape_folded = tuple(getCustomOp(first_node).get_folded_input_shape())
-        o_tensor_shape_folded = tuple(getCustomOp(last_node).get_folded_output_shape())
-
-        # generate dummy folded i/o tensors and their packed versions
-        i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded)
-        o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded)
-        i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
-            i_tensor_dummy_folded, i_tensor_dt
-        )
-        o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
-            o_tensor_dummy_folded, o_tensor_dt
-        )
-        i_tensor_shape_packed = i_tensor_dummy_packed.shape
-        o_tensor_shape_packed = o_tensor_dummy_packed.shape
+            o_tensor_shape_folded = tuple(
+                getCustomOp(last_node).get_folded_output_shape()
+            )
+            o_tensor_dummy_folded = gen_finn_dt_tensor(
+                o_tensor_dt, o_tensor_shape_folded
+            )
+            o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
+                o_tensor_dummy_folded, o_tensor_dt
+            )
+            o_tensor_shape_packed = o_tensor_dummy_packed.shape
+            # append all output tensor info to relevant lists
+            odt.append("DataType['%s']" % o_tensor_dt.name)
+            oshape_normal.append(o_tensor_shape_normal)
+            oshape_folded.append(o_tensor_shape_folded)
+            oshape_packed.append(o_tensor_shape_packed)
+            odma_names.append(getCustomOp(o_producer).get_nodeattr("instance_name"))
 
         # generate external weights npy files
         weights_dir = pynq_driver_dir + "/runtime_weights"
@@ -161,47 +208,50 @@ class MakePYNQDriver(Transformation):
                 node.op_type == "StreamingDataflowPartition"
             ), "CreateDataflowPartition needs to be applied before driver generation"
 
-            producer = model.find_producer(node.input[0])
-            init_tensor = model.get_initializer(node.input[0])
+            if len(node.input) > 0:
+                producer = model.find_producer(node.input[0])
+                init_tensor = model.get_initializer(node.input[0])
+            else:
+                producer = None
+                init_tensor = None
 
             if producer is None:  # input dma?
-                idma_name = "idma" + str(idma_idx)
-                if init_tensor is not None:  # input weights dma?
+                sdp_inst = getCustomOp(node)
+                idma_name = sdp_inst.get_nodeattr("instance_name")
+                df_model = ModelWrapper(sdp_inst.get_nodeattr("model"))
+                assert df_model.graph.node[0].op_type == "IODMA"
+                iodma_node = getCustomOp(df_model.graph.node[0])
+                if iodma_node.get_nodeattr("burstMode") == "wrap":  # input weights dma?
+                    init_tensor = df_model.get_initializer(
+                        iodma_node.onnx_node.input[0]
+                    )
                     ext_weight_dma_cnt += 1
-                    w_dtype = model.get_tensor_datatype(node.input[0])
+                    w_dtype = df_model.get_tensor_datatype(
+                        iodma_node.onnx_node.input[0]
+                    )
                     init_external_tensor = to_external_tensor(init_tensor, w_dtype)
                     np.save(
                         weights_dir + "/" + idma_name + ".npy", init_external_tensor
                     )
-                else:
-                    net_input_name = idma_name
-
                 idma_idx += 1
 
         # fill in the driver template
         driver_py = pynq_driver_dir + "/driver.py"
         driver = template_driver.pynq_driver_template
 
-        def mss(x, batch_var_name="1"):
-            # "make shape string"
-            # for a shape like (1, ...) emit a string (N, ...)
-            # where N is the default value for batch_var_name
-            # this lets the driver work with a batch of samples at once
-            ret = str(x)
-            ret = ret.replace("(1,", "(%s," % batch_var_name)
-            ret = ret.replace("[1,", "[%s," % batch_var_name)
-            return ret
-
         driver = driver.replace("$PLATFORM$", self.platform)
-        driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
-        driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal))
-        driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded))
-        driver = driver.replace("$INPUT_SHAPE_PACKED$", mss(i_tensor_shape_packed))
-        driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt))
-        driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
-        driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
-        driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
-        driver = driver.replace("$INPUT_DMA_NAME$", "'%s'" % net_input_name)
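+        # idt/odt entries are strings like DataType['UINT8']; stripping the
+        # double quotes makes them appear in the generated driver as code
+        # rather than as string literals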
+        driver = driver.replace("$INPUT_FINN_DATATYPE$", str(idt).replace('"', ""))
+        driver = driver.replace("$INPUT_SHAPE_NORMAL$", str(ishape_normal))
+        driver = driver.replace("$INPUT_SHAPE_FOLDED$", str(ishape_folded))
+        driver = driver.replace("$INPUT_SHAPE_PACKED$", str(ishape_packed))
+        driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(odt).replace('"', ""))
+        driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", str(oshape_normal))
+        driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", str(oshape_folded))
+        driver = driver.replace("$OUTPUT_SHAPE_PACKED$", str(oshape_packed))
+        driver = driver.replace("$INPUT_DMA_NAME$", str(idma_names))
+        driver = driver.replace("$OUTPUT_DMA_NAME$", str(odma_names))
+        driver = driver.replace("$NUM_INPUTS$", str(len(idma_names)))
+        driver = driver.replace("$NUM_OUTPUTS$", str(len(odma_names)))
         driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt))
 
         with open(driver_py, "w") as f:
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index f2f172139ed9144eccdfbe37d0ae1a0695d049e4..84d587b6cecea63cb3be41a4a73bcc24aeb822f3 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -28,27 +28,24 @@
 
 import os
 import subprocess
+from shutil import copy
 
+from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
-from finn.core.modelwrapper import ModelWrapper
-from finn.util.basic import get_by_name, make_build_dir
-from finn.util.basic import pynq_part_map
-
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.floorplan import Floorplan
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
-from finn.transformation.fpgadataflow.floorplan import Floorplan
 from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.transformation.infer_data_layouts import InferDataLayouts
-from shutil import copy
-from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.util.basic import make_build_dir, pynq_part_map
 
 from . import templates
 
@@ -56,19 +53,22 @@ from . import templates
 def collect_ip_dirs(model, ipstitch_path):
     # collect list of all IP dirs
     ip_dirs = []
+    need_memstreamer = False
     for node in model.graph.node:
-        ip_dir_attribute = get_by_name(node.attribute, "ip_path")
-        assert (
-            ip_dir_attribute is not None
-        ), """Node attribute "ip_path" is
-        empty. Please run transformation HLSSynth_ipgen first."""
-        ip_dir_value = ip_dir_attribute.s.decode("UTF-8")
+        node_inst = getCustomOp(node)
+        ip_dir_value = node_inst.get_nodeattr("ip_path")
         assert os.path.isdir(
             ip_dir_value
         ), """The directory that should
         contain the generated ip blocks doesn't exist."""
         ip_dirs += [ip_dir_value]
+        if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
+            if node_inst.get_nodeattr("mem_mode") == "decoupled":
+                need_memstreamer = True
     ip_dirs += [ipstitch_path + "/ip"]
+    if need_memstreamer:
+        # add RTL streamer IP
+        ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
     return ip_dirs
 
 
@@ -144,16 +144,21 @@ class MakeZYNQProject(Transformation):
             # assume only one connection from each ip to the next
             # all aximm allocated to DDR[0]
             # all kernels allocated to SLR0
-            producer = model.find_producer(node.input[0])
+            if len(node.input) == 0:
+                producer = None
+            else:
+                producer = model.find_producer(node.input[0])
             consumer = model.find_consumers(node.output[0])
             # define kernel instances
             # name kernels connected to graph inputs as idmaxx
-            # name kernels connected to graph inputs as odmaxx
+            # name kernels connected to graph outputs as odmaxx
             if producer is None or consumer is None:
                 if producer is None:
                     instance_names[node.name] = "idma" + str(idma_idx)
+                    idma_idx += 1
                 elif consumer is None:
                     instance_names[node.name] = "odma" + str(odma_idx)
+                    odma_idx += 1
                 config.append(
                     "create_bd_cell -type ip -vlnv %s %s"
                     % (vivado_stitch_vlnv, instance_names[node.name])
@@ -178,7 +183,7 @@ class MakeZYNQProject(Transformation):
                     "assign_axi_addr_proc %s/%s"
                     % (instance_names[node.name], axilite_intf_name)
                 )
-                idma_idx += 1
+
                 aximm_idx += 1
                 axilite_idx += 1
             else:
@@ -199,6 +204,7 @@ class MakeZYNQProject(Transformation):
                         % (instance_names[node.name], axilite_intf_name)
                     )
                     axilite_idx += 1
+            sdp_node.set_nodeattr("instance_name", instance_names[node.name])
 
             config.append(
                 "connect_bd_net [get_bd_pins %s/ap_clk] "
@@ -297,12 +303,19 @@ class ZynqBuild(Transformation):
 
     """
 
-    def __init__(self, platform, period_ns, enable_debug=False):
+    def __init__(
+        self,
+        platform,
+        period_ns,
+        enable_debug=False,
+        partition_model_dir=None,
+    ):
         super().__init__()
         self.fpga_part = pynq_part_map[platform]
         self.period_ns = period_ns
         self.platform = platform
         self.enable_debug = enable_debug
+        self.partition_model_dir = partition_model_dir
 
     def apply(self, model):
         # first infer layouts
@@ -312,7 +325,7 @@ class ZynqBuild(Transformation):
             InsertIODMA(64),
             InsertDWC(),
             Floorplan(),
-            CreateDataflowPartition(),
+            CreateDataflowPartition(partition_model_dir=self.partition_model_dir),
         ]
         for trn in prep_transforms:
             model = model.transform(trn)
@@ -334,7 +347,7 @@ class ZynqBuild(Transformation):
             kernel_model = kernel_model.transform(HLSSynthIP())
             kernel_model = kernel_model.transform(
                 CreateStitchedIP(
-                    self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True
+                    self.fpga_part, self.period_ns, sdp_node.onnx_node.name, False
                 )
             )
             kernel_model.set_metadata_prop("platform", "zynq-iodma")
@@ -347,6 +360,4 @@ class ZynqBuild(Transformation):
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "zynq-iodma")
 
-        # create driver
-        model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
         return (model, False)
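
Since `MakePYNQDriver` is no longer invoked inside `ZynqBuild`, callers that relied on the implicit driver generation must now run it themselves. A hedged usage sketch under that assumption (model file name and board name are illustrative):

```python
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild

model = ModelWrapper("dataflow_model.onnx")  # illustrative file name
model = model.transform(
    ZynqBuild(
        platform="Pynq-Z1",        # any key of pynq_part_map
        period_ns=10.0,            # target clock period
        partition_model_dir=None,  # new kwarg, forwarded to CreateDataflowPartition
    )
)
# driver generation is now an explicit, separate step
model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
```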
diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
index 653ec02ff306bf35d5fd3f7265404e61641077ac..8b332972cac6bf001490c0c2396174be175d6d33 100644
--- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py
+++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
@@ -26,15 +26,14 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import copy
+import multiprocessing as mp
 import os
 
 import finn.custom_op.registry as registry
-from finn.util.basic import make_build_dir
-from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.transformation.base import Transformation
-from finn.util.basic import get_num_default_workers
-import multiprocessing as mp
-import copy
+from finn.util.basic import get_num_default_workers, make_build_dir
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 def _codegen_single_node(node, model):
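
The reordered imports in `prepare_cppsim.py` hint at its structure: `multiprocessing` and `get_num_default_workers` drive parallel per-node code generation via `_codegen_single_node`. A hedged sketch of that pattern, not the transformation's literal body (the helper below is made up for illustration):

```python
import multiprocessing as mp

def codegen_all_nodes(nodes, model, num_workers):
    # generate C++ simulation code for each node in parallel,
    # one (node, model) work item per graph node
    with mp.Pool(num_workers) as pool:
        pool.starmap(_codegen_single_node, [(n, model) for n in nodes])
```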
diff --git a/src/finn/transformation/fpgadataflow/prepare_ip.py b/src/finn/transformation/fpgadataflow/prepare_ip.py
index 4ed5e80aa7baa585f83314ec42233d5885dff32d..4fdcf3939fe6d879abe36907a1bf84a417cb9903 100644
--- a/src/finn/transformation/fpgadataflow/prepare_ip.py
+++ b/src/finn/transformation/fpgadataflow/prepare_ip.py
@@ -27,11 +27,12 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
+import warnings
+
 import finn.custom_op.registry as registry
 from finn.transformation.base import Transformation
 from finn.util.basic import make_build_dir
 from finn.util.fpgadataflow import is_fpgadataflow_node
-import warnings
 
 
 def _codegen_single_node(node, model, fpgapart, clk):
diff --git a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py
index eaa85b9102b55bf8ecdf3a9f284f87468581e113..66799ff4297ad0e2f8afa9261b0f3f983b27452d 100644
--- a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py
+++ b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py
@@ -27,11 +27,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import finn.custom_op.registry as registry
-from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.transformation.base import NodeLocalTransformation
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
-from finn.transformation.base import NodeLocalTransformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 try:
     from pyverilator import PyVerilator
diff --git a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
index cc7c305b3ec94482e64235a1b1cf4eee543c46e1..7850d37423a9add0880e054c7b035b9e735c7f25 100644
--- a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
+++ b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
@@ -29,8 +29,8 @@
 import os
 
 import finn.custom_op.registry as registry
-from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.transformation.base import Transformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 class ReplaceVerilogRelPaths(Transformation):
diff --git a/src/finn/transformation/fpgadataflow/set_exec_mode.py b/src/finn/transformation/fpgadataflow/set_exec_mode.py
index 4677e59f7b35fec38aeaae65485ed16ba1e18f06..caf891bc4444a65976103746685b2e79abdd708f 100644
--- a/src/finn/transformation/fpgadataflow/set_exec_mode.py
+++ b/src/finn/transformation/fpgadataflow/set_exec_mode.py
@@ -27,8 +27,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import finn.custom_op.registry as registry
-from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.transformation.base import Transformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 class SetExecMode(Transformation):
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index ea27eee04db6f90b50a58296ceaf6f6ed58602ac..39eb049565475b462ea0df9d88b46e3598e6cdd9 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -29,16 +29,17 @@
 import math
 import numpy as np
 import warnings
+
+from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
-from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
-from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.util.pyverilator import pyverilate_stitched_ip, reset_rtlsim, toggle_clk
 
@@ -86,9 +87,14 @@ class RemoveShallowFIFOs(Transformation):
     def apply(self, model):
         shallow_fifos = []
         for node in model.graph.node:
+            if len(node.input) > 0:
+                is_first_node = model.find_producer(node.input[0]) is None
+            else:
+                is_first_node = True
             if (
                 node.op_type == "StreamingFIFO"
                 and getCustomOp(node).get_nodeattr("depth") <= self.shallow_threshold
+                and (not is_first_node)
             ):
                 # bypass shallow fifos
                 shallow_fifos.append(node)
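
The new guard keeps shallow FIFOs that sit directly on a graph input: a node with no producer (or no inputs at all) counts as first in the chain and is never bypassed. The same predicate as a standalone helper, for illustration (the helper name is made up):

```python
from finn.custom_op.registry import getCustomOp

def is_removable_shallow_fifo(model, node, shallow_threshold):
    # FIFOs feeding off a graph input are kept even when shallow
    if len(node.input) > 0:
        is_first_node = model.find_producer(node.input[0]) is None
    else:
        is_first_node = True
    return (
        node.op_type == "StreamingFIFO"
        and getCustomOp(node).get_nodeattr("depth") <= shallow_threshold
        and not is_first_node
    )
```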
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index bb4e0e1db51d331400e7a294890eb998c2aa4e1d..64d7a080724820d58a026bafbe74a4d7567b2179 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -26,13 +26,15 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import numpy as np
+import warnings
+
+from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
-from finn.util.fpgadataflow import is_fpgadataflow_node
-from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
 from finn.transformation.general import GiveUniqueNodeNames
-import warnings
+from finn.util.fpgadataflow import is_fpgadataflow_node
 
 
 def divisors(num):
@@ -153,9 +155,16 @@ class SetFolding(Transformation):
                     pe = node_inst.get_nodeattr("PE")
                     swu_node_inst.set_nodeattr("SIMD", pe)
                 else:
-                    raise Exception(
-                        "Expected SWU on DW op input, found " + swu_node.op_type
-                    )
+                    if op_type == "Vector_Vector_Activate_Batch":
+                        ksize = np.prod(node_inst.get_nodeattr("Kernel"))
+                    elif op_type == "Pool_Batch":
+                        ksize = node_inst.get_nodeattr("KernelSize")
+                    else:
+                        raise Exception("Undefined edge case for %s" % op_type)
+                    if ksize != 1:  # pointwise vvau/pool lack a SWU
+                        raise Exception(
+                            "Expected SWU on DW op input, found " + swu_node.op_type
+                        )
             elif op_type in simd_ops:
                 if op_type == "ConvolutionInputGenerator":
                     depthwise = node_inst.get_nodeattr("depthwise")
diff --git a/src/finn/transformation/fpgadataflow/synth_ooc.py b/src/finn/transformation/fpgadataflow/synth_ooc.py
index acc20e4ad0d0f4a17b20fd77625e9954f09e69aa..49cd6c82bca9ff7578314180c1ba433d63a32087 100644
--- a/src/finn/transformation/fpgadataflow/synth_ooc.py
+++ b/src/finn/transformation/fpgadataflow/synth_ooc.py
@@ -30,8 +30,8 @@ import os
 from shutil import copy2
 
 from finn.transformation.base import Transformation
-from finn.util.vivado import out_of_context_synth
 from finn.util.basic import make_build_dir
+from finn.util.vivado import out_of_context_synth
 
 
 class SynthOutOfContext(Transformation):
@@ -52,10 +52,11 @@ class SynthOutOfContext(Transformation):
         top_module_name = model.get_metadata_prop("wrapper_filename")
         top_module_name = file_to_basename(top_module_name).strip(".v")
         build_dir = make_build_dir("synth_out_of_context_")
+        verilog_extensions = [".v", ".vh"]
         with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f:
             all_verilog_srcs = f.read().split()
         for file in all_verilog_srcs:
-            if file.endswith(".v"):
+            if any([file.endswith(x) for x in verilog_extensions]):
                 copy2(file, build_dir)
         ret = out_of_context_synth(
             build_dir, top_module_name, self.part, self.clk_name, self.clk_period_ns
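
Copying `.vh` headers alongside `.v` sources matters because out-of-context synthesis works from a fresh build directory; any header left behind would break include resolution. The filter is a one-liner whose behavior can be checked in isolation (file names are illustrative):

```python
verilog_extensions = [".v", ".vh"]
srcs = ["top.v", "memstream.vh", "notes.txt"]
kept = [f for f in srcs if any(f.endswith(x) for x in verilog_extensions)]
assert kept == ["top.v", "memstream.vh"]
```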
diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py
index 5265835dd2530a5c93ceefbef629a43d6f33de52..31dd22573e35894794dc522c0cf6ab47ce6c6cfc 100644
--- a/src/finn/transformation/fpgadataflow/template_driver.py
+++ b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -79,7 +79,10 @@ io_shape_dict = {
     "ishape_packed" : $INPUT_SHAPE_PACKED$,
     "oshape_packed" : $OUTPUT_SHAPE_PACKED$,
     "input_dma_name" : $INPUT_DMA_NAME$,
-    "number_of_external_weights": $EXT_WEIGHT_NUM$
+    "output_dma_name" : $OUTPUT_DMA_NAME$,
+    "number_of_external_weights": $EXT_WEIGHT_NUM$,
+    "num_inputs" : $NUM_INPUTS$,
+    "num_outputs" : $NUM_OUTPUTS$,
 }
 
 if __name__ == "__main__":
@@ -88,8 +91,8 @@ if __name__ == "__main__":
     parser.add_argument('--platform', help='Target platform: zynq-iodma alveo', default="$PLATFORM$")
     parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1)
     parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit")
-    parser.add_argument('--inputfile', help='name of input npy file (i.e. "input.npy")', default="input.npy")
-    parser.add_argument('--outputfile', help='name of output npy file (i.e. "output.npy")', default="output.npy")
+    parser.add_argument('--inputfile', help='name(s) of input npy file(s) (e.g. "input.npy")', nargs="*", type=str, default=["input.npy"])
+    parser.add_argument('--outputfile', help='name(s) of output npy file(s) (e.g. "output.npy")', nargs="*", type=str, default=["output.npy"])
     parser.add_argument('--runtime_weight_dir', help='path to folder containing runtime-writable .dat weights', default="runtime_weights/")
     # parse arguments
     args = parser.parse_args()
@@ -111,16 +114,15 @@ if __name__ == "__main__":
     # for the remote execution the data from the input npy file has to be loaded,
     # packed and copied to the PYNQ buffer
     if exec_mode == "execute":
-        # remove old output file to prevent reusing old output
-        # in case execution fails
-        try:
-            os.remove(outputfile)
-        except FileNotFoundError:
-            pass
-        # load desired input .npy file
-        ibuf_normal = np.load(inputfile)
+        # load desired input .npy file(s)
+        ibuf_normal = []
+        for ifn in inputfile:
+            ibuf_normal.append(np.load(ifn))
         obuf_normal = accel.execute(ibuf_normal)
-        np.save(outputfile, obuf_normal)
+        if not isinstance(obuf_normal, list):
+            obuf_normal = [obuf_normal]
+        for o, obuf in enumerate(obuf_normal):
+            np.save(outputfile[o], obuf)
     elif exec_mode == "throughput_test":
         # remove old metrics file
         try:
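
With `nargs="*"`, both `--inputfile` and `--outputfile` now accept several file names, matching multi-input/multi-output accelerators. A minimal, self-contained demonstration of the parsing behavior (file names are illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--inputfile", nargs="*", type=str, default=["input.npy"])
parser.add_argument("--outputfile", nargs="*", type=str, default=["output.npy"])

args = parser.parse_args(["--inputfile", "in0.npy", "in1.npy"])
assert args.inputfile == ["in0.npy", "in1.npy"]
assert args.outputfile == ["output.npy"]  # defaults stay one-element lists
```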
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index bd5de66cf575cd2e4eee0bf064a611e736a4b76e..a12f359c7d3f1c29a17694ef4987a1a349286234 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -103,8 +103,8 @@ create_project finn_zynq_link ./ -part $FPGA_PART
 # set board part repo paths to find PYNQ-Z1/Z2
 set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]]
 set paths_param [get_param board.repoPaths]
-lappend paths_prop /workspace/finn/board_files
-lappend paths_param /workspace/finn/board_files
+lappend paths_prop /workspace/board_files
+lappend paths_param /workspace/board_files
 set_property BOARD_PART_REPO_PATHS $paths_prop [current_project]
 set_param board.repoPaths $paths_param
 
@@ -119,6 +119,7 @@ if {$BOARD == "ZCU104"} {
     set ZYNQ_TYPE "zynq_us+"
 } elseif {$BOARD == "Pynq-Z2"} {
     set ZYNQ_TYPE "zynq_7000"
+    set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
 } elseif {$BOARD == "Pynq-Z1"} {
     set ZYNQ_TYPE "zynq_7000"
     set_property board_part www.digilentinc.com:pynq-z1:part0:1.0 [current_project]
@@ -134,6 +135,7 @@ if {$ZYNQ_TYPE == "zynq_us+"} {
     set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {1}] [get_bd_cells zynq_ps]
     set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {0}] [get_bd_cells zynq_ps]
     #set frequency of PS clock (this can't always be exactly met)
+    set_property -dict [list CONFIG.PSU__OVERRIDE__BASIC_CLOCK {0}] [get_bd_cells zynq_ps]
     set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps]
 } elseif {$ZYNQ_TYPE == "zynq_7000"} {
     create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 zynq_ps
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index c52dfcf0cde4cbbb393786809852bc965c2856db..a2865321418343efbfdae12c111ba4334ecfee28 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -26,34 +26,33 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import json
 import os
 import subprocess
-import json
+from enum import Enum
 
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.base import Transformation
 from finn.custom_op.registry import getCustomOp
-
+from finn.transformation.base import Transformation
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.floorplan import Floorplan
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
-from finn.transformation.fpgadataflow.floorplan import Floorplan
-from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.general import (
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
     RemoveUnusedTensors,
 )
-from finn.util.basic import make_build_dir
 from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.util.basic import make_build_dir
+
 from . import templates
-from enum import Enum
 
 
 def _check_vitis_envvars():
@@ -207,7 +206,10 @@ class VitisLink(Transformation):
             # has axis, aximm and axilite
             # everything else is axis-only
             # assume only one connection from each ip to the next
-            producer = model.find_producer(node.input[0])
+            if len(node.input) == 0:
+                producer = None
+            else:
+                producer = model.find_producer(node.input[0])
             consumer = model.find_consumers(node.output[0])
             # define kernel instances
             # name kernels connected to graph inputs as idmaxx
@@ -223,6 +225,7 @@ class VitisLink(Transformation):
             else:
                 instance_names[node.name] = node.name
                 config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
+            sdp_node.set_nodeattr("instance_name", instance_names[node.name])
             # explicitly assign SLRs if the slr attribute is not -1
             node_slr = sdp_node.get_nodeattr("slr")
             if node_slr != -1:
@@ -231,7 +234,7 @@ class VitisLink(Transformation):
             if producer is None or consumer is None:
                 node_mem_port = sdp_node.get_nodeattr("mem_port")
                 if node_mem_port == "":
-                    #configure good defaults based on board
+                    # configure good defaults based on board
                     if "u50" in self.platform or "u280" in self.platform:
                         # Use HBM where available (also U50 does not have DDR)
                         mem_type = "HBM"
@@ -251,7 +254,9 @@ class VitisLink(Transformation):
                         mem_type = "DDR"
                         mem_idx = 1
                     node_mem_port = "%s[%d]" % (mem_type, mem_idx)
-                config.append("sp=%s.m_axi_gmem0:%s" % (instance_names[node.name], node_mem_port))
+                config.append(
+                    "sp=%s.m_axi_gmem0:%s" % (instance_names[node.name], node_mem_port)
+                )
             # connect streams
             if producer is not None:
                 for i in range(len(node.input)):
@@ -373,6 +378,7 @@ class VitisBuild(Transformation):
         enable_debug=False,
         floorplan_file=None,
         enable_link=True,
+        partition_model_dir=None,
     ):
         super().__init__()
         self.fpga_part = fpga_part
@@ -382,6 +388,7 @@ class VitisBuild(Transformation):
         self.enable_debug = enable_debug
         self.floorplan_file = floorplan_file
         self.enable_link = enable_link
+        self.partition_model_dir = partition_model_dir
 
     def apply(self, model):
         _check_vitis_envvars()
@@ -396,7 +403,9 @@ class VitisBuild(Transformation):
 
         model = model.transform(Floorplan(floorplan=self.floorplan_file))
 
-        model = model.transform(CreateDataflowPartition())
+        model = model.transform(
+            CreateDataflowPartition(partition_model_dir=self.partition_model_dir)
+        )
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(GiveReadableTensorNames())
 
@@ -437,6 +446,4 @@ class VitisBuild(Transformation):
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "alveo")
 
-        # create driver
-        model = model.transform(MakePYNQDriver(platform="alveo"))
         return (model, False)
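
`VitisBuild` mirrors the `ZynqBuild` changes: driver generation moved out, and `partition_model_dir` is threaded through to `CreateDataflowPartition`. A hedged usage sketch; the part and platform strings are examples, and passing these as keyword arguments is an assumption about the full constructor signature:

```python
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
from finn.transformation.fpgadataflow.vitis_build import VitisBuild

model = ModelWrapper("dataflow_model.onnx")  # illustrative file name
model = model.transform(
    VitisBuild(
        fpga_part="xcu250-figd2104-2L-e",      # example Alveo part
        period_ns=5.0,
        platform="xilinx_u250_xdma_201830_2",  # example Vitis platform
        partition_model_dir=None,              # new kwarg
    )
)
# as with ZynqBuild, the driver is now generated explicitly
model = model.transform(MakePYNQDriver(platform="alveo"))
```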
diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py
index 990b858ad62aec00be4be4e0dd30bef3eb9e3ce3..6c9a2973376be2c4744bc23db2cc975be8e7d52a 100644
--- a/src/finn/transformation/move_reshape.py
+++ b/src/finn/transformation/move_reshape.py
@@ -1,7 +1,8 @@
+import warnings
+
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.util.basic import get_by_name, is_finn_op
-from finn.custom_op.registry import getCustomOp
-import warnings
 
 
 def _is_fpgadataflow_node(node):
diff --git a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
new file mode 100644
index 0000000000000000000000000000000000000000..70656e4d0987924ba43d0e657414d0d172feb5ce
--- /dev/null
+++ b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit
+
+from finn.transformation.base import Transformation
+from finn.transformation.extract_conv_bias import ExtractBiasFromConv
+from finn.transformation.gemm_to_matmul import GemmToMatMul
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights
+from finn.transformation.qonnx.infer_quant_avg_pool_2d import (
+    AvgPoolAndTruncToQuantAvgPool,
+)
+from finn.transformation.qonnx.quant_act_to_multithreshold import (
+    ConvertQuantActToMultiThreshold,
+    default_filter_function_generator,
+)
+from finn.transformation.remove import RemoveIdentityOps
+
+
+class ConvertQONNXtoFINN(Transformation):
+    """Converts QONNX dialect to FINN ONNX dialect.
+    First the weights are converted using the FoldQuantWeights transformation,
+    then the ConvertQuantActToMultiThreshold transformation is used to convert
+    the activations.
+    If incompatibilities are found, a ValueError or RuntimeError is raised.
+
+    The optional keyword argument `filter_function`
+    presents a way to control which Quant and BipolarQuant nodes in the activation path
+    are converted to MultiThreshold nodes. A warning will be emitted when a Quant node
+    is not converted to a MultiThreshold node.
+
+    :param filter_function: Each candidate Quant and BipolarQuant node is first
+    evaluated by this function. If the function returns False,
+    then the node is not converted to a MultiThreshold node.
+    The function is given the model and the candidate node as parameters.
+    By default, a filter function is inserted that disables the conversion of
+    Quant nodes with a bit width larger than 8.
+    Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
+    """
+
+    def __init__(
+        self,
+        filter_function=default_filter_function_generator(
+            max_multithreshold_bit_width=8
+        ),
+    ):
+        super().__init__()
+        self._filter_function = filter_function
+
+    def apply(self, model):
+        # Extract the bias from Conv node
+        model = model.transform(ExtractBiasFromConv())
+        # Gemm operations are not supported by FINN, so we convert them to MatMul
+        model = model.transform(GemmToMatMul())
+        model = model.transform(FoldTransposeIntoQuantInit())
+        # Make sure the datatypes exist, these are required for folding the weights
+        model = model.transform(InferDataTypes())
+        # Fold weights
+        model = model.transform(FoldQuantWeights())
+        # Convert activations
+        model = model.transform(
+            ConvertQuantActToMultiThreshold(
+                filter_function=self._filter_function,
+            )
+        )
+        # Recompute datatypes
+        model = model.transform(InferDataTypes())
+        # Convert AvgPool -> Mul -> Trunc structure to QuantAvgPool2d
+        model = model.transform(AvgPoolAndTruncToQuantAvgPool())
+        # Remove empty padding if it exists
+        model = model.transform(RemoveIdentityOps())
+
+        return model, False
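
Per the docstring, a custom `filter_function` can keep selected Quant nodes out of the MultiThreshold conversion. A hedged usage sketch; the filter below is illustrative and assumes the standard QONNX Quant input ordering, where input 3 holds the bit width:

```python
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN

def only_narrow_quants(model, q_node):
    # illustrative filter: convert only Quant nodes with bit width <= 4;
    # BipolarQuant nodes carry no bit-width input and are always converted
    if len(q_node.input) <= 3:
        return True
    bit_width = model.get_initializer(q_node.input[3])
    return bit_width is not None and int(bit_width) <= 4

model = ModelWrapper("qonnx_model.onnx")  # illustrative file name
model = model.transform(ConvertQONNXtoFINN(filter_function=only_narrow_quants))
```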
diff --git a/src/finn/transformation/qonnx/fold_quant_weights.py b/src/finn/transformation/qonnx/fold_quant_weights.py
new file mode 100644
index 0000000000000000000000000000000000000000..12c854d3bab2b762abc3649e15beff29ff8de3ac
--- /dev/null
+++ b/src/finn/transformation/qonnx/fold_quant_weights.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+from onnx import TensorProto, helper
+from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit
+
+import finn.core.onnx_exec as oxe
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.base import Transformation
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.remove import remove_node_and_rewire
+
+
+class FoldQuantWeights(Transformation):
+    """Merges Quant nodes, which are used as weights into the initializer
+    of the weight tensor.
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        execution_context = model.make_empty_exec_context()
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Quant" or n.op_type == "BipolarQuant":
+                node_inp_inits = list(map(lambda x: model.get_initializer(x), n.input))
+                node_inp_dyn = list(filter(lambda x: x is None, node_inp_inits))
+                node_out = n.output[0]
+                is_all_constant_inputs = len(node_inp_dyn) == 0
+                # note: the "Shape" special case from generic constant folding
+                # can never trigger here, since only Quant and BipolarQuant
+                # nodes reach this branch
+                if is_all_constant_inputs:
+                    # Check node validity
+                    if (
+                        n.op_type == "Quant"
+                        and not model.get_initializer(n.input[2]) == 0
+                    ):
+                        raise ValueError(
+                            "Only Quant nodes with zero-point == 0 "
+                            "are currently supported."
+                        )
+                    if model.is_fork_node(n):
+                        raise ValueError(
+                            "Weights quantized with the Quant node are not "
+                            "allowed to be fork nodes node."
+                        )
+                    target_node = model.find_direct_successors(n)
+                    if target_node is None:
+                        raise RuntimeError(
+                            "Weights quantized with the Quant node must have "
+                            "a successor node."
+                        )
+                    else:
+                        target_node = target_node[0]
+                    # If there is a DebugMarker in the weight path,
+                    # it needs to be removed before any further action is
+                    # taken, because it interferes with how the constant
+                    # folding determines how to apply scale factors; in any
+                    # case the DebugMarker would not return useful
+                    # information after folding.
+                    if target_node.op_type == "DebugMarker":
+                        remove_node_and_rewire(model, target_node)
+                        model = model.transform(FoldTransposeIntoQuantInit())
+                        return model, True
+
+                    # Continue with constant folding the quant node
+                    scale = model.get_initializer(n.input[1])
+                    unity_scale = (scale.flatten() == 1.0).all()
+                    # this node has no dynamic inputs, only constant ones -- so we can
+                    # do constant folding.
+                    oxe.execute_node(n, execution_context, graph)
+                    q_node_output = execution_context[node_out]
+                    # Check we can directly constant fold
+                    if unity_scale:
+                        # use the execution result as an initializer
+                        model.set_initializer(node_out, q_node_output)
+                    else:
+                        # Check next operator type
+                        mul_like_nodes = ["Mul", "Div", "Conv", "MatMul"]
+                        add_like_nodes = ["Add", "Sub"]
+                        all_supported_ops = mul_like_nodes.copy()
+                        all_supported_ops.extend(add_like_nodes)
+
+                        if target_node.op_type not in all_supported_ops:
+                            raise ValueError(
+                                f"Can't constant fold Quant weight node "
+                                f"into node type {target_node.op_type} "
+                                f"at node: {target_node}."
+                            )
+
+                        # For both mul-like and add-like ops:
+                        # move the scale factor behind the next operator
+                        scale = model.get_initializer(n.input[1])
+                        new_initializer = q_node_output / scale
+                        # Round, to correct for floating point errors
+                        new_initializer = np.round(new_initializer)
+                        model.set_initializer(node_out, new_initializer)
+                        q_inst = getCustomOp(n)
+                        new_dtype = q_inst.get_integer_datatype(model)
+                        model.set_tensor_datatype(node_out, new_dtype)
+
+                        # Reshape scale for Conv if required
+                        if target_node.op_type == "Conv" and len(scale.shape) > 0:
+                            bias_shape = [1] * len(scale.shape)
+                            bias_shape[1] = -1
+                            scale = scale.reshape(bias_shape)
+
+                        if scale.shape == (1,):
+                            scale = scale[0]
+                            mul_shape = tuple()
+                        else:
+                            mul_shape = scale.shape
+                        mul_tensor = helper.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            mul_shape,
+                        )
+                        graph.value_info.append(mul_tensor)
+                        model.set_initializer(mul_tensor.name, scale)
+
+                        successor = model.find_consumers(node_out)
+                        if successor is None:
+                            raise RuntimeError(
+                                "Can only constant fold scaled Quant weights "
+                                "if a successor exists."
+                            )
+                        successor = successor[0]
+                        succ_output_name = successor.output[0]
+
+                        output_shape = model.get_tensor_shape(successor.output[0])
+                        act_mul_tensor = helper.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            output_shape,
+                        )
+                        graph.value_info.append(act_mul_tensor)
+                        successor.output[0] = act_mul_tensor.name
+
+                        mul_node = helper.make_node(
+                            "Mul",
+                            [act_mul_tensor.name, mul_tensor.name],
+                            [succ_output_name],
+                        )
+                        graph.node.insert(node_ind, mul_node)
+
+                        if target_node.op_type in add_like_nodes:
+                            # for add-like ops the scale factor must also be
+                            # moved in front of the next operator
+                            div_tensor = helper.make_tensor_value_info(
+                                model.make_new_valueinfo_name(),
+                                TensorProto.FLOAT,
+                                mul_shape,
+                            )
+                            graph.value_info.append(div_tensor)
+                            model.set_initializer(div_tensor.name, scale)
+
+                            succ_input_name = successor.input[0]
+                            act_mul_tensor = helper.make_tensor_value_info(
+                                model.make_new_valueinfo_name(),
+                                TensorProto.FLOAT,
+                                output_shape,
+                            )
+                            graph.value_info.append(act_mul_tensor)
+                            successor.input[0] = act_mul_tensor.name
+
+                            div_node = helper.make_node(
+                                "Div",
+                                [succ_input_name, div_tensor.name],
+                                [act_mul_tensor.name],
+                            )
+                            graph.node.insert(node_ind, div_node)
+
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified = True
+                    model = model.transform(InferShapes())
+                    return (model, graph_modified)
+        return (model, graph_modified)
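
The folding rests on the identity `w = round(w / s) * s` for weights that lie on the scale grid: the integer part becomes the new initializer, and the scale survives as a Mul inserted behind the consumer (plus a compensating Div in front, for add-like ops). A small numeric check of the mul-like case, with made-up values:

```python
import numpy as np

w = np.array([0.5, -1.0, 1.5])  # quantized weights, already on the scale grid
s = 0.5                         # scale factor of the Quant node
x = np.array([2.0, 3.0, 4.0])   # some activation

q = np.round(w / s)             # folded integer initializer
# moving the scale behind the Mul/MatMul leaves the result unchanged
assert np.allclose(x * w, (x * q) * s)
```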
diff --git a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..faad31fa06e76b245f25b6f0aa583fec5c0da29a
--- /dev/null
+++ b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py
@@ -0,0 +1,315 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import math
+from onnx import TensorProto, helper
+
+from finn.core.datatype import DataType
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.base import Transformation
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import get_by_name
+
+
+def _get_signed_from_upstream(model, trunc_node):
+    """
+    Find out the sign of the input to the Trunc node
+    by looking at the upstream nodes.
+    """
+    node = trunc_node
+    # Check if the input of this node already has a FINN datatype
+    signed = None
+    inp_dt = model.get_tensor_datatype(node.input[0])
+    if inp_dt is not None and inp_dt is not DataType["FLOAT32"]:
+        signed = inp_dt.signed()
+    # Go further up the graph: since datatype inference works top down,
+    # these nodes should either be sign-preserving ops or already have a
+    # datatype defined for the output tensor.
+    curr_node = node
+    if signed is None:
+        while curr_node is not None:
+            if model.is_join_node(curr_node):
+                raise RuntimeError(
+                    "Datatype Inference for the Trunc node only supports "
+                    "linear nodes in the upstream path."
+                )
+            next_node = model.find_direct_predecessors(curr_node)
+            if next_node is None:
+                raise RuntimeError(
+                    "Could not infere the Datatype for the Trunc node due to "
+                    "missing upstream ndoes."
+                )
+            next_node = next_node[0]
+            out_dt = model.get_tensor_datatype(next_node.output[0])
+            if out_dt is not None and out_dt is not DataType["FLOAT32"]:
+                signed = out_dt.signed()
+                break
+            # Special cases where the node has an internal or intrinsic datatype.
+            if next_node.op_type == "MultiThreshold":
+                mt_inst = getCustomOp(next_node)
+                out_dt = DataType[mt_inst.get_nodeattr("out_dtype")]
+                if out_dt is not None and out_dt is not DataType["FLOAT32"]:
+                    signed = out_dt.signed()
+                    break
+            if next_node.op_type == "BipolarQuant":
+                signed = True
+                break
+            if next_node.op_type == "Quant":
+                q_inst = getCustomOp(next_node)
+                out_dt = q_inst.get_integer_datatype(model)
+                if out_dt is not None and out_dt is not DataType["FLOAT32"]:
+                    signed = out_dt.signed()
+                    break
+
+            # Check if we are allowed to move on to the next op
+            sign_preserving_ops = ["Add", "Mul", "AveragePool", "Pad"]
+            if next_node.op_type not in sign_preserving_ops:
+                raise RuntimeError(
+                    f"Could not infere the Datatype for the Trunc node, "
+                    f"because the sign of the input datatype could not be infered "
+                    f"from upstream nodes. And traversal further up the graph was "
+                    f"disallowed, since the next node type {next_node.op_type} "
+                    f"is not in the list of "
+                    f"sign preserving ops {sign_preserving_ops}."
+                )
+            curr_node = next_node
+
+    if signed is None:
+        raise RuntimeError(
+            "Could not infere the Datatype for the Trunc node, "
+            "because the sign of the input datatype could not be infered "
+            "from upstream nodes."
+        )
+
+    return signed
+
+
+class AvgPoolAndTruncToQuantAvgPool(Transformation):
+    """
+    Convert a section of nodes of the pattern:
+    AveragePool -> Mul (scalar) -> Trunc
+    To the FINN op: QuantAvgPool2d
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "AveragePool":
+                mul_node = model.find_direct_successors(n)
+                if (
+                    mul_node is not None
+                    and len(mul_node) == 1
+                    and mul_node[0].op_type == "Mul"
+                ):
+                    mul_node = mul_node[0]
+                    t_node = model.find_direct_successors(mul_node)
+                    if (
+                        t_node is not None
+                        and len(t_node) == 1
+                        and t_node[0].op_type == "Trunc"
+                    ):
+                        t_node = t_node[0]
+                        running_node_index = node_ind
+                        # Check node for compatibility
+                        # Avg pooling node
+                        k_s = get_by_name(n.attribute, "kernel_shape")
+                        if k_s is None or len(k_s.ints) != 2 or len(set(k_s.ints)) != 1:
+                            raise ValueError(
+                                "FINN only supports average pooling with "
+                                "2D square kernels."
+                            )
+                        k_s = k_s.ints[0]
+
+                        pads = get_by_name(n.attribute, "pads")
+                        if (
+                            pads is None
+                            or len(set(pads.ints)) != 1
+                            or pads.ints[0] != 0
+                        ):
+                            raise ValueError(
+                                "FINN dosn't support padding for average pooling."
+                            )
+
+                        stride = get_by_name(n.attribute, "strides")
+                        if (
+                            stride is None
+                            or len(stride.ints) != 2
+                            or len(set(stride.ints)) != 1
+                        ):
+                            raise ValueError(
+                                "FINN only supports 2D strides with equal values in "
+                                "each direction."
+                            )
+                        stride = stride.ints[0]
+
+                        # Mul node
+                        mul_val = model.get_initializer(mul_node.input[1])
+                        if (
+                            mul_val is None
+                            or len(mul_val.shape) != 0
+                            or mul_val != k_s * k_s
+                        ):
+                            raise ValueError(
+                                f"The Mul node after the AveragePool node must have "
+                                f"static initialization at the second input, "
+                                f"further the initialization must be of zero dimension "
+                                f"and the value of the initialization must be "
+                                f"the quadratic value of the kernel size, "
+                                f"in this case {k_s * k_s}."
+                            )
+
+                        # Trunc node
+                        rounding_mode = get_by_name(t_node.attribute, "rounding_mode")
+                        if rounding_mode is None or rounding_mode.s != b"FLOOR":
+                            raise ValueError(
+                                "The Trunc node must have the rounding_mode "
+                                "set to 'FLOOR'."
+                            )
+                        for inp in t_node.input[1:]:
+                            if model.get_initializer(inp) is None:
+                                raise ValueError(
+                                    f"All inputs of the Trunc node, "
+                                    f"except the first, must be statically "
+                                    f"initialized. However, {inp} is not."
+                                )
+                        zero_pt = model.get_initializer(t_node.input[2])
+                        if len(zero_pt.shape) != 0 or zero_pt != 0:
+                            raise ValueError(
+                                f"Finn only supports 0 as the zero point for "
+                                f"the Trunc node, it currently is {zero_pt}."
+                            )
+                        trunc_in_bits = model.get_initializer(t_node.input[3]).flatten()
+                        trunc_out_bits = model.get_initializer(
+                            t_node.input[4]
+                        ).flatten()
+                        if (
+                            len(trunc_in_bits) != 1
+                            or len(trunc_out_bits) != 1
+                        ):
+                            raise ValueError(
+                                f"Finn only supports scalar bit widths "
+                                f"for the Trunc node. The input bit width "
+                                f"currently is: {trunc_in_bits}, "
+                                f"while the output bit width is: {trunc_out_bits}."
+                            )
+                        trunc_in_bits = int(trunc_in_bits[0])
+                        trunc_out_bits = int(trunc_out_bits[0])
+
+                        # Calculate parameters for the QuantAvgPool2d node.
+                        # The input bit width is derived by inverting this:
+                        # https://github.com/Xilinx/finn-base/blob/
+                        # 7c2603a95e90e4de2575020e575c24eab6a15889/src/finn/custom_op/
+                        # general/quantavgpool2d.py#L94
+                        ibits = math.floor(
+                            math.log(2 ** trunc_in_bits / (k_s * k_s), 2)
+                        )
+                        # Get sign
+                        signed = _get_signed_from_upstream(model, t_node)
+                        # ToDo: Change this to NHWC,
+                        #  when the channels last layout comes around.
+                        data_layout = "NCHW"
+
+                        # Insert scale nodes, QuantAvgPool2d node and required tensors
+                        scale = model.get_initializer(t_node.input[1])
+                        scale_div_tensor = helper.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            None,
+                        )
+                        graph.value_info.append(scale_div_tensor)
+                        model.set_initializer(scale_div_tensor.name, scale)
+
+                        act_scale_div_tensor = helper.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            None,
+                        )
+                        graph.value_info.append(act_scale_div_tensor)
+
+                        scale_div_node = helper.make_node(
+                            "Div",
+                            [n.input[0], scale_div_tensor.name],
+                            [act_scale_div_tensor.name],
+                        )
+                        graph.node.insert(running_node_index, scale_div_node)
+                        running_node_index += 1
+
+                        act_scale_mul_tensor = helper.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            None,
+                        )
+                        graph.value_info.append(act_scale_mul_tensor)
+
+                        QuantAvgPool2d_node = helper.make_node(
+                            "QuantAvgPool2d",
+                            [act_scale_div_tensor.name],
+                            [act_scale_mul_tensor.name],
+                            domain="finn.custom_op.general",
+                            stride=stride,
+                            kernel=k_s,
+                            ibits=ibits,
+                            obits=trunc_out_bits,
+                            signed=int(signed),
+                            data_layout=data_layout,
+                        )
+                        graph.node.insert(running_node_index, QuantAvgPool2d_node)
+                        running_node_index += 1
+
+                        scale_mul_tensor = helper.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            None,
+                        )
+                        graph.value_info.append(scale_mul_tensor)
+                        model.set_initializer(scale_mul_tensor.name, scale)
+
+                        scale_mul_node = helper.make_node(
+                            "Mul",
+                            [act_scale_mul_tensor.name, scale_mul_tensor.name],
+                            [t_node.output[0]],
+                        )
+                        graph.node.insert(running_node_index, scale_mul_node)
+                        running_node_index += 1
+
+                        # Remove old nodes
+                        graph.node.remove(n)
+                        graph.node.remove(mul_node)
+                        graph.node.remove(t_node)
+
+                        # Recompute shapes and datatypes
+                        model = model.transform(InferShapes())
+                        model = model.transform(InferDataTypes())
+
+                        return model, True
+
+        return model, False
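
The `ibits` computation inverts the accumulator-width formula linked in the comment: a k-by-k average pool grows the accumulator by `log2(k*k)` bits, so the input bit width is recovered as `floor(log2(2**trunc_in_bits / k**2))`. A quick numeric check with illustrative values:

```python
import math

trunc_in_bits = 12  # accumulator width seen by the Trunc node (example)
k_s = 2             # 2x2 average-pooling kernel

ibits = math.floor(math.log(2 ** trunc_in_bits / (k_s * k_s), 2))
assert ibits == 10  # 12 bits minus log2(4) of kernel growth
```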
diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3336b1eee7fa9d54092cd56b9ba0edaf9d0884b1
--- /dev/null
+++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
@@ -0,0 +1,524 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+from abc import ABC, abstractmethod
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.registry import getCustomOp
+
+
+class QuantActBaseHandler(ABC):
+    """Base class for converting quantized activation expressed in the QONNX dialect
+    to the FINN ONNX dialect.
+    :param model: The model on which this handler should operate.
+    :type model: class: `finn.core.modelwrapper.ModelWrapper`
+    :param quant_node: The Quant node which a given handler should replace.
+    :param quant_node_index: The index of the Quant node in the given model.
+    :type quant_node_index: `int`
+    """
+
+    def __init__(self, model: ModelWrapper, quant_node, quant_node_index: int):
+        """Base class constructor"""
+        super().__init__()
+        self._model = model
+        self._q_node = quant_node
+        self._q_index = quant_node_index
+
+    @property
+    @classmethod
+    @abstractmethod
+    def valid_predecessor_op_types(self):
+        """Defines which op types the preceding node is allowed to have for
+        this type of activation.
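+        Concrete handlers define this as a plain class attribute,
+        e.g. ["Relu"] for the QuantReluHandler below.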
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def _check_compatibility(self):
+        """Check for compatibility with FINN.
+        There are many more possible combinations of QONNX settings
+        than are supported by FINN.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def _calculate_act_bias(self):
+        """Calculate the activation bias,
+        which is introduced as an Add node behind the MultiThreshold node.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def _calculate_thresholds(self):
+        """Calculate the threshold array for the MultiThreshold node."""
+        raise NotImplementedError()
+
+    @abstractmethod
+    def _calculate_act_scale(self):
+        """Calculate the activation scale,
+        which is introduced as a Mul node behind the Add node
+        for the activation bias.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def _remove_activation_node(self, multi_threshold_node):
+        """Remove the activation node in front of the Quant node."""
+        raise NotImplementedError()
+
+    def _extract_output_datatype(self):
+        """Get the output datatype for the MultiThreshold node."""
+        q_inst = getCustomOp(self._q_node)
+        dtype = q_inst.get_integer_datatype(self._model)
+        dtype = dtype.name
+        return dtype
+
+    def calculate_node_parameters(self):
+        """Calculate all parameters required for replacing the QONNX style activation
+        with a FINN style one.
+        """
+        return {
+            "out_dtype": self._extract_output_datatype(),
+            "thresholds": self._calculate_thresholds(),
+            "adder_bias": self._calculate_act_bias(),
+            "mul_scale": self._calculate_act_scale(),
+        }
+
+    def replace_quant_node(self):
+        """Replace the given QONNX style activation with a FINN style one."""
+
+        # Check that we actually support what the user is trying to do
+        self._check_compatibility()
+
+        # Shorten instance variables
+        model = self._model
+        graph = model.graph
+        n = self._q_node
+        running_node_index = self._q_index
+
+        # Calculate insertion parameters
+        parameter_dict = self.calculate_node_parameters()
+        thresholds = parameter_dict["thresholds"]
+        out_dtype = parameter_dict["out_dtype"]
+        adder_bias = parameter_dict["adder_bias"]
+        mul_scale = parameter_dict["mul_scale"]
+
+        # Modify graph
+        # Insert threshold tensor
+        thresh_tensor = helper.make_tensor_value_info(
+            model.make_new_valueinfo_name(),
+            TensorProto.FLOAT,
+            thresholds.shape,
+        )
+        graph.value_info.append(thresh_tensor)
+        model.set_initializer(thresh_tensor.name, thresholds)
+
+        # Insert MultiThreshold node
+        outp_trans_node = helper.make_node(
+            "MultiThreshold",
+            [n.input[0], thresh_tensor.name],
+            [n.output[0]],
+            out_dtype="FLOAT32",
+            domain="finn.custom_op.general",
+        )
+        graph.node.insert(running_node_index, outp_trans_node)
+        running_node_index += 1
+
+        # Get the MultiThreshold node instance to work with
+        mt_node = graph.node[running_node_index - 1]
+        mt_inst = getCustomOp(mt_node)
+
+        # Set scale and bias
+        # If these values are scalar, they can be set as attributes of the
+        # MultiThreshold node; if not, they get inserted as Add and Mul nodes
+        # behind the MultiThreshold node.
+        bias_scalar = adder_bias.shape == (1,) or len(adder_bias.shape) == 0
+        scale_scalar = mul_scale.shape == (1,) or len(mul_scale.shape) == 0
+        if scale_scalar and bias_scalar and self._q_node.op_type == "BipolarQuant":
+            # Get Quant parameters
+            mul_scale = np.atleast_1d(mul_scale)
+            # ONNX only accepts 64bit floats as attributes
+            mul_scale = mul_scale.astype(dtype=np.float64)
+            adder_bias = np.atleast_1d(adder_bias)
+            adder_bias = adder_bias.astype(dtype=np.float64)
+
+            # Set Bias and scale
+            mt_inst.set_nodeattr("out_scale", mul_scale[0])
+            # FINN applies the scale first and then the bias,
+            # which is the other way around in Brevitas;
+            # we thus need to adjust the bias in the MultiThreshold node.
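+            # i.e. (q + bias) * scale == q * scale + (bias * scale)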
+            finn_bias = adder_bias[0] * mul_scale[0]
+            mt_inst.set_nodeattr("out_bias", finn_bias)
+
+            # Set the output data type
+            mt_inst.set_nodeattr("out_dtype", out_dtype)
+        else:
+            # Set datatype
+            mt_inst.set_nodeattr("out_dtype", out_dtype)
+
+            # Insertion parameters
+            up_stream_node = mt_node
+
+            # Set bias
+            zero_bias = False
+            if bias_scalar:
+                adder_bias = np.atleast_1d(adder_bias)
+                # ONNX only accepts 64bit floats as attributes
+                adder_bias = adder_bias.astype(dtype=np.float64)[0]
+                add_shape = tuple()
+                if adder_bias == 0.0:
+                    zero_bias = True
+            else:
+                add_shape = adder_bias.shape
+
+            if not zero_bias:
+                # Insert Add node
+                add_tensor = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    add_shape,
+                )
+                graph.value_info.append(add_tensor)
+                model.set_initializer(add_tensor.name, adder_bias)
+
+                output_shape = model.get_tensor_shape(n.output[0])
+                act_add_tensor = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    output_shape,
+                )
+                graph.value_info.append(act_add_tensor)
+
+                add_node = helper.make_node(
+                    "Add",
+                    [act_add_tensor.name, add_tensor.name],
+                    [n.output[0]],
+                )
+                graph.node.insert(running_node_index, add_node)
+                running_node_index += 1
+                add_node = graph.node[running_node_index - 1]
+
+                # Re-point the upstream node
+                up_stream_node.output[0] = act_add_tensor.name
+                up_stream_node = add_node
+
+            # Set scale
+            # Insert Mul node
+            unity_scale = False
+            if scale_scalar:
+                mul_scale = np.atleast_1d(mul_scale)
+                mul_scale = mul_scale.astype(dtype=np.float64)[0]
+                mul_shape = tuple()
+                if mul_scale == 1.0:
+                    unity_scale = True
+            else:
+                mul_shape = mul_scale.shape
+
+            if not unity_scale:
+                mul_tensor = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    mul_shape,
+                )
+                graph.value_info.append(mul_tensor)
+                model.set_initializer(mul_tensor.name, mul_scale)
+
+                output_shape = model.get_tensor_shape(n.output[0])
+                act_mul_tensor = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    output_shape,
+                )
+                graph.value_info.append(act_mul_tensor)
+
+                mul_node = helper.make_node(
+                    "Mul",
+                    [act_mul_tensor.name, mul_tensor.name],
+                    [n.output[0]],
+                )
+                graph.node.insert(running_node_index, mul_node)
+                running_node_index += 1
+                mul_node = graph.node[running_node_index - 1]
+
+                # Re-point the upstream node
+                up_stream_node.output[0] = act_mul_tensor.name
+                up_stream_node = mul_node
+
+        # Remove activation node
+        self._remove_activation_node(mt_node)
+
+        # Remove the Quant node
+        graph.node.remove(n)
+
+        # return the internal model representation
+        return self._model
+
+
+class QuantReluHandler(QuantActBaseHandler):
+    """Class for converting a quantized relu operation expressed in the QONNX
+    dialect to the FINN ONNX dialect."""
+
+    valid_predecessor_op_types = [
+        "Relu",
+    ]
+
+    def _check_compatibility(self):
+        if self._q_node.op_type == "Quant":
+            q_inst = getCustomOp(self._q_node)
+            narrow = q_inst.get_nodeattr("narrow")
+            signed = q_inst.get_nodeattr("signed")
+            if signed or narrow:
+                raise ValueError(
+                    "FINN only supports unsigned and non-narrow Quant nodes "
+                    "for Relu activations."
+                )
+            if self._model.get_initializer(self._q_node.input[2]) != 0:
+                raise ValueError(
+                    "Only Quant nodes with zero-point == 0 "
+                    "are currently supported for ReLu activations."
+                )
+        elif self._q_node.op_type == "BipolarQuant":
+            return
+        else:
+            raise RuntimeError("Got an unexpected quantizer node type")
+
+    def _calculate_act_bias(self):
+        # No bias allowed for Relu activations, see: https://github.com/Xilinx/
+        # brevitas/blob/a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/
+        # export/onnx/finn/handler/act.py#L48
+        bias = np.array([0.0])
+        return bias
+
+    def _calculate_thresholds(self):
+        # Gather parameters
+        if self._q_node.op_type == "Quant":
+            bit_width = self._model.get_initializer(self._q_node.input[3])
+        elif self._q_node.op_type == "BipolarQuant":
+            bit_width = 1.0
+        else:
+            raise RuntimeError("Got an unexpected quantizer node type")
+        quant_scale = self._model.get_initializer(self._q_node.input[1]).astype(
+            np.float32
+        )
+        # q_inst = getCustomOp(self._q_node)
+        # narrow = q_inst.get_nodeattr("narrow")
+
+        # Calculate thresholds, see: https://github.com/Xilinx/brevitas/blob/
+        # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
+        # onnx/finn/handler/act.py#L21
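+        # Worked example: for bit_width=2 and scale=1.0 this yields the
+        # three thresholds [0.5, 1.5, 2.5], one per non-zero output level.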
+        num_distinct_values = 2 ** bit_width
+        num_thresholds = int(num_distinct_values - 1)
+        flat_scale = quant_scale.flatten().astype(np.float32)
+        num_scale_channels = flat_scale.shape[0]
+        step = np.abs(flat_scale).astype(np.float32)
+        min_threshold = step / 2
+        thresholds = np.empty((num_scale_channels, num_thresholds)).astype(np.float32)
+        for c in range(num_scale_channels):
+            for t in range(num_thresholds):
+                thresholds[c][t] = min_threshold[c] + step[c] * t
+
+        # ToDo: The index 1 needs to be changed to -1 for the channels last format
+        num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1]
+        final_shape = (num_output_channels, num_thresholds)
+        if thresholds.shape != final_shape:
+            thresholds = np.broadcast_to(thresholds, final_shape)
+
+        return thresholds
+
+    def _calculate_act_scale(self):
+        # Gather parameters
+        quant_scale = self._model.get_initializer(self._q_node.input[1])
+        # Calculate scale, see: https://github.com/Xilinx/brevitas/blob/
+        # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
+        # onnx/finn/handler/act.py#L40
+        scale = quant_scale
+        return scale
+
+    def _remove_activation_node(self, multi_threshold_node):
+        # Find the activation node
+        act_node = self._model.find_direct_predecessors(self._q_node)
+        if act_node is None:
+            raise RuntimeError(
+                "For handling of Relu activations a predecesor to "
+                "the Quant node must exist."
+            )
+        act_node = act_node[0]
+        if not act_node.op_type == "Relu":
+            raise RuntimeError(
+                "The predecesor of the Quant node must be Relu for handling "
+                "of Relu activations."
+            )
+
+        # Reroute upstream tensor
+        multi_threshold_node.input[0] = act_node.input[0]
+
+        # Remove the activation node
+        self._model.graph.node.remove(act_node)
+
+
+class QuantIdentityHandler(QuantActBaseHandler):
+    """Class for converting a quantized identity operation expressed in the QONNX
+    dialect to the FINN ONNX dialect.
+    This handler also takes care of quantized HardTanh activations, because
+    these are equivalent to quantized identity activations.
+    """
+
+    valid_predecessor_op_types = [
+        "BatchNormalization",
+        "Sub",
+        "Add",
+        "Mul",
+        "Div",
+        "DebugMarker",
+        None,
+    ]
+
+    def _check_compatibility(self):
+        # Gather parameters to check
+        if self._q_node.op_type == "Quant":
+            q_inst = getCustomOp(self._q_node)
+            signed = q_inst.get_nodeattr("signed")
+            if not signed:
+                raise ValueError(
+                    "FINN only supports signed Quant nodes for identity activations."
+                )
+            if self._model.get_initializer(self._q_node.input[2]) != 0:
+                raise ValueError(
+                    "Only Quant nodes with zero-point == 0 "
+                    "are currently supported for identity activations."
+                )
+        elif self._q_node.op_type == "BipolarQuant":
+            quant_scale = self._model.get_initializer(self._q_node.input[1])
+            if (quant_scale.flatten().shape[0] != 1) or quant_scale.flatten()[0] != 1.0:
+                raise ValueError(
+                    "FINN only supports Bipolar identity activations "
+                    "with out per channel scaling and the scaling must be 1. "
+                )
+        else:
+            raise RuntimeError("Got an unexpected quantizer node type")
+
+    def _calculate_act_bias(self):
+        # Gather parameters
+        q_inst = getCustomOp(self._q_node)
+        if self._q_node.op_type == "Quant":
+            bit_width = self._model.get_initializer(self._q_node.input[3])
+            narrow = q_inst.get_nodeattr("narrow")
+        elif self._q_node.op_type == "BipolarQuant":
+            bit_width = 1.0
+        else:
+            raise RuntimeError("Got an unexpected quantizer node type")
+        # Calculate bias, see: https://github.com/Xilinx/brevitas/blob/
+        # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
+        # onnx/finn/handler/act.py#L64
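+        # Worked example: bit_width=4 gives -(2**3 - 1) = -7 when narrow
+        # and -(2**3) = -8 otherwise.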
+        if bit_width == 1.0:
+            bias = np.array([-0.5])
+        else:
+            if narrow:
+                min_non_scaled_val = -(2 ** (bit_width - 1) - 1)
+            else:
+                min_non_scaled_val = -(2 ** (bit_width - 1))
+            bias = np.array([min_non_scaled_val])
+        return bias
+
+    def _calculate_thresholds(self):
+        # Gather parameters
+        quant_scale = self._model.get_initializer(self._q_node.input[1])
+        q_inst = getCustomOp(self._q_node)
+        if self._q_node.op_type == "Quant":
+            bit_width = self._model.get_initializer(self._q_node.input[3])
+            narrow = q_inst.get_nodeattr("narrow")
+        elif self._q_node.op_type == "BipolarQuant":
+            bit_width = 1.0
+        else:
+            raise RuntimeError("Got an unexpected quantizer node type")
+
+        # Calculate thresholds, see: https://github.com/Xilinx/brevitas/
+        # blob/a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/
+        # export/onnx/finn/handler/act.py#L76
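+        # Worked example: bit_width=2, scale=1.0 and narrow=False yield the
+        # three thresholds [-1.5, -0.5, 0.5], centered around zero.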
+        if bit_width == 1.0:
+            thresholds = np.empty([1, 1])
+            thresholds[0] = 0
+            return thresholds
+        else:
+            if narrow:
+                num_distinct_values = 2 ** bit_width - 1
+            else:
+                num_distinct_values = 2 ** bit_width
+
+            num_thresholds = int(num_distinct_values - 1)
+            flat_scale = quant_scale.flatten()
+            num_scale_channels = flat_scale.shape[0]
+            step = np.abs(flat_scale)
+            half_step = step / 2.0
+            thresholds = np.empty((num_scale_channels, num_thresholds))
+            # compute the value of the smallest threshold, we'll neg-bias all
+            # generated thresholds by this much
+            min_threshold = -half_step - step * ((num_thresholds // 2) - 1)
+            if not narrow:
+                min_threshold -= step
+            for c in range(num_scale_channels):
+                for t in range(num_thresholds):
+                    thresholds[c][t] = min_threshold[c] + step[c] * t
+
+            # ToDo: The index 1 needs to be changed to -1 for the channels last format
+            num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[
+                1
+            ]
+            final_shape = (num_output_channels, num_thresholds)
+            if thresholds.shape != final_shape:
+                thresholds = np.broadcast_to(thresholds, final_shape)
+
+            return thresholds
+
+    def _calculate_act_scale(self):
+        # Gather parameters
+        if self._q_node.op_type == "Quant":
+            bit_width = self._model.get_initializer(self._q_node.input[3])
+        elif self._q_node.op_type == "BipolarQuant":
+            bit_width = 1.0
+        else:
+            raise RuntimeError("Got an unexpected quantizer node type")
+        quant_scale = self._model.get_initializer(self._q_node.input[1])
+        # Calculate scale, see: https://github.com/Xilinx/brevitas/
+        # blob/a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/
+        # export/onnx/finn/handler/act.py#L111
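+        # In the bipolar case the thresholded output {0, 1} must be mapped to
+        # {-1, +1}: together with the bias of -0.5 computed above,
+        # (q - 0.5) * (2 * scale) with scale == 1 does exactly that.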
+        if bit_width != 1:
+            scale = quant_scale
+        else:
+            assert (
+                quant_scale.flatten().shape[0] == 1
+            ), "Unsupported BIPOLAR per channel scale"
+            assert quant_scale.flatten()[0] == 1.0, "Unsupported BIPOLAR scale != 1"
+            scale = quant_scale * 2
+        return scale
+
+    def _remove_activation_node(self, multi_threshold_node):
+        # The Quant identity activation by definition has no explicit activation node
+        return
diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
new file mode 100644
index 0000000000000000000000000000000000000000..29ba93dfcfe6d18e0ff8927b6d646cb310d0262a
--- /dev/null
+++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import warnings
+
+from finn.transformation.base import Transformation
+from finn.transformation.qonnx.qonnx_activation_handlers import QuantActBaseHandler
+
+
+def default_filter_function_generator(max_multithreshold_bit_width=8):
+    """
+    This function generates the default filter function for the
+    ConvertQuantActToMultiThreshold transformation. By default the returned
+    function disables the conversion of Quant nodes with a bit width above 8 bits.
+
+    This function generator can be used as a template to write custom
+    filter functions.
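+
+    A rough usage sketch (assuming `model` is a ModelWrapper), e.g. to only
+    convert Quant nodes of up to 4 bits:
+
+        model = model.transform(
+            ConvertQuantActToMultiThreshold(
+                filter_function=default_filter_function_generator(
+                    max_multithreshold_bit_width=4
+                )
+            )
+        )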
+    """
+
+    def filter_function(model, q_node):
+        if q_node.op_type == "Quant":
+            bit_width = model.get_initializer(q_node.input[3])
+        elif q_node.op_type == "BipolarQuant":
+            bit_width = 1.0
+        else:
+            raise RuntimeError("Got an unexpected quantizer node type")
+        if bit_width is None:
+            raise ValueError("Quant nodes must have a static bit width.")
+        if bit_width > max_multithreshold_bit_width:
+            warnings.warn(
+                f'The Quant node with name: "{q_node.name}" was not converted to a '
+                f"MultiThreshold node, because its bit width of {bit_width} is "
+                f"higher than the configured maximum bit width of "
+                f"{max_multithreshold_bit_width}."
+            )
+            return False
+        return True
+
+    return filter_function
+
+
+class ConvertQuantActToMultiThreshold(Transformation):
+    """
+    Converts Quant nodes in the activation path to MultiThreshold nodes.
+
+    The optional keyword argument `filter_function`
+    presents a way to control which Quant and BipolarQuant nodes in the activation path
+    are converted to MultiThreshold nodes. A warning will be emitted when a Quant node
+    is not converted to a MultiThreshold node.
+
+    :param filter_function: Each candidate Quant and BipolarQuant node is first
+    evaluated by this function. If the function returns False,
+    then the node is not converted to a MultiThreshold node.
+    The function is given the model and candidate node as parameters.
+    By default a filter function is used which disables the conversion of
+    Quant nodes with a bit width larger than 8.
+    Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
+    """
+
+    def __init__(
+        self,
+        filter_function=default_filter_function_generator(
+            max_multithreshold_bit_width=8
+        ),
+    ):
+        super().__init__()
+        self._filter_function = filter_function
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Quant" or n.op_type == "BipolarQuant":
+                # Check that the node is in the activation path, i.e. that
+                # neither its input nor its output is a constant initializer
+                inp = model.get_initializer(n.input[0])
+                out = model.get_initializer(n.output[0])
+                if not (inp is None and out is None):
+                    continue
+                predecessor = model.find_direct_predecessors(n)
+                if predecessor is not None:
+                    predecessor_op_type = predecessor[0].op_type
+                else:
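+                    # No predecessor means the Quant node consumes a top-level
+                    # graph input; QuantIdentityHandler lists None as a valid
+                    # predecessor type for exactly this case.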
+                    predecessor_op_type = predecessor
+                if model.is_fork_node(n):
+                    raise ValueError(
+                        "Forking Quant/BipolarQuant nodes are currently "
+                        "not supported by FINN."
+                    )
+                if n.op_type == "Quant" and not model.get_initializer(n.input[2]) == 0:
+                    raise ValueError(
+                        "Only Quant nodes with zero-point == 0 are currently supported."
+                    )
+
+                # Check that this node passes the user filter
+                if not self._filter_function(model, n):
+                    warnings.warn(
+                        f'The Quant node with name: "{n.name}" was not converted to a '
+                        f"MultiThreshold node, because the filtering function "
+                        f"returned False for this node."
+                    )
+                    continue
+
+                # Check for possible ambiguity in handler selection
+                valid_predecessors = []
+                for cls in QuantActBaseHandler.__subclasses__():
+                    valid_predecessors.extend(cls.valid_predecessor_op_types)
+                if len(valid_predecessors) != len(set(valid_predecessors)):
+                    raise RuntimeError(
+                        "Two or more activation handlers declare the same "
+                        "type of valid predecessor node. "
+                        "This leads to ambiguity in the handler selection "
+                        "and must thus be avoided."
+                    )
+
+                # Try to find a fitting handler for this Quant activation node
+                for handler_cls in QuantActBaseHandler.__subclasses__():
+                    if predecessor_op_type in handler_cls.valid_predecessor_op_types:
+                        handler = handler_cls(model, n, node_ind)
+                        break
+                else:
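+                    # This is the else branch of the for loop above, which
+                    # only runs when no handler matched the predecessor type.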
+                    raise ValueError(
+                        f"Quant nodes in the activation path and with predecessor "
+                        f"nodes of type {predecessor_op_type} are currently not "
+                        f"supported by FINN and can not be converted to "
+                        f"MultiThreshold nodes."
+                    )
+                model = handler.replace_quant_node()
+                graph_modified = True
+                return (model, graph_modified)
+
+        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/__init__.py b/src/finn/transformation/streamline/__init__.py
index 97cd957ce166255c1442a544f6e865b42c33a1df..d0ec26a4d10c688db7931e40d7cfd840394b55a1 100644
--- a/src/finn/transformation/streamline/__init__.py
+++ b/src/finn/transformation/streamline/__init__.py
@@ -31,42 +31,38 @@ from pkgutil import extend_path
 __path__ = extend_path(__path__, __name__)
 
 from finn.transformation.base import Transformation
-from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.batchnorm_to_affine import BatchNormToAffine
 from finn.transformation.general import (
-    ConvertSubToAdd,
     ConvertDivToMul,
+    ConvertSubToAdd,
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
 )
-
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.remove import RemoveIdentityOps
 from finn.transformation.streamline.absorb import (
+    Absorb1BitMulIntoConv,
+    Absorb1BitMulIntoMatMul,
     AbsorbAddIntoMultiThreshold,
     AbsorbMulIntoMultiThreshold,
-    FactorOutMulSignMagnitude,
-    Absorb1BitMulIntoMatMul,
-    Absorb1BitMulIntoConv,
     AbsorbSignBiasIntoMultiThreshold,
+    FactorOutMulSignMagnitude,
 )
-
 from finn.transformation.streamline.collapse_repeated import (
     CollapseRepeatedAdd,
     CollapseRepeatedMul,
 )
-
 from finn.transformation.streamline.reorder import (
-    MoveAddPastMul,
-    MoveScalarMulPastMatMul,
-    MoveScalarAddPastMatMul,
     MoveAddPastConv,
-    MoveScalarMulPastConv,
+    MoveAddPastMul,
     MoveMulPastMaxPool,
+    MoveScalarAddPastMatMul,
     MoveScalarLinearPastInvariants,
+    MoveScalarMulPastConv,
+    MoveScalarMulPastMatMul,
 )
-
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 from finn.transformation.streamline.sign_to_thres import ConvertSignToThres
-from finn.transformation.batchnorm_to_affine import BatchNormToAffine
-from finn.transformation.streamline.remove import RemoveIdentityOps
 
 
 class Streamline(Transformation):
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index 8237d8bf2f05ab93c61f4f6c58d0e049195e223d..97ae3b51a849a4174c9853cb41c0d6d72bdf8dad 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -27,16 +27,16 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
-from onnx import helper as oh
 import warnings
+from onnx import helper as oh
 
-from finn.core.datatype import DataType
 import finn.core.data_layout as DataLayout
-from finn.transformation.base import Transformation
-from finn.util.basic import get_by_name
+from finn.core.datatype import DataType
 from finn.custom_op.registry import getCustomOp
-from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.base import Transformation
 from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import get_by_name
 
 
 class AbsorbSignBiasIntoMultiThreshold(Transformation):
@@ -205,7 +205,7 @@ class FactorOutMulSignMagnitude(Transformation):
                 actual_ndims = len(tuple(filter(lambda x: x > 1, A.shape)))
                 is_1d = actual_ndims == 1
                 is_not_bipolar = (
-                    model.get_tensor_datatype(mul_weight_name) != DataType.BIPOLAR
+                    model.get_tensor_datatype(mul_weight_name) != DataType["BIPOLAR"]
                 )
                 is_signed = (A < 0).any()
                 if is_signed and (is_scalar or is_1d) and is_not_bipolar:
@@ -217,7 +217,7 @@ class FactorOutMulSignMagnitude(Transformation):
                     # create new mul node with sign(A) as the operand
                     sgn = np.sign(A)
                     model.set_initializer(sign_mul_param_name, sgn)
-                    model.set_tensor_datatype(sign_mul_param_name, DataType.BIPOLAR)
+                    model.set_tensor_datatype(sign_mul_param_name, DataType["BIPOLAR"])
                     # replace original mul weight by magnitudes
                     model.set_initializer(mul_weight_name, np.abs(A))
                     new_mul = oh.make_node(
@@ -308,56 +308,61 @@ class Absorb1BitMulIntoConv(Transformation):
 
 
 class AbsorbTransposeIntoMultiThreshold(Transformation):
-    """Change (NCHWTranspose -> MultiThreshold -> NHWCTranspose) to (MultiThreshold)
-    with NHWC mode. For (NCHWTranspose -> MultiThreshold) move Transpose past MT."""
+    """For (NCHWTranspose -> MultiThreshold) move Transpose past MultiThreshold
+    and set its data_layout mode to NHWC."""
 
     def apply(self, model):
         graph = model.graph
         node_ind = 0
         graph_modified = False
-        for n in graph.node:
+        nodes = [n for n in model.graph.node]
+        for n in nodes:
             node_ind += 1
             if n.op_type == "Transpose" and not model.is_fork_node(n):
                 perms = list(get_by_name(n.attribute, "perm").ints)
                 if perms == [0, 3, 1, 2]:
                     mt_cand = model.find_consumer(n.output[0])
-                    if mt_cand.op_type == "MultiThreshold" and not model.is_fork_node(
-                        mt_cand
+                    if (
+                        mt_cand is not None
+                        and mt_cand.op_type == "MultiThreshold"
+                        # and not model.is_fork_node(mt_cand)
                     ):
-                        final_t_cand = model.find_consumer(mt_cand.output[0])
-                        if final_t_cand.op_type == "Transpose":
-                            perms = list(
-                                get_by_name(final_t_cand.attribute, "perm").ints
-                            )
-                            if perms == [0, 2, 3, 1]:
-                                mt = getCustomOp(mt_cand)
-                                mt.set_nodeattr("data_layout", "NHWC")
-                                # get rid of tranpose nodes, wire MT directly
-                                mt_cand.input[0] = n.input[0]
-                                mt_cand.output[0] = final_t_cand.output[0]
-                                graph.node.remove(n)
-                                graph.node.remove(final_t_cand)
-                                graph_modified = True
-                        else:
-                            mt = getCustomOp(mt_cand)
-                            mt.set_nodeattr("data_layout", "NHWC")
-                            # get rid of first tranpose node
-                            mt_cand.input[0] = n.input[0]
-                            graph.node.remove(n)
-                            # fix output shape for MultiThreshold
-                            mt_ishape = model.get_tensor_shape(mt_cand.input[0])
-                            model.set_tensor_shape(mt_cand.output[0], mt_ishape)
-                            # re-insert Transpose behind MultiThreshold
-                            transpose_output = model.make_new_valueinfo_name()
-                            new_transpose = oh.make_node(
-                                "Transpose",
-                                [mt_cand.output[0]],
-                                [transpose_output],
-                                perm=[0, 3, 1, 2],
-                            )
-                            graph.node.insert(node_ind + 1, new_transpose)
-                            final_t_cand.input[0] = transpose_output
-                            graph_modified = True
+                        mt_cand_orig_output = mt_cand.output[0]
+                        mt = getCustomOp(mt_cand)
+                        mt.set_nodeattr("data_layout", "NHWC")
+                        # Rewire input of MultiThreshold node
+                        mt_cand.input[0] = n.input[0]
+                        # Make new intermediate tensor
+                        intermediate_tensor_name = model.make_new_valueinfo_name()
+                        intermediate_tensor_shape = model.get_tensor_shape(n.input[0])
+                        intermediate_tensor_finn_dtype = model.get_tensor_datatype(
+                            mt_cand.output[0]
+                        )
+                        # Create a new ValueInfoProto and set the shape
+                        model.set_tensor_shape(
+                            intermediate_tensor_name, intermediate_tensor_shape
+                        )
+                        # Set the tensor layout
+                        model.set_tensor_layout(
+                            intermediate_tensor_name, DataLayout.NHWC
+                        )
+                        # Set the tensor FINN datatype
+                        model.set_tensor_datatype(
+                            intermediate_tensor_name, intermediate_tensor_finn_dtype
+                        )
+                        # Rewire output of MT node
+                        mt_cand.output[0] = intermediate_tensor_name
+                        # Get rid of first transpose node
+                        graph.node.remove(n)
+                        # Create new Transpose node
+                        new_transpose = oh.make_node(
+                            "Transpose",
+                            [intermediate_tensor_name],
+                            [mt_cand_orig_output],
+                            perm=[0, 3, 1, 2],
+                        )
+                        graph.node.insert(node_ind + 1, new_transpose)
+                        graph_modified = True
         if graph_modified:
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
@@ -457,7 +462,7 @@ class AbsorbScalarMulAddIntoTopK(Transformation):
                         graph.node.remove(prod)
                         n.input[0] = prod_input
                         # to avoid error the dataype is set to float32
-                        model.set_tensor_datatype(n.input[0], DataType.FLOAT32)
+                        model.set_tensor_datatype(n.input[0], DataType["FLOAT32"])
                         graph_modified = True
         if graph_modified:
             model = model.transform(InferShapes())
@@ -531,11 +536,20 @@ class AbsorbConsecutiveTransposes(Transformation):
                             # TODO implement this to allow for forks as producers
                             consumers = model.find_direct_successors(next_node)
                             prod = model.find_producer(n.input[0])
-                            for cons in consumers:
-                                for cons_in in cons.input:
-                                    if cons_in == next_node.output[0]:
-                                        prod.output[0] = cons_in
-                                        break
+                            if prod is not None:
+                                for cons in consumers:
+                                    for cons_in in cons.input:
+                                        if cons_in == next_node.output[0]:
+                                            prod.output[0] = cons_in
+                                            break
+                            else:
+                                # n.input[0] is top-level graph input
+                                # wire consumers directly to that
+                                for cons in consumers:
+                                    for i, iname in enumerate(cons.input):
+                                        if iname == next_node.output[0]:
+                                            cons.input[i] = n.input[0]
+
                             # remove both transposes
                             graph.node.remove(n)
                             graph.node.remove(next_node)
@@ -544,3 +558,81 @@ class AbsorbConsecutiveTransposes(Transformation):
         if graph_modified:
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class AbsorbTransposeIntoResize(Transformation):
+    """For (NCHWTranspose -> Resize) move Transpose past Resize and
+    change the Resize node's attributes accordingly."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Transpose" and not model.is_fork_node(node):
+                perms = list(get_by_name(node.attribute, "perm").ints)
+                if perms == [0, 3, 1, 2]:
+                    mt_cand = model.find_consumer(node.output[0])
+                    if mt_cand is not None and mt_cand.op_type == "Resize":
+                        mode = get_by_name(mt_cand.attribute, "mode").s.decode("ascii")
+                        # skip if mode is not nearest
+                        if mode != "nearest":
+                            continue
+                        # if sizes specified, turn into scales
+                        if len(mt_cand.input) > 3:
+                            sizes = model.get_initializer(mt_cand.input[3])
+                        else:
+                            sizes = None
+                        if sizes is not None:
+                            ishape = model.get_tensor_shape(mt_cand.input[0])
+                            ns, cs, hs, ws = sizes / np.asarray(ishape)
+                            model.set_initializer(
+                                mt_cand.input[2], np.asarray([ns, cs, hs, ws])
+                            )
+                            mt_cand.input.remove(mt_cand.input[3])
+                        # scales already specified, transpose indices to NHWC
+                        scales = model.get_initializer(mt_cand.input[2])
+                        assert scales is not None
+                        ns, cs, hs, ws = scales
+                        model.set_initializer(
+                            mt_cand.input[2], np.asarray([ns, hs, ws, cs])
+                        )
+                        # get rid of first transpose node
+                        mt_cand.input[0] = node.input[0]
+                        graph.node.remove(node)
+                        is_last_node = mt_cand.output[0] in [
+                            x.name for x in model.graph.output
+                        ]
+
+                        new_tensor_name = model.make_new_valueinfo_name()
+                        if is_last_node:
+                            trans_input = new_tensor_name
+                            trans_output = mt_cand.output[0]
+                        else:
+                            trans_input = mt_cand.output[0]
+                            trans_output = new_tensor_name
+                        # fix tensor shapes for Resize and Transpose
+                        # n, c, h, w = model.get_tensor_shape(mt_cand.input[0])
+                        n, c, hx, wx = model.get_tensor_shape(mt_cand.output[0])
+                        model.set_tensor_shape(trans_input, (n, hx, wx, c))
+                        model.set_tensor_shape(trans_output, (n, c, hx, wx))
+                        # re-insert Transpose behind Resize
+                        new_transpose = oh.make_node(
+                            "Transpose",
+                            [trans_input],
+                            [trans_output],
+                            perm=[0, 3, 1, 2],
+                        )
+                        graph.node.insert(node_ind + 1, new_transpose)
+                        # rewire nodes
+                        final_t_cands = model.find_consumers(mt_cand.output[0])
+                        if final_t_cands is not None:
+                            # rewire next nodes' inputs
+                            for final_t_cand in final_t_cands:
+                                final_t_cand.input[0] = trans_output
+                        mt_cand.output[0] = trans_input
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py
index 19f1ec3e836f3ec38aa4b716f5d6a25ca0782197..92c48c84ffa1a161f623ef6b22caaeb92f4a8199 100644
--- a/src/finn/transformation/streamline/collapse_repeated.py
+++ b/src/finn/transformation/streamline/collapse_repeated.py
@@ -28,15 +28,15 @@
 
 from onnx import helper as oh
 
+from finn.core.datatype import DataType
 from finn.transformation.base import Transformation
 from finn.transformation.infer_shapes import InferShapes
-from finn.core.datatype import DataType
 
 
 class CollapseRepeatedOp(Transformation):
     """Collapse repeated consecutive operations with constant parameters into
     a single operation. make_collapsed_param_fxn must take two tensors and
-    return a tensor which gives the equivalent result using a single op. """
+    return a tensor which gives the equivalent result using a single op."""
 
     def __init__(self, op_name, make_collapsed_param_fxn):
         super().__init__()
@@ -85,8 +85,8 @@ class CollapseRepeatedOp(Transformation):
                     # replace parameter value
                     model.set_initializer(new_node_param_name, new_param)
                     # be conservative with param/output DataTypes
-                    model.set_tensor_datatype(new_node_param_name, DataType.FLOAT32)
-                    model.set_tensor_datatype(end_name, DataType.FLOAT32)
+                    model.set_tensor_datatype(new_node_param_name, DataType["FLOAT32"])
+                    model.set_tensor_datatype(end_name, DataType["FLOAT32"])
                     # remove old nodes
                     graph.node.remove(n)
                     graph.node.remove(consumer)
diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py
deleted file mode 100644
index 0abcf441f9636a52f9194df325874d293530f78a..0000000000000000000000000000000000000000
--- a/src/finn/transformation/streamline/remove.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-from finn.transformation.base import Transformation
-from finn.transformation.infer_shapes import InferShapes
-import numpy as np
-
-
-def _remove_node_and_rewire(model, node):
-    producer = model.find_producer(node.input[0])
-    if producer is not None:
-        # wire output tensor to
-        # output of producer node
-        producer.output[0] = node.output[0]
-    else:
-        # node is first in graph
-        consumer = model.find_consumer(node.output[0])
-        assert consumer is not None, "Whole graph is identity"
-        assert consumer.input[0] == node.output[0]
-        # rewire consumer's input directly to graph input
-        consumer.input[0] = node.input[0]
-    # remove node
-    model.graph.node.remove(node)
-
-
-class RemoveIdentityOps(Transformation):
-    """Remove identity ops like Add/Sub with zero or Mul/Div with one. A tolerance
-    value (defaults to 1e-05) can be specified during init for the comparison
-    to zero/one."""
-
-    def __init__(self, atol=1e-05):
-        super().__init__()
-        self.atol = atol
-
-    def apply(self, model):
-        graph = model.graph
-        node_ind = 0
-        graph_modified = False
-        for n in graph.node:
-            node_ind += 1
-            if (
-                n.op_type in ["Add", "Sub"]
-                and not model.is_fork_node(n)
-                and not model.is_join_node(n)
-            ):
-                A = model.get_initializer(n.input[1])
-                if (
-                    A is not None
-                    and np.isclose(A, np.zeros_like(A), atol=self.atol).all()
-                ):
-                    _remove_node_and_rewire(model, n)
-
-            elif (
-                n.op_type in ["Mul", "Div"]
-                and not model.is_fork_node(n)
-                and not model.is_join_node(n)
-            ):
-                A = model.get_initializer(n.input[1])
-                if (
-                    A is not None
-                    and np.isclose(A, np.ones_like(A), atol=self.atol).all()
-                ):
-                    _remove_node_and_rewire(model, n)
-        model = model.transform(InferShapes())
-        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 4049d7bc8b555dce6f3ab6f9475f6aebf41fbc2c..0cdd6651d982426b1d81d7313346dcd899294bf7 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -28,19 +28,19 @@
 
 import numpy as np
 import warnings
-from onnx import helper as oh
 from onnx import TensorProto
+from onnx import helper as oh
 
-from finn.transformation.base import Transformation
 import finn.core.data_layout as DataLayout
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.core.datatype import DataType
 from finn.core.onnx_exec import execute_node
-from finn.util.basic import get_by_name
 from finn.custom_op.registry import getCustomOp
+from finn.transformation.base import Transformation
 from finn.transformation.general import SortGraph
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import get_by_name
 
 
 class MoveAddPastMul(Transformation):
@@ -408,16 +408,16 @@ class MoveMulPastDWConv(Transformation):
                         # rewire mul input to be conv input
                         conv_node.input[0] = start_name
                         model.set_tensor_shape(start_name, conv_in_shape)
-                        model.set_tensor_datatype(start_name, DataType.FLOAT32)
+                        model.set_tensor_datatype(start_name, DataType["FLOAT32"])
                         # use old conv input tensor as conv output
                         conv_node.output[0] = conv_in_name
                         model.set_tensor_shape(conv_in_name, conv_out_shape)
-                        model.set_tensor_datatype(conv_in_name, DataType.FLOAT32)
+                        model.set_tensor_datatype(conv_in_name, DataType["FLOAT32"])
                         # use new conv output as new mul node input
                         mul_node.input[0] = conv_in_name
                         # use old conv output as new mul node output
                         mul_node.output[0] = conv_out_name
-                        model.set_tensor_datatype(conv_out_name, DataType.FLOAT32)
+                        model.set_tensor_datatype(conv_out_name, DataType["FLOAT32"])
                         # move mul node past conv node
                         graph.node.remove(mul_node)
                         graph.node.insert(node_ind, mul_node)
@@ -482,16 +482,16 @@ class MoveMulPastMaxPool(Transformation):
                         # rewire mul input to be maxpool input
                         maxpool_node.input[0] = start_name
                         model.set_tensor_shape(start_name, maxpool_in_shape)
-                        model.set_tensor_datatype(start_name, DataType.FLOAT32)
+                        model.set_tensor_datatype(start_name, DataType["FLOAT32"])
                         # use old maxpool input tensor as maxpool output
                         maxpool_node.output[0] = maxpool_in_name
                         model.set_tensor_shape(maxpool_in_name, maxpool_out_shape)
-                        model.set_tensor_datatype(maxpool_in_name, DataType.FLOAT32)
+                        model.set_tensor_datatype(maxpool_in_name, DataType["FLOAT32"])
                         # use new maxpool output as new mul node input
                         mul_node.input[0] = maxpool_in_name
                         # use old maxpool output as new mul node output
                         mul_node.output[0] = maxpool_out_name
-                        model.set_tensor_datatype(maxpool_out_name, DataType.FLOAT32)
+                        model.set_tensor_datatype(maxpool_out_name, DataType["FLOAT32"])
                         # move mul node past maxpool node
                         graph.node.remove(mul_node)
                         graph.node.insert(node_ind, mul_node)
@@ -594,11 +594,17 @@ class MoveScalarLinearPastInvariants(Transformation):
         nodes = [n for n in graph.node]
         for n in nodes:
             node_ind += 1
+            is_nearest_neighbor_resample = False
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                # Extract the resampling mode
+                mode = get_by_name(n.attribute, "mode").s.decode("ascii")
+                is_nearest_neighbor_resample = mode == "nearest"
             if (
                 n.op_type == "GlobalAveragePool"
                 or n.op_type == "Reshape"
                 or n.op_type == "Transpose"
                 or n.op_type == "Flatten"
+                or is_nearest_neighbor_resample
             ):
                 in0 = n.input[0]
                 if in0 is None:
@@ -617,6 +623,10 @@ class MoveScalarLinearPastInvariants(Transformation):
                     # if initializer is not scalar, skip
                     if np.prod(init0.shape) != 1:
                         continue
+                    # Flatten input if required
+                    if len(init0.shape) > 0:
+                        init0 = init0.flatten()[0]
+                        model.set_initializer(prod0.input[1], init0)
                     # move prod0 from input to output,
                     old_prod0_in = prod0.input[0]
                     old_prod0_out = prod0.output[0]
@@ -632,7 +642,7 @@ class MoveScalarLinearPastInvariants(Transformation):
                     model.set_tensor_shape(n.output[0], out_shape)
                     model.set_tensor_shape(prod0.output[0], out_shape)
                     model.set_tensor_datatype(prod0.output[0], scalar_op_odt)
-                    model.set_tensor_datatype(n.output[0], DataType.FLOAT32)
+                    model.set_tensor_datatype(n.output[0], DataType["FLOAT32"])
                     graph.node.remove(prod0)
                     graph.node.insert(node_ind - 1, prod0)
                     graph_modified = True
diff --git a/src/finn/transformation/streamline/sign_to_thres.py b/src/finn/transformation/streamline/sign_to_thres.py
index 13f2e8524af7ce2d3457d0637f1c6d02733f504b..61d7eb35430262b1ee90dfa478076fb6f7556612 100644
--- a/src/finn/transformation/streamline/sign_to_thres.py
+++ b/src/finn/transformation/streamline/sign_to_thres.py
@@ -69,6 +69,6 @@ class ConvertSignToThres(Transformation):
                 graph.node.insert(node_ind, mt_node)
                 graph.node.remove(n)
                 # add quantization annotations
-                model.set_tensor_datatype(sign_out_name, DataType.BIPOLAR)
+                model.set_tensor_datatype(sign_out_name, DataType["BIPOLAR"])
                 graph_modified = True
         return (model, graph_modified)
diff --git a/src/finn/util/create.py b/src/finn/util/create.py
index d9c5d7b1b59916edfc8730992535f3ddb57c4d60..62229a69b68c26dd191b3e1d4a44f1bb8b19ed07 100644
--- a/src/finn/util/create.py
+++ b/src/finn/util/create.py
@@ -49,10 +49,10 @@ def hls_random_mlp_maker(layer_spec):
             # no activation, produce accumulators
             T = None
             tdt = None
-            if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
-                odt = DataType.UINT32
+            if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+                odt = DataType["UINT32"]
             else:
-                odt = DataType.INT32
+                odt = DataType["INT32"]
         else:
             odt = act
             (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
@@ -61,13 +61,13 @@ def hls_random_mlp_maker(layer_spec):
             # provide non-decreasing thresholds
             T = np.sort(T, axis=1)
             # generate thresholds for activation
-            if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
-                tdt = DataType.UINT32
+            if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+                tdt = DataType["UINT32"]
                 # bias thresholds to be positive
                 T = np.ceil((T + mw) / 2)
                 assert (T >= 0).all()
             else:
-                tdt = DataType.INT32
+                tdt = DataType["INT32"]
         lyr["T"] = T
         lyr["tdt"] = tdt
         lyr["odt"] = odt
@@ -120,11 +120,11 @@ def hls_mlp_maker(layer_spec):
         # StreamingFC:
         # - specify their datatypes as such
         # - specify their datatypes as BINARY and use binaryXnorMode
-        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
             # we'll internally convert weights/inputs to binary and specify the
             # datatypes as such, and also set the binaryXnorMode attribute to 1
-            export_wdt = DataType.BINARY
-            export_idt = DataType.BINARY
+            export_wdt = DataType["BINARY"]
+            export_idt = DataType["BINARY"]
             binary_xnor_mode = 1
         else:
             export_wdt = wdt
@@ -134,7 +134,7 @@ def hls_mlp_maker(layer_spec):
         if T is not None:
             no_act = 0
             node_inp_list = [current_in_name, current_W_name, current_T_name]
-            if odt == DataType.BIPOLAR:
+            if odt == DataType["BIPOLAR"]:
                 actval = 0
             else:
                 actval = odt.min()
diff --git a/src/finn/util/imagenet.py b/src/finn/util/imagenet.py
index 71ed9d9d260e2b38b5d9ec47f728ad401e526ca8..abd412e8d963cbcc80370298fb833de86a218c41 100644
--- a/src/finn/util/imagenet.py
+++ b/src/finn/util/imagenet.py
@@ -26,11 +26,12 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
 import numpy as np
+import os
 from PIL import Image
+
 from finn.core.data_layout import NCHW, NHWC
-from finn.util.test import resize_smaller_side, crop_center
+from finn.util.test import crop_center, resize_smaller_side
 
 
 def get_val_images(n_images=100, interleave_classes=False):
diff --git a/src/finn/util/pytorch.py b/src/finn/util/pytorch.py
index f174c24601578cf827cb0da770f29889344e62b8..18010083f7beb8c71c3a6ae5abae075d51e57cf9 100644
--- a/src/finn/util/pytorch.py
+++ b/src/finn/util/pytorch.py
@@ -26,7 +26,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 import torch
-
 from torch.nn import Module, Sequential
 
 
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index 0a34751786170a03361d6a17a24c7250c5ce49fd..9c5462ae7f3ca3122fe672f8f01e939e398963a8 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -26,22 +26,25 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import onnx
-import onnx.numpy_helper as nph
 import pkg_resources as pk
-from pkgutil import get_data
-from brevitas_examples import bnn_pynq, imagenet_classification
-import numpy as np
+
 import pytest
+
+import numpy as np
+import onnx
+import onnx.numpy_helper as nph
+import os
+import torchvision.transforms.functional as torchvision_util
 import warnings
+from brevitas_examples import bnn_pynq, imagenet_classification
+from pkgutil import get_data
+
 from finn.core.modelwrapper import ModelWrapper
-import os
-from finn.util.basic import pynq_part_map, alveo_part_map, alveo_default_platform
+from finn.core.onnx_exec import execute_onnx
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
 from finn.transformation.fpgadataflow.vitis_build import VitisBuild, VitisOptStrategy
-from finn.custom_op.registry import getCustomOp
-from finn.core.onnx_exec import execute_onnx
-import torchvision.transforms.functional as torchvision_util
+from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map
 
 # map of (wbits,abits) -> model
 example_map = {
diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py
index a4400f7bd7e75549189f081ce255fd67c49b3746..6a5a68f09930783f5a4e094ea88d6eeb9e07b99a 100644
--- a/src/finn/util/vcd.py
+++ b/src/finn/util/vcd.py
@@ -26,9 +26,10 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import multiprocessing as mp
 from vcdvcd import VCDVCD
+
 from finn.util.basic import get_num_default_workers
-import multiprocessing as mp
 
 # string patterns to search for to find particular interfaces
 # streaming interfaces
@@ -162,7 +163,9 @@ def _get_stats(x):
     return (x[0], get_stream_if_stats(x[1], x[0]))
 
 
-def get_all_stream_if_stats(vcd_file, stream_ifs=None, sort_by="{'V': 1, 'R': 0}", num_workers=None):
+def get_all_stream_if_stats(
+    vcd_file, stream_ifs=None, sort_by="{'V': 1, 'R': 0}", num_workers=None
+):
     """Return a list of streaming interface stats, sorted by the percentage
     for the given sort_by key. If stream_ifs is None, all streaming interface
     stats will be returned, otherwise treated as a list of interface names to
diff --git a/src/finn/util/visualization.py b/src/finn/util/visualization.py
index d8547a32e06aa3b688601aa550abb2c50bcf77d6..397bebb64c21e9c5a1cb09d2a01fe3e10502b558 100644
--- a/src/finn/util/visualization.py
+++ b/src/finn/util/visualization.py
@@ -27,8 +27,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import inspect
-import os
 import netron
+import os
 from IPython.display import IFrame
 
 
@@ -36,7 +36,27 @@ def showSrc(what):
     print("".join(inspect.getsourcelines(what)[0]))
 
 
-def showInNetron(model_filename):
-    netron.start(model_filename, address=("0.0.0.0", 8081))
-    localhost_url = os.getenv("LOCALHOST_URL", default="localhost")
-    return IFrame(src="http://%s:8081/" % localhost_url, width="100%", height=400)
+def showInNetron(model_filename: str, localhost_url: str = None, port: int = None):
+    """Shows a ONNX model file in the Jupyter Notebook using Netron.
+
+    :param model_filename: The path to the ONNX model file.
+    :type model_filename: str
+
+    :param localhost_url: The IP address used by the Jupyter IFrame to show the model.
+     Defaults to localhost.
+    :type localhost_url: str, optional
+
+    :param port: The port number used by Netron and the Jupyter IFrame to show
+     the ONNX model. Defaults to 8081.
+    :type port: int, optional
+
+    :return: The IFrame displaying the ONNX model.
+    :rtype: IPython.lib.display.IFrame
+    """
+    try:
+        port = port or int(os.getenv("NETRON_PORT", default="8081"))
+    except ValueError:
+        port = 8081
+    localhost_url = localhost_url or os.getenv("LOCALHOST_URL", default="localhost")
+    netron.start(model_filename, address=("0.0.0.0", port), browse=False)
+    return IFrame(src=f"http://{localhost_url}:{port}/", width="100%", height=400)
diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py
index 4b88b0f787fb7780079ee82d6802d5cbff410748..1b38914a83e7c5d68bb004df7545b518d6a93ddd 100644
--- a/tests/brevitas/test_brevitas_avg_pool_export.py
+++ b/tests/brevitas/test_brevitas_avg_pool_export.py
@@ -25,26 +25,29 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import os
-
-import torch
-import numpy as np
 import pytest
-import finn.core.onnx_exec as oxe
-from finn.core.modelwrapper import ModelWrapper
-from finn.core.datatype import DataType
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.util.basic import gen_finn_dt_tensor
 
+import numpy as np
+import os
+import torch
 from brevitas.export import FINNManager
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
 from brevitas.nn import QuantAvgPool2d
 from brevitas.quant_tensor import QuantTensor
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
 
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
+from finn.util.basic import gen_finn_dt_tensor
 
-export_onnx_path = "test_brevitas_avg_pool_export.onnx"
+base_export_onnx_path = "test_brevitas_avg_pool_export.onnx"
 
 
+@pytest.mark.parametrize("QONNX_export", [False, True])
 @pytest.mark.parametrize("kernel_size", [2, 3])
 @pytest.mark.parametrize("stride", [1, 2])
 @pytest.mark.parametrize("signed", [True, False])
@@ -53,11 +56,23 @@ export_onnx_path = "test_brevitas_avg_pool_export.onnx"
 @pytest.mark.parametrize("channels", [2, 4])
 @pytest.mark.parametrize("idim", [7, 8])
 def test_brevitas_avg_pool_export(
-    kernel_size, stride, signed, bit_width, input_bit_width, channels, idim
+    kernel_size,
+    stride,
+    signed,
+    bit_width,
+    input_bit_width,
+    channels,
+    idim,
+    QONNX_export,
 ):
-
+    export_onnx_path = base_export_onnx_path.replace(
+        ".onnx", f"_QONNX-{QONNX_export}.onnx"
+    )
     quant_avgpool = QuantAvgPool2d(
-        kernel_size=kernel_size, stride=stride, bit_width=bit_width
+        kernel_size=kernel_size,
+        stride=stride,
+        bit_width=bit_width,
+        return_quant_tensor=False,
     )
     quant_avgpool.eval()
 
@@ -70,31 +85,57 @@ def test_brevitas_avg_pool_export(
     # Brevitas QuantAvgPool layers need QuantTensors to export correctly
     # which requires setting up a QuantTensor instance with the scale
     # factor, zero point, bitwidth and signedness
-    scale_array = np.random.uniform(low=0, high=1, size=(1, channels, 1, 1)).astype(
-        np.float32
-    )
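+    # use a fixed per-channel scale of 0.5 instead of random scales so that
+    # the test input is deterministic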
+    scale_array = np.ones((1, channels, 1, 1)).astype(np.float32)
+    scale_array *= 0.5
     input_tensor = torch.from_numpy(input_array * scale_array).float()
     scale_tensor = torch.from_numpy(scale_array).float()
     zp = torch.tensor(0.0)
     input_quant_tensor = QuantTensor(
-        input_tensor, scale_tensor, zp, input_bit_width, signed
+        input_tensor, scale_tensor, zp, input_bit_width, signed, training=False
     )
 
     # export
-    FINNManager.export(
-        quant_avgpool, export_path=export_onnx_path, input_t=input_quant_tensor
-    )
+    if QONNX_export:
+        BrevitasONNXManager.export(
+            quant_avgpool,
+            export_path=export_onnx_path,
+            input_t=input_quant_tensor,
+        )
+        model = ModelWrapper(export_onnx_path)
+
+        # Statically set the additional inputs generated by the BrevitasONNXManager
+        model.graph.input.remove(model.graph.input[3])
+        model.graph.input.remove(model.graph.input[2])
+        model.graph.input.remove(model.graph.input[1])
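+        # inputs "1", "2" and "3" are the scale, zero-point and bit-width of
+        # the Quant node; pin them as initializers instead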
+        model.set_initializer("1", scale_array)
+        model.set_initializer("2", np.array(0.0).astype(np.float32))
+        model.set_initializer("3", np.array(input_bit_width).astype(np.float32))
+        model.save(export_onnx_path)
+
+        qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)
+        model = ModelWrapper(export_onnx_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(export_onnx_path)
+    else:
+        FINNManager.export(
+            quant_avgpool, export_path=export_onnx_path, input_t=input_quant_tensor
+        )
     model = ModelWrapper(export_onnx_path)
     model = model.transform(InferShapes())
     model = model.transform(InferDataTypes())
 
     # reference brevitas output
-    ref_output_array = quant_avgpool(input_quant_tensor).tensor.detach().numpy()
+    ref_output_array = quant_avgpool(input_quant_tensor).detach().numpy()
     # finn output
-    idict = {model.graph.input[0].name: input_array}
+    if QONNX_export:
+        # Manually apply the Quant tensor scaling for QONNX
+        idict = {model.graph.input[0].name: input_array * scale_array}
+    else:
+        idict = {model.graph.input[0].name: input_array}
     odict = oxe.execute_onnx(model, idict, True)
     finn_output = odict[model.graph.output[0].name]
     # compare outputs
     assert np.isclose(ref_output_array, finn_output).all()
     # cleanup
     os.remove(export_onnx_path)
diff --git a/tests/brevitas/test_brevitas_cnv.py b/tests/brevitas/test_brevitas_cnv.py
index 4b072535bdfe102a6c59ebd4c730de9ae827c00e..78ca361366902b37f826b575904126c783adbece 100644
--- a/tests/brevitas/test_brevitas_cnv.py
+++ b/tests/brevitas/test_brevitas_cnv.py
@@ -26,19 +26,23 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
 import pkg_resources as pk
+
 import pytest
 
 import brevitas.onnx as bo
 import numpy as np
+import os
 import torch
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.test import get_test_model_trained
 
 export_onnx_path = "test_brevitas_cnv.onnx"
@@ -46,11 +50,20 @@ export_onnx_path = "test_brevitas_cnv.onnx"
 
 @pytest.mark.parametrize("abits", [1, 2])
 @pytest.mark.parametrize("wbits", [1, 2])
-def test_brevitas_cnv_export_exec(wbits, abits):
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_brevitas_cnv_export_exec(wbits, abits, QONNX_export):
     if wbits > abits:
         pytest.skip("No wbits > abits cases at the moment")
     cnv = get_test_model_trained("CNV", wbits, abits)
-    bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path)
+    ishape = (1, 3, 32, 32)
+    if QONNX_export:
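+        # QONNX flow: export with the generic Brevitas manager, clean up the
+        # model, then convert it to the FINN dialect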
+        BrevitasONNXManager.export(cnv, ishape, export_onnx_path)
+        qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)
+        model = ModelWrapper(export_onnx_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(export_onnx_path)
+    else:
+        bo.export_finn_onnx(cnv, ishape, export_onnx_path)
     model = ModelWrapper(export_onnx_path)
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(InferShapes())
diff --git a/tests/brevitas/test_brevitas_debug.py b/tests/brevitas/test_brevitas_debug.py
index 9115352796b0b90257d64ce9b14163ad372c9c98..e42b93babefd9ca6a7a86def18a5cbb21d795c4c 100644
--- a/tests/brevitas/test_brevitas_debug.py
+++ b/tests/brevitas/test_brevitas_debug.py
@@ -26,39 +26,71 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from pkgutil import get_data
+import pytest
 
-import os
 import brevitas.onnx as bo
 import numpy as np
 import onnx
 import onnx.numpy_helper as nph
+import os
 import torch
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from pkgutil import get_data
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.general import RemoveStaticGraphInputs
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.test import get_test_model_trained
 
 
-def test_brevitas_debug():
+@pytest.mark.parametrize("QONNX_export", [False, True])
+@pytest.mark.parametrize("QONNX_FINN_conversion", [False, True])
+def test_brevitas_debug(QONNX_export, QONNX_FINN_conversion):
+    if (not QONNX_export) and QONNX_FINN_conversion:
+        pytest.skip("This test configuration is not valid and is thus skipped.")
     finn_onnx = "test_brevitas_debug.onnx"
     fc = get_test_model_trained("TFC", 2, 2)
-    dbg_hook = bo.enable_debug(fc)
-    bo.export_finn_onnx(fc, (1, 1, 28, 28), finn_onnx)
+    ishape = (1, 1, 28, 28)
+    if QONNX_export:
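+        # proxy-level debug hooks return QuantTensors, hence the .value
+        # access when comparing tensors further down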
+        dbg_hook = bo.enable_debug(fc, proxy_level=True)
+        BrevitasONNXManager.export(fc, ishape, finn_onnx)
+        # DebugMarkers have the brevitas.onnx domain, so that needs adjusting
+        model = ModelWrapper(finn_onnx)
+        dbg_nodes = model.get_nodes_by_op_type("DebugMarker")
+        for dbg_node in dbg_nodes:
+            dbg_node.domain = "finn.custom_op.general"
+        model.save(finn_onnx)
+        qonnx_cleanup(finn_onnx, out_file=finn_onnx)
+        if QONNX_FINN_conversion:
+            model = ModelWrapper(finn_onnx)
+            model = model.transform(ConvertQONNXtoFINN())
+            model.save(finn_onnx)
+    else:
+        dbg_hook = bo.enable_debug(fc)
+        bo.export_finn_onnx(fc, ishape, finn_onnx)
+        model = ModelWrapper(finn_onnx)
+        # DebugMarkers have the brevitas.onnx domain, so that needs adjusting
+        # ToDo: We should probably have a transformation pass that does this
+        #  domain conversion for us?
+        dbg_nodes = model.get_nodes_by_op_type("DebugMarker")
+        for dbg_node in dbg_nodes:
+            dbg_node.domain = "finn.custom_op.general"
+        model = model.transform(InferShapes())
+        model = model.transform(FoldConstants())
+        model = model.transform(RemoveStaticGraphInputs())
+        model.save(finn_onnx)
     model = ModelWrapper(finn_onnx)
-    model = model.transform(InferShapes())
-    model = model.transform(FoldConstants())
-    model = model.transform(RemoveStaticGraphInputs())
     assert len(model.graph.input) == 1
     assert len(model.graph.output) == 1
     # load one of the test vectors
     raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
     # run using FINN-based execution
-    input_dict = {"0": nph.to_array(input_tensor)}
+    input_dict = {model.graph.input[0].name: nph.to_array(input_tensor)}
     output_dict = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
     produced = output_dict[model.graph.output[0].name]
     # run using PyTorch/Brevitas
@@ -71,9 +103,19 @@ def test_brevitas_debug():
     names_brevitas = set(dbg_hook.values.keys())
     names_finn = set(output_dict.keys())
     names_common = names_brevitas.intersection(names_finn)
-    assert len(names_common) == 16
+    # The different exports return debug markers in different numbers and places
+    if QONNX_export and not QONNX_FINN_conversion:
+        assert len(names_common) == 12
+    elif QONNX_export and QONNX_FINN_conversion:
+        assert len(names_common) == 8
+    else:
+        assert len(names_common) == 16
     for dbg_name in names_common:
-        tensor_pytorch = dbg_hook.values[dbg_name].detach().numpy()
+        if QONNX_export:
+            tensor_pytorch = dbg_hook.values[dbg_name].value.detach().numpy()
+        else:
+            tensor_pytorch = dbg_hook.values[dbg_name].detach().numpy()
         tensor_finn = output_dict[dbg_name]
         assert np.isclose(tensor_finn, tensor_pytorch, atol=1e-5).all()
     os.remove(finn_onnx)
diff --git a/tests/brevitas/test_brevitas_fc.py b/tests/brevitas/test_brevitas_fc.py
index 24a453007515ba2eba4369a6b76829099f722168..8e1e3de8d06b24ce946fb0a6726d875d0e75736e 100644
--- a/tests/brevitas/test_brevitas_fc.py
+++ b/tests/brevitas/test_brevitas_fc.py
@@ -26,8 +26,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from pkgutil import get_data
-
 import pytest
 
 import brevitas.onnx as bo
@@ -35,32 +33,47 @@ import numpy as np
 import onnx
 import onnx.numpy_helper as nph
 import torch
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from pkgutil import get_data
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.general import RemoveStaticGraphInputs
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.basic import make_build_dir
 from finn.util.test import get_test_model_trained
 
 export_onnx_path = make_build_dir("test_brevitas_fc_")
 
+
 # act bits
 @pytest.mark.parametrize("abits", [1, 2])
 # weight bits
 @pytest.mark.parametrize("wbits", [1, 2])
 # network topology / size
 @pytest.mark.parametrize("size", ["TFC", "SFC", "LFC"])
-def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits):
+# QONNX export
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits, QONNX_export):
     if size == "LFC" and wbits == 2 and abits == 2:
         pytest.skip("No LFC-w2a2 present at the moment")
     if wbits > abits:
         pytest.skip("No wbits > abits cases at the moment")
-    nname = "%s_%dW%dA" % (size, wbits, abits)
+    nname = "%s_%dW%dA_QONNX-%d" % (size, wbits, abits, QONNX_export)
     finn_onnx = export_onnx_path + "/%s.onnx" % nname
     fc = get_test_model_trained(size, wbits, abits)
-    bo.export_finn_onnx(fc, (1, 1, 28, 28), finn_onnx)
+    ishape = (1, 1, 28, 28)
+    if QONNX_export:
+        BrevitasONNXManager.export(fc, ishape, finn_onnx)
+        qonnx_cleanup(finn_onnx, out_file=finn_onnx)
+        model = ModelWrapper(finn_onnx)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(finn_onnx)
+    else:
+        bo.export_finn_onnx(fc, ishape, finn_onnx)
     model = ModelWrapper(finn_onnx)
     model = model.transform(InferShapes())
     model = model.transform(FoldConstants())
@@ -71,7 +84,7 @@ def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits):
     raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
     # run using FINN-based execution
-    input_dict = {"0": nph.to_array(input_tensor)}
+    input_dict = {model.graph.input[0].name: nph.to_array(input_tensor)}
     output_dict = oxe.execute_onnx(model, input_dict)
     produced = output_dict[list(output_dict.keys())[0]]
     # run using PyTorch/Brevitas
diff --git a/tests/brevitas/test_brevitas_mobilenet.py b/tests/brevitas/test_brevitas_mobilenet.py
index 98a18403e79366aca184559a8b868f30aa27c35e..108c97c2e83b7f3ca9dd6ead746b3ef8b4d10af5 100644
--- a/tests/brevitas/test_brevitas_mobilenet.py
+++ b/tests/brevitas/test_brevitas_mobilenet.py
@@ -26,29 +26,31 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from PIL import Image
-import numpy as np
-import brevitas.onnx as bo
 import pytest
+
+import brevitas.onnx as bo
+import numpy as np
 import torch
-from finn.util.basic import make_build_dir
-from finn.util.pytorch import NormalizePreProc
-from finn.util.test import get_test_model_trained, resize_smaller_side, crop_center
-from finn.core.modelwrapper import ModelWrapper
+from PIL import Image
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.streamline.absorb as absorb
 from finn.core.datatype import DataType
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.general import (
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
     GiveUniqueParameterTensors,
 )
-from finn.transformation.merge_onnx_models import MergeONNXModels
-import finn.transformation.streamline.absorb as absorb
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.insert_topk import InsertTopK
-import finn.core.onnx_exec as oxe
+from finn.transformation.merge_onnx_models import MergeONNXModels
+from finn.util.basic import make_build_dir
+from finn.util.pytorch import NormalizePreProc
+from finn.util.test import crop_center, get_test_model_trained, resize_smaller_side
 
 
 @pytest.mark.xfail
@@ -76,7 +78,9 @@ def test_brevitas_mobilenet():
     bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx)
     preproc_model = ModelWrapper(preproc_onnx)
     # set input finn datatype to UINT8
-    preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType.UINT8)
+    preproc_model.set_tensor_datatype(
+        preproc_model.graph.input[0].name, DataType["UINT8"]
+    )
     preproc_model = preproc_model.transform(InferShapes())
     preproc_model = preproc_model.transform(GiveUniqueNodeNames())
     preproc_model = preproc_model.transform(GiveUniqueParameterTensors())
diff --git a/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py b/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py
similarity index 82%
rename from tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py
rename to tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py
index 37ea12ac0f6387b1ab6669bce4aaaaaa60e87b58..b530b4bd84c548319549a8b16e0c3a79584e075d 100644
--- a/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py
+++ b/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py
@@ -26,19 +26,24 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import onnx  # noqa
+import pytest
+
+import brevitas.onnx as bo
 import numpy as np
+import onnx  # noqa
+import os
 import torch
-import brevitas.onnx as bo
-from brevitas.nn import QuantHardTanh
+from brevitas.core.quant import QuantType
 from brevitas.core.restrict_val import RestrictValueType
 from brevitas.core.scaling import ScalingImplType
-import pytest
-from finn.core.modelwrapper import ModelWrapper
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from brevitas.nn import QuantHardTanh
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
+
 import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
-from brevitas.core.quant import QuantType
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 
 export_onnx_path = "test_brevitas_non_scaled_QuantHardTanh_export.onnx"
 
@@ -46,7 +51,10 @@ export_onnx_path = "test_brevitas_non_scaled_QuantHardTanh_export.onnx"
 @pytest.mark.parametrize("abits", [1, 2, 4, 8])
 @pytest.mark.parametrize("narrow_range", [False, True])
 @pytest.mark.parametrize("max_val", [1.0, 1 - 2 ** (-7)])
-def test_brevitas_act_export_qhardtanh_nonscaled(abits, narrow_range, max_val):
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_brevitas_act_export_qhardtanh_nonscaled(
+    abits, narrow_range, max_val, QONNX_export
+):
     def get_quant_type(bit_width):
         if bit_width is None:
             return QuantType.FP
@@ -67,7 +75,15 @@ def test_brevitas_act_export_qhardtanh_nonscaled(abits, narrow_range, max_val):
         scaling_impl_type=ScalingImplType.CONST,
         narrow_range=narrow_range,
     )
-    bo.export_finn_onnx(b_act, ishape, export_onnx_path)
+    if QONNX_export:
+        m_path = export_onnx_path
+        BrevitasONNXManager.export(b_act, ishape, m_path)
+        qonnx_cleanup(m_path, out_file=m_path)
+        model = ModelWrapper(m_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(m_path)
+    else:
+        bo.export_finn_onnx(b_act, ishape, export_onnx_path)
     model = ModelWrapper(export_onnx_path)
     model = model.transform(InferShapes())
     inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(
diff --git a/tests/brevitas/test_brevitas_QConv2d.py b/tests/brevitas/test_brevitas_qconv2d.py
similarity index 83%
rename from tests/brevitas/test_brevitas_QConv2d.py
rename to tests/brevitas/test_brevitas_qconv2d.py
index 5f124690d7c1f266f074351abe690abdd3ae5a2c..beaea4e51ecdd4cff9f0d4d0c16735cdecad207c 100644
--- a/tests/brevitas/test_brevitas_QConv2d.py
+++ b/tests/brevitas/test_brevitas_qconv2d.py
@@ -27,20 +27,24 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import os
+
+import brevitas.onnx as bo
 import numpy as np
+import os
 import torch
-import brevitas.onnx as bo
-from brevitas.nn import QuantConv2d
-from brevitas.core.restrict_val import RestrictValueType
 from brevitas.core.quant import QuantType
+from brevitas.core.restrict_val import RestrictValueType
 from brevitas.core.scaling import ScalingImplType
 from brevitas.core.stats import StatsOp
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from brevitas.nn import QuantConv2d
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
 
-from finn.core.modelwrapper import ModelWrapper
-from finn.core.datatype import DataType
 import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.basic import gen_finn_dt_tensor
 
 export_onnx_path = "test_brevitas_conv.onnx"
@@ -49,7 +53,8 @@ export_onnx_path = "test_brevitas_conv.onnx"
 @pytest.mark.parametrize("dw", [False, True])
 @pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("in_channels", [32])
-def test_brevitas_QConv2d(dw, bias, in_channels):
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_brevitas_QConv2d(dw, bias, in_channels, QONNX_export):
     ishape = (1, 32, 111, 111)
     if dw is True:
         groups = in_channels
@@ -85,10 +90,18 @@ def test_brevitas_QConv2d(dw, bias, in_channels):
         weight_narrow_range=True,
         weight_scaling_min_val=2e-16,
     )
-    weight_tensor = gen_finn_dt_tensor(DataType.INT4, w_shape)
+    weight_tensor = gen_finn_dt_tensor(DataType["INT4"], w_shape)
     b_conv.weight = torch.nn.Parameter(torch.from_numpy(weight_tensor).float())
     b_conv.eval()
-    bo.export_finn_onnx(b_conv, ishape, export_onnx_path)
+    if QONNX_export:
+        m_path = export_onnx_path
+        BrevitasONNXManager.export(b_conv, ishape, m_path)
+        qonnx_cleanup(m_path, out_file=m_path)
+        model = ModelWrapper(m_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(m_path)
+    else:
+        bo.export_finn_onnx(b_conv, ishape, export_onnx_path)
     model = ModelWrapper(export_onnx_path)
     model = model.transform(InferShapes())
     inp_tensor = np.random.uniform(low=-1.0, high=1.0, size=ishape).astype(np.float32)
diff --git a/tests/brevitas/test_brevitas_qlinear.py b/tests/brevitas/test_brevitas_qlinear.py
index 62ed358dc9030c35e865921ca7cf9e80c34020fd..1099d3ec83336e5cd07707b35baea112b7a2aee6 100644
--- a/tests/brevitas/test_brevitas_qlinear.py
+++ b/tests/brevitas/test_brevitas_qlinear.py
@@ -27,16 +27,21 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import os
+
+import brevitas.onnx as bo
 import numpy as np
+import os
 import torch
-import brevitas.onnx as bo
-from brevitas.nn import QuantLinear
 from brevitas.core.quant import QuantType
-from finn.core.modelwrapper import ModelWrapper
-from finn.core.datatype import DataType
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from brevitas.nn import QuantLinear
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
+
 import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.basic import gen_finn_dt_tensor
 
 export_onnx_path = "test_brevitas_qlinear.onnx"
@@ -46,8 +51,11 @@ export_onnx_path = "test_brevitas_qlinear.onnx"
 @pytest.mark.parametrize("out_features", [4])
 @pytest.mark.parametrize("in_features", [3])
 @pytest.mark.parametrize("w_bits", [4])
-@pytest.mark.parametrize("i_dtype", [DataType.UINT4])
-def test_brevitas_qlinear(bias, out_features, in_features, w_bits, i_dtype):
+@pytest.mark.parametrize("i_dtype", [DataType["UINT4"]])
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_brevitas_qlinear(
+    bias, out_features, in_features, w_bits, i_dtype, QONNX_export
+):
     i_shape = (1, in_features)
     w_shape = (out_features, in_features)
     b_linear = QuantLinear(
@@ -64,7 +72,15 @@ def test_brevitas_qlinear(bias, out_features, in_features, w_bits, i_dtype):
     )
     b_linear.weight.data = torch.from_numpy(weight_tensor_fp)
     b_linear.eval()
-    bo.export_finn_onnx(b_linear, i_shape, export_onnx_path)
+    if QONNX_export:
+        m_path = export_onnx_path
+        BrevitasONNXManager.export(b_linear, i_shape, m_path)
+        qonnx_cleanup(m_path, out_file=m_path)
+        model = ModelWrapper(m_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(m_path)
+    else:
+        bo.export_finn_onnx(b_linear, i_shape, export_onnx_path)
     model = ModelWrapper(export_onnx_path)
     model = model.transform(InferShapes())
     inp_tensor = gen_finn_dt_tensor(i_dtype, i_shape)
diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py
index 278f05a4a9f264d69461e555d28437cdc83e1a71..57ead3b6c047220e90d4276620cc14b8f795fe08 100644
--- a/tests/brevitas/test_brevitas_relu_act_export.py
+++ b/tests/brevitas/test_brevitas_relu_act_export.py
@@ -26,19 +26,24 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import onnx  # noqa
+import pytest
+
+import brevitas.onnx as bo
 import numpy as np
+import onnx  # noqa
+import os
 import torch
-import brevitas.onnx as bo
-from brevitas.nn import QuantReLU
 from brevitas.core.quant import QuantType
 from brevitas.core.restrict_val import RestrictValueType
 from brevitas.core.scaling import ScalingImplType
-import pytest
-from finn.core.modelwrapper import ModelWrapper
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from brevitas.nn import QuantReLU
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
+
 import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 
 export_onnx_path = "test_brevitas_relu_act_export.onnx"
 
@@ -48,7 +53,8 @@ export_onnx_path = "test_brevitas_relu_act_export.onnx"
 @pytest.mark.parametrize(
     "scaling_impl_type", [ScalingImplType.CONST, ScalingImplType.PARAMETER]
 )
-def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type):
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type, QONNX_export):
     min_val = -1.0
     ishape = (1, 15)
 
@@ -69,8 +75,15 @@ scaling_impl.learned_value": torch.tensor(
             )
         }
         b_act.load_state_dict(checkpoint)
-
-    bo.export_finn_onnx(b_act, ishape, export_onnx_path)
+    if QONNX_export:
+        m_path = export_onnx_path
+        BrevitasONNXManager.export(b_act, ishape, m_path)
+        qonnx_cleanup(m_path, out_file=m_path)
+        model = ModelWrapper(m_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(m_path)
+    else:
+        bo.export_finn_onnx(b_act, ishape, export_onnx_path)
     model = ModelWrapper(export_onnx_path)
     model = model.transform(InferShapes())
     inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(
@@ -101,7 +114,10 @@ scaling_impl.learned_value": torch.tensor(
 @pytest.mark.parametrize("abits", [2, 4, 8])
 @pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)])
 @pytest.mark.parametrize("scaling_per_channel", [True, False])
-def test_brevitas_act_export_relu_imagenet(abits, max_val, scaling_per_channel):
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_brevitas_act_export_relu_imagenet(
+    abits, max_val, scaling_per_channel, QONNX_export
+):
     out_channels = 32
     ishape = (1, out_channels, 1, 1)
     min_val = -1.0
@@ -113,7 +129,7 @@ def test_brevitas_act_export_relu_imagenet(abits, max_val, scaling_per_channel):
         restrict_scaling_type=RestrictValueType.LOG_FP,
         scaling_min_val=2e-16,
         max_val=6.0,
-        return_quant_tensor=True,
+        return_quant_tensor=False,
         per_channel_broadcastable_shape=(1, out_channels, 1, 1),
     )
     if scaling_per_channel is True:
@@ -127,7 +143,15 @@ scaling_impl.learned_value": rand_tensor.type(
         )
     }
     b_act.load_state_dict(checkpoint)
-    bo.export_finn_onnx(b_act, ishape, export_onnx_path)
+    if QONNX_export:
+        m_path = export_onnx_path
+        BrevitasONNXManager.export(b_act, ishape, m_path)
+        qonnx_cleanup(m_path, out_file=m_path)
+        model = ModelWrapper(m_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(m_path)
+    else:
+        bo.export_finn_onnx(b_act, ishape, export_onnx_path)
     model = ModelWrapper(export_onnx_path)
     model = model.transform(InferShapes())
     inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(
@@ -138,7 +162,7 @@ scaling_impl.learned_value": rand_tensor.type(
     produced = odict[model.graph.output[0].name]
     inp_tensor = torch.from_numpy(inp_tensor).float()
     b_act.eval()
-    expected = b_act.forward(inp_tensor).tensor.detach().numpy()
+    expected = b_act.forward(inp_tensor).detach().numpy()
     if not np.isclose(produced, expected, atol=1e-3).all():
         print(abits, max_val)
         print("scale: ", b_act.quant_act_scale().type(torch.FloatTensor).detach())
diff --git a/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py b/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py
similarity index 87%
rename from tests/brevitas/test_brevitas_scaled_QHardTanh_export.py
rename to tests/brevitas/test_brevitas_scaled_qhardtanh_export.py
index b1652c1cdce881157c6e97dcfde4257902678c8f..c6da2e2e971ee97cb73243284920cc87e8b4d7bb 100644
--- a/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py
+++ b/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py
@@ -26,19 +26,24 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import pytest
+
+import brevitas.onnx as bo
+import numpy as np
 import onnx  # noqa
 import os
-import numpy as np
 import torch
-import brevitas.onnx as bo
-from brevitas.nn import QuantHardTanh
-from brevitas.core.restrict_val import RestrictValueType
 from brevitas.core.quant import QuantType
+from brevitas.core.restrict_val import RestrictValueType
 from brevitas.core.scaling import ScalingImplType
-import pytest
-from finn.core.modelwrapper import ModelWrapper
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from brevitas.nn import QuantHardTanh
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
+
 import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 
 export_onnx_path = "test_brevitas_scaled_QHardTanh_export.onnx"
 
@@ -50,8 +55,9 @@ export_onnx_path = "test_brevitas_scaled_QHardTanh_export.onnx"
 @pytest.mark.parametrize(
     "scaling_impl_type", [ScalingImplType.CONST, ScalingImplType.PARAMETER]
 )
+@pytest.mark.parametrize("QONNX_export", [False, True])
 def test_brevitas_act_export_qhardtanh_scaled(
-    abits, narrow_range, min_val, max_val, scaling_impl_type
+    abits, narrow_range, min_val, max_val, scaling_impl_type, QONNX_export
 ):
     def get_quant_type(bit_width):
         if bit_width is None:
@@ -82,8 +88,15 @@ tensor_quant.scaling_impl.learned_value": torch.tensor(
             )
         }
         b_act.load_state_dict(checkpoint)
-
-    bo.export_finn_onnx(b_act, ishape, export_onnx_path)
+    if QONNX_export:
+        m_path = export_onnx_path
+        BrevitasONNXManager.export(b_act, ishape, m_path)
+        qonnx_cleanup(m_path, out_file=m_path)
+        model = ModelWrapper(m_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(m_path)
+    else:
+        bo.export_finn_onnx(b_act, ishape, export_onnx_path)
     model = ModelWrapper(export_onnx_path)
     model = model.transform(InferShapes())
     inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(
diff --git a/tests/brevitas/test_brevitas_validate_mobilenet.py b/tests/brevitas/test_brevitas_validate_mobilenet.py
index dd079fe2e27ace85b9a08e699fa437e93f8e7f3d..12e7e7aff2ec2ebae3e2ec7713a24046553dc5f2 100644
--- a/tests/brevitas/test_brevitas_validate_mobilenet.py
+++ b/tests/brevitas/test_brevitas_validate_mobilenet.py
@@ -26,33 +26,35 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
+import pytest
+
+import brevitas.onnx as bo
 import csv
 import numpy as np
-import brevitas.onnx as bo
+import os
 import torch
-from finn.util.basic import make_build_dir
-from finn.util.pytorch import NormalizePreProc
-from finn.util.test import get_test_model_trained
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.streamline.absorb as absorb
+import finn.util.imagenet as imagenet_util
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.general import RemoveStaticGraphInputs
-from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.general import (
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
     GiveUniqueParameterTensors,
+    RemoveStaticGraphInputs,
 )
-from finn.transformation.merge_onnx_models import MergeONNXModels
-import finn.transformation.streamline.absorb as absorb
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.insert_topk import InsertTopK
-import finn.core.onnx_exec as oxe
-import finn.util.imagenet as imagenet_util
-import pytest
-import torchvision.datasets as datasets
-import torchvision.transforms as transforms
+from finn.transformation.merge_onnx_models import MergeONNXModels
+from finn.util.basic import make_build_dir
+from finn.util.pytorch import NormalizePreProc
+from finn.util.test import get_test_model_trained
 
 # normalization (preprocessing) settings for MobileNet-v1 w4a4
 mean = [0.485, 0.456, 0.406]
diff --git a/tests/end2end/test_end2end_access_board.py b/tests/end2end/test_end2end_access_board.py
index 21b495c74ca8179e1f9e1e3955e665c4c81763b8..ee15980ffb1b750c993a4b499dce57a1b8133e57 100644
--- a/tests/end2end/test_end2end_access_board.py
+++ b/tests/end2end/test_end2end_access_board.py
@@ -27,7 +27,9 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+
 import subprocess
+
 from finn.util.test import get_build_env
 
 
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index a6e7ad642222936775293ec145e845ef111dd4d3..1fddc7c1c26a0ba04d5849809ccf59b0a926a509 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -26,77 +26,80 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
 import pytest
 
+import brevitas.onnx as bo
 import numpy as np
 
 # as of Feb'20 there is a bug that segfaults ONNX shape inference if we
 # import pytorch before onnx, so we make sure to import onnx first
 import onnx  # NOQA
+import os
+import subprocess
 import torch
-import brevitas.onnx as bo
+import warnings
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from collections import OrderedDict
+from dataset_loading import cifar, mnist
+from datetime import datetime
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
+from scipy.stats import linregress
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
+from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
 from finn.core.onnx_exec import execute_onnx
+from finn.core.throughput_test import throughput_test_remote, throughput_test_rtlsim
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from finn.transformation.fold_constants import FoldConstants
-
+from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
 from finn.transformation.general import (
-    RemoveUnusedTensors,
-    RemoveStaticGraphInputs,
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
+    RemoveStaticGraphInputs,
+    RemoveUnusedTensors,
 )
+from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.streamline import Streamline
-from finn.util.test import (
-    get_build_env,
-    load_test_checkpoint_or_skip,
-    get_example_input,
-    get_trained_network_and_ishape,
-    execute_parent,
-    get_topk,
-)
-from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.insert_topk import InsertTopK
 from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from finn.transformation.merge_onnx_models import MergeONNXModels
+from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
+from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.reorder import (
     MakeMaxPoolNHWC,
     MoveScalarLinearPastInvariants,
 )
-import warnings
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
-from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
-from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
-from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
-from finn.core.modelwrapper import ModelWrapper
-from scipy.stats import linregress
-from finn.core.throughput_test import throughput_test_remote, throughput_test_rtlsim
-from finn.util.pytorch import ToTensor
-from finn.transformation.merge_onnx_models import MergeONNXModels
-from finn.transformation.insert_topk import InsertTopK
-from finn.core.datatype import DataType
-from dataset_loading import mnist, cifar
-from datetime import datetime
-import subprocess
 from finn.util.gdrive import upload_to_end2end_dashboard
-from collections import OrderedDict
+from finn.util.pytorch import ToTensor
+from finn.util.test import (
+    execute_parent,
+    get_build_env,
+    get_example_input,
+    get_topk,
+    get_trained_network_and_ishape,
+    load_test_checkpoint_or_skip,
+)
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 target_clk_ns = 10
@@ -104,8 +107,14 @@ mem_mode = "decoupled"
 rtlsim_trace = False
 
 
-def get_checkpoint_name(topology, wbits, abits, step):
-    return build_dir + "/end2end_%s_w%da%d_%s.onnx" % (topology, wbits, abits, step)
+def get_checkpoint_name(topology, wbits, abits, QONNX_export, step):
+    return build_dir + "/end2end_%s_w%da%d_QONNX-%d_%s.onnx" % (
+        topology,
+        wbits,
+        abits,
+        QONNX_export,
+        step,
+    )
 
 
 def get_dashboard_data(topology, wbits, abits):
@@ -303,15 +312,23 @@ def topology2dataset(topology):
 @pytest.mark.parametrize("wbits", [1, 2])
 @pytest.mark.parametrize("abits", [1, 2])
 @pytest.mark.parametrize("topology", ["lfc", "tfc", "cnv"])
+@pytest.mark.parametrize("QONNX_export", [False, True])
 class TestEnd2End:
-    def test_export(self, topology, wbits, abits):
+    def test_export(self, topology, wbits, abits, QONNX_export):
         if wbits > abits:
             pytest.skip("No wbits > abits end2end network configs for now")
         if topology == "lfc" and not (wbits == 1 and abits == 1):
             pytest.skip("Skipping certain lfc configs")
         (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
-        chkpt_name = get_checkpoint_name(topology, wbits, abits, "export")
-        bo.export_finn_onnx(model, ishape, chkpt_name)
+        chkpt_name = get_checkpoint_name(topology, wbits, abits, QONNX_export, "export")
+        if QONNX_export:
+            BrevitasONNXManager.export(model, ishape, chkpt_name)
+            qonnx_cleanup(chkpt_name, out_file=chkpt_name)
+            model = ModelWrapper(chkpt_name)
+            model = model.transform(ConvertQONNXtoFINN())
+            model.save(chkpt_name)
+        else:
+            bo.export_finn_onnx(model, ishape, chkpt_name)
         nname = "%s_w%da%d" % (topology, wbits, abits)
         update_dashboard_data(topology, wbits, abits, "network", nname)
         dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -323,8 +340,10 @@ class TestEnd2End:
         update_dashboard_data(topology, wbits, abits, "finn-commit", finn_commit)
         assert os.path.isfile(chkpt_name)
 
-    def test_import_and_tidy(self, topology, wbits, abits):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "export")
+    def test_import_and_tidy(self, topology, wbits, abits, QONNX_export):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "export"
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(InferShapes())
         model = model.transform(FoldConstants())
@@ -332,17 +351,23 @@ class TestEnd2End:
         model = model.transform(GiveReadableTensorNames())
         model = model.transform(InferDataTypes())
         model = model.transform(RemoveStaticGraphInputs())
-        chkpt = get_checkpoint_name(topology, wbits, abits, "import_and_tidy")
+        chkpt = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "import_and_tidy"
+        )
         model.save(chkpt)
 
-    def test_add_pre_and_postproc(self, topology, wbits, abits):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "import_and_tidy")
+    def test_add_pre_and_postproc(self, topology, wbits, abits, QONNX_export):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "import_and_tidy"
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         global_inp_name = model.graph.input[0].name
         ishape = model.get_tensor_shape(global_inp_name)
         # preprocessing: torchvision's ToTensor divides uint8 inputs by 255
         totensor_pyt = ToTensor()
-        chkpt_preproc_name = get_checkpoint_name(topology, wbits, abits, "preproc")
+        chkpt_preproc_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "preproc"
+        )
         bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)
         assert os.path.isfile(chkpt_preproc_name)
         # join preprocessing and core model
@@ -352,10 +377,12 @@ class TestEnd2End:
         model = model.transform(MergeONNXModels(pre_model))
         # add input quantization annotation: UINT8 for all BNN-PYNQ models
         global_inp_name = model.graph.input[0].name
-        model.set_tensor_datatype(global_inp_name, DataType.UINT8)
+        model.set_tensor_datatype(global_inp_name, DataType["UINT8"])
         # postprocessing: insert Top-1 node at the end
         model = model.transform(InsertTopK(k=1))
-        chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post")
+        chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "pre_post"
+        )
         # tidy-up again
         model = model.transform(InferShapes())
         model = model.transform(FoldConstants())
@@ -366,8 +393,10 @@ class TestEnd2End:
         model.save(chkpt_name)
         assert os.path.isfile(chkpt_name)
 
-    def test_streamline(self, topology, wbits, abits):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post")
+    def test_streamline(self, topology, wbits, abits, QONNX_export):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "pre_post"
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(absorb.AbsorbSignBiasIntoMultiThreshold())
         # move past any reshapes to be able to streamline input scaling
@@ -383,10 +412,14 @@ class TestEnd2End:
         model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
         model = model.transform(InferDataLayouts())
         model = model.transform(RemoveUnusedTensors())
-        model.save(get_checkpoint_name(topology, wbits, abits, "streamline"))
+        model.save(
+            get_checkpoint_name(topology, wbits, abits, QONNX_export, "streamline")
+        )
 
-    def test_convert_to_hls_layers(self, topology, wbits, abits):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "streamline")
+    def test_convert_to_hls_layers(self, topology, wbits, abits, QONNX_export):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "streamline"
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         if topology == "tfc" and wbits == 1 and abits == 1:
             # use standalone thresholds for tfc-w1a1 to also exercise that option
@@ -408,16 +441,55 @@ class TestEnd2End:
         model = model.transform(absorb.AbsorbConsecutiveTransposes())
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(InferDataLayouts())
-        model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hls_layers"))
+        model.save(
+            get_checkpoint_name(
+                topology, wbits, abits, QONNX_export, "convert_to_hls_layers"
+            )
+        )
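+        # expected (op_type, count) pairs per topology after HLS conversion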
+        exp_layer_counts = {
+            "tfc": [
+                ("Reshape", 1),
+                ("Thresholding_Batch", 1),
+                ("StreamingFCLayer_Batch", 4),
+                ("LabelSelect_Batch", 1),
+            ],
+            "tfc-1-1": [
+                ("Reshape", 1),
+                ("Thresholding_Batch", 4),
+                ("StreamingFCLayer_Batch", 4),
+                ("LabelSelect_Batch", 1),
+            ],
+            "lfc": [
+                ("Reshape", 1),
+                ("Thresholding_Batch", 1),
+                ("StreamingFCLayer_Batch", 4),
+                ("LabelSelect_Batch", 1),
+            ],
+            "cnv": [
+                ("Transpose", 1),
+                ("Thresholding_Batch", 1),
+                ("ConvolutionInputGenerator", 6),
+                ("StreamingFCLayer_Batch", 9),
+                ("StreamingMaxPool_Batch", 2),
+                ("LabelSelect_Batch", 1),
+            ],
+        }
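+        # tfc-w1a1 was converted with standalone thresholds above, so it
+        # gets its own expected layer counts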
+        if topology == "tfc" and wbits == 1 and abits == 1:
+            exp_key = "tfc-1-1"
+        else:
+            exp_key = topology
+        exp_layer_counts = exp_layer_counts[exp_key]
+        for (op_type, exp_count) in exp_layer_counts:
+            assert len(model.get_nodes_by_op_type(op_type)) == exp_count
 
-    def test_create_dataflow_partition(self, topology, wbits, abits):
+    def test_create_dataflow_partition(self, topology, wbits, abits, QONNX_export):
         prev_chkpt_name = get_checkpoint_name(
-            topology, wbits, abits, "convert_to_hls_layers"
+            topology, wbits, abits, QONNX_export, "convert_to_hls_layers"
         )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         parent_model = model.transform(CreateDataflowPartition())
         parent_model_chkpt = get_checkpoint_name(
-            topology, wbits, abits, "dataflow_parent"
+            topology, wbits, abits, QONNX_export, "dataflow_parent"
         )
         parent_model.save(parent_model_chkpt)
         sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
@@ -425,28 +497,36 @@ class TestEnd2End:
         dataflow_model_filename = sdp_node.get_nodeattr("model")
         dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
         dataflow_model_chkpt = get_checkpoint_name(
-            topology, wbits, abits, "dataflow_model"
+            topology, wbits, abits, QONNX_export, "dataflow_model"
         )
         dataflow_model.save(dataflow_model_chkpt)
 
-    def test_fold(self, topology, wbits, abits):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "dataflow_model")
+    def test_fold(self, topology, wbits, abits, QONNX_export):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "dataflow_model"
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         folding_fxn = get_folding_function(topology, wbits, abits)
         model = folding_fxn(model)
-        model.save(get_checkpoint_name(topology, wbits, abits, "fold"))
+        model.save(get_checkpoint_name(topology, wbits, abits, QONNX_export, "fold"))
 
     @pytest.mark.slow
     @pytest.mark.vivado
-    def test_cppsim(self, topology, wbits, abits):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold")
+    def test_cppsim(self, topology, wbits, abits, QONNX_export):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "fold"
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         model = model.transform(PrepareCppSim())
         model = model.transform(CompileCppSim())
         model = model.transform(SetExecMode("cppsim"))
-        cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim")
+        cppsim_chkpt = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "cppsim"
+        )
         model.save(cppsim_chkpt)
-        parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
+        parent_chkpt = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "dataflow_parent"
+        )
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
             topology, wbits, abits, return_topk=1
         )
@@ -456,22 +536,28 @@ class TestEnd2End:
     @pytest.mark.slow
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_ipgen(self, topology, wbits, abits, kind):
+    def test_ipgen(self, topology, wbits, abits, QONNX_export, kind):
         if kind == "alveo" and ("VITIS_PATH" not in os.environ):
             pytest.skip("VITIS_PATH not set")
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold")
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "fold"
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(kind, target_clk_ns)["part"]
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
         model = model.transform(HLSSynthIP())
-        model.save(get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind))
+        model.save(
+            get_checkpoint_name(topology, wbits, abits, QONNX_export, "ipgen_" + kind)
+        )
 
     @pytest.mark.slow
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_set_fifo_depths(self, topology, wbits, abits, kind):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)
+    def test_set_fifo_depths(self, topology, wbits, abits, QONNX_export, kind):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "ipgen_" + kind
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(kind, target_clk_ns)["part"]
         model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
@@ -483,14 +569,18 @@ class TestEnd2End:
                 op_inst = getCustomOp(node)
                 assert op_inst.get_nodeattr("inFIFODepth") == 0
                 assert op_inst.get_nodeattr("outFIFODepth") == 0
-        model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + kind))
+        model.save(
+            get_checkpoint_name(
+                topology, wbits, abits, QONNX_export, "fifodepth_" + kind
+            )
+        )
 
     @pytest.mark.slow
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq"])
-    def test_ipstitch_rtlsim(self, topology, wbits, abits, kind):
+    def test_ipstitch_rtlsim(self, topology, wbits, abits, QONNX_export, kind):
         prev_chkpt_name = get_checkpoint_name(
-            topology, wbits, abits, "fifodepth_" + kind
+            topology, wbits, abits, QONNX_export, "fifodepth_" + kind
         )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(kind, target_clk_ns)["part"]
@@ -514,54 +604,61 @@ class TestEnd2End:
             )
             os.environ["RTLSIM_TRACE_DEPTH"] = "3"
         rtlsim_chkpt = get_checkpoint_name(
-            topology, wbits, abits, "ipstitch_rtlsim_" + kind
+            topology, wbits, abits, QONNX_export, "ipstitch_rtlsim_" + kind
         )
         model.save(rtlsim_chkpt)
-        parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
+        parent_chkpt = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "dataflow_parent"
+        )
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
             topology, wbits, abits, return_topk=1
         )
         y = execute_parent(parent_chkpt, rtlsim_chkpt, input_tensor_npy)
-        model = ModelWrapper(rtlsim_chkpt)
-        perf["cycles_rtlsim"] = model.get_metadata_prop("cycles_rtlsim")
-        # warnings.warn("Estimated & rtlsim performance: " + str(perf))
-        # for (k, v) in perf.items():
-        #    update_dashboard_data(topology, wbits, abits, k, v)
-        update_dashboard_data(
-            topology, wbits, abits, "cycles_rtlsim", perf["cycles_rtlsim"]
-        )
         assert np.isclose(y, output_tensor_npy).all()
 
     @pytest.mark.slow
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq"])
-    def test_throughput_rtlsim(self, topology, wbits, abits, kind):
+    def test_throughput_rtlsim(self, topology, wbits, abits, QONNX_export, kind):
         prev_chkpt_name = get_checkpoint_name(
-            topology, wbits, abits, "ipstitch_rtlsim_" + kind
+            topology, wbits, abits, QONNX_export, "ipstitch_rtlsim_" + kind
         )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         n_nodes = len(model.graph.node)
         perf_est = model.analysis(dataflow_performance)
-        latency = int(model.get_metadata_prop("cycles_rtlsim"))
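+        # measure batch-1 latency directly in rtlsim instead of reading the
+        # previously stored cycles_rtlsim metadata property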
+        ret_b1 = throughput_test_rtlsim(model, batchsize=1)
+        latency = int(ret_b1["cycles"])
         cycles_per_sample_est = perf_est["max_cycles"]
         batchsize = 2 * n_nodes
         ret = throughput_test_rtlsim(model, batchsize=batchsize)
         res_cycles = ret["cycles"]
         est_cycles = latency + cycles_per_sample_est * batchsize
+        # warnings.warn("Estimated & rtlsim performance: " + str(perf))
+        # for (k, v) in perf.items():
+        #    update_dashboard_data(topology, wbits, abits, k, v)
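+        # record the measured batch-1 latency on the dashboard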
+        update_dashboard_data(topology, wbits, abits, "cycles_rtlsim", latency)
         assert (abs(res_cycles - est_cycles) / res_cycles) < 0.15
 
     @pytest.mark.slow
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq"])
-    def test_validate_top1(self, topology, wbits, abits, kind):
+    def test_validate_top1(self, topology, wbits, abits, QONNX_export, kind):
         if "TEST_END2END_VALIDATE_TOP1" not in os.environ:
             pytest.skip("TEST_END2END_VALIDATE_TOP1 not set")
-        prepostproc_chkpt = get_checkpoint_name(topology, wbits, abits, "pre_post")
-        streamline_chkpt = get_checkpoint_name(topology, wbits, abits, "streamline")
-        parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
-        cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim")
+        prepostproc_chkpt = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "pre_post"
+        )
+        streamline_chkpt = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "streamline"
+        )
+        parent_chkpt = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "dataflow_parent"
+        )
+        cppsim_chkpt = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "cppsim"
+        )
         rtlsim_chkpt = get_checkpoint_name(
-            topology, wbits, abits, "ipstitch_rtlsim_" + kind
+            topology, wbits, abits, QONNX_export, "ipstitch_rtlsim_" + kind
         )
         dataset = topology2dataset(topology)
         assert measure_top1_accuracy(prepostproc_chkpt, dataset) > 80
@@ -573,11 +670,11 @@ class TestEnd2End:
     @pytest.mark.vivado
     @pytest.mark.vitis
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_build(self, topology, wbits, abits, kind):
+    def test_build(self, topology, wbits, abits, QONNX_export, kind):
         if kind == "alveo" and ("VITIS_PATH" not in os.environ):
             pytest.skip("VITIS_PATH not set")
         prev_chkpt_name = get_checkpoint_name(
-            topology, wbits, abits, "fifodepth_" + kind
+            topology, wbits, abits, QONNX_export, "fifodepth_" + kind
         )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         cfg = get_build_env(kind, target_clk_ns)
@@ -587,11 +684,32 @@ class TestEnd2End:
         for (k, v) in synth_dct.items():
             update_dashboard_data(topology, wbits, abits, k, v)
         update_dashboard_data(topology, wbits, abits, "board", cfg["board"])
-        model.save(get_checkpoint_name(topology, wbits, abits, "build_" + kind))
+        model.save(
+            get_checkpoint_name(topology, wbits, abits, QONNX_export, "build_" + kind)
+        )
 
+    @pytest.mark.slow
+    @pytest.mark.vivado
+    @pytest.mark.vitis
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_deploy(self, topology, wbits, abits, kind):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + kind)
+    def test_make_pynq_driver(self, topology, wbits, abits, QONNX_export, kind):
+        if kind == "alveo" and ("VITIS_PATH" not in os.environ):
+            pytest.skip("VITIS_PATH not set")
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "build_" + kind
+        )
+        model = load_test_checkpoint_or_skip(prev_chkpt_name)
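+        # map the build kind to the platform string expected by MakePYNQDriver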
+        kind_to_driver_platform = {"zynq": "zynq-iodma", "alveo": "alveo"}
+        model = model.transform(MakePYNQDriver(kind_to_driver_platform[kind]))
+        model.save(
+            get_checkpoint_name(topology, wbits, abits, QONNX_export, "driver_" + kind)
+        )
+
+    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
+    def test_deploy(self, topology, wbits, abits, QONNX_export, kind):
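+        # resume from the driver checkpoint written by test_make_pynq_driver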
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "driver_" + kind
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         cfg = get_build_env(kind, target_clk_ns)
         if cfg["ip"] == "":
@@ -606,11 +724,15 @@ class TestEnd2End:
             )
         )
         # save the model to be able to link it to the parent
-        model.save(get_checkpoint_name(topology, wbits, abits, "deploy_" + kind))
+        model.save(
+            get_checkpoint_name(topology, wbits, abits, QONNX_export, "deploy_" + kind)
+        )
 
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_run_on_hw(self, topology, wbits, abits, kind):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "deploy_" + kind)
+    def test_run_on_hw(self, topology, wbits, abits, QONNX_export, kind):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "deploy_" + kind
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)  # NOQA
         cfg = get_build_env(kind, target_clk_ns)
         if cfg["ip"] == "":
@@ -619,7 +741,7 @@ class TestEnd2End:
             topology, wbits, abits, return_topk=1
         )
         parent_model = load_test_checkpoint_or_skip(
-            get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
+            get_checkpoint_name(topology, wbits, abits, QONNX_export, "dataflow_parent")
         )
         iname = parent_model.graph.input[0].name
         oname = parent_model.graph.output[0].name
@@ -631,8 +753,10 @@ class TestEnd2End:
         assert np.isclose(y, output_tensor_npy).all()
 
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_throughput_hw(self, topology, wbits, abits, kind):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "deploy_" + kind)
+    def test_throughput_hw(self, topology, wbits, abits, QONNX_export, kind):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, QONNX_export, "deploy_" + kind
+        )
         end2end_example = "%s_w%da%d_%s" % (topology, wbits, abits, kind)
         model = load_test_checkpoint_or_skip(prev_chkpt_name)  # NOQA
         cfg = get_build_env(kind, target_clk_ns)
@@ -688,9 +812,13 @@ class TestEnd2End:
             ret[largest_bsize]["throughput[images/s]"],
         )
 
-    def test_upload_results_to_dashboard(self, topology, wbits, abits):
-        dashboard_data = get_dashboard_data(topology, wbits, abits)
-        if len(dashboard_data.keys()) > 0:
-            upload_to_end2end_dashboard(dashboard_data)
+    def test_upload_results_to_dashboard(self, topology, wbits, abits, QONNX_export):
+        # TODO: extend the dashboard to also upload QONNX-exported models?
+        if QONNX_export:
+            pytest.skip("Dashboard data upload is disabled for QONNX-exported models.")
         else:
-            pytest.skip("No data to upload to dashboard")
+            dashboard_data = get_dashboard_data(topology, wbits, abits)
+            if len(dashboard_data.keys()) > 0:
+                upload_to_end2end_dashboard(dashboard_data)
+            else:
+                pytest.skip("No data to upload to dashboard")
diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py
index 63d6a91e37586030a15c9a7c2523679875444f3f..e24d87ca6a505de7d0ed50b01157092eb0a26525 100644
--- a/tests/end2end/test_end2end_cybsec_mlp.py
+++ b/tests/end2end/test_end2end_cybsec_mlp.py
@@ -26,40 +26,45 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import pkg_resources as pk
+
+import pytest
+
+import brevitas.onnx as bo
+import json
+import numpy as np
+import os
+import shutil
+import subprocess
 import torch
-from brevitas.nn import QuantLinear, QuantReLU
-from brevitas.quant_tensor import QuantTensor
 import torch.nn as nn
-import numpy as np
+import wget
 from brevitas.core.quant import QuantType
-from brevitas.nn import QuantIdentity
-import brevitas.onnx as bo
-from finn.core.modelwrapper import ModelWrapper
-from finn.core.datatype import DataType
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from brevitas.nn import QuantIdentity, QuantLinear, QuantReLU
+from brevitas.quant_tensor import QuantTensor
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
+
 import finn.builder.build_dataflow as build
 import finn.builder.build_dataflow_config as build_cfg
-import os
-import shutil
-from finn.util.test import get_build_env, load_test_checkpoint_or_skip
-import pytest
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.basic import make_build_dir
-import pkg_resources as pk
-import json
-import wget
-import subprocess
+from finn.util.test import get_build_env, load_test_checkpoint_or_skip
 
 target_clk_ns = 10
 build_kind = "zynq"
 build_dir = os.environ["FINN_BUILD_DIR"]
 
 
-def get_checkpoint_name(step):
+def get_checkpoint_name(step, QONNX_export):
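+    # QONNX_export (bool) is encoded as 0/1 in the checkpoint name, keeping
+    # the FINN-export and QONNX-export flows separate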
     if step == "build":
         # checkpoint for build step is an entire dir
-        return build_dir + "/end2end_cybsecmlp_build"
+        return build_dir + "/end2end_cybsecmlp_build_QONNX-%d" % (QONNX_export)
     else:
         # other checkpoints are onnx files
-        return build_dir + "/end2end_cybsecmlp_%s.onnx" % (step)
+        return build_dir + "/end2end_cybsecmlp_QONNX-%d_%s.onnx" % (QONNX_export, step)
 
 
 class CybSecMLPForExport(nn.Module):
@@ -80,7 +85,8 @@ class CybSecMLPForExport(nn.Module):
         return out_final
 
 
-def test_end2end_cybsec_mlp_export():
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_end2end_cybsec_mlp_export(QONNX_export):
     assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/")
     # load up trained net in Brevitas
     input_size = 593
@@ -114,7 +120,7 @@ def test_end2end_cybsec_mlp_export():
     W_new = np.pad(W_orig, [(0, 0), (0, 7)])
     model[0].weight.data = torch.from_numpy(W_new)
     model_for_export = CybSecMLPForExport(model)
-    export_onnx_path = get_checkpoint_name("export")
+    export_onnx_path = get_checkpoint_name("export", QONNX_export)
     input_shape = (1, 600)
     # create a QuantTensor instance to mark the input as bipolar during export
     input_a = np.random.randint(0, 1, size=input_shape).astype(np.float32)
@@ -125,32 +131,61 @@ def test_end2end_cybsec_mlp_export():
         input_t, scale=torch.tensor(scale), bit_width=torch.tensor(1.0), signed=True
     )
 
-    bo.export_finn_onnx(
-        model_for_export, export_path=export_onnx_path, input_t=input_qt
-    )
+    if QONNX_export:
+        # With the BrevitasONNXManager we need to manually set
+        # the FINN DataType at the input
+        BrevitasONNXManager.export(
+            model_for_export, input_shape, export_path=export_onnx_path
+        )
+        model = ModelWrapper(export_onnx_path)
+        model.set_tensor_datatype(model.graph.input[0].name, DataType["BIPOLAR"])
+        model.save(export_onnx_path)
+        qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)
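+        # convert the cleaned-up QONNX model into FINN's internal dialect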
+        model = ModelWrapper(export_onnx_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(export_onnx_path)
+    else:
+        bo.export_finn_onnx(
+            model_for_export, export_path=export_onnx_path, input_t=input_qt
+        )
     assert os.path.isfile(export_onnx_path)
     # fix input datatype
     finn_model = ModelWrapper(export_onnx_path)
     finnonnx_in_tensor_name = finn_model.graph.input[0].name
     assert tuple(finn_model.get_tensor_shape(finnonnx_in_tensor_name)) == (1, 600)
     # verify a few exported ops
-    assert finn_model.graph.node[1].op_type == "Add"
-    assert finn_model.graph.node[2].op_type == "Div"
-    assert finn_model.graph.node[3].op_type == "MatMul"
-    assert finn_model.graph.node[-1].op_type == "MultiThreshold"
+    if QONNX_export:
+        # The first "Mul" node doesn't exist in the QONNX export,
+        # because the QuantTensor scale is not exported.
+        # However, this node would have had unity scale anyway, so
+        # the models are still equivalent.
+        assert finn_model.graph.node[0].op_type == "Add"
+        assert finn_model.graph.node[1].op_type == "Div"
+        assert finn_model.graph.node[2].op_type == "MatMul"
+        assert finn_model.graph.node[-1].op_type == "MultiThreshold"
+    else:
+        assert finn_model.graph.node[0].op_type == "Mul"
+        assert finn_model.get_initializer(finn_model.graph.node[0].input[1]) == 1.0
+        assert finn_model.graph.node[1].op_type == "Add"
+        assert finn_model.graph.node[2].op_type == "Div"
+        assert finn_model.graph.node[3].op_type == "MatMul"
+        assert finn_model.graph.node[-1].op_type == "MultiThreshold"
     # verify datatypes on some tensors
-    assert finn_model.get_tensor_datatype(finnonnx_in_tensor_name) == DataType.BIPOLAR
-    first_matmul_w_name = finn_model.graph.node[3].input[1]
-    assert finn_model.get_tensor_datatype(first_matmul_w_name) == DataType.INT2
+    assert (
+        finn_model.get_tensor_datatype(finnonnx_in_tensor_name) == DataType["BIPOLAR"]
+    )
+    first_matmul_w_name = finn_model.get_nodes_by_op_type("MatMul")[0].input[1]
+    assert finn_model.get_tensor_datatype(first_matmul_w_name) == DataType["INT2"]
 
 
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_end2end_cybsec_mlp_build():
-    model_file = get_checkpoint_name("export")
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_end2end_cybsec_mlp_build(QONNX_export):
+    model_file = get_checkpoint_name("export", QONNX_export)
     load_test_checkpoint_or_skip(model_file)
     build_env = get_build_env(build_kind, target_clk_ns)
-    output_dir = make_build_dir("test_end2end_cybsec_mlp_build")
+    output_dir = make_build_dir(f"test_end2end_cybsec_mlp_build_QONNX-{QONNX_export}")
 
     cfg = build.DataflowBuildConfig(
         output_dir=output_dir,
@@ -188,13 +223,14 @@ def test_end2end_cybsec_mlp_build():
         est_res_dict = json.load(f)
         assert est_res_dict["total"]["LUT"] == 11360.0
         assert est_res_dict["total"]["BRAM_18K"] == 36.0
-    shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build"))
+    shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build", QONNX_export))
 
 
-def test_end2end_cybsec_mlp_run_on_hw():
+@pytest.mark.parametrize("QONNX_export", [False, True])
+def test_end2end_cybsec_mlp_run_on_hw(QONNX_export):
     build_env = get_build_env(build_kind, target_clk_ns)
     assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/")
-    deploy_dir = get_checkpoint_name("build")
+    deploy_dir = get_checkpoint_name("build", QONNX_export)
     if not os.path.isdir(deploy_dir):
         pytest.skip(deploy_dir + " not found from previous test step, skipping")
     driver_dir = deploy_dir + "/driver"
diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py
index 5bfe8e1ea1b48ed77c40a584d624cc0ecdedb668..e459bfbc3e694d5bbc9698db562765b11f6e8c38 100644
--- a/tests/end2end/test_end2end_mobilenet_v1.py
+++ b/tests/end2end/test_end2end_mobilenet_v1.py
@@ -25,57 +25,55 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import time
 import pytest
 
-from PIL import Image
-import os
-import numpy as np
 import brevitas.onnx as bo
+import numpy as np
+import os
+import time
 import torch
+from PIL import Image
 
-from finn.custom_op.registry import getCustomOp
-from finn.util.pytorch import NormalizePreProc
-from finn.util.test import (
-    get_test_model_trained,
-    load_test_checkpoint_or_skip,
-    resize_smaller_side,
-    crop_center,
-)
-
-from finn.core.modelwrapper import ModelWrapper
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.streamline.absorb as absorb
+import finn.transformation.streamline.reorder as reorder
 from finn.core.datatype import DataType
-
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.onnx_exec import execute_onnx
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d
+from finn.transformation.double_to_single_float import DoubleToSingleFloat
 from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import (
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
     GiveUniqueParameterTensors,
     RemoveUnusedTensors,
 )
-from finn.transformation.merge_onnx_models import MergeONNXModels
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.insert_topk import InsertTopK
-import finn.transformation.streamline.absorb as absorb
-import finn.transformation.streamline.reorder as reorder
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from finn.transformation.merge_onnx_models import MergeONNXModels
+from finn.transformation.remove import RemoveIdentityOps
 from finn.transformation.streamline import Streamline
-from finn.transformation.double_to_single_float import DoubleToSingleFloat
-from finn.transformation.streamline.remove import RemoveIdentityOps
 from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul
-from finn.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.transformation.fpgadataflow.create_dataflow_partition import (
-    CreateDataflowPartition,
+from finn.util.basic import alveo_default_platform, alveo_part_map
+from finn.util.pytorch import NormalizePreProc
+from finn.util.test import (
+    crop_center,
+    get_test_model_trained,
+    load_test_checkpoint_or_skip,
+    resize_smaller_side,
 )
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.core.onnx_exec import execute_onnx
-from finn.util.basic import alveo_part_map, alveo_default_platform
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 
@@ -99,7 +97,9 @@ def test_end2end_mobilenet_export():
     bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx)
     preproc_model = ModelWrapper(preproc_onnx)
     # set input finn datatype to UINT8
-    preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType.UINT8)
+    preproc_model.set_tensor_datatype(
+        preproc_model.graph.input[0].name, DataType["UINT8"]
+    )
     preproc_model = preproc_model.transform(InferShapes())
     preproc_model = preproc_model.transform(FoldConstants())
     preproc_model = preproc_model.transform(GiveUniqueNodeNames())
@@ -200,6 +200,7 @@ def test_end2end_mobilenet_lowering():
     )
     model = model.transform(LowerConvsToMatMul())
     model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
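+    # collapse consecutive Transpose nodes that cancel each other out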
+    model = model.transform(absorb.AbsorbConsecutiveTransposes())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(InferDataTypes())
diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py
index aa0ce7a6c6c6148ca28b58428ad60d7eb0347bea..550dab4d0321001547efe97487abc543271dcf2e 100644
--- a/tests/end2end/test_ext_weights.py
+++ b/tests/end2end/test_ext_weights.py
@@ -26,16 +26,19 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import finn.builder.build_dataflow as build
-import finn.builder.build_dataflow_config as build_cfg
+import pkg_resources as pk
+
+import pytest
+
 import os
 import shutil
-from finn.util.test import get_build_env, load_test_checkpoint_or_skip
-import pytest
-from finn.util.basic import make_build_dir
-import pkg_resources as pk
-import wget
 import subprocess
+import wget
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+from finn.util.basic import make_build_dir
+from finn.util.test import get_build_env, load_test_checkpoint_or_skip
 
 target_clk_ns = 10
 build_kind = "zynq"
diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py
index cf3e064804216e192909eae75f01880554f03d9f..5ddff3d36f03d17833e17bc98649a64dabf31577 100644
--- a/tests/fpgadataflow/test_code_gen_trafo.py
+++ b/tests/fpgadataflow/test_code_gen_trafo.py
@@ -26,10 +26,11 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
+import pytest
 
+import os
 from onnx import TensorProto, helper
-import pytest
+
 import finn.util.basic as util
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
@@ -38,7 +39,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 
 @pytest.mark.vivado
 def test_code_gen_trafo():
-    idt = wdt = odt = DataType.BIPOLAR
+    idt = wdt = odt = DataType["BIPOLAR"]
     mw = 8
     mh = 8
     pe = 4
diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py
index a12c69285b7b335f075d8ffd7ba27e039ebc6f8c..81e2ff9a7c5829982cdb6121378e9e9e3af81632 100644
--- a/tests/fpgadataflow/test_compilation_trafo.py
+++ b/tests/fpgadataflow/test_compilation_trafo.py
@@ -26,21 +26,21 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
+import pytest
 
+import os
 from onnx import TensorProto, helper
 
-import pytest
 import finn.util.basic as util
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 
 
 @pytest.mark.vivado
 def test_compilation_trafo():
-    idt = wdt = odt = DataType.BIPOLAR
+    idt = wdt = odt = DataType["BIPOLAR"]
     mw = 8
     mh = 8
     pe = 4
diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
index dfdb21fa72cbbeeb503f7ecc447b659ef7934fb9..5cc5f8fa6c1ccd3e5a9e154b6fb2773caf4668a9 100644
--- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
@@ -26,30 +26,29 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from onnx import TensorProto, helper
-import numpy as np
 import pytest
 
-from finn.core.datatype import DataType
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+import numpy as np
+from onnx import TensorProto, helper
 
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 import finn.core.onnx_exec as oxe
-from finn.core.modelwrapper import ModelWrapper
-from finn.util.basic import gen_finn_dt_tensor
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from finn.util.basic import gen_finn_dt_tensor
 
 
 # conv_config:
@@ -73,7 +72,7 @@ from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
     pad, kernel_size, stride, dilation = conv_config
     np.random.seed(0)
-    idt = DataType.UINT4
+    idt = DataType["UINT4"]
 
     in_feature_dim_h, in_feature_dim_w = [10, 1]
     in_chn = 16
@@ -102,7 +101,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
     input_shape = [1, in_chn, in_feature_dim_h, in_feature_dim_w]
     output_shape = [1, out_chn, out_feature_dim_h, out_feature_dim_w]
 
-    conv_weight_dt = DataType.UINT4
+    conv_weight_dt = DataType["UINT4"]
 
     conv_config = {}
     conv_config["dilations"] = [dilation_h, dilation_w]
diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
index 40f0a620c6cd5db873a731c038a737b35c1cce9d..bf690d1d68bc0f580663735c3596c1dfc0a651e8 100644
--- a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
@@ -28,24 +28,23 @@
 
 import pytest
 
+import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.infer_shapes import InferShapes
-import numpy as np
+from finn.util.basic import gen_finn_dt_tensor
 
 
 def prepare_inputs(input_tensor):
@@ -77,9 +76,13 @@ def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape):
 
 
 # parameter datatype
-@pytest.mark.parametrize("pdt", [DataType.BIPOLAR, DataType.UINT4, DataType.INT2])
+@pytest.mark.parametrize(
+    "pdt", [DataType["BIPOLAR"], DataType["UINT4"], DataType["INT2"]]
+)
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.INT32, DataType.UINT4, DataType.INT4])
+@pytest.mark.parametrize(
+    "idt", [DataType["INT32"], DataType["UINT4"], DataType["INT4"]]
+)
 # function
 @pytest.mark.parametrize("onnx_op_name", ["Add", "Mul"])
 # vector parameter or scalar parameter (broadcast)
@@ -104,10 +107,10 @@ def test_convert_to_hls_channelwise_layer(
 
     # Since there aren't data types with a bit width of a non-power of 2,
     # there are cases where the input won't use its full range.
-    if idt == DataType.INT32:
-        x = gen_finn_dt_tensor(DataType.INT16, (1, ifm_ch, ifm_dim, ifm_dim))
-    elif idt == DataType.UINT32:
-        x = gen_finn_dt_tensor(DataType.UINT16, (1, ifm_ch, ifm_dim, ifm_dim))
+    if idt == DataType["INT32"]:
+        x = gen_finn_dt_tensor(DataType["INT16"], (1, ifm_ch, ifm_dim, ifm_dim))
+    elif idt == DataType["UINT32"]:
+        x = gen_finn_dt_tensor(DataType["UINT16"], (1, ifm_ch, ifm_dim, ifm_dim))
     else:
         x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim))
 
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
index 8b7b0a4b6a93cde690a3d87eb3a2f1a0a55a85f8..9b0f3d68aed655f0b36857d50a085093ea94aecb 100755
--- a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
@@ -26,34 +26,30 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from onnx import TensorProto, helper
-import numpy as np
 import pytest
 
-from finn.core.datatype import DataType
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+import numpy as np
+from onnx import TensorProto, helper
 
+import finn.core.data_layout as DataLayout
 import finn.core.onnx_exec as oxe
-from finn.core.modelwrapper import ModelWrapper
-from finn.util.basic import gen_finn_dt_tensor
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+import finn.transformation.streamline.absorb as absorb
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.custom_op.general.im2col import compute_conv_output_dim
-
-import finn.transformation.streamline.absorb as absorb
-from finn.transformation.general import RemoveUnusedTensors
-from finn.transformation.streamline import Streamline
+from finn.transformation.general import GiveUniqueNodeNames, RemoveUnusedTensors
 from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants
-
-import finn.core.data_layout as DataLayout
+from finn.util.basic import gen_finn_dt_tensor
 
 
 def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
@@ -83,10 +79,10 @@ def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
 @pytest.mark.slow
 def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
     np.random.seed(0)
-    idt = DataType.UINT4
-    odt = DataType.UINT4
-    conv_weight_dt = DataType.INT4
-    fc_weight_dt = DataType.INT4
+    idt = DataType["UINT4"]
+    odt = DataType["UINT4"]
+    conv_weight_dt = DataType["INT4"]
+    fc_weight_dt = DataType["INT4"]
 
     input_shape, kernel_shape, stride, pad = conv_config
     kernel_size_h, kernel_size_w = kernel_shape
@@ -190,8 +186,8 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
     model.set_tensor_datatype("global_out", odt)
     model.set_tensor_datatype("conv_param", conv_weight_dt)
     model.set_tensor_datatype("matmul_param", fc_weight_dt)
-    model.set_tensor_datatype("thres1_param", DataType.INT32)
-    model.set_tensor_datatype("thres2_param", DataType.INT32)
+    model.set_tensor_datatype("thres1_param", DataType["INT32"])
+    model.set_tensor_datatype("thres2_param", DataType["INT32"])
 
     model.set_initializer(
         "conv_param", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape)
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index d88576583eaacb7579b02bc00e4e0f9b77b16f7e..d96bc987567cdcfcd18a404986c954c7527c7354 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -26,30 +26,29 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from onnx import TensorProto, helper
-import numpy as np
 import pytest
 
-from finn.core.datatype import DataType
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+import numpy as np
+from onnx import TensorProto, helper
 
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 import finn.core.onnx_exec as oxe
-from finn.core.modelwrapper import ModelWrapper
-from finn.util.basic import gen_finn_dt_tensor
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from finn.util.basic import gen_finn_dt_tensor
 
 # conv_config: kernel_size, stride, pad
 
@@ -64,7 +63,7 @@ from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     kernel_size, stride, pad = conv_config
     np.random.seed(0)
-    idt = DataType.UINT4
+    idt = DataType["UINT4"]
 
     in_feature_dim = 7
     in_chn = 16
@@ -85,7 +84,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
     output_shape = [1, out_chn, out_feature_dim, out_feature_dim]
 
-    conv_weight_dt = DataType.UINT4
+    conv_weight_dt = DataType["UINT4"]
 
     conv_config = {}
     conv_config["dilations"] = [1, 1]
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
index 15bf160799826b0d50a0f043a56dd1fc2accdd12..3357ee6d6c1e540818549f2d0df8b8554690ca3c 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
@@ -26,29 +26,31 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
 import pkg_resources as pk
 
+import pytest
+
 import brevitas.onnx as bo
 import numpy as np
-import pytest
+import os
+
 import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
-from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
 from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from finn.transformation.streamline import Streamline
+from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
 from finn.util.test import get_test_model_trained
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
-import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.custom_op.registry import getCustomOp
 
 export_onnx_path_cnv = "test_convert_to_hls_layers_cnv.onnx"
 
@@ -68,6 +70,7 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation):
     model = model.transform(LowerConvsToMatMul())
     model = model.transform(MakeMaxPoolNHWC())
     model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
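+    # drop any remaining back-to-back Transposes before the XNOR-popcount conversion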
+    model = model.transform(absorb.AbsorbConsecutiveTransposes())
     model = model.transform(ConvertBipolarMatMulToXnorPopcount())
     model = model.transform(Streamline())
     model = model.transform(InferDataLayouts())
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
index cb66fa7237416579b509aa4f508c9105d386d08a..a1dc11e0eee5aab462beb0ec34b8771ced20a379 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
@@ -26,15 +26,16 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-from pkgutil import get_data
+import pytest
 
 import brevitas.onnx as bo
 import numpy as np
 import onnx
 import onnx.numpy_helper as nph
+import os
 import torch
-import pytest
+from pkgutil import get_data
+
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
@@ -42,8 +43,8 @@ from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from finn.transformation.fold_constants import FoldConstants
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.transformation.infer_shapes import InferShapes
@@ -51,7 +52,6 @@ from finn.transformation.streamline import Streamline
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 from finn.util.test import get_test_model_trained
 
-
 export_onnx_path = "test_convert_to_hls_layers_fc.onnx"
 
 
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
index 86875d2ac7f37e697c5de198e15aa3045a9e3d42..6089901566cb412e63cd8acc7a8260081248ba52 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
@@ -26,42 +26,43 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import numpy as np
+import pytest
 
+import numpy as np
+import os
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import (
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
     SortGraph,
 )
-from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.util.basic import gen_finn_dt_tensor
-from finn.util.test import soft_verify_topk
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.insert_topk import InsertTopK
-import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.streamline.absorb import (
-    AbsorbScalarMulAddIntoTopK,
     AbsorbConsecutiveTransposes,
+    AbsorbScalarMulAddIntoTopK,
 )
 from finn.transformation.streamline.collapse_repeated import (
-    CollapseRepeatedMul,
     CollapseRepeatedAdd,
+    CollapseRepeatedMul,
 )
-from finn.transformation.streamline.reorder import MoveAddPastMul
-
-import pytest
+from finn.transformation.streamline.reorder import (
+    MoveAddPastMul,
+    MoveScalarLinearPastInvariants,
+)
+from finn.util.basic import gen_finn_dt_tensor
+from finn.util.test import soft_verify_topk
 
 export_onnx_path = "test_output_synthetic.onnx"
 
@@ -137,7 +138,7 @@ def make_model(ch, ifmdim):
 
 
 # data types
-@pytest.mark.parametrize("idt", [DataType.UINT2])
+@pytest.mark.parametrize("idt", [DataType["UINT2"]])
 # channels
 @pytest.mark.parametrize("ch", [16])
 # ifmdim
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
index e8f3c3ae3290b5bdc23e46f7e9991222fdfac000..3efafc040df07a7d56638bf5ce0b1ce01887343c 100644
--- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -28,23 +28,24 @@
 
 import pytest
 
-from onnx import TensorProto, helper
 import numpy as np
+from onnx import TensorProto, helper
+
 import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.custom_op.registry import getCustomOp
-from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.infer_shapes import InferShapes
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.util.basic import gen_finn_dt_tensor
 
 
 def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt):
@@ -117,9 +118,9 @@ def prepare_inputs(input_tensor):
 
 
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4, DataType.INT8])
+@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["INT4"], DataType["INT8"]])
 # output datatype
-@pytest.mark.parametrize("odt", [DataType.UINT4, DataType.INT4])
+@pytest.mark.parametrize("odt", [DataType["UINT4"], DataType["INT4"]])
 # pool configuration:                   ( k, stride, pad, ifm_dim )
 @pytest.mark.parametrize("pool_config", [(7, 7, 0, 7), (3, 2, 1, 5)])
 # input channels
diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py
index 3efeacb6e6875c6defa799eb7154e02ce880e16a..633db668d3bc5de815a313743c06cd74a7166c9c 100644
--- a/tests/fpgadataflow/test_depthwise_convolution.py
+++ b/tests/fpgadataflow/test_depthwise_convolution.py
@@ -27,30 +27,29 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+
+import numpy as np
 import onnx.helper as oh
 from onnx import TensorProto
-import numpy as np
 
-from finn.core.modelwrapper import ModelWrapper
+import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
-from finn.transformation.infer_shapes import InferShapes
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.general.im2col import compute_conv_output_dim
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.convert_to_hls_layers import (
     InferConvInpGen,
     InferVVAU,
 )
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-
-import finn.core.onnx_exec as oxe
-from finn.custom_op.general.im2col import compute_conv_output_dim
-from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
-from finn.custom_op.registry import getCustomOp
-
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.general import GiveUniqueNodeNames
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
 
 
 def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
@@ -61,14 +60,14 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
     ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, total_pad=total_pad)
 
     if act is None:
-        odt = DataType.INT32
+        odt = DataType["INT32"]
     else:
         odt = act
         out_act = oh.make_tensor_value_info(
             "out_act", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ofm_ch]
         )
         T = oh.make_tensor_value_info("T", TensorProto.FLOAT, [ofm_ch, 15])
-        tdt = DataType.INT32
+        tdt = DataType["INT32"]
         thresh_node = oh.make_node(
             "MultiThreshold",
             domain="finn.custom_op.general",
@@ -162,7 +161,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
 # PE
 @pytest.mark.parametrize("pe", [1, 2, 4])
 # Output activation
-@pytest.mark.parametrize("act", [None, DataType.UINT4])
+@pytest.mark.parametrize("act", [None, DataType["UINT4"]])
 # kernel size
 @pytest.mark.parametrize("k", [2, 4])
 # stride
@@ -172,7 +171,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding):
-    idt = wdt = DataType.INT4
+    idt = wdt = DataType["INT4"]
     ifm_dim = 6
     ifm_ch = 4
 
@@ -204,7 +203,7 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding):
 # PE
 @pytest.mark.parametrize("pe", [1, 2, 4])
 # Output activation
-@pytest.mark.parametrize("act", [None, DataType.UINT4])
+@pytest.mark.parametrize("act", [None, DataType["UINT4"]])
 # kernel size
 @pytest.mark.parametrize("k", [2, 4])
 # stride
@@ -214,7 +213,7 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding):
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding):
-    idt = wdt = DataType.INT4
+    idt = wdt = DataType["INT4"]
     ifm_dim = 6
     ifm_ch = 4
 
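
The reference model above derives ofm_dim from compute_conv_output_dim(ifm_dim, k, stride, total_pad=total_pad). A worked example of that arithmetic, assuming total_pad is the padding summed over both sides and the standard convolution output-size formula:

    # Assuming compute_conv_output_dim implements
    #   ofm_dim = (ifm_dim + total_pad - k) // stride + 1
    # with the test's ifm_dim=6, k=2, stride=1, padding=1:
    ifm_dim, k, stride, padding = 6, 2, 1, 1
    total_pad = 2 * padding  # padding applied on both sides
    ofm_dim = (ifm_dim + total_pad - k) // stride + 1
    assert ofm_dim == 7
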
diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
index 0fa156e23b4a01270297e4e8e1fdc13a75eb5a59..8cbf54ec188b12c67e02a33e3540718e9b08f382 100644
--- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
@@ -27,23 +27,23 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import numpy as np
 
+import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.util.basic import gen_finn_dt_tensor
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_addstreams_modelwrapper(ch, pe, idt):
@@ -62,7 +62,10 @@ def make_addstreams_modelwrapper(ch, pe, idt):
         inputDataType=idt.name,
     )
     graph = helper.make_graph(
-        nodes=[addstreams_node], name="graph", inputs=[inp1, inp2], outputs=[outp],
+        nodes=[addstreams_node],
+        name="graph",
+        inputs=[inp1, inp2],
+        outputs=[outp],
     )
 
     model = helper.make_model(graph, producer_name="addstreams-model")
@@ -79,7 +82,7 @@ def prepare_inputs(input1, input2):
 
 
 # data types
-@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.UINT8])
+@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
 # channels
 @pytest.mark.parametrize("ch", [1, 64])
 # folding
diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
index e45dfe07c3abc0ce218dee0563055acb4458ccd0..949046d4ae313b852471e7d8a93e44fea48f7b0f 100644
--- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
+++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
@@ -32,19 +32,19 @@ import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.util.basic import gen_finn_dt_tensor
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
@@ -85,11 +85,11 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
 
 
 # activation: None or DataType
-@pytest.mark.parametrize("act", [DataType.INT8])
+@pytest.mark.parametrize("act", [DataType["INT8"]])
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.INT4])
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
 # param datatype
-@pytest.mark.parametrize("pdt", [DataType.INT4])
+@pytest.mark.parametrize("pdt", [DataType["INT4"]])
 # folding, -1 is maximum possible
 @pytest.mark.parametrize("nf", [-1, 2])
 # number of input features
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 1ec12263e22a199ac7da55fdf3418185cd38e555..47cd7e7ba1df76cc793cd0946581239a6883874e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -27,25 +27,24 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import numpy as np
 
+import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
 
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-
 
 def make_single_im2col_modelwrapper(
     k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
@@ -132,7 +131,7 @@ def prepare_inputs(input_tensor):
 
 
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT2"]])
 # kernel size
 @pytest.mark.parametrize("k", [2, 3])
 # input dimension
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
index 6c83aab0d683cdb3888aca3c46bb339bd6330917..8440ac1fe46a0d1ea4db3d76489dfc4ce68ff642 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
@@ -27,26 +27,25 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import numpy as np
 
+import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.general.im2col import compute_conv_output_dim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
 
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-from finn.custom_op.general.im2col import compute_conv_output_dim
-
 
 def make_single_im2col_modelwrapper(
     k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
@@ -145,8 +144,8 @@ def prepare_inputs(input_tensor):
 
 
 # input datatype
-# @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT8])
-@pytest.mark.parametrize("idt", [DataType.INT8])
+# @pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT8"]])
+@pytest.mark.parametrize("idt", [DataType["INT8"]])
 # kernel size
 @pytest.mark.parametrize("k", [[4, 1]])
 # input dimension
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 12505fdf456aa55f881fb5f3d2d609080cc97074..73bf1165afa9418be0c89f77797de538275fd220 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -27,25 +27,25 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import numpy as np
 
+import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
 from finn.util.basic import gen_finn_dt_tensor
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_dupstreams_modelwrapper(ch, pe, idim, idt):
@@ -85,7 +85,7 @@ def prepare_inputs(input_tensor, idt):
 
 
 # data type
-@pytest.mark.parametrize("idt", [DataType.INT4, DataType.UINT16])
+@pytest.mark.parametrize("idt", [DataType["INT4"], DataType["UINT16"]])
 # channels
 @pytest.mark.parametrize("ch", [64])
 # folding
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py
index 34930e672f3ff9816d3328da102b1bc1daa8a3b1..248b591eb48d7cfd6f121738a9bca525c38a45f8 100644
--- a/tests/fpgadataflow/test_fpgadataflow_dwc.py
+++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py
@@ -30,15 +30,15 @@ import pytest
 
 from onnx import TensorProto, helper
 
+import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
-import finn.core.onnx_exec as oxe
 
 
 def make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype):
@@ -82,7 +82,7 @@ def prepare_inputs(input_tensor, dt):
 # outWidth
 @pytest.mark.parametrize("OUTWidth", [2, 4])
 # finn_dtype
-@pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"], DataType["INT2"]])
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype):
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 00f1ba5d59288b1a463fadbd684ff872269d6970..02c3a3dc9506152fe999873df0612e76a5c9cefd 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -31,22 +31,22 @@ import pytest
 import numpy as np
 from onnx import TensorProto, helper
 
-from finn.custom_op.registry import getCustomOp
 import finn.core.onnx_exec as oxe
 import finn.custom_op.general.xnorpopcount as xp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.general.multithreshold import multithreshold
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None):
@@ -59,11 +59,11 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non
     # StreamingFC:
     # - specify their datatypes as such
     # - specify their datatypes as BINARY and use binaryXnorMode
-    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
         # we'll internally convert weights/inputs to binary and specify the
         # datatypes as such, and also set the binaryXnorMode attribute to 1
-        export_wdt = DataType.BINARY
-        export_idt = DataType.BINARY
+        export_wdt = DataType["BINARY"]
+        export_idt = DataType["BINARY"]
         binary_xnor_mode = 1
     else:
         export_wdt = wdt
@@ -75,7 +75,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non
     if T is not None:
         no_act = 0
         node_inp_list = ["inp", "weights", "thresh"]
-        if odt == DataType.BIPOLAR:
+        if odt == DataType["BIPOLAR"]:
             actval = 0
         else:
             actval = odt.min()
@@ -123,7 +123,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non
 
 
 def prepare_inputs(input_tensor, idt, wdt):
-    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
         # convert bipolar to binary
         return {"inp": (input_tensor + 1) / 2}
     else:
@@ -133,11 +133,11 @@ def prepare_inputs(input_tensor, idt, wdt):
 # mem_mode: const, decoupled, or external
 @pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
+@pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT4])
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["INT4"]])
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT4])
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]])
 # neuron folding, -1 is maximum possible
 @pytest.mark.parametrize("nf", [-1, 2, 1])
 # synapse folding, -1 is maximum possible
@@ -165,10 +165,10 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
         # no activation, produce accumulators
         T = None
         tdt = None
-        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
-            odt = DataType.UINT32
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
         else:
-            odt = DataType.INT32
+            odt = DataType["INT32"]
     else:
         odt = act
         (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
@@ -177,13 +177,13 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
         # provide non-decreasing thresholds
         T = np.sort(T, axis=1)
         # generate thresholds for activation
-        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
-            tdt = DataType.UINT32
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
             # bias thresholds to be positive
             T = np.ceil((T + mw) / 2)
             assert (T >= 0).all()
         else:
-            tdt = DataType.INT32
+            tdt = DataType["INT32"]
     model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
     for node in model.graph.node:
         # lookup op_type in registry of CustomOps
@@ -194,14 +194,14 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     model = model.transform(CompileCppSim())
     # prepare input data
     input_dict = prepare_inputs(x, idt, wdt)
-    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
         # convert inputs to binary and use xnorpopcountmatmul
         y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
     else:
         y = np.matmul(x, W)
     if T is not None:
         y = multithreshold(y, T)
-        if act == DataType.BIPOLAR:
+        if act == DataType["BIPOLAR"]:
             # binary to bipolar
             y = 2 * y - 1
         else:
@@ -220,11 +220,11 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 # mem_mode: const, decoupled, or external
 @pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
+@pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT4])
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["INT4"]])
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT4])
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]])
 # neuron folding, -1 is maximum possible
 @pytest.mark.parametrize("nf", [-1, 2, 1])
 # synapse folding, -1 is maximum possible
@@ -252,10 +252,10 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
         # no activation, produce accumulators
         T = None
         tdt = None
-        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
-            odt = DataType.UINT32
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
         else:
-            odt = DataType.INT32
+            odt = DataType["INT32"]
     else:
         odt = act
         (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
@@ -264,13 +264,13 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
         # provide non-decreasing thresholds
         T = np.sort(T, axis=1)
         # generate thresholds for activation
-        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
-            tdt = DataType.UINT32
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
             # bias thresholds to be positive
             T = np.ceil((T + mw) / 2)
             assert (T >= 0).all()
         else:
-            tdt = DataType.INT32
+            tdt = DataType["INT32"]
     model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
     for node in model.graph.node:
         # lookup op_type in registry of CustomOps
@@ -279,14 +279,14 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 
     # prepare input data
     input_dict = prepare_inputs(x, idt, wdt)
-    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
         # convert inputs to binary and use xnorpopcountmatmul
         y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
     else:
         y = np.matmul(x, W)
     if T is not None:
         y = multithreshold(y, T)
-        if act == DataType.BIPOLAR:
+        if act == DataType["BIPOLAR"]:
             # binary to bipolar
             y = 2 * y - 1
         else:
@@ -319,11 +319,11 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 # mem_mode: decoupled
 @pytest.mark.parametrize("mem_mode", ["decoupled"])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [DataType.INT4])
+@pytest.mark.parametrize("act", [DataType["INT4"]])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType.INT4])
+@pytest.mark.parametrize("wdt", [DataType["INT4"]])
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.INT4])
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
 # neuron folding, -1 is maximum possible
 @pytest.mark.parametrize("nf", [-1])
 # synapse folding, -1 is maximum possible
@@ -352,10 +352,10 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
         # no activation, produce accumulators
         T = None
         tdt = None
-        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
-            odt = DataType.UINT32
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
         else:
-            odt = DataType.INT32
+            odt = DataType["INT32"]
     else:
         odt = act
         (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
@@ -364,13 +364,13 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
         # provide non-decreasing thresholds
         T = np.sort(T, axis=1)
         # generate thresholds for activation
-        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
-            tdt = DataType.UINT32
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
             # bias thresholds to be positive
             T = np.ceil((T + mw) / 2)
             assert (T >= 0).all()
         else:
-            tdt = DataType.INT32
+            tdt = DataType["INT32"]
     model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
     for node in model.graph.node:
         # lookup op_type in registry of CustomOps
@@ -379,14 +379,14 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
 
     # prepare input data
     input_dict = prepare_inputs(x, idt, wdt)
-    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
         # convert inputs to binary and use xnorpopcountmatmul
         y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
     else:
         y = np.matmul(x, W)
     if T is not None:
         y = multithreshold(y, T)
-        if act == DataType.BIPOLAR:
+        if act == DataType["BIPOLAR"]:
             # binary to bipolar
             y = 2 * y - 1
         else:
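
The fclayer tests above repeatedly map bipolar {-1, +1} operands to binary {0, 1} via (x + 1) / 2 and compute the reference result with xnorpopcountmatmul. A short worked sketch of the identity this relies on, assuming xnorpopcountmatmul returns the per-output XNOR match count:

    import numpy as np

    N = 8
    x_bip = np.random.choice([-1.0, 1.0], size=(1, N))  # bipolar activations
    w_bip = np.random.choice([-1.0, 1.0], size=(N, 1))  # bipolar weights
    x_bin = (x_bip + 1) / 2                             # {-1,+1} -> {0,1}
    w_bin = (w_bip + 1) / 2
    # positions where the binary operands agree == XNOR popcount
    matches = np.sum(x_bin.flatten() == w_bin.flatten())
    # bipolar dot product: +1 per match, -1 per mismatch
    assert np.dot(x_bip, w_bip).item() == 2 * matches - N

The same correspondence explains the post-threshold correction y = 2 * y - 1, which maps the binary threshold output back to the bipolar domain.
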
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index a603fc0664b78c00354514fbdff62c94aa7b7ef3..4d3074fe14617df4386f060b6a476734931fb4ca 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -27,19 +27,19 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import os
 
+import os
 from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
-import finn.core.onnx_exec as oxe
-
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 test_fpga_part = "xc7z020clg400-1"
@@ -86,7 +86,7 @@ def prepare_inputs(input_tensor, dt):
 # depth
 @pytest.mark.parametrize("depth", [16])
 # finn_dtype
-@pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR])  # , DataType.INT2])
+@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"]])  # , DataType["INT2"]])
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index ab47b300136bd95622f064b30e3bbaae76a61597..b564273c0927938859dc438dce619e7067a7ad74 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -27,26 +27,25 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import os
-import numpy as np
 
+import numpy as np
+import os
 from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.util.basic import gen_finn_dt_tensor
-import finn.core.onnx_exec as oxe
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-
-from finn.util.basic import pynq_part_map
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import gen_finn_dt_tensor, pynq_part_map
 
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
@@ -109,7 +108,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
 # PaddingStyle: selects behavior when (odim-idim)%2 != 0
 @pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
-@pytest.mark.parametrize("idt", [DataType.INT2, DataType.INT4])
+@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]])
 # execution mode
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
index 7fca91925a63a5da4294adb002a3cc97831a88ca..2299cc6e8f397df718d2fd65be8a562c2457e42d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
@@ -27,23 +27,23 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import numpy as np
 
+import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.util.basic import gen_finn_dt_tensor
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_accpool_modelwrapper(ch, pe, idim, idt):
@@ -78,7 +78,7 @@ def prepare_inputs(input_tensor, idt):
 
 
 # data type
-@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.UINT16])
+@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT16"]])
 # channels
 @pytest.mark.parametrize("ch", [64])
 # folding
@@ -127,7 +127,7 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode):
         exp_cycles_dict = model.analysis(exp_cycles_per_layer)
         exp_cycles = exp_cycles_dict[node.name]
         # commented out, needs performance debug:
-        # test_fpgadataflow_globalaccpool[rtlsim-7-1-64-DataType.UINT4]
+        # test_fpgadataflow_globalaccpool[rtlsim-7-1-64-DataType["UINT4"]]
         # assert False where False =
         # <function isclose at 0x7eff26d5ca60>(50, 103, atol=(0.1 * 103))
         # assert np.isclose(exp_cycles, cycles_rtlsim, atol=0.1 * cycles_rtlsim)
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 4fa780548a544d92e02b28486ae1e325ff1f9a9b..a4e75f5254b3bfd96871dbf32b8400edc2d55379 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -26,41 +26,38 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-
 import pytest
 
 import numpy as np
+import os
 from onnx import TensorProto, helper
 
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
 from finn.core.onnx_exec import execute_onnx
 from finn.custom_op.registry import getCustomOp
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.floorplan import Floorplan
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
-from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
+from finn.transformation.fpgadataflow.vitis_build import VitisBuild
 from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.util.basic import (
+    alveo_default_platform,
+    alveo_part_map,
     gen_finn_dt_tensor,
     pynq_part_map,
-    alveo_part_map,
-    alveo_default_platform,
 )
 from finn.util.pyverilator import pyverilate_stitched_ip
 from finn.util.test import load_test_checkpoint_or_skip
-from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
-from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
-from finn.transformation.fpgadataflow.floorplan import Floorplan
-from finn.transformation.fpgadataflow.vitis_build import VitisBuild
-from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
-
 
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
@@ -71,9 +68,9 @@ ip_stitch_model_dir = os.environ["FINN_BUILD_DIR"]
 def create_one_fc_model(mem_mode="const"):
     # create a model with a StreamingFCLayer instance with no activation
     # the wider range of the full accumulator makes debugging a bit easier
-    wdt = DataType.INT2
-    idt = DataType.INT32
-    odt = DataType.INT32
+    wdt = DataType["INT2"]
+    idt = DataType["INT32"]
+    odt = DataType["INT32"]
     m = 4
     no_act = 1
     binary_xnor_mode = 0
@@ -124,9 +121,9 @@ def create_one_fc_model(mem_mode="const"):
 
 def create_two_fc_model(mem_mode="decoupled"):
     # create a model with two StreamingFCLayer instances
-    wdt = DataType.INT2
-    idt = DataType.INT32
-    odt = DataType.INT32
+    wdt = DataType["INT2"]
+    idt = DataType["INT32"]
+    odt = DataType["INT32"]
     m = 4
     actval = 0
     no_act = 1
@@ -365,11 +362,6 @@ def test_fpgadataflow_ipstitch_zynqbuild(board):
         assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
         assert os.path.isfile(sdp_node.get_nodeattr("model"))
         model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
-    # generate inputs for remote exec
-    iname = "inp"
-    idt = model.get_tensor_datatype(iname)
-    ishape = model.get_tensor_shape(iname)
-    x = gen_finn_dt_tensor(idt, ishape)
     # bitfile using ZynqBuild
     model = model.transform(ZynqBuild(board, 10))
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_customzynq.onnx")
@@ -377,22 +369,3 @@ def test_fpgadataflow_ipstitch_zynqbuild(board):
     bitfile_name = model.get_metadata_prop("bitfile")
     assert bitfile_name is not None
     assert os.path.isfile(bitfile_name)
-    # deployment
-    try:
-        ip = os.environ["PYNQ_IP"]  # no default for this one; skip if not defined
-        if ip == "":
-            pytest.skip("PYNQ board IP address not specified")
-        username = os.getenv("PYNQ_USERNAME", "xilinx")
-        password = os.getenv("PYNQ_PASSWORD", "xilinx")
-        port = os.getenv("PYNQ_PORT", 22)
-        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
-        deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
-        assert deployment_dir is not None
-        assert os.path.isdir(deployment_dir)
-        # remote exec
-        input_dict = {"global_in": x}
-        outp = execute_onnx(model, input_dict)
-        assert np.isclose(outp["global_out"], x).all()
-    except KeyError:
-        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
index 5d496dbb33d21c9092fb2076cac75b3ccbbaa1e9..8ed06c8bdf1c0dbfab2f8141bf724132f4a24705 100644
--- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py
+++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
@@ -27,20 +27,20 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import numpy as np
 
+import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.util.basic import gen_finn_dt_tensor
 from finn.util.test import soft_verify_topk
 
@@ -61,7 +61,10 @@ def make_labelselect_modelwrapper(labels, pe, k, idt):
         inputDataType=idt.name,
     )
     graph = helper.make_graph(
-        nodes=[labelselect_node], name="graph", inputs=[inp], outputs=[outp],
+        nodes=[labelselect_node],
+        name="graph",
+        inputs=[inp],
+        outputs=[outp],
     )
 
     model = helper.make_model(graph, producer_name="thresholding-model")
@@ -78,7 +81,9 @@ def prepare_inputs(input_tensor, idt):
     return {"inp": input_tensor}
 
 
-@pytest.mark.parametrize("idt", [DataType.UINT8, DataType.UINT16, DataType.INT16])
+@pytest.mark.parametrize(
+    "idt", [DataType["UINT8"], DataType["UINT16"], DataType["INT16"]]
+)
 # labels
 @pytest.mark.parametrize("labels", [10, 100])
 # folding
diff --git a/tests/fpgadataflow/test_fpgadataflow_lookup.py b/tests/fpgadataflow/test_fpgadataflow_lookup.py
new file mode 100644
index 0000000000000000000000000000000000000000..45678bbdf22c21d794777aba27d9070b42238267
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_lookup.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import torch
+from brevitas.export import FINNManager
+from torch import nn
+
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.convert_to_hls_layers import InferLookupLayer
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import gen_finn_dt_tensor
+
+
+def make_lookup_model(embeddings, ishape, idt, edt):
+    num_embeddings, embedding_dim = embeddings.shape
+
+    class LookupModel(nn.Module):
+        def __init__(self, num_embeddings, embedding_dim):
+            super().__init__()
+            self.lookup = nn.Embedding(
+                num_embeddings=num_embeddings, embedding_dim=embedding_dim
+            )
+
+        def forward(self, x):
+            x = self.lookup(x)
+            return x
+
+    torch_model = LookupModel(num_embeddings, embedding_dim)
+    input_t = torch.zeros(ishape, dtype=torch.int64)
+    ret = FINNManager.export(torch_model, input_t=input_t, opset_version=11)
+    model = ModelWrapper(ret)
+    iname = model.graph.input[0].name
+    ename = model.graph.node[0].input[0]
+    model.set_tensor_datatype(iname, idt)
+    eshape = model.get_tensor_shape(ename)
+    assert tuple(eshape) == embeddings.shape
+    model.set_initializer(ename, embeddings)
+    model.set_tensor_datatype(ename, edt)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    return model
+
+
+# embedding DataType
+@pytest.mark.parametrize("edt", [DataType["FIXED<8,2>"]])
+# other embedding config
+@pytest.mark.parametrize(
+    "embedding_cfg", [(130, DataType["UINT8"], 25), (5145, DataType["UINT16"], 20)]
+)
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode):
+    ishape = (1, 10)
+    num_embeddings, idt, embedding_dim = embedding_cfg
+    eshape = (num_embeddings, embedding_dim)
+    exp_oshape = tuple(list(ishape) + [embedding_dim])
+    embeddings = gen_finn_dt_tensor(edt, eshape)
+    model = make_lookup_model(embeddings, ishape, idt, edt)
+    assert len(model.graph.node) == 1
+    assert model.graph.node[0].op_type == "Gather"
+    iname = model.graph.input[0].name
+    ename = model.graph.node[0].input[0]
+    oname = model.graph.output[0].name
+    assert model.get_tensor_datatype(iname) == idt
+    assert model.get_tensor_datatype(ename) == edt
+    assert model.get_tensor_datatype(oname) == edt
+    assert tuple(model.get_tensor_shape(ename)) == eshape
+    assert tuple(model.get_tensor_shape(oname)) == exp_oshape
+    assert (model.get_initializer(ename) == embeddings).all()
+    itensor = gen_finn_dt_tensor(idt, ishape).astype(np.int64)
+    itensor = np.clip(itensor, 0, num_embeddings - 1)
+    ret = execute_onnx(model, {iname: itensor})
+    exp_out = np.take(embeddings, itensor, axis=0)
+    assert (exp_out == ret[oname]).all()
+    # call transformation to convert to HLS and verify conversion
+    model = model.transform(InferLookupLayer())
+    assert model.graph.node[0].op_type == "Lookup"
+    assert model.graph.node[0].input[0] == iname
+    assert model.graph.node[0].input[1] == ename
+    assert model.graph.node[0].output[0] == oname
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 10))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(PrepareRTLSim())
+    ret_sim = execute_onnx(model, {iname: itensor})
+    assert (exp_out == ret_sim[oname]).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index 9def746c1c872a8b99b5bab48e8d0bd20798cedd..fe52a73fc07df8551442e975c5eb378c132a56d7 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -54,9 +54,9 @@ def test_res_estimate():
     mw = mh = 4
     simd = 1
     pe = 1
-    idt = DataType.INT2
-    wdt = DataType.INT2
-    odt = DataType.INT2
+    idt = DataType["INT2"]
+    wdt = DataType["INT2"]
+    odt = DataType["INT2"]
     actval = odt.min()
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, mw])
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index bbc7e8227d80fb9d064f484dafe91ecdcdc47144..341bd3f37041c9b5a1526e99b2c4bad4d3dd3029 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -29,27 +29,28 @@
 import pytest
 
 import numpy as np
+import os
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
+from finn.core.rtlsim_exec import rtlsim_exec
 from finn.custom_op.general.multithreshold import multithreshold
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.util.basic import gen_finn_dt_tensor
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-import os
 from finn.util.pyverilator import axilite_read, axilite_write
-from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
-from finn.core.rtlsim_exec import rtlsim_exec
 
 test_fpga_part = "xc7z020clg400-1"
 target_clk_ns = 5
@@ -97,9 +98,9 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode):
 
 
 # activation: None or DataType
-@pytest.mark.parametrize("act", [DataType.INT4, DataType.BIPOLAR])
+@pytest.mark.parametrize("act", [DataType["INT4"], DataType["BIPOLAR"]])
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.INT16, DataType.UINT16])
+@pytest.mark.parametrize("idt", [DataType["INT16"], DataType["UINT16"]])
 # folding, -1 is maximum possible
 @pytest.mark.parametrize("nf", [-1, 2, 1])
 # number of input features
@@ -124,12 +125,12 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
     T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
     # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
     # threshold of first channel is zero, while using BIPOLAR output)
-    if act == DataType.BIPOLAR:
+    if act == DataType["BIPOLAR"]:
         T[0][0] = 0
     # provide non-decreasing thresholds
     T = np.sort(T, axis=1)
 
-    if odt == DataType.BIPOLAR:
+    if odt == DataType["BIPOLAR"]:
         actval = 0
     else:
         actval = odt.min()
@@ -153,7 +154,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
     input_dict = {"inp": x}
 
     y = multithreshold(x, T)
-    if act == DataType.BIPOLAR:
+    if act == DataType["BIPOLAR"]:
         # binary to bipolar
         y = 2 * y - 1
     else:
@@ -185,8 +186,8 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
 @pytest.mark.vivado
 def test_runtime_thresholds_single_layer():
     mem_mode = "decoupled"
-    act = DataType.INT4
-    idt = DataType.INT16
+    act = DataType["INT4"]
+    idt = DataType["INT16"]
     nf = 8
     ich = 16
     pe = ich // nf
@@ -201,7 +202,7 @@ def test_runtime_thresholds_single_layer():
     # provide non-decreasing thresholds
     T = np.sort(T, axis=1)
 
-    if odt == DataType.BIPOLAR:
+    if odt == DataType["BIPOLAR"]:
         actval = 0
     else:
         actval = odt.min()
@@ -216,6 +217,7 @@ def test_runtime_thresholds_single_layer():
     old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n"))
     old_weight_stream = list(old_weight_stream)
     # need to create stitched IP for runtime weight testing
+    model = model.transform(InsertFIFO(True))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
@@ -243,7 +245,7 @@ def test_runtime_thresholds_single_layer():
     # old weights (see above)
     y = exec_ctx["outp"][1]
     expected = multithreshold(in_tensor, T)[1]
-    if act == DataType.BIPOLAR:
+    if act == DataType["BIPOLAR"]:
         # binary to bipolar
         expected = 2 * expected - 1
     else:
@@ -272,7 +274,7 @@ def test_runtime_thresholds_single_layer():
     rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
     y = exec_ctx["outp"][1]
     expected = multithreshold(in_tensor, new_weights)[1]
-    if act == DataType.BIPOLAR:
+    if act == DataType["BIPOLAR"]:
         # binary to bipolar
         expected = 2 * expected - 1
     else:
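The recurring change in this file (and throughout the rest of the patch), `DataType.INT4` to `DataType["INT4"]`, swaps attribute access for subscript access. One practical benefit of the subscript form is that names are looked up at runtime, so a datatype can be built from a string (for example one read out of an ONNX attribute). A minimal sketch of the difference, using a hypothetical stand-in enum rather than FINN's actual DataType container:

```python
from enum import Enum


# Hypothetical stand-in, used only to illustrate the two lookup styles.
class MiniDataType(Enum):
    INT4 = 4
    BIPOLAR = 1


# Attribute access requires the name to be hardcoded at write time:
a = MiniDataType.INT4
# Subscript access accepts any runtime string, e.g. from an ONNX attribute:
name = "INT4"
b = MiniDataType[name]
assert a is b
```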
diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1709cfe32904a5ed369f8399150a8a1d05f4b781
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import os
+import torch
+from brevitas.export import FINNManager
+from torch import nn
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.streamline.absorb as absorb
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.base import Transformation
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.convert_to_hls_layers import InferUpsample
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.make_input_chanlast import MakeInputChannelsLast
+
+tmpdir = os.environ["FINN_BUILD_DIR"]
+
+
+class ForceDataTypeForTensors(Transformation):
+    """
+    Forces a certain datatype for all tensors in a model.
+    """
+
+    def __init__(self, dType=DataType["INT8"]):
+        super().__init__()
+        self._dType = dType
+
+    def apply(self, model):
+        graph = model.graph
+        for n in graph.node:
+            for inp in n.input:
+                model.set_tensor_datatype(inp, self._dType)
+            for out in n.output:
+                model.set_tensor_datatype(out, self._dType)
+
+        return model, False
+
+
+_to_chan_last_args = (0, 2, 3, 1)
+_to_chan_first_args = (0, 3, 1, 2)
+
+
+class TransposeUpsampleIO(Transformation):
+    """
+    Converts the inputs and outputs of all Upsample and Resize nodes
+    from NCHW to NHWC.
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        for n in graph.node:
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                # Set input shape
+                inp = n.input[0]
+                NCHW_shape = model.get_tensor_shape(inp)
+                NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args]
+                model.set_tensor_shape(inp, NHWC_shape)
+                # Set output shape
+                out = n.output[0]
+                NCHW_shape = model.get_tensor_shape(out)
+                NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args]
+                model.set_tensor_shape(out, NHWC_shape)
+        return model, False
+
+
+class PyTorchTestModel(nn.Module):
+    def __init__(self, upscale_factor=2):
+        super(PyTorchTestModel, self).__init__()
+        self.m = nn.Upsample(
+            scale_factor=upscale_factor,
+            mode="nearest",
+        )
+
+    def forward(self, x):
+        x = self.m(x)
+        return x
+
+
+# input datatype
+@pytest.mark.parametrize("dt", [DataType["INT8"]])
+# Width/height of square input feature map
+@pytest.mark.parametrize("IFMDim", [3, 5])
+# upscaling factor
+@pytest.mark.parametrize("scale", [2, 3])
+# Number of input/output channels
+@pytest.mark.parametrize("NumChannels", [4])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode):
+    atol = 1e-3
+    # Create the test model and inputs for it
+    torch_model = PyTorchTestModel(upscale_factor=scale)
+    input_shape = (1, NumChannels, IFMDim, IFMDim)
+    test_in = torch.arange(0, np.prod(np.asarray(input_shape)))
+    # Limit the input to values valid for the given datatype
+    test_in %= dt.max() - dt.min() + 1
+    test_in += dt.min()
+    # Additionally, roll the tensor so it always starts with 0, for convenience.
+    test_in = torch.roll(test_in, dt.min())
+    test_in = test_in.view(*input_shape).type(torch.float32)
+
+    # Get golden PyTorch and ONNX outputs
+    golden_torch_float = torch_model(test_in)
+    export_path = f"{tmpdir}/Upsample_exported.onnx"
+    FINNManager.export(
+        torch_model, input_shape=input_shape, export_path=export_path, opset_version=11
+    )
+    model = ModelWrapper(export_path)
+    input_dict = {model.graph.input[0].name: test_in.numpy()}
+    golden_output_dict = oxe.execute_onnx(model, input_dict, True)
+    golden_result = golden_output_dict[model.graph.output[0].name]
+
+    # Make sure PyTorch and ONNX match
+    pyTorch_onnx_match = np.isclose(golden_result, golden_torch_float).all()
+    assert pyTorch_onnx_match, "ONNX and PyTorch upsampling output don't match."
+
+    # Prep model for execution
+    model = ModelWrapper(export_path)
+    # model = model.transform(TransposeUpsampleIO())
+    model = model.transform(MakeInputChannelsLast())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(absorb.AbsorbTransposeIntoResize())
+    model = model.transform(InferShapes())
+    model = model.transform(ForceDataTypeForTensors(dType=dt))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(InferUpsample())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    # Check that all nodes are UpsampleNearestNeighbour_Batch nodes
+    for n in model.get_finn_nodes():
+        node_check = n.op_type == "UpsampleNearestNeighbour_Batch"
+        assert node_check, "All nodes should be UpsampleNearestNeighbour_Batch nodes."
+
+    # Prep sim
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 10))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # Run sim
+    test_in_transposed = test_in.numpy().transpose(_to_chan_last_args)
+    input_dict = {model.graph.input[0].name: test_in_transposed}
+    output_dict = oxe.execute_onnx(model, input_dict, True)
+    test_result = output_dict[model.graph.output[0].name]
+    output_matches = np.isclose(golden_result, test_result, atol=atol).all()
+
+    if exec_mode == "cppsim":
+        assert output_matches, "Cppsim output doesn't match ONNX/PyTorch."
+    elif exec_mode == "rtlsim":
+        assert output_matches, "Rtlsim output doesn't match ONNX/PyTorch."
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
index 4756d4fe18ccd4934b4041c70bf2f3a1bb577ec7..6f39994bf27594a063a1e66c5bba7867eaabef6e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -32,20 +32,19 @@ import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.util.basic import gen_finn_dt_tensor
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.general.multithreshold import multithreshold
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.custom_op.general.multithreshold import multithreshold
-
-from finn.custom_op.registry import getCustomOp
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.util.basic import gen_finn_dt_tensor
 
 
 def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels):
@@ -142,11 +141,11 @@ def prepare_inputs(input_tensor):
 
 
 # mem_mode: const or decoupled
-@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.UINT8])
+@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType.INT4])
+@pytest.mark.parametrize("wdt", [DataType["INT4"]])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [DataType.UINT4, None])
+@pytest.mark.parametrize("act", [DataType["UINT4"], None])
 # PE
 @pytest.mark.parametrize("pe", [1, "channels"])
 # Input image shape
@@ -188,14 +187,14 @@ def test_fpgadataflow_vvau(
     if act is None:
         T = None
         tdt = None
-        odt = DataType.INT32
+        odt = DataType["INT32"]
     else:
         odt = act
         (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels)
         n_steps = act.get_num_possible_values() - 1
         T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
         T = np.sort(T, axis=1)
-        tdt = DataType.INT32
+        tdt = DataType["INT32"]
 
     model = _make_single_vvau_modelwrapper(
         W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt
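For context on the threshold placement in this test: `_calculate_dot_prod_range` (defined earlier in the file, outside this hunk) bounds the accumulator of the dot product, presumably along the lines sketched below. Treat the exact formula as an assumption and check the helper itself for the authoritative version.

```python
# Sketch under stated assumptions: each of the n accumulated terms is a
# product of an input in [i_min, i_max] and a weight in [w_min, w_max],
# so the accumulator is bounded by n times the extreme products.
def dot_prod_range(i_min, i_max, w_min, w_max, n):
    corners = [i_min * w_min, i_min * w_max, i_max * w_min, i_max * w_max]
    return n * min(corners), n * max(corners)


# e.g. UINT4 inputs (0..15), INT4 weights (-8..7), a 3x3 kernel on 2 channels:
assert dot_prod_range(0, 15, -8, 7, 3 * 3 * 2) == (-2160, 1890)
```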
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index ff88536f477e80e5c92a2c352f0af81488997c7f..236eb2a0342a2782f106761f4cd356888a2f8630 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -28,31 +28,34 @@
 
 import pytest
 
+import numpy as np
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
-from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-from finn.custom_op.registry import getCustomOp
-import numpy as np
 
 
 def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    ofm_dim_h, ofm_dim_w = ofm_dim
     odt = idt
     inp = helper.make_tensor_value_info(
-        "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
     )
     outp = helper.make_tensor_value_info(
-        "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ifm_ch]
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
     )
 
     mp_node = helper.make_node(
@@ -60,8 +63,8 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
         ["inp"],
         ["outp"],
         domain="finn.custom_op.general",
-        kernel_shape=[k, k],
-        strides=[k, k],
+        kernel_shape=[k_h, k_w],
+        strides=[k_h, k_w],
         pads=[0, 0, 0, 0],
     )
     graph = helper.make_graph(
@@ -78,12 +81,15 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
 
 
 def make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    ofm_dim_h, ofm_dim_w = ofm_dim
     odt = idt
     inp = helper.make_tensor_value_info(
-        "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
     )
     outp = helper.make_tensor_value_info(
-        "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ifm_ch]
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
     )
 
     smp_node = helper.make_node(
@@ -92,9 +98,9 @@ def make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        PoolDim=k,
+        PoolDim=[k_h, k_w],
         NumChannels=ifm_ch,
-        ImgDim=ifm_dim,
+        ImgDim=[ifm_dim_h, ifm_dim_w],
         dataType=idt.name,
     )
     graph = helper.make_graph(
@@ -115,24 +121,42 @@ def prepare_inputs(input_tensor):
 
 
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]])
+# 1d maxpool
+@pytest.mark.parametrize("dim_1d", [False, True])
 # kernel size
 @pytest.mark.parametrize("k", [2, 4])
 # input dimension
-@pytest.mark.parametrize("ifm_dim", [4, 6, 8])
+@pytest.mark.parametrize("ifm_dim", [4, 8])
 # input channels
-@pytest.mark.parametrize("ifm_ch", [1, 2])  # , 2, 3, 4])
+@pytest.mark.parametrize("ifm_ch", [1, 3])  # 1,3
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["rtlsim", "cppsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode):
-    stride = k
-    ofm_dim = int(((ifm_dim - k) / stride) + 1)
-    if ifm_dim % k != 0:
+def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, exec_mode):
+    ifm_dim_h = ifm_dim
+    k_h = k
+    if dim_1d:
+        ifm_dim_w = 1
+        k_w = 1
+    else:
+        ifm_dim_w = ifm_dim_h
+        k_w = k_h
+    ifm_dim = (ifm_dim_h, ifm_dim_w)
+    k = (k_h, k_w)
+
+    stride_h = k_h
+    stride_w = k_w
+    ofm_dim_h = int(((ifm_dim_h - k_h) / stride_h) + 1)
+    ofm_dim_w = int(((ifm_dim_w - k_w) / stride_w) + 1)
+    ofm_dim = (ofm_dim_h, ofm_dim_w)
+    if idt == DataType["BIPOLAR"] and dim_1d:
+        pytest.skip("Skipping binary StreamingMaxPool_1d (not implemented)")
+    if ifm_dim_h % k_h != 0 or ifm_dim_w % k_w != 0:
         pytest.skip("Skipping StreamingMaxPool test w/ ImgDim % PoolDim != 0")
 
-    x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
+    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
     # prepare input data
     input_dict = prepare_inputs(x)
 
@@ -152,7 +176,7 @@ def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode):
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
     else:
-        raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")
+        raise Exception("Unknown exec_mode in test_layer_streaming_maxpool_batch")
 
     # execute model
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
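The pooling geometry in this test is the non-overlapping case, stride equal to kernel with no padding, so the output size is `(ifm - k) / k + 1 = ifm / k`. That is also why the test skips whenever `ifm_dim % k != 0`: the tiles would not cover the feature map exactly. A tiny worked check, including the 1D case where the width axis collapses to `k_w = 1`:

```python
def pool_out_dim(ifm_dim, k, stride):
    # valid (unpadded) pooling output size
    return (ifm_dim - k) // stride + 1


# square 2D case: an 8x8 input with a 2x2 pool gives a 4x4 output
assert pool_out_dim(8, 2, 2) == 4
# 1D case from dim_1d=True: width and k_w are both 1, only height is pooled
assert pool_out_dim(1, 1, 1) == 1
assert pool_out_dim(8, 4, 4) == 2
# a non-dividing case the test skips: 6 % 4 != 0, so 4-wide tiles cannot
# cover a 6-wide map exactly
assert pool_out_dim(6, 4, 4) == 1
```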
diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py
index c487824964400cacbde575da2c10757985ad6e32..0196a78d5c4254d7cb116641f946bcccb9e1ebc9 100644
--- a/tests/fpgadataflow/test_runtime_weights.py
+++ b/tests/fpgadataflow/test_runtime_weights.py
@@ -26,20 +26,22 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from finn.util.create import hls_random_mlp_maker
+import pytest
+
+import numpy as np
+import os
+
 from finn.core.datatype import DataType
-from finn.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.custom_op.registry import getCustomOp
 from finn.core.rtlsim_exec import rtlsim_exec
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
-from finn.util.pyverilator import axilite_write, axilite_read
-import numpy as np
-import pytest
-import os
+from finn.util.create import hls_random_mlp_maker
+from finn.util.pyverilator import axilite_read, axilite_write
 
 test_fpga_part = "xc7z020clg400-1"
 target_clk_ns = 5
@@ -47,8 +49,8 @@ target_clk_ns = 5
 
 @pytest.mark.vivado
 def test_runtime_weights_single_layer():
-    idt = DataType.UINT32
-    wdt = DataType.UINT4
+    idt = DataType["UINT32"]
+    wdt = DataType["UINT4"]
     act = None
     mw = 64
     mh = 32
@@ -76,11 +78,11 @@ def test_runtime_weights_single_layer():
     os.remove("old_weights.dat")
     old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n"))
     old_weight_stream = list(old_weight_stream)
+    model = model.transform(InsertFIFO(True))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
-    model = model.transform(PrepareRTLSim())
     model.set_metadata_prop("exec_mode", "rtlsim")
     in_tensor = np.asarray(range(mw), dtype=np.float32)
     # add two copies of the input tensor as the first one is just used to
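Two related changes here: `InsertFIFO(True)` is now applied before stitched-IP generation (stitched-IP rtlsim generally expects FIFOs on the streaming interfaces between layers), and the per-node `PrepareRTLSim()` is dropped because execution now goes through the stitched design via `rtlsim_exec`. Below is a minimal sketch of the runtime-weights pattern this test exercises; the `axilite_write(sim, address, value)` signature and the word-aligned addressing are assumptions, so check `finn.util.pyverilator` for the real API.

```python
from finn.core.rtlsim_exec import rtlsim_exec
from finn.util.pyverilator import axilite_write


def make_weight_writer(weight_stream):
    def write_weights(sim):
        # push one weight word per AXI-lite write, before the input
        # tensor is streamed through the stitched design
        for i, word in enumerate(weight_stream):
            axilite_write(sim, 4 * i, word)

    return write_weights


# usage, with model / exec_ctx / weight_stream as in the surrounding test:
# rtlsim_exec(model, exec_ctx, pre_hook=make_weight_writer(weight_stream))
```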
diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py
index fe3a1db8a476e33bfc0d76996917fab9ae6ed98b..66fd5b43a1b8b8c8986bf9c9b9d0e9efd7a744a6 100644
--- a/tests/fpgadataflow/test_set_folding.py
+++ b/tests/fpgadataflow/test_set_folding.py
@@ -27,18 +27,19 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+
 import numpy as np
 from onnx import TensorProto, helper
 
-from finn.custom_op.registry import getCustomOp
-from finn.core.datatype import DataType
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.fpgadataflow.set_folding import SetFolding
-from finn.transformation.general import GiveUniqueNodeNames
+from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
+from finn.transformation.fpgadataflow.set_folding import SetFolding
+from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.test import load_test_checkpoint_or_skip
 
 
@@ -97,7 +98,8 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes):
     model.set_tensor_datatype("outp", adt)
 
     for i in range(1, nnodes + 1):
-        model.graph.value_info.append(tensors[i])
+        if tensors[i].name != "outp":
+            model.graph.value_info.append(tensors[i])
         model.set_initializer("weights_" + str(i - 1), W)
         model.set_initializer("thresh_" + str(i - 1), T)
         model.set_tensor_datatype("weights_" + str(i - 1), wdt)
@@ -113,7 +115,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes):
 def test_set_folding(target_fps, platform):
 
     model = make_multi_fclayer_model(
-        128, DataType.INT4, DataType.INT2, DataType.INT16, 5
+        128, DataType["INT4"], DataType["INT2"], DataType["INT16"], 5
     )
 
     model = model.transform(GiveUniqueNodeNames())
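On the guard added to `make_multi_fclayer_model`: the last tensor is already declared as the graph output `outp`, and appending it to `value_info` as well leaves the name doubly declared, which ONNX checkers and shape inference can reject. A minimal illustration of the rule:

```python
from onnx import TensorProto, helper

outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 128])
graph = helper.make_graph(nodes=[], name="g", inputs=[], outputs=[outp])
# "outp" is already declared via graph.outputs; re-declaring it in
# value_info (the line below) is exactly what the guard avoids.
# graph.value_info.append(outp)
```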
diff --git a/tests/transformation/streamline/test_absorb_mul_into_topk.py b/tests/transformation/streamline/test_absorb_mul_into_topk.py
index d0a089f9e5f894a5da635672eb58af1d8ddef3ef..bc9a31d49c7edfc20ca3e932efd00df939f1135f 100644
--- a/tests/transformation/streamline/test_absorb_mul_into_topk.py
+++ b/tests/transformation/streamline/test_absorb_mul_into_topk.py
@@ -30,13 +30,14 @@ import pytest
 import numpy as np
 from onnx import TensorProto, helper
 
+import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.insert_topk import InsertTopK
 from finn.transformation.streamline.absorb import AbsorbScalarMulAddIntoTopK
-import finn.core.onnx_exec as oxe
+
 
 # parameter to indicate if mul parameter is negative or positive
 @pytest.mark.parametrize("mul_positive", [True, False])
diff --git a/tests/transformation/streamline/test_absorb_transp_into_flatten.py b/tests/transformation/streamline/test_absorb_transp_into_flatten.py
index cbbb33b4606acf55ace662da0986105f8c456b39..1e5d5fe5806d2e3f418438b260d2257f5ae31adf 100644
--- a/tests/transformation/streamline/test_absorb_transp_into_flatten.py
+++ b/tests/transformation/streamline/test_absorb_transp_into_flatten.py
@@ -3,14 +3,15 @@ import pytest
 import numpy as np
 from onnx import TensorProto, helper
 
-from finn.core.modelwrapper import ModelWrapper
 import finn.core.data_layout as DataLayout
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline.absorb import AbsorbTransposeIntoFlatten
-import finn.core.onnx_exec as oxe
+
 
 # permutation of transpose node
 @pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]])
diff --git a/tests/transformation/streamline/test_collapse_repeated_op.py b/tests/transformation/streamline/test_collapse_repeated_op.py
index b74d868f9b921c35ff9f596c811583f45f761374..1741ab6b8f4fc1c3e806a8868f329cd7753eac4d 100644
--- a/tests/transformation/streamline/test_collapse_repeated_op.py
+++ b/tests/transformation/streamline/test_collapse_repeated_op.py
@@ -26,6 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import pytest
+
 import numpy as np
 import onnx.helper as oh
 from onnx import TensorProto
@@ -34,7 +36,6 @@ import finn.core.onnx_exec as ox
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import CollapseRepeatedAdd, CollapseRepeatedMul
-import pytest
 
 
 def test_collapse_repeated_op():
@@ -74,7 +75,8 @@ def test_collapse_repeated_op():
 
 
 @pytest.mark.parametrize(
-    "test_args", [("Add", CollapseRepeatedAdd()), ("Mul", CollapseRepeatedMul())],
+    "test_args",
+    [("Add", CollapseRepeatedAdd()), ("Mul", CollapseRepeatedMul())],
 )
 def test_collapse_repeated_only_if_linear(test_args):
     scalar_op = test_args[0]
diff --git a/tests/transformation/streamline/test_linear_past_eltwise.py b/tests/transformation/streamline/test_linear_past_eltwise.py
index f5af2307fb042879a837a26c50715c8ec1b96963..098b3f9d4f67a2cbc1a87fbb67a313d00e229777 100644
--- a/tests/transformation/streamline/test_linear_past_eltwise.py
+++ b/tests/transformation/streamline/test_linear_past_eltwise.py
@@ -26,19 +26,18 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import numpy as np
+import pytest
 
+import numpy as np
+import os
 from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
-from finn.transformation.streamline.reorder import MoveLinearPastEltwiseAdd
 from finn.transformation.infer_shapes import InferShapes
-
-import pytest
+from finn.transformation.streamline.reorder import MoveLinearPastEltwiseAdd
 
 export_onnx_path = "test_linear_past_eltwise.onnx"
 
diff --git a/tests/transformation/streamline/test_move_chw_add_past_conv.py b/tests/transformation/streamline/test_move_chw_add_past_conv.py
index fc64a04e40036eae7057c15f4e628155bd563e51..e4be8fc3836f18bf95eb193516937c2e9334e2ff 100644
--- a/tests/transformation/streamline/test_move_chw_add_past_conv.py
+++ b/tests/transformation/streamline/test_move_chw_add_past_conv.py
@@ -29,13 +29,13 @@
 import pytest
 
 import numpy as np
-from onnx import helper, TensorProto
+from onnx import TensorProto, helper
 
+import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline.reorder import MoveAddPastConv
-from finn.custom_op.general.im2col import compute_conv_output_dim
-import finn.core.onnx_exec as oxe
 
 
 # input dimension
diff --git a/tests/transformation/streamline/test_move_flatten_past_affine.py b/tests/transformation/streamline/test_move_flatten_past_affine.py
index b2d5e51613d41f3f2db3dabcef7b982ec2816b19..ef01436dc9435676b562e2b635a8cf12e901046b 100644
--- a/tests/transformation/streamline/test_move_flatten_past_affine.py
+++ b/tests/transformation/streamline/test_move_flatten_past_affine.py
@@ -30,16 +30,17 @@ import pytest
 import numpy as np
 from onnx import TensorProto, helper
 
-from finn.core.modelwrapper import ModelWrapper
-from finn.core.datatype import DataType
 import finn.core.data_layout as DataLayout
-from finn.util.basic import gen_finn_dt_tensor
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline.reorder import MoveFlattenPastAffine
-import finn.core.onnx_exec as oxe
+from finn.util.basic import gen_finn_dt_tensor
+
 
 # data layout
 @pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW])
@@ -76,14 +77,14 @@ def test_move_flatten_past_affine(data_layout, batch_size):
     model = ModelWrapper(model)
 
     # initialize values
-    a0_values = gen_finn_dt_tensor(DataType.TERNARY, [1024, 1000])
+    a0_values = gen_finn_dt_tensor(DataType["TERNARY"], [1024, 1000])
     model.set_initializer("a0", a0_values)
     a1_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32)
     model.set_initializer("a1", a1_values)
     a2_values = np.random.uniform(low=-1, high=1, size=(1000)).astype(np.float32)
     model.set_initializer("a2", a2_values)
 
-    model.set_tensor_datatype("inp", DataType.INT2)
+    model.set_tensor_datatype("inp", DataType["INT2"])
     model.set_tensor_layout("inp", data_layout)
     model = model.transform(InferShapes())
     model = model.transform(InferDataTypes())
@@ -92,7 +93,7 @@ def test_move_flatten_past_affine(data_layout, batch_size):
     model = model.transform(GiveReadableTensorNames())
 
     # compare execution before and after transformation
-    inp_values = gen_finn_dt_tensor(DataType.INT2, ishape)
+    inp_values = gen_finn_dt_tensor(DataType["INT2"], ishape)
     idict = {model.graph.input[0].name: inp_values}
     model_transformed = model.transform(MoveFlattenPastAffine())
     assert oxe.compare_execution(model, model_transformed, idict)
diff --git a/tests/transformation/streamline/test_move_flatten_past_topk.py b/tests/transformation/streamline/test_move_flatten_past_topk.py
index 65da92c22dbe9f6b1c5a49172ffae59fa6e98607..6086f7804eda4447de8f5948f521f0b003f65020 100644
--- a/tests/transformation/streamline/test_move_flatten_past_topk.py
+++ b/tests/transformation/streamline/test_move_flatten_past_topk.py
@@ -29,17 +29,18 @@ import pytest
 
 from onnx import TensorProto, helper
 
-from finn.core.modelwrapper import ModelWrapper
-from finn.core.datatype import DataType
 import finn.core.data_layout as DataLayout
-from finn.util.basic import gen_finn_dt_tensor
-from finn.transformation.insert_topk import InsertTopK
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.insert_topk import InsertTopK
 from finn.transformation.streamline.reorder import MoveFlattenPastTopK
-import finn.core.onnx_exec as oxe
+from finn.util.basic import gen_finn_dt_tensor
+
 
 # data layout
 @pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW])
@@ -59,13 +60,16 @@ def test_move_flatten_past_affine(data_layout, batch_size):
     flatten_node = helper.make_node("Flatten", ["inp"], ["outp"])
 
     graph = helper.make_graph(
-        nodes=[flatten_node], name="move-flatten-graph", inputs=[inp], outputs=[outp],
+        nodes=[flatten_node],
+        name="move-flatten-graph",
+        inputs=[inp],
+        outputs=[outp],
     )
 
     model = helper.make_model(graph, producer_name="move_flatten_model")
     model = ModelWrapper(model)
 
-    model.set_tensor_datatype("inp", DataType.INT2)
+    model.set_tensor_datatype("inp", DataType["INT2"])
     model.set_tensor_layout("inp", data_layout)
     model = model.transform(InsertTopK())
     model = model.transform(InferShapes())
@@ -75,7 +79,7 @@ def test_move_flatten_past_affine(data_layout, batch_size):
     model = model.transform(GiveReadableTensorNames())
 
     # compare execution before and after transformation
-    inp_values = gen_finn_dt_tensor(DataType.INT2, ishape)
+    inp_values = gen_finn_dt_tensor(DataType["INT2"], ishape)
     idict = {model.graph.input[0].name: inp_values}
     model_transformed = model.transform(MoveFlattenPastTopK())
     assert oxe.compare_execution(model, model_transformed, idict)
diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_op.py b/tests/transformation/streamline/test_move_identical_op_past_join_op.py
index 94eb52835b1800a839e5a9792e9cf1d7be1e681d..60e76b8b07e06048ecf1a15c72134fecf5c97346 100644
--- a/tests/transformation/streamline/test_move_identical_op_past_join_op.py
+++ b/tests/transformation/streamline/test_move_identical_op_past_join_op.py
@@ -1,12 +1,12 @@
 import pytest
 
-from onnx import helper as oh
 from onnx import TensorProto
+from onnx import helper as oh
 
+import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.streamline.reorder import MoveTransposePastJoinAdd
 from finn.util.basic import gen_finn_dt_tensor
-import finn.core.onnx_exec as oxe
 
 
 def create_model(perm):
diff --git a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
index 7c49baf8cd9d5b85b3b76f3513d42483d3bbeb0c..fca05afa5b155e6a293857c14c10c4a9b80eeaf4 100644
--- a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
+++ b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
@@ -1,11 +1,11 @@
-from onnx import TensorProto, helper
 import numpy as np
+from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.streamline.reorder import MoveMaxPoolPastMultiThreshold
-from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline.reorder import MoveMaxPoolPastMultiThreshold
 
 
 def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
diff --git a/tests/transformation/streamline/test_move_mul_past_dw_conv.py b/tests/transformation/streamline/test_move_mul_past_dw_conv.py
index ce0cbcd0405f8a09efabbadd5555de1bd6b89e43..e9e956d845ef8e56d2078bcd738ad3bb0ff72bfa 100644
--- a/tests/transformation/streamline/test_move_mul_past_dw_conv.py
+++ b/tests/transformation/streamline/test_move_mul_past_dw_conv.py
@@ -1,14 +1,15 @@
 import pytest
 
-from onnx import helper, TensorProto
-from finn.custom_op.general.im2col import compute_conv_output_dim
+from onnx import TensorProto, helper
+
 import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
-from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.streamline.reorder import MoveMulPastDWConv
+from finn.util.basic import gen_finn_dt_tensor
 
 
 # input dimension
@@ -67,9 +68,9 @@ def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw):
 
     model = helper.make_model(graph, producer_name="mulpastconv-model")
     model = ModelWrapper(model)
-    inp_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, ifm_dim, ifm_dim])
-    mul_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, 1, 1])
-    W_values = gen_finn_dt_tensor(DataType.INT2, W_shape)
+    inp_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, ifm_dim, ifm_dim])
+    mul_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, 1, 1])
+    W_values = gen_finn_dt_tensor(DataType["INT2"], W_shape)
     model.set_initializer("W", W_values)
     model.set_initializer("mul", mul_values)
     model = model.transform(InferShapes())
diff --git a/tests/transformation/streamline/test_move_mul_past_maxpool.py b/tests/transformation/streamline/test_move_mul_past_maxpool.py
index f612841020e373a3c6458ee3e9a6eb14fcea7eb5..2c51aaf36a79591fd0fd0cea368d5e23da0d07c3 100755
--- a/tests/transformation/streamline/test_move_mul_past_maxpool.py
+++ b/tests/transformation/streamline/test_move_mul_past_maxpool.py
@@ -1,15 +1,16 @@
-import numpy as np
 import pytest
 
-from onnx import helper, TensorProto
-from finn.custom_op.general.maxpoolnhwc import compute_pool_output_dim
+import numpy as np
+from onnx import TensorProto, helper
+
 import finn.core.onnx_exec as oxe
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.general.maxpoolnhwc import compute_pool_output_dim
 from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
-from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.streamline.reorder import MoveMulPastMaxPool
+from finn.util.basic import gen_finn_dt_tensor
 
 
 # input dimension
@@ -65,7 +66,7 @@ def test_move_mul_past_maxpool(ifm_dim, ifm_ch, k, stride, pad, cw, negative):
 
     model = helper.make_model(graph, producer_name="mulpastmaxpool-model")
     model = ModelWrapper(model)
-    inp_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, ifm_dim, ifm_dim])
+    inp_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, ifm_dim, ifm_dim])
     mul_values = np.random.random_sample(mul_shape).astype(np.float32)
     if negative == 1:
         mul_values = mul_values * (-1)
diff --git a/tests/transformation/streamline/test_move_past_fork.py b/tests/transformation/streamline/test_move_past_fork.py
index f3d37bd60c9e2580ca4499daafa8693f39fec810..364590f933ac27539fd546d64e25325032c885c9 100644
--- a/tests/transformation/streamline/test_move_past_fork.py
+++ b/tests/transformation/streamline/test_move_past_fork.py
@@ -1,12 +1,12 @@
-from onnx import TensorProto, helper
+import pytest
+
 import numpy as np
+from onnx import TensorProto, helper
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.streamline.reorder import MoveLinearPastFork
 from finn.transformation.infer_shapes import InferShapes
-
-import pytest
+from finn.transformation.streamline.reorder import MoveLinearPastFork
 
 
 @pytest.mark.parametrize("ch", [64, 1])
diff --git a/tests/transformation/streamline/test_move_scalar_past_conv.py b/tests/transformation/streamline/test_move_scalar_past_conv.py
index 94fee7907d1ed1cccbf95520e903c7d9b43d8f7d..5e2ded0174e9aa7a02551ed6b658f97ff070a523 100644
--- a/tests/transformation/streamline/test_move_scalar_past_conv.py
+++ b/tests/transformation/streamline/test_move_scalar_past_conv.py
@@ -1,20 +1,19 @@
+import pytest
+
 import numpy as np
 import onnx.helper as oh
-import pytest
 from onnx import TensorProto
 
 import finn.core.onnx_exec as ox
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.streamline import (
-    MoveAddPastConv,
-    MoveScalarMulPastConv,
-)
+from finn.transformation.streamline import MoveAddPastConv, MoveScalarMulPastConv
 
 
 @pytest.mark.parametrize("padding", [False, True])
 @pytest.mark.parametrize(
-    "test_args", [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())],
+    "test_args",
+    [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())],
 )
 def test_move_scalar_past_conv(test_args, padding):
     scalar_op = test_args[0]
@@ -92,7 +91,8 @@ def test_move_scalar_past_conv(test_args, padding):
 
 
 @pytest.mark.parametrize(
-    "test_args", [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())],
+    "test_args",
+    [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())],
 )
 def test_move_scalar_past_conv_only_if_linear(test_args):
     scalar_op = test_args[0]
diff --git a/tests/transformation/streamline/test_move_scalar_past_matmul.py b/tests/transformation/streamline/test_move_scalar_past_matmul.py
index e432dbf4ec1a38551609e5914e2d44968a020908..b15f84303b0dc2e00bd51397543871cfeb99c1f9 100644
--- a/tests/transformation/streamline/test_move_scalar_past_matmul.py
+++ b/tests/transformation/streamline/test_move_scalar_past_matmul.py
@@ -26,8 +26,9 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import numpy as np
 import pytest
+
+import numpy as np
 import onnx.helper as oh
 from onnx import TensorProto
 
diff --git a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py
index e434fc7d4f683120176e18a2bfa9da99d9ee0b0e..9110ede98da81a627127767276db33362503ef84 100644
--- a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py
+++ b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py
@@ -3,14 +3,15 @@ import pytest
 import numpy as np
 from onnx import TensorProto, helper
 
-from finn.core.modelwrapper import ModelWrapper
 import finn.core.data_layout as DataLayout
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.infer_datatypes import InferDataTypes
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from finn.transformation.infer_data_layouts import InferDataLayouts
-from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline.reorder import MoveTransposePastScalarMul
-import finn.core.onnx_exec as oxe
+
 
 # permutation of transpose node
 @pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]])
diff --git a/tests/transformation/streamline/test_remove_identity_ops.py b/tests/transformation/streamline/test_remove_identity_ops.py
deleted file mode 100644
index d02e1d39755bf4783cd5dbdc2b88ca0931e02874..0000000000000000000000000000000000000000
--- a/tests/transformation/streamline/test_remove_identity_ops.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import pytest
-
-import numpy as np
-from onnx import helper, TensorProto
-import finn.core.onnx_exec as oxe
-from finn.core.datatype import DataType
-from finn.core.modelwrapper import ModelWrapper
-from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.infer_shapes import InferShapes
-from finn.transformation.streamline.remove import RemoveIdentityOps
-from finn.util.basic import gen_finn_dt_tensor
-
-
-def insert_identity_op(model, op, as_first_node, approx):
-    if approx:
-        zero_val = 0.000001
-        one_val = 0.999999
-    else:
-        zero_val = 0.0
-        one_val = 1.0
-    if op in ["Add", "Sub"]:
-        val = np.asarray([zero_val], dtype=np.float32)
-    elif op in ["Mul", "Div"]:
-        val = np.asarray([one_val], dtype=np.float32)
-    else:
-        return
-
-    graph = model.graph
-    if as_first_node:
-        identity_node = helper.make_node(op, ["inp", "value"], ["ident_out"])
-        graph.node.insert(0, identity_node)
-        graph.node[1].input[0] = "ident_out"
-    else:
-        identity_node = helper.make_node(op, ["div_out", "value"], ["ident_out"])
-        graph.node.insert(3, identity_node)
-        graph.node[-1].input[0] = "ident_out"
-    model.set_initializer("value", val)
-
-    return model
-
-
-# identity operations to be inserted
-@pytest.mark.parametrize("op", ["Add", "Sub", "Mul", "Div"])
-@pytest.mark.parametrize("approx", [False, True])
-@pytest.mark.parametrize("as_first_node", [False, True])
-def test_remove_identity_ops(op, as_first_node, approx):
-
-    # set up onnx model
-    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4, 1, 1])
-    mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [])
-    shape = helper.make_tensor_value_info("shape", TensorProto.FLOAT, [2])
-    div = helper.make_tensor_value_info("div", TensorProto.FLOAT, [])
-    matmul = helper.make_tensor_value_info("matmul", TensorProto.FLOAT, [4, 2])
-    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 2])
-
-    mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"])
-    reshape_node = helper.make_node("Reshape", ["mul_out", "shape"], ["reshape_out"])
-    div_node = helper.make_node("Div", ["reshape_out", "div"], ["div_out"])
-    matmul_node = helper.make_node("MatMul", ["div_out", "matmul"], ["outp"])
-
-    graph = helper.make_graph(
-        nodes=[mul_node, reshape_node, div_node, matmul_node],
-        name="identity-graph",
-        inputs=[inp],
-        outputs=[outp],
-        value_info=[mul, shape, div, matmul],
-    )
-
-    model = helper.make_model(graph, producer_name="mulpastconv-model")
-    model = ModelWrapper(model)
-    inp_values = gen_finn_dt_tensor(DataType.INT2, [1, 4, 1, 1])
-    mul_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32)
-    shape_values = np.asarray([1, -1], dtype=np.int64)
-    div_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32)
-    matmul_values = gen_finn_dt_tensor(DataType.INT2, [4, 2])
-    model.set_initializer("mul", mul_values)
-    model.set_initializer("shape", shape_values)
-    model.set_initializer("div", div_values)
-    model.set_initializer("matmul", matmul_values)
-    insert_identity_op(model, op, as_first_node, approx)
-    model = model.transform(InferShapes())
-    model = model.transform(InferDataTypes())
-    idict = {"inp": inp_values}
-    odict = oxe.execute_onnx(model, idict)
-    out_before = odict["outp"]
-    num_of_nodes_before = len(model.graph.node)
-
-    model = model.transform(RemoveIdentityOps())
-    num_of_nodes_after = len(model.graph.node)
-    assert num_of_nodes_before - 1 == num_of_nodes_after
-
-    odict = oxe.execute_onnx(model, idict)
-    out_after = odict["outp"]
-    assert np.isclose(out_before, out_after, atol=1e-3).all()
diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py
index f9259908a2b4e4d716e3fb9ae7ec28cd9ec85d03..2e57f1c85f6ac197ca7a4cf15e595c34cc0fb564 100644
--- a/tests/transformation/streamline/test_round_thresholds.py
+++ b/tests/transformation/streamline/test_round_thresholds.py
@@ -47,17 +47,17 @@ def test_round_thresholds():
     model = ModelWrapper(model_def)
     threshold_val = np.asarray([[-1.1], [0.7], [2.3], [5.1]], dtype=np.float32)
     model.set_initializer("thresholds", threshold_val)
-    model.set_tensor_datatype("v", DataType.INT8)
+    model.set_tensor_datatype("v", DataType["INT8"])
     inp_dict_f = {"v": np.floor(threshold_val).T}
     inp_dict_n = {"v": np.round(threshold_val).T}
     inp_dict_c = {"v": np.ceil(threshold_val).T}
     orig_f = oxe.execute_onnx(model, inp_dict_f)["out"]
     orig_n = oxe.execute_onnx(model, inp_dict_n)["out"]
     orig_c = oxe.execute_onnx(model, inp_dict_c)["out"]
-    assert model.get_tensor_datatype("thresholds") == DataType.FLOAT32
+    assert model.get_tensor_datatype("thresholds") == DataType["FLOAT32"]
     new_model = model.transform(RoundAndClipThresholds())
     # rounded up thresholds should have same dtype as input
-    assert new_model.get_tensor_datatype("thresholds") == DataType.INT8
+    assert new_model.get_tensor_datatype("thresholds") == DataType["INT8"]
     new_f = oxe.execute_onnx(new_model, inp_dict_f)["out"]
     new_n = oxe.execute_onnx(new_model, inp_dict_n)["out"]
     new_c = oxe.execute_onnx(new_model, inp_dict_c)["out"]
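The property behind `RoundAndClipThresholds` as tested here: for integer inputs x, `x >= t` holds exactly when `x >= ceil(t)`, so float thresholds can be rounded up into the input's integer datatype without changing any multithreshold output (assuming the `x >= t` comparison convention). A standalone check over the thresholds and the full INT8 input range used above:

```python
import math

for t in (-1.1, 0.7, 2.3, 5.1):    # the threshold_val entries above
    for x in range(-128, 128):     # every INT8 input value
        assert (x >= t) == (x >= math.ceil(t))
```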
diff --git a/tests/transformation/streamline/test_sign_to_thres.py b/tests/transformation/streamline/test_sign_to_thres.py
index 4618dffc43f5cee848b580b77cf418c612b48f3e..2ffb5713c0363b115dee5c41484fb5826faf803a 100644
--- a/tests/transformation/streamline/test_sign_to_thres.py
+++ b/tests/transformation/streamline/test_sign_to_thres.py
@@ -26,12 +26,11 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-from pkgutil import get_data
-
 import brevitas.onnx as bo
 import onnx
 import onnx.numpy_helper as nph
+import os
+from pkgutil import get_data
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py
index ca8cf3b1ceba6943828f47bcbcf974aa5b368c4e..ed2595330323bfc8a576af36ae3fea27522ec66c 100644
--- a/tests/transformation/streamline/test_streamline_cnv.py
+++ b/tests/transformation/streamline/test_streamline_cnv.py
@@ -26,27 +26,30 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import pkg_resources as pk
+
+import pytest
+
 import brevitas.onnx as bo
 import numpy as np
-import pytest
-import pkg_resources as pk
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.general import (
-    RemoveUnusedTensors,
-    RemoveStaticGraphInputs,
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
+    RemoveStaticGraphInputs,
+    RemoveUnusedTensors,
 )
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
-from finn.util.test import get_test_model_trained
 from finn.util.basic import make_build_dir
+from finn.util.test import get_test_model_trained
 
 export_onnx_path = make_build_dir("test_streamline_cnv_")
 
+
 # act bits
 @pytest.mark.parametrize("abits", [1, 2])
 # weight bits
diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py
index d88bf14913d2551cd7347c5617895998a7d56799..3563b87c45a7ffe99fe6e9bdfd9f54a39e89cb68 100644
--- a/tests/transformation/streamline/test_streamline_fc.py
+++ b/tests/transformation/streamline/test_streamline_fc.py
@@ -26,30 +26,31 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from pkgutil import get_data
+import pytest
 
 import brevitas.onnx as bo
 import numpy as np
 import onnx
 import onnx.numpy_helper as nph
-import pytest
+from pkgutil import get_data
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.general import (
-    RemoveUnusedTensors,
-    RemoveStaticGraphInputs,
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
+    RemoveStaticGraphInputs,
+    RemoveUnusedTensors,
 )
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import Streamline
-from finn.util.test import get_test_model_trained
 from finn.util.basic import make_build_dir
+from finn.util.test import get_test_model_trained
 
 export_onnx_path = make_build_dir("test_streamline_fc_")
 
+
 # act bits
 @pytest.mark.parametrize("abits", [1, 2])
 # weight bits
diff --git a/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py b/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py
index 7e894c078b15c16f29dec60d694f8b6892e84a8a..300ef85faacf664b89c7b949ea2e462f110eef85 100644
--- a/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py
+++ b/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py
@@ -26,14 +26,14 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-from pkgutil import get_data
 import pkg_resources as pk
 
 import brevitas.onnx as bo
+import numpy as np
 import onnx
 import onnx.numpy_helper as nph
-import numpy as np
+import os
+from pkgutil import get_data
 
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
diff --git a/tests/transformation/test_infer_data_layouts_cnv.py b/tests/transformation/test_infer_data_layouts_cnv.py
index a8ba81dff608994b8e5efb33ec23bd0e3f894175..10bc687d13d4a85ce64955cb38c1c0dfdc6d53da 100644
--- a/tests/transformation/test_infer_data_layouts_cnv.py
+++ b/tests/transformation/test_infer_data_layouts_cnv.py
@@ -26,22 +26,22 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import brevitas.onnx as bo
 import os
 
-import brevitas.onnx as bo
+import finn.core.data_layout as DataLayout
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
-from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
 from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from finn.transformation.streamline import Streamline
+from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
 from finn.util.test import get_test_model_trained
-from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
-import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.transformation.infer_data_layouts import InferDataLayouts
-import finn.core.data_layout as DataLayout
 
 export_onnx_path_cnv = "test_infer_data_layouts.onnx"
 
diff --git a/tests/transformation/test_infer_datatypes_lfc.py b/tests/transformation/test_infer_datatypes_lfc.py
index 0802c50c7d15a649182529a4e6897b9bbe273336..8883dac7a54eafaaa768c8ae991b2030e385b318 100644
--- a/tests/transformation/test_infer_datatypes_lfc.py
+++ b/tests/transformation/test_infer_datatypes_lfc.py
@@ -26,9 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-
 import brevitas.onnx as bo
+import os
 
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
@@ -50,12 +49,12 @@ def test_infer_datatypes_lfc():
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(InferDataTypes())
-    assert model.get_tensor_datatype("MatMul_0_out0") == DataType.INT32
-    assert model.get_tensor_datatype("MatMul_1_out0") == DataType.INT32
-    assert model.get_tensor_datatype("MatMul_2_out0") == DataType.INT32
-    assert model.get_tensor_datatype("MatMul_3_out0") == DataType.INT32
-    assert model.get_tensor_datatype("MultiThreshold_0_out0") == DataType.BIPOLAR
-    assert model.get_tensor_datatype("MultiThreshold_1_out0") == DataType.BIPOLAR
-    assert model.get_tensor_datatype("MultiThreshold_2_out0") == DataType.BIPOLAR
-    assert model.get_tensor_datatype("MultiThreshold_3_out0") == DataType.BIPOLAR
+    assert model.get_tensor_datatype("MatMul_0_out0") == DataType["INT32"]
+    assert model.get_tensor_datatype("MatMul_1_out0") == DataType["INT32"]
+    assert model.get_tensor_datatype("MatMul_2_out0") == DataType["INT32"]
+    assert model.get_tensor_datatype("MatMul_3_out0") == DataType["INT32"]
+    assert model.get_tensor_datatype("MultiThreshold_0_out0") == DataType["BIPOLAR"]
+    assert model.get_tensor_datatype("MultiThreshold_1_out0") == DataType["BIPOLAR"]
+    assert model.get_tensor_datatype("MultiThreshold_2_out0") == DataType["BIPOLAR"]
+    assert model.get_tensor_datatype("MultiThreshold_3_out0") == DataType["BIPOLAR"]
     os.remove(export_onnx_path)
diff --git a/tests/transformation/test_qonnx_to_finn.py b/tests/transformation/test_qonnx_to_finn.py
new file mode 100644
index 0000000000000000000000000000000000000000..df7d63e3d2e139077f0fa20b10714c0a43a24e47
--- /dev/null
+++ b/tests/transformation/test_qonnx_to_finn.py
@@ -0,0 +1,187 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import pkg_resources as pk
+
+import pytest
+
+import brevitas.export.onnx.generic as b_onnx
+import brevitas.onnx as bo
+import numpy as np
+import onnx
+import onnx.numpy_helper as nph
+import torch
+from pkgutil import get_data
+from qonnx.util.cleanup import cleanup
+from tempfile import TemporaryDirectory
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
+from finn.util.test import get_test_model_trained
+
+
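+# Helper: returns a trained Brevitas model together with its expected input
+# shape and a matching sample input tensor for the requested test network.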
+def get_brev_model_and_sample_inputs(model_name, wbits, abits):
+    if "FC" in model_name:
+        in_shape = (1, 1, 28, 28)
+        raw_i = get_data("finn.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
+        input_tensor = onnx.load_tensor_from_string(raw_i)
+        input_tensor = nph.to_array(input_tensor)
+        brev_model = get_test_model_trained(model_name, wbits, abits)
+    elif model_name == "CNV":
+        in_shape = (1, 3, 32, 32)
+        fn = pk.resource_filename(
+            "finn.qnn-data", "cifar10/cifar10-test-data-class3.npz"
+        )
+        input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+        input_tensor = input_tensor / 255
+        brev_model = get_test_model_trained(model_name, wbits, abits)
+    elif model_name == "mobilenet":
+        in_shape = (1, 3, 224, 224)
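+        # fix the seed so the random sample input is reproducible across runs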
+        np.random.seed(42)
+        input_tensor = np.random.normal(size=in_shape).astype(dtype=np.float32)
+        brev_model = get_test_model_trained(model_name, 4, 4)
+    else:
+        raise RuntimeError(f"The model with the name {model_name} is not supported.")
+
+    return brev_model, in_shape, input_tensor
+
+
+def analysis_testing_for_no_quant_nodes(model):
+    # Test that all Quant nodes have been converted to MultiThreshold nodes
+    # or folded into tensor initializers.
+
+    for op_type in ["BinaryQuant", "Quant", "Trunc"]:
+        q_count = len(model.get_nodes_by_op_type(op_type))
+        if q_count > 0:
+            raise ValueError(f"There should be no {op_type} nodes left in the graph.")
+
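+    # By convention, FINN analysis passes return a dict; this pass only checks
+    # for leftover quantization nodes, so an empty dict is returned.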
+    return dict()
+
+
+# This test currently takes about 4 minutes and 20 seconds to run
+@pytest.mark.parametrize("abits", [1, 2])
+@pytest.mark.parametrize("wbits", [1, 2])
+@pytest.mark.parametrize("model_name", ["TFC", "SFC", "LFC", "CNV", "mobilenet"])
+def test_QONNX_to_FINN(model_name, wbits, abits):
+    if wbits > abits:
+        pytest.skip("No wbits > abits cases at the moment")
+    if model_name == "LFC" and wbits == 2 and abits == 2:
+        pytest.skip("No LFC-w2a2 present at the moment")
+    if model_name == "mobilenet" and (wbits != 2 or abits != 2):
+        pytest.skip("Mobilenet only runs at W2A2, though it's technically W4A4.")
+
+    # Get test config and model
+    ATOL = 1e-7
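+    # (absolute tolerance used for all np.isclose output comparisons below)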
+    brev_model, in_shape, input_tensor = get_brev_model_and_sample_inputs(
+        model_name, wbits, abits
+    )
+    temp_dir = TemporaryDirectory()
+    qonnx_base_path = temp_dir.name + "/qonnx_{}.onnx"
+    finn_base_path = temp_dir.name + "/finn_{}.onnx"
+
+    # Get Brevitas output
+    torch_input_tensor = torch.from_numpy(input_tensor).float()
+    brev_output = brev_model.forward(torch_input_tensor).detach().numpy()
+
+    # Get "clean" FINN model and it's output
+    _ = bo.export_finn_onnx(brev_model, in_shape, finn_base_path.format("raw"))
+    model = ModelWrapper(finn_base_path.format("raw"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(RemoveStaticGraphInputs())
+    model.save(finn_base_path.format("clean"))
+
+    model = ModelWrapper(finn_base_path.format("clean"))
+    input_dict = {model.graph.input[0].name: input_tensor}
+    output_dict = oxe.execute_onnx(model, input_dict, False)
+    finn_export_output = output_dict[model.graph.output[0].name]
+    # This comparison consistently fails for MobileNet (root cause unknown), skip it
+    if model_name != "mobilenet":
+        assert np.isclose(
+            brev_output, finn_export_output, atol=ATOL
+        ).all(), "The output of the Brevitas model and the FINN model should match."
+
+    # Get the equivalent QONNX model
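+    # Setting DOMAIN_STRING places the exported Quant nodes in FINN's general
+    # custom op domain, so that FINN's ONNX execution can resolve them.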
+    b_onnx.function.DOMAIN_STRING = "finn.custom_op.general"
+    _ = b_onnx.manager.BrevitasONNXManager.export(
+        brev_model, in_shape, qonnx_base_path.format("raw")
+    )
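+    # qonnx's cleanup applies the standard QONNX tidy-up transformations
+    # (shape inference, constant folding, tensor renaming) to the raw export.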
+    cleanup(qonnx_base_path.format("raw"), out_file=qonnx_base_path.format("clean"))
+
+    # Compare output
+    model = ModelWrapper(qonnx_base_path.format("clean"))
+    input_dict = {model.graph.input[0].name: input_tensor}
+    output_dict = oxe.execute_onnx(model, input_dict, False)
+    qonnx_export_output = output_dict[model.graph.output[0].name]
+    assert np.isclose(
+        brev_output, qonnx_export_output, atol=ATOL
+    ).all(), "The output of the Brevitas model and the QONNX model should match."
+    # This comparison consistently fails for MobileNet (root cause unknown), skip it
+    if model_name != "mobilenet":
+        assert np.isclose(
+            qonnx_export_output, finn_export_output, atol=ATOL
+        ).all(), "The output of the FINN model and the QONNX model should match."
+
+    # Run QONNX to FINN conversion
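+    # (this converts Quant/BinaryQuant/Trunc nodes to MultiThreshold nodes or
+    # folds them into tensor initializers, see the analysis pass above)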
+    model = ModelWrapper(qonnx_base_path.format("clean"))
+    model = model.transform(ConvertQONNXtoFINN())
+    model.save(qonnx_base_path.format("whole_trafo"))
+
+    # Compare output
+    model = ModelWrapper(qonnx_base_path.format("whole_trafo"))
+    input_dict = {model.graph.input[0].name: input_tensor}
+    output_dict = oxe.execute_onnx(model, input_dict, False)
+    test_output = output_dict[model.graph.output[0].name]
+    assert np.isclose(test_output, finn_export_output, atol=ATOL).all(), (
+        "The output of the FINN model "
+        "and the QONNX -> FINN converted model should match."
+    )
+
+    # Run analysis passes on the converted model
+    model = ModelWrapper(qonnx_base_path.format("whole_trafo"))
+    _ = model.analysis(analysis_testing_for_no_quant_nodes)
+
+    temp_dir.cleanup()
diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py
index c8e886ddb047e932e5f03ce7d460d538c95a25f2..de1b3abcc314c0c1451bd86bab8a7b93600ca697 100644
--- a/tests/util/test_build_dataflow.py
+++ b/tests/util/test_build_dataflow.py
@@ -26,12 +26,15 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import pytest
 import pkg_resources as pk
+
+import pytest
+
+import os
 from shutil import copytree
-from finn.util.basic import make_build_dir
+
 from finn.builder.build_dataflow import build_dataflow_directory
-import os
+from finn.util.basic import make_build_dir
 
 
 @pytest.mark.slow
@@ -45,11 +48,14 @@ def test_build_dataflow_directory():
     # check the generated files
     output_dir = target_dir + "/output_tfc_w1a1_Pynq-Z1"
     assert os.path.isfile(output_dir + "/time_per_step.json")
+    assert os.path.isfile(output_dir + "/auto_folding_config.json")
     assert os.path.isfile(output_dir + "/final_hw_config.json")
     assert os.path.isfile(output_dir + "/stitched_ip/ip/component.xml")
     assert os.path.isfile(output_dir + "/driver/driver.py")
     assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json")
     assert os.path.isfile(output_dir + "/report/estimate_layer_resources.json")
+    assert os.path.isfile(output_dir + "/report/verify_rtlsim.vcd")
+    assert os.path.isfile(output_dir + "/report/rtlsim_perf_batch_1.vcd")
     assert os.path.isfile(
         output_dir + "/report/estimate_layer_config_alternatives.json"
     )
diff --git a/tests/util/test_create.py b/tests/util/test_create.py
index 42a288b74ecda9746296519b1b86563c75b2752e..c11e60175ea3ac94b6686ec5f8401a7c134fe53e 100644
--- a/tests/util/test_create.py
+++ b/tests/util/test_create.py
@@ -32,7 +32,9 @@ import finn.util.create as create
 from finn.core.datatype import DataType
 
 
-@pytest.mark.parametrize("bitwidth", [DataType.BIPOLAR, DataType.INT2, DataType.INT4])
+@pytest.mark.parametrize(
+    "bitwidth", [DataType["BIPOLAR"], DataType["INT2"], DataType["INT4"]]
+)
 def test_hls_random_mlp_maker(bitwidth):
     w = bitwidth
     a = bitwidth
@@ -42,7 +44,7 @@ def test_hls_random_mlp_maker(bitwidth):
             "mh": 100,
             "simd": 185,
             "pe": 100,
-            "idt": DataType.BIPOLAR,
+            "idt": DataType["BIPOLAR"],
             "wdt": w,
             "act": a,
         },
@@ -56,7 +58,7 @@ def test_hls_random_mlp_maker(bitwidth):
             "pe": 1,
             "idt": a,
             "wdt": w,
-            "act": DataType.BIPOLAR,
+            "act": DataType["BIPOLAR"],
         },
     ]
 
diff --git a/tests/util/test_data_packing_hls.py b/tests/util/test_data_packing_hls.py
index a926bc4068831a552ccfb728511ddda4a8670ca8..7113a3051bffb568e36b01af59945f0956658f76 100644
--- a/tests/util/test_data_packing_hls.py
+++ b/tests/util/test_data_packing_hls.py
@@ -26,20 +26,28 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import shutil
-import subprocess
-
 import pytest
 
 import numpy as np
+import os
+import shutil
+import subprocess
 
 import finn.util.basic as cutil
 from finn.core.datatype import DataType
 from finn.util.data_packing import numpy_to_hls_code
 
 
-@pytest.mark.parametrize("dtype", [DataType.BINARY, DataType.INT2, DataType.INT32])
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        DataType["BINARY"],
+        DataType["INT2"],
+        DataType["INT32"],
+        DataType["FIXED<9,6>"],
+        DataType["FLOAT32"],
+    ],
+)
 @pytest.mark.parametrize("test_shape", [(1, 2, 4), (1, 1, 64), (2, 64)])
 @pytest.mark.vivado
 def test_npy2apintstream(test_shape, dtype):
@@ -120,17 +128,17 @@ def test_numpy_to_hls_code():
         return "".join(s.split())
 
     A = [[1, 1, 1, 0], [0, 1, 1, 0]]
-    ret = numpy_to_hls_code(A, DataType.BINARY, "test", True)
+    ret = numpy_to_hls_code(A, DataType["BINARY"], "test", True)
     eA = """ap_uint<4> test[2] =
     {ap_uint<4>("0xe", 16), ap_uint<4>("0x6", 16)};"""
     assert remove_all_whitespace(ret) == remove_all_whitespace(eA)
     B = [[[3, 3], [3, 3]], [[1, 3], [3, 1]]]
-    ret = numpy_to_hls_code(B, DataType.UINT2, "test", True)
+    ret = numpy_to_hls_code(B, DataType["UINT2"], "test", True)
     eB = """ap_uint<4> test[2][2] =
     {{ap_uint<4>("0xf", 16), ap_uint<4>("0xf", 16)},
      {ap_uint<4>("0x7", 16), ap_uint<4>("0xd", 16)}};"""
     assert remove_all_whitespace(ret) == remove_all_whitespace(eB)
-    ret = numpy_to_hls_code(B, DataType.UINT2, "test", True, True)
+    ret = numpy_to_hls_code(B, DataType["UINT2"], "test", True, True)
     eB = """{{ap_uint<4>("0xf", 16), ap_uint<4>("0xf", 16)},
      {ap_uint<4>("0x7", 16), ap_uint<4>("0xd", 16)}};"""
     assert remove_all_whitespace(ret) == remove_all_whitespace(eB)