diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 2fbb9265beb49644f08a2c6e916ab9c23d4bd339..20f5b48f7acc65ab18702ef2509e9791f919b825 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -13,10 +13,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 - name: Run Lint - uses: pre-commit/action@v2.0.0 + uses: pre-commit/action@v3.0.0 diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml index d188007465cd27662ffadfb3ece0d8bf2e8e28be..ec92c84665d868b8a4376c82ecdf72395f1367a8 100644 --- a/.github/workflows/quicktest-dev-pr.yml +++ b/.github/workflows/quicktest-dev-pr.yml @@ -17,37 +17,9 @@ jobs: - name: checkout uses: actions/checkout@v2 - - name: set up Docker Buildx - uses: docker/setup-buildx-action@v1 - - - name: cache Docker layers - uses: actions/cache@v2 - with: - path: /tmp/.buildx-cache - key: ${{ runner.os }}-buildx-${{ github.sha }} - restore-keys: | - ${{ runner.os }}-buildx- - - - name: Build and push - uses: docker/build-push-action@v2 - with: - file: docker/Dockerfile.finn - context: . - push: false - load: true - tags: finn_gha - cache-from: type=local,src=/tmp/.buildx-cache - cache-to: type=local,dest=/tmp/.buildx-cache-new - - - # Temp fix - # https://github.com/docker/build-push-action/issues/252 - # https://github.com/moby/buildkit/issues/1896 - name: Move cache - run: | - rm -rf /tmp/.buildx-cache - mv /tmp/.buildx-cache-new /tmp/.buildx-cache - - - name: DockerRunQuicktest run: | - docker run --init --hostname finn_gha -w $(pwd) -v $(pwd):$(pwd) -e FINN_BUILD_DIR=/tmp/finn_gha -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh + export FINN_ROOT=$(pwd) + export FINN_BUILD_DIR=/tmp/finn_gha + export FINN_INST_NAME=finn_gha + ./run-docker.sh quicktest diff --git a/.gitignore b/.gitignore index 126321cf4deccaa01ab0f2025460e53519d4c06f..be6137873055e42720c260c29e38feec93cdb6e9 100644 --- a/.gitignore +++ b/.gitignore @@ -77,9 +77,6 @@ MANIFEST # Per-project virtualenvs .venv*/ -# Jenkins cfg dir -/docker/jenkins_home - # SSH key dir mounted into Docker /ssh_keys/ diff --git a/AUTHORS.rst b/AUTHORS.rst index 1d42d35a3b269176fcab79d8239b84ac8442fa43..d011ce3d7ad74125b7013b7a7e987eb22e70a9f3 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -2,8 +2,9 @@ Contributors ============ -* Yaman Umuroglu (@maltanar) (maintainer) -* Jakoba Petri-Koenig (@auphelia) +* Jakoba Petri-Koenig (@auphelia) (maintainer) +* Thomas Preusser (@preusser) +* Yaman Umuroglu (@maltanar) * Andrea Rigoni (@AndreaRigoni) * Hendrik Borras (@HenniOVP) * Lucian Petrica (@quetric) @@ -22,3 +23,6 @@ Contributors * Javier Duarte (@jmduarte) * Uma Maheshwari (@umav1511) * José Rosa (@pinxau1000) +* Aziz Bahri (@azizb-xlnx) +* Fionn O'Donohoe (@fionnodonohoe-xlnx) +* Matthias Gehre (@mgehre-amd) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f12dafa857b8a99493d7266ad029bec3f725d9ec..d376a1b42b0f1f3856f40b3993533785fb254a9b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ We welcome contributions to FINN. Please follow the steps below and be sure that your contribution complies with our guidelines. -1. Share your proposal via <a href="https://github.com/Xilinx/finn/issues" target="_blank">Github issues</a>. 
If you are looking for some issues to get started with, we have a list of <a href="https://github.com/Xilinx/finn/labels/good%20first%20issue">good first issues</a> in the issue tracker. Feel free to ask questions on the <a href="https://gitter.im/xilinx-finn/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge">FINN gitter channel as well</a>. +1. Share your proposal via <a href="https://github.com/Xilinx/finn/issues" target="_blank">Github issues</a>. If you are looking for some issues to get started with, we have a list of <a href="https://github.com/Xilinx/finn/labels/good%20first%20issue">good first issues</a> in the issue tracker. Feel free to ask questions in the <a href="https://github.com/Xilinx/finn/discussions">FINN GitHub discussions</a> as well. We welcome submissions to: @@ -17,7 +17,7 @@ Please follow the steps below and be sure that your contribution complies with o 2. Clone the fork to your local computer using *git clone*. Checkout the branch you want to work on. - 3. Please install <a href="https://pre-commit.com/" target="_blank">pre-commit</a> to ensure your code is formatted to our style guidelines. The hooks we use for pre-commit can be found in <a href="https://github.com/Xilinx/finn/blob/master/.pre-commit-config.yaml" target="_blank">this file</a> + 3. Please install <a href="https://pre-commit.com/" target="_blank">pre-commit</a> to ensure your code is formatted to our style guidelines. The hooks we use for pre-commit can be found in <a href="https://github.com/Xilinx/finn/blob/main/.pre-commit-config.yaml" target="_blank">this file</a> 4. Modify the Python source code, Jupyter notebooks and Sphinx documentation etc. as needed. @@ -26,9 +26,9 @@ Please follow the steps below and be sure that your contribution complies with o 6. If you are introducing new functionality, add at least one unit test under the `test/` folder and make sure it passes before you submit the pull request. 7. Submit a pull request by clicking the *pull request* button on your GitHub repo: - 1. The <a href="https://github.com/Xilinx/finn" target="_blank">master branch</a> should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break. + 1. The <a href="https://github.com/Xilinx/finn" target="_blank">main branch</a> should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break. 2. For new features, smaller bug fixes, doc updates, and many other fixes, users should pull request against the <a href="https://github.com/Xilinx/finn/tree/dev" target="_blank">development branch</a>. 3. We will review your contribution and, if any additional fixes or modifications are necessary, may provide feedback to guide you. When accepted, your pull request will -be merged to the repository. If you have more questions please contact us via the <a href="https://gitter.im/xilinx-finn/community" target="_blank">FINN gitter channel</a>. +be merged to the repository. If you have more questions please contact us. diff --git a/README.md b/README.md index 4cc995fc8c991ccc851e95fd30897aeea8ca266a..1b8efc8f19d0b664a17320585f5ea60acbe03eb4 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s ## What's New in FINN? 
-* **2021-11-05:** v0.7 is released, introducing QONNX support, three new example networks and many other improvements. Read more on the [v0.7 release blog post](https://xilinx.github.io/finn//2021/11/05/finn-v07-is-released.html). -* **2021-06-15:** v0.6 is released, with ResNet-50 on U250 and ZCU104 MobileNet-v1 in finn-examples showcasing new features plus a lot more. Read more on the [v0.6 release blog post](https://xilinx.github.io/finn//2021/06/15/finn-v06-is-released.html). -* **2020-12-17:** v0.5b (beta) is released, with a new [examples repo](https://github.com/Xilinx/finn-examples) including MobileNet-v1. Read more on the <a href="https://xilinx.github.io/finn/2020/12/17/finn-v05b-beta-is-released.html">release blog post</a>. +* Please find all news under [GitHub discussions Announcements](https://github.com/Xilinx/finn/discussions/categories/announcements). ## Documentation diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 6036f2e744f53dfaf287b97d2789bb20bdd9d9f7..9c18c03d7bdb8406d43aa8fc4efdb8a206b1217e 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -29,8 +29,7 @@ FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime LABEL maintainer="Yaman Umuroglu <yamanu@xilinx.com>" -# XRT version to be installed -ARG XRT_DEB_VERSION="xrt_202010.2.7.766_18.04-amd64-xrt" +ARG XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt" WORKDIR /workspace @@ -47,7 +46,6 @@ RUN apt-get update && \ libsm6 \ libxext6 \ libxrender-dev \ - verilator \ nano \ zsh \ rsync \ @@ -63,6 +61,16 @@ RUN apt-get update && \ RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config RUN locale-gen "en_US.UTF-8" +# install Verilator from source to get the right version +RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev +RUN git clone https://github.com/verilator/verilator +RUN cd verilator && \ + git checkout v4.012 && \ + autoconf && \ + ./configure && \ + make -j4 && \ + make install + # install XRT RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb RUN apt install -y /tmp/$XRT_DEB_VERSION.deb @@ -80,7 +88,7 @@ RUN pip install jupyter==1.0.0 RUN pip install markupsafe==2.0.1 RUN pip install matplotlib==3.3.1 --ignore-installed RUN pip install pytest-dependency==0.5.1 -RUN pip install sphinx==3.1.2 +RUN pip install sphinx==5.0.2 RUN pip install sphinx_rtd_theme==0.5.0 RUN pip install pytest-xdist[setproctitle]==2.4.0 RUN pip install pytest-parallel==0.1.0 diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 556e6d040db3140916d75632b9bdead3c1d38747..b5c702111ac706df8d29947ce8768e499ee3627a 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -113,5 +113,6 @@ else yecho "If you need Vitis HLS, ensure HLS_PATH is set correctly and mounted into the Docker container." 
fi +export PATH=$PATH:$HOME/.local/bin # execute the provided command(s) as root exec "$@" diff --git a/docker/jenkins/Dockerfile.jenkins b/docker/jenkins/Dockerfile.jenkins deleted file mode 100644 index e1939b642e1493ee97daf6472009649d3634632f..0000000000000000000000000000000000000000 --- a/docker/jenkins/Dockerfile.jenkins +++ /dev/null @@ -1,11 +0,0 @@ -FROM jenkins/jenkins:lts -# if we want to install via apt -USER root -RUN apt-get update -RUN apt-get install -y gnupg-agent curl ca-certificates apt-transport-https software-properties-common -RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - -RUN add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" -RUN apt-get update -RUN apt-get install -y docker-ce-cli -# drop back to the regular jenkins user - good practice -USER jenkins diff --git a/docker/jenkins/Jenkinsfile b/docker/jenkins/Jenkinsfile index ad533efa5d8bbab68837e6092f91c4767cde60f7..e3e5b5f7f93c312269f2c96942e44318875903e0 100644 --- a/docker/jenkins/Jenkinsfile +++ b/docker/jenkins/Jenkinsfile @@ -9,7 +9,7 @@ node { "FINN_XILINX_VERSION=2022.1", "FINN_DOCKER_TAG=xilinx/finn:jenkins", "FINN_HOST_BUILD_DIR=/scratch/users/finn_ci", - "PLATFORM_REPO_PATHS=/opt/xilinx/dsa" + "PLATFORM_REPO_PATHS=/opt/xilinx/platforms" ]){ parallel firstBranch: { stage('Brevitas export') { diff --git a/docker/jenkins/launch-jenkins.sh b/docker/jenkins/launch-jenkins.sh deleted file mode 100755 index 64dc1ec73f68e621cdd737595983b6b9a217f6fe..0000000000000000000000000000000000000000 --- a/docker/jenkins/launch-jenkins.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# defaults, can be overriden by environment variables -# user to run Jenkins as -- see NOTE below regarding Docker access permissions -: ${JENKINS_USER=jenkins} -# port for Jenkins on host machine -: ${JENKINS_PORT=8080} -# make Jenkins config persistent by mounting into this folder -: ${JENKINS_HOME=$(pwd)/jenkins_home} - -mkdir -p $JENKINS_HOME - -# build a Jenkins Docker image that also has the Docker CLI installed -docker build -t finn_jenkins -f Dockerfile.jenkins . - -# launch Docker container mounted to local Docker socket -# NOTE: we allow customizing the user (e.g. as root) to work around permission -# issues, may not al -docker run -u $JENKINS_USER -p $JENKINS_PORT:8080 -v /var/run/docker.sock:/var/run/docker.sock -v $JENKINS_HOME:/var/jenkins_home finn_jenkins diff --git a/docker/quicktest.sh b/docker/quicktest.sh index f625f2b1ef722f386180a8409a9eb9e759a2f3b6..b4ad37232fa69754a86e9064d7592d7474e8617e 100755 --- a/docker/quicktest.sh +++ b/docker/quicktest.sh @@ -2,7 +2,7 @@ : ${PYTEST_PARALLEL=auto} -cd $FINN_ROOT/finn +cd $FINN_ROOT # check if command line argument is empty or not present if [ -z $1 ]; then echo "Running quicktest: not (vivado or slow or board) with pytest-xdist" diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst index 408b14fd2b6c99ce3ec128a0361a25b3f2c193a5..304aa30854118e1ebd3258169ee4698a873e8689 100644 --- a/docs/finn/brevitas_export.rst +++ b/docs/finn/brevitas_export.rst @@ -8,7 +8,7 @@ Brevitas Export :scale: 70% :align: center -FINN expects an ONNX model as input. This can be a model trained with `Brevitas <https://github.com/Xilinx/brevitas>`_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks <https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq>`_. 
Brevitas provides an export of a quantized network in ONNX representation in several flavors. +FINN expects an ONNX model as input. This can be a model trained with `Brevitas <https://github.com/Xilinx/brevitas>`_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks <https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq>`_. Brevitas provides an export of a quantized network in ONNX representation in several flavors. Two of the Brevitas-exported ONNX variants can be ingested by FINN: * FINN-ONNX: Quantized weights exported as tensors with additional attributes to mark low-precision datatypes. Quantized activations exported as MultiThreshold nodes. diff --git a/docs/finn/command_line.rst b/docs/finn/command_line.rst index 54ffca9430a57ed4513ce822afbe0f1642b77404..12e01db5544e847a775d330929d1eea916cae74e 100644 --- a/docs/finn/command_line.rst +++ b/docs/finn/command_line.rst @@ -41,7 +41,7 @@ To use it, first create a folder with the necessary configuration and model file 2. Put your ONNX model to be converted under ``dataflow_build_dir/model.onnx``. The filename is important and must exactly be ``model.onnx``. 3. Create a JSON file with the build configuration. It must be named ``dataflow_build_dir/dataflow_build_config.json``. - Read more about the build configuration options on :py:mod:``finn.builder.build_dataflow_config.DataflowBuildConfig``. + Read more about the build configuration options on :py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig`. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/dataflow_build_config.json`` 4. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/folding_config.json``. @@ -55,7 +55,7 @@ Now you can invoke the simple dataflow build as follows: ./run-docker.sh build_dataflow <path/to/dataflow_build_dir/> Depending on the chosen output products, the dataflow build will run for a while -as it go through numerous steps: +as it goes through numerous steps: .. code-block:: none diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst index 2e05761d1fc1b9a23abb29f7bc062cf99a8acf5c..b152dfef66d0eb47e086d3c5cd51174c5df52128 100644 --- a/docs/finn/developers.rst +++ b/docs/finn/developers.rst @@ -84,7 +84,6 @@ The finn.dev image is built and launched as follows: 4. Entrypoint script (docker/finn_entrypoint.sh) upon launching container performs the following: - * Do `pip install` on the dependency git repos at specified commits. * Source Vivado settings64.sh from specified path to make vivado and vivado_hls available. * Download PYNQ board files into the finn root directory, unless they already exist. * Source Vitits settings64.sh if Vitis is mounted. @@ -92,7 +91,7 @@ The finn.dev image is built and launched as follows: 5. Depending on the arguments to run-docker.sh a different application is launched. run-docker.sh notebook launches a Jupyter server for the tutorials, whereas run-docker.sh build_custom and run-docker.sh build_dataflow trigger a dataflow build (see documentation). Running without arguments yields an interactive shell. See run-docker.sh for other options. 
(Re-)launching builds outside of Docker -====================================== +======================================== It is possible to launch builds for FINN-generated HLS IP and stitched-IP folders outside of the Docker container. This may be necessary for visual inspection of the generated designs inside the Vivado GUI, if you run into licensing @@ -122,16 +121,16 @@ The checks are configured in .pre-commit-config.yaml under the repo root. Testing ======= -Tests are vital to keep FINN running. All the FINN tests can be found at https://github.com/Xilinx/finn/tree/master/tests. +Tests are vital to keep FINN running. All the FINN tests can be found at https://github.com/Xilinx/finn/tree/main/tests. These tests can be roughly grouped into three categories: - * Unit tests: targeting unit functionality, e.g. a single transformation. Example: https://github.com/Xilinx/finn/blob/master/tests/transformation/streamline/test_sign_to_thres.py tests the expected behavior of the `ConvertSignToThres` transformation pass. + * Unit tests: targeting unit functionality, e.g. a single transformation. Example: https://github.com/Xilinx/finn/blob/main/tests/transformation/streamline/test_sign_to_thres.py tests the expected behavior of the `ConvertSignToThres` transformation pass. - * Small-scale integration tests: targeting a group of related classes or functions that to test how they behave together. Example: https://github.com/Xilinx/finn/blob/master/tests/fpgadataflow/test_convert_to_hls_conv_layer.py sets up variants of ONNX Conv nodes that are first lowered and then converted to FINN HLS layers. + * Small-scale integration tests: targeting a group of related classes or functions that to test how they behave together. Example: https://github.com/Xilinx/finn/blob/main/tests/fpgadataflow/test_convert_to_hls_conv_layer.py sets up variants of ONNX Conv nodes that are first lowered and then converted to FINN HLS layers. - * End-to-end tests: testing a typical 'end-to-end' compilation flow in FINN, where one end is a trained QNN and the other end is a hardware implementation. These tests can be quite large and are typically broken into several steps that depend on prior ones. Examples: https://github.com/Xilinx/finn/tree/master/tests/end2end + * End-to-end tests: testing a typical 'end-to-end' compilation flow in FINN, where one end is a trained QNN and the other end is a hardware implementation. These tests can be quite large and are typically broken into several steps that depend on prior ones. Examples: https://github.com/Xilinx/finn/tree/main/tests/end2end -Additionally, finn-base, brevitas and finn-hlslib also include their own test suites. +Additionally, qonnx, brevitas and finn-hlslib also include their own test suites. The full FINN compiler test suite (which will take several hours to run and require a PYNQ board) can be executed by: diff --git a/docs/finn/end_to_end_flow.rst b/docs/finn/end_to_end_flow.rst index a51d56d771384fddbc51271a074748e23ec8295c..bc5c5230718bcc8dd50334cc1f20c3c84c012ca4 100644 --- a/docs/finn/end_to_end_flow.rst +++ b/docs/finn/end_to_end_flow.rst @@ -11,7 +11,7 @@ As you can see in the picture, FINN has a high modularity and has the property t The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into five sections, each of it includes several flow steps. 
The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS and Vivado IPI (orange section). There is also a section for testing and verification in software (red section) and the hardware generation and deployment on the PYNQ board (yellow section). -This example flow is covered in the `end2end_example <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example>`_ Jupyter notebooks. +This example flow is covered in the `end2end_example <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example>`_ Jupyter notebooks. For a more detailed overview about the different flow sections, please have a look at the corresponding pages: .. toctree:: diff --git a/docs/finn/example_networks.rst b/docs/finn/example_networks.rst index 3f1ae0d603b18e8467477ea6e44863a02dee467b..ee58926578df58fab7264a22aa915e527b7edc4a 100644 --- a/docs/finn/example_networks.rst +++ b/docs/finn/example_networks.rst @@ -13,22 +13,16 @@ compiler. End-to-end Integration tests ============================ -The FINN compiler uses `several pre-trained QNNs <https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq>`_ +The FINN compiler uses `several pre-trained QNNs <https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq>`_ that serve as both examples and testcases. * TFC, SFC, LFC... are fully-connected networks trained on the MNIST dataset * CNV is a convolutional network trained on the CIFAR-10 dataset * w\_a\_ refers to the quantization used for the weights (w) and activations (a) in bits -These networks are built end-to-end as part of the `FINN integration tests <https://github.com/Xilinx/finn/blob/master/tests/end2end/test_end2end_bnn_pynq.py>`_ , +These networks are built end-to-end as part of the `FINN integration tests <https://github.com/Xilinx/finn/blob/main/tests/end2end/test_end2end_bnn_pynq.py>`_ , and the key performance indicators (FPGA resource, frames per second...) are automatically posted to the dashboard below. -To implement a new network, you can use the `integration test code <https://github.com/Xilinx/finn/blob/dev/tests/end2end/test_end2end_bnn_pynq.py>`_ +To implement a new network, you can use the `integration test code <https://github.com/Xilinx/finn/blob/main/tests/end2end/test_end2end_bnn_pynq.py>`_ as a starting point, as well as the `relevant Jupyter notebooks -<https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/bnn-pynq>`_. - -.. image:: https://firebasestorage.googleapis.com/v0/b/drive-assets.google.com.a.appspot.com/o/Asset%20-%20Drive%20Icon512.png?alt=media - :width: 50px - :align: left - -`FINN end-to-end integration tests dashboard on Google Drive <https://bit.ly/finn-end2end-dashboard>`_ +<https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example/bnn-pynq>`_. diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst index 3ddd13664432ceefdd0379004d856abd096f93ff..ef4457f53a8391621c54a70e29780c833a52aaf3 100644 --- a/docs/finn/faq.rst +++ b/docs/finn/faq.rst @@ -1,8 +1,8 @@ .. _faq: -*********************** +*************************** Frequently Asked Questions -*********************** +*************************** Can't find the answer to your question here? Check `FINN GitHub Discussions <https://github.com/Xilinx/finn/discussions>`_. @@ -100,7 +100,7 @@ Which data layout do FINN-generated accelerators use? Big-endian? Little-endian? 
If you need to do this manually, first examine how the `FINN PYNQ Python drivers <https://github.com/Xilinx/finn-examples/blob/main/finn_examples/driver.py#L379>`_ do this – notice how the input data is first reshaped to create the “folded input shape” that reflects the word size of the first layer based on how much it was parallelized, then data packing is applied to obtain a raw byte array (with some reversals going on) that can be - fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input <https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/data_packing.py#L289>`_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation: + fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input <https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/data_packing.py#L289>`_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation. Why does FIFO sizing take so long for my network? Is something wrong? The automatic FIFO sizing in FINN can take quite long. It unfortunately doesn’t really parallelize on multiple cores since diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 3e730924c032765ebf8f58afaa9ae2e694fb3d11..40425c119fafdcd03292b05c7a7e71310f767239 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -8,7 +8,7 @@ Quickstart ========== 1. Install Docker to run `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_ -2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2020.1``) +2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.1``) 3. Clone the FINN compiler from the repo: ``git clone https://github.com/Xilinx/finn/`` and go into the directory where it is cloned 4. Execute ``./run-docker.sh quicktest`` to verify your installation. 5. Optionally, follow the instructions on :ref:`PYNQ board first-time setup` or :ref:`Alveo first-time setup` for board setup. @@ -47,7 +47,7 @@ by using the "advanced mode" described in the :ref:`command_line` section. Running FINN in Docker ====================== -FINN only running inside a Docker container, and comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources <https://docker-curriculum.com/>`_ to get started. +FINN runs inside a Docker container and comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources <https://docker-curriculum.com/>`_ to get started. You may want to review the :ref:`General FINN Docker tips` and :ref:`Environment variables` as well. If you want to use prebuilt images, read :ref:`Using a prebuilt image`. 
The ``run-docker.sh`` script that can be launched in the following modes: @@ -82,9 +82,11 @@ FINN comes with numerous Jupyter notebook tutorials, which you can launch with: bash ./run-docker.sh notebook This will launch the `Jupyter notebook <https://jupyter.org/>`_ server inside a Docker container, and print a link on the terminal that you can open in your browser to run the FINN notebooks or create new ones. -.. note:: The link will look something like this (the token you get will be different): -http://127.0.0.1:8888/?token=f5c6bd32ae93ec103a88152214baedff4ce1850d81065bfc. -The ``run-docker.sh`` script forwards ports 8888 for Jupyter and 8081 for Netron, and launches the notebook server with appropriate arguments. + +.. note:: + The link will look something like this (the token you get will be different): + http://127.0.0.1:8888/?token=f5c6bd32ae93ec103a88152214baedff4ce1850d81065bfc. + The ``run-docker.sh`` script forwards ports 8888 for Jupyter and 8081 for Netron, and launches the notebook server with appropriate arguments. Environment variables @@ -94,7 +96,7 @@ Prior to running the `run-docker.sh` script, there are several environment varia These are summarized below: * (required) ``FINN_XILINX_PATH`` points to your Xilinx tools installation on the host (e.g. ``/opt/Xilinx``) -* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2020.1``) +* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2022.1``) * (required for Alveo) ``PLATFORM_REPO_PATHS`` points to the Vitis platform files (DSA). * (required for Alveo) ``XRT_DEB_VERSION`` specifies the .deb to be installed for XRT inside the container (see default value in ``run-docker.sh``). * (optional) ``NUM_DEFAULT_WORKERS`` (default 4) specifies the degree of parallelization for the transformations that can be run in parallel, potentially reducing build time @@ -121,7 +123,7 @@ General FINN Docker tips ************************ * Several folders including the root directory of the FINN compiler and the ``FINN_HOST_BUILD_DIR`` will be mounted into the Docker container and can be used to exchange files. * Do not use ``sudo`` to launch the FINN Docker. Instead, setup Docker to run `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_. -* If you want a new terminal on an already-running container, you can do this with `docker exec -it <name_of_container> bash`. +* If you want a new terminal on an already-running container, you can do this with ``docker exec -it <name_of_container> bash``. * The container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the finn compiler folder (which is mounted from the host computer) or otherwise backed up. Using a prebuilt image @@ -138,8 +140,10 @@ If you are having trouble building the Docker image or need offline access, you Supported FPGA Hardware ======================= -**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ <https://pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards. 
-As of FINN v0.4b we also have preliminary support for `Xilinx Alveo boards <https://www.xilinx.com/products/boards-and-kits/alveo.html>`_ using PYNQ and Vitis, see instructions below for Alveo setup. +**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ <http://www.pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards. + +.. warning:: + In previous FINN versions (v0.4b - v0.7) we had support for `Xilinx Alveo boards <https://www.xilinx.com/products/boards-and-kits/alveo.html>`_ using PYNQ and Vitis 2020.1; see the instructions below for an Alveo setup that works with older versions. Please note that with the new release based on Vitis 2022.1, we only have experimental support for automatic deployment to Alveo cards. **Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator. @@ -201,11 +205,10 @@ System Requirements * Ubuntu 18.04 with ``bash`` installed * Docker `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_ -* A working Vivado 2020.1 installation +* A working Vitis/Vivado 2022.1 installation * ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables correctly set, see `Quickstart`_ * *(optional)* `Vivado/Vitis license`_ if targeting non-WebPack FPGA parts. * *(optional)* A PYNQ board with a network connection, see `PYNQ board first-time setup`_ -* *(optional)* An Alveo board, and a working Vitis 2020.1 installation if you want to use Vitis and Alveo (see `Alveo first-time setup`_ ) We also recommend running the FINN compiler on a system with sufficiently strong hardware: diff --git a/docs/finn/hw_build.rst b/docs/finn/hw_build.rst index d03fc400bde90da905c45d408c95badc85b7d6ec..2a64b87943075ff004f79c9d457136e41e27723d 100644 --- a/docs/finn/hw_build.rst +++ b/docs/finn/hw_build.rst @@ -9,12 +9,14 @@ Hardware Build and Deployment :align: center A model where all layers have been converted to HLS layers can be processed by -FINN to build a bitfile targeting either a Zynq or Alveo system. +FINN to build a bitfile and driver targeting a Zynq system or to generate a Vivado IP Integrator (IPI) +design with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. + Hardware Build ============== -Internally, the hardware build consists of the following steps: +Internally, the hardware build for Zynq devices consists of the following steps: 1. Driver generation 2. DMA and DWC node insertion @@ -22,12 +24,9 @@ Internally, the hardware build consists of the following steps: 4. FIFO insertion and IP generation 5. Vivado/Vitis project generation and synthesis -.. note:: **In previous FINN releases it was necessary to step through the -individual sub-steps for hardware build manually by calling each transformation. -The hardware build transformations `ZynqBuild` and `VitisBuild` now execute all -necessary sub-transformations. 
For more control over the build process, the -transformations listed below can still be called individually. -** +.. note:: + In previous FINN releases it was necessary to step through the individual sub-steps for hardware build manually by calling each transformation. The hardware build transformation `ZynqBuild` now executes all necessary sub-transformations. For more control over the build process, the transformations listed below can still be called individually. + Driver Generation ------------------ @@ -60,9 +59,7 @@ This is accomplished by the :py:mod:`finn.transformation.fpgadataflow.floorplan. and :py:mod:`finn.transformation.fpgadataflow.create_dataflow_partition.CreateDataflowPartition` transformations. -.. note:: **For Vitis, each partition will be compiled as a separate kernel, -and linked together afterwards. For Zynq, each partition will become an IP -block. ** +.. note:: For Vitis, each partition will be compiled as a separate kernel, and linked together afterwards. For Zynq, each partition will become an IP block. FIFO Insertion and IP Generation diff --git a/docs/finn/img/repo-structure.png b/docs/finn/img/repo-structure.png index 05031ff9a5500c3302a36ea88309b3707bc5d108..704e5e5bdab8d51d88f5a18893153b5c0827f755 100644 Binary files a/docs/finn/img/repo-structure.png and b/docs/finn/img/repo-structure.png differ diff --git a/docs/finn/index.rst b/docs/finn/index.rst index 751b105bb4ec35c880664e85a9550207e8a1f076..c13bf81cec949498fd6ebdf971b23535c47f3ef1 100644 --- a/docs/finn/index.rst +++ b/docs/finn/index.rst @@ -33,9 +33,7 @@ More FINN Resources * `The FINN examples repository <https://github.com/Xilinx/finn-examples>`_ -* `List of publications <https://github.com/Xilinx/finn/blob/master/docs/publications.md>`_ - -* `Roadmap <https://github.com/Xilinx/finn/projects/1>`_ +* `List of publications <https://xilinx.github.io/finn/publications>`_ .. toctree:: :maxdepth: 5 diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst index e28874145d6d61232b0d63b0e53e4dd5dcdc4cfc..0b33affc76484d2175a336b188661550731ca1ab 100644 --- a/docs/finn/internals.rst +++ b/docs/finn/internals.rst @@ -1,8 +1,8 @@ .. _internals: -********* +********** Internals -********* +********** Intermediate Representation: QONNX and FINN-ONNX ================================================ @@ -14,16 +14,18 @@ FINN uses `ONNX <https://github.com/onnx/onnx>`_ as an intermediate representati Custom Quantization Annotations =============================== -ONNX does not support datatypes smaller than 8-bit integers, whereas in FINN we are interested in smaller integers down to ternary and bipolar. To make this work, FINN uses the quantization_annotation field in ONNX to annotate tensors with their FINN DataType (:py:mod:`qonnx.core.datatype.DataType`) information. However, all tensors are expected to use single-precision floating point (float32) storage in FINN. This means we store even a 1-bit value as floating point for the purposes of representation. The FINN compiler flow is responsible for eventually producing a packed representation for the target hardware, where the 1-bit is actually stored as 1-bit. +ONNX does not support datatypes smaller than 8-bit integers, whereas in FINN we are interested in smaller integers down to ternary and bipolar. To make this work, FINN-ONNX uses the quantization_annotation field in ONNX to annotate tensors with their FINN DataType (:py:mod:`qonnx.core.datatype.DataType`) information. 
However, all tensors are expected to use single-precision floating point (float32) storage in FINN. This means we store even a 1-bit value as floating point for the purposes of representation. The FINN compiler flow is responsible for eventually producing a packed representation for the target hardware, where the 1-bit is actually stored as 1-bit. Note that FINN uses floating point tensors as a carrier data type to represent integers. Floating point arithmetic can introduce rounding errors, e.g. (int_num * float_scale) / float_scale is not always equal to int_num. When using the custom ONNX execution flow, FINN will attempt to sanitize any rounding errors for integer tensors. See (:py:mod:`qonnx.util.basic.sanitize_quant_values`) for more information. This behavior can be disabled (not recommended!) by setting the environment variable SANITIZE_QUANT_TENSORS=0. +.. note:: In QONNX the quantization is represented differently, for details please check the `QONNX repository <https://github.com/fastmachinelearning/qonnx>`_ . + Custom Operations/Nodes ======================= -FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn.*" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look in the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly in the module :py:mod:`finn.custom_op`. +FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn.*" or domain="qonnx.*" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look in the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly in the module :py:mod:`finn.custom_op`. .. note:: See the description of `this PR <https://github.com/Xilinx/finn-base/pull/6>`_ for more on how the operator wrapper library is organized. @@ -118,7 +120,7 @@ As mentioned above there are FINN DataTypes additional to the container datatype # set tensor datatype of third tensor in model tensor list from qonnx.core.datatype import DataType - finn_dtype = DataType.BIPOLAR + finn_dtype = DataType["BIPOLAR"] model.set_tensor_datatype(tensor_list[2], finn_dtype) ModelWrapper contains two helper functions for tensor initializers, one to determine the current initializer and one to set the initializer of a tensor. If there is no initializer, None is returned. @@ -147,15 +149,17 @@ A transformation passes changes (transforms) the given model, it gets the model .. _mem_mode: MatrixVectorActivation *mem_mode* -=========================== +================================== -FINN supports two types of the so-called *mem_mode* attrıbute for the node MatrixVectorActivation. This mode controls how the weight values are accessed during the execution. That means the mode setting has direct influence on the resulting circuit. 
Currently two settings for the *mem_mode* are supported in FINN: +FINN supports three types of the so-called *mem_mode* attribute for the node MatrixVectorActivation. This mode controls how the weight values are accessed during the execution. That means the mode setting has direct influence on the resulting circuit. Currently three settings for the *mem_mode* are supported in FINN: * "const" * "decoupled" -The following picture shows the idea behind the two modes. +* "external" + +The following picture shows the idea behind the "const" and "decoupled" mode. .. image:: img/mem_mode.png :scale: 55% diff --git a/docs/finn/nw_prep.rst b/docs/finn/nw_prep.rst index 8d0403fc9bb6a45fae60f14c0fb0acf862792abb..566eda5bac38855e9ed8edfdf53193bb6c025256 100644 --- a/docs/finn/nw_prep.rst +++ b/docs/finn/nw_prep.rst @@ -17,7 +17,7 @@ Various transformations are involved in the network preparation. The following i Tidy-up transformations ======================= -These transformations do not appear in the diagram above, but are applied in many steps in the FINN flow to postprocess the model after a transformation and/or prepare it for the next transformation. They ensure that all information is set and behave like a "tidy-up". These transformations are the following: +These transformations do not appear in the diagram above, but are applied in many steps in the FINN flow to postprocess the model after a transformation and/or prepare it for the next transformation. They ensure that all information is set and behave like a "tidy-up". These transformations are located in the `QONNX repository <https://github.com/fastmachinelearning/qonnx>`_ and can be imported: * :py:mod:`qonnx.transformation.general.GiveReadableTensorNames` and :py:mod:`qonnx.transformation.general.GiveUniqueNodeNames` @@ -35,7 +35,7 @@ After this transformation the ONNX model is streamlined and contains now custom Convert to HLS Layers ===================== -Pairs of binary XNORPopcountMatMul layers are converted to MatrixVectorActivation layers and following Multithreshold layers are absorbed into the Matrix-Vector-Activate-Unit (MVAU). The result is a model consisting of a mixture of HLS and non-HLS layers. For more details, see :py:mod:`finn.transformation.fpgadataflow.convert_to_hls_layers`. The MVAU can be implemented in two different modes, *const* and *decoupled*, see chapter :ref:`mem_mode`. +In this step standard or custom layers are converted to HLS layers. HLS layers are layers that directly correspond to a finn-hlslib function call. For example pairs of binary XNORPopcountMatMul and MultiThreshold layers are converted to MatrixVectorActivation layers. The result is a model consisting of a mixture of HLS and non-HLS layers. For more details, see :py:mod:`finn.transformation.fpgadataflow.convert_to_hls_layers`. The MatrixVectorActivation layer can be implemented in three different modes, *const*, *decoupled* (see chapter :ref:`mem_mode`) and *external*. Dataflow Partitioning ===================== @@ -47,4 +47,4 @@ Folding To adjust the folding, the values for PE and SIMD can be increased to achieve also an increase in the performance. The result can be verified using the same simulation flow as for the network with maximum folding (*cppsim* using C++), for details please have a look at chapter :ref:`verification`. -The result is a network of HLS layers with desired folding and it can be passed to :ref:`vivado_synth`. +The result is a network of HLS layers with desired folding and it can be passed to :ref:`hw_build`. 
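The internals.rst hunk above switches the DataType example from attribute-style access (DataType.BIPOLAR) to dictionary-style lookup (DataType["BIPOLAR"]). The following is a minimal sketch of the updated usage, assuming a QONNX ModelWrapper loaded from a placeholder file named model.onnx; the file name and the choice of the first graph input are illustrative and not taken from the diff:

    from qonnx.core.datatype import DataType
    from qonnx.core.modelwrapper import ModelWrapper

    # "model.onnx" is a placeholder path; any FINN/QONNX ONNX graph would do
    model = ModelWrapper("model.onnx")

    # old style (pre-QONNX migration): finn_dtype = DataType.BIPOLAR
    # new style: DataType members are looked up by name
    finn_dtype = DataType["BIPOLAR"]

    # annotate the first graph input with the FINN datatype and read it back
    inp_name = model.graph.input[0].name
    model.set_tensor_datatype(inp_name, finn_dtype)
    print(model.get_tensor_datatype(inp_name))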
diff --git a/docs/finn/source_code/finn.analysis.rst b/docs/finn/source_code/finn.analysis.rst index 1de42ac32bc62ce71e039f63168302b22711f454..f2321dbee7ee0ba98d7b982202ae4918e0973489 100644 --- a/docs/finn/source_code/finn.analysis.rst +++ b/docs/finn/source_code/finn.analysis.rst @@ -15,26 +15,26 @@ Submodules Analysis Passes =============== -finn.analysis.base +qonnx.analysis.base ----------------------------- -.. automodule:: finn.analysis.base +.. automodule:: qonnx.analysis.base :members: :undoc-members: :show-inheritance: -finn.analysis.inference\_cost ------------------------------ +qonnx.analysis.inference\_cost +------------------------------- -.. automodule:: finn.analysis.inference_cost +.. automodule:: qonnx.analysis.inference_cost :members: :undoc-members: :show-inheritance: -finn.analysis.topology +qonnx.analysis.topology ----------------------------- -.. automodule:: finn.analysis.topology +.. automodule:: qonnx.analysis.topology :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst index 2e2a8532c6419198c5075a08bef5207b39d4658b..4e3de458e153871d1d5969442af5940dc1673ecd 100644 --- a/docs/finn/source_code/finn.core.rst +++ b/docs/finn/source_code/finn.core.rst @@ -5,7 +5,7 @@ Core Modules ======= -finn.core.data\_layout +qonnx.core.data\_layout ------------------------- .. automodule:: qonnx.core.data_layout @@ -21,10 +21,10 @@ qonnx.core.datatype :undoc-members: :show-inheritance: -finn.core.execute\_custom\_node +qonnx.core.execute\_custom\_node -------------------------------------- -.. automodule:: finn.core.execute_custom_node +.. automodule:: qonnx.core.execute_custom_node :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index 7de038248d418e1964effd7678bc1cad4cb48c14..cc56ea603e589d7000fe5b2b2943e67cdb90c884 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -22,7 +22,7 @@ finn.custom\_op.fpgadataflow.addstreams\_batch :show-inheritance: finn.custom\_op.fpgadataflow.channelwise\_op\_batch ------------------------------------------------ +----------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.channelwise_op_batch :members: @@ -55,7 +55,7 @@ finn.custom\_op.fpgadataflow.downsampler :show-inheritance: finn.custom\_op.fpgadataflow.duplicatestreams\_batch ------------------------------------------------ +------------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.duplicatestreams_batch :members: @@ -71,7 +71,7 @@ finn.custom\_op.fpgadataflow.fmpadding\_batch :show-inheritance: finn.custom\_op.fpgadataflow.globalaccpool\_batch ------------------------------------------------ +--------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.globalaccpool_batch :members: @@ -160,7 +160,7 @@ finn.custom\_op.fpgadataflow.templates :show-inheritance: finn.custom\_op.fpgadataflow.thresholding\_batch ------------------------------------------------ +------------------------------------------------------- .. 
automodule:: finn.custom_op.fpgadataflow.thresholding_batch :members: @@ -185,7 +185,7 @@ finn.custom\_op.fpgadataflow.upsampler :show-inheritance: finn.custom\_op.fpgadataflow.vectorvectoractivation ------------------------------------------------ +----------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.vectorvectoractivation :members: diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst index 3e91eff9a16b3dedf0e1682c79d6f8022ebe0db8..20d90a7bb596d6ce5638d9b2d9bae8a5c7e5c723 100644 --- a/docs/finn/source_code/finn.custom_op.rst +++ b/docs/finn/source_code/finn.custom_op.rst @@ -17,12 +17,12 @@ Custom Op Nodes Base Class ---------- -.. automodule:: finn.custom_op.base +.. automodule:: qonnx.custom_op.base :members: :undoc-members: :show-inheritance: -finn.custom\_op.registry +qonnx.custom\_op.registry ------------------------- .. automodule:: qonnx.custom_op.registry diff --git a/docs/finn/source_code/finn.rst b/docs/finn/source_code/finn.rst index 607ac636a43d88150493eebb86b1e568b38b681a..5547a46623d4cd80b82ac334ae082f9a99b7e8dd 100644 --- a/docs/finn/source_code/finn.rst +++ b/docs/finn/source_code/finn.rst @@ -3,7 +3,7 @@ FINN API ******** The FINN sources are divided into different modules. They are listed below. -.. note:: **Some of these functions and modules are located in the `finn-base` repository.** +.. note:: **Some of these functions and modules are located in the `qonnx` repository.** Modules ======= diff --git a/docs/finn/source_code/finn.transformation.qonnx.rst b/docs/finn/source_code/finn.transformation.qonnx.rst index 8320e19efb81dd5a52f750e22e280f41070bf48c..1332639b1d694ce7c230b8926edfc82f2521e580 100644 --- a/docs/finn/source_code/finn.transformation.qonnx.rst +++ b/docs/finn/source_code/finn.transformation.qonnx.rst @@ -1,4 +1,4 @@ -*********************** +************************ Transformation - QONNX ************************ diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst index acd09993472d56bc3b9c4db49042601e4cef7547..6a28eeedb2aa547ba80677864ae9fb8c6aa64097 100644 --- a/docs/finn/source_code/finn.transformation.rst +++ b/docs/finn/source_code/finn.transformation.rst @@ -25,7 +25,7 @@ Base Class :undoc-members: :show-inheritance: -finn.transformation.batchnorm\_to\_affine +qonnx.transformation.batchnorm\_to\_affine ------------------------------------------------ .. automodule:: qonnx.transformation.batchnorm_to_affine @@ -33,55 +33,55 @@ finn.transformation.batchnorm\_to\_affine :undoc-members: :show-inheritance: -finn.transformation.bipolar\_to\_xnor +qonnx.transformation.bipolar\_to\_xnor -------------------------------------------- -.. automodule:: finn.transformation.bipolar_to_xnor +.. automodule:: qonnx.transformation.bipolar_to_xnor :members: :undoc-members: :show-inheritance: -finn.transformation.change\_3d\_tensors\_to\_4d +qonnx.transformation.change\_3d\_tensors\_to\_4d ------------------------------------------------ -.. automodule:: finn.transformation.change_3d_tensors_to_4d +.. automodule:: qonnx.transformation.change_3d_tensors_to_4d :members: :undoc-members: :show-inheritance: -finn.transformation.change\_datalayout +qonnx.transformation.change\_datalayout -------------------------------------------- -.. automodule:: finn.transformation.change_datalayout +.. 
automodule:: qonnx.transformation.change_datalayout :members: :undoc-members: :show-inheritance: -finn.transformation.create\_generic\_partitions +qonnx.transformation.create\_generic\_partitions ------------------------------------------------ -.. automodule:: finn.transformation.create_generic_partitions +.. automodule:: qonnx.transformation.create_generic_partitions :members: :undoc-members: :show-inheritance: -finn.transformation.double\_to\_single\_float +qonnx.transformation.double\_to\_single\_float ---------------------------------------------------- -.. automodule:: finn.transformation.double_to_single_float +.. automodule:: qonnx.transformation.double_to_single_float :members: :undoc-members: :show-inheritance: -finn.transformation.extend\_partition +qonnx.transformation.extend\_partition ------------------------------------------ -.. automodule:: finn.transformation.extend_partition +.. automodule:: qonnx.transformation.extend_partition :members: :undoc-members: :show-inheritance: -finn.transformation.extract\_conv\_bias +qonnx.transformation.extract\_conv\_bias ------------------------------------------ .. automodule:: qonnx.transformation.extract_conv_bias @@ -90,7 +90,7 @@ finn.transformation.extract\_conv\_bias :show-inheritance: -finn.transformation.fold\_constants +qonnx.transformation.fold\_constants ------------------------------------------ .. automodule:: qonnx.transformation.fold_constants @@ -98,7 +98,7 @@ finn.transformation.fold\_constants :undoc-members: :show-inheritance: -finn.transformation.gemm\_to\_matmul +qonnx.transformation.gemm\_to\_matmul ------------------------------------------ .. automodule:: qonnx.transformation.gemm_to_matmul @@ -114,7 +114,7 @@ qonnx.transformation.general :undoc-members: :show-inheritance: -finn.transformation.infer\_data\_layouts +qonnx.transformation.infer\_data\_layouts ------------------------------------------- .. automodule:: qonnx.transformation.infer_data_layouts @@ -122,7 +122,7 @@ finn.transformation.infer\_data\_layouts :undoc-members: :show-inheritance: -finn.transformation.infer\_datatypes +qonnx.transformation.infer\_datatypes ------------------------------------------- .. automodule:: qonnx.transformation.infer_datatypes @@ -130,7 +130,7 @@ finn.transformation.infer\_datatypes :undoc-members: :show-inheritance: -finn.transformation.infer\_shapes +qonnx.transformation.infer\_shapes ---------------------------------------- .. automodule:: qonnx.transformation.infer_shapes @@ -138,7 +138,7 @@ finn.transformation.infer\_shapes :undoc-members: :show-inheritance: -finn.transformation.insert\_topk +qonnx.transformation.insert\_topk --------------------------------------- .. automodule:: qonnx.transformation.insert_topk @@ -146,15 +146,15 @@ finn.transformation.insert\_topk :undoc-members: :show-inheritance: -finn.transformation.lower\_convs\_to\_matmul +qonnx.transformation.lower\_convs\_to\_matmul --------------------------------------------------- -.. automodule:: finn.transformation.lower_convs_to_matmul +.. automodule:: qonnx.transformation.lower_convs_to_matmul :members: :undoc-members: :show-inheritance: -finn.transformation.make\_input\_chanlast +qonnx.transformation.make\_input\_chanlast ------------------------------------------ .. automodule:: qonnx.transformation.make_input_chanlast @@ -162,7 +162,7 @@ finn.transformation.make\_input\_chanlast :undoc-members: :show-inheritance: -finn.transformation.merge\_onnx\_models +qonnx.transformation.merge\_onnx\_models ---------------------------------------- .. 
automodule:: qonnx.transformation.merge_onnx_models diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index aec42ae905445947a59cb256f55eda2070347edf..8dffa016327c3bbe50f21278c859c83556b2b213 100644 --- a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -5,24 +5,33 @@ Util Utility Modules =============== -finn.util.basic +qonnx.util.basic ---------------------- -.. automodule:: finn.util.basic +.. automodule:: qonnx.util.basic :members: :undoc-members: :show-inheritance: + qonnx.util.config ----------------- +-------------------- .. automodule:: qonnx.util.config :members: :undoc-members: :show-inheritance: +finn.util.basic +---------------------- + +.. automodule:: finn.util.basic + :members: + :undoc-members: + :show-inheritance: + finn.util.create ----------------- +------------------ .. automodule:: finn.util.create :members: @@ -63,11 +72,10 @@ finn.util.imagenet :undoc-members: :show-inheritance: - -finn.util.onnx +qonnx.util.onnx --------------------- -.. automodule:: finn.util.onnx +.. automodule:: qonnx.util.onnx :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.general.rst b/docs/finn/source_code/qonnx.custom_op.general.rst similarity index 75% rename from docs/finn/source_code/finn.custom_op.general.rst rename to docs/finn/source_code/qonnx.custom_op.general.rst index dfca29a8f3b6836e2af3fb566e0394eb920c2f6e..84609971edf4ce22696ca131bb9fc4494b3a12c6 100644 --- a/docs/finn/source_code/finn.custom_op.general.rst +++ b/docs/finn/source_code/qonnx.custom_op.general.rst @@ -5,7 +5,7 @@ Custom Op - General General Custom Ops =================== -finn.custom\_op.general.bipolar_quant +qonnx.custom\_op.general.bipolar_quant -------------------------------------- .. automodule:: qonnx.custom_op.general.bipolar_quant @@ -13,15 +13,15 @@ finn.custom\_op.general.bipolar_quant :undoc-members: :show-inheritance: -finn.custom\_op.general.debugmarker ------------------------------------ +qonnx.custom\_op.general.debugmarker +------------------------------------ .. automodule:: qonnx.custom_op.general.debugmarker :members: :undoc-members: :show-inheritance: -finn.custom\_op.general.genericpartition +qonnx.custom\_op.general.genericpartition ----------------------------------------- .. automodule:: qonnx.custom_op.general.genericpartition @@ -29,15 +29,15 @@ finn.custom\_op.general.genericpartition :undoc-members: :show-inheritance: -finn.custom\_op.general.im2col ------------------------------- +qonnx.custom\_op.general.im2col +------------------------------- .. automodule:: qonnx.custom_op.general.im2col :members: :undoc-members: :show-inheritance: -finn.custom\_op.general.maxpoolnhwc +qonnx.custom\_op.general.maxpoolnhwc ------------------------------------ .. automodule:: qonnx.custom_op.general.maxpoolnhwc @@ -45,7 +45,7 @@ finn.custom\_op.general.maxpoolnhwc :undoc-members: :show-inheritance: -finn.custom\_op.general.multithreshold +qonnx.custom\_op.general.multithreshold --------------------------------------- .. automodule:: qonnx.custom_op.general.multithreshold @@ -53,7 +53,7 @@ finn.custom\_op.general.multithreshold :undoc-members: :show-inheritance: -finn.custom\_op.general.quant +qonnx.custom\_op.general.quant ------------------------------ .. 
automodule:: qonnx.custom_op.general.quant @@ -61,15 +61,15 @@ finn.custom\_op.general.quant :undoc-members: :show-inheritance: -finn.custom\_op.general.quantavgpool2d --------------------------------------- +qonnx.custom\_op.general.quantavgpool2d +--------------------------------------- .. automodule:: qonnx.custom_op.general.quantavgpool2d :members: :undoc-members: :show-inheritance: -finn.custom\_op.general.trunc +qonnx.custom\_op.general.trunc ------------------------------ .. automodule:: qonnx.custom_op.general.trunc @@ -77,7 +77,7 @@ finn.custom\_op.general.trunc :undoc-members: :show-inheritance: -finn.custom\_op.general.xnorpopcount +qonnx.custom\_op.general.xnorpopcount ------------------------------------- .. automodule:: qonnx.custom_op.general.xnorpopcount diff --git a/docs/finn/tutorials.rst b/docs/finn/tutorials.rst index 4c260ecfb1b25448b4b8e1fe71d8c257cd7e31ff..110f77c5b10d2415ac2d2ff7b716567ec5cb76fa 100644 --- a/docs/finn/tutorials.rst +++ b/docs/finn/tutorials.rst @@ -5,7 +5,7 @@ Tutorials ********* FINN provides several Jupyter notebooks that can help to get familiar with the basics, the internals and the end-to-end flow in FINN. -All Jupyter notebooks can be found in the repo in the `notebook folder <https://github.com/Xilinx/finn/tree/master/notebooks>`_. +All Jupyter notebooks can be found in the repo in the `notebook folder <https://github.com/Xilinx/finn/tree/main/notebooks>`_. Basics ====== @@ -23,7 +23,7 @@ The notebooks in this folder should give a basic insight into FINN, how to get s End-to-End Flow =============== -There are two groups of notebooks currently available under `the end2end_example directory <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example>`_ : +There are two groups of notebooks currently available under `the end2end_example directory <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example>`_ : * ``cybersecurity`` shows how to train a quantized MLP with Brevitas and deploy it with FINN using the :ref:`command_line` build system. diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst index 7c636941ad5b8d3d95a152f78e883f6f4782a2f0..e1a9ac4b31ebaebbc3dfcb672b5ead2c0fd8a806 100644 --- a/docs/finn/verification.rst +++ b/docs/finn/verification.rst @@ -8,7 +8,7 @@ Functional Verification :scale: 70% :align: center -This part of the flow is covered by the Jupyter notebook about the verification of a simple fully-connected network, which you can find in the `end2end notebook folder <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb>`_. +This part of the flow is covered by the Jupyter notebook about the verification of a simple fully-connected network, which you can find in the `end2end notebook folder <https://github.com/Xilinx/finn/blob/main/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb>`_. When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS custom nodes. 
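A minimal sketch of calling that execution entry point directly, assuming a hypothetical model file and input tensor name (``execute_onnx`` is the same function used by the verification notebooks further down):

    from qonnx.core.modelwrapper import ModelWrapper
    import finn.core.onnx_exec as oxe

    model = ModelWrapper("example_model.onnx")         # hypothetical file name
    input_dict = {"global_in": input_tensor_npy}       # hypothetical input name and values
    output_dict = oxe.execute_onnx(model, input_dict)
    # output_dict maps each graph output name to the computed numpy array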
A single node can be executed using one or more of the following methods: diff --git a/docs/finn/vivado_synth.rst b/docs/finn/vivado_synth.rst deleted file mode 100644 index ca8b8ad655df7b227441f020aca6d629ce1b6afc..0000000000000000000000000000000000000000 --- a/docs/finn/vivado_synth.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _vivado_synth: - -************************* -Vivado HLS and Vivado IPI -************************* - -.. image:: img/vivado-synth.png - :scale: 70% - :align: center - -In this step the system is handed over to Vivado. To do this, IP blocks are created from each layer using Vivado HLS and then stitched together using Vivado IP Integrator. This creates a Vivado design of the entire network. The design can be verified using `PyVerilator <https://github.com/maltanar/pyverilator>`_ either on the network with the unstitched IP blocks or on the stitched IP. The generated verilog files are passed to PyVerilator and in this way the model can be emulated. This procedure is called *rtlsim* in FINN flow and details can be found in the chapter :ref:`verification`. - -Once the model is in the form of a stitched IP, it can be passed to the next flow step :ref:`pynq_deploy`. diff --git a/fetch-repos.sh b/fetch-repos.sh index 8e147785c92e964863ad1bcd9662862050791547..74d910478e83ce9a18000350c04e213a3e1f381e 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,13 +27,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="4a4826641db8d34619d31eac155fe95af11692eb" +QONNX_COMMIT="398a0ecfcb32407c0a3df39246cf6d2bca02886c" FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="bea971285a506cd4c2032f133a8ec23a15f935e1" -OMX_COMMIT="a97f0bf145a2f7e57ca416ea76c9e45df4e9aa37" +HLSLIB_COMMIT="79d7c61fbe318bfcd56e3c35bbfb774995a7870c" +OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" EXP_BOARD_FILES_MD5="30eecc497c31050bd46d10ea20eba232" diff --git a/notebooks/advanced/0_custom_analysis_pass.ipynb b/notebooks/advanced/0_custom_analysis_pass.ipynb index a6e06921516fd624ad9e8e1884677c7791f5734a..a4ad32ed7f547a4c035b5cbe4da11ebe2565883a 100644 --- a/notebooks/advanced/0_custom_analysis_pass.ipynb +++ b/notebooks/advanced/0_custom_analysis_pass.ipynb @@ -151,7 +151,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/advanced/1_custom_transformation_pass.ipynb b/notebooks/advanced/1_custom_transformation_pass.ipynb index 7f78bea9e57e7145a75cd8c9f822ac5f57bcdf5f..e40a534af56352712f20bfb250112aeacfee278f 100644 --- a/notebooks/advanced/1_custom_transformation_pass.ipynb +++ b/notebooks/advanced/1_custom_transformation_pass.ipynb @@ -56,9 +56,11 @@ "source": [ "When the function is called, the model, the name of the transformation and, if required, the flag make_deepcopy are passed. It is also possible not to make a copy of the model. In this case `make_deepcopy` must be set to False. Then the branch `if make_deepcopy:` would not be taken and no copy of the model would be made. 
\n", "\n", - "The unchanged model is first passed to the variable `transformed_model` to pass this variable on to the transformation later. \n", + "Additionally, the attribute `fix_float64` of the model is checked and if it is set to `True` all double values are converted to float. This assures a correct functionality of the model.\n", "\n", - "`model_was_changed` indicates whether the transformation needs to be applied more then once. Because it needs to be applied at least one time `model_was_changed` is first set to True and then depending on the return values of the transformation function the transformation can be applied more then once. \n", + "The unchanged model is passed to the variable `transformed_model` to pass this variable on to the transformation later. \n", + "\n", + "`model_was_changed` indicates whether the transformation needs to be applied more than once. Because it needs to be applied at least one time `model_was_changed` is first set to True and then depending on the return values of the transformation function the transformation can be applied more then once. \n", "\n", "**Important**: Due to the structure of this function, `model_was_changed` must be set to False at some point. Otherwise the loop is infinite.\n", " \n", @@ -205,7 +207,7 @@ "source": [ "Transformations that are to be executed in parallel must have the method `applyNodeLocal()` implemented. Please note that the transformation is only executed on a single node, the parallel transformations do not have access to the entire model or tensors. Parallelization has the advantage that especially time-consuming transformations such as compilation can be executed more effectively. \n", "\n", - "To control the degree of parallelization the argument `num_workers` can be specified. When the Docker container is started, the env variable `NUM_DEFAULT_WORKERS` is set to 1 by default, this can be increased depending on the system. You can also set the number of workers manually to a specific value when calling a transformation that allows parallelization. If the value is set to 0, all available CPU cores are used.\n", + "To control the degree of parallelization the argument `num_workers` can be specified. When the Docker container is started, the env variable `NUM_DEFAULT_WORKERS` is set to 4 by default, this can be increased or decreased depending on the system. You can also set the number of workers manually to a specific value when calling a transformation that allows parallelization. If the value is set to 0, all available CPU cores are used.\n", "\n", "In the following we want to take a closer look at the implementation using the compile transformation as example." ] @@ -245,7 +247,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb index e3b5d8cf0bd01bf2588331d346e706b3a36fed10..c27f8bdca788e6404fbc01e226b06e8cfaaba066 100644 --- a/notebooks/advanced/2_custom_op.ipynb +++ b/notebooks/advanced/2_custom_op.ipynb @@ -6,7 +6,7 @@ "source": [ "# Introduction to custom ops in FINN\n", "\n", - "Suppose that you want to introduce a new (custom) operation type into the FINN. Custom operations in FINN are useful for a variety of things ranging from code generation to functional verification. 
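Pulling the transformation pattern described above together, a compact sketch (the base-class import assumes the qonnx package layout used elsewhere in these docs; the transformation itself is a made-up no-op for illustration):

    from qonnx.transformation.base import Transformation

    class InspectOnly(Transformation):
        """Made-up example: looks at the graph and changes nothing."""

        def apply(self, model):
            graph_modified = False
            # ... edit model.graph here and set graph_modified = True if you did ...
            # returning False ends the loop in transform(); True triggers another pass
            return (model, graph_modified)

    # applied like any built-in transformation; make_deepcopy=False skips the model copy
    model = model.transform(InspectOnly(), make_deepcopy=False)

For the parallel variant mentioned above, the corresponding base class is `NodeLocalTransformation` in the same module, which accepts `num_workers` and calls `applyNodeLocal()` once per node.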
This is achieved by creating a new Python module for your custom operation that fulfills certain interface specifications.\n", + "Suppose that you want to introduce a new (custom) operation type into the FINN compiler. Custom operations in FINN are useful for a variety of things ranging from code generation to functional verification. This is achieved by creating a new Python module for your custom operation that fulfills certain interface specifications.\n", "\n", "One thing to point out before we start is that **these custom operations are generic** and not really tied to e.g. Vivado HLS or few-bit quantization. As you will see in this notebook, it's possible to provide arbitrary Python/C/C++/... execution and code generation paths for custom nodes.\n", "\n", @@ -32,7 +32,7 @@ "metadata": {}, "outputs": [], "source": [ - "from finn.custom_op.base import CustomOp\n", + "from qonnx.custom_op.base import CustomOp\n", "dir(CustomOp)" ] }, @@ -653,13 +653,6 @@ "ret = execute_onnx(mixedop_graph_new, inp_dict)\n", "ret" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/basics/0_how_to_work_with_onnx.ipynb b/notebooks/basics/0_how_to_work_with_onnx.ipynb index a4ea75fe38aac6720671a9b51de0ef31951cccb0..514efd1693d667af896e89902a264ea7e6e01da7 100644 --- a/notebooks/basics/0_how_to_work_with_onnx.ipynb +++ b/notebooks/basics/0_how_to_work_with_onnx.ipynb @@ -313,7 +313,7 @@ "source": [ "In the following we assume that we do not know the appearance of the model, so we first try to identify whether there are two consecutive adders in the graph and then convert them into a sum node. \n", "\n", - "Here we make use of FINN. FINN provides a thin wrapper around the model which provides several additional helper functions to manipulate the graph. The code can be found [here](https://github.com/Xilinx/finn/blob/master/src/finn/core/modelwrapper.py)." + "Here we make use of FINN. FINN provides a thin wrapper around the model which provides several additional helper functions to manipulate the graph. The so called `ModelWrapper` can be found in the QONNX repository which contains a lot of functionality that is used by FINN, you can find it [here](https://github.com/fastmachinelearning/qonnx/blob/main/src/qonnx/core/modelwrapper.py)." ] }, { @@ -395,36 +395,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Among other helper functions, `ModelWrapper` offers two functions that can help to determine the preceding and succeeding node of a node. However, these functions are not getting a node as input, but can determine the consumer or producer of a tensor. We write two functions that uses these helper functions to determine the previous and the next node of a node." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def find_predecessor(model, node):\n", - " predecessors = []\n", - " for i in range(len(node.input)):\n", - " producer = model.find_producer(node.input[i])\n", - " predecessors.append(producer)\n", - " return predecessors\n", - " \n", - "\n", - "def find_successor(model, node):\n", - " successors = []\n", - " for i in range(len(node.output)):\n", - " consumer = model.find_consumer(node.output[i])\n", - " successors.append(consumer)\n", - " return successors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The first function uses `find_producer` from `ModelWrapper` to create a list of the producers of the inputs of the given node. So the returned list is indirectly filled with the predecessors of the node. The second function works in a similar way, `find_consumer` from `ModelWrapper` is used to find the consumers of the output tensors of the node and so a list with the successors can be created. " + "Among other helper functions, `ModelWrapper` offers two functions that can help to determine the preceding and succeeding node of a node: `find_direct_successors` and `find_direct_predecessors`. So we can use one of them to define a function to find adder pairs." ] }, { @@ -436,7 +407,7 @@ "def adder_pair(model, node):\n", " adder_pairs = []\n", " node_pair = []\n", - " successor_list = find_successor(model, node)\n", + " successor_list = model.find_direct_successors(node)\n", " \n", " for successor in successor_list:\n", " if successor.op_type == \"Add\":\n", @@ -444,15 +415,14 @@ " node_pair.append(successor)\n", " adder_pairs.append((node_pair))\n", " node_pair = []\n", - " return adder_pairs\n", - " " + " return adder_pairs " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The function gets a node and the model as input. Two empty lists are created to be filled with a list of adder node pairs that can be returned as result of the function. Then the function `find_successor` is used to return all of the successors of the node. If one of the successors is an adder node, the node is saved in `node_pair` together with the successive adder node and put in the list `adder_pairs`. Then the temporary list is cleaned and can be filled with the next adder node pair. Since it is theoretically possible for an adder node to have more than one subsequent adder node, a list of lists is created. This list of the node with all its successive adder nodes is returned.\n", + "The function gets a node and the model as input. Two empty lists are created to be filled with a list of adder node pairs that can be returned as result of the function. Then the function `find_direct_successors` is used to return all of the successors of the node. If one of the successors is an adder node, the node is saved in `node_pair` together with the successive adder node and put in the list `adder_pairs`. Then the temporary list is cleaned and can be filled with the next adder node pair. Since it is theoretically possible for an adder node to have more than one subsequent adder node, a list of lists is created. This list of the node with all its successive adder nodes is returned.\n", "\n", "So now we can find out which adder node has an adder node as successor. Since the model is known, one adder pair (Add1+Add2) should be found when applying the function to the previously determined adder node list (`add_nodes`)." 
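The predecessor-side helper works the same way in the other direction; a tiny sketch, assuming `model` is the `ModelWrapper` instance and `node` is any entry of `model.graph.node`:

    succs = model.find_direct_successors(node)    # nodes consuming this node's outputs
    preds = model.find_direct_predecessors(node)  # nodes producing this node's inputs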
] @@ -522,7 +492,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The summary node can be created with this information." + "The sum node can be created with this information." ] }, { @@ -642,7 +612,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/basics/1_brevitas_network_import.ipynb b/notebooks/basics/1_brevitas_network_import.ipynb index ecd3c89c1afb12593ec68bef9016f9e2bf083dde..5fb29754dc0ad56c2d07c783cf43102975b1621b 100644 --- a/notebooks/basics/1_brevitas_network_import.ipynb +++ b/notebooks/basics/1_brevitas_network_import.ipynb @@ -80,7 +80,7 @@ "from pkgutil import get_data\n", "import onnx\n", "import onnx.numpy_helper as nph\n", - "raw_i = get_data(\"finn.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", + "raw_i = get_data(\"qonnx.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "input_tensor = onnx.load_tensor_from_string(raw_i)\n", "input_tensor_npy = nph.to_array(input_tensor)\n", "input_tensor_pyt = torch.from_numpy(input_tensor_npy).float()\n", diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index b628fa455a27649791c2b6f72409b85f71f7c704..a2747e3921dc8e5a8427b4d5d9b7f143a57b018f 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -63,7 +63,7 @@ "from finn.util.visualization import showInNetron\n", "import os\n", " \n", - "build_dir = os.environ[\"FINN_ROOT\"]" + "build_dir = os.environ[\"FINN_BUILD_DIR\"]" ] }, { @@ -120,7 +120,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can see that the network is composed of a repeating convolution-convolution-maxpool layer pattern to extract features using 3x3 convolution kernels (with weights binarized) and `Sign` activations, followed by fully connected layers acting as the classifier. Also notice the initial `MultiThreshold` layer at the beginning of the network, which is quantizing float inputs to 8-bit ones." + "You can see that the network is composed of a repeating convolution-convolution-maxpool layer pattern to extract features using 3x3 convolution kernels (with weights binarized), followed by fully connected layers acting as the classifier. Also notice the initial `MultiThreshold` layer at the beginning of the network, which is quantizing float inputs to 8-bit ones." ] }, { @@ -202,7 +202,9 @@ "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vivado HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n", "\n", "\n", - "To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. 
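A hedged sketch of what that combination of streamlining and convolution lowering typically looks like in code (module paths match the qonnx/finn split listed in the documentation sources above; the notebook's own sequence of cleanup transformations may differ):

    from finn.transformation.streamline import Streamline
    from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
    from qonnx.transformation.infer_data_layouts import InferDataLayouts

    model = model.transform(Streamline())          # collapse/absorb float scaling ops
    model = model.transform(LowerConvsToMatMul())  # express convs as im2col + matmul
    model = model.transform(InferDataLayouts())    # refresh layout annotations afterwards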
**The current implementation of streamlining is highly network-specific and may not work for your network if its topology is very different than the example network here. We hope to rectify this in future releases.**" + "To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. \n", + "\n", + "**The current implementation of streamlining is highly network-specific and may not work for your network if its topology is very different than the example network here. We hope to rectify this in future releases.**" ] }, { @@ -422,12 +424,37 @@ "metadata": {}, "outputs": [], "source": [ - "test_pynq_board = \"Pynq-Z2\"\n", + "test_pynq_board = \"Pynq-Z1\"\n", "target_clk_ns = 10\n", "\n", "from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild\n", "model = ModelWrapper(build_dir+\"/end2end_cnv_w1a1_folded.onnx\")\n", - "model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))\n", + "model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the `ZynqBuild` we run one additional transformation to generate a PYNQ driver for the accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver\n", + "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "model.save(build_dir + \"/end2end_cnv_w1a1_synth.onnx\")" ] }, @@ -437,7 +464,7 @@ "source": [ "## 5. Deployment and Remote Execution\n", "\n", - "Now that we're done with the hardware generation, we can generate a Python driver for accelerator and copy the necessary files onto our PYNQ board.\n", + "Now that we're done with the hardware generation, we can copy the necessary files onto our PYNQ board.\n", "\n", "**Make sure you've [set up the SSH keys for your PYNQ board](https://finn-dev.readthedocs.io/en/latest/getting_started.html#pynq-board-first-time-setup) before executing this step.**" ] @@ -452,7 +479,7 @@ "\n", "# set up the following values according to your own environment\n", "# FINN will use ssh to deploy and run the generated accelerator\n", - "ip = os.getenv(\"PYNQ_IP\", \"192.168.2.99\")\n", + "ip = \"192.168.2.99\"\n", "username = os.getenv(\"PYNQ_USERNAME\", \"xilinx\")\n", "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n", "port = os.getenv(\"PYNQ_PORT\", 22)\n", @@ -612,13 +639,6 @@ "source": [ "We see that the final top-1 accuracy is 84.19%, which is very close to the 84.22% reported on the [BNN-PYNQ accuracy table in Brevitas](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq). 
" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index 5501e030e28e3cbd52d226d7d9b8014974ca38a9..a6f05df30925250df1704afb6f9ff9dc7dc17dc0 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -50,7 +50,7 @@ "from finn.util.basic import make_build_dir\n", "import os\n", " \n", - "build_dir = os.environ[\"FINN_ROOT\"]" + "build_dir = os.environ[\"FINN_BUILD_DIR\"]" ] }, { @@ -70,7 +70,7 @@ "metadata": {}, "source": [ "## 1. Brevitas export <a id='brev_exp'></a>\n", - "FINN expects an ONNX model as input. This can be a model trained with [Brevitas](https://github.com/Xilinx/brevitas). Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several [example Brevitas networks](https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq). To show the FINN end-to-end flow, we'll use the TFC-w1a1 model as example network.\n", + "FINN expects an ONNX model as input. This can be a model trained with [Brevitas](https://github.com/Xilinx/brevitas). Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several [example Brevitas networks](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq). To show the FINN end-to-end flow, we'll use the TFC-w1a1 model as example network.\n", "\n", "First a few things have to be imported. Then the model can be loaded with the pretrained weights." ] @@ -93,7 +93,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The model was now exported, loaded with the pretrained weights and saved under the name \"lfc_w1_a1.onnx\".\n", + "The model was now exported, loaded with the pretrained weights and saved under the name \"tfc_w1_a1.onnx\".\n", "To visualize the exported model, Netron can be used. Netron is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties." ] }, @@ -110,7 +110,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have the model in .onnx format, we can work with it using FINN. For that FINN `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." + "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. 'ModelWrapper' is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx), this repository contains several functionality that is used in FINN." ] }, { @@ -248,7 +248,7 @@ "\n", "In FINN, we can bake some of these pre/postprocessing operatings into the graph, and in some cases these can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing. \n", "\n", - "We'll demonstrate this for our small image classification network as follows. 
Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/brevitas_examples/bnn_pynq/trainer.py#L85), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." + "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L104), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." ] }, { @@ -370,7 +370,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can see that the network has become simplified considerably compared to the previous step -- a lot of nodes have disappeared between the `MatMul` layers, and the `Sign` nodes have been replaced with `MultiThreshold` nodes instead. \n", + "You can see that the network has become simplified considerably compared to the previous step -- a lot of nodes have disappeared between the `MatMul` layers. \n", "\n", "**The current implementation of streamlining is highly network-specific and may not work for your network if its topology is very different than the example network here. We hope to rectify this in future releases.**\n", "\n", @@ -457,7 +457,7 @@ "source": [ "### Creating a Dataflow Partition <a id='dataflow_partition'></a>\n", "\n", - "In the graph above, you can see that there is a mixture of FINN HLS layers (MatrixVectorActivation) with regular ONNX layers (Reshape, Mul, Add). To create a bitstream, FINN needs a model with only HLS layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition:" + "In the graph above, you can see that there is a mixture of FINN HLS layers (MatrixVectorActivation and Thresholding_Batch) with one regular ONNX layers (Reshape). To create a bitstream, FINN needs a model with only HLS layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition." 
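A minimal sketch of that partitioning step and of retrieving the child model afterwards (the notebook's own cells may differ slightly in detail):

    from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition
    from qonnx.custom_op.registry import getCustomOp

    parent_model = model.transform(CreateDataflowPartition())
    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
    # the placeholder node stores the path to the extracted, dataflow-only graph
    dataflow_model_filename = getCustomOp(sdp_node).get_nodeattr("model")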
] }, { @@ -478,7 +478,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the MatrixVectorActivation instances have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:" + "We can see that the `MatrixVectorActivation` instances and the `Thresholding_Batch` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:" ] }, { @@ -498,7 +498,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see all the extracted `MatrixVectorActivation` instances have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it." + "We can see all the extracted `MatrixVectorActivation` instances and the `Thresholding_Batch` have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it." ] }, { @@ -518,14 +518,14 @@ "\n", "*Folding* in FINN describes how much a layer is time-multiplexed in terms of execution resources. There are several *folding factors* for each layer, controlled by the PE (parallelization over outputs) and SIMD (parallelization over inputs) parameters as described by the original [FINN paper](https://arxiv.org/pdf/1612.07119). The higher the PE and SIMD values are set, the faster the generated accelerator will run, and the more FPGA resources it will consume. \n", "\n", - "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a MatrixVectorActivation operation. This is where the Netron visualization helps us, in the above diagram we can see that the first four nodes are MatrixVectorActivation. So as an example we extract the first node." + "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a MatrixVectorActivation operation. This is where the Netron visualization helps us, in the above diagram we can see that the model contains four MatrixVectorActivation. So as an example we extract the second node of the graph." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/master/src/finn/custom_op/fpgadataflow/__init__.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." + "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/__init__.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." 
] }, { @@ -534,7 +534,7 @@ "metadata": {}, "outputs": [], "source": [ - "fc0 = model.graph.node[0]\n", + "fc0 = model.graph.node[1]\n", "fc0w = getCustomOp(fc0)\n", "\n", "print(\"CustomOp wrapper is of class \" + fc0w.__class__.__name__)\n", @@ -546,7 +546,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the PE and SIMD are listed as node attributes, as well as the depths of the FIFOs that will be inserted between consecutive layers, and all can be adjusted using `set_nodeattr` subject to certain constraints.\n", + "We can see that the PE and SIMD are listed as node attributes, as well as the depths of the FIFOs that will be inserted between consecutive layers, and all can be adjusted using `set_nodeattr` subject to certain constraints. There are also a lot of additional attributes that can be set for this node type.\n", "**In this notebook we are setting the folding factors and FIFO depths manually, but in a future version we will support determining the folding factors given an FPGA resource budget according to the analytical model from the [FINN-R paper](https://arxiv.org/pdf/1809.04570).**" ] }, @@ -664,6 +664,23 @@ "model = model.transform(ZynqBuild(platform = pynq_board, period_ns = target_clk_ns))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the `ZynqBuild` we run one additional transformation to generate a PYNQ driver for the accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver\n", + "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -810,7 +827,7 @@ "\n", "# set up the following values according to your own environment\n", "# FINN will use ssh to deploy and run the generated accelerator\n", - "ip = os.getenv(\"PYNQ_IP\", \"192.168.2.99\")\n", + "ip = \"192.168.2.99\"\n", "username = os.getenv(\"PYNQ_USERNAME\", \"xilinx\")\n", "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n", "port = os.getenv(\"PYNQ_PORT\", 22)\n", @@ -886,7 +903,7 @@ "import onnx.numpy_helper as nph\n", "import matplotlib.pyplot as plt\n", "\n", - "raw_i = get_data(\"finn.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", + "raw_i = get_data(\"qonnx.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "x = nph.to_array(onnx.load_tensor_from_string(raw_i))\n", "plt.imshow(x.reshape(28,28), cmap='gray')" ] @@ -948,7 +965,7 @@ "\n", "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n", "\n", - "**Ensure that your PYNQ board has a working internet connecting for the next steps, since some there is some downloading involved.**\n", + "**Ensure that your PYNQ board has a working internet connecting for the next steps, since there is some downloading involved.**\n", "\n", "To validate the accuracy, we first need to install the [`dataset-loading`](https://github.com/fbcotter/dataset_loading) Python package to the PYNQ board. 
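Coming back to the folding attributes discussed above, a small sketch of adjusting them through the `getCustomOp` wrapper (the values are placeholders; for MatrixVectorActivation layers SIMD has to divide the input dimension and PE the output dimension):

    fc0w.set_nodeattr("SIMD", 16)   # input-side parallelism, placeholder value
    fc0w.set_nodeattr("PE", 8)      # output-side parallelism, placeholder value
    print(fc0w.get_nodeattr("SIMD"), fc0w.get_nodeattr("PE"))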
This will give us a convenient way of downloading and accessing the MNIST dataset.\n", "\n", @@ -991,7 +1008,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the final top-1 accuracy is 92.96%, which is very close to the 93.17% reported on the [BNN-PYNQ accuracy table in Brevitas](https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq). " + "We see that the final top-1 accuracy is 92.96%, which is very close to the 93.17% reported on the [BNN-PYNQ accuracy table in Brevitas](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq). " ] }, { @@ -1048,13 +1065,6 @@ "source": [ "The measured values were recorded with a batch size of 10000 and at a frequency of 100 MHz. We will be improving the efficiency of the generated accelerator examples in the coming FINN releases." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index 1e07781b66a8eaa816921a5ff721756bf418a26c..813127197e07e4ddb5ec5ff39aed0278e117babc 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -36,7 +36,7 @@ "from finn.util.visualization import showSrc, showInNetron\n", "import os\n", "\n", - "build_dir = os.environ[\"FINN_ROOT\"]" + "build_dir = os.environ[\"FINN_BUILD_DIR\"]" ] }, { @@ -59,7 +59,7 @@ "from finn.util.test import get_test_model_trained\n", "\n", "fc = get_test_model_trained(\"TFC\", 1, 1)\n", - "raw_i = get_data(\"finn.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", + "raw_i = get_data(\"qonnx.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "input_tensor = onnx.load_tensor_from_string(raw_i)\n", "input_brevitas = torch.from_numpy(nph.to_array(input_tensor)).float()\n", "output_golden = fc.forward(input_brevitas).detach().numpy()\n", @@ -72,9 +72,9 @@ "source": [ "## Simulation using Python <a id='simpy'></a>\n", "\n", - "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/master/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (backend $\\neq$ \"fpgadataflow\") this model can be checked for functionality using Python.\n", + "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/master/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n", "\n", - "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution functions are defined. The following is an example of the execution function of a XNOR popcount node.\n" + "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. 
The following is an example of the execution function of a XNOR popcount node.\n" ] }, { @@ -142,7 +142,7 @@ "source": [ "## Simulation (cppsim) using C++\n", "\n", - "When dealing with HLS custom op nodes in FINN the simulation using Python is no longer sufficient. After the nodes have been converted to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in an .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding finn-hlslib function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example the model after setting the folding factors in the HLS layers is used, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." + "When dealing with HLS custom op nodes in FINN the simulation using Python is no longer sufficient. After the nodes have been converted to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding finn-hlslib function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example the model after setting the folding factors in the HLS layers is used, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." ] }, { @@ -250,7 +250,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Before the model can be executed using `execute_onnx`, we integrate the child model in the parent model. The function reads then the `exec_mode` and writes the input into the correct directory in a .npy file. To be able to read this in C++, there is an additional .hpp file ([npy2apintstream.hpp](https://github.com/Xilinx/finn/blob/master/src/finn/data/cpp/npy2apintstream.hpp)) in FINN, which uses cnpy to read .npy files and convert them into streams, or to read a stream and write it into an .npy. [cnpy](https://github.com/rogersce/cnpy) is a helper to read and write .npy and .npz formates in C++.\n", + "Before the model can be executed using `execute_onnx`, we integrate the child model in the parent model. The function reads then the `exec_mode` and writes the input into the correct directory in a .npy file. To be able to read this in C++, there is an additional .hpp file ([npy2apintstream.hpp](https://github.com/Xilinx/finn/blob/main/src/finn/qnn-data/cpp/npy2apintstream.hpp)) in FINN, which uses cnpy to read .npy files and convert them into streams, or to read a stream and write it into an .npy. [cnpy](https://github.com/rogersce/cnpy) is a helper to read and write .npy and .npz formates in C++.\n", "\n", "The result is again compared to the \"golden\" output." ] @@ -329,7 +329,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next step is to load the parent model and set the node attribute `model` in the StreamingDataflowPartition node (`sdp_node`). Afterwards the `exec_mode` is set in the parent model in each node." + "The next step is to load the parent model and set the node attribute `model` in the StreamingDataflowPartition node (`sdp_node`). Afterwards the `exec_mode` is set in the parent model in each node and the model can be executed." 
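The cppsim route described a little earlier is prepared in the same transformation-driven way; a hedged sketch, applied to the child dataflow model (class names from `finn.transformation.fpgadataflow`; the variable name `child_model` is illustrative):

    from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
    from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
    from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode

    child_model = child_model.transform(PrepareCppSim())        # generate C++ for each HLS node
    child_model = child_model.transform(CompileCppSim())        # compile the generated sources
    child_model = child_model.transform(SetExecMode("cppsim"))  # route execution through cppsim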
] }, { @@ -347,13 +347,6 @@ "model_for_rtlsim = model_for_rtlsim.transform(SetExecMode(\"rtlsim\"))" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Because the necessary files for the emulation are already generated in Jupyter notebook [tfc_end2end_example](tfc_end2end_example.ipynb), in the next step the execution of the model can be done directly." - ] - }, { "cell_type": "code", "execution_count": null, @@ -420,8 +413,15 @@ "outputs": [], "source": [ "output_dict = oxe.execute_onnx(model_for_rtlsim, input_dict)\n", - "output_rtlsim = output_dict[list(output_dict.keys())[0]]\n", - "\n", + "output_rtlsim = output_dict[list(output_dict.keys())[0]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "if np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", " print(\"Results are the same!\")\n", "else:\n", diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index 68b345ed348f7a3f6fff507e1a4e45f6942a6a60..5625a6f1c20ee5e4a66df28931a6a891f699a738 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -47,6 +47,7 @@ "-------------\n", "\n", "* [Load the UNSW_NB15 Dataset](#load_dataset) \n", + "* [Define a PyTorch Device](#define_pytorch_device)\n", "* [Define the Quantized MLP Model](#define_quantized_mlp)\n", "* [Define Train and Test Methods](#train_test)\n", " * [(Option 1) Train the Model from Scratch](#train_scratch)\n", @@ -76,7 +77,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Load the UNSW_NB15 Dataset <a id='load_dataset'></a>\n", + "# Load the UNSW_NB15 Dataset <a id='load_dataset'></a>\n", "\n", "### Dataset Quantization <a id='dataset_qnt'></a>\n", "\n", @@ -740,7 +741,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -754,7 +755,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index 3e116b1adbcfddcd3cf61d8ad11130988fc4e2d4..370312c77e90c67a3095e0800ad0c6046bfd75f4 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -377,13 +377,6 @@ "source": [ "This concludes our second notebook. In the next one, we'll take the ONNX model we just verified all the way down to FPGA hardware with the FINN compiler." 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 980a770fe2b47aebd9da2fe2fdb8943b542c07b2..33adb68dc8ddfff1b427d82e4666a70e883bf2c8 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -620,13 +620,6 @@ "\n", "Finally, we can see that `throughput[images/s]`, which is the pure hardware throughput without any software and data movement overheads, is close to 1M inferences per second." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/requirements.txt b/requirements.txt index 3bab23fb7d6c6cc80155b9f4b42c5db48ab0723e..970acc342bb7984e69929d1ef5eaa027b765ced0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ bitstring==3.1.7 clize==4.1.1 -dataclasses-json==0.5.2 +dataclasses-json==0.5.7 docrep==0.2.7 future==0.18.2 gspread==3.6.0 -numpy==1.18.0 +numpy==1.22.0 onnx==1.11.0 onnxoptimizer onnxruntime==1.11.1 @@ -13,6 +13,7 @@ protobuf==3.20.1 pyscaffold==3.2.1 scipy==1.5.2 setupext-janitor>=1.1.2 +sigtools==2.0.3 toposort==1.5 vcdvcd==1.0.5 wget==3.2 diff --git a/run-docker.sh b/run-docker.sh index ff4161ce06d8d922fd153ad37cddcdcad50effcc..381be35293dddbabe077be2aeae609f8c5621842 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -86,9 +86,9 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${ALVEO_BOARD="U250"} : ${ALVEO_TARGET_DIR="/tmp"} : ${PLATFORM_REPO_PATHS="/opt/xilinx/platforms"} -: ${XRT_DEB_VERSION="xrt_202010.2.7.766_18.04-amd64-xrt"} +: ${XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt"} : ${FINN_HOST_BUILD_DIR="/tmp/$DOCKER_INST_NAME"} -: ${FINN_DOCKER_TAG="xilinx/finn:$(git describe --tags --dirty).$XRT_DEB_VERSION"} +: ${FINN_DOCKER_TAG="xilinx/finn:$(git describe --always --tags --dirty).$XRT_DEB_VERSION"} : ${FINN_DOCKER_PREBUILT="0"} : ${FINN_DOCKER_RUN_AS_ROOT="0"} : ${FINN_DOCKER_GPU="$(docker info | grep nvidia | wc -m)"} diff --git a/setup.cfg b/setup.cfg index 94d2cb2b8dd5b9a43931c165ef998b4af2f50192..a1d0fef6cb08994ae8666fd2ea37166bf1cd3752 100644 --- a/setup.cfg +++ b/setup.cfg @@ -76,7 +76,7 @@ exclude = docs = finn-base==0.0.3 docutils==0.17.1 - dataclasses-json==0.5.2 + dataclasses-json==0.5.7 gspread==3.6.0 pytest netron diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 238083f653d410772a81115ff12dd987835d1f32..d6864994a70a0ea4c24567155ff7c0599bc0fb6f 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -155,12 +155,14 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): % (step_name, step_num, len(build_dataflow_steps)) ) # redirect output to logfile - sys.stdout = stdout_logger - sys.stderr = stderr_logger - print( - "Running step: %s [%d/%d]" - % (step_name, step_num, len(build_dataflow_steps)) - ) + if not cfg.verbose: + sys.stdout = stdout_logger + sys.stderr = stderr_logger + # also log current step name to logfile + print( + "Running step: %s [%d/%d]" + % (step_name, step_num, len(build_dataflow_steps)) + ) # run the step step_start = time.time() model = transform_step(model, cfg) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 
eec55e502207a3dccb6ac6def06dd0edebf78c22..e16711f63b954707bc7ad9050dd7627ca1ce99c1 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -211,7 +211,7 @@ class DataflowBuildConfig: #: Insert a signature node to the stitched-IP to read/write information #: to the design: e.g. Customer signature, application signature, version - signature: Optional[tuple] = () + signature: Optional[List[int]] = None #: (Optional) Control the maximum width of the per-PE MVAU stream while #: exploring the parallelization attributes to reach target_fps @@ -289,6 +289,10 @@ class DataflowBuildConfig: #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = True + #: When True, all warnings and compiler output will be printed in stdout. + #: Otherwise, these will be suppressed and only appear in the build log. + verbose: Optional[bool] = False + #: If given, only run the steps in the list. If not, run default steps. #: See `default_build_dataflow_steps` for the default list of steps. #: When specified: diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py index 07eda6aa1d82df0a9f9a01d4f17f7880a8cf8b26..3533fd13399a4ba4392d66af785979afc32cab29 100644 --- a/src/finn/core/throughput_test.py +++ b/src/finn/core/throughput_test.py @@ -157,8 +157,8 @@ def throughput_test_rtlsim(model, batchsize=100): res["cycles"] = cycles res["runtime[ms]"] = runtime_s * 1000 res["throughput[images/s]"] = batchsize / runtime_s - res["DRAM_in_bandwidth[Mb/s]"] = i_bytes * 0.000001 / runtime_s - res["DRAM_out_bandwidth[Mb/s]"] = o_bytes * 0.000001 / runtime_s + res["DRAM_in_bandwidth[MB/s]"] = i_bytes * 0.000001 / runtime_s + res["DRAM_out_bandwidth[MB/s]"] = o_bytes * 0.000001 / runtime_s res["fclk[mhz]"] = fclk_mhz res["N"] = batchsize diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 9978ab0c7138aa6846a1427cd346c5257e4f8728..b202e95a28a26de3dabc098c2030bafcf840d164 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -397,18 +397,20 @@ class HLSCustomOp(CustomOp): builder.build(code_gen_dir) self.set_nodeattr("executable_path", builder.executable_path) - def dynamic_input_to_npy(self, context, count): + def dynamic_input_to_npy(self, context, count, target_dir=""): """Saves input (given context) into .npy files. Count indicates the number of inputs that have to be saved.""" node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - if code_gen_dir == "": - raise Exception( + if target_dir == "": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + if code_gen_dir == "": + raise Exception( + """ + Found no codegen dir for this node, did you run the prepare_cppsim transformation? """ -Found no codegen dir for this node, did you run the prepare_cppsim transformation? 
- """ - ) + ) + target_dir = code_gen_dir # create a npy file for each input of the node (in_ind is input index) # assuming dynamic inputs start from 0 for in_ind in range(count): @@ -427,7 +429,7 @@ Found no codegen dir for this node, did you run the prepare_cppsim transformatio # make copy before saving the array reshaped_input = reshaped_input.copy() np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + os.path.join(target_dir, "input_{}.npy".format(in_ind)), reshaped_input, ) diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py index d9ffea4d9cd8895fdf55a497e8c7d0e49808ac95..882b40a0aaf542e6dcaf427ca3567ae78394ede5 100755 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py @@ -138,14 +138,22 @@ class StreamingMaxPool_Batch(HLSCustomOp): def get_exp_cycles(self): # derived from StreamingMaxPool_Batch loop nest ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - _, _, ofm_dim_w, nf, _ = self.get_folded_output_shape() + warnings.warn( + """Estimated latency for layer {} can be lower than + actual latency!""".format( + self.onnx_node.name + ) + ) if self.is_1d(): - exp_cycles = ofm_dim_w * nf * (k[1] + 1) + _, _, _, nf, _ = self.get_folded_output_shape() + ceil_mode = self.get_nodeattr("CeilMode") + ofm_dim = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) + exp_cycles = ofm_dim * nf * (k[1] + 1) return int(exp_cycles) else: # TODO: adjust inaccurate formula - return int(ifm_dim[1] * (ifm_dim[1] + (ifm_dim[1] / k[1]))) + return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) def get_instream_width(self): dt_bits = self.get_input_datatype().bitwidth() diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index b62e4f2f6784e8964232efcc9971f0b8bc35ac5d..eb51fe39fc6e7ec84204f9d541a0e47c333bbf43 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -27,7 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import numpy as np -import os import warnings from qonnx.core.datatype import DataType @@ -57,6 +56,8 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): "inputDataType": ("s", True, ""), # Batch size "numInputVectors": ("i", False, 1), + # Dimensionality mode: 0 = 2D square, 1 = 1D in H dim + "DimMode": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -64,21 +65,34 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): def get_exp_cycles(self): OFMDim = self.get_nodeattr("OFMDim") batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = OFMDim * OFMDim * batch_size + is_2d = self.get_nodeattr("DimMode") == 0 + reps = 1 + if is_2d: + OFMDim = OFMDim * OFMDim + reps = batch_size + exp_cycles = OFMDim * reps return int(exp_cycles) def get_normal_input_shape(self): IFMDim = self.get_nodeattr("IFMDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") - ishape = (batch, IFMDim, IFMDim, num_ch) + is_2d = self.get_nodeattr("DimMode") == 0 + if is_2d: + ishape = (batch, IFMDim, IFMDim, num_ch) + else: + ishape = (batch, IFMDim, 1, num_ch) return ishape def get_normal_output_shape(self): OFMDim = self.get_nodeattr("OFMDim") num_ch = self.get_nodeattr("NumChannels") batch = self.get_nodeattr("numInputVectors") - oshape = (batch, OFMDim, OFMDim, num_ch) + is_2d = self.get_nodeattr("DimMode") == 0 + if is_2d: + oshape = (batch, OFMDim, OFMDim, num_ch) + else: + oshape = (batch, OFMDim, 1, num_ch) return oshape def get_folded_input_shape(self): @@ -187,10 +201,19 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): ) def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_Batch<OFMDim, IFMDim, IFMChannels, - ap_uint<Input_precision> > (in0, out, numReps);""" - ] + is_2d = self.get_nodeattr("DimMode") == 0 + batch = self.get_nodeattr("numInputVectors") + if is_2d: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_Batch<OFMDim, IFMDim, IFMChannels, + ap_uint<Input_precision> > (in0, out, numReps);""" + ] + else: + assert batch == 1, "1D upsampler currently needs numReps=1" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_1D<OFMDim, IFMDim, IFMChannels, + ap_uint<Input_precision> > (in0, out);""" + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -246,7 +269,6 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): node = self.onnx_node exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() folded_oshape = self.get_folded_output_shape() if mode == "cppsim": @@ -268,9 +290,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp): ), """Input shape doesn't match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) if mode == "cppsim": # execute the precompiled model diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index 497477da9d4cff736dc32eb27532e658890d5cc7..2096760580b4f33ba1ab09564ebba1601c4dc23c 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -439,13 +439,13 @@ class FINNExampleOverlay(Overlay): total_in = 0 for i in range(self.num_inputs): total_in += 
np.prod(self.ishape_packed(i)) - res["DRAM_in_bandwidth[Mb/s]"] = total_in * 0.000001 / runtime + res["DRAM_in_bandwidth[MB/s]"] = total_in * 0.000001 / runtime total_out = 0 for o in range(self.num_outputs): total_out += np.prod(self.oshape_packed(o)) - res["DRAM_out_bandwidth[Mb/s]"] = total_out * 0.000001 / runtime + res["DRAM_out_bandwidth[MB/s]"] = total_out * 0.000001 / runtime for iwdma, iwbuf, iwdma_name in self.external_weights: - res["DRAM_extw_%s_bandwidth[Mb/s]" % iwdma_name] = ( + res["DRAM_extw_%s_bandwidth[MB/s]" % iwdma_name] = ( self.batch_size * np.prod(iwbuf.shape) * 0.000001 / runtime ) if self.platform == "zynq-iodma": diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index b8b7233c8073e23bb00779ba82e1123f6aadaa74..8306024eaaf39b0be017d3d2ce1d76627c3e98ac 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -312,20 +312,25 @@ class InferUpsample(Transformation): ) # Assumes nhwc layout for scales and input - assert scales[1] == scales[2], ( - "%s: Upsampling is only supported for quadratic scales." % n.name + is_scale_square_2d = scales[1] == scales[2] + is_scale_1d = scales[1] > 1 and scales[2] == 1 + assert is_scale_square_2d or is_scale_1d, ( + "%s: Upsampling only supported for 1D H, or 2D square scaling" + % n.name ) assert scales[0] == scales[3] == 1, ( n.name + ": Upsampling is only supported for scales with " - "the first and last dimensions being 1." + "the first and last dimensions being 1 in NHWC." ) spatial_scale = scales[1] assert spatial_scale == int(spatial_scale), ( "%s: Upsampling is only supported for integer scales." % n.name ) + is_shape_square_2d = in_shape[1] == in_shape[2] + is_shape_1d = in_shape[1] > 1 and in_shape[2] == 1 - assert in_shape[1] == in_shape[2], ( - "%s: Upsampling is only supported for quadratic input shapes." + assert is_shape_square_2d or is_shape_1d, ( + "%s: Upsampling is only supported for 1D H or 2D square inputs." 
% n.name ) @@ -335,6 +340,7 @@ class InferUpsample(Transformation): NumChannels = in_shape[-1] numInputVectors = in_shape[0] inputDataType = dt.name + dim_mode = 0 if is_shape_square_2d else 1 # Insert the HLSCustomOp node Upsample_HLS_node = helper.make_node( @@ -348,6 +354,7 @@ class InferUpsample(Transformation): NumChannels=NumChannels, inputDataType=inputDataType, numInputVectors=numInputVectors, + DimMode=dim_mode, name="UpsampleNearestNeighbour_Batch_" + n.name, ) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 35ac736aabe325c037c47371ad71e5008770bdbb..892ab09fdf41947f86e2bf122e057e94585dfa8c 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -536,7 +536,7 @@ class CreateStitchedIP(Transformation): tcl.append( "set all_v_files [get_files -filter {USED_IN_SYNTHESIS == 1 " + "&& (FILE_TYPE == Verilog || FILE_TYPE == SystemVerilog " - + "|| FILE_TYPE ==\"Verilog Header\")}]" + + '|| FILE_TYPE =="Verilog Header")}]' ) v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir tcl.append("set fp [open %s w]" % v_file_list) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 863523605580ef77559b65a1abd72802daff187d..dce98e54a3d62d72b83ebed21aa0604f0f6fa8ce 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -118,12 +118,21 @@ class MakePYNQDriver(Transformation): files_to_copy.append( (qonnx_path + "/util/basic.py", qonnx_target_path + "/util/basic.py") ) + files_to_copy.append( + (qonnx_path + "/util/__init__.py", qonnx_target_path + "/util/__init__.py") + ) files_to_copy.append( ( finn_util_path + "/data_packing.py", finn_target_path + "/util/data_packing.py", ) ) + files_to_copy.append( + ( + finn_util_path + "/__init__.py", + finn_target_path + "/util/__init__.py", + ) + ) for (src_file, target_file) in files_to_copy: shutil.copy(src_file, target_file) # extract input-output shapes from the graph diff --git a/src/finn/transformation/qonnx/fold_quant_weights.py b/src/finn/transformation/qonnx/fold_quant_weights.py index 80b6042d03ea11a45493011288133ed3a6f57c8d..e8339ae24472fa238e5c5da176b1316611218a54 100644 --- a/src/finn/transformation/qonnx/fold_quant_weights.py +++ b/src/finn/transformation/qonnx/fold_quant_weights.py @@ -126,10 +126,20 @@ class FoldQuantWeights(Transformation): model.set_tensor_datatype(node_out, new_dtype) # Reshape scale for Conv if required + target_output_shape = model.get_tensor_shape( + target_node.output[0] + ) if target_node.op_type == "Conv" and len(scale.shape) > 0: - bias_shape = [1] * len(scale.shape) - bias_shape[1] = -1 - scale = scale.reshape(bias_shape) + conv_out_shape = [1] * len(target_output_shape) + # only support per-output channel scaling + # (i.e. 
all scale shape elems besides 0th must be 1s) + if len(scale.shape) > 1: + assert ( + np.prod(scale.shape[1:]) == 1 + ), "Can't fold scale beyond per-out-channel granularity" + # collect all scaling in channels dim (since we constrain) + conv_out_shape[1] = -1 + scale = scale.reshape(conv_out_shape) if scale.shape == (1,): scale = scale[0] diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py index c52d69b0f09d306c5b076bb6ef1775f38977241a..77025ecdf57d5a422992d4163d05c740454986bb 100644 --- a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py +++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py @@ -110,11 +110,6 @@ class ConvertQuantActToMultiThreshold(Transformation): predecessor_op_type = predecessor[0].op_type else: predecessor_op_type = predecessor - if model.is_fork_node(n): - raise ValueError( - "Forking Quant/BipolarQuant nodes are currently " - "not supported by FINN." - ) if n.op_type == "Quant" and not model.get_initializer(n.input[2]) == 0: raise ValueError( "Only Quant nodes with zero-point == 0 are currently supported." diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index 0299c4f4d89d1fdd94434db77c77a0e529c86d26..a983e67750a0a860eeeb4b429f7d6b181fc84fe3 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -473,7 +473,7 @@ class AbsorbConsecutiveTransposes(Transformation): """Remove (Transpose -> Transpose) patterns when the input and output of the pattern have the same layout.""" - def Are_opposite_permutations(self, perms1, perms2): + def are_opposite_permutations(self, perms1, perms2): if len(perms1) != len(perms2): return False assert 0 <= max(perms2) < len(perms2), "invalid permutation" @@ -488,72 +488,40 @@ class AbsorbConsecutiveTransposes(Transformation): def apply(self, model): graph = model.graph graph_modified = False - for n in graph.node: - if n.op_type == "Transpose": - if model.is_fork_node(n): - next_nodes = model.find_direct_successors(n) - perms1 = list(get_by_name(n.attribute, "perm").ints) - - # check if all nodes after fork are opposite transposes - all_opposite_transposes = True - for next_node in next_nodes: - if next_node is not None and next_node.op_type == "Transpose": - perms2 = list(get_by_name(next_node.attribute, "perm").ints) - if not self.Are_opposite_permutations(perms1, perms2): - all_opposite_transposes = False - break - else: - all_opposite_transposes = False - break - - if not all_opposite_transposes: - continue - - prod = model.find_producer(n.input[0]) - for next_node in next_nodes: - # connect next_node's consumer input to n's producer output - # TODO implement this to allow for forks as producers and - # joins as consumers - cons = model.find_consumer(next_node.output[0]) - cons.input[0] = prod.output[0] - - # remove consumer transpose - graph.node.remove(next_node) - - # remove producer transpose - graph.node.remove(n) - graph_modified = True - - else: - next_node = model.find_consumer(n.output[0]) + for node in graph.node: + if node.op_type == "Transpose": + next_nodes = model.find_consumers(node.output[0]) + perms1 = list(get_by_name(node.attribute, "perm").ints) + # check if all nodes after fork are opposite transposes + all_opposite_transposes = True + for next_node in next_nodes: if next_node is not None and next_node.op_type == "Transpose": - perms1 = list(get_by_name(n.attribute, "perm").ints) perms2 = 
list(get_by_name(next_node.attribute, "perm").ints) - if self.Are_opposite_permutations(perms1, perms2): - - # connect next_node's consumer input to n's producer output - # TODO implement this to allow for forks as producers - consumers = model.find_direct_successors(next_node) - prod = model.find_producer(n.input[0]) - if prod is not None: - for cons in consumers: - for cons_in in cons.input: - if cons_in == next_node.output[0]: - prod.output[0] = cons_in - break - else: - # n.input[0] is top-level graph input - # wire consumers directly to that - for cons in consumers: - for i, iname in enumerate(cons.input): - if iname == next_node.output[0]: - cons.input[i] = n.input[0] - - # remove both transposes - graph.node.remove(n) - graph.node.remove(next_node) + if not self.are_opposite_permutations(perms1, perms2): + all_opposite_transposes = False + break + else: + all_opposite_transposes = False + break + if not all_opposite_transposes: + continue + source_tensor = node.input[0] + for next_node in next_nodes: + # connect next_node's consumers' appropriate input to n's input + # TODO how to handle top-level outputs if any? + nextnode_out = next_node.output[0] + assert nextnode_out not in [x.name for x in model.graph.output] + consumers = model.find_consumers(nextnode_out) + for cons in consumers: + for i, iname in enumerate(cons.input): + if iname == nextnode_out: + cons.input[i] = source_tensor + # remove consumer transpose + graph.node.remove(next_node) + # remove producer transpose + graph.node.remove(node) + graph_modified = True - graph_modified = True if graph_modified: model = model.transform(InferDataTypes()) return (model, graph_modified) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 9ff8a2173ce81e2a19c56bbd20a326759c3b9df2..3e815c1537353cc2be970a2068d4ded30cc48bc8 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -553,6 +553,8 @@ class MoveLinearPastEltwiseAdd(Transformation): # Other transform should handle that if prod0 is None or prod1 is None or (prod0 == prod1): continue + if len(prod0.input) < 2 or len(prod1.input) < 2: + continue init0 = model.get_initializer(prod0.input[1]) init1 = model.get_initializer(prod1.input[1]) # if either initializer is None, skip @@ -728,9 +730,10 @@ class MoveOpPastFork(Transformation): can be merged with nodes in the branches """ - def __init__(self, op_name_list): + def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}): super().__init__() self.ops_to_move = op_name_list + self.get_attrs_fxn = get_attrs_fxn def apply(self, model): graph = model.graph @@ -747,9 +750,10 @@ class MoveOpPastFork(Transformation): # Restrict this transform to operations with constant parameters # Assuming parameters is in input 1 - op_init_param = model.get_initializer(n.input[1]) - if op_init_param is None: - continue + if len(n.input) > 1: + op_init_param = model.get_initializer(n.input[1]) + else: + op_init_param = None # Check case when branches are empty and go # to the same node @@ -766,16 +770,20 @@ class MoveOpPastFork(Transformation): for consumer_node in consumers[1:]: # create new node - new_param_name = model.make_new_valueinfo_name() new_output_tensor_name = model.make_new_valueinfo_name() + if op_init_param is None: + new_inp_list = [n.input[0]] + else: + new_param_name = model.make_new_valueinfo_name() + new_inp_list = [n.input[0], new_param_name] + model.set_initializer(new_param_name, op_init_param) + attrs = 
self.get_attrs_fxn(n) + # TODO use copy of original node instead to get attrs? new_node = oh.make_node( - n.op_type, - [n.input[0], new_param_name], - [new_output_tensor_name], + n.op_type, new_inp_list, [new_output_tensor_name], **attrs ) graph.node.insert(node_ind, new_node) node_ind += 1 - model.set_initializer(new_param_name, op_init_param) # change consumer input tensor graph.node.remove(consumer_node) @@ -811,6 +819,13 @@ class MoveLinearPastFork(MoveOpPastFork): super().__init__(["Add", "Mul"]) +class MoveTransposePastFork(MoveOpPastFork): + def __init__(self): + super().__init__( + ["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints} + ) + + class MoveMaxPoolPastMultiThreshold(Transformation): """Move MaxPool nodes past MultiThreshold nodes on linear segments of the graph.""" diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index c90985ebc9932c56c840e34464b838f3141c79a8..4aba87216c8999612f748e989a945ceff33da167 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -61,7 +61,7 @@ alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e" alveo_default_platform = dict() alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_201920_3" alveo_default_platform["U200"] = "xilinx_u200_xdma_201830_2" -alveo_default_platform["U250"] = "xilinx_u250_xdma_201830_2" +alveo_default_platform["U250"] = "xilinx_u250_gen3x16_xdma_2_1_202010_1" alveo_default_platform["U280"] = "xilinx_u280_xdma_201920_3" diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py index ee7df3ed5b3f34777bffec48392cabef024c58a8..f6a51da8e44ea60ae5693cdd033b39bdf51376ac 100644 --- a/src/finn/util/pyverilator.py +++ b/src/finn/util/pyverilator.py @@ -74,8 +74,9 @@ def pyverilate_stitched_ip( # are identical but in multiple directories (regslice_core.v) # remove duplicates from list by doing list -> set -> list - all_verilog_files = list(set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), - all_verilog_srcs))) + all_verilog_files = list( + set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), all_verilog_srcs)) + ) # remove all but one instances of regslice_core.v filtered_verilog_files = [] diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py index b0c3d6088c27291f1f49dd2f1ee746b65ca0a737..3dc46ec31e49d7115b19b3373d54be6ddc29bb80 100644 --- a/tests/brevitas/test_brevitas_relu_act_export.py +++ b/tests/brevitas/test_brevitas_relu_act_export.py @@ -41,6 +41,7 @@ from brevitas.nn import QuantReLU from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.cleanup import cleanup as qonnx_cleanup +from torch import nn import finn.core.onnx_exec as oxe from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN @@ -179,3 +180,83 @@ scaling_impl.learned_value": rand_tensor.type( assert np.isclose(produced, expected, atol=1e-3).all() os.remove(export_onnx_path) + + +class PyTorchTestModel(nn.Module): + def __init__(self, abits): + super(PyTorchTestModel, self).__init__() + out_channels = 32 + self.b_act = QuantReLU( + bit_width=abits, + quant_type=QuantType.INT, + scaling_impl_type=ScalingImplType.PARAMETER, + scaling_per_channel=True, + restrict_scaling_type=RestrictValueType.LOG_FP, + scaling_min_val=2e-16, + max_val=6.0, + return_quant_tensor=False, + per_channel_broadcastable_shape=(1, out_channels, 1, 1), + ) + + def forward(self, x): + act_out = self.b_act(x) + y0 = act_out * 2.0 + y1 = act_out * -1.0 + y = y0 + y1 + return 
y + + +@pytest.mark.brevitas_export +@pytest.mark.parametrize("abits", [2, 4, 8]) +@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)]) +@pytest.mark.parametrize("scaling_per_channel", [True]) +@pytest.mark.parametrize("QONNX_export", [True]) +def test_brevitas_act_export_relu_forking( + abits, max_val, scaling_per_channel, QONNX_export +): + out_channels = 32 + ishape = (1, out_channels, 1, 1) + min_val = -1.0 + model_pyt = PyTorchTestModel(abits) + + rand_tensor = (2) * torch.rand((1, out_channels, 1, 1)) + + checkpoint = { + "b_act.act_quant_proxy.fused_activation_quant_proxy." + "tensor_quant.scaling_impl.learned_value": rand_tensor.type(torch.FloatTensor) + } + model_pyt.load_state_dict(checkpoint) + + if QONNX_export: + m_path = export_onnx_path + BrevitasONNXManager.export(model_pyt, ishape, m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) + model.save(m_path) + + model = ModelWrapper(export_onnx_path) + model = model.transform(InferShapes()) + inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( + np.float32 + ) + idict = {model.graph.input[0].name: inp_tensor} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + inp_tensor = torch.from_numpy(inp_tensor).float() + model_pyt.eval() + expected = model_pyt.forward(inp_tensor).detach().numpy() + if not np.isclose(produced, expected, atol=1e-3).all(): + print(abits, max_val) + print("scale: ", model_pyt.quant_act_scale().type(torch.FloatTensor).detach()) + if abits < 5: + print( + "thres:", + ", ".join(["{:8.4f}".format(x) for x in model_pyt.export_thres[0]]), + ) + print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]])) + print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]])) + print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]])) + + assert np.isclose(produced, expected, atol=1e-3).all() + os.remove(export_onnx_path) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index ab82a00c234b48ced48f3987d929bb1f340083f5..103f18b514c23c4e1ad35a85d020dc0481aa9c47 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -673,9 +673,6 @@ class TestEnd2End: @pytest.mark.vitis @pytest.mark.parametrize("kind", ["zynq", "alveo"]) def test_build(self, topology, wbits, abits, QONNX_export, kind): - # temporarily adding skip for alveo builds - if kind == "alveo": - pytest.skip("Alveo tests temporarily excluded") if kind == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") prev_chkpt_name = get_checkpoint_name( @@ -698,9 +695,6 @@ class TestEnd2End: @pytest.mark.vitis @pytest.mark.parametrize("kind", ["zynq", "alveo"]) def test_make_pynq_driver(self, topology, wbits, abits, QONNX_export, kind): - # temporarily adding skip for alveo builds - if kind == "alveo": - pytest.skip("Alveo tests temporarily excluded") if kind == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") prev_chkpt_name = get_checkpoint_name( @@ -715,9 +709,6 @@ class TestEnd2End: @pytest.mark.parametrize("kind", ["zynq", "alveo"]) def test_deploy(self, topology, wbits, abits, QONNX_export, kind): - # temporarily adding skip for alveo builds - if kind == "alveo": - pytest.skip("Alveo tests temporarily excluded") prev_chkpt_name = get_checkpoint_name( topology, wbits, abits, QONNX_export, "driver_" + kind ) @@ -741,9 +732,6 @@ class 
TestEnd2End: @pytest.mark.parametrize("kind", ["zynq", "alveo"]) def test_run_on_hw(self, topology, wbits, abits, QONNX_export, kind): - # temporarily adding skip for alveo builds - if kind == "alveo": - pytest.skip("Alveo tests temporarily excluded") prev_chkpt_name = get_checkpoint_name( topology, wbits, abits, QONNX_export, "deploy_" + kind ) @@ -768,9 +756,6 @@ class TestEnd2End: @pytest.mark.parametrize("kind", ["zynq", "alveo"]) def test_throughput_hw(self, topology, wbits, abits, QONNX_export, kind): - # temporarily adding skip for alveo builds - if kind == "alveo": - pytest.skip("Alveo tests temporarily excluded") prev_chkpt_name = get_checkpoint_name( topology, wbits, abits, QONNX_export, "deploy_" + kind ) @@ -803,7 +788,7 @@ class TestEnd2End: ret_str += "\n" + "Raw data:" ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( - "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]" + "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[MB/s]", "DRAM wr[MB/s]" ) for k in bsize_range: v = ret[k] @@ -812,8 +797,8 @@ class TestEnd2End: np.round(v["runtime[ms]"], 4), v["fclk[mhz]"], np.round(v["throughput[images/s]"], 2), - np.round(v["DRAM_in_bandwidth[Mb/s]"], 2), - np.round(v["DRAM_out_bandwidth[Mb/s]"], 2), + np.round(v["DRAM_in_bandwidth[MB/s]"], 2), + np.round(v["DRAM_out_bandwidth[MB/s]"], 2), ) ret_str += "\n" + "-----------------------------" warnings.warn(ret_str) diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 5e79ea2dad2aa4200f998fd8953672b9f49b2b86..495fcd10b6a977c6b0917ac37b58ec5595185c25 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -133,6 +133,7 @@ def create_two_fc_model(): return model +@pytest.mark.vivado @pytest.mark.fpgadataflow def test_fpgadataflow_checksum(): # use a graph consisting of two fc layers to test diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index dddc470ec2ed88faf078f19bd0d2a7a4a6b5b6cd..8488a34dff52d39c28fbea25275c9a4b59c37f80 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -144,6 +144,5 @@ def test_fpgadataflow_concat_stitchedip(): ) model.set_metadata_prop("exec_mode", "rtlsim") model.set_metadata_prop("rtlsim_trace", "trace.vcd") - model.save("dbg.onnx") ret_sim = execute_onnx(model, inp_dict) assert (exp_out == ret_sim[oname]).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index a3809e61304ef031407e7fbec0f9037382d999ad..80f2d724ad7ccbf563c23076155313bad1ecb336 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -335,8 +335,6 @@ def test_fpgadataflow_ipstitch_iodma_floorplan(): @pytest.mark.slow @pytest.mark.vivado @pytest.mark.vitis -# temporarily marked as xfail -@pytest.mark.xfail def test_fpgadataflow_ipstitch_vitis_end2end(board, period_ns, extw): if "VITIS_PATH" not in os.environ: pytest.skip("VITIS_PATH not set") @@ -348,6 +346,8 @@ def test_fpgadataflow_ipstitch_vitis_end2end(board, period_ns, extw): assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" assert os.path.isfile(sdp_node.get_nodeattr("model")) model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(fpga_part, period_ns)) model 
= model.transform(VitisBuild(fpga_part, period_ns, platform)) model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_vitis.onnx") assert model.get_metadata_prop("platform") == "alveo" diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py similarity index 84% rename from tests/fpgadataflow/test_layer_streaming_maxpool_batch.py rename to tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py index 55c90644dfbb23fbc2da10cf969461abe6d38bf3..a3968cf79704092ffb5ec53c887842372b625f4d 100644 --- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py +++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py @@ -32,6 +32,7 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor @@ -82,46 +83,6 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_ return model -def make_single_streamingmaxpool_modelwrapper( - k, ifm_ch, pe, ifm_dim, ofm_dim, idt, ceil_mode -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - ofm_dim_h, ofm_dim_w = ofm_dim - odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] - ) - - smp_node = helper.make_node( - "StreamingMaxPool_Batch", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - PoolDim=[k_h, k_w], - NumChannels=ifm_ch, - PE=pe, - ImgDim=[ifm_dim_h, ifm_dim_w], - CeilMode=ceil_mode, - dataType=idt.name, - ) - graph = helper.make_graph( - nodes=[smp_node], name="smp_graph", inputs=[inp], outputs=[outp] - ) - - model = helper.make_model(graph, producer_name="smp-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - def prepare_inputs(input_tensor): return {"inp": input_tensor} @@ -187,6 +148,10 @@ def test_fpgadataflow_streamingmaxpool( assert model.graph.node[0].op_type == "StreamingMaxPool_Batch" + # Ensure PE value is set + streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] + getCustomOp(streamingmaxpool_node).set_nodeattr("PE", pe) + if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) @@ -198,7 +163,7 @@ def test_fpgadataflow_streamingmaxpool( model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) else: - raise Exception("Unknown exec_mode in test_layer_streaming_maxpool_batch") + raise Exception("Unknown exec_mode in test_fpgadataflow_streamingmaxpool") # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] @@ -211,6 +176,7 @@ def test_fpgadataflow_streamingmaxpool( exp_cycles_dict = model.analysis(exp_cycles_per_layer) exp_cycles = exp_cycles_dict[node.name] # FIXME: maxpool cycles prediction needs a fix - # mostl likely due to some loops not flattening + # most likely due to inaccurate cycle prediction of + # nested for-loops # assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py 
b/tests/fpgadataflow/test_fpgadataflow_upsampler.py index d1ef0b890a66524b7cbd055a413561961ebcb4a7..a08d31f7b05184a4d5c84ef927a05fe1fd6e43c3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py @@ -30,6 +30,7 @@ import pytest import numpy as np import os +import shutil import torch from brevitas.export import FINNManager from qonnx.core.datatype import DataType @@ -51,6 +52,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.util.basic import make_build_dir tmpdir = os.environ["FINN_BUILD_DIR"] @@ -117,7 +119,7 @@ class PyTorchTestModel(nn.Module): # param datatype @pytest.mark.parametrize("dt", [DataType["INT8"]]) -# Width/height of square input feature map +# spatial dim input feature map @pytest.mark.parametrize("IFMDim", [3, 5]) # upscaling factor @pytest.mark.parametrize("scale", [2, 3]) @@ -125,14 +127,22 @@ class PyTorchTestModel(nn.Module): @pytest.mark.parametrize("NumChannels", [4]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# whether to use 1D or 2D square testcases +@pytest.mark.parametrize("is_1d", [False, True]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode): +def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d): + tmpdir = make_build_dir("upsample_export_") atol = 1e-3 + if is_1d: + input_shape = (1, NumChannels, IFMDim, 1) + upscale_factor = (scale, 1) + else: + input_shape = (1, NumChannels, IFMDim, IFMDim) + upscale_factor = (scale, scale) # Create the test model and inputs for it - torch_model = PyTorchTestModel(upscale_factor=scale) - input_shape = (1, NumChannels, IFMDim, IFMDim) + torch_model = PyTorchTestModel(upscale_factor=upscale_factor) test_in = torch.arange(0, np.prod(np.asarray(input_shape))) # Limit the input to values valid for the given datatype test_in %= dt.max() - dt.min() + 1 @@ -200,3 +210,4 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode): assert output_matches, "Cppsim output doesn't match ONNX/PyTorch." elif exec_mode == "rtlsim": assert output_matches, "Rtlsim output doesn't match ONNX/PyTorch." 
+    shutil.rmtree(tmpdir, ignore_errors=True)
diff --git a/tests/transformation/streamline/test_absorb_opposite_transposes.py b/tests/transformation/streamline/test_absorb_opposite_transposes.py
index 51ea5edfc420bf935de3e196df1b150934782a91..6d8d2b9f0cd4ad28c3ea0922d69b9b963a0deb08 100644
--- a/tests/transformation/streamline/test_absorb_opposite_transposes.py
+++ b/tests/transformation/streamline/test_absorb_opposite_transposes.py
@@ -29,8 +29,7 @@ import pytest
 
 import numpy as np
-import onnx.helper as oh
-from onnx import TensorProto
+import onnx.parser as oprs
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
 
@@ -41,39 +40,42 @@ from finn.transformation.streamline.absorb import AbsorbConsecutiveTransposes
 @pytest.mark.streamline
 def test_absorb_opposite_transposes():
     np.random.seed(0)
-    input_shape = [1, 3, 4, 2]
-    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
-    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
-    value_info = [oh.make_tensor_value_info("add_param_0", TensorProto.FLOAT, [1])]
-    value_info += [oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [1])]
-    value_info += [oh.make_tensor_value_info("mul_param_0", TensorProto.FLOAT, [1])]
-    modelproto = oh.make_model(
-        oh.make_graph(
-            name="test",
-            inputs=[top_in],
-            outputs=[top_out],
-            value_info=value_info,
-            nodes=[
-                oh.make_node("Add", ["top_in", "add_param_0"], ["t0"]),
-                oh.make_node("Transpose", ["t0"], ["t1"], perm=[0, 2, 3, 1]),
-                oh.make_node("Transpose", ["t1"], ["t2"], perm=[0, 3, 1, 2]),
-                oh.make_node("Add", ["t2", "add_param_1"], ["t3"]),
-                oh.make_node("Transpose", ["t3"], ["t4"], perm=[0, 2, 3, 1]),
-                oh.make_node("Transpose", ["t4"], ["t5"], perm=[0, 3, 1, 2]),
-                oh.make_node("Add", ["t5", "t2"], ["t6"]),
-                oh.make_node("Mul", ["t6", "mul_param_0"], ["top_out"]),
-            ],
-        )
-    )
-    model = ModelWrapper(modelproto)
+    shp = [1, 3, 4, 2]
+    shp_str = str(shp)
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{shp_str} in0) => (float{shp_str} out0)
+    <
+        float[1] add0_param = {{1.0}},
+        float[1] add1_param = {{3.0}},
+        float[1] mul0_param = {{2.0}}
+    >
+    {{
+        add0_out = Add(in0, add0_param)
+        t0_out = Transpose<perm=[0,2,3,1]>(add0_out)
+        t1_out = Transpose<perm=[0,3,1,2]>(t0_out)
+        add1_out = Add(t1_out, add1_param)
+        t2_out = Transpose<perm=[0,2,3,1]>(add1_out)
+        t3_out = Transpose<perm=[0,3,1,2]>(t2_out)
+        add2_out = Add(t1_out, t3_out)
+        t4_out = Transpose<perm=[0,2,3,1]>(add2_out)
+        t5_out = Transpose<perm=[0,3,1,2]>(t4_out)
+        t6_out = Transpose<perm=[0,3,1,2]>(t4_out)
+        m0_out = Mul(t5_out, mul0_param)
+        m1_out = Mul(t6_out, mul0_param)
+        out0 = Mul(m0_out, m1_out)
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
     model = model.transform(InferShapes())
-    model.set_initializer("add_param_0", np.asarray([1], dtype=np.float32))
-    model.set_initializer("add_param_1", np.asarray([3], dtype=np.float32))
-    model.set_initializer("mul_param_0", np.asarray([2], dtype=np.float32))
     new_model = model.transform(AbsorbConsecutiveTransposes())
     new_model = new_model.transform(InferShapes())
-    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+    inp_dict = {"in0": np.random.rand(*shp).astype(np.float32)}
     assert ox.compare_execution(model, model, inp_dict)
-    assert len(new_model.graph.node) == 4
+    assert len(new_model.graph.node) == 6
     for n in new_model.graph.node:
         assert new_model.graph.node[0].op_type != "Transpose"
diff --git 
a/tests/transformation/streamline/test_move_past_fork.py b/tests/transformation/streamline/test_move_past_fork.py index 5064fa3fca869a245c87cf0c1680d1357e5de60b..7e77d7f9b3502429f08c40558e330b6261d0dbad 100644 --- a/tests/transformation/streamline/test_move_past_fork.py +++ b/tests/transformation/streamline/test_move_past_fork.py @@ -28,80 +28,113 @@ import pytest import numpy as np -from onnx import TensorProto, helper +import onnx.parser as oprs from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import get_by_name import finn.core.onnx_exec as oxe -from finn.transformation.streamline.reorder import MoveLinearPastFork +from finn.transformation.streamline.reorder import ( + MoveLinearPastFork, + MoveTransposePastFork, +) + + +@pytest.mark.streamline +def test_move_past_fork_transpose(): + shp = [1, 3, 32, 32] + shp_str = str(shp) + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0) => (float{shp_str} out0) + {{ + t0_out = Transpose<perm=[0,2,3,1]>(in0) + t1_out = Transpose<perm=[0,3,1,2]>(t0_out) + t2_out = Transpose<perm=[0,3,1,2]>(t0_out) + out0 = Add(t1_out, t2_out) + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) + model = model.transform(InferShapes()) + new_model = model.transform(MoveTransposePastFork()) + new_model = new_model.transform(GiveUniqueNodeNames()) + nodes = new_model.graph.node + assert oxe.compare_execution( + model, new_model, {"in0": np.random.rand(*shp).astype(np.float32)} + ) + assert len(nodes) == 5 + assert not new_model.is_fork_node(get_by_name(nodes, "Transpose_0")) @pytest.mark.streamline @pytest.mark.parametrize("ch", [64, 1]) # ifmdim @pytest.mark.parametrize("ifmdim", [-1, 7]) -def test_move_past_fork(ch, ifmdim): - # generate test vectors of correct shape +def test_move_past_fork_linear(ch, ifmdim): if ifmdim == -1: - input_shape = (1, ch) + shp = [1, ch] else: - input_shape = (1, ch, ifmdim, ifmdim) + shp = [1, ch, ifmdim, ifmdim] + shp_str = str(shp) + input = f""" + < + ir_version: 7, + opset_import: ["" : 9] + > + agraph (float{shp_str} in0) => (float{shp_str} out0) + < + float{shp_str} add0_param, + float{shp_str} mul_shared_param, + float{shp_str} add2_param, + float{shp_str} mul2_param, + float{shp_str} add3_param, + float{shp_str} add4_param, + float{shp_str} mul3_param, + float{shp_str} add6_param + > + {{ - top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) - top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape) - - num_of_params = 8 - value_info = [] - for i in range(num_of_params): - value_info += [ - helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape) - ] - - add_1_to_move = helper.make_node("Add", ["top_in", "p0"], ["fork1"]) - mul_1_to_move = helper.make_node("Mul", ["t5", "p4"], ["fork2"]) - add_2_to_move = helper.make_node("Add", ["fork2", "p5"], ["t6"]) - mul_1_not_to_move = helper.make_node("Mul", ["t8", "p7"], ["fork3"]) - modelproto = helper.make_model( - helper.make_graph( - name="test", - inputs=[top_in], - outputs=[top_out], - value_info=value_info, - nodes=[ - # fork1 - add_1_to_move, - helper.make_node("Mul", ["fork1", "p1"], ["t2"]), - helper.make_node("Mul", ["fork1", "p2"], ["t3"]), - helper.make_node("Add", ["t2", "t3"], ["t4"]), - helper.make_node("Add", ["t4", "p3"], ["t5"]), - # fork2 - mul_1_to_move, - add_2_to_move, - 
helper.make_node("Add", ["fork2", "p6"], ["t7"]), - helper.make_node("Add", ["t6", "t7"], ["t8"]), - # empty branches: do nothing - mul_1_not_to_move, - helper.make_node("Add", ["fork3", "fork3"], ["top_out"]), - ], - ) - ) - model = ModelWrapper(modelproto) + add0_out = Add(in0, add0_param) + mul0_out = Mul(add0_out, mul_shared_param) + mul1_out = Mul(add0_out, mul_shared_param) + add1_out = Add(mul0_out, mul1_out) + add2_out = Add(add1_out, add2_param) + mul2_out = Mul(add2_out, mul2_param) + add3_out = Add(mul2_out, add3_param) + add4_out = Add(mul2_out, add4_param) + add5_out = Add(add3_out, add4_out) + mul3_out = Mul(add5_out, mul3_param) + out0 = Add(mul3_out, add6_param) + }} + """ + model = oprs.parse_model(input) + model = ModelWrapper(model) model = model.transform(InferShapes()) np.random.seed(0) - for i in range(num_of_params): - model.set_initializer( - "p" + str(i), np.random.rand(*input_shape).astype(np.float32) - ) - + for tensor_name in model.get_all_tensor_names(): + if tensor_name.endswith("_param"): + pshape = model.get_tensor_shape(tensor_name) + model.set_initializer( + tensor_name, np.random.rand(*pshape).astype(np.float32) + ) + model = model.transform(GiveUniqueNodeNames()) # Transform new_model = model.transform(MoveLinearPastFork()) - inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)} - + new_model = new_model.transform(GiveUniqueNodeNames()) + inp_dict = {"top_in": np.random.rand(*shp).astype(np.float32)} # Test assert oxe.compare_execution(model, new_model, inp_dict) - assert not new_model.is_fork_node(add_1_to_move) - assert not new_model.is_fork_node(mul_1_to_move) - assert not new_model.is_fork_node(add_2_to_move) - assert new_model.is_fork_node(mul_1_not_to_move) + nodes = new_model.graph.node + assert len(new_model.get_nodes_by_op_type("Add")) == 9 + assert len(new_model.get_nodes_by_op_type("Mul")) == 5 + assert not new_model.is_fork_node(get_by_name(nodes, "Add_0")) + assert new_model.is_join_node(get_by_name(nodes, "Add_2")) + assert not new_model.is_fork_node(get_by_name(nodes, "Mul_2")) + assert not new_model.is_join_node(get_by_name(nodes, "Add_5")) assert len(new_model.graph.node) == 14 diff --git a/tests/transformation/test_qonnx_to_finn.py b/tests/transformation/test_qonnx_to_finn.py index 43055f6704732866569ac4770202f1b4ff6bfb22..7e438b4b8ba9d9befca79100bb9727735afa27d3 100644 --- a/tests/transformation/test_qonnx_to_finn.py +++ b/tests/transformation/test_qonnx_to_finn.py @@ -94,6 +94,9 @@ def analysis_testing_for_no_quant_nodes(model): @pytest.mark.parametrize("wbits", [1, 2]) @pytest.mark.parametrize("model_name", ["TFC", "SFC", "LFC", "CNV", "mobilenet"]) def test_QONNX_to_FINN(model_name, wbits, abits): + if model_name == "mobilenet": + pytest.xfail("MobileNet test is temporarily excluded from QONNX testing.") + if wbits > abits: pytest.skip("No wbits > abits cases at the moment") if model_name == "LFC" and wbits == 2 and abits == 2: diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index d33a4f2fd6c974b13ac315c7ef621eacb04002c4..cdf69aebddc4d6af2288774acbff5dd8a52512b3 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -39,6 +39,7 @@ from finn.util.basic import make_build_dir @pytest.mark.slow @pytest.mark.vivado +@pytest.mark.end2end def test_end2end_build_dataflow_directory(): test_dir = make_build_dir("test_build_dataflow_directory_") target_dir = test_dir + "/build_dataflow"