diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 9b3111b70eae97a3644e1de23c368bd5b09f7927..c575ca7e3b3b04eb651fb1135d949baad038f1ad 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -107,9 +107,6 @@ These are summarized below:
 * (optional) ``LOCALHOST_URL`` (default localhost) sets the base URL for accessing e.g. Netron from inside the container. Useful when running FINN remotely.
 * (optional) ``NETRON_PORT`` (default 8081) changes the port for Netron inside Docker
 * (optional) ``PYNQ_BOARD`` or ``ALVEO_BOARD`` specifies the type of PYNQ/Alveo board used (see "supported hardware" below) for the test suite
-* (optional) ``PYNQ_IP`` and ``PYNQ_PORT`` (or ``ALVEO_IP`` and ``ALVEO_PORT``) specify ip address and port number to access the PYNQ board / Alveo target
-* (optional) ``PYNQ_USERNAME`` and ``PYNQ_PASSWORD`` (or ``ALVEO_USERNAME`` and ``ALVEO_PASSWORD``) specify the PYNQ board / Alveo host access credentials for the test suite. For PYNQ, password is always needed to run as sudo. For Alveo, you can leave the password empty and place your ssh private key in the ``finn/ssh_keys`` folder to use keypair authentication.
-* (optional) ``PYNQ_TARGET_DIR`` (or ``ALVEO_TARGET_DIR``) specifies the target dir on the PYNQ board / Alveo host for the test suite
 * (optional) ``IMAGENET_VAL_PATH`` specifies the path to the ImageNet validation directory for tests.
 * (optional) ``FINN_DOCKER_PREBUILT`` (default 0) if set to 1 then skip Docker image building and use the image tagged with ``FINN_DOCKER_TAG``.
 * (optional) ``FINN_DOCKER_TAG`` (autogenerated) specifies the Docker image tag to use.
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index d0c4cd20650a7cb1ef63f68ff559bebbba93ae05..652c94ac248437bdf83c0c3047f6cbd2d3b85651 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -206,6 +206,64 @@ How to set *mem_mode*
 ---------------------
 When the nodes in the network are converted to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the conversion to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is passed as argument. Note that if no argument is passed, the default is *const*.
 
+
+.. _folding_factors:
+
+Constraints to folding factors per layer
+=========================================
+
+.. list-table:: Folding factor constraints
+
+   * - **Layers**
+     - **Parameters**
+     - **Constraints**
+   * - AddStreams_Batch
+     - PE
+     - inp_channels % PE == 0
+   * - ChannelwiseOp_Batch
+     - PE
+     - channels % PE == 0
+   * - ConvolutionInputGenerator
+     - SIMD
+     - inp_channels % SIMD == 0
+   * - ConvolutionInputGenerator1D
+     - SIMD
+     - inp_channels % SIMD == 0
+   * - DownSampler
+     - SIMD
+     - inp_channels % SIMD == 0
+   * - DuplicateStreams_Batch
+     - PE
+     - channels % PE == 0
+   * - StreamingEltwise
+     - PE
+     - inp_channels % PE == 0
+   * - FMPadding_Batch
+     - SIMD
+     - inp_channels % SIMD == 0
+   * - FMPadding_rtl
+     - SIMD
+     - inp_channels % SIMD == 0
+   * - GlobalAccPool_Batch
+     - PE
+     - channels % PE == 0
+   * - LabelSelect_Batch
+     - PE
+     - num_labels % PE == 0
+   * - MatrixVectorActivation
+     - PE & SIMD
+     - MH % PE == 0 & MW % SIMD == 0
+   * - Pool_Batch
+     - PE
+     - inp_channels % PE == 0
+   * - Thresholding_Batch
+     - PE
+     - MH % PE == 0
+   * - VectorVectorActivation
+     - PE & SIMD
+     - k_h * k_w % SIMD == 0 & channels % PE == 0
+
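+As a minimal sketch (assuming ``mvau_node`` is a ``MatrixVectorActivation`` node of a model loaded via qonnx's ``ModelWrapper``; the candidate values are hypothetical), the MVAU constraints can be checked before the folding attributes are set::
+
+    from qonnx.custom_op.registry import getCustomOp
+
+    mvau_inst = getCustomOp(mvau_node)
+    mw = mvau_inst.get_nodeattr("MW")
+    mh = mvau_inst.get_nodeattr("MH")
+    pe, simd = 4, 8  # hypothetical candidate folding factors
+    assert mh % pe == 0 and mw % simd == 0, "PE/SIMD violate the MVAU constraints"
+    mvau_inst.set_nodeattr("PE", pe)
+    mvau_inst.set_nodeattr("SIMD", simd)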
+
 RTL ConvolutionInputGenerator
 =============================
 
diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst
index afa1ecffa08213db6a282076c6fdf59694f9e13e..28cb47eaf70cade96a1146559cbbd92248923a34 100644
--- a/docs/finn/source_code/finn.core.rst
+++ b/docs/finn/source_code/finn.core.rst
@@ -54,14 +54,6 @@ finn.core.onnx\_exec
    :undoc-members:
    :show-inheritance:
 
-finn.core.remote\_exec
------------------------------
-
-.. automodule:: finn.core.remote_exec
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 finn.core.rtlsim\_exec
 -----------------------------
 
diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst
index 7ba3b252abfa0086a8c0281eb9a792fb239d6ec3..aebd0604f4c555f2b1bc637bc4c3d94b35309722 100644
--- a/docs/finn/source_code/finn.util.rst
+++ b/docs/finn/source_code/finn.util.rst
@@ -99,14 +99,6 @@ finn.util.fpgadataflow
    :undoc-members:
    :show-inheritance:
 
-finn.util.gdrive
------------------------------
-
-.. automodule:: finn.util.gdrive
-  :members:
-  :undoc-members:
-  :show-inheritance:
-
 finn.util.hls
 ---------------
 
diff --git a/finn-rtllib/swg/swg_common.sv b/finn-rtllib/swg/swg_common.sv
index ff6778973c4d5d5663bc0c4f7043fca76ebdbf26..f2cdc333cad0a546aa9cfb55c9ca1dd74a753dc2 100644
--- a/finn-rtllib/swg/swg_common.sv
+++ b/finn-rtllib/swg/swg_common.sv
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,8 +29,10 @@
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/
 
+
 // loop controller used for both, "default" and "parallel", implementation styles
-module swg_controller #(
+module swg_controller
+import swg::*; #(
     int unsigned  LOOP_H_ITERATIONS,
     int unsigned  LOOP_W_ITERATIONS,
     int unsigned  LOOP_KH_ITERATIONS,
@@ -50,7 +52,7 @@ module swg_controller #(
     int TAIL_INCR_H,
     int TAIL_INCR_LAST,
 
-    parameter INNERMOST_STATE
+    state_e INNERMOST_STATE
 )(
     input   logic  clk,
     input   logic  rst_n,
@@ -61,14 +63,6 @@ module swg_controller #(
 );
 
     // state and counters
-    typedef enum logic [2:0] {
-        STATE_START,
-        STATE_LOOP_SIMD,
-        STATE_LOOP_KW,
-        STATE_LOOP_KH,
-        STATE_LOOP_W,
-        STATE_LOOP_H
-    }  state_e;
     state_e  State = INNERMOST_STATE;
     state_e  state_next;
 
diff --git a/finn-rtllib/swg/swg_pkg.sv b/finn-rtllib/swg/swg_pkg.sv
new file mode 100644
index 0000000000000000000000000000000000000000..1200310acad2931568235a82fd0277b8dd50a424
--- /dev/null
+++ b/finn-rtllib/swg/swg_pkg.sv
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+package swg;
+	typedef enum logic [2:0] {
+		STATE_START,
+		STATE_LOOP_SIMD,
+		STATE_LOOP_KW,
+		STATE_LOOP_KH,
+		STATE_LOOP_W,
+		STATE_LOOP_H
+	} state_e;
+endpackage : swg
diff --git a/finn-rtllib/swg/swg_template_axilite.v b/finn-rtllib/swg/swg_template_axilite.v
index 9479c7f80d7d82b27141dbe5abcce442049237bd..1f39e4440e47f752816907dc454d15b849c8fa85 100644
--- a/finn-rtllib/swg/swg_template_axilite.v
+++ b/finn-rtllib/swg/swg_template_axilite.v
@@ -1,8 +1,35 @@
+/******************************************************************************
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
 
-`timescale 1 ns / 1 ps
-
-module $TOP_MODULE_NAME$_axilite #
-(
+module $TOP_MODULE_NAME$_axilite #(
     // Users to add parameters here
 
     // User parameters ends
@@ -12,8 +39,7 @@ module $TOP_MODULE_NAME$_axilite #
     parameter integer C_S_AXI_DATA_WIDTH	= 32,
     // Width of S_AXI address bus
     parameter integer C_S_AXI_ADDR_WIDTH	= 6
-)
-(
+)(
     // Users to add ports here
     output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg0,
     output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg1,
diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv
index 4970762172b5bcc1c418c5bbb60bdfee52568dd8..78a8d0a3b984b987df6ca62f3789fbbedcbc6d8b 100644
--- a/finn-rtllib/swg/swg_template_default.sv
+++ b/finn-rtllib/swg/swg_template_default.sv
@@ -98,7 +98,7 @@ module $TOP_MODULE_NAME$_impl #(
         .TAIL_INCR_LAST($TAIL_INCR_LAST$),
         .INCR_BITWIDTH($INCR_BITWIDTH$),
         .IS_DEPTHWISE($IS_DEPTHWISE$),
-        .INNERMOST_STATE($INNERMOST_STATE$)
+        .INNERMOST_STATE(swg::$INNERMOST_STATE$)
     )
     controller_inst (
         .clk(ap_clk),
diff --git a/finn-rtllib/swg/swg_template_default_dynamic.sv b/finn-rtllib/swg/swg_template_default_dynamic.sv
index 412f8689ba33ec248ba7ebd50ca201204b001b1a..5a6fdda170242b804353cac186c1f920a4a71d7a 100644
--- a/finn-rtllib/swg/swg_template_default_dynamic.sv
+++ b/finn-rtllib/swg/swg_template_default_dynamic.sv
@@ -1,3 +1,34 @@
+/******************************************************************************
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
 module $TOP_MODULE_NAME$_controller #(
     int unsigned  CNTR_BITWIDTH,
     int unsigned  INCR_BITWIDTH,
@@ -27,6 +58,8 @@ module $TOP_MODULE_NAME$_controller #(
     input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_last
 );
 
+    import  swg::*;
+
     // (dynamic) configuration registers
     logic [CNTR_BITWIDTH-1:0] Cfg_cntr_simd      = $LOOP_SIMD_ITERATIONS$;
     logic [CNTR_BITWIDTH-1:0] Cfg_cntr_kw        = $LOOP_KW_ITERATIONS$;
@@ -62,14 +95,6 @@ module $TOP_MODULE_NAME$_controller #(
     end
 
     // state and counters
-    typedef enum logic [2:0] {
-        STATE_START,
-        STATE_LOOP_SIMD,
-        STATE_LOOP_KW,
-        STATE_LOOP_KH,
-        STATE_LOOP_W,
-        STATE_LOOP_H
-    }  state_e;
     state_e  State = $INNERMOST_STATE$;
     state_e  state_next;
 
diff --git a/finn-rtllib/swg/swg_template_parallel.sv b/finn-rtllib/swg/swg_template_parallel.sv
index b55a51e4005a1a8332ebe74acb61eac10f246f7f..83a525ff36b883122cd31a59675557a23085a872 100644
--- a/finn-rtllib/swg/swg_template_parallel.sv
+++ b/finn-rtllib/swg/swg_template_parallel.sv
@@ -123,7 +123,7 @@ module $TOP_MODULE_NAME$_impl #(
         .TAIL_INCR_LAST($TAIL_INCR_LAST$),
         .INCR_BITWIDTH($INCR_BITWIDTH$),
         .IS_DEPTHWISE($IS_DEPTHWISE$),
-        .INNERMOST_STATE($INNERMOST_STATE$)
+        .INNERMOST_STATE(swg::$INNERMOST_STATE$)
     )
     controller_inst (
         .clk(ap_clk),
diff --git a/notebooks/advanced/3_folding.ipynb b/notebooks/advanced/3_folding.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..07b66da52fbc340cc7ff14f47e42980f011aec3a
--- /dev/null
+++ b/notebooks/advanced/3_folding.ipynb
@@ -0,0 +1,664 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# FINN - Folding\n",
+    "--------------------------------------\n",
+    "**Note: We will utilize one of the intermediate models generated in the process of the cybersecurity end2end example**\n",
+    "\n",
+    "There is a local copy of `step_convert_to_hls.onnx` in this directory, which was renamed to `cybsec_PE_SIMD.onnx` to be able to go through this tutorial without requisites. But you can also generate it yourself with the [third cybersecurity Jupyter notebook](../end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb). After the execution of the estimates only build flow, it can be found in `../end2end_example/cybersecurity/output_estimates_only/intermediate_models/step_convert_to_hls.onnx`. \n",
+    "\n",
+    "This notebook describes the use of FINN parallelization parameters (PE & SIMD), also called folding factors, to efficiently optimize models so as to extract the maximum performance out of them. \n",
+    "\n",
+    "Please be aware that the folding factors can not be selected arbitrarily, each layer has constraints on which values the parallelization parameters can be set to, for more information see here: https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer\n",
+    "\n",
+    "We'll use the utility function `showInNetron()` to visualize and interact with our network in the Jupyter Notebook and `showSrc()` to show source code of FINN library calls."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.util.visualization import showInNetron, showSrc"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: The build_flow in the cybsec_mlp notebook comprises a transformation step `step_target_fps_parallelization` that automatically sets custom parallelization parameters needed to achieve a given `target_fps` by invoking the [`SetFolding` transformation](https://github.com/Xilinx/finn/blob/main/src/finn/transformation/fpgadataflow/set_folding.py#L46).\n",
+    "\n",
+    "More details of the above step can be found [here](https://github.com/Xilinx/finn/blob/main/src/finn/builder/build_dataflow_steps.py#L394)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook shows the manual version of this step and explains how these attributes can improve performance and what are their effects on resource utilization for developers who need to maximize the performance of their network. \n",
+    "\n",
+    "For that we will use the `cybsec_PE_SIMD.onnx` file as starting point. This intermediate model from the cybersecurity example is the model representation after the high-level ONNX layers are converted to HLS layers. Each node in the graph now corresponds to an HLS C++ function call and the parallelization parameters can be set using the node attributes.\n",
+    "\n",
+    "We will take this model to show how to set the folding factors manually and analyze the estimated execution clock cycles and the resource utilization of each layer in the network."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### FINN-style Dataflow Architectures <a id='dataflow_arch'></a>\n",
+    "\n",
+    "We start with a quick recap of FINN-style dataflow architectures. The key idea in such architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, as illustrated in the figure below.\n",
+    "\n",
+    "![](finn-dataflow.png)\n",
+    "\n",
+    "In practice, the layers are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library.\n",
+    "\n",
+    "Since each layer will be instantiated, we can flexibly set the parallelization of each layer and thus control resources and throughput of our network, as visualized in the image below:\n",
+    "\n",
+    "![](finn-folding.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part-1 : Loading the ONNX model.\n",
+    "\n",
+    "As discussed above, the network needs to go through a few preparation steps before it can be fed into our estimation functions.\n",
+    "\n",
+    "The `.onnx` file loaded here is taken from the cybersecurity end2end example notebook. \n",
+    "We pick the onnx file `cybsec_PE_SIMD.onnx` to which the necessary transformations have been applied for this notebook. This means, network layers mapped to necessary FINN-HLS blocks. In this case, the `MatrixVectorActivation` units. \n",
+    "\n",
+    "To interact with the `.onnx` file we use `ModelWrapper()`. This wrapper simplifies the access to different model attributes and allows us to apply custom transformations on the model.\n",
+    "\n",
+    "In the below cell, we load our onnx file and view the cybersecurity MLP network in Netron."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from qonnx.core.modelwrapper import ModelWrapper\n",
+    "model_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD.onnx\" \n",
+    "model = ModelWrapper(model_path)\n",
+    "\n",
+    "showInNetron(model_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 2 : Parallelization Parameters: PE & SIMD"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The computational parallelism can be varied by setting the folding factors or also called parallelization parameters **PE** and **SIMD** of each layer. These parallelization attributes are subject to certain constraints and should be selected accordingly.\n",
+    "\n",
+    "To see more details about how this is implemented in the `MatrixVectorActivation` layer (MVAU), please have a look at [this documentation](https://github.com/Xilinx/finn/blob/github-pages/docs/finn-sheduling-and-folding.pptx). A schematic of the folding in an MVAU for a fully-connected layer is shown below:\n",
+    "\n",
+    "![](finn-folding-mvau.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the case of the MVAU, `PE` & `SIMD` are subject to the following constraints: \n",
+    "\n",
+    "If `MW` is the number of input features and `MH` the number of output features:\n",
+    "\n",
+    "        MW % SIMD == 0\n",
+    "        MH % PE == 0\n",
+    "        \n",
+    "Total folding in the case of the MVAU is defined as:\n",
+    "\n",
+    "    Total folding = (MH/PE) x (MW/SIMD)\n",
+    "\n",
+    "In a streaming dataflow architecture like it is in FINN designs the throughput is determined by the slowest layer. So, the goal of adjusting these parameters is to get an almost balanced pipeline i.e. equalizing the throughput rate of layers in the generated dataflow architecture.\n",
+    "\n",
+    "The FINN compiler provides analysis passes to facilitate the exploration of the folding factors of each layer. In this notebook we will show how to use these functions and explore how the parallelization parameters affect the clock cycles and the resource utilization of the generated dataflow architecture.\n",
+    "\n",
+    "We start with a naive case where `PE` & `SIMD` values across all layers are 1, this is the starting point of our exploration and is also the state the network is in after the conversion to HLS layers. If you take a look at the model using Netron and click on one of the MVAU layers, you can see that `PE` and `SIMD` are both set to 1 by default."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "showInNetron(model_path)"
+   ]
+  },
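+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make the total folding formula concrete, the following small sketch computes it for the first layer of this network, assuming its shape from the cybersecurity example (`MW=600` input features, `MH=64` output features) and the default `PE=SIMD=1`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: total folding of the first MVAU with default parallelization.\n",
+    "# MW/MH are the shapes from the cybersecurity example; PE/SIMD are the defaults.\n",
+    "MW, MH = 600, 64\n",
+    "PE, SIMD = 1, 1\n",
+    "total_folding = (MH // PE) * (MW // SIMD)\n",
+    "print(\"Total folding:\", total_folding)"
+   ]
+  },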
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We import the analysis passes  `exp_cycles_per_layer()` and  `res_estimation()` to estimate the number of clock cycles and resource utilization of each network layer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer\n",
+    "from finn.analysis.fpgadataflow.res_estimation import res_estimation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Analysis passes in FINN return information about the model in form of a dictionary, you can learn more about analysis passes in general in this Jupyter notebook: [0_custom_analysis_pass.ipynb](0_custom_analysis_pass.ipynb).\n",
+    "\n",
+    "We start by calling the analysis pass `exp_cycles_per_layer()`, which returns a dictionary with the layer names as keys and the expected cycles as values. Afterwards, we plot the result in a block diagram."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cycles_dict = model.analysis(exp_cycles_per_layer)\n",
+    "cycles_dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "fig = plt.figure(figsize = (10, 5))\n",
+    "plt.bar(cycles_dict.keys(), cycles_dict.values(), color ='blue', width = 0.3)\n",
+    "plt.xlabel(\"Network layers\")\n",
+    "plt.ylabel(\"Number of clock cycles\")\n",
+    "plt.title(\"Clock cycles per layer PE=SIMD=1\")\n",
+    "plt.show()"
+   ]
+  },
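+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Instead of reading the numbers off the chart, we can also determine the slowest layer programmatically. The sketch below reuses `cycles_dict` and assumes a hypothetical 100 MHz clock purely for illustration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: find the slowest layer and the resulting throughput upper bound\n",
+    "bottleneck = max(cycles_dict, key=cycles_dict.get)\n",
+    "fclk_mhz = 100  # assumed clock frequency, for illustration only\n",
+    "fps_bound = fclk_mhz * 1e6 / cycles_dict[bottleneck]\n",
+    "print(\"Slowest layer:\", bottleneck, \"with\", cycles_dict[bottleneck], \"cycles\")\n",
+    "print(\"Throughput upper bound: %.0f inferences/s\" % fps_bound)"
+   ]
+  },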
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We observe that the bottleneck in the execution of the model on hardware would come from the execution of the first layer which takes estimated 38400 clock cycles to execute one set of its inputs.\n",
+    "\n",
+    "No matter how quickly the other layers execute, the throughput will be defined by the first layer's execution latency.\n",
+    "\n",
+    "Let's have a look now at the estimated resources per layer by calling another analysis pass.\n",
+    "The keys are again the layer names, but the values are now a dictionary with the resource estimates per layer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res_dict = model.analysis(res_estimation)\n",
+    "res_dict"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next to the absolute numbers of LUTs, BRAM, URAM and DSPs, the analysis pass also provides information about the efficiency of the memory usage. If the memory type is not utilized, the efficiency is by default 1. You can see that above for the `URAM_efficiency`. In all other cases the efficiency indicates the actual parameter storage needed divided by the allocated BRAM/URAM storage. So, this means in our example MVAU_0 uses 5 block ram and they are 83% utilized. "
+   ]
+  },
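+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a small sketch, we can pull the BRAM counts and their efficiency out of `res_dict` directly (assuming the estimate keys `BRAM_18K` and `BRAM_efficiency` reported by the analysis pass):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: report BRAM count and efficiency per layer from res_dict\n",
+    "for layer, res in res_dict.items():\n",
+    "    print(layer, \"BRAM_18K:\", res[\"BRAM_18K\"], \"efficiency:\", res[\"BRAM_efficiency\"])"
+   ]
+  },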
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After we extract that information from the model, we plot the number of LUTs. In this notebook we concentrate on the influence on the LUT usage, but by manipulating the code below, you can also extract information about memory and dsp usage."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extracting LUTs from res_dict\n",
+    "LUTs = [res_dict[key][\"LUT\"] for key in res_dict.keys()]   \n",
+    "\n",
+    "#Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n",
+    "fig = plt.figure(figsize = (10, 5))\n",
+    "plt.bar(res_dict.keys(), LUTs, color ='green', width = 0.3)\n",
+    "plt.xlabel(\"Network layers\")\n",
+    "plt.ylabel(\"Number of LUTs\")\n",
+    "plt.title(\"No. of LUTs per layer PE=SIMD=1\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Since we identified above that the first layer takes the highest number of cycles to complete the execution, we will now try to adjust the folding parameters to reduce its latency at the expense of an increase in resource utilization."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Modify Parameters\n",
+    "\n",
+    "We now modify the parallelization parameters of the first network layer to reduce its latency.\n",
+    "We only extract the first `MatrixVectorActivation` block from the model and set the parallelization parameters manually.\n",
+    "\n",
+    "In the first step, we left the `PE` & `SIMD` values for all the layers on default (=1) to establish a baseline and measure the estimated clock cycles and resource utilization for each of the individual layers.\n",
+    "\n",
+    "To set `PE` & `SIMD`, we will utilize functionality from the FINN compiler. Each layer type has a Python wrapper which can be instantiated using the `getCustomOp()` function. The wrapper offers several helper functions like `get_nodeattr()` and `set_nodeattr()` to access and set the attributes of a node."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qonnx.custom_op.registry import getCustomOp\n",
+    "\n",
+    "list_of_mvaus = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "mvau0 = list_of_mvaus[0]\n",
+    "\n",
+    "mvau0_inst = getCustomOp(mvau0)\n",
+    "\n",
+    "# Get the node attributes to check the current setting\n",
+    "print(\"The parallelization parameters of %s were: \" % mvau0.name)\n",
+    "print(\"PE: \" + str(mvau0_inst.get_nodeattr(\"PE\")))\n",
+    "print(\"SIMD: \" + str(mvau0_inst.get_nodeattr(\"SIMD\")))\n",
+    "\n",
+    "# Set the new node attributes\n",
+    "mvau0_inst.set_nodeattr(\"PE\", 2)\n",
+    "mvau0_inst.set_nodeattr(\"SIMD\", 5)\n",
+    "\n",
+    "# Get the node attributes to check the updated setting\n",
+    "print(\"The parallelization parameters of %s are updated to: \" % mvau0.name)\n",
+    "print(\"PE: \" + str(mvau0_inst.get_nodeattr(\"PE\")))\n",
+    "print(\"SIMD: \" + str(mvau0_inst.get_nodeattr(\"SIMD\")))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We save the model and view it. On expanding the first `MatrixVectorActivation` we can see the updated `PE` & `SIMD` parameters for that layer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save(\"cybsec_PE_SIMD_modified.onnx\")\n",
+    "showInNetron(\"cybsec_PE_SIMD_modified.onnx\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "From the above total folding formula, we have reduced the total folding of our layer from `600 x 64` to `120 x 32`. Hence, resulting in an estimated `10x` decrease in the execution latency of our layer. \n",
+    "This can be observed in the new estimated clock cycles."
+   ]
+  },
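+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The arithmetic check simply re-evaluates the total folding formula with the old and the new parallelization parameters of the first layer:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: total folding of the first MVAU before and after refolding\n",
+    "MW, MH = 600, 64\n",
+    "before = (MH // 1) * (MW // 1)  # PE=SIMD=1\n",
+    "after = (MH // 2) * (MW // 5)   # PE=2, SIMD=5\n",
+    "print(\"before: %d, after: %d, reduction: %.1fx\" % (before, after, before / after))"
+   ]
+  },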
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cycles_dict_updated = model.analysis(exp_cycles_per_layer)\n",
+    "cycles_dict_updated"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure(figsize = (10, 5))\n",
+    "plt.bar(cycles_dict_updated.keys(), cycles_dict_updated.values(), color ='blue', width = 0.3)\n",
+    "plt.xlabel(\"Network layers\")\n",
+    "plt.ylabel(\"Number of clock cycles\")\n",
+    "plt.title(\"Clock cycles per layer with updated folding factors\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This has of course consequences for the resource usage of the network."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res_dict_updated = model.analysis(res_estimation)\n",
+    "res_dict_updated"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extracting LUTs from res_dict\n",
+    "LUTs_updated = [res_dict_updated[key][\"LUT\"] for key in res_dict_updated.keys()]   \n",
+    "\n",
+    "#Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n",
+    "fig = plt.figure(figsize = (10, 5))\n",
+    "plt.bar(res_dict_updated.keys(), LUTs_updated, color ='green', width = 0.3)\n",
+    "plt.xlabel(\"Network Layers\")\n",
+    "plt.ylabel(\"LUT Utilisation\")\n",
+    "plt.title(\"No. of LUTs per layer with updated folding factors\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "From these numbers, we see that the first layer has been removed as the bottleneck and that the entire network can now perform one inference in ~4096 clock cycles (when the pipeline is full) as compared to the earlier configuration where it took ~38400 execution cycles.\n",
+    "\n",
+    "This decrease in execution latency of the network though comes at a cost of a 45% increase in LUT resource utilization for the first layer of the network."
+   ]
+  },
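+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sketch, the dictionaries gathered above let us confirm both observations programmatically: the new bottleneck layer and the relative LUT increase of the first layer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: new bottleneck and relative LUT increase of the first layer\n",
+    "bottleneck = max(cycles_dict_updated, key=cycles_dict_updated.get)\n",
+    "print(\"New bottleneck:\", bottleneck, \"at\", cycles_dict_updated[bottleneck], \"cycles\")\n",
+    "lut_before = res_dict[mvau0.name][\"LUT\"]\n",
+    "lut_after = res_dict_updated[mvau0.name][\"LUT\"]\n",
+    "print(\"LUT increase of %s: %.0f%%\" % (mvau0.name, 100 * (lut_after - lut_before) / lut_before))"
+   ]
+  },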
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Important Note : StreamingDataWidthConverters"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next to resources and performance, folding factors (or parallelization parameters) are influencing also other properties of the generated design. Since we are able to generate results in parallel, the data that gets fed into the layer needs to be packed in a specific format to provide the correct data at the correct time for the internal parallelism. Also, the data that comes out of a layer will be in a specific format depending on the internal parallelism."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To analyze the influence of the folding factors on the data streams between layers, we first will import the original model (with `PE=SIMD=1`) and then we will import the updated model, so that we can compare the two of them."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dir_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/\" \n",
+    "model_orig = ModelWrapper(dir_path + \"cybsec_PE_SIMD.onnx\")\n",
+    "model_updated = ModelWrapper(\"cybsec_PE_SIMD_modified.onnx\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the next step we extract the information from all layers. For MVAUs the input shape is (1, MW/SIMD, SIMD) and the output shape is (1, MH/PE, PE)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Original model\n",
+    "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "print(\"In the original model (pe=simd=1): \")\n",
+    "for mvau in list_of_mvaus:\n",
+    "    mvau_inst = getCustomOp(mvau)\n",
+    "    print(\"Layer: \" + mvau.name)\n",
+    "    print(\"Input shape: \" + str(mvau_inst.get_folded_input_shape()))\n",
+    "    print(\"Output shape: \" + str(mvau_inst.get_folded_output_shape()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Updated model\n",
+    "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "print(\"In the original model (pe=simd=1): \")\n",
+    "for mvau in list_of_mvaus:\n",
+    "    mvau_inst = getCustomOp(mvau)\n",
+    "    print(\"Layer: \" + mvau.name)\n",
+    "    print(\"Input shape: \" + str(mvau_inst.get_folded_input_shape()))\n",
+    "    print(\"Output shape: \" + str(mvau_inst.get_folded_output_shape()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can see that the input and output shape for MatrixVectorActivation_0 has changed after we have changed the folding factors. These changes have direct influence on the in/out stream width. We can have a closer look at the formula to calculate the stream width of an MVAU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "showSrc(mvau_inst.get_instream_width)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "showSrc(mvau_inst.get_outstream_width)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The input stream width can be calculated by multiplying the input bit width with SIMD and the output stream width can be calculated by multiplying the output bit width with PE."
+   ]
+  },
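+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a sketch, we can recompute the expected stream widths of the last inspected MVAU from its node attributes, assuming the `get_input_datatype()`/`get_output_datatype()` helpers of the node wrapper:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: expected stream widths = datatype bitwidth x parallelization\n",
+    "idt_bits = mvau_inst.get_input_datatype().bitwidth()\n",
+    "odt_bits = mvau_inst.get_output_datatype().bitwidth()\n",
+    "print(\"expected input stream width :\", idt_bits * mvau_inst.get_nodeattr(\"SIMD\"))\n",
+    "print(\"expected output stream width:\", odt_bits * mvau_inst.get_nodeattr(\"PE\"))"
+   ]
+  },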
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To connect two layers with each other for the final design, the input stream width of a node needs to match the output stream width of the preceding node. If that is not the case FINN inserts DataWidthConverters (DWCs) to resolve this mismatch. Let's have a look at the input/output stream width of the layers before updating the parallelization parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Original model\n",
+    "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "print(\"In the original model (pe=simd=1): \")\n",
+    "for mvau in list_of_mvaus:\n",
+    "    mvau_inst = getCustomOp(mvau)\n",
+    "    print(\"Layer: \" + mvau.name)\n",
+    "    print(\"Input stream width: \" + str(mvau_inst.get_instream_width()))\n",
+    "    print(\"Output stream width: \" + str(mvau_inst.get_outstream_width()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the original model the output stream width of one layer matches the input stream width of the following layer. So there would be no DWC required when generating the final design."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For the updated model, the situation is different. Let's have a look how the stream widths have changed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Updated model\n",
+    "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "print(\"In the original model (pe=simd=1): \")\n",
+    "for mvau in list_of_mvaus:\n",
+    "    mvau_inst = getCustomOp(mvau)\n",
+    "    print(\"Layer: \" + mvau.name)\n",
+    "    print(\"Input stream width: \" + str(mvau_inst.get_instream_width()))\n",
+    "    print(\"Output stream width: \" + str(mvau_inst.get_outstream_width()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As we can see, the output stream width of MatrixVectorActivation_0 has now changed to `4`, while the input stream width of MatrixVectorActivation_1 stayed `2`. So, the FINN compiler would insert a DWC between these nodes, we can manually invoke this behavior by calling the transformation `InsertDWC` on our model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n",
+    "from qonnx.transformation.general import GiveUniqueNodeNames\n",
+    "\n",
+    "model_updated = model_updated.transform(InsertDWC())\n",
+    "model_updated = model_updated.transform(GiveUniqueNodeNames())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_updated.save(\"cybsec_DWC.onnx\")\n",
+    "showInNetron(\"cybsec_DWC.onnx\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can observe in the model that a DWC was inserted between the first two layers.\n",
+    "Since the DWC will also be a hardware block in our final FINN design, it has a latency and resources associated with it. Let's have a final look in our resource estimates."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_dwc = ModelWrapper(\"cybsec_DWC.onnx\")\n",
+    "res_dict_dwc = model_dwc.analysis(res_estimation)\n",
+    "res_dict_dwc"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Since we have now one additional layer, we manipulate the data to shorten the layer names in the plot."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "layers = res_dict_dwc.keys()\n",
+    "# replace names of layers with abbreviations\n",
+    "layers = [n.replace(\"MatrixVectorActivation_\", \"MVU\") for n in layers]\n",
+    "layers = [n.replace(\"StreamingDataWidthConverter_Batch\", \"DWC\") for n in layers]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extracting LUTs from res_dict\n",
+    "LUTs_dwc = [res_dict_dwc[key][\"LUT\"] for key in res_dict_dwc.keys()]   \n",
+    "\n",
+    "#Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n",
+    "fig = plt.figure(figsize = (10, 5))\n",
+    "plt.bar(layers, LUTs_dwc, color ='red', width = 0.3)\n",
+    "plt.xlabel(\"Network Layers\")\n",
+    "plt.ylabel(\"LUT Utilisation\")\n",
+    "plt.title(\"Estimated LUT values used for each network layer\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the case of our example network, the `StreamingDataWidthConverter_Batch` layer does not consume a large number of LUT resources as shown in the graph. This might be different for larger models and if there are a higher number of DWCs inserted. Please be aware of this when setting the folding factors for your network."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/advanced/cybsec_PE_SIMD.onnx b/notebooks/advanced/cybsec_PE_SIMD.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..b450cc9e43361e845fda8c95d743e1b461a1a9ad
Binary files /dev/null and b/notebooks/advanced/cybsec_PE_SIMD.onnx differ
diff --git a/notebooks/advanced/finn-dataflow.png b/notebooks/advanced/finn-dataflow.png
new file mode 100755
index 0000000000000000000000000000000000000000..ebe98d0fbd1878fabb9ae2d87bd9b111d62dc39e
Binary files /dev/null and b/notebooks/advanced/finn-dataflow.png differ
diff --git a/notebooks/advanced/finn-folding-mvau.png b/notebooks/advanced/finn-folding-mvau.png
new file mode 100755
index 0000000000000000000000000000000000000000..bbba00182c888b072432116a3a9eafbb1d8cec0e
Binary files /dev/null and b/notebooks/advanced/finn-folding-mvau.png differ
diff --git a/notebooks/advanced/finn-folding.png b/notebooks/advanced/finn-folding.png
new file mode 100755
index 0000000000000000000000000000000000000000..019b4aa1e7d2f447949d9450609b2e5e9cbd04c0
Binary files /dev/null and b/notebooks/advanced/finn-folding.png differ
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 2695113661ed286c94ae9cb5f20ca99cc1fced7f..daecb59743d1b843e9d7fd40fdbf5bf10fac2fe1 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -31,7 +31,6 @@ import numpy as np
 import qonnx.analysis.topology as ta
 from qonnx.core.onnx_exec import execute_onnx as execute_onnx_base
 
-from finn.core.remote_exec import remote_exec
 from finn.core.rtlsim_exec import rtlsim_exec
 
 
@@ -51,7 +50,6 @@ def execute_onnx(
 
     # check if model has an execution mode set
     # if None, execute model node using the QONNX-provided execute_onnx impl
-    # if set to "remote_pynq" execute model on PYNQ board
     # if set to "rtlsim" execute model using pyverilator
     model_exec_mode = model.get_metadata_prop("exec_mode")
     if (model_exec_mode is None) or (model_exec_mode == ""):
@@ -91,22 +89,17 @@ def execute_onnx(
 
     # check if model has an execution mode set
     # if None, execute model node by node using execute_node()
-    # if set to "remote_pynq" execute model on PYNQ board
     # if set to "rtlsim" execute model using pyverilator
     model_exec_mode = model.get_metadata_prop("exec_mode")
     if (model_exec_mode is None) or (model_exec_mode == ""):
         return execute_onnx_base()
-    elif model_exec_mode == "remote_pynq":
-        # use remote exec metadata built into model to execute on a remote PYNQ
-        remote_exec(model, execution_context)
     elif model_exec_mode == "rtlsim":
         # use stitched IP for rtlsim
         rtlsim_exec(model, execution_context)
     else:
         raise Exception(
-            """Metadata property "exec_mode" is set to an unknown value.
-        Can be left unset or has to be set to "remote_pynq" for remote execution
-        on PYNQ board or "rtlsim" for execution using pyverilator!"""
+            """Metadata property "exec_mode" is set to an unknown value. Can be left
+            unset or has to be set to "rtlsim" for execution using pyverilator!"""
         )
 
     if return_full_exec_context:
diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
deleted file mode 100644
index f487b48f86f1ef0440ed4a8bf371083369dd096c..0000000000000000000000000000000000000000
--- a/src/finn/core/remote_exec.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright (c) 2020 Xilinx, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of Xilinx nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import numpy as np
-import os
-import subprocess
-import warnings
-
-
-def remote_exec(model, execution_context):
-    """Executes the given model remotely on the pynq board. The metadata properties
-    related to the pynq board have to be set. The execution context contains the
-    input values."""
-    # TODO fix for multi input-output
-    pynq_ip = model.get_metadata_prop("pynq_ip")
-    pynq_port = int(model.get_metadata_prop("pynq_port"))
-    pynq_username = model.get_metadata_prop("pynq_username")
-    pynq_password = model.get_metadata_prop("pynq_password")
-    pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
-    deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
-    platform = model.get_metadata_prop("platform")
-    assert platform in ["alveo", "zynq-iodma"]
-    bitfile = model.get_metadata_prop("bitfile")
-    bitfile = os.path.basename(bitfile)
-    if pynq_password == "":
-        if "zynq" in platform:
-            raise Exception("PYNQ board remote exec needs password for sudo")
-        else:
-            local_prefix = ""  # assume we are using an ssh key
-            warnings.warn("Empty password, make sure you've set up an ssh key")
-    else:
-        local_prefix = "sshpass -p %s " % pynq_password
-
-    if platform == "alveo":
-        # Alveo can run without sudo
-        remote_prefix = ""
-    elif "zynq" in platform:
-        # PYNQ Zynq boards need to execute with sudo
-        remote_prefix = "echo %s | sudo -S " % pynq_password
-
-    inp = execution_context[model.graph.input[0].name]
-    # make copy of array before saving it
-    inp = inp.copy()
-    batchsize = inp.shape[0]
-    np.save(os.path.join(deployment_dir, "input.npy"), inp)
-    # extracting last folder of absolute path (deployment_dir)
-    deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
-    # copy input to PYNQ board
-    cmd = local_prefix + "scp -P{} -r {}/input.npy {}@{}:{}/{}".format(
-        pynq_port,
-        deployment_dir,
-        pynq_username,
-        pynq_ip,
-        pynq_target_dir,
-        deployment_folder,
-    )
-    bash_command = ["/bin/bash", "-c", cmd]
-    process_scp_in = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-    process_scp_in.communicate()
-
-    # use platform attribute for correct remote execution
-    if platform == "alveo":
-        remote_cmd = "bash -ic 'bash alveo_run.sh execute %d' \"" % batchsize
-    else:
-        remote_cmd = (
-            "python3.6 driver.py --exec_mode=execute --batchsize={} "
-            "--bitfile={} --inputfile=input.npy --outputfile=output.npy "
-            '--platform={} "'
-        ).format(batchsize, bitfile, platform)
-    cmd = (
-        local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd
-    ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder)
-    bash_command = ["/bin/bash", "-c", cmd]
-    process_exec_accel = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-    process_exec_accel.communicate()
-    # remove stale output file from local dir, if any
-    try:
-        os.remove("{}/output.npy".format(deployment_dir))
-    except FileNotFoundError:
-        pass
-    # copy generated output to local
-    cmd = local_prefix + "scp -P{} {}@{}:{}/{}/output.npy {}".format(
-        pynq_port,
-        pynq_username,
-        pynq_ip,
-        pynq_target_dir,
-        deployment_folder,
-        deployment_dir,
-    )
-    bash_command = ["/bin/bash", "-c", cmd]
-    process_scp_out = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-    process_scp_out.communicate()
-    outp = np.load("{}/output.npy".format(deployment_dir))
-    execution_context[model.graph.output[0].name] = outp
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
index 3533fd13399a4ba4392d66af785979afc32cab29..08633be33b8ab6d096275aca2c362a8bac43d704 100644
--- a/src/finn/core/throughput_test.py
+++ b/src/finn/core/throughput_test.py
@@ -28,90 +28,11 @@
 
 import numpy as np
 import os
-import subprocess
-import warnings
 from qonnx.util.basic import gen_finn_dt_tensor
 
 from finn.core.rtlsim_exec import rtlsim_exec
 
 
-def throughput_test_remote(model, batchsize=1000, timeout=None):
-    """Runs the throughput test for the given model remotely on the pynq board.
-    The metadata properties related to the pynq board have to be set.
-    Additionally a timeout for the SSH communication can be set.
-    Returns a dictionary with results of the throughput test. Returns None
-    if the test fails."""
-
-    pynq_ip = model.get_metadata_prop("pynq_ip")
-    pynq_port = int(model.get_metadata_prop("pynq_port"))
-    pynq_username = model.get_metadata_prop("pynq_username")
-    pynq_password = model.get_metadata_prop("pynq_password")
-    pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
-    deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
-    # extracting last folder of absolute path (deployment_dir)
-    deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
-    platform = model.get_metadata_prop("platform")
-    assert platform in ["alveo", "zynq-iodma"]
-    bitfile = model.get_metadata_prop("bitfile")
-    bitfile = os.path.basename(bitfile)
-    if pynq_password == "":
-        if "zynq" in platform:
-            raise Exception("PYNQ board remote exec needs password for sudo")
-        else:
-            local_prefix = ""  # assume we are using an ssh key
-            warnings.warn("Empty password, make sure you've set up an ssh key")
-    else:
-        local_prefix = "sshpass -p %s " % pynq_password
-
-    if platform == "alveo":
-        # Alveo can run without sudo but needs correct environment
-        remote_prefix = "conda activate finn-pynq-alveo; "
-    elif "zynq" in platform:
-        # PYNQ Zynq boards need to execute with sudo
-        remote_prefix = "echo %s | sudo -S " % pynq_password
-
-    # use platform attribute for correct remote execution
-    if platform == "alveo":
-        remote_cmd = "bash -ic 'bash alveo_run.sh throughput_test %d' \"" % batchsize
-    else:
-        remote_cmd = (
-            "python3.6 driver.py --exec_mode=throughput_test --batchsize={} "
-            "--bitfile={} --inputfile=input.npy --outputfile=output.npy "
-            '--platform={} "'
-        ).format(batchsize, bitfile, platform)
-    cmd = (
-        local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd
-    ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder)
-    bash_command = ["/bin/bash", "-c", cmd]
-    process_throughput_test = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-    process_throughput_test.communicate(timeout=timeout)
-
-    # remove any pre-existing metrics file
-    try:
-        os.remove("{}/nw_metrics.txt".format(deployment_dir))
-    except FileNotFoundError:
-        pass
-
-    cmd = local_prefix + "scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format(
-        pynq_port,
-        pynq_username,
-        pynq_ip,
-        pynq_target_dir,
-        deployment_folder,
-        deployment_dir,
-    )
-    bash_command = ["/bin/bash", "-c", cmd]
-    process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-    process_compile.communicate(timeout=timeout)
-
-    try:
-        with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
-            res = eval(file.read())
-        return res
-    except FileNotFoundError:
-        return None
-
-
 def throughput_test_rtlsim(model, batchsize=100):
     """Runs a throughput test for the given IP-stitched model. When combined
     with tracing, useful to determine bottlenecks and required FIFO sizes."""
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
index 173a1578417b76dfb8ae24c94f3d40616dbe0d55..c54c4ac1c90ed72331c61b0a28cf6040b8d66881 100755
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -617,13 +617,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
             # skip innermost SIMD loop completely
             if loop_kw_iterations == 1:
                 # skip innermost KW loop completely
-                code_gen_dict["$INNERMOST_STATE$"] = [str(3)]  # STATE_LOOP_KH
+                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"]
                 loop_kh_iterations -= 1  # -1 because state is initial state
             else:
-                code_gen_dict["$INNERMOST_STATE$"] = [str(2)]  # STATE_LOOP_KW
+                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"]
                 loop_kw_iterations -= 1  # -1 because state is initial state
         else:
-            code_gen_dict["$INNERMOST_STATE$"] = [str(1)]  # STATE_LOOP_SIMD
+            code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"]
             loop_simd_iterations -= 1  # -1 because state is initial state
 
         cntr_bitwidth = math.ceil(
@@ -736,10 +736,10 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         loop_simd_iterations = 1
 
         if loop_w_iterations == 1:
-            code_gen_dict["$INNERMOST_STATE$"] = [str(5)]  # STATE_LOOP_H
+            code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_H"]
             loop_h_iterations -= 1  # -1 because state is initial state
         else:
-            code_gen_dict["$INNERMOST_STATE$"] = [str(4)]  # STATE_LOOP_W
+            code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_W"]
             loop_w_iterations -= 1  # -1 because state is initial state
 
         # set head and tail address increment values
@@ -1064,6 +1064,9 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         shutil.copy2(
             os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_common.sv", code_gen_dir
         )
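+        # swg_pkg.sv holds shared declarations (e.g. the FSM state enum
+        # emitted above), so copy it along with swg_common.sv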
+        shutil.copy2(
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_pkg.sv", code_gen_dir
+        )
 
         # set ipgen_path and ip_path so that HLS-Synth transformation
         # and stich_ip transformation do not complain
@@ -1082,6 +1085,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         verilog_paths = [code_gen_dir]
         verilog_files = [
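+            # the package must come first so it is compiled before the
+            # modules that import it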
+            "swg_pkg.sv",
             self.get_nodeattr("gen_top_module") + "_wrapper.v",
             self.get_nodeattr("gen_top_module") + "_impl.sv",
             "swg_common.sv",
@@ -1106,6 +1110,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
 
         sourcefiles = [
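+            # as above, the package must precede the modules that import it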
+            "swg_pkg.sv",
             self.get_nodeattr("gen_top_module") + "_wrapper.v",
             self.get_nodeattr("gen_top_module") + "_impl.sv",
             "swg_common.sv",
diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py
deleted file mode 100644
index d4684dc83ce1f22ecae2ca04af5e5973519db4f6..0000000000000000000000000000000000000000
--- a/src/finn/transformation/fpgadataflow/make_deployment.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import os
-import subprocess
-from distutils.dir_util import copy_tree
-from qonnx.transformation.base import Transformation
-from shutil import copy
-
-import finn.transformation.fpgadataflow.templates as templates
-from finn.util.basic import make_build_dir
-
-
-class DeployToPYNQ(Transformation):
-    """Collects all necessary files for deployment and copies them to the PYNQ board.
-    Expects information about PYNQ board to make scp possible:
-
-    IP address of board, username and password for board and target directory where
-    the files are stored on the board"""
-
-    def __init__(self, ip, port, username, password, target_dir):
-        super().__init__()
-        self.ip = ip
-        self.port = port
-        self.username = username
-        self.password = password
-        self.target_dir = target_dir
-
-    def apply(self, model):
-        # set metadata properties accordingly to user input specifications
-        model.set_metadata_prop("pynq_ip", self.ip)
-        model.set_metadata_prop("pynq_port", str(self.port))
-        model.set_metadata_prop("pynq_username", self.username)
-        model.set_metadata_prop("pynq_password", self.password)
-        model.set_metadata_prop("pynq_target_dir", self.target_dir)
-
-        # create directory for deployment files
-        deployment_dir = make_build_dir(prefix="pynq_deployment_")
-        model.set_metadata_prop("pynq_deployment_dir", deployment_dir)
-
-        # get and copy necessary files
-        # .bit and .hwh file
-        bitfile = model.get_metadata_prop("bitfile")
-        hwh_file = model.get_metadata_prop("hw_handoff")
-        deploy_files = [bitfile, hwh_file]
-
-        for dfile in deploy_files:
-            if dfile is not None:
-                copy(dfile, deployment_dir)
-
-        # helper script for Alveo
-        platform = model.get_metadata_prop("platform")
-        if platform == "alveo":
-            alveo_run_sh = templates.alveo_run_sh_template
-            fill_dict = {
-                "$REMOTE_DEPLOY_DIR$": self.target_dir
-                + "/"
-                + os.path.basename(deployment_dir),
-                "$CONDA_ENV_NAME$": "finn-pynq-alveo",
-                "$REMOTE_XRT$": os.environ["XILINX_XRT"],
-                "$REMOTE_PLATFORM_REPO_PATHS$": os.environ["PLATFORM_REPO_PATHS"],
-                "$BITFILE$": os.path.basename(bitfile),
-            }
-            for key, value in fill_dict.items():
-                alveo_run_sh = alveo_run_sh.replace(key, value)
-            alveo_run_sh_path = deployment_dir + "/alveo_run.sh"
-            with open(alveo_run_sh_path, "w") as f:
-                f.write(alveo_run_sh)
-
-        # driver.py and python libraries
-        pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir")
-        copy_tree(pynq_driver_dir, deployment_dir)
-        model.set_metadata_prop("pynq_deploy_dir", deployment_dir)
-        model.set_metadata_prop("exec_mode", "remote_pynq")
-
-        # create target directory on PYNQ board
-        cmd = 'ssh {}@{} -p {} "mkdir -p {}"'.format(
-            self.username, self.ip, self.port, self.target_dir
-        )
-        bash_command = ["/bin/bash", "-c", cmd]
-        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-        process_compile.communicate()
-        # copy directory to PYNQ board using scp
-        cmd = "scp -P{} -r {} {}@{}:{}".format(
-            self.port, deployment_dir, self.username, self.ip, self.target_dir
-        )
-        bash_command = ["/bin/bash", "-c", cmd]
-        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-        process_compile.communicate()
-
-        return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py
index 05ee6ad920d7e921dc9611a7936e28288ba53a0a..158825191e3372fc133e01b47ea7138b0aba899f 100644
--- a/src/finn/transformation/fpgadataflow/template_driver.py
+++ b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -135,5 +135,5 @@ if __name__ == "__main__":
         file.close()
         print("Results written to nw_metrics.txt")
     else:
-        raise Exception("Exec mode has to be set to remote_pynq or throughput_test")
+        raise Exception("Exec mode has to be set to execute or throughput_test")
 """
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 5ffb5e4f46204954288f5e1d922cbb07392b6613..c82dd3779196126d6b8a1566dcbc9b43f2739f5b 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -246,22 +246,6 @@ report_utilization -hierarchical -hierarchical_depth 4 -file synth_report.xml -f
 close_project
 """
 
-alveo_run_sh_template = """#!/bin/bash
-
-if [ "$#" -ne 2 ]; then
-    echo "Usage: alveo_run.sh <exec_mode={execute, throughput_test}> <batch_size>"
-    exit -1
-fi
-
-cd $REMOTE_DEPLOY_DIR$
-eval "$(conda shell.bash hook)"
-conda activate $CONDA_ENV_NAME$
-source $REMOTE_XRT$/setup.sh
-export PLATFORM_REPO_PATHS=$REMOTE_PLATFORM_REPO_PATHS$
-python3.6 driver.py --exec_mode=$1 --batchsize=$2 --bitfile=$BITFILE$ \
-    --inputfile=input.npy --outputfile=output.npy --platform=alveo
-"""
-
 vitis_gen_xml_report_tcl_template = """
 open_project $VITIS_PROJ_PATH$/_x/link/vivado/vpl/prj/prj.xpr
 open_run impl_1
diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
index 9819086d826a51d1df5240d88c4fda8513cc9ba6..bbe5e1a0e319a8f62e9a1bcd4f0857f36295049e 100644
--- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py
+++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
@@ -286,6 +286,7 @@ class QuantReluHandler(QuantActBaseHandler):
     def valid_predecessor_op_types(self):
         return [
             "Relu",
+            "Selu",
         ]
 
     def _check_compatibility(self):
@@ -293,16 +294,19 @@ class QuantReluHandler(QuantActBaseHandler):
             q_inst = getCustomOp(self._q_node)
             narrow = q_inst.get_nodeattr("narrow")
             signed = q_inst.get_nodeattr("signed")
-            if signed or narrow:
-                raise ValueError(
-                    "FINN only supports unsigned and non-narrow Quant nodes "
-                    "for Relu activations."
-                )
             if not self._model.get_initializer(self._q_node.input[2]) == 0:
                 raise ValueError(
                     "Only Quant nodes with zero-point == 0 "
                     "are currently supported for ReLu activations."
                 )
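+            # the signed/narrow restriction below only applies to Relu,
+            # so first look up which activation feeds this Quant node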
+            act_node = self._model.find_direct_predecessors(self._q_node)
+            act_node = act_node[0]
+            if act_node.op_type == "Relu":
+                if signed or narrow:
+                    raise ValueError(
+                        "FINN only supports unsigned and non-narrow Quant nodes "
+                        "for Relu activations."
+                    )
         elif self._q_node.op_type == "BipolarQuant":
             return
         else:
@@ -312,7 +316,31 @@ class QuantReluHandler(QuantActBaseHandler):
         # No bias allowed for Relu activations, see: https://github.com/Xilinx/
         # brevitas/blob/a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/
         # export/onnx/finn/handler/act.py#L48
-        bias = np.array([0.0], dtype=np_default_dtype)
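+        # the bias depends on which activation precedes the Quant node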
+        act_node = self._model.find_direct_predecessors(self._q_node)
+        act_node = act_node[0]
+        if act_node.op_type == "Relu":
+            bias = np.array([0.0], dtype=np_default_dtype)
+        elif act_node.op_type == "Selu":
+            # Gather parameters
+            q_inst = getCustomOp(self._q_node)
+            if self._q_node.op_type == "Quant":
+                bit_width = self._model.get_initializer(self._q_node.input[3])
+                narrow = q_inst.get_nodeattr("narrow")
+            elif self._q_node.op_type == "BipolarQuant":
+                bit_width = 1.0
+            else:
+                raise RuntimeError("Got an unexpected quantizer node type")
+            # Calculate bias, see: https://github.com/Xilinx/brevitas/blob/
+            # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
+            # onnx/finn/handler/act.py#L64
+            if bit_width == 1.0:
+                bias = np.array([-0.5], dtype=np_default_dtype)
+            else:
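+                # bias is the minimum integer level of the quantizer,
+                # e.g. bit_width=4 gives -7 (narrow) or -8 (non-narrow)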
+                if narrow:
+                    min_non_scaled_val = -(2 ** (bit_width - 1) - 1)
+                else:
+                    min_non_scaled_val = -(2 ** (bit_width - 1))
+                bias = np.array([min_non_scaled_val], dtype=np_default_dtype)
         return bias
 
     def _calculate_thresholds(self):
@@ -326,24 +354,53 @@ class QuantReluHandler(QuantActBaseHandler):
         quant_scale = self._model.get_initializer(self._q_node.input[1]).astype(
             np.float32
         )
-        # q_inst = getCustomOp(self._q_node)
-        # narrow = q_inst.get_nodeattr("narrow")
+        act_node = self._model.find_direct_predecessors(self._q_node)
+        act_node = act_node[0]
+        if act_node.op_type == "Relu":
 
-        # Calculate thersholds, see: https://github.com/Xilinx/brevitas/blob/
-        # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
-        # onnx/finn/handler/act.py#L21
-        num_distinct_values = 2**bit_width
-        num_thresholds = int(num_distinct_values - 1)
-        flat_scale = quant_scale.flatten().astype(np.float32)
-        num_scale_channels = flat_scale.shape[0]
-        step = np.abs(flat_scale).astype(np.float32)
-        min_threshold = step / 2
-        thresholds = np.empty(
-            (num_scale_channels, num_thresholds), dtype=np_default_dtype
-        )
-        for c in range(num_scale_channels):
-            for t in range(num_thresholds):
-                thresholds[c][t] = min_threshold[c] + step[c] * t
+            # Calculate thresholds, see: https://github.com/Xilinx/brevitas/blob/
+            # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
+            # onnx/finn/handler/act.py#L21
+            num_distinct_values = 2**bit_width
+            num_thresholds = int(num_distinct_values - 1)
+            flat_scale = quant_scale.flatten().astype(np.float32)
+            num_scale_channels = flat_scale.shape[0]
+            step = np.abs(flat_scale).astype(np.float32)
+            min_threshold = step / 2
+            thresholds = np.empty(
+                (num_scale_channels, num_thresholds), dtype=np_default_dtype
+            )
+            for c in range(num_scale_channels):
+                for t in range(num_thresholds):
+                    thresholds[c][t] = min_threshold[c] + step[c] * t
+
+        elif act_node.op_type == "Selu":
+            q_inst = getCustomOp(self._q_node)
+            narrow = q_inst.get_nodeattr("narrow")
+            if narrow:
+                num_distinct_values = 2**bit_width - 1
+            else:
+                num_distinct_values = 2**bit_width
+
+            num_thresholds = int(num_distinct_values - 1)
+            flat_scale = quant_scale.flatten().astype(np.float32)
+            num_scale_channels = flat_scale.shape[0]
+            scale = np.abs(flat_scale).astype(np.float32)
+            half_scale = scale / 2
+            # alpha and lambda
+            # from https://pytorch.org/docs/stable/generated/torch.nn.SELU.html
+            alpha = 1.6732632423543772848170429916717
+            selu_scale = 1.0507009873554804934193349852946
+            thresholds = np.empty(
+                (num_scale_channels, num_thresholds), dtype=np_default_dtype
+            )
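+            # each threshold is the input at which SELU reaches the midpoint
+            # of a quantization step: for step <= 0 the loop below solves
+            # selu_scale * alpha * (exp(x) - 1) = step, otherwise
+            # selu_scale * x = step; the -1.0 term takes the lowest quantized
+            # output level to be -1.0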
+            for c in range(num_scale_channels):
+                for t in range(num_thresholds):
+                    step = -1.0 + half_scale[c] + scale[c] * t
+                    if step <= 0:
+                        thresholds[c][t] = np.log(step / (alpha * selu_scale) + 1)
+                    else:
+                        thresholds[c][t] = step / selu_scale
 
         # ToDo: The index 1 needs to be changed to -1 for the channels last format
         num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1]
@@ -371,10 +428,10 @@ class QuantReluHandler(QuantActBaseHandler):
                 "the Quant node must exist."
             )
         act_node = act_node[0]
-        if not act_node.op_type == "Relu":
+        if act_node.op_type not in self.valid_predecessor_op_types():
             raise RuntimeError(
-                "The predecesor of the Quant node must be Relu for handling "
-                "of Relu activations."
+                "The predecesor of the Quant node must be Relu or Selu for handling "
+                "of activations."
             )
 
         # Reroute upstream tensor
diff --git a/src/finn/util/gdrive.py b/src/finn/util/gdrive.py
deleted file mode 100644
index d525437300b6aee081bb073d40a517b5e3aa14be..0000000000000000000000000000000000000000
--- a/src/finn/util/gdrive.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import gspread
-import os
-import warnings
-from datetime import datetime
-
-from finn.util.basic import get_finn_root
-
-
-def upload_to_end2end_dashboard(data_dict):
-    gdrive_key = get_finn_root() + "/gdrive-key/service_account.json"
-    if not os.path.isfile(gdrive_key):
-        warnings.warn("Google Drive key not found, skipping dashboard upload")
-        return
-    gc = gspread.service_account(filename=gdrive_key)
-    spreadsheet = gc.open("finn-end2end-dashboard")
-    worksheet = spreadsheet.get_worksheet(0)
-    keys = list(data_dict.keys())
-    vals = list(data_dict.values())
-    # check against existing header
-    existing_keys = worksheet.row_values(1)
-    if not set(existing_keys).issuperset(set(keys)):
-        # create new worksheet
-        dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        worksheet = spreadsheet.add_worksheet(
-            title="Dashboard " + dtstr, rows=10, cols=len(keys), index=0
-        )
-        # create header row with keys
-        worksheet.update("A1:1", [keys])
-        # freeze and make header bold
-        worksheet.freeze(rows=1)
-        worksheet.format("A1:1", {"textFormat": {"bold": True}})
-    # insert values into new row at appropriate positions
-    worksheet.insert_row([], index=2)
-    for i in range(len(keys)):
-        colind = existing_keys.index(keys[i])
-        col_letter = chr(ord("A") + colind)
-        worksheet.update("%s2" % col_letter, vals[i])
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
index 8d188585694c172d97d73fa6b5820edb7b48a948..74523945242c1de68e56659087c87c349ebae4bc 100644
--- a/src/finn/util/pyverilator.py
+++ b/src/finn/util/pyverilator.py
@@ -118,6 +118,8 @@ def prepare_stitched_ip_for_verilator(model):
             if not remove_entry:
                 filtered_verilog_files.append(vfile)
             remove_entry = True
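+        # swg_pkg is passed to PyVerilator explicitly in
+        # pyverilate_stitched_ip, so skip copies found here to avoid
+        # compiling the package twice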
+        elif "swg_pkg" in vfile:
+            continue
         else:
             filtered_verilog_files.append(vfile)
 
@@ -315,8 +317,10 @@ def pyverilate_stitched_ip(
     xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv"
     xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv"
 
+    swg_pkg = os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_pkg.sv"
+
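+    # the package is listed first so it is compiled before the modules
+    # that reference it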
     sim = PyVerilator.build(
-        [top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc],
+        [swg_pkg, top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc],
         verilog_path=[vivado_stitch_proj_dir, verilog_header_dir],
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index bd8bde2820fa87ed972d699cae905d7f6cc310ff..4250079ef3e994f62a3e9f9150eb5b66371b5895 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -114,25 +114,14 @@ def get_build_env(kind, target_clk_ns):
     if kind == "zynq":
         ret["board"] = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
         ret["part"] = pynq_part_map[ret["board"]]
-        ret["ip"] = os.getenv("PYNQ_IP", "")
-        ret["username"] = os.getenv("PYNQ_USERNAME", "xilinx")
-        ret["password"] = os.getenv("PYNQ_PASSWORD", "xilinx")
-        ret["port"] = os.getenv("PYNQ_PORT", 22)
-        ret["target_dir"] = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
         ret["build_fxn"] = ZynqBuild(ret["board"], target_clk_ns)
     elif kind == "alveo":
         ret["board"] = os.getenv("ALVEO_BOARD", default="U250")
         ret["part"] = alveo_part_map[ret["board"]]
-        ret["platform"] = alveo_default_platform[ret["board"]]
-        ret["ip"] = os.getenv("ALVEO_IP", "")
-        ret["username"] = os.getenv("ALVEO_USERNAME", "")
-        ret["password"] = os.getenv("ALVEO_PASSWORD", "")
-        ret["port"] = os.getenv("ALVEO_PORT", 22)
-        ret["target_dir"] = os.getenv("ALVEO_TARGET_DIR", "/tmp/finn_alveo_deploy")
         ret["build_fxn"] = VitisBuild(
             ret["part"],
             target_clk_ns,
-            ret["platform"],
+            alveo_default_platform[ret["board"]],
             strategy=VitisOptStrategy.BUILD_SPEED,
         )
     else:
diff --git a/tests/brevitas/test_brevitas_selu_act_export.py b/tests/brevitas/test_brevitas_selu_act_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f4807c5d7286c265a856d73e9aaa886f342555e
--- /dev/null
+++ b/tests/brevitas/test_brevitas_selu_act_export.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import onnx  # noqa
+import os
+import torch
+from brevitas.export import export_qonnx
+from brevitas.nn import QuantIdentity
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.util.basic import get_preferred_onnx_opset
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
+
+
+@pytest.mark.brevitas_export
+@pytest.mark.parametrize("abits", [2, 4, 8])
+@pytest.mark.parametrize("ishape", [(1, 15), (1, 32, 1, 1)])
+@pytest.mark.parametrize("narrow", [True, False])
+def test_brevitas_act_export_selu(abits, ishape, narrow):
+    # include narrow in the filename so parametrized runs do not collide
+    export_path = "test_brevitas_selu_act_export_%s_%s.onnx" % (abits, narrow)
+    b_act = torch.nn.Sequential(
+        torch.nn.SELU(), QuantIdentity(bit_width=abits, narrow=narrow)
+    )
+
+    export_qonnx(
+        b_act,
+        torch.randn(ishape),
+        export_path,
+        opset_version=get_preferred_onnx_opset(),
+    )
+    qonnx_cleanup(export_path, out_file=export_path)
+    model = ModelWrapper(export_path)
+    model = model.transform(ConvertQONNXtoFINN())
+
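+    # execute the converted model in FINN and compare against the
+    # Brevitas reference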
+    inp_tensor = np.random.uniform(low=-1.0, high=6.0, size=ishape).astype(np.float32)
+    idict = {model.graph.input[0].name: inp_tensor}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+    inp_tensor = torch.from_numpy(inp_tensor).float()
+    b_act.eval()
+    expected = b_act.forward(inp_tensor).detach().numpy()
+
+    assert np.isclose(produced, expected, atol=1e-3).all()
+    os.remove(export_path)
diff --git a/tests/end2end/test_end2end_access_board.py b/tests/end2end/test_end2end_access_board.py
deleted file mode 100644
index ba3c49195b298059149303c63ef2db8ab6e16039..0000000000000000000000000000000000000000
--- a/tests/end2end/test_end2end_access_board.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2021, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import pytest
-
-import subprocess
-
-from finn.util.test import get_build_env
-
-
-@pytest.mark.board
-@pytest.mark.end2end
-def test_end2end_access_board():
-    build_env = get_build_env("zynq", 5)
-    if build_env["ip"] == "":
-        pytest.skip("PYNQ board IP address not specified")
-    remote_cmd_base = [
-        "ssh",
-        "-o",
-        "PreferredAuthentications=publickey",
-        "-o",
-        "PasswordAuthentication=no",
-        "%s@%s" % (build_env["username"], build_env["ip"]),
-    ]
-    test_text = "BoardIsAccessible"
-    touch_cmd = remote_cmd_base + ["echo %s" % test_text]
-    verif_res = subprocess.run(
-        touch_cmd, stdout=subprocess.PIPE, universal_newlines=True
-    )
-    assert verif_res.returncode == 0
-    assert verif_res.stdout.split("\n")[0] == test_text
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 4c68a018dbf0275b1eab409e82b33c46f5952cda..5edd77d95dce5100b9105ddf675a3fc6ad8808ca 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -34,13 +34,10 @@ import numpy as np
 # import pytorch before onnx, so we make sure to import onnx first
 import onnx  # NOQA
 import os
-import subprocess
 import torch
 import warnings
 from brevitas.export import export_finn_onnx, export_qonnx
-from collections import OrderedDict
 from dataset_loading import cifar, mnist
-from datetime import datetime
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
@@ -59,13 +56,12 @@ from qonnx.transformation.insert_topk import InsertTopK
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from qonnx.transformation.merge_onnx_models import MergeONNXModels
 from qonnx.util.cleanup import cleanup as qonnx_cleanup
-from scipy.stats import linregress
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.core.onnx_exec import execute_onnx
-from finn.core.throughput_test import throughput_test_remote, throughput_test_rtlsim
+from finn.core.throughput_test import throughput_test_rtlsim
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -75,7 +71,6 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
-from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.minimize_accumulator_width import (
     MinimizeAccumulatorWidth,
@@ -95,7 +90,6 @@ from finn.transformation.streamline.reorder import (
     MoveScalarLinearPastInvariants,
 )
 from finn.util.basic import get_finn_root
-from finn.util.gdrive import upload_to_end2end_dashboard
 from finn.util.pytorch import ToTensor
 from finn.util.test import (
     execute_parent,
@@ -122,24 +116,6 @@ def get_checkpoint_name(topology, wbits, abits, QONNX_export, step):
     )
 
 
-def get_dashboard_data(topology, wbits, abits):
-    stats_file = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits)
-    stats_dict = OrderedDict()
-    if os.path.isfile(stats_file):
-        with open(stats_file, "r") as f:
-            stats_dict_txt = f.read()
-        stats_dict = eval(stats_dict_txt)
-    return stats_dict
-
-
-def update_dashboard_data(topology, wbits, abits, key, val):
-    stats_dict = get_dashboard_data(topology, wbits, abits)
-    stats_dict[key] = val
-    stats_file = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits)
-    with open(stats_file, "w") as f:
-        f.write(str(stats_dict))
-
-
 def fold_tfc(model):
     fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
     # (PE, SIMD, ramstyle) for each layer
@@ -334,16 +310,7 @@ class TestEnd2End:
             model = model.transform(ConvertQONNXtoFINN())
             model.save(chkpt_name)
         else:
-            export_finn_onnx(model, torch.randn(ishape), chkpt_name, opset_version=13)
-        nname = "%s_w%da%d" % (topology, wbits, abits)
-        update_dashboard_data(topology, wbits, abits, "network", nname)
-        dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        update_dashboard_data(topology, wbits, abits, "datetime", dtstr)
-        finn_commit = subprocess.check_output(
-            ["git", "rev-parse", "HEAD"], cwd=get_finn_root()
-        )
-        finn_commit = finn_commit.decode("utf-8").strip()
-        update_dashboard_data(topology, wbits, abits, "finn-commit", finn_commit)
+            export_finn_onnx(model, torch.randn(ishape), chkpt_name)
         assert os.path.isfile(chkpt_name)
 
     def test_import_and_tidy(self, topology, wbits, abits, QONNX_export):
@@ -644,10 +611,6 @@ class TestEnd2End:
         ret = throughput_test_rtlsim(model, batchsize=batchsize)
         res_cycles = ret["cycles"]
         est_cycles = latency + cycles_per_sample_est * batchsize
-        # warnings.warn("Estimated & rtlsim performance: " + str(perf))
-        # for (k, v) in perf.items():
-        #    update_dashboard_data(topology, wbits, abits, k, v)
-        update_dashboard_data(topology, wbits, abits, "cycles_rtlsim", latency)
         assert (abs(res_cycles - est_cycles) / res_cycles) < 0.15
 
     @pytest.mark.slow
@@ -691,10 +654,6 @@ class TestEnd2End:
         cfg = get_build_env(kind, target_clk_ns)
         model = model.transform(cfg["build_fxn"])
         model = model.transform(AnnotateResources("synth"))
-        synth_dct = eval(model.get_metadata_prop("res_total_top_synth"))
-        for (k, v) in synth_dct.items():
-            update_dashboard_data(topology, wbits, abits, k, v)
-        update_dashboard_data(topology, wbits, abits, "board", cfg["board"])
         model.save(
             get_checkpoint_name(topology, wbits, abits, QONNX_export, "build_" + kind)
         )
@@ -715,121 +674,3 @@ class TestEnd2End:
         model.save(
             get_checkpoint_name(topology, wbits, abits, QONNX_export, "driver_" + kind)
         )
-
-    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_deploy(self, topology, wbits, abits, QONNX_export, kind):
-        prev_chkpt_name = get_checkpoint_name(
-            topology, wbits, abits, QONNX_export, "driver_" + kind
-        )
-        model = load_test_checkpoint_or_skip(prev_chkpt_name)
-        cfg = get_build_env(kind, target_clk_ns)
-        if cfg["ip"] == "":
-            pytest.skip("PYNQ board IP address not specified")
-        model = model.transform(
-            DeployToPYNQ(
-                cfg["ip"],
-                cfg["port"],
-                cfg["username"],
-                cfg["password"],
-                cfg["target_dir"],
-            )
-        )
-        # save the model to be able to link it to the parent
-        model.save(
-            get_checkpoint_name(topology, wbits, abits, QONNX_export, "deploy_" + kind)
-        )
-
-    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_run_on_hw(self, topology, wbits, abits, QONNX_export, kind):
-        prev_chkpt_name = get_checkpoint_name(
-            topology, wbits, abits, QONNX_export, "deploy_" + kind
-        )
-        model = load_test_checkpoint_or_skip(prev_chkpt_name)  # NOQA
-        cfg = get_build_env(kind, target_clk_ns)
-        if cfg["ip"] == "":
-            pytest.skip("PYNQ board IP address not specified")
-        (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
-            topology, wbits, abits, return_topk=1
-        )
-        parent_model = load_test_checkpoint_or_skip(
-            get_checkpoint_name(topology, wbits, abits, QONNX_export, "dataflow_parent")
-        )
-        iname = parent_model.graph.input[0].name
-        oname = parent_model.graph.output[0].name
-        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
-        sdp_node = getCustomOp(sdp_node)
-        sdp_node.set_nodeattr("model", prev_chkpt_name)
-        ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
-        y = ret[oname]
-        assert np.isclose(y, output_tensor_npy).all()
-
-    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_throughput_hw(self, topology, wbits, abits, QONNX_export, kind):
-        prev_chkpt_name = get_checkpoint_name(
-            topology, wbits, abits, QONNX_export, "deploy_" + kind
-        )
-        end2end_example = "%s_w%da%d_%s" % (topology, wbits, abits, kind)
-        model = load_test_checkpoint_or_skip(prev_chkpt_name)  # NOQA
-        cfg = get_build_env(kind, target_clk_ns)
-        if cfg["ip"] == "":
-            pytest.skip("PYNQ board IP address not specified")
-        ret = dict()
-        # try a range of batch sizes, some may fail due to insufficient DMA
-        # buffers
-        bsize_range_in = [8**i for i in range(5)]
-        bsize_range = []
-        for bsize in bsize_range_in:
-            res = throughput_test_remote(model, bsize)
-            if res is not None:
-                ret[bsize] = res
-                bsize_range.append(bsize)
-            else:
-                # assume we reached largest possible N
-                break
-        y = [ret[key]["runtime[ms]"] for key in bsize_range]
-        lrret = linregress(bsize_range, y)
-        ret_str = ""
-        ret_str += "\n" + "%s Throughput Test Results" % end2end_example
-        ret_str += "\n" + "-----------------------------"
-        ret_str += "\n" + "From linear regression:"
-        ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept
-        ret_str += "\n" + "Time per sample: %f ms" % lrret.slope
-        ret_str += "\n" + "Raw data:"
-
-        ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
-            "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[MB/s]", "DRAM wr[MB/s]"
-        )
-        for k in bsize_range:
-            v = ret[k]
-            ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
-                k,
-                np.round(v["runtime[ms]"], 4),
-                v["fclk[mhz]"],
-                np.round(v["throughput[images/s]"], 2),
-                np.round(v["DRAM_in_bandwidth[MB/s]"], 2),
-                np.round(v["DRAM_out_bandwidth[MB/s]"], 2),
-            )
-        ret_str += "\n" + "-----------------------------"
-        warnings.warn(ret_str)
-        largest_bsize = bsize_range[-1]
-        update_dashboard_data(
-            topology, wbits, abits, "fclk[mhz]", ret[largest_bsize]["fclk[mhz]"]
-        )
-        update_dashboard_data(
-            topology,
-            wbits,
-            abits,
-            "throughput[images/s]",
-            ret[largest_bsize]["throughput[images/s]"],
-        )
-
-    def test_upload_results_to_dashboard(self, topology, wbits, abits, QONNX_export):
-        # ToDo: Extend the dashboard to also upload QONNX exported models?
-        if QONNX_export:
-            pytest.skip("Dashboard data upload is disabled for QONNX exported models.")
-        else:
-            dashboard_data = get_dashboard_data(topology, wbits, abits)
-            if len(dashboard_data.keys()) > 0:
-                upload_to_end2end_dashboard(dashboard_data)
-            else:
-                pytest.skip("No data to upload to dashboard")
diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py
index 1ab2d012283ee4cd34cad002df7a95bfcb3c5a8a..e6ca90b7b27a59f676d040728aef9f80ed3be4ea 100644
--- a/tests/end2end/test_end2end_cybsec_mlp.py
+++ b/tests/end2end/test_end2end_cybsec_mlp.py
@@ -34,10 +34,8 @@ import json
 import numpy as np
 import os
 import shutil
-import subprocess
 import torch
 import torch.nn as nn
-import wget
 from brevitas.core.quant import QuantType
 from brevitas.export import export_finn_onnx, export_qonnx
 from brevitas.nn import QuantIdentity, QuantLinear, QuantReLU
@@ -225,62 +223,3 @@ def test_end2end_cybsec_mlp_build(QONNX_export):
         assert est_res_dict["total"]["LUT"] == 7904.0
         assert est_res_dict["total"]["BRAM_18K"] == 36.0
     shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build", QONNX_export))
-
-
-@pytest.mark.end2end
-@pytest.mark.xfail
-@pytest.mark.parametrize("QONNX_export", [False, True])
-def test_end2end_cybsec_mlp_run_on_hw(QONNX_export):
-    build_env = get_build_env(build_kind, target_clk_ns)
-    assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/")
-    deploy_dir = get_checkpoint_name("build", QONNX_export)
-    if not os.path.isdir(deploy_dir):
-        pytest.skip(deploy_dir + " not found from previous test step, skipping")
-    driver_dir = deploy_dir + "/driver"
-    assert os.path.isdir(driver_dir)
-    # put all assets into driver dir
-    shutil.copy(assets_dir + "/validate-unsw-nb15.py", driver_dir)
-    # put a copy of binarized dataset into driver dir
-    dataset_url = (
-        "https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1"
-    )
-    dataset_local = driver_dir + "/unsw_nb15_binarized.npz"
-    if not os.path.isfile(dataset_local):
-        wget.download(dataset_url, out=dataset_local)
-    assert os.path.isfile(dataset_local)
-    # create a shell script for running validation: 10 batches x 10 imgs
-    with open(driver_dir + "/validate.sh", "w") as f:
-        f.write(
-            """#!/bin/bash
-cd %s/driver
-echo %s | sudo -S python3.6 validate-unsw-nb15.py --batchsize=10 --limit_batches=10
-        """
-            % (
-                build_env["target_dir"] + "/end2end_cybsecmlp_build",
-                build_env["password"],
-            )
-        )
-    # set up rsync command
-    remote_target = "%s@%s:%s" % (
-        build_env["username"],
-        build_env["ip"],
-        build_env["target_dir"],
-    )
-    rsync_res = subprocess.run(["rsync", "-avz", deploy_dir, remote_target])
-    assert rsync_res.returncode == 0
-    remote_verif_cmd = [
-        "ssh",
-        "%s@%s" % (build_env["username"], build_env["ip"]),
-        "sh",
-        build_env["target_dir"] + "/end2end_cybsecmlp_build/driver/validate.sh",
-    ]
-    verif_res = subprocess.run(
-        remote_verif_cmd,
-        stdout=subprocess.PIPE,
-        universal_newlines=True,
-        input=build_env["password"],
-    )
-    assert verif_res.returncode == 0
-    log_output = verif_res.stdout.split("\n")
-    assert log_output[-3] == "batch 10 / 10 : total OK 93 NOK 7"
-    assert log_output[-2] == "Final accuracy: 93.000000"
diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py
index 0a92c74a38d64ade37d576f3830f3a5628c94d88..bef2e0ffa77cb96ff45956e380aeb376def61228 100644
--- a/tests/end2end/test_ext_weights.py
+++ b/tests/end2end/test_ext_weights.py
@@ -110,69 +110,3 @@ def test_end2end_ext_weights_build():
     if os.path.isdir(get_checkpoint_name("build")):
         shutil.rmtree(get_checkpoint_name("build"))
     shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build"))
-
-
-@pytest.mark.board
-@pytest.mark.end2end
-@pytest.mark.xfail
-def test_end2end_ext_weights_dataset():
-    # make sure we have local copies of mnist dataset files
-    subprocess.check_output(["mkdir", "-p", mnist_local])
-    for f in mnist_files:
-        if not os.path.isfile(mnist_local + "/" + f):
-            wget.download(mnist_url + "/" + f, out=mnist_local + "/" + f)
-        assert os.path.isfile(mnist_local + "/" + f)
-    # rsync to board
-    build_env = get_build_env(build_kind, target_clk_ns)
-    mnist_target = "%s@%s:%s" % (build_env["username"], build_env["ip"], "/tmp/")
-
-    rsync_dataset_cmd = ["rsync", "-rv", mnist_local + "/", mnist_target]
-    subprocess.check_output(rsync_dataset_cmd)
-
-
-@pytest.mark.end2end
-@pytest.mark.xfail
-def test_end2end_ext_weights_run_on_hw():
-    build_env = get_build_env(build_kind, target_clk_ns)
-    deploy_dir = get_checkpoint_name("build")
-    if not os.path.isdir(deploy_dir):
-        pytest.skip(deploy_dir + " not found from previous test step, skipping")
-    driver_dir = deploy_dir + "/driver"
-    assert os.path.isdir(driver_dir)
-    # create a shell script for running validation: 10 batches x 10 imgs
-    with open(driver_dir + "/validate.sh", "w") as f:
-        f.write(
-            """#!/bin/bash
-cd %s/driver
-echo %s | sudo -S python3.6 validate.py --dataset mnist --bitfile %s
-        """
-            % (
-                build_env["target_dir"] + "/end2end_ext_weights_build",
-                build_env["password"],
-                "../bitfile/finn-accel.bit",
-            )
-        )
-    # set up rsync command
-    remote_target = "%s@%s:%s" % (
-        build_env["username"],
-        build_env["ip"],
-        build_env["target_dir"],
-    )
-    rsync_res = subprocess.run(["rsync", "-avz", deploy_dir, remote_target])
-    assert rsync_res.returncode == 0
-    remote_verif_cmd = [
-        "ssh",
-        "%s@%s" % (build_env["username"], build_env["ip"]),
-        "sh",
-        build_env["target_dir"] + "/end2end_ext_weights_build/driver/validate.sh",
-    ]
-    verif_res = subprocess.run(
-        remote_verif_cmd,
-        stdout=subprocess.PIPE,
-        universal_newlines=True,
-        input=build_env["password"],
-    )
-    assert verif_res.returncode == 0
-    log_output = verif_res.stdout.split("\n")
-    assert log_output[-3] == "batch 100 / 100 : total OK 9296 NOK 704"
-    assert log_output[-2] == "Final accuracy: 92.960000"
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py
index 7f7bf649a9284e7716aec5adfb91957fdabb55d5..e586984b31f741d2b4744acd1886e76a4179a59e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py
@@ -189,6 +189,10 @@ cfg0 = {
     "ofm": 64,
     "depthwise": True,
     "pad_mode": "SAME_UPPER",
+    # run synthesis for one configuration; this previously exposed a bug in
+    # the enum declarations (which config gets synthesized does not matter)
+    "do_synth": True,
 }
 cfg1 = {
     "idims": [(32, 16), (16, 8)],
@@ -198,6 +202,7 @@ cfg1 = {
     "ofm": 8,
     "depthwise": False,
     "pad_mode": "SAME_UPPER",
+    "do_synth": False,
 }
 cfg2 = {
     "idims": [(64, 128), (2, 4)],
@@ -207,6 +212,7 @@ cfg2 = {
     "ofm": 64,
     "depthwise": True,
     "pad_mode": "SAME_UPPER",
+    "do_synth": False,
 }
 
 
@@ -215,6 +221,7 @@ cfg2 = {
 @pytest.mark.vivado
 @pytest.mark.fpgadataflow
 def test_fpgadataflow_conv_dynamic(cfg):
+    do_synth = cfg["do_synth"]
     pad_mode = cfg["pad_mode"]
     depthwise = cfg["depthwise"]
     idims = cfg["idims"]
@@ -292,7 +299,7 @@ def test_fpgadataflow_conv_dynamic(cfg):
     model = model.transform(GiveReadableTensorNames())
     model = model.transform(PrepareIP("xc7z020clg400-1", 5))
     model = model.transform(HLSSynthIP())
-    model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5))
+    model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5, vitis=do_synth))
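+    # vitis=True additionally synthesizes the stitched design, implementing
+    # the per-config synthesis toggle described in cfg0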
     model.set_metadata_prop("exec_mode", "rtlsim")
 
     # loop through experiment configurations
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index b220338e6919e8eeaeef0f6e5343fed9b1dfca10..7e4069f5c481344560509d17c086ca2cbdbd0fda 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -206,7 +206,6 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode):
         assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
         assert os.path.isfile(sdp_node.get_nodeattr("model"))
         model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
-        model.set_metadata_prop("exec_mode", "remote_pynq")
     model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP(test_fpga_part, 5))
diff --git a/tests/notebooks/test_jupyter_notebooks.py b/tests/notebooks/test_jupyter_notebooks.py
index 819b4ccde0333cfdf6e16f30e25fb5303fbf1f70..836f1e059efc3cfae95fa9e2ccd0b74f6fca9c11 100644
--- a/tests/notebooks/test_jupyter_notebooks.py
+++ b/tests/notebooks/test_jupyter_notebooks.py
@@ -21,6 +21,7 @@ advanced_notebooks = [
     pytest.param(notebook_advanced_dir + "0_custom_analysis_pass.ipynb"),
     pytest.param(notebook_advanced_dir + "1_custom_transformation_pass.ipynb"),
     pytest.param(notebook_advanced_dir + "2_custom_op.ipynb"),
+    pytest.param(notebook_advanced_dir + "3_folding.ipynb"),
 ]
 
 cyber_notebooks = [