diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 69046127945791f6c21b9d5a9201f1ea550625b5..269fa38b73a2edd904bc6aad52522cddb3d33d25 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
 RUN apt-get update
 RUN apt-get -y upgrade
 RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev
-RUN apt-get install -y verilator zsh nano
+RUN apt-get install -y verilator zsh nano rsync
 RUN apt-get install -y sshpass wget unzip
 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 
@@ -47,6 +47,8 @@ RUN rm xrtdeps.sh
 # cloning dependency repos
 # finn-base
 RUN git clone https://github.com/Xilinx/finn-base.git /workspace/finn-base
+# finn-experimental
+RUN git clone https://github.com/Xilinx/finn-experimental.git /workspace/finn-experimental
 # Brevitas
 RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index 7875378ca20d2a5b0aa7123cfceef3ebc2451fe3..5976d0360e63168ed9d20e8cb0ee5de8e69656a5 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -73,6 +73,8 @@ USER $UNAME
 # cloning dependency repos (as user)
 # finn-base
 RUN git clone https://github.com/fpjentzsch/finn-base.git /workspace/finn-base
+# finn-experimental
+RUN git clone https://github.com/Xilinx/finn-experimental.git /workspace/finn-experimental
 # Brevitas
 RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 13939a2e44536068867e0f7c79bafe3fb16427d7..7a1a76e24163ca9fc79ffd19f909d9e79983d8ed 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -13,6 +13,7 @@ gecho () {
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
 FINN_BASE_COMMIT=4b40ff84e7c9210325a11bf73b8b9142b776f94c
+FINN_EXP_COMMIT=e9f97dcdb4db2f889b0f36af079a6a1792b7d4de
 BREVITAS_COMMIT=aff49758ec445d77c75721c7de3091a2a1797ca8
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
 HLSLIB_COMMIT=2e49322d1bbc4969ca293843bda1f3f9c05456fc
@@ -25,6 +26,11 @@ gecho "finn-base @ $FINN_BASE_COMMIT"
 git -C /workspace/finn-base pull --quiet
 git -C /workspace/finn-base checkout $FINN_BASE_COMMIT --quiet
 pip install --user -e /workspace/finn-base
+# finn-experimental
+gecho "finn-experimental @ $FINN_EXP_COMMIT"
+git -C /workspace/finn-experimental pull --quiet
+git -C /workspace/finn-experimental checkout $FINN_EXP_COMMIT --quiet
+pip install --user -e /workspace/finn-experimental
 # Brevitas
 gecho "brevitas @ $BREVITAS_COMMIT"
 git -C /workspace/brevitas pull --quiet
diff --git a/docker/quicktest.sh b/docker/quicktest.sh
index b06feccdc578a59c8ef00531871e1211c2a407e5..b4ad37232fa69754a86e9064d7592d7474e8617e 100755
--- a/docker/quicktest.sh
+++ b/docker/quicktest.sh
@@ -5,8 +5,8 @@
 cd $FINN_ROOT
 # check if command line argument is empty or not present
 if [ -z $1 ]; then
-  echo "Running quicktest: not (vivado or slow) with pytest-xdist"
-  python setup.py test --addopts "-m 'not (vivado or slow or vitis)' --dist=loadfile -n $PYTEST_PARALLEL"
+  echo "Running quicktest: not (vivado or slow or board) with pytest-xdist"
+  python setup.py test --addopts "-m 'not (vivado or slow or vitis or board)' --dist=loadfile -n $PYTEST_PARALLEL"
 elif [ $1 = "main" ]; then
   echo "Running main test suite: not (rtlsim or end2end) with pytest-xdist"
   python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL"
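
The new "board" marker excluded here is registered in setup.cfg later in this diff. As a hedged sketch (test name and body are hypothetical), a board-dependent test would opt in as follows, so the quicktest expression above deselects it:

```python
import pytest


@pytest.mark.board  # marker registered in setup.cfg; needs a physical PYNQ board attached
def test_remote_execution_on_pynq():  # hypothetical test name
    # deploy the bitfile to the board, run one batch and compare against expected outputs
    pass
```
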
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index bff31cde45122ebc25f515422ffc523f4f78e3be..25538a3ad1fa21b8e841b5f3fadc5388137d68bb 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -41,8 +41,7 @@ System Requirements
 * Docker `without root <https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user>`_
 * A working Vivado 2019.1 or 2020.1 installation
 * A ``VIVADO_PATH`` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located)
-* *(optional)* A PYNQ board with a network connection
-   * the ``bitstring`` package must be installed on the PYNQ: ``sudo pip3 install bitstring``
+* *(optional)* A PYNQ board with a network connection, see `PYNQ board first-time setup`_ below
 * *(optional)* An Alveo board, and a working Vitis 2020.1 installation if you want to use Vitis and Alveo (see `Alveo first-time setup`_ below)
 
 We also recommend running the FINN compiler on a system with sufficiently
@@ -139,6 +138,24 @@ As of FINN v0.4b we also have preliminary support for `Xilinx Alveo boards <http
 
 **Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator.
 
+PYNQ board first-time setup
+****************************
+We use *host* to refer to the PC running the FINN Docker environment, which will build the accelerator+driver and package it up, and *target* to refer to the PYNQ board. To be able to access the target from the host, you'll need to set up SSH public key authentication:
+
+Start on the target side:
+
+1. Note down the IP address of your PYNQ board. This IP address must be accessible from the host.
+2. Ensure the ``bitstring`` package is installed: ``sudo pip3 install bitstring``
+
+Continue on the host side (replace the ``<PYNQ_IP>`` and ``<PYNQ_USERNAME>`` with the IP address and username of your board from the first step):
+
+1. Launch the Docker container from where you cloned finn with ``./run-docker.sh``
+2. Go into the ``ssh_keys`` directory (e.g. ``cd /workspace/finn/ssh_keys``)
+3. Run ``ssh-keygen`` to create a key pair, e.g. the ``id_rsa`` private key and the ``id_rsa.pub`` public key
+4. Run ``ssh-copy-id -i id_rsa.pub <PYNQ_USERNAME>@<PYNQ_IP>`` to install the keys on the remote system
+5. Test that you can ``ssh <PYNQ_USERNAME>@<PYNQ_IP>`` without having to enter the password. If it doesn't work, pass the ``-v`` flag to the ssh command to help you debug.
+
+
 Alveo first-time setup
 **********************
 We use *host* to refer to the PC running the FINN Docker environment, which will build the accelerator+driver and package it up, and *target* to refer to the PC where the Alveo card is installed. These two can be the same PC, or connected over the network -- FINN includes some utilities to make it easier to test on remote PCs too. Prior to first usage, you need to set up both the host and the target in the following manner:
@@ -150,7 +167,7 @@ On the target side:
 3. Create a conda environment named *finn-pynq-alveo* by following this guide `to set up PYNQ for Alveo <https://pynq.readthedocs.io/en/latest/getting_started/alveo_getting_started.html>`_. It's best to follow the recommended environment.yml (set of package versions) in this guide.
 4. Activate the environment with `conda activate finn-pynq-alveo` and install the bitstring package with ``pip install bitstring``.
 5. Done! You should now be able to e.g. ``import pynq`` in Python scripts.
-6. (optional) If you don't want to specify the ``ALVEO_PASSWORD`` environment variable, you can `set up public key authentication <https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server>`_. Copy your private key to the ``finn/ssh_keys`` folder on the host to get password-less deployment and remote execution.
+
 
 
 On the host side:
@@ -159,4 +176,5 @@ On the host side:
 2. Install Xilinx XRT and set up the ``XILINX_XRT`` environment variable to point to your installation. *This must be the same path as the target's XRT (target step 1)*
 3. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation. *This must be the same path as the target's platform files (target step 2)*
 4. Set up the ``ALVEO_*`` environment variables accordingly for your target, see description of environment variables above.
+5. `Set up public key authentication <https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server>`_. Copy your private key to the ``finn/ssh_keys`` folder on the host to get password-less deployment and remote execution.
 5. Done! You can try the ``test_end2end_vitis`` tests in the FINN Docker to verify your setup, although this will take some time.
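
As a quick sanity check of the key-based setup described above, here is a hedged sketch (IP address, username and port are placeholders for your own board) that runs the same publickey-only ssh invocation the notebooks below rely on:

```python
# Minimal sketch: verify passwordless SSH to the PYNQ board from inside the FINN container.
import subprocess

ssh_opts = ["-o", "PreferredAuthentications=publickey", "-o", "PasswordAuthentication=no"]
ret = subprocess.run(
    ["ssh", *ssh_opts, "-p", "22", "xilinx@192.168.2.99", "echo", "ok"],
    capture_output=True,
    text=True,
)
print("publickey SSH works" if ret.returncode == 0 else ret.stderr)
```
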
diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
index 4130f35d7a371711fe1f6bf494358e3c93d8c136..a141caf423f5238245b509e077d32c6bd1a85fcd 100644
--- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
@@ -624,17 +624,30 @@
    "source": [
     "## 5. Deployment and Remote Execution\n",
     "\n",
-    "Now that we're done with the hardware generation, we can generate a Python driver for accelerator and copy the necessary files onto our PYNQ board."
+    "Now that we're done with the hardware generation, we can generate a Python driver for accelerator and copy the necessary files onto our PYNQ board.\n",
+    "\n",
+    "**Make sure you've [set up the SSH keys for your PYNQ board](https://finn-dev.readthedocs.io/en/latest/getting_started.html#pynq-board-first-time-setup) before executing this step.**"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Welcome to PYNQ Linux, based on Ubuntu 18.04 (GNU/Linux 5.4.0-xilinx-v2020.1 armv7l)\r\n",
+      "\r\n",
+      " * Pure upstream Kubernetes 1.21, smallest, simplest cluster ops!\r\n",
+      "\r\n",
+      "     https://microk8s.io/\r\n"
+     ]
+    }
+   ],
    "source": [
     "import os\n",
-    "from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ\n",
     "\n",
     "# set up the following values according to your own environment\n",
     "# FINN will use ssh to deploy and run the generated accelerator\n",
@@ -643,6 +656,20 @@
     "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n",
     "port = os.getenv(\"PYNQ_PORT\", 22)\n",
     "target_dir = os.getenv(\"PYNQ_TARGET_DIR\", \"/home/xilinx/finn_cnv_end2end_example\")\n",
+    "# set up ssh options to only allow publickey authentication\n",
+    "options = \"-o PreferredAuthentications=publickey -o PasswordAuthentication=no\"\n",
+    "\n",
+    "# test access to PYNQ board\n",
+    "! ssh {options} {username}@{ip} -p {port} cat /var/run/motd.dynamic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ\n",
     "\n",
     "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_synth.onnx\")\n",
     "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n",
@@ -689,7 +716,7 @@
     }
    ],
    "source": [
-    "! sshpass -p {password} ssh {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'"
+    "! ssh {options} {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'"
    ]
   },
   {
@@ -795,7 +822,7 @@
    "source": [
     "### Validating the Accuracy on a PYNQ Board <a id='validation'></a>\n",
     "\n",
-    "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`sshpass` and `echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n",
+    "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n",
     "\n",
     "**Ensure that your PYNQ board has a working internet connecting for the next steps, since some there is some downloading involved.**\n",
     "\n",
@@ -824,7 +851,7 @@
     }
    ],
    "source": [
-    "! sshpass -p {password} ssh -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'"
+    "! ssh {options} -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'"
    ]
   },
   {
@@ -866,7 +893,7 @@
     }
    ],
    "source": [
-    "! sshpass -p {password} ssh -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset cifar10 --batchsize 1000'"
+    "! ssh {options} -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset cifar10 --batchsize 1000'"
    ]
   },
   {
diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
index 8cbff4fcea58d452b1e35c0dab647a8f922dc2c0..5ed4b170b4eeee4b438d9539d2317a7d5eab5df2 100644
--- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
@@ -1337,7 +1337,43 @@
    "source": [
     "### Deployment and Remote Execution <a id='deploy'></a>\n",
     "\n",
-    "We'll now use the `DeployToPYNQ` transformation to create a deployment folder with the bitfile and driver file(s), and copy that to the PYNQ board. You can change the default IP address, username, password and target folder for the PYNQ below."
+    "We'll now use the `DeployToPYNQ` transformation to create a deployment folder with the bitfile and driver file(s), and copy that to the PYNQ board. You can change the default IP address, username, password and target folder for the PYNQ below.\n",
+    "\n",
+    "**Make sure you've [set up the SSH keys for your PYNQ board](https://finn-dev.readthedocs.io/en/latest/getting_started.html#pynq-board-first-time-setup) before executing this step.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Welcome to PYNQ Linux, based on Ubuntu 18.04 (GNU/Linux 5.4.0-xilinx-v2020.1 armv7l)\r\n",
+      "\r\n",
+      " * Pure upstream Kubernetes 1.21, smallest, simplest cluster ops!\r\n",
+      "\r\n",
+      "     https://microk8s.io/\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "# set up the following values according to your own environment\n",
+    "# FINN will use ssh to deploy and run the generated accelerator\n",
+    "ip = os.getenv(\"PYNQ_IP\", \"192.168.2.99\")\n",
+    "username = os.getenv(\"PYNQ_USERNAME\", \"xilinx\")\n",
+    "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n",
+    "port = os.getenv(\"PYNQ_PORT\", 22)\n",
+    "target_dir = os.getenv(\"PYNQ_TARGET_DIR\", \"/home/xilinx/finn_tfc_end2end_example\")\n",
+    "# set up ssh options to only allow publickey authentication\n",
+    "options = \"-o PreferredAuthentications=publickey -o PasswordAuthentication=no\"\n",
+    "\n",
+    "# test access to PYNQ board\n",
+    "! ssh {options} {username}@{ip} -p {port} cat /var/run/motd.dynamic"
    ]
   },
   {
@@ -1347,11 +1383,7 @@
    "outputs": [],
    "source": [
     "from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ\n",
-    "ip = \"192.168.2.99\"\n",
-    "port = \"22\"\n",
-    "username = \"xilinx\"\n",
-    "password = \"xilinx\"\n",
-    "target_dir = \"/home/xilinx/finn_tfc_end2end_example\"\n",
+    "\n",
     "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n",
     "model.save(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")"
    ]
@@ -1456,7 +1488,7 @@
     }
    ],
    "source": [
-    "! sshpass -p {password} ssh {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'"
+    "! ssh {options} {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'"
    ]
   },
   {
@@ -1578,7 +1610,7 @@
    "source": [
     "### Validating the Accuracy on a PYNQ Board <a id='validation'></a>\n",
     "\n",
-    "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`sshpass` and `echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n",
+    "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n",
     "\n",
     "**Ensure that your PYNQ board has a working internet connecting for the next steps, since some there is some downloading involved.**\n",
     "\n",
@@ -1587,7 +1619,7 @@
     "\n",
     "Command to execute on PYNQ:\n",
     "\n",
-    "```pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading```"
+    "```sudo pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading```"
    ]
   },
   {
@@ -1609,7 +1641,7 @@
     }
    ],
    "source": [
-    "! sshpass -p {password} ssh -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'"
+    "! ssh {options} -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'"
    ]
   },
   {
@@ -1656,7 +1688,7 @@
     }
    ],
    "source": [
-    "! sshpass -p {password} ssh -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset mnist --batchsize 1000'"
+    "! ssh {options} -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset mnist --batchsize 1000'"
    ]
   },
   {
diff --git a/setup.cfg b/setup.cfg
index 45fe40156acd966fed302522e9e8ca716a4d331c..e98077ddf1e60f69513d85193374414636a0f355 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -109,6 +109,7 @@ markers =
     slow: marks tests as slow (deselect with '-m "not slow"')
     vivado: mark tests that require Vivado or Vivado HLS
     vitis: mark tests that require Vitis
+    board: mark tests that require a PYNQ board
 norecursedirs =
     dist
     build
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index b206e00a2eb6da1d76ccf57c078b16f61868a98c..bd938f17411ee42e94e95e02776ad8e973ea10fa 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -114,9 +114,9 @@ default_build_dataflow_steps = [
     "step_set_fifo_depths",
     "step_create_stitched_ip",
     "step_measure_rtlsim_performance",
-    "step_make_pynq_driver",
     "step_out_of_context_synthesis",
     "step_synthesize_bitfile",
+    "step_make_pynq_driver",
     "step_deployment_package",
 ]
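
The reordering above makes driver generation run after bitfile synthesis in the default flow. A minimal sketch to inspect the resulting order, assuming only the default_build_dataflow_steps list defined in this file:

```python
from finn.builder.build_dataflow_config import default_build_dataflow_steps

# "step_make_pynq_driver" should now appear after "step_synthesize_bitfile"
for i, step_name in enumerate(default_build_dataflow_steps):
    print(i, step_name)
```
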
 
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 05e41a48a8f4cb34616bf06c01b652afb9ae4257..38940ccb94f11fe49af5f49ee020f150326a026c 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -63,7 +63,7 @@ class AddStreams_Batch(HLSCustomOp):
         ishape = tuple(vecs + [ich])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         assert ich % pe == 0, "PE must divide NumChannels"
@@ -362,5 +362,5 @@ class AddStreams_Batch(HLSCustomOp):
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["s_axis"] = ["in0_V_V", "in1_V_V"]
+        intf_names["s_axis"].append(("in1_V_V", self.get_instream_width_padded()))
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 10a8051730217b56873b5a53c0803e3b90dada90..73da77bd3f940cee5ffd10fcfc43571f1a612eb4 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -312,7 +312,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = [
             """DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format(
-                self.get_outstream_width(), self.get_number_output_values() // 2,
+                self.get_outstream_width(),
+                self.get_number_output_values() // 2,
             )
         ]
 
@@ -378,5 +379,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["m_axis"] = ["out0_V_V", "out1_V_V"]
+        intf_names["m_axis"] = [
+            ("out0_V_V", self.get_outstream_width_padded()),
+            ("out1_V_V", self.get_outstream_width_padded()),
+        ]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 02912b2d5f45b3bab0eaca13ee0a0bf19bf9cfca..2ab070b2fdc059a554930345a81abc368c29bfa7 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -123,15 +123,16 @@ class HLSCustomOp(CustomOp):
         """Return a dict of names of input and output interfaces.
         The keys reflect the protocols each interface implements:
         'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
-        Values are lists of names:
-        's_axis' names correspond to the list of node inputs in order,
-        'm_axis' names correspond to the list of node outputs in order'
+        Values are lists of tuples (for axis and aximm) or names (for axilite):
+        's_axis'/'m_axis' tuples correspond to the node inputs/outputs in order;
+        each tuple is (interface_name, interface_width_bits).
+        axilite interfaces are assumed to be 32 bits wide and are given as names only.
         Each block must have at most one aximm and one axilite."""
         intf_names = {}
         intf_names["clk"] = ["ap_clk"]
         intf_names["rst"] = ["ap_rst_n"]
-        intf_names["s_axis"] = ["in0_V_V"]
-        intf_names["m_axis"] = ["out_V_V"]
+        intf_names["s_axis"] = [("in0_V_V", self.get_instream_width_padded())]
+        intf_names["m_axis"] = [("out_V_V", self.get_outstream_width_padded())]
         intf_names["aximm"] = []
         intf_names["axilite"] = []
         return intf_names
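
A hedged illustration of the interface dictionary format described in the docstring above: stream and AXI-MM interfaces are now (name, width-in-bits) tuples, while axilite entries stay plain names. The widths below are made-up example values, not taken from a real node:

```python
# Example of what get_verilog_top_module_intf_names() may return for a
# single-input, single-output HLS node (widths are illustrative only):
intf_names = {
    "clk": ["ap_clk"],
    "rst": ["ap_rst_n"],
    "s_axis": [("in0_V_V", 8)],   # (interface name, padded stream width in bits)
    "m_axis": [("out_V_V", 16)],
    "aximm": [],                  # at most one entry, also a (name, width) tuple
    "axilite": [],                # names only, assumed 32-bit
}
in_name, in_bits = intf_names["s_axis"][0]
```
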
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index a6cddcc4aeb45957c16249cd57f122fe5e58b85a..857496a2614894588ebf065db3e384cf2cecf106 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -355,11 +355,9 @@ class IODMA(HLSCustomOp):
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
         if self.get_nodeattr("direction") == "out":
-            intf_names["s_axis"] = ["in0_V_V"]
             intf_names["m_axis"] = []
         else:
             intf_names["s_axis"] = []
-            intf_names["m_axis"] = ["out_V_V"]
         intf_names["axilite"] = ["s_axi_control"]
-        intf_names["aximm"] = ["m_axi_gmem"]
+        intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("intfWidth"))]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 23c1779a27c123583c0c8af5f53d022d03e78126..4d84b74dce001fca769ed2850a8f718ac942f14c 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -395,8 +395,8 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 8868002c9e2cb8726eeb573e104140e3e1a61d27..3cc01ade73fc6b735509f2839e5c10785a8b9f54 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -444,12 +444,24 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         single_pe_w = simd * weight_bits
         return max([weightstream, max_of_io, single_pe_w])
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
         simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
         sf = mw // simd
+        nf = mh // pe
         vecs = list(self.get_nodeattr("numInputVectors"))
-        folded_input_shape = tuple(vecs + [sf, simd])
+
+        if ind == 0:
+            # calculate shape of input 0
+            folded_input_shape = tuple(vecs + [sf, simd])
+        elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # calculate shape of input 1 (weights)
+            folded_input_shape = tuple(vecs + [sf * nf, simd * pe])
+        else:
+            raise Exception("Undefined input shape for requested input")
+
         return folded_input_shape
 
     def get_folded_output_shape(self):
@@ -1253,8 +1265,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
@@ -1348,8 +1360,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 # TODO calculate and pass in segment size here
                 cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
-        elif mem_mode == "const":
-            # base class impl sufficient for const mode
+        elif mem_mode == "const" or mem_mode == "external":
+            # base class impl sufficient for const/external modes
             return super().code_generation_ipi()
         else:
             raise Exception("Unrecognized mem_mode for StreamingFCLayer")
@@ -1359,7 +1371,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         intf_names = super().get_verilog_top_module_intf_names()
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "external":
-            intf_names["s_axis"] = ["in0_V_V", "weights_V_V"]
+            intf_names["s_axis"].append(
+                ("weights_V_V", self.get_weightstream_width_padded())
+            )
         if mem_mode == "decoupled":
             # only expose axilite interface if attribute is set
             runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
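
A hedged usage sketch of the extended get_folded_input_shape: index 0 is the activation stream, index 1 the weight stream of an FC layer in external mem_mode. The model variable is assumed to hold an already-prepared dataflow graph:

```python
from finn.custom_op.registry import getCustomOp

# assumes `model` is a ModelWrapper containing at least one StreamingFCLayer_Batch node
fc_node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
fc_inst = getCustomOp(fc_node)
act_shape = fc_inst.get_folded_input_shape(ind=0)      # (*numInputVectors, sf, simd)
if fc_inst.get_nodeattr("mem_mode") == "external":
    w_shape = fc_inst.get_folded_input_shape(ind=1)    # (*numInputVectors, sf*nf, simd*pe)
```
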
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index fb41bceca09fe544bd729537b1af726c9c43d290..133a869b28cf9968a719e243a3266dfb25b637ba 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -343,8 +343,8 @@ class StreamingFIFO(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 30374a7d97f4d2189e142a9b7b6e44a5abbb46b0..0b248c15035a2b685ebfb024c8a944a6ea6c65bf 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -825,8 +825,8 @@ class Thresholding_Batch(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index bedaf0984c39ef7603e6829961d7a3efb6ff489f..70edaee9cfc0662411d005325e781f13b4f1b510 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -243,12 +243,13 @@ class TLastMarker(HLSCustomOp):
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
+        stream_width = self.get_nodeattr("StreamWidth")
         if self.get_nodeattr("Direction") == "in":
-            intf_names["s_axis"] = ["in0"]
-            intf_names["m_axis"] = ["out_V_V"]
+            intf_names["s_axis"] = [("in0", stream_width)]
+            intf_names["m_axis"] = [("out_V_V", stream_width)]
         else:
-            intf_names["s_axis"] = ["in0_V_V"]
-            intf_names["m_axis"] = ["out_r"]
+            intf_names["s_axis"] = [("in0_V_V", stream_width)]
+            intf_names["m_axis"] = [("out_r", stream_width)]
         if self.get_nodeattr("DynIters") == 1:
             intf_names["axilite"] = ["s_axi_control"]
         return intf_names
diff --git a/src/finn/qnn-data/cybsec-mlp/state_dict.pth b/src/finn/qnn-data/cybsec-mlp/state_dict.pth
new file mode 100644
index 0000000000000000000000000000000000000000..53c002e3fa6f2ae3e7c8f0abb71fa446d80a8f09
Binary files /dev/null and b/src/finn/qnn-data/cybsec-mlp/state_dict.pth differ
diff --git a/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py b/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fabc716a66a3cc24697e49aa26ec3bbbb231b43
--- /dev/null
+++ b/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+from driver import io_shape_dict
+from driver_base import FINNExampleOverlay
+import numpy as np
+
+
+def make_unsw_nb15_test_batches(bsize, dataset_root, limit_batches):
+    unsw_nb15_data = np.load(dataset_root + "/unsw_nb15_binarized.npz")["test"][:82000]
+    test_imgs = unsw_nb15_data[:, :-1]
+    test_labels = unsw_nb15_data[:, -1]
+    n_batches = int(test_imgs.shape[0] / bsize)
+    if limit_batches == -1:
+        limit_batches = n_batches
+    test_imgs = test_imgs.reshape(n_batches, bsize, -1)[:limit_batches]
+    test_labels = test_labels.reshape(n_batches, bsize)[:limit_batches]
+    return (test_imgs, test_labels)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Validate top-1 accuracy for FINN-generated accelerator"
+    )
+    parser.add_argument("--batchsize", help="samples per batch", type=int, default=1000)
+    parser.add_argument(
+        "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma"
+    )
+    parser.add_argument(
+        "--bitfile",
+        help='name of bitfile (e.g. "resizer.bit")',
+        default="../bitfile/finn-accel.bit",
+    )
+    parser.add_argument(
+        "--dataset_root", help="dataset root dir for download/reuse", default="."
+    )
+    parser.add_argument(
+        "--limit_batches", help="number of batches, -1 for max", type=int, default=-1
+    )
+    # parse arguments
+    args = parser.parse_args()
+    bsize = args.batchsize
+    bitfile = args.bitfile
+    platform = args.platform
+    dataset_root = args.dataset_root
+    limit_batches = args.limit_batches
+
+    print("Loading dataset...")
+    (test_imgs, test_labels) = make_unsw_nb15_test_batches(
+        bsize, dataset_root, limit_batches
+    )
+
+    ok = 0
+    nok = 0
+    n_batches = test_imgs.shape[0]
+    total = n_batches * bsize
+
+    print("Initializing driver, flashing bitfile...")
+
+    driver = FINNExampleOverlay(
+        bitfile_name=bitfile,
+        platform=platform,
+        io_shape_dict=io_shape_dict,
+        batch_size=bsize,
+    )
+
+    n_batches = int(total / bsize)
+
+    print("Starting...")
+
+    for i in range(n_batches):
+        inp = np.pad(test_imgs[i].astype(np.float32), [(0, 0), (0, 7)], mode="constant")
+        exp = test_labels[i].astype(np.float32)
+        inp = 2 * inp - 1
+        exp = 2 * exp - 1
+        out = driver.execute(inp)
+        matches = np.count_nonzero(out.flatten() == exp.flatten())
+        nok += bsize - matches
+        ok += matches
+        print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok))
+
+    acc = 100.0 * ok / (total)
+    print("Final accuracy: %f" % acc)
diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py
index ef16a537ce18c52ea42ce9178a7178e8f8b667dd..df3c9881372659a4d8f6fceb8a385e6055c161e1 100644
--- a/src/finn/qnn-data/templates/driver/driver_base.py
+++ b/src/finn/qnn-data/templates/driver/driver_base.py
@@ -37,6 +37,9 @@ from finn.util.data_packing import (
     packed_bytearray_to_finnpy,
 )
 
+from finn.util.basic import gen_finn_dt_tensor
+from finn.core.datatype import DataType
+
 # Driver base class for FINN-generated dataflow accelerators.
 # The particulars of the generated accelerator are specified via the
 # io_shape_dict (generated by the MakePYNQDriver transformation).
@@ -84,25 +87,78 @@ class FINNExampleOverlay(Overlay):
         self.batch_size = batch_size
         self.fclk_mhz = fclk_mhz
         if self.platform == "alveo":
-            self.idma = self.idma0
+            if "input_dma_name" in io_shape_dict.keys():
+                self.idma = getattr(self, io_shape_dict["input_dma_name"])
+            else:
+                self.idma = self.idma0
             self.odma = self.odma0
             self.odma_handle = None
         elif self.platform == "zynq-iodma":
-            self.idma = self.idma0
+            if "input_dma_name" in io_shape_dict.keys():
+                self.idma = getattr(self, io_shape_dict["input_dma_name"])
+            else:
+                self.idma = self.idma0
             self.odma = self.odma0
             # set the clock frequency as specified by user during transformations
             if self.fclk_mhz > 0:
                 Clocks.fclk0_mhz = self.fclk_mhz
         else:
             raise ValueError("Supported platforms are zynq-iodma alveo")
-        # load any runtime weights
+        # load any external + runtime weights
+        self.load_external_weights()
         self.load_runtime_weights()
 
+    def load_external_weights(self):
+        """Load any existing external (DRAM) weights from the specified dir into the
+        appropriate layer of the accelerator. Note that this must be enabled
+        during the accelerator build process. The weights directory
+        is specified as the class member ``runtime_weight_dir``. External (DRAM)
+        weights are one .npy file per layer.
+        """
+
+        self.external_weights = []
+        w_filenames = []
+        if not os.path.isdir(self.runtime_weight_dir):
+            return
+        for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir):
+            w_filenames.extend(filenames)
+
+        tmp_weight_dict = {}
+
+        for w_filename in w_filenames:
+            if w_filename.endswith(".npy"):
+                weight_tensor = np.load(self.runtime_weight_dir + "/" + w_filename)
+            else:
+                continue
+
+            idma_name = w_filename.split(".")[0]
+            tmp_weight_dict[idma_name] = weight_tensor
+
+        for idma_name in tmp_weight_dict.keys():
+            if idma_name in self.ip_dict.keys():
+                iwdma = getattr(self, idma_name)
+                weight_tensor = tmp_weight_dict[idma_name]
+                weight_buf = allocate(weight_tensor.shape, dtype=np.uint8)
+                weight_buf[:] = weight_tensor
+                # weight_buf.sync_to_device()
+                weight_buf.flush()
+
+                self.external_weights += [(iwdma, weight_buf, idma_name)]
+
+        if "number_of_external_weights" in self._io_shape_dict:
+            hw_ext_weights = self._io_shape_dict["number_of_external_weights"]
+            assert len(self.external_weights) == hw_ext_weights, (
+                "Number of hardware external weights and number of external "
+                + "weight tensors available do not match. \n"
+                + "Is runtime_weight_dir pointing to the correct folder?"
+            )
+
     def load_runtime_weights(self, flush_accel=True, verify=True):
-        """Load any existing runtime weights from the specified dir into the
+        """Load any existing runtime-writable weights from the specified dir into the
         appropriate layer of the accelerator. Note that this must be enabled
         during the accelerator build process. The runtime weights directory
-        is specified as the class member ``runtime_weight_dir``.
+        is specified as the class member ``runtime_weight_dir``. Runtime-writable
+        weights are provided as one .dat file per layer.
 
         Parameters
         ----------
@@ -122,18 +178,25 @@ class FINNExampleOverlay(Overlay):
             if w_filename.endswith(".dat"):
                 with open(self.runtime_weight_dir + "/" + w_filename, "r") as f:
                     dat = f.read()
+            else:
+                continue
             layer_w = np.fromiter(
                 [int(x, 16) for x in dat.strip().split()], dtype=np.uint32
             )
-            layer_ind = int(w_filename.split("_")[0])
-            rt_weight_dict[layer_ind] = layer_w
-        for layer_ind in rt_weight_dict.keys():
-            cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind
+            sdp_ind = int(w_filename.split("_")[0])
+            layer_ind = int(w_filename.split("_")[1])
+            rt_weight_dict[(sdp_ind, layer_ind)] = layer_w
+        for sdp_ind, layer_ind in rt_weight_dict.keys():
+            cand_if_name = "StreamingDataflowPartition_%d/s_axilite_%d" % (
+                sdp_ind,
+                layer_ind,
+            )
             if cand_if_name in self.ip_dict.keys():
                 layer_mmio = getattr(
-                    self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind
+                    getattr(self, "StreamingDataflowPartition_%d" % sdp_ind),
+                    "s_axilite_%d" % layer_ind,
                 ).mmio
-                layer_w = rt_weight_dict[layer_ind]
+                layer_w = rt_weight_dict[(sdp_ind, layer_ind)]
                 layer_mmio.write_mm(0, layer_w.tobytes())
                 if verify:
                     new_w = np.copy(layer_mmio.array[: layer_w.shape[0]])
@@ -278,6 +341,10 @@ class FINNExampleOverlay(Overlay):
         if self.platform == "zynq-iodma":
             assert self.odma.read(0x00) & 0x4 != 0, "Output DMA is not idle"
             # manually launch IODMAs since signatures are missing
+            for iwdma, iwbuf, iwdma_name in self.external_weights:
+                iwdma.write(0x10, iwbuf.device_address)
+                iwdma.write(0x1C, batch_size)
+                iwdma.write(0x00, 1)
             self.idma.write(0x10, self.ibuf_packed_device.device_address)
             self.idma.write(0x1C, batch_size)
             self.odma.write(0x10, self.obuf_packed_device.device_address)
@@ -287,6 +354,8 @@ class FINNExampleOverlay(Overlay):
         elif self.platform == "alveo":
             assert self.odma_handle is None, "Output DMA is already running"
             self.idma.start(self.ibuf_packed_device, batch_size)
+            for iwdma, iwbuf, iwdma_name in self.external_weights:
+                iwdma.start(iwbuf, batch_size)
             self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size)
         else:
             raise Exception("Unrecognized platform: %s" % self.platform)
@@ -338,46 +407,55 @@ class FINNExampleOverlay(Overlay):
         res["DRAM_out_bandwidth[Mb/s]"] = (
             np.prod(self.oshape_packed) * 0.000001 / runtime
         )
+        for iwdma, iwbuf, iwdma_name in self.external_weights:
+            res["DRAM_extw_%s_bandwidth[Mb/s]" % iwdma_name] = (
+                self.batch_size * np.prod(iwbuf.shape) * 0.000001 / runtime
+            )
         if self.platform == "zynq-iodma":
             res["fclk[mhz]"] = Clocks.fclk0_mhz
         elif self.platform == "alveo":
             res["fclk[mhz]"] = self.clock_dict["clock0"]["frequency"]
         res["batch_size"] = self.batch_size
         # also benchmark driver-related overheads
-        input_npy = np.zeros(self.ishape_normal, dtype=self.idt.to_numpy_dt())
+        input_npy = gen_finn_dt_tensor(self.idt, self.ishape_normal)
+        # provide as int8/uint8 to support fast packing path where possible
+        if self.idt == DataType.UINT8:
+            input_npy = input_npy.astype(np.uint8)
+        elif self.idt == DataType.INT8:
+            input_npy = input_npy.astype(np.int8)
         start = time.time()
         ibuf_folded = self.fold_input(input_npy)
         end = time.time()
         runtime = end - start
-        res["fold_input[ms]"] = runtime
+        res["fold_input[ms]"] = runtime * 1000
 
         start = time.time()
         ibuf_packed = self.pack_input(ibuf_folded)
         end = time.time()
         runtime = end - start
-        res["pack_input[ms]"] = runtime
+        res["pack_input[ms]"] = runtime * 1000
 
         start = time.time()
         self.copy_input_data_to_device(ibuf_packed)
         end = time.time()
         runtime = end - start
-        res["copy_input_data_to_device[ms]"] = runtime
+        res["copy_input_data_to_device[ms]"] = runtime * 1000
 
         start = time.time()
         self.copy_output_data_from_device(self.obuf_packed)
         end = time.time()
         runtime = end - start
-        res["copy_output_data_from_device[ms]"] = runtime
+        res["copy_output_data_from_device[ms]"] = runtime * 1000
 
         start = time.time()
         obuf_folded = self.unpack_output(self.obuf_packed)
         end = time.time()
         runtime = end - start
-        res["unpack_output[ms]"] = runtime
+        res["unpack_output[ms]"] = runtime * 1000
 
         start = time.time()
         self.unfold_output(obuf_folded)
         end = time.time()
         runtime = end - start
-        res["unfold_output[ms]"] = runtime
+        res["unfold_output[ms]"] = runtime * 1000
         return res
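
Based on the docstrings above, a hedged sketch of how the runtime_weight_dir contents map to the two loaders: .npy files named after their weight iDMA feed load_external_weights, while <sdp_ind>_<layer_ind>_*.dat files feed load_runtime_weights. The file names and the runtime_weight_dir keyword below are illustrative assumptions:

```python
# Illustrative runtime_weight_dir layout (names are examples, not generated here):
#   runtime_weights/
#     idma1.npy                -> external (DRAM) weights streamed by the iDMA named "idma1"
#     0_2_runtime_weights.dat  -> runtime-writable weights for
#                                 StreamingDataflowPartition_0 / s_axilite_2
from driver import io_shape_dict
from driver_base import FINNExampleOverlay  # template driver from the deployment package

accel = FINNExampleOverlay(
    bitfile_name="../bitfile/finn-accel.bit",
    platform="zynq-iodma",
    io_shape_dict=io_shape_dict,
    batch_size=1,
    runtime_weight_dir="runtime_weights/",  # assumed keyword; both loaders scan this dir
)
```
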
diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json
new file mode 100644
index 0000000000000000000000000000000000000000..299a8be815aeaba70c0f41e4b1b3252b77c6f042
--- /dev/null
+++ b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json
@@ -0,0 +1,30 @@
+{
+    "Defaults": {},
+    "Thresholding_Batch_0": {
+      "PE": 49,
+      "ram_style": "distributed"
+    },
+    "StreamingFCLayer_Batch_0": {
+      "PE": 16,
+      "SIMD": 49,
+      "ram_style": "block"
+    },
+    "StreamingFCLayer_Batch_1": {
+      "PE": 8,
+      "SIMD": 8,
+      "mem_mode": "external"
+    },
+    "StreamingFCLayer_Batch_2": {
+      "PE": 8,
+      "SIMD": 8,
+      "mem_mode": "external"
+    },
+    "StreamingFCLayer_Batch_3": {
+      "PE": 10,
+      "SIMD": 8,
+      "ram_style": "distributed"
+    },
+    "LabelSelect_Batch_0": {
+      "PE": 1
+    }
+  }
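
A hedged sketch of feeding a folding configuration like the one above into the FINN dataflow builder; the field names follow the builder config in this repository but may differ between versions, and the paths and board are placeholders:

```python
from finn.builder.build_dataflow import build_dataflow_cfg
from finn.builder.build_dataflow_config import (
    DataflowBuildConfig,
    DataflowOutputType,
    ShellFlowType,
)

cfg = DataflowBuildConfig(
    output_dir="output_tfc_w1a1_extw",
    synth_clk_period_ns=10.0,
    board="Pynq-Z1",
    shell_flow_type=ShellFlowType.VIVADO_ZYNQ,
    folding_config_file="src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json",
    generate_outputs=[DataflowOutputType.BITFILE, DataflowOutputType.PYNQ_DRIVER],
)
build_dataflow_cfg("tfc_w1a1.onnx", cfg)  # placeholder ONNX model filename
```
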
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 19fa5c603bfafe16ed151e10fa8eb11a79106ede..738f2000a1929024d3808dd7bad0267338b51659 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -29,17 +29,43 @@
 import os
 import warnings
 import subprocess
+import json
 
 from finn.transformation.base import Transformation
-from finn.util.basic import get_by_name, make_build_dir, is_finn_op
+from finn.util.basic import make_build_dir, get_num_default_workers
+from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.custom_op.registry import getCustomOp
-from finn.util.basic import get_num_default_workers
 import multiprocessing as mp
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
 
 
+def is_external_input(model, node, i):
+    # indicate whether input i of node should be made external
+    # True only if input is unconnected and has no initializer
+    # The only exception is the second input of FC layers when mem_mode is external
+    node_inst = getCustomOp(node)
+    producer = model.find_producer(node.input[i])
+    if producer is None:
+        if model.get_initializer(node.input[i]) is None:
+            return True
+        else:
+            if node.op_type == "StreamingFCLayer_Batch":
+                if node_inst.get_nodeattr("mem_mode") == "external":
+                    return True
+    return False
+
+
+def is_external_output(model, node, i):
+    # indicate whether output i of node should be made external
+    # True only if output is unconnected
+    consumers = model.find_consumers(node.output[i])
+    if consumers is None:
+        return True
+    return False
+
+
 class CreateStitchedIP(Transformation):
     """Create a Vivado IP Block Design project from all the generated IPs of a
     graph. All nodes in the graph must have the fpgadataflow backend attribute,
@@ -134,21 +160,24 @@ class CreateStitchedIP(Transformation):
         if len(aximm_intf_name) != 0:
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
-                % (inst_name, aximm_intf_name[0])
+                % (inst_name, aximm_intf_name[0][0])
             )
             self.connect_cmds.append(
                 "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]"
             )
-            self.intf_names["aximm"] = ["m_axi_gmem0"]
+            self.intf_names["aximm"] = [("m_axi_gmem0", aximm_intf_name[0][1])]
             assert self.has_aximm is False, "Currently limited to one AXI-MM interface"
             self.has_aximm = True
 
-    def connect_m_axis_external(self, node):
+    def connect_m_axis_external(self, node, idx=None):
         inst_name = node.name
         node_inst = getCustomOp(node)
         output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"]
         # make output axis external
-        for output_intf_name in output_intf_names:
+        for i in range(len(output_intf_names)):
+            if idx is not None and idx != i:
+                continue
+            output_intf_name = output_intf_names[i][0]
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
                 % (inst_name, output_intf_name)
@@ -158,15 +187,20 @@ class CreateStitchedIP(Transformation):
                 % (self.m_axis_idx, output_intf_name)
             )
             self.has_m_axis = True
-            self.intf_names["m_axis"].append("m_axis_%d" % self.m_axis_idx)
+            self.intf_names["m_axis"].append(
+                ("m_axis_%d" % self.m_axis_idx, output_intf_names[i][1])
+            )
             self.m_axis_idx += 1
 
-    def connect_s_axis_external(self, node):
+    def connect_s_axis_external(self, node, idx=None):
         inst_name = node.name
         node_inst = getCustomOp(node)
         input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"]
         # make input axis external
-        for input_intf_name in input_intf_names:
+        for i in range(len(input_intf_names)):
+            if idx is not None and idx != i:
+                continue
+            input_intf_name = input_intf_names[i][0]
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
                 % (inst_name, input_intf_name)
@@ -176,7 +210,9 @@ class CreateStitchedIP(Transformation):
                 % (self.s_axis_idx, input_intf_name)
             )
             self.has_s_axis = True
-            self.intf_names["s_axis"].append("s_axis_%d" % self.s_axis_idx)
+            self.intf_names["s_axis"].append(
+                ("s_axis_%d" % self.s_axis_idx, input_intf_names[i][1])
+            )
             self.s_axis_idx += 1
 
     def apply(self, model):
@@ -187,70 +223,38 @@ class CreateStitchedIP(Transformation):
         ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
-            assert is_finn_op(node.domain), "Found non-FINN node"
-            backend_attribute = get_by_name(node.attribute, "backend")
-            assert backend_attribute is not None, "Backend node attribute is not set."
-            backend_value = backend_attribute.s.decode("UTF-8")
-            assert (
-                backend_value == "fpgadataflow"
-            ), """Backend node attribute is not
-            set to "fpgadataflow"."""
+            assert is_fpgadataflow_node(
+                node
+            ), "All nodes must be FINN fpgadataflow nodes."
             node_inst = getCustomOp(node)
             ip_dir_value = node_inst.get_nodeattr("ip_path")
             assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist."
             ip_dirs += [ip_dir_value]
             self.create_cmds += node_inst.code_generation_ipi()
-            my_producer = model.find_producer(node.input[0])
             self.connect_clk_rst(node)
             self.connect_axi(node)
-            if my_producer is None:
-                # first node in graph
-                self.connect_s_axis_external(node)
-                if node.op_type == "TLastMarker":
-                    assert (
-                        node_inst.get_nodeattr("Direction") == "in"
-                    ), """Output TLastMarker incorrect direction"""
-                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
-                    # don't apply this check for a 1-node partition
-                    assert (
-                        node_inst.get_nodeattr("direction") == "in"
-                    ), """Input DMA incorrect direction"""
-            else:
-                # intermediate node
-                # wire up input(s) to previous node output(s)
-                # foreach input
-                #     find producer
-                #     find index of producer output connected to our target input
-                #     get names of hdl interfaces for input and producer output
-                #     issue a TCL directive to connect input to output
-                #     if FC layer with mode "decoupled", add a streamer on input 1
-                for i in range(len(node.input)):
+            for i in range(len(node.input)):
+                if is_external_input(model, node, i):
+                    self.connect_s_axis_external(node, idx=i)
+                else:
                     producer = model.find_producer(node.input[i])
                     if producer is None:
                         continue
                     j = list(producer.output).index(node.input[i])
                     src_intf_name = getCustomOp(
                         producer
-                    ).get_verilog_top_module_intf_names()["m_axis"][j]
+                    ).get_verilog_top_module_intf_names()["m_axis"][j][0]
                     dst_intf_name = node_inst.get_verilog_top_module_intf_names()[
                         "s_axis"
-                    ][i]
+                    ][i][0]
                     self.connect_cmds.append(
                         "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                         "[get_bd_intf_pins %s/%s]"
                         % (producer.name, src_intf_name, node.name, dst_intf_name)
                     )
-            if model.find_consumers(node.output[0]) is None:
-                # last node in graph
-                self.connect_m_axis_external(node)
-                if node.op_type == "TLastMarker":
-                    assert (
-                        node_inst.get_nodeattr("Direction") == "out"
-                    ), """Output TLastMarker incorrect direction"""
-                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
-                    assert (
-                        node_inst.get_nodeattr("direction") == "out"
-                    ), """Output DMA incorrect direction"""
+            for i in range(len(node.output)):
+                if is_external_output(model, node, i):
+                    self.connect_m_axis_external(node, idx=i)
 
         # create a temporary folder for the project
         prjname = "finn_vivado_stitch_proj"
@@ -316,7 +320,7 @@ class CreateStitchedIP(Transformation):
         block_library = "finn"
         block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name)
         model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv)
-        model.set_metadata_prop("vivado_stitch_ifnames", str(self.intf_names))
+        model.set_metadata_prop("vivado_stitch_ifnames", json.dumps(self.intf_names))
         tcl.append(
             (
                 "ipx::package_project -root_dir %s/ip -vendor %s "
diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py
index c6bedd466e31efb622640cbd203d344ff9b3d88f..3434183b1480eb38ee267328042aec33e87e0446 100644
--- a/src/finn/transformation/fpgadataflow/floorplan.py
+++ b/src/finn/transformation/fpgadataflow/floorplan.py
@@ -58,16 +58,21 @@ class Floorplan(Transformation):
 
         # read in a user-specified floorplan or generate a default one
         if self.user_floorplan is None:
-            floorplan = model.analysis(floorplan_params)
+            self.user_floorplan = model.analysis(floorplan_params)
             json_dir = make_build_dir(prefix="vitis_floorplan_")
             json_file = json_dir + "/floorplan.json"
             model.set_metadata_prop("floorplan_json", json_file)
             with open(json_file, "w") as f:
-                json.dump(floorplan, f, indent=4)
+                json.dump(self.user_floorplan, f, indent=4)
         else:
             model.set_metadata_prop("floorplan_json", self.user_floorplan)
             model = model.transform(ApplyConfig(self.user_floorplan))
 
+        try:
+            default_slr = self.user_floorplan["Defaults"]["slr"][0]
+        except (KeyError, IndexError, TypeError):
+            default_slr = -1
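+        # illustrative note (assumption, not part of the original change): a
+        # user_floorplan dict shaped like {"Defaults": {"slr": [0]}} yields
+        # default_slr = 0; a missing key or a file-path string falls back to -1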
+
         # perform DWC and FIFO specific adjustments
         unassigned_nodes = 0
         for node in model.graph.node:
@@ -75,6 +80,7 @@ class Floorplan(Transformation):
             node_slr = node_inst.get_nodeattr("slr")
             if node_slr == -1:
                 unassigned_nodes += 1
+                node_inst.set_nodeattr("slr", default_slr)
             if node.op_type == "StreamingDataWidthConverter_Batch":
                 # if we have SLR assignment already. use that
                 if node_slr != -1:
@@ -100,8 +106,8 @@ class Floorplan(Transformation):
         if unassigned_nodes > 0:
             warnings.warn(
                 str(unassigned_nodes)
-                + " nodes have no entry in the provided floorplan "
-                + "and no default value was set"
+                + " nodes have no entry in the provided floorplan,"
+                + " SLR was set to " + str(default_slr)
             )
 
         # partition id generation
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 0f2b8ef6a4c0858cd98218538930c97c6df2ad9d..c8df80659d30e1855fc658bad83c3fe9bccb9bf9 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -44,8 +44,8 @@ class InsertDWC(Transformation):
         for n in graph.node:
             node_ind += 1
             if _suitable_node(n):
-                for n_output in n.output:
-                    consumers = model.find_consumers(n_output)
+                for output_name in n.output:
+                    consumers = model.find_consumers(output_name)
                     if consumers is None:
                         continue
                     if len(consumers) > 1:
@@ -59,7 +59,22 @@ class InsertDWC(Transformation):
                         n0 = getCustomOp(n)
                         n1 = getCustomOp(consumer)
                         n0_out_shape = n0.get_folded_output_shape()
-                        n1_in_shape = n1.get_folded_input_shape()
+
+                        # If FC and external mem, it could be connected to input 1
+                        if (
+                            consumer.op_type == "StreamingFCLayer_Batch"
+                            and n1.get_nodeattr("mem_mode") == "external"
+                        ):
+                            # get input idx
+                            in_idx = None
+                            for idx, n_input in enumerate(consumer.input):
+                                if output_name == n_input:
+                                    in_idx = idx
+                            assert in_idx is not None, "Malformed model"
+                            n1_in_shape = n1.get_folded_input_shape(in_idx)
+                        else:
+                            n1_in_shape = n1.get_folded_input_shape()
+
                         if n0_out_shape[-1] != n1_in_shape[-1]:
                             graph_modified = True
                             # determine dwc inwidth
@@ -82,7 +97,7 @@ class InsertDWC(Transformation):
 
                             dwc_node = oh.make_node(
                                 "StreamingDataWidthConverter_Batch",
-                                [n_output],
+                                [output_name],
                                 [dwc_output_tensor.name],
                                 domain="finn.custom_op.fpgadataflow",
                                 backend="fpgadataflow",
@@ -96,7 +111,7 @@ class InsertDWC(Transformation):
 
                             # set dwc output tensor as new input tensor of second node
                             for idx, inp in enumerate(consumer.input):
-                                if inp == n_output:
+                                if inp == output_name:
                                     consumer.input[idx] = dwc_output_tensor.name
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 67143547557a9b24b311e69cff6f885f8745cd3c..27055a4fd29dba3849c0e4a889f27802f8c36081 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -33,7 +33,6 @@ from finn.util.basic import get_by_name
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.transformation.general import SortGraph
-import finn.core.data_layout as DataLayout
 import math
 import numpy as np
 
@@ -48,6 +47,45 @@ class InsertIODMA(Transformation):
         ), "max_intfwidth must be a power of 2"
         self.max_intfwidth = max_intfwidth
 
+    def get_mem_init(self, weights, pe, simd):
+        """
+        Returns matrix ready for pack_innermost_dim_as_hex_string with
+        reverse=False (finn.util.data_packing) to return the memory init file
+        little endian packed.
+        That is, get_mem_init returns:
+        elem(pe,simd)
+        addr = 0: [(pe-1,simd-1),(pe-1,simd-2),...(0,1),(0,0)]
+        addr = 1: [(pe-1,simd*2-1),.......(0,simd+1),(0,simd)]
+        .
+        """
+
+        # TODO: refactor this into streamingfclayer_batch.py, could go into
+        # make_weight_file except it doesn't write a file but returns an npy
+        # array instead
+        w_shape = weights.shape
+        assert len(w_shape) == 2, "weights with incorrect number of dims"
+        inp_w, out_w = w_shape
+
+        assert out_w % pe == 0, "Malformed weight matrix"
+        assert inp_w % simd == 0, "Malformed weight matrix"
+        reshaped_w = np.zeros(inp_w * out_w).reshape(-1, pe * simd)
+
+        addr = 0
+        for fr in range(out_w // pe):
+            for fc in range(inp_w // simd):
+                w0_lower = fc * simd
+                w0_upper = (fc + 1) * simd
+                w1_lower = fr * pe
+                w1_upper = (fr + 1) * pe
+                tile = weights[w0_lower:w0_upper, w1_lower:w1_upper]
+                for p in range(pe):
+                    rw0_lower = p * simd
+                    rw0_upper = (p + 1) * simd
+                    reshaped_w[addr, rw0_lower:rw0_upper] = tile[:, p].transpose()
+                addr += 1
+        reshaped_w = np.flip(reshaped_w, axis=-1)
+        return reshaped_w
+
     def apply(self, model):
         # only makes sense for a pure fpgadataflow graph -- so we check!
         all_nodes = list(model.graph.node)
@@ -59,8 +97,7 @@ class InsertIODMA(Transformation):
         fc_extw_nodes = list(
             filter(
                 lambda x: x.op_type == "StreamingFCLayer_Batch"
-                and get_by_name(x.attribute, "mem_mode") is not None
-                and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") == "external"
+                and getCustomOp(x).get_nodeattr("mem_mode") == "external"
                 and model.find_producer(x.input[1]) is None,
                 all_nodes,
             )
@@ -78,11 +115,6 @@ class InsertIODMA(Transformation):
             return (model, False)
         else:
             if final_node.op_type != "IODMA":
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(graph_out_name) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_out_name) == DataLayout.NC
-                ), "Data layout of output tensor must be NHWC or NC"
                 out_shape = model.get_tensor_shape(graph_out_name)
                 out_dtype = model.get_tensor_datatype(graph_out_name)
                 final_node_inst = getCustomOp(final_node)
@@ -123,11 +155,6 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.node.append(dma_node)
             if first_node.op_type != "IODMA":
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(graph_in_name) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
-                ), "Data layout of input tensor must be NHWC or NC"
                 in_shape = model.get_tensor_shape(graph_in_name)
                 in_dtype = model.get_tensor_datatype(graph_in_name)
                 first_node_inst = getCustomOp(first_node)
@@ -168,11 +195,7 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.node.insert(0, dma_node)
             for fc_node in fc_extw_nodes:
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(fc_node.input[1]) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
-                ), "Data layout of tensors must be NHWC or NC"
+                fc_inst = getCustomOp(fc_node)
                 fc_w_name = fc_node.input[1]
                 w_shape = model.get_tensor_shape(fc_w_name)
                 w_dtype = model.get_tensor_datatype(fc_w_name)
@@ -185,21 +208,24 @@ class InsertIODMA(Transformation):
                 # calculate width of stream output from DMA
                 pe = get_by_name(fc_node.attribute, "PE").i
                 simd = get_by_name(fc_node.attribute, "SIMD").i
-                assert pe * simd == w_shape[0], "Malformed weight matrix"
-                streamWidth = simd * pe * w_dtype.bitwidth()
+                streamWidth = fc_inst.get_weightstream_width_padded()
                 # make new buffer
+                W = model.get_initializer(fc_w_name)
+                iodma_mem = self.get_mem_init(W, pe, simd)
+                model.set_initializer(fc_w_name, iodma_mem)
+
                 fc_node_in = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
                 )
                 model.graph.value_info.append(fc_node_in)
                 model.set_tensor_datatype(fc_node_in.name, w_dtype)
-                model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name))
+                model.set_initializer(fc_node_in.name, W)
                 dma_node = oh.make_node(
                     "IODMA",
                     [fc_w_name],
                     [fc_node_in.name],
-                    numInputVectors=[w_shape[1]],
-                    NumChannels=w_shape[0],
+                    numInputVectors=[iodma_mem.shape[0]],
+                    NumChannels=pe * simd,
                     dataType=str(w_dtype.name),
                     intfWidth=intfwidth,
                     streamWidth=streamWidth,
diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py
index 6d37f567c9a20cf692df126c1c3560324b61d06d..84d3f4cd94c9e0870b90941f7c46447da4cee631 100644
--- a/src/finn/transformation/fpgadataflow/make_deployment.py
+++ b/src/finn/transformation/fpgadataflow/make_deployment.py
@@ -26,7 +26,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import warnings
 import os
 import subprocess
 from distutils.dir_util import copy_tree
@@ -98,22 +97,17 @@ class DeployToPYNQ(Transformation):
         copy_tree(pynq_driver_dir, deployment_dir)
         model.set_metadata_prop("pynq_deploy_dir", deployment_dir)
         model.set_metadata_prop("exec_mode", "remote_pynq")
-        if self.password == "":
-            prefix = ""  # assume we are using an ssh key
-            warnings.warn("Empty password, make sure you've set up an ssh key")
-        else:
-            prefix = "sshpass -p %s " % self.password
 
         # create target directory on PYNQ board
-        cmd = prefix + 'ssh {}@{} -p {} "mkdir -p {}"'.format(
+        cmd = 'ssh {}@{} -p {} "mkdir -p {}"'.format(
             self.username, self.ip, self.port, self.target_dir
         )
         bash_command = ["/bin/bash", "-c", cmd]
         process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
         process_compile.communicate()
-        # copy directory to PYNQ board using scp and sshpass
-        cmd = prefix + "scp -P{} -r {} {}@{}:{}".format(
-            self.port, deployment_dir, self.username, self.ip, self.target_dir,
+        # copy directory to PYNQ board using scp
+        cmd = "scp -P{} -r {} {}@{}:{}".format(
+            self.port, deployment_dir, self.username, self.ip, self.target_dir
         )
         bash_command = ["/bin/bash", "-c", cmd]
         process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 42f18d9a812d2db2119351dabfbb38e68c33194e..6ab12548abbcbe00496101bd146b2c9b873204c8 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -38,12 +38,38 @@ import warnings
 import pkg_resources as pk
 from . import template_driver
 from finn.core.modelwrapper import ModelWrapper
+import numpy as np
+
+from finn.util.data_packing import (
+    pack_innermost_dim_as_hex_string,
+    hexstring2npbytearray,
+)
+from finn.util.basic import roundup_to_integer_multiple
+
+
+def to_external_tensor(init, w_dtype):
+    """Return an appropriately formatted and packed numpy byte array for given
+    external parameter tensor."""
+
+    weight_width = init.shape[1] * w_dtype.bitwidth()
+    weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+    hex_init = pack_innermost_dim_as_hex_string(
+        init, w_dtype, weight_width_padded, prefix="0x"
+    )
+    ext_weight = np.array([], dtype=np.uint8)
+    for line in hex_init:
+        array_line = [
+            x for x in reversed(hexstring2npbytearray(line, remove_prefix="0x"))
+        ]
+        ext_weight = np.append(ext_weight, array_line)
+
+    return ext_weight
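+    # rough illustration (assumption, not part of the original change): for an
+    # (N, 4) tensor of an 8-bit datatype, each row packs into 32 bits = 4
+    # bytes, so ext_weight holds 4*N uint8 entries, least-significant byte
+    # first within each row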
 
 
 class MakePYNQDriver(Transformation):
     """Create PYNQ Python code to correctly interface the generated
     accelerator, including data packing/unpacking. Should be called
-    after conversion to HLS layers and folding, but prior to the creation of
+    after conversion to HLS layers, folding, and the creation of
     dataflow partitions for correct operation.
 
     platform: one of ["zynq-iodma", "alveo"]
@@ -123,6 +149,35 @@ class MakePYNQDriver(Transformation):
         i_tensor_shape_packed = i_tensor_dummy_packed.shape
         o_tensor_shape_packed = o_tensor_dummy_packed.shape
 
+        # generate external weights npy files
+        weights_dir = pynq_driver_dir + "/runtime_weights"
+
+        os.makedirs(weights_dir)
+        idma_idx = 0
+        ext_weight_dma_cnt = 0
+
+        for node in model.graph.node:
+            assert (
+                node.op_type == "StreamingDataflowPartition"
+            ), "CreateDataflowPartition needs to be applied before driver generation"
+
+            producer = model.find_producer(node.input[0])
+            init_tensor = model.get_initializer(node.input[0])
+
+            if producer is None:  # input dma?
+                idma_name = "idma" + str(idma_idx)
+                if init_tensor is not None:  # input weights dma?
+                    ext_weight_dma_cnt += 1
+                    w_dtype = model.get_tensor_datatype(node.input[0])
+                    init_external_tensor = to_external_tensor(init_tensor, w_dtype)
+                    np.save(
+                        weights_dir + "/" + idma_name + ".npy", init_external_tensor
+                    )
+                else:
+                    net_input_name = idma_name
+
+                idma_idx += 1
+
         # fill in the driver template
         driver_py = pynq_driver_dir + "/driver.py"
         driver = template_driver.pynq_driver_template
@@ -146,6 +201,8 @@ class MakePYNQDriver(Transformation):
         driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
         driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
+        driver = driver.replace("$INPUT_DMA_NAME$", "'%s'" % net_input_name)
+        driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt))
 
         with open(driver_py, "w") as f:
             f.write(driver)
@@ -172,25 +229,35 @@ class MakePYNQDriver(Transformation):
         shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core")
 
         # generate weight files for runtime-writable layers
-        weights_dir = pynq_driver_dir + "/runtime_weights"
-        rt_layer_ind = 0
-        os.makedirs(weights_dir)
-        for node in model.graph.node:
-            if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
-                node_inst = getCustomOp(node)
-                is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
-                if is_rt_weights == 1:
-                    fcl_w = model.get_initializer(node.input[1])
-                    w_filename = weights_dir + "/%d_%s.dat" % (rt_layer_ind, node.name)
-                    node_inst.make_weight_file(fcl_w, "decoupled_runtime", w_filename)
-                    rt_layer_ind += 1
-            elif node.op_type == "StreamingDataflowPartition":
-                warnings.warn(
-                    """Please call MakePYNQDriver prior to
-                CreateDataflowPartition. Can only extract runtime-writable
-                weights from HLSCustomOp instances and not StreamingDataflowPartition.
-                """
-                )
-            else:
-                continue
+
+        for sdp_ind, sdp_node in enumerate(model.graph.node):
+            assert sdp_node.op_type == "StreamingDataflowPartition"
+            # get dataflow model
+            sdp_node = getCustomOp(sdp_node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            dataflow_model = ModelWrapper(dataflow_model_filename)
+            rt_layer_ind = 0
+            for node in dataflow_model.graph.node:
+                if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
+                    node_inst = getCustomOp(node)
+                    is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
+                    if is_rt_weights == 1:
+                        fcl_w = dataflow_model.get_initializer(node.input[1])
+                        w_filename = weights_dir + "/%d_%d_%s.dat" % (
+                            sdp_ind,
+                            rt_layer_ind,
+                            node.name,
+                        )
+                        node_inst.make_weight_file(
+                            fcl_w, "decoupled_runtime", w_filename
+                        )
+                        rt_layer_ind += 1
+                elif node.op_type == "StreamingDataflowPartition":
+                    warnings.warn(
+                        "Nested StreamingDataflowPartitions are not supported"
+                    )
+                else:
+                    continue
+
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 1ac7ee178531e745bf68405d1ae9df35c0c216fb..3dab426ccf9bab73ddac83299bdc47f89ea46bdc 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -286,7 +286,10 @@ class MakeZYNQProject(Transformation):
 
 
 class ZynqBuild(Transformation):
-    """Best-effort attempt at building the accelerator for Zynq."""
+    """Best-effort attempt at building the accelerator for Zynq.
+    It assumes the model has only fpgadataflow nodes
+
+    """
 
     def __init__(self, platform, period_ns, enable_debug=False):
         super().__init__()
@@ -300,7 +303,6 @@ class ZynqBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            MakePYNQDriver(platform="zynq-iodma"),
             InsertIODMA(64),
             InsertDWC(),
             Floorplan(),
@@ -335,6 +337,10 @@ class ZynqBuild(Transformation):
         model = model.transform(
             MakeZYNQProject(self.platform, enable_debug=self.enable_debug)
         )
+
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "zynq-iodma")
+
+        # create driver
+        model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py
index b595205714d8cb630816d2b42fe96640e49e506e..5265835dd2530a5c93ceefbef629a43d6f33de52 100644
--- a/src/finn/transformation/fpgadataflow/template_driver.py
+++ b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -77,7 +77,9 @@ io_shape_dict = {
     "ishape_folded" : $INPUT_SHAPE_FOLDED$,
     "oshape_folded" : $OUTPUT_SHAPE_FOLDED$,
     "ishape_packed" : $INPUT_SHAPE_PACKED$,
-    "oshape_packed" : $OUTPUT_SHAPE_PACKED$
+    "oshape_packed" : $OUTPUT_SHAPE_PACKED$,
+    "input_dma_name" : $INPUT_DMA_NAME$,
+    "number_of_external_weights": $EXT_WEIGHT_NUM$
 }
 
 if __name__ == "__main__":
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index e52fb14b158a7927311d1b7e90067fea4bde6e27..c52dfcf0cde4cbbb393786809852bc965c2856db 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -28,6 +28,7 @@
 
 import os
 import subprocess
+import json
 
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.base import Transformation
@@ -38,14 +39,17 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 )
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
-from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.floorplan import Floorplan
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    RemoveUnusedTensors,
+)
 from finn.util.basic import make_build_dir
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from . import templates
@@ -89,63 +93,47 @@ class CreateVitisXO(Transformation):
         _check_vitis_envvars()
         vivado_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
         stitched_ip_dir = vivado_proj_dir + "/ip"
+        interfaces = json.loads(model.get_metadata_prop("vivado_stitch_ifnames"))
         args_string = []
-        m_axis_idx = 0
-        s_axis_idx = 0
+        arg_id = 0
         # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface
         # developed from instructions in UG1393 (v2019.2) and package_xo documentation
         # package_xo is responsible for generating the kernel xml
-        ifnames = eval(model.get_metadata_prop("vivado_stitch_ifnames"))
         assert (
-            len(ifnames["axilite"]) <= 1
+            len(interfaces["axilite"]) <= 1
         ), "CreateVitisXO supports max 1 AXI lite interface"
-        if len(ifnames["axilite"]) == 1:
-            axilite_intf_name = ifnames["axilite"][0]
-        else:
-            axilite_intf_name = None
-
-        for node in model.graph.node:
-            node_inst = getCustomOp(node)
-            arg_id = 0
-            if node.op_type == "TLastMarker":
-                stream_width = node_inst.get_nodeattr("StreamWidth")
-                # add a stream input or output port, based on direction
-                if node_inst.get_nodeattr("Direction") == "in":
-                    args_string.append(
-                        "{in:4:%s:s_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
-                        % (str(arg_id), s_axis_idx, str(stream_width))
-                    )
-                    s_axis_idx += 1
-                else:
-                    args_string.append(
-                        "{out:4:%s:m_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
-                        % (str(arg_id), m_axis_idx, str(stream_width))
+        axilite_intf_name = None
+        if len(interfaces["axilite"]) == 1:
+            axilite_intf_name = interfaces["axilite"][0]
+            if len(interfaces["aximm"]) > 0:
+                args_string.append(
+                    "{addr:1:%s:%s:0x8:0x10:ap_uint&lt;%s>*:0}"
+                    % (
+                        str(arg_id),
+                        interfaces["aximm"][0][0],
+                        str(interfaces["aximm"][0][1]),
                     )
-                    m_axis_idx += 1
+                )
                 arg_id += 1
-                # add a axilite port if dynamic
-                # add a count parameter if dynamic
-                if node_inst.get_nodeattr("DynIters") == 1:
-                    assert axilite_intf_name is not None
-                    args_string.append(
-                        "{numReps:0:%s:%s:0x4:0x10:uint:0}"
-                        % (str(arg_id), axilite_intf_name)
-                    )
-                    arg_id += 1
-            elif node.op_type == "IODMA":
-                port_width = node_inst.get_nodeattr("intfWidth")
-                # add an address parameter
-                # add a count parameter
                 args_string.append(
-                    "{addr:1:%s:m_axi_gmem0:0x8:0x10:ap_uint&lt;%s>*:0}"
-                    % (str(arg_id), str(port_width))
+                    "{numReps:0:%s:%s:0x4:0x1C:uint:0}"
+                    % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
+            else:
                 args_string.append(
-                    "{numReps:0:%s:%s:0x4:0x1C:uint:0}"
+                    "{numReps:0:%s:%s:0x4:0x10:uint:0}"
                     % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
+        for intf in interfaces["s_axis"] + interfaces["m_axis"]:
+            stream_width = intf[1]
+            stream_name = intf[0]
+            args_string.append(
+                "{%s:4:%s:%s:0x0:0x0:ap_uint&lt;%s>:0}"
+                % (stream_name, str(arg_id), stream_name, str(stream_width))
+            )
+            arg_id += 1
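+            # e.g. (illustrative) a 64-bit stream named m_axis_0 at arg_id 2
+            # yields "{m_axis_0:4:2:m_axis_0:0x0:0x0:ap_uint&lt;64>:0}"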
 
         # save kernel xml then run package_xo
         xo_name = self.ip_name + ".xo"
@@ -219,8 +207,6 @@ class VitisLink(Transformation):
             # has axis, aximm and axilite
             # everything else is axis-only
             # assume only one connection from each ip to the next
-            # all aximm allocated to DDR[0]
-            # all kernels allocated to SLR0
             producer = model.find_producer(node.input[0])
             consumer = model.find_consumers(node.output[0])
             # define kernel instances
@@ -237,13 +223,35 @@ class VitisLink(Transformation):
             else:
                 instance_names[node.name] = node.name
                 config.append("nk=%s:1:%s" % (node.name, instance_names[node.name]))
-            # assign SLRs
-            config.append("slr=%s:SLR0" % instance_names[node.name])
+            # explicitly assign SLRs if the slr attribute is not -1
+            node_slr = sdp_node.get_nodeattr("slr")
+            if node_slr != -1:
+                config.append("slr=%s:SLR%d" % (instance_names[node.name], node_slr))
             # assign memory banks
             if producer is None or consumer is None:
-                config.append(
-                    "sp=%s.m_axi_gmem0:DDR[%d]" % (instance_names[node.name], 0)
-                )
+                node_mem_port = sdp_node.get_nodeattr("mem_port")
+                if node_mem_port == "":
+                    # configure good defaults based on board
+                    if "u50" in self.platform or "u280" in self.platform:
+                        # Use HBM where available (also U50 does not have DDR)
+                        mem_type = "HBM"
+                        mem_idx = 0
+                    elif "u200" in self.platform:
+                        # Use DDR controller in static region of U200
+                        mem_type = "DDR"
+                        mem_idx = 1
+                    elif "u250" in self.platform:
+                        # Use DDR controller on the node's SLR if set, otherwise 0
+                        mem_type = "DDR"
+                        if node_slr == -1:
+                            mem_idx = 0
+                        else:
+                            mem_idx = node_slr
+                    else:
+                        mem_type = "DDR"
+                        mem_idx = 1
+                    node_mem_port = "%s[%d]" % (mem_type, mem_idx)
+                config.append("sp=%s.m_axi_gmem0:%s" % (instance_names[node.name], node_mem_port))
             # connect streams
             if producer is not None:
                 for i in range(len(node.input)):
@@ -342,6 +350,7 @@ class VitisLink(Transformation):
 
 class VitisBuild(Transformation):
     """Best-effort attempt at building the accelerator with Vitis.
+    Assumes the model contains only fpgadataflow nodes.
 
     fpga_part: string identifying the target FPGA
     period_ns: target clock period
@@ -351,7 +360,8 @@ class VitisBuild(Transformation):
     floorplan_file: path to a JSON containing a dictionary with SLR assignments
                     for each node in the ONNX graph. Must be parse-able by
                     the ApplyConfig transform.
-
+    enable_link: enable linking kernels (.xo files), otherwise just synthesize
+                    them independently.
     """
 
     def __init__(
@@ -362,6 +372,7 @@ class VitisBuild(Transformation):
         strategy=VitisOptStrategy.PERFORMANCE,
         enable_debug=False,
         floorplan_file=None,
+        enable_link=True,
     ):
         super().__init__()
         self.fpga_part = fpga_part
@@ -370,17 +381,14 @@ class VitisBuild(Transformation):
         self.strategy = strategy
         self.enable_debug = enable_debug
         self.floorplan_file = floorplan_file
+        self.enable_link = enable_link
 
     def apply(self, model):
         _check_vitis_envvars()
         # first infer layouts
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
-        prep_transforms = [
-            MakePYNQDriver(platform="alveo"),
-            InsertIODMA(512),
-            InsertDWC(),
-        ]
+        prep_transforms = [InsertIODMA(512), InsertDWC()]
         for trn in prep_transforms:
             model = model.transform(trn)
             model = model.transform(GiveUniqueNodeNames())
@@ -399,9 +407,7 @@ class VitisBuild(Transformation):
             dataflow_model_filename = sdp_node.get_nodeattr("model")
             kernel_model = ModelWrapper(dataflow_model_filename)
             kernel_model = kernel_model.transform(InsertFIFO())
-            kernel_model = kernel_model.transform(
-                InsertTLastMarker(both=True, external=False, dynamic=False)
-            )
+            kernel_model = kernel_model.transform(RemoveUnusedTensors())
             kernel_model = kernel_model.transform(GiveUniqueNodeNames())
             kernel_model.save(dataflow_model_filename)
             kernel_model = kernel_model.transform(
@@ -419,15 +425,18 @@ class VitisBuild(Transformation):
             kernel_model.set_metadata_prop("platform", "alveo")
             kernel_model.save(dataflow_model_filename)
         # Assemble design from kernels
-        model = model.transform(
-            VitisLink(
-                self.platform,
-                round(1000 / self.period_ns),
-                strategy=self.strategy,
-                enable_debug=self.enable_debug,
+        if self.enable_link:
+            model = model.transform(
+                VitisLink(
+                    self.platform,
+                    round(1000 / self.period_ns),
+                    strategy=self.strategy,
+                    enable_debug=self.enable_debug,
+                )
             )
-        )
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "alveo")
 
+        # create driver
+        model = model.transform(MakePYNQDriver(platform="alveo"))
         return (model, False)
diff --git a/tests/end2end/test_end2end_access_board.py b/tests/end2end/test_end2end_access_board.py
new file mode 100644
index 0000000000000000000000000000000000000000..21b495c74ca8179e1f9e1e3955e665c4c81763b8
--- /dev/null
+++ b/tests/end2end/test_end2end_access_board.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import subprocess
+from finn.util.test import get_build_env
+
+
+@pytest.mark.board
+def test_end2end_access_board():
+    build_env = get_build_env("zynq", 5)
+    if build_env["ip"] == "":
+        pytest.skip("PYNQ board IP address not specified")
+    remote_cmd_base = [
+        "ssh",
+        "-o",
+        "PreferredAuthentications=publickey",
+        "-o",
+        "PasswordAuthentication=no",
+        "%s@%s" % (build_env["username"], build_env["ip"]),
+    ]
+    test_text = "BoardIsAccessible"
+    echo_cmd = remote_cmd_base + ["echo %s" % test_text]
+    verif_res = subprocess.run(
+        echo_cmd, stdout=subprocess.PIPE, universal_newlines=True
+    )
+    assert verif_res.returncode == 0
+    assert verif_res.stdout.split("\n")[0] == test_text
diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..4dba19f586e3235a582e72d4c3936a60ebc4a703
--- /dev/null
+++ b/tests/end2end/test_end2end_cybsec_mlp.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import torch
+from brevitas.nn import QuantLinear, QuantReLU
+import torch.nn as nn
+import numpy as np
+from brevitas.core.quant import QuantType
+from brevitas.nn import QuantIdentity
+import brevitas.onnx as bo
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+import os
+import shutil
+from finn.util.test import get_build_env, load_test_checkpoint_or_skip
+import pytest
+from finn.util.basic import make_build_dir
+import pkg_resources as pk
+import json
+import wget
+import subprocess
+
+target_clk_ns = 10
+build_kind = "zynq"
+build_dir = os.environ["FINN_BUILD_DIR"]
+
+
+def get_checkpoint_name(step):
+    if step == "build":
+        # checkpoint for build step is an entire dir
+        return build_dir + "/end2end_cybsecmlp_build"
+    else:
+        # other checkpoints are onnx files
+        return build_dir + "/end2end_cybsecmlp_%s.onnx" % (step)
+
+
+class CybSecMLPForExport(nn.Module):
+    def __init__(self, my_pretrained_model):
+        super(CybSecMLPForExport, self).__init__()
+        self.pretrained = my_pretrained_model
+        self.qnt_output = QuantIdentity(
+            quant_type=QuantType.BINARY, bit_width=1, min_val=-1.0, max_val=1.0
+        )
+
+    def forward(self, x):
+        # assume x contains bipolar {-1,1} elems
+        # shift from {-1,1} -> {0,1} since that is the
+        # input range for the trained network
+        x = (x + torch.tensor([1.0])) / 2.0
+        out_original = self.pretrained(x)
+        out_final = self.qnt_output(out_original)  # output as {-1,1}
+        return out_final
+
+
+def test_end2end_cybsec_mlp_export():
+    assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/")
+    # load up trained net in Brevitas
+    input_size = 593
+    hidden1 = 64
+    hidden2 = 64
+    hidden3 = 64
+    weight_bit_width = 2
+    act_bit_width = 2
+    num_classes = 1
+    model = nn.Sequential(
+        QuantLinear(input_size, hidden1, bias=True, weight_bit_width=weight_bit_width),
+        nn.BatchNorm1d(hidden1),
+        nn.Dropout(0.5),
+        QuantReLU(bit_width=act_bit_width),
+        QuantLinear(hidden1, hidden2, bias=True, weight_bit_width=weight_bit_width),
+        nn.BatchNorm1d(hidden2),
+        nn.Dropout(0.5),
+        QuantReLU(bit_width=act_bit_width),
+        QuantLinear(hidden2, hidden3, bias=True, weight_bit_width=weight_bit_width),
+        nn.BatchNorm1d(hidden3),
+        nn.Dropout(0.5),
+        QuantReLU(bit_width=act_bit_width),
+        QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width),
+    )
+    trained_state_dict = torch.load(assets_dir + "/state_dict.pth")[
+        "models_state_dict"
+    ][0]
+    model.load_state_dict(trained_state_dict, strict=False)
+    W_orig = model[0].weight.data.detach().numpy()
+    # pad the second (593-sized) dimension with 7 zeroes at the end
+    W_new = np.pad(W_orig, [(0, 0), (0, 7)])
+    model[0].weight.data = torch.from_numpy(W_new)
+    model_for_export = CybSecMLPForExport(model)
+    export_onnx_path = get_checkpoint_name("export")
+    input_shape = (1, 600)
+    bo.export_finn_onnx(model_for_export, input_shape, export_onnx_path)
+    assert os.path.isfile(export_onnx_path)
+    # fix input datatype
+    finn_model = ModelWrapper(export_onnx_path)
+    finnonnx_in_tensor_name = finn_model.graph.input[0].name
+    finn_model.set_tensor_datatype(finnonnx_in_tensor_name, DataType.BIPOLAR)
+    finn_model.save(export_onnx_path)
+    assert tuple(finn_model.get_tensor_shape(finnonnx_in_tensor_name)) == (1, 600)
+    assert len(finn_model.graph.node) == 30
+    assert finn_model.graph.node[0].op_type == "Add"
+    assert finn_model.graph.node[1].op_type == "Div"
+    assert finn_model.graph.node[2].op_type == "MatMul"
+    assert finn_model.graph.node[-1].op_type == "MultiThreshold"
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_cybsec_mlp_build():
+    model_file = get_checkpoint_name("export")
+    load_test_checkpoint_or_skip(model_file)
+    build_env = get_build_env(build_kind, target_clk_ns)
+    output_dir = make_build_dir("test_end2end_cybsec_mlp_build")
+
+    cfg = build.DataflowBuildConfig(
+        output_dir=output_dir,
+        target_fps=1000000,
+        synth_clk_period_ns=target_clk_ns,
+        board=build_env["board"],
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.BITFILE,
+            build_cfg.DataflowOutputType.PYNQ_DRIVER,
+            build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
+        ],
+    )
+    build.build_dataflow_cfg(model_file, cfg)
+    # check the generated files
+    assert os.path.isfile(output_dir + "/time_per_step.json")
+    assert os.path.isfile(output_dir + "/final_hw_config.json")
+    assert os.path.isfile(output_dir + "/driver/driver.py")
+    est_cycles_report = output_dir + "/report/estimate_layer_cycles.json"
+    assert os.path.isfile(est_cycles_report)
+    est_res_report = output_dir + "/report/estimate_layer_resources.json"
+    assert os.path.isfile(est_res_report)
+    assert os.path.isfile(output_dir + "/report/estimate_network_performance.json")
+    assert os.path.isfile(output_dir + "/bitfile/finn-accel.bit")
+    assert os.path.isfile(output_dir + "/bitfile/finn-accel.hwh")
+    assert os.path.isfile(output_dir + "/report/post_synth_resources.xml")
+    assert os.path.isfile(output_dir + "/report/post_route_timing.rpt")
+    # examine the report contents
+    with open(est_cycles_report, "r") as f:
+        est_cycles_dict = json.load(f)
+        assert est_cycles_dict["StreamingFCLayer_Batch_0"] == 80
+        assert est_cycles_dict["StreamingFCLayer_Batch_1"] == 64
+    with open(est_res_report, "r") as f:
+        est_res_dict = json.load(f)
+        assert est_res_dict["total"]["LUT"] == 11360.0
+        assert est_res_dict["total"]["BRAM_18K"] == 36.0
+    shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build"))
+
+
+@pytest.mark.board
+def test_end2end_cybsec_mlp_run_on_hw():
+    build_env = get_build_env(build_kind, target_clk_ns)
+    assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/")
+    deploy_dir = get_checkpoint_name("build")
+    if not os.path.isdir(deploy_dir):
+        pytest.skip(deploy_dir + " not found from previous test step, skipping")
+    driver_dir = deploy_dir + "/driver"
+    assert os.path.isdir(driver_dir)
+    # put all assets into driver dir
+    shutil.copy(assets_dir + "/validate-unsw-nb15.py", driver_dir)
+    # put a copy of binarized dataset into driver dir
+    dataset_url = (
+        "https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1"
+    )
+    dataset_local = driver_dir + "/unsw_nb15_binarized.npz"
+    if not os.path.isfile(dataset_local):
+        wget.download(dataset_url, out=dataset_local)
+    assert os.path.isfile(dataset_local)
+    # create a shell script for running validation: 10 batches x 10 imgs
+    with open(driver_dir + "/validate.sh", "w") as f:
+        f.write(
+            """#!/bin/bash
+cd %s/driver
+echo %s | sudo -S python3.6 validate-unsw-nb15.py --batchsize=10 --limit_batches=10
+        """
+            % (
+                build_env["target_dir"] + "/end2end_cybsecmlp_build",
+                build_env["password"],
+            )
+        )
+    # set up rsync command
+    remote_target = "%s@%s:%s" % (
+        build_env["username"],
+        build_env["ip"],
+        build_env["target_dir"],
+    )
+    rsync_res = subprocess.run(["rsync", "-avz", deploy_dir, remote_target])
+    assert rsync_res.returncode == 0
+    remote_verif_cmd = [
+        "ssh",
+        "%s@%s" % (build_env["username"], build_env["ip"]),
+        "sh",
+        build_env["target_dir"] + "/end2end_cybsecmlp_build/driver/validate.sh",
+    ]
+    verif_res = subprocess.run(
+        remote_verif_cmd,
+        stdout=subprocess.PIPE,
+        universal_newlines=True,
+        input=build_env["password"],
+    )
+    assert verif_res.returncode == 0
+    log_output = verif_res.stdout.split("\n")
+    assert log_output[-3] == "batch 10 / 10 : total OK 93 NOK 7"
+    assert log_output[-2] == "Final accuracy: 93.000000"
diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa0ce7a6c6c6148ca28b58428ad60d7eb0347bea
--- /dev/null
+++ b/tests/end2end/test_ext_weights.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+import os
+import shutil
+from finn.util.test import get_build_env, load_test_checkpoint_or_skip
+import pytest
+from finn.util.basic import make_build_dir
+import pkg_resources as pk
+import wget
+import subprocess
+
+target_clk_ns = 10
+build_kind = "zynq"
+build_dir = os.environ["FINN_BUILD_DIR"]
+onnx_zip_url = "https://github.com/Xilinx/finn-examples"
+onnx_zip_url += "/releases/download/v0.0.1a/onnx-models-bnn-pynq.zip"
+onnx_zip_local = build_dir + "/onnx-models-bnn-pynq.zip"
+onnx_dir_local = build_dir + "/onnx-models-bnn-pynq"
+mnist_url = "https://raw.githubusercontent.com/fgnt/mnist/master"
+mnist_local = build_dir + "/mnist"
+mnist_files = [
+    "train-images-idx3-ubyte.gz",
+    "train-labels-idx1-ubyte.gz",
+    "t10k-images-idx3-ubyte.gz",
+    "t10k-labels-idx1-ubyte.gz",
+]
+
+
+def get_checkpoint_name(step):
+    if step == "build":
+        # checkpoint for build step is an entire dir
+        return build_dir + "/end2end_ext_weights_build"
+    elif step == "download":
+        return onnx_dir_local + "/tfc-w1a1.onnx"
+    else:
+        # other checkpoints are onnx files
+        return build_dir + "/end2end_ext_weights_%s.onnx" % (step)
+
+
+def test_end2end_ext_weights_download():
+    if not os.path.isfile(onnx_zip_local):
+        wget.download(onnx_zip_url, out=onnx_zip_local)
+    assert os.path.isfile(onnx_zip_local)
+    subprocess.check_output(["unzip", "-o", onnx_zip_local, "-d", onnx_dir_local])
+    assert os.path.isfile(get_checkpoint_name("download"))
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_ext_weights_build():
+    model_file = get_checkpoint_name("download")
+    load_test_checkpoint_or_skip(model_file)
+    build_env = get_build_env(build_kind, target_clk_ns)
+    folding_config_file = pk.resource_filename(
+        "finn.qnn-data", "test_ext_weights/tfc-w1a1-extw.json"
+    )
+    output_dir = make_build_dir("test_end2end_ext_weights_build")
+    cfg = build.DataflowBuildConfig(
+        output_dir=output_dir,
+        folding_config_file=folding_config_file,
+        synth_clk_period_ns=target_clk_ns,
+        board=build_env["board"],
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.BITFILE,
+            build_cfg.DataflowOutputType.PYNQ_DRIVER,
+            build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
+        ],
+    )
+    build.build_dataflow_cfg(model_file, cfg)
+    assert os.path.isfile(output_dir + "/deploy/bitfile/finn-accel.bit")
+    assert os.path.isfile(output_dir + "/deploy/bitfile/finn-accel.hwh")
+    assert os.path.isfile(output_dir + "/deploy/driver/driver.py")
+    assert os.path.isfile(output_dir + "/deploy/driver/runtime_weights/idma0.npy")
+    if os.path.isdir(get_checkpoint_name("build")):
+        shutil.rmtree(get_checkpoint_name("build"))
+    shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build"))
+
+
+@pytest.mark.board
+def test_end2end_ext_weights_dataset():
+    # make sure we have local copies of mnist dataset files
+    subprocess.check_output(["mkdir", "-p", mnist_local])
+    for f in mnist_files:
+        if not os.path.isfile(mnist_local + "/" + f):
+            wget.download(mnist_url + "/" + f, out=mnist_local + "/" + f)
+        assert os.path.isfile(mnist_local + "/" + f)
+    # rsync to board
+    build_env = get_build_env(build_kind, target_clk_ns)
+    mnist_target = "%s@%s:%s" % (build_env["username"], build_env["ip"], "/tmp/")
+
+    rsync_dataset_cmd = ["rsync", "-rv", mnist_local + "/", mnist_target]
+    subprocess.check_output(rsync_dataset_cmd)
+
+
+@pytest.mark.board
+def test_end2end_ext_weights_run_on_hw():
+    build_env = get_build_env(build_kind, target_clk_ns)
+    deploy_dir = get_checkpoint_name("build")
+    if not os.path.isdir(deploy_dir):
+        pytest.skip(deploy_dir + " not found from previous test step, skipping")
+    driver_dir = deploy_dir + "/driver"
+    assert os.path.isdir(driver_dir)
+    # create a shell script for running validation: 10 batches x 10 imgs
+    with open(driver_dir + "/validate.sh", "w") as f:
+        f.write(
+            """#!/bin/bash
+cd %s/driver
+echo %s | sudo -S python3.6 validate.py --dataset mnist --bitfile %s
+        """
+            % (
+                build_env["target_dir"] + "/end2end_ext_weights_build",
+                build_env["password"],
+                "../bitfile/finn-accel.bit",
+            )
+        )
+    # set up rsync command
+    remote_target = "%s@%s:%s" % (
+        build_env["username"],
+        build_env["ip"],
+        build_env["target_dir"],
+    )
+    rsync_res = subprocess.run(["rsync", "-avz", deploy_dir, remote_target])
+    assert rsync_res.returncode == 0
+    remote_verif_cmd = [
+        "ssh",
+        "%s@%s" % (build_env["username"], build_env["ip"]),
+        "sh",
+        build_env["target_dir"] + "/end2end_ext_weights_build/driver/validate.sh",
+    ]
+    verif_res = subprocess.run(
+        remote_verif_cmd,
+        stdout=subprocess.PIPE,
+        universal_newlines=True,
+        input=build_env["password"],
+    )
+    assert verif_res.returncode == 0
+    log_output = verif_res.stdout.split("\n")
+    assert log_output[-3] == "batch 100 / 100 : total OK 9296 NOK 704"
+    assert log_output[-2] == "Final accuracy: 92.960000"