diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index a8e05114c312028d18a006d10d5b210b44afb9d3..bd2338305ef24d98f582d09a140175a243c62c7e 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -12,7 +12,7 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=91fb6066927d965471e66e103fd5201ac217c755
+FINN_BASE_COMMIT=8908c6a3f6674c4fa790954bd41c23ee5bf053df
 BREVITAS_COMMIT=aff49758ec445d77c75721c7de3091a2a1797ca8
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
 HLSLIB_COMMIT=2e49322d1bbc4969ca293843bda1f3f9c05456fc
diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
index eba942ecc23dbf7d3a75b256c22b3e3fceb3475f..03ec339aff67f5fbbd01f5098936da3353c1ccdb 100644
--- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
+++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
@@ -105,19 +105,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
     "import finn.builder.build_dataflow as build\n",
     "import finn.builder.build_dataflow_config as build_cfg\n",
+    "import os\n",
     "\n",
     "model_file = \"cybsec-mlp-ready.onnx\"\n",
     "\n",
     "estimates_output_dir = \"output_estimates_only\"\n",
     "\n",
+    "#Delete previous run results if exist\n",
+    "if os.path.exists(estimates_output_dir):\n",
+    "    shutil.rmtree(estimates_output_dir)\n",
+    "    print(\"Previous run results deleted!\")\n",
+    "\n",
+    "\n",
     "cfg_estimates = build.DataflowBuildConfig(\n",
     "    output_dir          = estimates_output_dir,\n",
+    "    mvau_wwidth_max     = 80,\n",
     "    target_fps          = 1000000,\n",
     "    synth_clk_period_ns = 10.0,\n",
     "    fpga_part           = \"xc7z020clg400-1\",\n",
@@ -130,15 +138,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 2 碌s, sys: 1 碌s, total: 3 碌s\n",
-      "Wall time: 6.91 碌s\n",
       "Building dataflow accelerator from cybsec-mlp-ready.onnx\n",
       "Intermediate outputs will be generated in /tmp/finn_dev_maltanar\n",
       "Final outputs will be generated in output_estimates_only\n",
@@ -150,7 +156,9 @@
       "Running step: step_target_fps_parallelization [5/7]\n",
       "Running step: step_apply_folding_config [6/7]\n",
       "Running step: step_generate_estimate_reports [7/7]\n",
-      "Completed successfully\n"
+      "Completed successfully\n",
+      "CPU times: user 1.53 s, sys: 724 ms, total: 2.25 s\n",
+      "Wall time: 1.45 s\n"
      ]
     },
     {
@@ -159,7 +167,7 @@
        "0"
       ]
      },
-     "execution_count": 2,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -178,14 +186,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "build_dataflow.log  intermediate_models  report  time_per_step.json\r\n"
+      "intermediate_models  report  time_per_step.json\r\n"
      ]
     }
    ],
@@ -195,7 +203,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -221,7 +229,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -229,11 +237,11 @@
      "output_type": "stream",
      "text": [
       "{\r\n",
-      "  \"critical_path_cycles\": 272,\r\n",
-      "  \"max_cycles\": 80,\r\n",
-      "  \"max_cycles_node_name\": \"StreamingFCLayer_Batch_0\",\r\n",
-      "  \"estimated_throughput_fps\": 1250000.0,\r\n",
-      "  \"estimated_latency_ns\": 2720.0\r\n",
+      "  \"critical_path_cycles\": 252,\r\n",
+      "  \"max_cycles\": 64,\r\n",
+      "  \"max_cycles_node_name\": \"StreamingFCLayer_Batch_1\",\r\n",
+      "  \"estimated_throughput_fps\": 1562500.0,\r\n",
+      "  \"estimated_latency_ns\": 2520.0\r\n",
       "}"
      ]
     }
@@ -251,7 +259,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -264,19 +272,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'StreamingFCLayer_Batch_0': 80,\n",
+       "{'StreamingFCLayer_Batch_0': 60,\n",
        " 'StreamingFCLayer_Batch_1': 64,\n",
        " 'StreamingFCLayer_Batch_2': 64,\n",
        " 'StreamingFCLayer_Batch_3': 64}"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -296,27 +304,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'StreamingFCLayer_Batch_0': {'BRAM_18K': 27,\n",
-       "  'BRAM_efficiency': 0.15432098765432098,\n",
-       "  'LUT': 8149,\n",
+       "{'StreamingFCLayer_Batch_0': {'BRAM_18K': 36,\n",
+       "  'BRAM_efficiency': 0.11574074074074074,\n",
+       "  'LUT': 8184,\n",
        "  'URAM': 0,\n",
        "  'URAM_efficiency': 1,\n",
        "  'DSP': 0},\n",
        " 'StreamingFCLayer_Batch_1': {'BRAM_18K': 4,\n",
        "  'BRAM_efficiency': 0.1111111111111111,\n",
-       "  'LUT': 1435,\n",
+       "  'LUT': 1217,\n",
        "  'URAM': 0,\n",
        "  'URAM_efficiency': 1,\n",
        "  'DSP': 0},\n",
        " 'StreamingFCLayer_Batch_2': {'BRAM_18K': 4,\n",
        "  'BRAM_efficiency': 0.1111111111111111,\n",
-       "  'LUT': 1435,\n",
+       "  'LUT': 1217,\n",
        "  'URAM': 0,\n",
        "  'URAM_efficiency': 1,\n",
        "  'DSP': 0},\n",
@@ -326,10 +334,10 @@
        "  'URAM': 0,\n",
        "  'URAM_efficiency': 1,\n",
        "  'DSP': 0},\n",
-       " 'total': {'BRAM_18K': 36.0, 'LUT': 11360.0, 'URAM': 0.0, 'DSP': 0.0}}"
+       " 'total': {'BRAM_18K': 45.0, 'LUT': 10959.0, 'URAM': 0.0, 'DSP': 0.0}}"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -355,7 +363,7 @@
     "\n",
     "Once we have a configuration that gives satisfactory estimates, we can move on to generating the accelerator. We can do this in different ways depending on how we want to integrate the accelerator into a larger system. For instance, if we have a larger streaming system built in Vivado or if we'd like to re-use this generated accelerator as an IP component in other projects, the `STITCHED_IP` output product is a good choice. We can also use the `OOC_SYNTH` output product to get post-synthesis resource and clock frequency numbers for our accelerator.\n",
     "\n",
-    "<font color=\"red\">**FPGA'21 tutorial:** These next builds will take about 10 minutes to complete since multiple calls to Vivado and a call to RTL simulation are involved. While this is running, you can examine the generated files with noVNC -- it is running on (your AWS URL):6080/vnc.html:\n",
+    "<font color=\"red\">**FPGA'21 tutorial:** These next builds will take about 10 minutes to complete since multiple calls to Vivado and a call to RTL simulation are involved. While this is running, you can examine the generated files with noVNC -- it is running on (your AWS URL):6080/vnc.html\n",
     "\n",
     "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_StreamingFCLayer_Batch_XXXXXX`\n",
     "    \n",
@@ -365,7 +373,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -385,6 +393,7 @@
     "\n",
     "cfg_stitched_ip = build.DataflowBuildConfig(\n",
     "    output_dir          = rtlsim_output_dir,\n",
+    "    mvau_wwidth_max     = 80,\n",
     "    target_fps          = 1000000,\n",
     "    synth_clk_period_ns = 10.0,\n",
     "    fpga_part           = \"xc7z020clg400-1\",\n",
@@ -398,7 +407,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -426,8 +435,8 @@
       "Running step: step_synthesize_bitfile [15/16]\n",
       "Running step: step_deployment_package [16/16]\n",
       "Completed successfully\n",
-      "CPU times: user 3.69 s, sys: 756 ms, total: 4.45 s\n",
-      "Wall time: 7min 11s\n"
+      "CPU times: user 3.81 s, sys: 658 ms, total: 4.46 s\n",
+      "Wall time: 6min 23s\n"
      ]
     },
     {
@@ -436,7 +445,7 @@
        "0"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -462,9 +471,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 25,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "all_verilog_srcs.txt\t\t       ip\r\n",
+      "finn_vivado_stitch_proj.cache\t       make_project.sh\r\n",
+      "finn_vivado_stitch_proj.hw\t       make_project.tcl\r\n",
+      "finn_vivado_stitch_proj.ip_user_files  vivado.jou\r\n",
+      "finn_vivado_stitch_proj.srcs\t       vivado.log\r\n",
+      "finn_vivado_stitch_proj.xpr\r\n"
+     ]
+    }
+   ],
    "source": [
     "! ls {rtlsim_output_dir}/stitched_ip"
    ]
@@ -478,9 +500,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 26,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "estimate_layer_resources_hls.json  rtlsim_performance.json\r\n",
+      "ooc_synth_and_timing.json\r\n"
+     ]
+    }
+   ],
    "source": [
     "! ls {rtlsim_output_dir}/report"
    ]
@@ -494,9 +525,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\r\n",
+      "  \"vivado_proj_folder\": \"/tmp/finn_dev_maltanar/synth_out_of_context_g_urbes5/results_finn_design_wrapper\",\r\n",
+      "  \"LUT\": 8667.0,\r\n",
+      "  \"FF\": 9063.0,\r\n",
+      "  \"DSP\": 0.0,\r\n",
+      "  \"BRAM\": 22.0,\r\n",
+      "  \"WNS\": 0.946,\r\n",
+      "  \"\": 0,\r\n",
+      "  \"fmax_mhz\": 110.44842058758559,\r\n",
+      "  \"estimated_throughput_fps\": 1725756.5716810247\r\n",
+      "}"
+     ]
+    }
+   ],
    "source": [
     "! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json"
    ]
@@ -505,14 +554,31 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In `rtlsim_performance.json` we can find the steady-state throughput and latency for the accelerator, as obtained by rtlsim. If the DRAM bandwidth numbers reported here are below what the hardware platform is capable of (i.e. the accelerator is not memory-bound), you can expect the same steady-state throughput in real hardware."
+    "In `rtlsim_performance.json` we can find the steady-state throughput and latency for the accelerator, as obtained by rtlsim. If the DRAM bandwidth numbers reported here are below what the hardware platform is capable of (i.e. the accelerator is not memory-bound), you can expect the same steady-state throughput (excluding any software/driver overheads) in real hardware."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 28,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\r\n",
+      "  \"cycles\": 643,\r\n",
+      "  \"runtime[ms]\": 0.00643,\r\n",
+      "  \"throughput[images/s]\": 1088646.967340591,\r\n",
+      "  \"DRAM_in_bandwidth[Mb/s]\": 81.64852255054431,\r\n",
+      "  \"DRAM_out_bandwidth[Mb/s]\": 0.13608087091757387,\r\n",
+      "  \"fclk[mhz]\": 100.0,\r\n",
+      "  \"N\": 7,\r\n",
+      "  \"latency_cycles\": 211\r\n",
+      "}"
+     ]
+    }
+   ],
    "source": [
     "! cat {rtlsim_output_dir}/report/rtlsim_performance.json"
    ]
@@ -526,9 +592,62 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 29,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\r\n",
+      "  \"Defaults\": {},\r\n",
+      "  \"StreamingFIFO_0\": {\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"depth\": 32,\r\n",
+      "    \"impl_style\": \"rtl\"\r\n",
+      "  },\r\n",
+      "  \"StreamingFCLayer_Batch_0\": {\r\n",
+      "    \"PE\": 16,\r\n",
+      "    \"SIMD\": 40,\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"resType\": \"lut\",\r\n",
+      "    \"mem_mode\": \"decoupled\",\r\n",
+      "    \"runtime_writeable_weights\": 0\r\n",
+      "  },\r\n",
+      "  \"StreamingDataWidthConverter_Batch_0\": {\r\n",
+      "    \"impl_style\": \"hls\"\r\n",
+      "  },\r\n",
+      "  \"StreamingFCLayer_Batch_1\": {\r\n",
+      "    \"PE\": 1,\r\n",
+      "    \"SIMD\": 64,\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"resType\": \"lut\",\r\n",
+      "    \"mem_mode\": \"decoupled\",\r\n",
+      "    \"runtime_writeable_weights\": 0\r\n",
+      "  },\r\n",
+      "  \"StreamingDataWidthConverter_Batch_1\": {\r\n",
+      "    \"impl_style\": \"hls\"\r\n",
+      "  },\r\n",
+      "  \"StreamingFCLayer_Batch_2\": {\r\n",
+      "    \"PE\": 1,\r\n",
+      "    \"SIMD\": 64,\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"resType\": \"lut\",\r\n",
+      "    \"mem_mode\": \"decoupled\",\r\n",
+      "    \"runtime_writeable_weights\": 0\r\n",
+      "  },\r\n",
+      "  \"StreamingFCLayer_Batch_3\": {\r\n",
+      "    \"PE\": 1,\r\n",
+      "    \"SIMD\": 1,\r\n",
+      "    \"ram_style\": \"auto\",\r\n",
+      "    \"resType\": \"lut\",\r\n",
+      "    \"mem_mode\": \"decoupled\",\r\n",
+      "    \"runtime_writeable_weights\": 0\r\n",
+      "  }\r\n",
+      "}"
+     ]
+    }
+   ],
    "source": [
     "! cat {rtlsim_output_dir}/final_hw_config.json"
    ]
@@ -544,12 +663,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "import finn.builder.build_dataflow as build\n",
     "import finn.builder.build_dataflow_config as build_cfg\n",
+    "import os\n",
     "\n",
     "model_file = \"cybsec-mlp-ready.onnx\"\n",
     "\n",
@@ -562,6 +682,7 @@
     "\n",
     "cfg = build.DataflowBuildConfig(\n",
     "    output_dir          = final_output_dir,\n",
+    "    mvau_wwidth_max     = 80,\n",
     "    target_fps          = 1000000,\n",
     "    synth_clk_period_ns = 10.0,\n",
     "    board               = \"Pynq-Z1\",\n",
@@ -576,10 +697,51 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Building dataflow accelerator from cybsec-mlp-ready.onnx\n",
+      "Intermediate outputs will be generated in /tmp/finn_dev_maltanar\n",
+      "Final outputs will be generated in output_final\n",
+      "Build log is at output_final/build_dataflow.log\n",
+      "Running step: step_tidy_up [1/16]\n",
+      "Running step: step_streamline [2/16]\n",
+      "Running step: step_convert_to_hls [3/16]\n",
+      "Running step: step_create_dataflow_partition [4/16]\n",
+      "Running step: step_target_fps_parallelization [5/16]\n",
+      "Running step: step_apply_folding_config [6/16]\n",
+      "Running step: step_generate_estimate_reports [7/16]\n",
+      "Running step: step_hls_codegen [8/16]\n",
+      "Running step: step_hls_ipgen [9/16]\n",
+      "Running step: step_set_fifo_depths [10/16]\n",
+      "Running step: step_create_stitched_ip [11/16]\n",
+      "Running step: step_measure_rtlsim_performance [12/16]\n",
+      "Running step: step_make_pynq_driver [13/16]\n",
+      "Running step: step_out_of_context_synthesis [14/16]\n",
+      "Running step: step_synthesize_bitfile [15/16]\n",
+      "Running step: step_deployment_package [16/16]\n",
+      "Completed successfully\n",
+      "CPU times: user 3.66 s, sys: 892 ms, total: 4.56 s\n",
+      "Wall time: 17min 15s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
+    "#%%time\n",
     "#build.build_dataflow_cfg(model_file, cfg)"
    ]
   },
@@ -592,9 +754,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "finn-accel.bit\tfinn-accel.hwh\r\n"
+     ]
+    }
+   ],
    "source": [
     "#! ls {final_output_dir}/bitfile"
    ]
@@ -608,9 +778,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "driver.py  driver_base.py  finn  runtime_weights  validate.py\r\n"
+     ]
+    }
+   ],
    "source": [
     "#! ls {final_output_dir}/driver"
    ]
@@ -624,9 +802,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "estimate_layer_resources_hls.json  post_synth_resources.xml\r\n",
+      "post_route_timing.rpt\r\n"
+     ]
+    }
+   ],
    "source": [
     "#! ls {final_output_dir}/report"
    ]
@@ -640,13 +827,135 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bitfile  driver\r\n"
+     ]
+    }
+   ],
    "source": [
     "#! ls {final_output_dir}/deploy"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## (Optional) Run on PYNQ board <a id=\"run_on_pynq\"></a>\n",
+    "\n",
+    "<font color=\"red\">**FPGA'21 tutorial:** This section is not included in the hands-on tutorial due to the bitfile synthesis time (15-20 min) of the previous section. If you own a PYNQ board, we encourage you to uncomment the cells below to try it out on your own after the tutorial.</font>\n",
+    "\n",
+    "To test the accelerator on the board, we'll put a copy of the dataset and a premade Python script that validates the accuracy into the `driver` folder, then make a zip archive of the whole deployment folder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#! cp unsw_nb15_binarized.npz {final_output_dir}/deploy/driver"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#! cp validate-unsw-nb15.py {final_output_dir}/deploy/driver"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/workspace/finn/notebooks/end2end_example/cybersecurity/deploy-on-pynq.zip'"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#from shutil import make_archive\n",
+    "#make_archive('deploy-on-pynq', 'zip', final_output_dir+\"/deploy\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can now download the created zipfile (File -> Open, mark the checkbox next to the `deploy-on-pynq.zip` and select Download from the toolbar), then copy it to your PYNQ board (for instance via `scp` or `rsync`). Then, run the following commands **on the PYNQ board** to extract the archive and run the validation:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```shell\n",
+    "unzip deploy-on-pynq.zip -d finn-cybsec-mlp-demo\n",
+    "cd finn-cybsec-mlp-demo/driver\n",
+    "sudo python3.6 validate-unsw-nb15.py --batchsize 1000\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should see `Final accuracy: 91.868293` at the end. You may have noticed that the validation doesn't *quite* run at 1M inferences per second. This is because of the Python packing/unpacking and data movement overheads. To see this in more detail, the generated driver includes a benchmarking mode that shows the runtime breakdown:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```shell\n",
+    "sudo python3.6 driver.py --exec_mode throughput_test --bitfile ../bitfile/finn-accel.bit --batchsize 1000\n",
+    "cat nw_metrics.txt\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```{'runtime[ms]': 1.0602474212646484,\n",
+    " 'throughput[images/s]': 943176.0737575893,\n",
+    " 'DRAM_in_bandwidth[Mb/s]': 70.7382055318192,\n",
+    " 'DRAM_out_bandwidth[Mb/s]': 0.9431760737575894,\n",
+    " 'fclk[mhz]': 100.0,\n",
+    " 'batch_size': 1000,\n",
+    " 'fold_input[ms]': 9.679794311523438e-05,\n",
+    " 'pack_input[ms]': 0.060115814208984375,\n",
+    " 'copy_input_data_to_device[ms]': 0.002428770065307617,\n",
+    " 'copy_output_data_from_device[ms]': 0.0005249977111816406,\n",
+    " 'unpack_output[ms]': 0.3773000240325928,\n",
+    " 'unfold_output[ms]': 6.818771362304688e-05}```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, the various `pack_input/unpack_output` calls show the overhead of packing/unpacking the inputs/outputs to convert from numpy arrays to the bit-contiguous data representation our accelerator expects. The `copy_input_data_to_device` and `copy_output_data_from_device` indicate the cost of moving the data between the CPU and accelerator memories. These overheads can dominate the execution time when running with small batch sizes.\n",
+    "\n",
+    "Finally, we can see that `throughput[images/s]`, which is the pure hardware throughput without any software and data movement overheads, is close to 1M inferences per second."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py
new file mode 100644
index 0000000000000000000000000000000000000000..622c69c8d0abdf8025b0486c63bf336e4f8675f5
--- /dev/null
+++ b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+from driver import io_shape_dict
+from driver_base import FINNExampleOverlay
+import numpy as np
+
+
+def make_unsw_nb15_test_batches(bsize, dataset_root):
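+    # load the binarized UNSW-NB15 test split, keeping the first 82000 samples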
+    unsw_nb15_data = np.load(dataset_root + "/unsw_nb15_binarized.npz")["test"][:82000]
+    test_imgs = unsw_nb15_data[:, :-1]
+    test_labels = unsw_nb15_data[:, -1]
+    n_batches = int(test_imgs.shape[0] / bsize)
+    test_imgs = test_imgs.reshape(n_batches, bsize, -1)
+    test_labels = test_labels.reshape(n_batches, bsize)
+    return (test_imgs, test_labels)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Validate top-1 accuracy for FINN-generated accelerator"
+    )
+    parser.add_argument(
+        "--batchsize", help="number of samples for inference", type=int, default=1000
+    )
+    parser.add_argument(
+        "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma"
+    )
+    parser.add_argument(
+        "--bitfile",
+        help='name of bitfile (e.g. "resizer.bit")',
+        default="../bitfile/finn-accel.bit",
+    )
+    parser.add_argument(
+        "--dataset_root", help="dataset root dir for download/reuse", default="."
+    )
+    # parse arguments
+    args = parser.parse_args()
+    bsize = args.batchsize
+    bitfile = args.bitfile
+    platform = args.platform
+    dataset_root = args.dataset_root
+
+    print("Loading dataset...")
+    (test_imgs, test_labels) = make_unsw_nb15_test_batches(bsize, dataset_root)
+
+    ok = 0
+    nok = 0
+    n_batches = test_imgs.shape[0]
+    total = n_batches * bsize
+
+    print("Initializing driver, flashing bitfile...")
+
+    driver = FINNExampleOverlay(
+        bitfile_name=bitfile,
+        platform=platform,
+        io_shape_dict=io_shape_dict,
+        batch_size=bsize,
+    )
+
+    n_batches = int(total / bsize)
+
+    print("Starting...")
+
+    for i in range(n_batches):
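+        # zero-pad each input row by 7 elements to match the padded input width the accelerator expects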
+        inp = np.pad(test_imgs[i].astype(np.float32), [(0, 0), (0, 7)], mode="constant")
+        exp = test_labels[i].astype(np.float32)
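+        # map {0,1} inputs and labels to the bipolar {-1,+1} encoding used by the network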
+        inp = 2 * inp - 1
+        exp = 2 * exp - 1
+        out = driver.execute(inp)
+        matches = np.count_nonzero(out.flatten() == exp.flatten())
+        nok += bsize - matches
+        ok += matches
+        print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok))
+
+    acc = 100.0 * ok / total
+    print("Final accuracy: %f" % acc)