diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 945bb86760e22b40fb8465c3161e2577a305a6e5..035bba3b53d85a8457eff1e7c1a23e0efff60caa 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -16,9 +16,9 @@ gecho () {
 BREVITAS_COMMIT=215cf44c76d562339fca368c8c3afee3110033e8
 BREVITAS_EXAMPLES_COMMIT=2059f96bd576bf71f32c757e7f92617a70190c90
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=b139bf051ac8f8e0a3625509247f714127cf3317
+HLSLIB_COMMIT=6b88db826bb023937506913a23d964775a7606af
 PYVERILATOR_COMMIT=fb1afefa5b207acf6fec28f8abb72a862f2ca1d2
-PYNQSHELL_COMMIT=db7e418767ce2a8e08fe732ddb3aa56ee79b7560
+PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 
 
 gecho "Setting up known-good commit versions for FINN dependencies"
diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/tfc_end2end_example.ipynb
index 5d30e830842c2549c1d8b197e3cdc7939b01b9ec..a0e905c83eab7a52f70bfb45923b9b59d1c8cea6 100644
--- a/notebooks/end2end_example/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/tfc_end2end_example.ipynb
@@ -140,7 +140,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcf2b7bf828>"
+       "<IPython.lib.display.IFrame at 0x7f4310b476a0>"
       ]
      },
      "execution_count": 3,
@@ -299,7 +299,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcf3e4739e8>"
+       "<IPython.lib.display.IFrame at 0x7f43177c2a20>"
       ]
      },
      "execution_count": 6,
@@ -406,7 +406,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcec56f1be0>"
+       "<IPython.lib.display.IFrame at 0x7f431826d860>"
       ]
      },
      "execution_count": 8,
@@ -460,7 +460,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcf3ef258d0>"
+       "<IPython.lib.display.IFrame at 0x7f42977e39b0>"
       ]
      },
      "execution_count": 9,
@@ -529,7 +529,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcf2a6540f0>"
+       "<IPython.lib.display.IFrame at 0x7f43177c73c8>"
       ]
      },
      "execution_count": 10,
@@ -589,7 +589,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcf2a654080>"
+       "<IPython.lib.display.IFrame at 0x7f43177c2f60>"
       ]
      },
      "execution_count": 11,
@@ -624,7 +624,7 @@
      "text": [
       "\n",
       "Stopping http://0.0.0.0:8081\n",
-      "Serving '/tmp/finn_dev_jakobap/dataflow_partition_dkjtsnwj/df_model.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/tmp/finn_jakobap/dataflow_partition_sqcfkplo/df_model.onnx' at http://0.0.0.0:8081\n"
      ]
     },
     {
@@ -641,7 +641,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcec56f1978>"
+       "<IPython.lib.display.IFrame at 0x7f42977d4978>"
       ]
      },
      "execution_count": 12,
@@ -870,10 +870,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcec5707978>"
+       "<IPython.lib.display.IFrame at 0x7f43177c7518>"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -930,9 +930,9 @@
    "outputs": [],
    "source": [
     "# change this if you have a different PYNQ board, see list above\n",
-    "pynq_board = \"Ultra96\"\n",
+    "pynq_board = \"Pynq-Z1\"\n",
     "fpga_part = pynq_part_map[pynq_board]\n",
-    "target_clk_ns = 5"
+    "target_clk_ns = 10"
    ]
   },
   {
@@ -1023,7 +1023,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcec56c4f98>"
+       "<IPython.lib.display.IFrame at 0x7f42977edf60>"
       ]
      },
      "execution_count": 23,
@@ -1081,7 +1081,10 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "cat: /tmp/finn_dev_jakobap/code_gen_ipgen_StreamingFIFO_0_ruu9s3g8/ipgen.sh: No such file or directory\r\n"
+      "#!/bin/bash \r\n",
+      "cd /tmp/finn_jakobap/code_gen_ipgen_StreamingFCLayer_Batch_0_pfp8r_i6\r\n",
+      "vivado_hls /tmp/finn_jakobap/code_gen_ipgen_StreamingFCLayer_Batch_0_pfp8r_i6/hls_syn_StreamingFCLayer_Batch_0.tcl\r\n",
+      "cd /workspace/finn\r\n"
      ]
     }
    ],
@@ -1108,7 +1111,32 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "cat: /tmp/finn_dev_jakobap/code_gen_ipgen_StreamingFIFO_0_ruu9s3g8/hls_syn_StreamingFCLayer_Batch_0.tcl: No such file or directory\r\n"
+      "\r\n",
+      "set config_proj_name project_StreamingFCLayer_Batch_0\r\n",
+      "puts \"HLS project: $config_proj_name\"\r\n",
+      "set config_hwsrcdir \"/tmp/finn_jakobap/code_gen_ipgen_StreamingFCLayer_Batch_0_pfp8r_i6\"\r\n",
+      "puts \"HW source dir: $config_hwsrcdir\"\r\n",
+      "set config_proj_part \"xc7z020clg400-1\"\r\n",
+      "\r\n",
+      "set config_bnnlibdir \"/workspace/finn-hlslib\"\r\n",
+      "\r\n",
+      "set config_toplevelfxn \"StreamingFCLayer_Batch_0\"\r\n",
+      "set config_clkperiod 5\r\n",
+      "\r\n",
+      "open_project $config_proj_name\r\n",
+      "add_files $config_hwsrcdir/top_StreamingFCLayer_Batch_0.cpp -cflags \"-std=c++0x -I$config_bnnlibdir\"\r\n",
+      "\r\n",
+      "set_top $config_toplevelfxn\r\n",
+      "open_solution sol1\r\n",
+      "set_part $config_proj_part\r\n",
+      "\r\n",
+      "config_interface -m_axi_addr64\r\n",
+      "config_rtl -auto_prefix\r\n",
+      "\r\n",
+      "create_clock -period $config_clkperiod -name default\r\n",
+      "csynth_design\r\n",
+      "export_design -format ip_catalog\r\n",
+      "exit 0\r\n"
      ]
     }
    ],
@@ -1164,11 +1192,11 @@
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        "]"
       ]
      },
@@ -1189,7 +1217,7 @@
     {
      "data": {
       "text/plain": [
-       "'/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt'"
+       "'/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j'"
       ]
      },
      "execution_count": 29,
@@ -1266,15 +1294,15 @@
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        ", key: \"vivado_pynq_proj\"\n",
-       "value: \"/tmp/finn_dev_jakobap/vivado_pynq_proj_dz1m1usu\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j\"\n",
        ", key: \"vivado_synth_rpt\"\n",
-       "value: \"/tmp/finn_dev_jakobap/vivado_pynq_proj_dz1m1usu/synth_report.xml\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/synth_report.xml\"\n",
        "]"
       ]
      },
@@ -1292,7 +1320,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -1318,7 +1346,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1341,26 +1369,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
+   "execution_count": 34,
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        ", key: \"vivado_pynq_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j\"\n",
+       ", key: \"vivado_synth_rpt\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/synth_report.xml\"\n",
        ", key: \"vivado_pynq_bitfile\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo/resizer.bit\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/resizer.bit\"\n",
        "]"
       ]
      },
-     "execution_count": 37,
+     "execution_count": 34,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1374,7 +1406,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1392,7 +1424,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1410,74 +1442,131 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "\r\n",
+      "import argparse\r\n",
       "\r\n",
       "from pynq import Overlay\r\n",
       "import numpy as np\r\n",
       "from pynq import allocate\r\n",
+      "import time\r\n",
       "from finn.util.data_packing import (\r\n",
       "    finnpy_to_packed_bytearray,\r\n",
       "    packed_bytearray_to_finnpy\r\n",
       ")\r\n",
       "from finn.core.datatype import DataType\r\n",
       "\r\n",
-      "bitfile_path = \"resizer.bit\"\r\n",
-      "ol = Overlay(bitfile_path)\r\n",
-      "dma=ol.axi_dma_0\r\n",
+      "def load_input(N):\r\n",
+      "    ishape_normal = (N, 784)\r\n",
+      "    # load desired input .npy file\r\n",
+      "    ibuf_normal = np.load(\"input.npy\")\r\n",
+      "    # ensure that shape is as expected\r\n",
+      "    assert ibuf_normal.shape == ishape_normal\r\n",
+      "    return ibuf_normal\r\n",
       "\r\n",
-      "# declare input/output types and shapes for the accelerator\r\n",
-      "# input FINN DataType\r\n",
-      "idt = DataType.BINARY\r\n",
-      "# normal, folded and packed input shapes\r\n",
-      "ishape_normal = (1, 784)\r\n",
-      "ishape_folded = (1, 49, 16)\r\n",
-      "ishape_packed = (1, 49, 2)\r\n",
-      "# output FINN DataType\r\n",
-      "odt = DataType.UINT32\r\n",
-      "# normal, folded and packed output shapes\r\n",
-      "oshape_normal = (1, 10)\r\n",
-      "oshape_folded = (1, 1, 10)\r\n",
-      "oshape_packed = (1, 1, 40)\r\n",
+      "def pack_input(ibuf_normal, N):\r\n",
+      "    # input FINN DataType\r\n",
+      "    idt = DataType.BINARY\r\n",
+      "    ishape_folded = (N, 49, 16)\r\n",
+      "    # convert to folded form\r\n",
+      "    ibuf_folded = ibuf_normal.reshape(ishape_folded)\r\n",
+      "    # pack the input buffer, reversing both SIMD dim and endianness\r\n",
+      "    ibuf_packed = finnpy_to_packed_bytearray(\r\n",
+      "        ibuf_folded, idt, reverse_endian=True, reverse_inner=True\r\n",
+      "    )\r\n",
+      "    return ibuf_packed\r\n",
       "\r\n",
-      "# load desired input .npy file\r\n",
-      "ibuf_normal = np.load(\"input.npy\")\r\n",
-      "# ensure that shape is as expected\r\n",
-      "assert ibuf_normal.shape == ishape_normal\r\n",
-      "# convert to folded form\r\n",
-      "ibuf_folded = ibuf_normal.reshape(ishape_folded)\r\n",
+      "def unpack_output(obuf_packed, N):\r\n",
+      "    # output FINN DataType\r\n",
+      "    odt = DataType.UINT32\r\n",
+      "    oshape_folded = (N, 1, 10)\r\n",
+      "    # unpack the packed output buffer from accelerator\r\n",
+      "    obuf_folded = packed_bytearray_to_finnpy(\r\n",
+      "        obuf_packed, odt, oshape_folded, reverse_endian=True, reverse_inner=True\r\n",
+      "    )\r\n",
+      "    return obuf_folded\r\n",
       "\r\n",
-      "# pack the input buffer, reversing both SIMD dim and endianness\r\n",
-      "ibuf_packed = finnpy_to_packed_bytearray(\r\n",
-      "    ibuf_folded, idt, reverse_endian=True, reverse_inner=True\r\n",
-      ")\r\n",
-      "# allocate a PYNQ buffer for the packed input buffer\r\n",
-      "ibuf_packed_device = allocate(shape=ishape_packed, dtype=np.uint8)\r\n",
-      "# copy the packed data into the PYNQ buffer\r\n",
-      "# TODO optimization: pack directly into the PYNQ buffer?\r\n",
-      "np.copyto(ibuf_packed_device, ibuf_packed)\r\n",
+      "def save_output(obuf_folded, N):\r\n",
+      "    # convert to normal reshape and save\r\n",
+      "    oshape_normal = (N, 10)\r\n",
+      "    obuf_normal = obuf_folded.reshape(oshape_normal)\r\n",
+      "    np.save(\"output.npy\", obuf_normal)\r\n",
       "\r\n",
-      "# allocate a PYNQ buffer for the returned packed output buffer\r\n",
-      "obuf_packed = allocate(shape=oshape_packed, dtype=np.uint8)\r\n",
+      "if __name__ == \"__main__\":\r\n",
+      "    parser = argparse.ArgumentParser(description='Please select functional verification (\"remote_pynq\") or throughput test (\"throughput_test\")')\r\n",
+      "    parser.add_argument('exec_mode', help='metadata prop exec_mode as string')\r\n",
+      "    args = parser.parse_args()\r\n",
+      "    exec_mode = args.exec_mode\r\n",
       "\r\n",
-      "# set up the DMA and wait until all transfers complete\r\n",
-      "dma.sendchannel.transfer(ibuf_packed_device)\r\n",
-      "dma.recvchannel.transfer(obuf_packed)\r\n",
-      "dma.sendchannel.wait()\r\n",
-      "dma.recvchannel.wait()\r\n",
+      "    bitfile_path = \"resizer.bit\"\r\n",
+      "    ol = Overlay(bitfile_path)\r\n",
+      "    dma=ol.axi_dma_0\r\n",
+      "    ctrl_regs=ol.resize_accel_0\r\n",
+      "    # AXI lite register offset for number of iterations\r\n",
+      "    # used by TLastMarker to signal end of transmission for AXI CDMA\r\n",
+      "    REG_OFFSET_NUM_ITERS = 0x10\r\n",
       "\r\n",
-      "# unpack the packed output buffer from accelerator\r\n",
-      "obuf_folded = packed_bytearray_to_finnpy(\r\n",
-      "    obuf_packed, odt, oshape_folded, reverse_endian=True, reverse_inner=True\r\n",
-      ")\r\n",
-      "# convert to normal reshape and save\r\n",
-      "obuf_normal = obuf_folded.reshape(oshape_normal)\r\n",
-      "np.save(\"output.npy\", obuf_normal)\r\n"
+      "    # number of samples for inference\r\n",
+      "    if exec_mode == \"remote_pynq\":\r\n",
+      "        N = 1\r\n",
+      "    elif exec_mode == \"throughput_test\":\r\n",
+      "        res={}\r\n",
+      "        N = 1000\r\n",
+      "    else:\r\n",
+      "        raise Exception(\"Exec mode has to be set to remote_pynq or throughput_test\")\r\n",
+      "\r\n",
+      "    # declare input/output types and shapes for the accelerator\r\n",
+      "    ishape_packed = (N, 49, 2)\r\n",
+      "    oshape_packed = (N, 1, 40)\r\n",
+      "    \r\n",
+      "    if exec_mode == \"remote_pynq\":\r\n",
+      "        ibuf_normal = load_input(N)\r\n",
+      "        ibuf_packed = pack_input(ibuf_normal, N)\r\n",
+      "    elif exec_mode == \"throughput_test\":\r\n",
+      "        ibuf_packed = np.asarray(np.random.uniform(low=0, high=1, size=tuple(ishape_packed)), dtype=np.uint8)\r\n",
+      "\r\n",
+      "    # set up TLastMarker with correct num. samples\r\n",
+      "    ctrl_regs.write(REG_OFFSET_NUM_ITERS, N)\r\n",
+      "\r\n",
+      "    # allocate a PYNQ buffer for the packed input buffer\r\n",
+      "    ibuf_packed_device = allocate(shape=ishape_packed, dtype=np.uint8)\r\n",
+      "    # copy the packed data into the PYNQ buffer\r\n",
+      "    # TODO optimization: pack directly into the PYNQ buffer?\r\n",
+      "    np.copyto(ibuf_packed_device, ibuf_packed)\r\n",
+      "\r\n",
+      "    # allocate a PYNQ buffer for the returned packed output buffer\r\n",
+      "    obuf_packed = allocate(shape=oshape_packed, dtype=np.uint8)\r\n",
+      "\r\n",
+      "    if exec_mode == \"throughput_test\":\r\n",
+      "        # measure runtime of network\r\n",
+      "        start = time.time()\r\n",
+      "\r\n",
+      "    # set up the DMA and wait until all transfers complete\r\n",
+      "    dma.sendchannel.transfer(ibuf_packed_device)\r\n",
+      "    dma.recvchannel.transfer(obuf_packed)\r\n",
+      "    dma.sendchannel.wait()\r\n",
+      "    dma.recvchannel.wait()\r\n",
+      "\r\n",
+      "\r\n",
+      "    if exec_mode == \"throughput_test\":\r\n",
+      "        end = time.time()\r\n",
+      "        runtime = end - start\r\n",
+      "        res[\"runtime[ms]\"] = runtime*1000\r\n",
+      "        res[\"throughput[images/s]\"] = N / runtime\r\n",
+      "        file = open(\"nw_metrics.txt\", \"w\")\r\n",
+      "        file.write(str(res))\r\n",
+      "        file.close()\r\n",
+      "\r\n",
+      "    else:\r\n",
+      "        obuf_folded = unpack_output(obuf_packed, N)\r\n",
+      "        save_output(obuf_folded, N)\r\n",
+      "\r\n"
      ]
     }
    ],
@@ -1504,16 +1593,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 57,
    "metadata": {},
    "outputs": [],
    "source": [
     "from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ\n",
-    "ip = \"192.168.3.1\"\n",
+    "ip = \"51.37.26.64\"\n",
+    "port = \"23\"\n",
     "username = \"xilinx\"\n",
-    "password = \"xilinx\"\n",
+    "password = \"x1l1nx_f1nn\"\n",
     "target_dir = \"/home/xilinx/finn_tfc_end2end_example\"\n",
-    "model = model.transform(DeployToPYNQ(ip, username, password, target_dir))\n",
+    "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n",
     "model.save(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")"
    ]
   },
@@ -1526,42 +1616,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 58,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        ", key: \"vivado_pynq_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j\"\n",
+       ", key: \"vivado_synth_rpt\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/synth_report.xml\"\n",
        ", key: \"vivado_pynq_bitfile\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo/resizer.bit\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/resizer.bit\"\n",
        ", key: \"pynq_driver_dir\"\n",
-       "value: \"/tmp/finn_maltanar/pynq_driver_25t8u9sd\"\n",
+       "value: \"/tmp/finn_jakobap/pynq_driver_1r1_0kz6\"\n",
        ", key: \"pynq_ip\"\n",
-       "value: \"192.168.3.1\"\n",
+       "value: \"51.37.26.64\"\n",
+       ", key: \"pynq_port\"\n",
+       "value: \"23\"\n",
        ", key: \"pynq_username\"\n",
        "value: \"xilinx\"\n",
        ", key: \"pynq_password\"\n",
-       "value: \"xilinx\"\n",
+       "value: \"x1l1nx_f1nn\"\n",
        ", key: \"pynq_target_dir\"\n",
        "value: \"/home/xilinx/finn_tfc_end2end_example\"\n",
        ", key: \"pynq_deployment_dir\"\n",
-       "value: \"/tmp/finn_maltanar/pynq_deployment_mpyziv7h\"\n",
+       "value: \"/tmp/finn_jakobap/pynq_deployment_kvurnk0c\"\n",
        ", key: \"pynq_deploy_dir\"\n",
-       "value: \"/tmp/finn_maltanar/pynq_deployment_mpyziv7h\"\n",
+       "value: \"/tmp/finn_jakobap/pynq_deployment_kvurnk0c\"\n",
        ", key: \"exec_mode\"\n",
        "value: \"remote_pynq\"\n",
        "]"
       ]
      },
-     "execution_count": 42,
+     "execution_count": 58,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1572,34 +1666,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 59,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/home/xilinx/finn_tfc_end2end_example/pynq_deployment_1oyo7x66:\r\n",
-      "total 5820\r\n",
-      "-rw-r--r-- 1 xilinx xilinx    1934 Feb 13 13:36 driver.py\r\n",
-      "drwxr-xr-x 4 xilinx xilinx    4096 Feb 13 13:36 finn\r\n",
-      "-rw-r--r-- 1 xilinx xilinx    3264 Feb 13 14:24 input.npy\r\n",
-      "-rw-r--r-- 1 root   root       120 Feb 13 14:24 output.npy\r\n",
-      "-rw-r--r-- 1 xilinx xilinx 5568787 Feb 13 13:36 resizer.bit\r\n",
-      "-rw-r--r-- 1 xilinx xilinx  368173 Feb 13 13:36 resizer.hwh\r\n",
-      "-rw-r--r-- 1 root   root        32 Feb 13 14:24 sds_trace_data.dat\r\n",
-      "\r\n",
-      "/home/xilinx/finn_tfc_end2end_example/pynq_deployment_mpyziv7h:\r\n",
-      "total 5808\r\n",
-      "-rw-r--r-- 1 xilinx xilinx    1934 Feb 28 16:09 driver.py\r\n",
-      "drwxr-xr-x 4 xilinx xilinx    4096 Feb 28 16:09 finn\r\n",
-      "-rw-r--r-- 1 xilinx xilinx 5568787 Feb 28 16:09 resizer.bit\r\n",
-      "-rw-r--r-- 1 xilinx xilinx  368173 Feb 28 16:09 resizer.hwh\r\n"
+      "total 4284\r\n",
+      "-rw-r--r-- 1 xilinx xilinx    3861 Apr 27 12:36 driver.py\r\n",
+      "drwxr-xr-x 4 xilinx xilinx    4096 Apr 27 12:37 finn\r\n",
+      "-rw-r--r-- 1 xilinx xilinx 4045675 Apr 27 12:36 resizer.bit\r\n",
+      "-rw-r--r-- 1 xilinx xilinx  329531 Apr 27 12:36 resizer.hwh\r\n"
      ]
     }
    ],
    "source": [
-    "! sshpass -p {password} ssh {username}@{ip} 'ls -l {target_dir}/*'"
+    "! sshpass -p {password} ssh {username}@{ip} -p {port} 'ls -l {target_dir}/*'"
    ]
   },
   {
@@ -1611,16 +1694,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 60,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<matplotlib.image.AxesImage at 0x7f17e0a82e10>"
+       "<matplotlib.image.AxesImage at 0x7f4277550ef0>"
       ]
      },
-     "execution_count": 44,
+     "execution_count": 60,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1644,7 +1727,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 61,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1664,7 +1747,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 62,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1686,7 +1769,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 63,
    "metadata": {},
    "outputs": [
     {
@@ -1695,13 +1778,13 @@
        "<BarContainer object of 10 artists>"
       ]
      },
-     "execution_count": 49,
+     "execution_count": 63,
      "metadata": {},
      "output_type": "execute_result"
     },
     {
      "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAMoUlEQVR4nO3cf6jd913H8edryercD1sxV9AkLgEzNQyl5dJVC1pshbSV5A9FGqjoKMs/y6yuKJlKHfWfzcn8gXUa5xzO2azWIcFGI7iKILbkdp11SYxcstrcrNK7rtYfQ7Pg2z/uiZzd3ptzkp57T/u+zwcEzvf7/XC+75ObPDn3e36kqpAkvfa9btoDSJImw6BLUhMGXZKaMOiS1IRBl6QmNk/rxFu2bKkdO3ZM6/SS9Jr05JNPfqmqZlY6NrWg79ixg7m5uWmdXpJek5L8y2rHvOQiSU0YdElqwqBLUhMjg57kY0meT/L5VY4nyW8mmU/ydJIbJj+mJGmUcZ6hfxzYc5njtwO7Bn8OAB955WNJkq7UyKBX1d8CX77Mkn3AH9aSx4HrknzLpAaUJI1nEtfQtwLnhrYXBvteJsmBJHNJ5hYXFydwaknSJev6omhVHa6q2aqanZlZ8X3xkqSrNImgnwe2D21vG+yTJK2jSXxS9ChwMMkR4B3AS1X13ATuV8vsOPTomp/jmQ/cuebnkLQ2RgY9yUPALcCWJAvALwGvB6iq3wGOAXcA88BXgHeu1bCSpNWNDHpV7R9xvIB3T2wiSdJV8ZOiktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1MRYQU+yJ8mZJPNJDq1w/NuSPJbkqSRPJ7lj8qNKki5nZNCTbAIeBG4HdgP7k+xetuwXgYer6nrgLuC3Jz2oJOnyxnmGfiMwX1Vnq+oCcATYt2xNAd8wuH0t8MXJjShJGsc4Qd8KnBvaXhjsG/Z+4O4kC8Ax4D0r3VGSA0nmkswtLi5exbiSpNVM6kXR/cDHq2obcAfwiSQvu++qOlxVs1U1OzMzM6FTS5JgvKCfB7YPbW8b7Bt2D/AwQFX9PfAGYMskBpQkjWecoJ8AdiXZmeQall70PLpszbPArQBJvouloHtNRZLW0cigV9VF4CBwHDjN0rtZTiZ5IMnewbL7gHcl+QfgIeAnq6rWamhJ0sttHmdRVR1j6cXO4X33D90+Bdw82dEkSVfCT4pKUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSE2MFPcmeJGeSzCc5tMqaH0tyKsnJJH882TElSaNsHrUgySbgQeCHgAXgRJKjVXVqaM0u4H3AzVX1YpJvXquBJUkrG+cZ+o3AfFWdraoLwBFg37I17wIerKoXAarq+cmOKUkaZZygbwXODW0vDPYNexvwtiR/l+TxJHtWuqMkB5LMJZlbXFy8uoklSSua1Iuim4FdwC3AfuD3kly3fFFVHa6q2aqanZmZmdCpJUkwXtDPA9uHtrcN9g1bAI5W1Ver6gvAP7MUeEnSOhkn6CeAXUl2JrkGuAs4umzNn7H07JwkW1i6BHN2gnNKkkYYGfSquggcBI4Dp4GHq+pkkgeS7B0sOw68kOQU8Bjws1X1wloNLUl6uZFvWwSoqmPAsWX77h+6XcB7B38kSVPgJ0UlqQm
DLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpibGCnmRPkjNJ5pMcusy6H0lSSWYnN6IkaRwjg55kE/AgcDuwG9ifZPcK694C3As8MekhJUmjjfMM/UZgvqrOVtUF4Aiwb4V1vwx8EPjvCc4nSRrTOEHfCpwb2l4Y7Pt/SW4AtlfVo5e7oyQHkswlmVtcXLziYSVJq3vFL4omeR3wYeC+UWur6nBVzVbV7MzMzCs9tSRpyDhBPw9sH9reNth3yVuAtwN/k+QZ4CbgqC+MStL6GifoJ4BdSXYmuQa4Czh66WBVvVRVW6pqR1XtAB4H9lbV3JpMLEla0cigV9VF4CBwHDgNPFxVJ5M8kGTvWg8oSRrP5nEWVdUx4NiyffevsvaWVz6WJOlK+UlRSWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJamKsoCfZk+RMkvkkh1Y4/t4kp5I8neSvk7x18qNKki5nZNCTbAIeBG4HdgP7k+xetuwpYLaqvht4BPiVSQ8qSbq8cZ6h3wjMV9XZqroAHAH2DS+oqseq6iuDzceBbZMdU5I0yjhB3wqcG9peGOxbzT3AX6x0IMmBJHNJ5hYXF8efUpI00kRfFE1yNzALfGil41V1uKpmq2p2ZmZmkqeWpA1v8xhrzgPbh7a3DfZ9jSS3Ab8A/EBV/c9kxpMkjWucZ+gngF1Jdia5BrgLODq8IMn1wO8Ce6vq+cmPKUkaZWTQq+oicBA4DpwGHq6qk0keSLJ3sOxDwJuBP0nyuSRHV7k7SdIaGeeSC1V1DDi2bN/9Q7dvm/BckqQr5CdFJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqYmxgp5kT5IzSeaTHFrh+Ncl+dTg+BNJdkx6UEnS5Y0MepJNwIPA7cBuYH+S3cuW3QO8WFXfDvwa8MFJDypJurzNY6y5EZivqrMASY4A+4BTQ2v2Ae8f3H4E+K0kqaqa4Kyaoh2HHl3zczzzgTvX/ByvNWv99+7feS/jBH0rcG5oewF4x2prqupikpeAbwK+NLwoyQHgwGDzP5OcuZqhr9KW5fNsEFf0uDPF360mfG5/3mOY5s97wjbSz/utqx0YJ+gTU1WHgcPrec5LksxV1ew0zj1NPu6Nxce9sY3zouh5YPvQ9rbBvhXXJNkMXAu8MIkBJUnjGSfoJ4BdSXYmuQa4Czi6bM1R4CcGt38U+IzXzyVpfY285DK4Jn4QOA5sAj5WVSeTPADMVdVR4PeBTySZB77MUvRfbaZyqedVwMe9sfi4N7D4RFqSevCTopLUhEGXpCbaB33U1xZ0lGR7kseSnEpyMsm9055pPSXZlOSpJH8+7VnWU5LrkjyS5J+SnE7yvdOeaT0k+ZnBv/PPJ3koyRumPdO0tA76mF9b0NFF4L6q2g3cBLx7gzzuS+4FTk97iCn4DeAvq+o7ge9hA/wdJNkK/BQwW1VvZ+mNG6/GN2Wsi9ZBZ+hrC6rqAnDpawtaq6rnquqzg9v/wdJ/7K3TnWp9JNkG3Al8dNqzrKck1wLfz9I7zqiqC1X
1b9Odat1sBr5+8BmYNwJfnPI8U9M96Ct9bcGGCNslg2++vB54YrqTrJtfB34O+N9pD7LOdgKLwB8MLjd9NMmbpj3UWquq88CvAs8CzwEvVdVfTXeq6eke9A0tyZuBPwV+uqr+fdrzrLUkPww8X1VPTnuWKdgM3AB8pKquB/4LaP+aUZJvZOm37p3AtwJvSnL3dKeanu5BH+drC1pK8nqWYv7Jqvr0tOdZJzcDe5M8w9LltR9M8kfTHWndLAALVXXpN7FHWAp8d7cBX6iqxar6KvBp4PumPNPUdA/6OF9b0E6SsHQt9XRVfXja86yXqnpfVW2rqh0s/aw/U1Ub4tlaVf0rcC7Jdwx23crXfsV1V88CNyV54+Df/a1sgBeDV7Ou37a43lb72oIpj7UebgZ+HPjHJJ8b7Pv5qjo2xZm09t4DfHLw5OUs8M4pz7PmquqJJI8An2Xp3V1PsYG/BsCP/ktSE90vuUjShmHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUxP8BwjHuoBhu1y0AAAAASUVORK5CYII=\n",
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAMp0lEQVR4nO3cf6zdd13H8eeL1qoMgia7f2jbcRttMA2iI9cyJUHDZtJlpjVhJl0CYQbSmFCZQqKdmv1R/4Fhpv7RGJoxQxQsOPnj4qrVCP7hHyy9+xGgq43XOtdWDHeAYDRaGt7+0VNyvLvt/XY79572fZ+PZMn5fr+f3O/7bN0z336/95xUFZKkm9+rpj2AJGkyDLokNWHQJakJgy5JTRh0SWpi87ROfOutt9bs7Oy0Ti9JN6WnnnrqxaqaWenY1II+OzvLwsLCtE4vSTelJP96tWPecpGkJgy6JDVh0CWpCYMuSU0MCnqSPUnOJFlMcmiF4/cnWUry7Oif905+VEnStaz6Wy5JNgFHgJ8HzgMnk8xX1XPLln6qqg6uwYySpAGGXKHvBhar6mxVXQSOAfvWdixJ0vUaEvStwLmx7fOjfcu9I8kXkzyeZPtKPyjJgSQLSRaWlpZexriSpKuZ1EPRzwKzVfUm4G+Bj6+0qKqOVtVcVc3NzKz4QSdJ0ss05JOiF4DxK+5to33fVVVfG9t8FHj4lY+m5WYPPbHm53j+Q/es+TkkrY0hV+gngZ1JdiTZAuwH5scXJPmhsc29wOnJjShJGmLVK/SqupTkIHAC2AQ8VlWnkhwGFqpqHnh/kr3AJeDrwP1rOLMkaQWDvpyrqo4Dx5fte2js9YPAg5MdTZJ0PfykqCQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgYFPcmeJGeSLCY5dI1170hSSeYmN6IkaYhVg55kE3AEuBvYBdyXZNcK614LPAA8OekhJUmrG3KFvhtYrKqzVXUROAbsW2Hd7wIfBv5ngvNJkgYaEvStwLmx7fOjfd+V5M3A9qp6YoKzSZKuwyt+KJrkVcAjwAcHrD2QZCHJwtLS0is9tSRpzJCgXwC2j21vG+274rXAG4G/T/I8cAcwv9KD0ao6WlVzVTU3MzPz8qeWJL3EkKCfBHYm2ZFkC7AfmL9ysKq+WVW3VtVsVc0CXwD2VtXCmkwsSVrRqkGvqkvAQeAEcBr4dFWdSnI4yd61HlCSNMzmIYuq6jhwfNm+h66y9ude+ViSpOvlJ0UlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpiUFBT7InyZkki0kOrXD8V5J8KcmzSf4hya7JjypJupZVg55kE3AEuBvYBdy3QrA/WVU/XlU/CTwMPDLxSSVJ1zTkCn03sFhVZ6vqInAM2De+oKq+NbZ5C1CTG1GSNMTmAWu2AufGts8Db1m+KMn7gA8AW4C3r/SDkhwADgDcdttt1zurJOkaJvZQtKqOVNWPAL8J/M5V1hytqrmqmpuZmZnUqSVJDAv6BWD72Pa20b6rOQb84isZSpJ0/YYE/SSwM8mOJFuA/cD8+IIkO8c27wH+aXIjSpKGWPUeelVdSnIQOAFsAh6rqlNJDgMLVTUPHExyF/Bt4BvAu9dyaEnSSw15KEpVHQeOL9v30NjrByY8lyT
pOvlJUUlqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWpiUNCT7ElyJslikkMrHP9AkueSfDHJ3yV5/eRHlSRdy6pBT7IJOALcDewC7kuya9myZ4C5qnoT8Djw8KQHlSRd25Ar9N3AYlWdraqLwDFg3/iCqvp8Vf33aPMLwLbJjilJWs2QoG8Fzo1tnx/tu5r3AH+10oEkB5IsJFlYWloaPqUkaVUTfSia5J3AHPCRlY5X1dGqmququZmZmUmeWpI2vM0D1lwAto9tbxvt+3+S3AX8NvCzVfW/kxlPkjTUkCv0k8DOJDuSbAH2A/PjC5LcDnwU2FtVX538mJKk1awa9Kq6BBwETgCngU9X1akkh5PsHS37CPAa4M+TPJtk/io/TpK0RobccqGqjgPHl+17aOz1XROeS5J0nfykqCQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDUxKOhJ9iQ5k2QxyaEVjr8tydNJLiW5d/JjSpJWs2rQk2wCjgB3A7uA+5LsWrbsBeB+4JOTHlCSNMzmAWt2A4tVdRYgyTFgH/DclQVV9fzo2HfWYEZJ0gBDbrlsBc6NbZ8f7btuSQ4kWUiysLS09HJ+hCTpKtb1oWhVHa2quaqam5mZWc9TS1J7Q4J+Adg+tr1ttE+SdAMZEvSTwM4kO5JsAfYD82s7liTpeq0a9Kq6BBwETgCngU9X1akkh5PsBUjyU0nOA78EfDTJqbUcWpL0UkN+y4WqOg4cX7bvobHXJ7l8K0aSNCV+UlSSmjDoktSEQZekJgy6JDUx6KGoJK2n2UNPrPk5nv/QPWt+jvVm0DWI/4NJNz5vuUhSEzflFbpXi5L0Ul6hS1ITBl2SmjDoktTETXkPXdLa81nVzceg66aw1nExLOrAWy6S1IRBl6QmvOUi3cC81aTrYdClVRhV3Sy85SJJTRh0SWrCoEtSE95Dv05+2ELSjcqgS9KYm/mizVsuktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNTEo6En2JDmTZDHJoRWOf2+ST42OP5lkdtKDSpKubdWgJ9kEHAHuBnYB9yXZtWzZe4BvVNWPAr8PfHjSg0qSrm3IFfpuYLGqzlbVReAYsG/Zmn3Ax0evHwfuTJLJjSlJWk2q6toLknuBPVX13tH2u4C3VNXBsTVfHq05P9r+59GaF5f9rAPAgdHmG4Azk3ojA9wKvLjqqn583xuL77u/11fVzEoH1vX70KvqKHB0Pc95RZKFqpqbxrmnyfe9sfi+N7Yht1wuANvHtreN9q24Jslm4HXA1yYxoCRpmCFBPwnsTLIjyRZgPzC/bM088O7R63uBz9Vq93IkSRO16i2XqrqU5CBwAtgEPFZVp5IcBhaqah74GPAnSRaBr3M5+jeaqdzquQH4vjcW3/cGtupDUUnSzcFPikpSEwZdkppoH/TVvragoyTbk3w+yXNJTiV5YNozrackm5I8k+Qvpz3LekryA0keT/KPSU4n+elpz7Qekvz66M/5l5P8WZLvm/ZM09I66AO/tqCjS8AHq2oXcAfwvg3yvq94ADg97SGm4A+Bv66qHwN+gg3w7yDJVuD9wFxVvZHLv7hxI/5SxrpoHXSGfW1BO1X1lap6evT6P7n8P/bW6U61PpJsA+4BHp32LOspyeuAt3H5N86oqot
V9R/TnWrdbAa+f/QZmFcD/zbleaame9C3AufGts+zQcJ2xeibL28HnpzuJOvmD4DfAL4z7UHW2Q5gCfjj0e2mR5PcMu2h1lpVXQB+D3gB+Arwzar6m+lONT3dg76hJXkN8BfAr1XVt6Y9z1pL8gvAV6vqqWnPMgWbgTcDf1RVtwP/BbR/ZpTkB7n8t+4dwA8DtyR553Snmp7uQR/ytQUtJfkeLsf8E1X1mWnPs07eCuxN8jyXb6+9PcmfTnekdXMeOF9VV/4m9jiXA9/dXcC/VNVSVX0b+AzwM1OeaWq6B33I1xa0M/rq4o8Bp6vqkWnPs16q6sGq2lZVs1z+b/25qtoQV2tV9e/AuSRvGO26E3huiiOtlxeAO5K8evTn/k42wMPgq1nXb1tcb1f72oIpj7Ue3gq8C/hSkmdH+36rqo5PcSatvV8FPjG6eDkL/PKU51lzVfVkkseBp7n8213PsIG/BsCP/ktSE91vuUjShmHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUxP8B9uoCk0KMtNwAAAAASUVORK5CYII=\n",
       "text/plain": [
        "<Figure size 432x288 with 1 Axes>"
       ]
@@ -1731,6 +1814,37 @@
     "We see that the network correctly predicts this as a digit 2 with high probability. This concludes our tutorial on how to take a simple fully-connected BNN all the way down to hardware with FINN, and execute it remotely on a PYNQ board."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Throughput Test on PYNQ Board <a id='throughput'></a>\n",
+    "In addition to the functional verification, FINN also makes it possible to measure the network performance directly on the PYNQ board. This can be done using the core function `throughput_test`, which we import and execute in the next section.\n",
+    "First we extract the `remote_exec_model` again and pass it to the function. The function returns the metrics of the network as a dictionary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Network metrics: \n",
+      "{'runtime[ms]': 3.5953521728515625, 'throughput[images/s]': 278136.8700265252}\n"
+     ]
+    }
+   ],
+   "source": [
+    "from finn.core.throughput_test import throughput_test\n",
+    "\n",
+    "child_model = ModelWrapper(getCustomOp(sdp_node).get_nodeattr(\"model\"))\n",
+    "res = throughput_test(child_model)\n",
+    "print(\"Network metrics: \\n\" + str(res))"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
index eff9cea291b106d69e99055d5b6e2af448fb7517..e97eb19a101e83f9d9603637e131b2ec9b7d16a4 100644
--- a/src/finn/core/remote_exec.py
+++ b/src/finn/core/remote_exec.py
@@ -38,6 +38,7 @@ def remote_exec(model, execution_context):
     input values."""
     # TODO fix for multi input-output
     pynq_ip = model.get_metadata_prop("pynq_ip")
+    pynq_port = int(model.get_metadata_prop("pynq_port"))
     pynq_username = model.get_metadata_prop("pynq_username")
     pynq_password = model.get_metadata_prop("pynq_password")
     pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
@@ -49,8 +50,9 @@ def remote_exec(model, execution_context):
     # extracting last folder of absolute path (deployment_dir)
     deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
     # copy input to PYNQ board
-    cmd = "sshpass -p {} scp -r {}/input.npy {}@{}:{}/{}".format(
+    cmd = "sshpass -p {} scp -P{} -r {}/input.npy {}@{}:{}/{}".format(
         pynq_password,
+        pynq_port,
         deployment_dir,
         pynq_username,
         pynq_ip,
@@ -60,13 +62,15 @@ def remote_exec(model, execution_context):
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
-
     cmd = (
-        "sshpass -p {} ssh {}@{} " '"cd {}/{}; echo "{}" | sudo -S python3.6 driver.py"'
+        "sshpass -p {} ssh {}@{} -p {} "
+        '"cd {}/{}; echo "{}" | '
+        'sudo -S python3.6 driver.py remote_pynq 1 resizer.bit input.npy output.npy"'
     ).format(
         pynq_password,
         pynq_username,
         pynq_ip,
+        pynq_port,
         pynq_target_dir,
         deployment_folder,
         pynq_password,
@@ -74,9 +78,9 @@ def remote_exec(model, execution_context):
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
-
-    cmd = "sshpass -p {} scp {}@{}:{}/{}/output.npy {}".format(
+    cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/output.npy {}".format(
         pynq_password,
+        pynq_port,
         pynq_username,
         pynq_ip,
         pynq_target_dir,
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc929237bf6c985997e49cc3f74c7d492d79839a
--- /dev/null
+++ b/src/finn/core/throughput_test.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import subprocess
+
+
+def throughput_test(model):
+    """Runs the throughput test for the given model remotely on the PYNQ board.
+    The PYNQ-related metadata properties of the model must be set beforehand.
+    Returns a dictionary with the results of the throughput test."""
+
+    pynq_ip = model.get_metadata_prop("pynq_ip")
+    pynq_port = int(model.get_metadata_prop("pynq_port"))
+    pynq_username = model.get_metadata_prop("pynq_username")
+    pynq_password = model.get_metadata_prop("pynq_password")
+    pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
+    deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
+    # extracting last folder of absolute path (deployment_dir)
+    deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
+
+    cmd = (
+        "sshpass -p {} ssh {}@{} -p {} "
+        '"cd {}/{}; echo "{}" | '
+        "sudo -S python3.6 driver.py throughput_test 1000 "
+        'resizer.bit input.npy output.npy"'
+    ).format(
+        pynq_password,
+        pynq_username,
+        pynq_ip,
+        pynq_port,
+        pynq_target_dir,
+        deployment_folder,
+        pynq_password,
+    )
+    bash_command = ["/bin/bash", "-c", cmd]
+    process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+    process_compile.communicate()
+
+    cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format(
+        pynq_password,
+        pynq_port,
+        pynq_username,
+        pynq_ip,
+        pynq_target_dir,
+        deployment_folder,
+        deployment_dir,
+    )
+    bash_command = ["/bin/bash", "-c", cmd]
+    process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+    process_compile.communicate()
+
+    with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
+        res = eval(file.read())
+
+    return res
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 8579fdfba73eaceba92c7435d97e02f873c13aba..8430a56bc2688627f82da6ae92140f5cff82cb60 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -200,6 +200,7 @@ class HLSCustomOp(CustomOp):
         self.code_gen_dict["$FINNHLSLIBDIR$"] = ["/workspace/finn-hlslib"]
         self.code_gen_dict["$TOPFXN$"] = [node.name]
         self.code_gen_dict["$CLKPERIOD$"] = [str(clk)]
+        self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives()
 
         template = self.ipgentcl_template
 
@@ -213,6 +214,10 @@ class HLSCustomOp(CustomOp):
         f.close()
         self.code_gen_dict.clear()
 
+    def ipgen_extra_directives(self):
+        """Return a list of extra Tcl directives for HLS synthesis."""
+        return []
+
     def ipgen_singlenode_code(self):
         """Builds the bash script for ip generation using the IPGenBuilder from
         finn.util.fpgadataflow."""
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 55b9a2753b50f76c57fb08c7a24b29b49d82c8b8..2b469f7b0d6e5ddc3068fa3fd2d6cb487a560d92 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -60,6 +60,12 @@ class ConvolutionInputGenerator(HLSCustomOp):
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
+            # FPGA resource type for ConvolutionInputGenerator input buffer
+            # auto -- let Vivado HLS decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use URAM
+            "ram_style": ("s", False, "distributed"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -73,8 +79,13 @@ class ConvolutionInputGenerator(HLSCustomOp):
         return ishape
 
     def get_folded_input_shape(self):
-        """Assumption: No folding on input"""
-        return self.get_normal_input_shape()
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        wf = int(ifm_ch / simd)
+        folded_ishape = (1, ifm_dim, ifm_dim, wf, simd)
+        return folded_ishape
 
     def get_normal_output_shape(self):
         k = self.get_nodeattr("ConvKernelDim")
@@ -94,7 +105,8 @@ class ConvolutionInputGenerator(HLSCustomOp):
         simd = self.get_nodeattr("SIMD")
         pad = 0
         ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
-        assert k * k * ifm_ch % simd == 0, "SIMD must divide sliding window size"
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        assert k % stride == 0, "stride must divide kernel size k"
         wf = int((k * k * ifm_ch) // simd)
         folded_oshape = (1, ofm_dim, ofm_dim, wf, simd)
         return folded_oshape
@@ -141,7 +153,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        assert simd == ifm_ch, "SWG currently requires SIMD=IFM"
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
         in_width = simd * ibits
         return in_width
 
@@ -161,6 +173,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         node = self.onnx_node
         exp_ishape = self.get_normal_input_shape()
         exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
 
         # TODO ensure codegen dir exists
@@ -188,7 +201,8 @@ class ConvolutionInputGenerator(HLSCustomOp):
             export_idt = DataType.BINARY
         else:
             export_idt = self.get_input_datatype()
-        # no reshaping for input since assuming no folding on input
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
         # make copy before saving array
         reshaped_input = inp.copy()
         np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
@@ -342,3 +356,17 @@ class ConvolutionInputGenerator(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
+
+    def ipgen_extra_directives(self):
+        # add directive to control input buffer memory resources
+        ram_style = self.get_nodeattr("ram_style")
+        map_to_hls_ram_style = {
+            "auto": "RAM_2P",
+            "block": "RAM_2P_BRAM",
+            "distributed": "RAM_2P_LUTRAM",
+            "ultra": "RAM_2P_URAM",
+        }
+        hls_ram_style = map_to_hls_ram_style[ram_style]
+        directive = "set_directive_resource -core %s " % hls_ram_style
+        directive += "ConvolutionInputGenerator inputBuf"
+        return [directive]
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 83bc19030ebba66907e08c5b1e52d7c0ff9207a6..7334c913b6f85cad4835b6e65eb14c488432af6b 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -65,7 +65,12 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         return ishape
 
     def get_folded_input_shape(self):
-        return self.get_normal_input_shape()
+        # even though there is no folding in the current hlslib op,
+        # insert a time multiplexing axis to remain compatible with the
+        # shapes produced by the rest of the dataflow pipeline
+        ret = list(self.get_normal_input_shape())
+        ret.insert(-1, 1)
+        return tuple(ret)
 
     def get_normal_output_shape(self):
         k = self.get_nodeattr("PoolDim")
@@ -79,9 +84,12 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         return oshape
 
     def get_folded_output_shape(self):
-        # no folding for StreamingMaxPool
-        oshape = self.get_normal_output_shape()
-        return oshape
+        # even though there is no folding in the current hlslib op,
+        # insert a time multiplexing axis to remain compatible with the
+        # shapes produced by the rest of the dataflow pipeline
+        ret = list(self.get_normal_output_shape())
+        ret.insert(-1, 1)
+        return tuple(ret)
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 6313bb79c21231c4be5b242558da5ac40fb2aa78..5f526aa2aa1917144c7a048c9d9314aa9288a2d8 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -79,7 +79,7 @@ $DOCOMPUTE$
 }
 """
 
-# tcl script
+# tcl script for IP generation
 ipgentcl_template = """
 set config_proj_name $PROJECTNAME$
 puts "HLS project: $config_proj_name"
@@ -101,6 +101,7 @@ set_part $config_proj_part
 
 config_interface -m_axi_addr64
 config_rtl -auto_prefix
+$EXTRA_DIRECTIVES$
 
 create_clock -period $config_clkperiod -name default
 csynth_design
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 4d4dee6506f04909c53cd05e4898a7ad77e4a83a..25ea05e3607a52731ae1b64de421837bf137ee2b 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -82,7 +82,7 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$DEFINES$"] = [
             "#define StreamWidth %d" % stream_width,
             "#define OutDType %s" % out_stream_dtype,
-            "#define NumIters %d" % self.get_nodeattr("NumIters"),
+            "#define NumItersPerImg %d" % self.get_nodeattr("NumIters"),
         ]
 
     def read_npy_data(self):
@@ -90,12 +90,23 @@ class TLastMarker(HLSCustomOp):
 
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            "for(int i=0; i<NumIters; i++) {",
-            "#pragma HLS PIPELINE II=1",
+            "unsigned int n = 1;",
             "OutDType t;",
-            "t.set_data(in0.read());",
             "t.set_keep(-1);",
-            "t.set_last(i==(NumIters-1));",
+            "io_section: { // start of cycle accurate region",
+            "#pragma HLS protocol fixed",
+            "// do a first read from stream before we decide on numIters",
+            "// giving software a chance to set up the numIters prior to startup",
+            "t.set_data(in0.read());",
+            "n = (numIters == 0 ? NumItersPerImg : numIters);",
+            "t.set_last(n==1);",
+            "out.write(t);",
+            "} // end of cycle accurate region",
+            "// do one less iteration than spec since we already did one",
+            "for(unsigned int i=1; i<n; i++) {",
+            "#pragma HLS PIPELINE II=1",
+            "t.set_data(in0.read());",
+            "t.set_last(i==(n-1));",
             "out.write(t);",
             "}",
         ]
@@ -109,13 +120,16 @@ class TLastMarker(HLSCustomOp):
     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             """void %s(hls::stream<ap_uint<StreamWidth> > &in0,
-                hls::stream<OutDType> &out)"""
+                hls::stream<OutDType> &out, unsigned int numIters)"""
             % self.onnx_node.name
         ]
 
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE s_axilite port=numIters bundle=control"
+        )
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
diff --git a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
index f482db793018933883a068bb16fd99ece671064b..bc1fce836a16f49e6549f6b24de2973b902bf066 100644
--- a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
+++ b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
@@ -121,6 +121,11 @@ class CodeGen_ipstitch(Transformation):
                 connect_cmds.append(
                     "make_bd_intf_pins_external [get_bd_intf_pins %s/out_r]" % inst_name
                 )
+                # make AXI lite IF external
+                connect_cmds.append(
+                    "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi_control]"
+                    % inst_name
+                )
 
         # create a temporary folder for the project
         prjname = "finn_vivado_stitch_proj"
@@ -142,6 +147,9 @@ class CodeGen_ipstitch(Transformation):
         tcl.append('create_bd_design "%s"' % block_name)
         tcl.extend(create_cmds)
         tcl.extend(connect_cmds)
+        # TODO get from Transformation arg or metadata_prop
+        fclk_hz = 100 * 1000000
+        tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk_0]" % fclk_hz)
         tcl.append("regenerate_bd_layout")
         tcl.append("validate_bd_design")
         tcl.append("save_bd_design")
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index f66d0dc087ecbdd112422484ee1e01cb5ceef1c0..95a7a7154f10c15b38d94d4bef653649aa30a569 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -4,6 +4,7 @@ from onnx import helper as oh
 from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
 from finn.util.fpgadataflow import is_fpgadataflow_node
+import numpy as np
 
 
 def _is_fifo_node(node):
@@ -26,6 +27,14 @@ def _suitable_node(node):
         return False
 
 
+def _suitable_folded_shapes(ishape, oshape):
+    i_dummy = np.random.rand(*ishape)
+    o_dummy = np.random.rand(*oshape)
+    ishape_canonical = np.squeeze(i_dummy).shape
+    oshape_canonical = np.squeeze(o_dummy).shape
+    return ishape_canonical == oshape_canonical
+
+
 class InsertFIFO(Transformation):
     """Inserting FIFOs in the beginning and end of the graph as well as
     between fpgadataflow nodes.
@@ -50,7 +59,6 @@ class InsertFIFO(Transformation):
                 n_output = n.output[0]
                 consumer = model.find_consumer(n_output)
                 if _suitable_node(consumer) is True:
-                    graph_modified = True
                     n0 = getCustomOp(n)
                     # determine fifo node attributes
                     fld_shape = n0.get_folded_output_shape()
@@ -59,8 +67,9 @@ class InsertFIFO(Transformation):
                     # check if folded_shape of output of first node and
                     # input of the second node is equal
                     n1 = getCustomOp(consumer)
-                    assert (
-                        fld_shape == n1.get_folded_input_shape()
+                    fld_shape_2 = n1.get_folded_input_shape()
+                    assert _suitable_folded_shapes(
+                        fld_shape, fld_shape_2
                     ), """The
                     folded output shape of the first node is not the same as the
                     folded output shape of the second node. A streaming fifo can't
@@ -74,33 +83,39 @@ class InsertFIFO(Transformation):
                         fifo_depth = n0_depth
                     elif n0_depth != n1_depth:
                         fifo_depth = max(n0_depth, n1_depth)
+
+                    if fifo_depth > 2:
+                        # assumption: HLS streaming components already have
+                        # depth-2 FIFOs on inputs and outputs, so no point
+                        # creating additional small FIFOs in between --
+                        # we only create the larger FIFOs specified
+                        # create fifo node
+                        fifo_output_tensor = oh.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            n0.get_normal_output_shape(),
+                        )
+                        graph.value_info.append(fifo_output_tensor)
+                        model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+
+                        fifo_node = oh.make_node(
+                            "StreamingFIFO",
+                            [n_output],
+                            [fifo_output_tensor.name],
+                            domain="finn",
+                            backend="fpgadataflow",
+                            depth=fifo_depth,
+                            folded_shape=fld_shape,
+                            dataType=str(dtype.name),
+                        )
+                        # insert fifo
+                        graph.node.insert(node_ind + 1, fifo_node)
+                        # set fifo output tensor as new input tensor of second node
+                        consumer.input[0] = fifo_output_tensor.name
+                        # ensure created FIFO depth is reflected on both sides
                         n0.set_nodeattr("outFIFODepth", fifo_depth)
                         n1.set_nodeattr("inFIFODepth", fifo_depth)
-
-                    # create fifo node
-                    fifo_output_tensor = oh.make_tensor_value_info(
-                        model.make_new_valueinfo_name(),
-                        TensorProto.FLOAT,
-                        n0.get_normal_output_shape(),
-                    )
-                    graph.value_info.append(fifo_output_tensor)
-                    model.set_tensor_datatype(fifo_output_tensor.name, dtype)
-
-                    fifo_node = oh.make_node(
-                        "StreamingFIFO",
-                        [n_output],
-                        [fifo_output_tensor.name],
-                        domain="finn",
-                        backend="fpgadataflow",
-                        depth=fifo_depth,
-                        folded_shape=fld_shape,
-                        dataType=str(dtype.name),
-                    )
-                    # insert fifo
-                    graph.node.insert(node_ind + 1, fifo_node)
-
-                    # set fifo output tensor as new input tensor of second node
-                    consumer.input[0] = fifo_output_tensor.name
+                        graph_modified = True
 
         if graph_modified is False:
             # insert FIFO as first node
diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py
index d797773fe540e930267839c5926269a73736f354..a185f5392c4b5ec848cd463e02ebab4be9c56a46 100644
--- a/src/finn/transformation/fpgadataflow/make_deployment.py
+++ b/src/finn/transformation/fpgadataflow/make_deployment.py
@@ -42,9 +42,10 @@ class DeployToPYNQ(Transformation):
     IP address of board, username and password for board and target directory where
     the files are stored on the board"""
 
-    def __init__(self, ip, username, password, target_dir):
+    def __init__(self, ip, port, username, password, target_dir):
         super().__init__()
         self.ip = ip
+        self.port = port
         self.username = username
         self.password = password
         self.target_dir = target_dir
@@ -52,6 +53,7 @@ class DeployToPYNQ(Transformation):
     def apply(self, model):
         # set metadata properties accordingly to user input specifications
         model.set_metadata_prop("pynq_ip", self.ip)
+        model.set_metadata_prop("pynq_port", str(self.port))
         model.set_metadata_prop("pynq_username", self.username)
         model.set_metadata_prop("pynq_password", self.password)
         model.set_metadata_prop("pynq_target_dir", self.target_dir)
@@ -76,18 +78,21 @@ class DeployToPYNQ(Transformation):
         copy_tree(pynq_driver_dir, deployment_dir)
         model.set_metadata_prop("pynq_deploy_dir", deployment_dir)
         model.set_metadata_prop("exec_mode", "remote_pynq")
-
         # create target directory on PYNQ board
-        cmd = 'sshpass -p {} ssh {}@{} "mkdir -p {}"'.format(
-            self.password, self.username, self.ip, self.target_dir
+        cmd = 'sshpass -p {} ssh {}@{} -p {} "mkdir -p {}"'.format(
+            self.password, self.username, self.ip, self.port, self.target_dir
         )
         bash_command = ["/bin/bash", "-c", cmd]
         process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
         process_compile.communicate()
-
         # copy directory to PYNQ board using scp and sshpass
-        cmd = "sshpass -p {} scp -r {} {}@{}:{}".format(
-            self.password, deployment_dir, self.username, self.ip, self.target_dir
+        cmd = "sshpass -p {} scp -P{} -r {} {}@{}:{}".format(
+            self.password,
+            self.port,
+            deployment_dir,
+            self.username,
+            self.ip,
+            self.target_dir,
         )
         bash_command = ["/bin/bash", "-c", cmd]
         process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 0bde02fa2c330748a718f6debf931b7d83ac7814..c5b8d35dba1069ac749e0a0d92060c8216ada507 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -87,14 +87,25 @@ class MakePYNQDriver(Transformation):
         # fill in the driver template
         driver_py = pynq_driver_dir + "/driver.py"
         driver = templates.pynq_driver_template
+
+        def mss(x, batch_var_name="N"):
+            # "make shape string"
+            # for a shape like (1, ...) emit a string (N, ...)
+            # where N is the default value for batch_var_name
+            # this lets the driver work with a batch of samples at once
+            ret = str(x)
+            ret = ret.replace("(1,", "(%s," % batch_var_name)
+            ret = ret.replace("[1,", "[%s," % batch_var_name)
+            return ret
+
         driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
-        driver = driver.replace("$INPUT_SHAPE_NORMAL$", str(i_tensor_shape_normal))
-        driver = driver.replace("$INPUT_SHAPE_FOLDED$", str(i_tensor_shape_folded))
-        driver = driver.replace("$INPUT_SHAPE_PACKED$", str(i_tensor_shape_packed))
+        driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal))
+        driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded))
+        driver = driver.replace("$INPUT_SHAPE_PACKED$", mss(i_tensor_shape_packed))
         driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt))
-        driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", str(o_tensor_shape_normal))
-        driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", str(o_tensor_shape_folded))
-        driver = driver.replace("$OUTPUT_SHAPE_PACKED$", str(o_tensor_shape_packed))
+        driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
+        driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
+        driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
 
         with open(driver_py, "w") as f:
             f.write(driver)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
index 9921ce7caf2aaffd197f9bc863ab77502a963647..9fe5781ecd3aa885281bde772571d307ad0669c8 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
@@ -108,7 +108,10 @@ class MakePYNQProject(Transformation):
         out_if_name = "out_r_0"
         clk_name = "ap_clk_0"
         nrst_name = "ap_rst_n_0"
+        axi_lite_if_name = "s_axi_control_0"
         vivado_ip_cache = os.getenv("VIVADO_IP_CACHE", default="")
+        # TODO get from Transformation arg or metadata_prop
+        fclk_mhz = 100.0
 
         # create a temporary folder for the project
         vivado_pynq_proj_dir = make_build_dir(prefix="vivado_pynq_proj_")
@@ -129,7 +132,9 @@ class MakePYNQProject(Transformation):
             out_if_name,
             clk_name,
             nrst_name,
+            axi_lite_if_name,
             vivado_ip_cache,
+            fclk_mhz,
         )
 
         with open(vivado_pynq_proj_dir + "/ip_config.tcl", "w") as f:
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 81cb954bb4503c8daf18bad5881661018e9d17b7..55a5af2ad887e4a8cfa5e3836bef00f2defe7284 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -26,6 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+# flake8: noqa
+
 # template for the PYNQ shell integration configuration tcl script
 ip_config_tcl_template = """
 variable config_ip_repo
@@ -35,10 +37,12 @@ variable config_ip_bytes_out
 variable config_ip_axis_name_in
 variable config_ip_axis_name_out
 variable config_ip_use_axilite
+variable config_ip_axilite_name
 variable config_ip_project_dir
 variable config_output_products_dir
 variable config_remote_cache
 variable config_util_report_filename
+variable config_ip_fclk
 
 # for arguments involving paths below: use absolute paths or relative to the
 # platform/overlay/bitstream folder
@@ -67,9 +71,13 @@ set config_ip_clk_name %s
 # the name of the active-low reset signal
 set config_ip_nrst_name %s
 # whether the IP needs an AXI Lite interface for control
-set config_ip_use_axilite 0
+set config_ip_use_axilite 1
+# name of AXI Lite interface
+set config_ip_axilite_name %s
 # Vivado OOC IP cache
 set config_remote_cache "%s"
+# clock frequency
+set config_ip_fclk %f
 """
 
 call_pynqshell_makefile_template = """
@@ -82,64 +90,156 @@ cd %s
 """
 
 pynq_driver_template = """
+import argparse
+
 from pynq import Overlay
 import numpy as np
 from pynq import allocate
+import time
 from finn.util.data_packing import (
     finnpy_to_packed_bytearray,
     packed_bytearray_to_finnpy
 )
 from finn.core.datatype import DataType
 
-bitfile_path = "resizer.bit"
-ol = Overlay(bitfile_path)
-dma=ol.axi_dma_0
-
-# declare input/output types and shapes for the accelerator
-# input FINN DataType
-idt = $INPUT_FINN_DATATYPE$
-# normal, folded and packed input shapes
-ishape_normal = $INPUT_SHAPE_NORMAL$
-ishape_folded = $INPUT_SHAPE_FOLDED$
-ishape_packed = $INPUT_SHAPE_PACKED$
-# output FINN DataType
-odt = $OUTPUT_FINN_DATATYPE$
-# normal, folded and packed output shapes
-oshape_normal = $OUTPUT_SHAPE_NORMAL$
-oshape_folded = $OUTPUT_SHAPE_FOLDED$
-oshape_packed = $OUTPUT_SHAPE_PACKED$
-
-# load desired input .npy file
-ibuf_normal = np.load("input.npy")
-# ensure that shape is as expected
-assert ibuf_normal.shape == ishape_normal
-# convert to folded form
-ibuf_folded = ibuf_normal.reshape(ishape_folded)
-
-# pack the input buffer, reversing both SIMD dim and endianness
-ibuf_packed = finnpy_to_packed_bytearray(
-    ibuf_folded, idt, reverse_endian=True, reverse_inner=True
-)
-# allocate a PYNQ buffer for the packed input buffer
-ibuf_packed_device = allocate(shape=ishape_packed, dtype=np.uint8)
-# copy the packed data into the PYNQ buffer
-# TODO optimization: pack directly into the PYNQ buffer?
-np.copyto(ibuf_packed_device, ibuf_packed)
-
-# allocate a PYNQ buffer for the returned packed output buffer
-obuf_packed = allocate(shape=oshape_packed, dtype=np.uint8)
-
-# set up the DMA and wait until all transfers complete
-dma.sendchannel.transfer(ibuf_packed_device)
-dma.recvchannel.transfer(obuf_packed)
-dma.sendchannel.wait()
-dma.recvchannel.wait()
-
-# unpack the packed output buffer from accelerator
-obuf_folded = packed_bytearray_to_finnpy(
-    obuf_packed, odt, oshape_folded, reverse_endian=True, reverse_inner=True
-)
-# convert to normal reshape and save
-obuf_normal = obuf_folded.reshape(oshape_normal)
-np.save("output.npy", obuf_normal)
+class RemoteTest():
+    def __init__(
+        self,
+        exec_mode,
+        N,
+        bitfile="resizer.bit",
+        inputfile="input.npy",
+        outputfile="output.npy"):
+
+        self.exec_mode = exec_mode
+        self.N = N
+        self.inputfile = inputfile
+        self.outputfile = outputfile
+        self.ol = Overlay(bitfile)
+        self.dma = self.ol.axi_dma_0
+        self.ctrl_regs = self.ol.resize_accel_0
+        self.ishape_packed = $INPUT_SHAPE_PACKED$
+        self.oshape_packed = $OUTPUT_SHAPE_PACKED$
+        # neuron folding factor of output = iterations per sample
+        self.itersPerSample = self.oshape_packed[-2]
+        # AXI lite register offset for number of iterations
+        # used by TLastMarker to signal end of transmission for AXI CDMA
+        self.REG_OFFSET_NUM_ITERS = 0x10
+
+    def load_input(self):
+        N = self.N
+        ishape_normal = $INPUT_SHAPE_NORMAL$
+        # load desired input .npy file
+        ibuf_normal = np.load(self.inputfile)
+        # ensure that shape is as expected
+        assert ibuf_normal.shape == ishape_normal
+        return ibuf_normal
+
+    def pack_input(self, ibuf_normal):
+        N = self.N
+        # input FINN DataType
+        idt = $INPUT_FINN_DATATYPE$
+        ishape_folded = $INPUT_SHAPE_FOLDED$
+        # convert to folded form
+        ibuf_folded = ibuf_normal.reshape(ishape_folded)
+        # pack the input buffer, reversing both SIMD dim and endianness
+        ibuf_packed = finnpy_to_packed_bytearray(
+            ibuf_folded, idt, reverse_endian=True, reverse_inner=True
+        )
+        return ibuf_packed
+
+    def unpack_output(self, obuf_packed):
+        N = self.N
+        # output FINN DataType
+        odt = $OUTPUT_FINN_DATATYPE$
+        oshape_folded = $OUTPUT_SHAPE_FOLDED$
+        # unpack the packed output buffer from accelerator
+        obuf_folded = packed_bytearray_to_finnpy(
+            obuf_packed, odt, oshape_folded, reverse_endian=True, reverse_inner=True
+        )
+        return obuf_folded
+
+    def save_output(self, obuf_folded):
+        N = self.N
+        # convert to normal reshape and save
+        oshape_normal = $OUTPUT_SHAPE_NORMAL$
+        obuf_normal = obuf_folded.reshape(oshape_normal)
+        np.save(self.outputfile, obuf_normal)
+
+    def allocate_pynqbuffer(self, shape, data=None):
+        buf_device = allocate(shape=shape, dtype=np.uint8)
+
+        # if necessary copy the packed data into the PYNQ buffer
+        # TODO optimization: pack directly into the PYNQ buffer?
+        if data is not None:
+            np.copyto(buf_device, data)
+
+        return buf_device
+
+
+    def run_nw(self):
+        exec_mode = self.exec_mode
+        if exec_mode == "remote_pynq":
+            ibuf_normal = self.load_input()
+            ibuf_packed = self.pack_input(ibuf_normal)
+        elif exec_mode != "throughput_test":
+            raise Exception("Exec mode has to be set to remote_pynq or throughput_test")
+
+        # set up TLastMarker with correct num. samples
+        self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, N*self.itersPerSample)
+
+        # allocate a PYNQ buffer for the packed input buffer
+        if exec_mode == "remote_pynq":
+            ibuf_packed_device = self.allocate_pynqbuffer(self.ishape_packed, ibuf_packed)
+        else:
+            ibuf_packed_device = self.allocate_pynqbuffer(self.ishape_packed)
+
+        # allocate a PYNQ buffer for the returned packed output buffer
+        obuf_packed = self.allocate_pynqbuffer(self.oshape_packed)
+
+        if exec_mode == "throughput_test":
+            # measure runtime of network
+            start = time.time()
+            res={}
+
+        # set up the DMA and wait until all transfers complete
+        dma = self.dma
+        dma.sendchannel.transfer(ibuf_packed_device)
+        dma.recvchannel.transfer(obuf_packed)
+        dma.sendchannel.wait()
+        dma.recvchannel.wait()
+
+
+        if exec_mode == "throughput_test":
+            end = time.time()
+            runtime = end - start
+            res["runtime[ms]"] = runtime*1000
+            res["throughput[images/s]"] = N / runtime
+            res["DRAM_in_bandwidth[Mb/s]"] = np.prod(self.ishape_packed)*0.000001 / runtime
+            res["DRAM_out_bandwidth[Mb/s]"] = np.prod(self.oshape_packed)*0.000001 / runtime
+            file = open("nw_metrics.txt", "w")
+            file.write(str(res))
+            file.close()
+        else:
+            obuf_folded = self.unpack_output(obuf_packed)
+            self.save_output(obuf_folded)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Set exec mode, batchsize N, bitfile name, inputfile name and outputfile name')
+    parser.add_argument('exec_mode', help='Please select functional verification ("remote_pynq") or throughput test ("throughput_test")')
+    parser.add_argument('N', help='number of samples for inference', type=int)
+    parser.add_argument('bitfile', default="resizer.bit")
+    parser.add_argument('inputfile', default="input.npy")
+    parser.add_argument('outputfile', default="output.npy")
+    args = parser.parse_args()
+    exec_mode = args.exec_mode
+    N = args.N
+    bitfile = args.bitfile
+    inputfile = args.inputfile
+    outputfile = args.outputfile
+
+    Test = RemoteTest(exec_mode, N, bitfile, inputfile, outputfile)
+    Test.run_nw()
+
 """
diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py
index 527145ab0c1686fba7f93ddedfce6f28db09c01a..34e0df8402ea0d1b880781185cd17e3ccb1a0ae0 100644
--- a/tests/end2end/test_end2end_cnv_w1a1.py
+++ b/tests/end2end/test_end2end_cnv_w1a1.py
@@ -72,12 +72,13 @@ from finn.util.basic import pynq_part_map
 from finn.util.test import get_test_model_trained
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 5
-mem_mode = "const"
+mem_mode = "decoupled"
 
 
 def test_end2end_cnv_w1a1_export():
@@ -134,35 +135,32 @@ def test_end2end_cnv_w1a1_create_dataflow_partition():
 def test_end2end_cnv_w1a1_fold_and_tlastmarker():
     model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx")
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
-    fc0w = getCustomOp(fc_layers[0])
-    fc1w = getCustomOp(fc_layers[1])
-    fc2w = getCustomOp(fc_layers[2])
-    fc3w = getCustomOp(fc_layers[3])
-    fc4w = getCustomOp(fc_layers[4])
-    fc5w = getCustomOp(fc_layers[5])
-    fc6w = getCustomOp(fc_layers[6])
-    fc7w = getCustomOp(fc_layers[7])
-    fc8w = getCustomOp(fc_layers[8])
-    fc0w.set_nodeattr("SIMD", 27)
-    fc0w.set_nodeattr("PE", 8)
-    fc1w.set_nodeattr("SIMD", 32)
-    fc1w.set_nodeattr("PE", 8)
-    fc2w.set_nodeattr("SIMD", 32)
-    fc2w.set_nodeattr("PE", 16)
-    fc3w.set_nodeattr("SIMD", 32)
-    fc3w.set_nodeattr("PE", 16)
-    fc4w.set_nodeattr("SIMD", 32)
-    fc4w.set_nodeattr("PE", 32)
-    fc5w.set_nodeattr("SIMD", 64)
-    fc5w.set_nodeattr("PE", 16)
-    fc6w.set_nodeattr("SIMD", 32)
-    fc6w.set_nodeattr("PE", 16)
-    fc7w.set_nodeattr("SIMD", 64)
-    fc7w.set_nodeattr("PE", 8)
-    fc8w.set_nodeattr("SIMD", 16)
-    fc8w.set_nodeattr("PE", 10)
+    # each tuple is (PE, SIMD, in_fifo_depth) for a layer
+    folding = [
+        (16, 3, 128),
+        (32, 32, 128),
+        (16, 32, 128),
+        (16, 32, 128),
+        (4, 32, 81),
+        (1, 32, 2),
+        (1, 4, 2),
+        (1, 8, 128),
+        (5, 1, 3),
+    ]
+    for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
+
+    swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
+    for i in range(len(swg_layers)):
+        swg_inst = getCustomOp(swg_layers[i])
+        simd = folding[i][1]
+        swg_inst.set_nodeattr("SIMD", simd)
 
     model = model.transform(InsertDWC())
+    model = model.transform(InsertFIFO())
     model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(AnnotateResources("estimate"))
@@ -284,8 +282,9 @@ def test_end2end_cnv_w1a1_deploy_on_pynq():
             pytest.skip("PYNQ board IP address not specified")
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         # save the model to be able to link it to the parent
         model.save(build_dir + "/end2end_cnv_w1a1_pynq_deploy.onnx")
     except KeyError:
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
similarity index 91%
rename from tests/end2end/test_end2end_tfc_w1a1.py
rename to tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
index faf2d1031562ea9dabd63345c3583bfcffa83081..ded0bd107ab9f15a72018137c79eac640e09d3a2 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
@@ -42,6 +42,7 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
 from finn.core.modelwrapper import ModelWrapper
 from finn.core.onnx_exec import execute_onnx
+from finn.core.throughput_test import throughput_test
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from finn.transformation.fold_constants import FoldConstants
@@ -77,7 +78,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
-target_clk_ns = 5
+target_clk_ns = 10
 mem_mode = "decoupled"
 
 
@@ -134,28 +135,42 @@ def test_end2end_tfc_w1a1_fold_and_tlastmarker():
     fc1w = getCustomOp(fc_layers[1])
     fc2w = getCustomOp(fc_layers[2])
     fc3w = getCustomOp(fc_layers[3])
-    fc0w.set_nodeattr("inFIFODepth", 50)
-    fc0w.set_nodeattr("SIMD", 16)
+    fc0w.set_nodeattr("inFIFODepth", 256)
+    fc0w.set_nodeattr("SIMD", 196)
     fc0w.set_nodeattr("PE", 16)
-    fc0w.set_nodeattr("outFIFODepth", 4)
+    fc0w.set_nodeattr("outFIFODepth", 64)
+    fc1w.set_nodeattr("inFIFODepth", 64)
     fc0w.set_nodeattr("ram_style", "block")
-    fc1w.set_nodeattr("inFIFODepth", 4)
-    fc1w.set_nodeattr("SIMD", 8)
-    fc1w.set_nodeattr("PE", 8)
-    fc1w.set_nodeattr("outFIFODepth", 4)
-    fc2w.set_nodeattr("inFIFODepth", 4)
+    fc1w.set_nodeattr("SIMD", 16)
+    fc1w.set_nodeattr("PE", 16)
+    fc1w.set_nodeattr("outFIFODepth", 64)
+    fc2w.set_nodeattr("inFIFODepth", 64)
     fc2w.set_nodeattr("SIMD", 16)
     fc2w.set_nodeattr("PE", 16)
-    fc2w.set_nodeattr("outFIFODepth", 4)
-    fc3w.set_nodeattr("inFIFODepth", 4)
+    fc2w.set_nodeattr("outFIFODepth", 64)
+    fc3w.set_nodeattr("inFIFODepth", 64)
     fc3w.set_nodeattr("SIMD", 16)
     fc3w.set_nodeattr("PE", 10)
-    fc3w.set_nodeattr("outFIFODepth", 50)
+    fc3w.set_nodeattr("outFIFODepth", 10)
     fc3w.set_nodeattr("ram_style", "distributed")
     model = model.transform(InsertDWC())
     model = model.transform(InsertFIFO())
     model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
+    fifos = []
+    for n in model.graph.node:
+        if n.op_type == "StreamingFIFO":
+            fifos.append(n)
+    fifo0 = getCustomOp(fifos[0])
+    fifo1 = getCustomOp(fifos[1])
+    fifo2 = getCustomOp(fifos[2])
+    fifo3 = getCustomOp(fifos[3])
+    fifo4 = getCustomOp(fifos[4])
+    fifo0.set_nodeattr("depth", 256)
+    fifo1.set_nodeattr("depth", 64)
+    fifo2.set_nodeattr("depth", 64)
+    fifo3.set_nodeattr("depth", 64)
+    fifo4.set_nodeattr("depth", 10)
     model = model.transform(AnnotateResources("estimate"))
     model.save(build_dir + "/end2end_tfc_w1a1_folded.onnx")
 
@@ -269,8 +284,9 @@ def test_end2end_tfc_w1a1_deploy_on_pynq():
             pytest.skip("PYNQ board IP address not specified")
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         # save the model to be able to link it to the parent
         model.save(build_dir + "/end2end_tfc_w1a1_pynq_deploy.onnx")
     except KeyError:
@@ -305,6 +321,9 @@ def test_end2end_tfc_w1a1_run_on_pynq():
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
         assert np.isclose(y, y_golden).all()
+        child_model = ModelWrapper(sdp_node.get_nodeattr("model"))
+        res = throughput_test(child_model)
+        assert res is not None
 
     except KeyError:
         pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/end2end/test_end2end_tfc_w1a2.py b/tests/end2end/test_end2end_tfc_w1a2.py
index 18b42e245819ef9b4ea188a2e153043f85feb31a..52771e6d149810d70f908ac2af07e1d81f8f46ec 100644
--- a/tests/end2end/test_end2end_tfc_w1a2.py
+++ b/tests/end2end/test_end2end_tfc_w1a2.py
@@ -72,7 +72,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
-target_clk_ns = 5
+target_clk_ns = 10
 mem_mode = "decoupled"
 
 
@@ -253,8 +253,9 @@ def test_end2end_tfc_w1a2_deploy_on_pynq():
             pytest.skip("PYNQ board IP address not specified")
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         # save the model to be able to link it to the parent
         model.save(build_dir + "/end2end_tfc_w1a2_pynq_deploy.onnx")
     except KeyError:
diff --git a/tests/end2end/test_end2end_tfc_w2a2.py b/tests/end2end/test_end2end_tfc_w2a2.py
index bd52a4585f660bfc46822722acd3188ba403d200..67111da400d475311cc29b45bb24573128981958 100644
--- a/tests/end2end/test_end2end_tfc_w2a2.py
+++ b/tests/end2end/test_end2end_tfc_w2a2.py
@@ -72,7 +72,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
-target_clk_ns = 5
+target_clk_ns = 10
 mem_mode = "decoupled"
 
 
@@ -253,8 +253,9 @@ def test_end2end_tfc_w2a2_deploy_on_pynq():
             pytest.skip("PYNQ board IP address not specified")
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         # save the model to be able to link it to the parent
         model.save(build_dir + "/end2end_tfc_w2a2_pynq_deploy.onnx")
     except KeyError:
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 2ec47915b01c92c7b7c11d0cf160543fb71dd27d..7c19ebbfaeed09cb1e367cf6567e5b149aa4236c 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -130,13 +130,14 @@ def prepare_inputs(input_tensor):
 # input dimension
 @pytest.mark.parametrize("ifm_dim", [4, 6, 8])
 # input channels
-@pytest.mark.parametrize("ifm_ch", [1, 2])  # , 2, 3, 4])
+@pytest.mark.parametrize("ifm_ch", [2, 4])  # , 2, 3, 4])
 # Stride
 @pytest.mark.parametrize("stride", [1, 2])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["npysim", "rtlsim"])
-def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode):
-    simd = ifm_ch
+# input channel parallelism ("SIMD")
+@pytest.mark.parametrize("simd", [1, 2])
+def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd):
     ofm_dim = int(((ifm_dim - k) / stride) + 1)
 
     x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index fa80f0050f0cc687c20a8e1007ed67b63989b977..8ab4809928d91d8456b7720f897763b206c4e5f5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -1,16 +1,38 @@
 import pytest
+import os
 
 from onnx import TensorProto, helper
 
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
+from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch
 
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
+
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
+
 from finn.util.basic import gen_finn_dt_tensor
+
 import finn.core.onnx_exec as oxe
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+from finn.util.basic import pynq_part_map
+from finn.core.throughput_test import throughput_test
+
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 10
 
 
 def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype):
@@ -47,13 +69,13 @@ def prepare_inputs(input_tensor, dt):
 
 
 # shape
-@pytest.mark.parametrize("Shape", [[1, 4]])
+@pytest.mark.parametrize("Shape", [[1, 128]])
 # inWidth
-@pytest.mark.parametrize("folded_shape", [[1, 1, 4]])
+@pytest.mark.parametrize("folded_shape", [[1, 1, 128]])
 # outWidth
-@pytest.mark.parametrize("depth", [2])
+@pytest.mark.parametrize("depth", [16])
 # finn_dtype
-@pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR])  # , DataType.INT2])
 def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):
 
     # generate input data
@@ -63,13 +85,40 @@ def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):
     model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype)
 
     model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
-    model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
+    model = model.transform(CodeGen_ipgen(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynth_IPGen())
+    model = model.transform(PrepareRTLSim())
     y = oxe.execute_onnx(model, input_dict)["outp"]
-
     assert (
         y == x
     ).all(), """The output values are not the same as the
-        input values anymore."""
+       input values anymore."""
     assert y.shape == tuple(Shape), """The output shape is incorrect."""
+
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(CodeGen_ipstitch(test_fpga_part))
+    model = model.transform(MakePYNQProject(test_pynq_board))
+    model = model.transform(SynthPYNQProject())
+    model = model.transform(MakePYNQDriver())
+    ip = os.environ["PYNQ_IP"]
+    username = os.getenv("PYNQ_USERNAME", "xilinx")
+    password = os.getenv("PYNQ_PASSWORD", "xilinx")
+    port = os.getenv("PYNQ_PORT", 22)
+    target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+    model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+
+    res = throughput_test(model)
+    expected_dict = {}
+    expected_dict["runtime[ms]"] = []
+    expected_dict["throughput[images/s]"] = []
+    expected_dict["DRAM_in_bandwidth[Mb/s]"] = []
+    expected_dict["DRAM_out_bandwidth[Mb/s]"] = []
+    for key in expected_dict:
+        assert (
+            key in res
+        ), """Throughput test not successful, no value for {}
+        in result dictionary""".format(
+            key
+        )
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
index 1c5ae02e4c662f48be4f7f70b9de24a1f9f72ecf..af0c7b0755c7aad5dd145ea5ea8ace59941dd74a 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
@@ -69,8 +69,8 @@ def create_one_fc_model():
     no_act = 1
     binary_xnor_mode = 0
     actval = 0
-    simd = 2
-    pe = 2
+    simd = 4
+    pe = 4
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, m])
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m])
@@ -199,7 +199,7 @@ def create_two_fc_model():
 # exec_mode of StreamingDataflowPartition
 # @pytest.mark.parametrize("exec_mode", ["remote_pynq"]) #, "rtlsim"])
 def test_fpgadataflow_ipstitch_gen_model():  # exec_mode):
-    model = create_two_fc_model()
+    model = create_one_fc_model()
     if model.graph.node[0].op_type == "StreamingDataflowPartition":
         sdp_node = getCustomOp(model.graph.node[0])
         assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
@@ -246,6 +246,23 @@ def test_fpgadataflow_ipstitch_rtlsim():
         "out_r_0_tlast",
         "out_r_0_tready",
         "out_r_0_tvalid",
+        "s_axi_control_0_araddr",
+        "s_axi_control_0_arready",
+        "s_axi_control_0_arvalid",
+        "s_axi_control_0_awaddr",
+        "s_axi_control_0_awready",
+        "s_axi_control_0_awvalid",
+        "s_axi_control_0_bready",
+        "s_axi_control_0_bresp",
+        "s_axi_control_0_bvalid",
+        "s_axi_control_0_rdata",
+        "s_axi_control_0_rready",
+        "s_axi_control_0_rresp",
+        "s_axi_control_0_rvalid",
+        "s_axi_control_0_wdata",
+        "s_axi_control_0_wready",
+        "s_axi_control_0_wstrb",
+        "s_axi_control_0_wvalid",
     ]
     assert dir(sim.io) == exp_io
     model.set_metadata_prop("exec_mode", "rtlsim")
@@ -295,8 +312,9 @@ def test_fpgadataflow_ipstitch_pynq_deployment_folder():
         )
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         pynq_ip = model.get_metadata_prop("pynq_ip")
         pynq_username = model.get_metadata_prop("pynq_username")
         pynq_password = model.get_metadata_prop("pynq_password")
@@ -326,8 +344,10 @@ def test_fpgadataflow_ipstitch_remote_execution():
         model = ModelWrapper(
             ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_deployment.onnx"
         )
-        idt = DataType.INT2
-        x = gen_finn_dt_tensor(idt, (1, 4))
+        iname = "inp"
+        idt = model.get_tensor_datatype(iname)
+        ishape = model.get_tensor_shape(iname)
+        x = gen_finn_dt_tensor(idt, ishape)
         input_dict = {"inp": x}
         outp = execute_onnx(model, input_dict)
         assert np.isclose(outp["outp"], x).all()