diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index e28492bd31f3a2115ac566ed06a0125d348208f4..f1ad0dac4691741618694c2e60d52b5429635740 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -96,8 +96,8 @@ RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 RUN git -C /workspace/pyverilator checkout 307fc5c82db748620836307a2002fdc9fe170226
 
 # PYNQ-HelloWorld
-RUN git clone --branch feature/synth_rpt https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld
-RUN git -C /workspace/PYNQ-HelloWorld checkout db7e418767ce2a8e08fe732ddb3aa56ee79b7560
+RUN git clone https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld
+RUN git -C /workspace/PYNQ-HelloWorld checkout 8c46ceb0cfaa7d6756e9b6ef2d337202ae27ea7c
 
 # Note that we expect the cloned finn directory on the host to be
 # mounted on /workspace/finn -- see run-docker.sh for an example
diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index 80b015f6d4eb69df36831b25262cda3539ac8ae9..6c619c51ceb4a99a077fc61c52ce81763cfd27f5 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -193,7 +193,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
       if (shift_en_) begin
 	 // synthesis loop_limit 256
 	 for (a_=depth-2; a_>0; a_=a_-1) begin
-	    srl[a_] <= srl[a_-1];
+	    srl[a_] = srl[a_-1];
 	 end
 	 srl[0] <= i_d;
       end
diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/tfc_end2end_example.ipynb
index 64a87571afcc52aa081344f9579f4ba74111a1a9..2f8e0207fb5856182ca77f6df03f3cef572eeaab 100644
--- a/notebooks/end2end_example/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/tfc_end2end_example.ipynb
@@ -42,7 +42,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -85,9 +85,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/brevitas_cnv_lfc/training_scripts/models/TFC.py:85: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
+      "  x = 2.0 * x - torch.tensor([1.0]).to(self.device)\n"
+     ]
+    }
+   ],
    "source": [
     "import onnx\n",
     "from finn.util.test import get_test_model_trained\n",
@@ -107,15 +116,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
-      "Stopping http://0.0.0.0:8081\n",
       "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
      ]
     },
@@ -133,10 +140,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f186ccfbe10>"
+       "<IPython.lib.display.IFrame at 0x7f4310b476a0>"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -154,7 +161,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -239,7 +246,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -266,7 +273,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -292,10 +299,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f186e386240>"
+       "<IPython.lib.display.IFrame at 0x7f43177c2a20>"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -316,7 +323,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -329,11 +336,14 @@
       "    def apply(self, model):\n",
       "        streamline_transformations = [\n",
       "            ConvertSubToAdd(),\n",
+      "            ConvertDivToMul(),\n",
       "            BatchNormToAffine(),\n",
       "            ConvertSignToThres(),\n",
       "            MoveAddPastMul(),\n",
       "            MoveScalarAddPastMatMul(),\n",
+      "            MoveScalarAddPastConv(),\n",
       "            MoveScalarMulPastMatMul(),\n",
+      "            MoveScalarMulPastConv(),\n",
       "            MoveAddPastMul(),\n",
       "            CollapseRepeatedAdd(),\n",
       "            CollapseRepeatedMul(),\n",
@@ -341,6 +351,7 @@
       "            FactorOutMulSignMagnitude(),\n",
       "            AbsorbMulIntoMultiThreshold(),\n",
       "            Absorb1BitMulIntoMatMul(),\n",
+      "            Absorb1BitMulIntoConv(),\n",
       "            RoundAndClipThresholds(),\n",
       "        ]\n",
       "        for trn in streamline_transformations:\n",
@@ -369,7 +380,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -395,10 +406,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f186cd470b8>"
+       "<IPython.lib.display.IFrame at 0x7f431826d860>"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -423,7 +434,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -449,10 +460,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f17f04bbc18>"
+       "<IPython.lib.display.IFrame at 0x7f42977e39b0>"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -490,7 +501,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 10,
    "metadata": {
     "scrolled": false
    },
@@ -518,10 +529,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f1868061eb8>"
+       "<IPython.lib.display.IFrame at 0x7f43177c73c8>"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -552,7 +563,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -578,10 +589,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f186cc55e48>"
+       "<IPython.lib.display.IFrame at 0x7f43177c2f60>"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -604,7 +615,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -613,7 +624,7 @@
      "text": [
       "\n",
       "Stopping http://0.0.0.0:8081\n",
-      "Serving '/tmp/finn_maltanar/dataflow_partition_h1c4i5gn/df_model.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/tmp/finn_jakobap/dataflow_partition_sqcfkplo/df_model.onnx' at http://0.0.0.0:8081\n"
      ]
     },
     {
@@ -630,10 +641,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f17f04c70f0>"
+       "<IPython.lib.display.IFrame at 0x7f42977d4978>"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -654,7 +665,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -674,7 +685,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -708,7 +719,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -734,17 +745,24 @@
        " 'noActivation': ('i', False, 0),\n",
        " 'inFIFODepth': ('i', False, 0),\n",
        " 'outFIFODepth': ('i', False, 0),\n",
+       " 'numInputVectors': ('ints', False, [1]),\n",
+       " 'mem_mode': ('s', False, 'const'),\n",
        " 'backend': ('s', True, 'fpgadataflow'),\n",
        " 'code_gen_dir_npysim': ('s', False, ''),\n",
        " 'code_gen_dir_ipgen': ('s', False, ''),\n",
        " 'executable_path': ('s', False, ''),\n",
        " 'ipgen_path': ('s', False, ''),\n",
+       " 'ip_path': ('s', False, ''),\n",
+       " 'ip_vlnv': ('s', False, ''),\n",
        " 'exec_mode': ('s', False, ''),\n",
        " 'sim_cycles': ('i', False, 0),\n",
-       " 'rtlsim_trace': ('s', False, '')}"
+       " 'rtlsim_trace': ('s', False, ''),\n",
+       " 'res_estimate': ('s', False, ''),\n",
+       " 'res_hls': ('s', False, ''),\n",
+       " 'res_synth': ('s', False, '')}"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -770,7 +788,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -804,7 +822,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -830,10 +848,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f1868061d30>"
+       "<IPython.lib.display.IFrame at 0x7f43177c7518>"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -866,14 +884,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "dict_keys(['Ultra96', 'Pynq-Z1'])\n"
+      "dict_keys(['Ultra96', 'Pynq-Z1', 'Pynq-Z2', 'ZCU104'])\n"
      ]
     }
    ],
@@ -885,12 +903,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
     "# change this if you have a different PYNQ board, see list above\n",
-    "pynq_board = \"Ultra96\"\n",
+    "pynq_board = \"Pynq-Z1\"\n",
     "fpga_part = pynq_part_map[pynq_board]\n",
     "target_clk_ns = 5"
    ]
@@ -916,7 +934,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -938,7 +956,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -957,7 +975,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -983,10 +1001,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f17f04c9470>"
+       "<IPython.lib.display.IFrame at 0x7f42977edf60>"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1008,7 +1026,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -1037,7 +1055,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -1045,8 +1063,8 @@
      "output_type": "stream",
      "text": [
       "#!/bin/bash \r\n",
-      "cd /tmp/finn_maltanar/code_gen_ipgen_StreamingFCLayer_Batch_5f0hmok_\r\n",
-      "vivado_hls /tmp/finn_maltanar/code_gen_ipgen_StreamingFCLayer_Batch_5f0hmok_/hls_syn_StreamingFCLayer_Batch_0.tcl\r\n",
+      "cd /tmp/finn_jakobap/code_gen_ipgen_StreamingFCLayer_Batch_0_pfp8r_i6\r\n",
+      "vivado_hls /tmp/finn_jakobap/code_gen_ipgen_StreamingFCLayer_Batch_0_pfp8r_i6/hls_syn_StreamingFCLayer_Batch_0.tcl\r\n",
       "cd /workspace/finn\r\n"
      ]
     }
@@ -1067,7 +1085,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -1077,9 +1095,9 @@
       "\r\n",
       "set config_proj_name project_StreamingFCLayer_Batch_0\r\n",
       "puts \"HLS project: $config_proj_name\"\r\n",
-      "set config_hwsrcdir \"/tmp/finn_maltanar/code_gen_ipgen_StreamingFCLayer_Batch_5f0hmok_\"\r\n",
+      "set config_hwsrcdir \"/tmp/finn_jakobap/code_gen_ipgen_StreamingFCLayer_Batch_0_pfp8r_i6\"\r\n",
       "puts \"HW source dir: $config_hwsrcdir\"\r\n",
-      "set config_proj_part \"xczu3eg-sbva484-1-e\"\r\n",
+      "set config_proj_part \"xc7z020clg400-1\"\r\n",
       "\r\n",
       "set config_bnnlibdir \"/workspace/finn-hlslib\"\r\n",
       "\r\n",
@@ -1128,7 +1146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1148,22 +1166,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        "]"
       ]
      },
-     "execution_count": 31,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1174,16 +1192,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo'"
+       "'/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j'"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1208,7 +1226,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1248,7 +1266,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 31,
    "metadata": {
     "scrolled": true
    },
@@ -1257,17 +1275,19 @@
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        ", key: \"vivado_pynq_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j\"\n",
+       ", key: \"vivado_synth_rpt\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/synth_report.xml\"\n",
        "]"
       ]
      },
-     "execution_count": 34,
+     "execution_count": 31,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1281,7 +1301,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -1307,7 +1327,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1330,26 +1350,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
+   "execution_count": 34,
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        ", key: \"vivado_pynq_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j\"\n",
+       ", key: \"vivado_synth_rpt\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/synth_report.xml\"\n",
        ", key: \"vivado_pynq_bitfile\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo/resizer.bit\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/resizer.bit\"\n",
        "]"
       ]
      },
-     "execution_count": 37,
+     "execution_count": 34,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1363,7 +1387,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1381,7 +1405,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1399,74 +1423,131 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "\r\n",
+      "import argparse\r\n",
       "\r\n",
       "from pynq import Overlay\r\n",
       "import numpy as np\r\n",
       "from pynq import allocate\r\n",
+      "import time\r\n",
       "from finn.util.data_packing import (\r\n",
       "    finnpy_to_packed_bytearray,\r\n",
       "    packed_bytearray_to_finnpy\r\n",
       ")\r\n",
       "from finn.core.datatype import DataType\r\n",
       "\r\n",
-      "bitfile_path = \"resizer.bit\"\r\n",
-      "ol = Overlay(bitfile_path)\r\n",
-      "dma=ol.axi_dma_0\r\n",
+      "def load_input(N):\r\n",
+      "    ishape_normal = (N, 784)\r\n",
+      "    # load desired input .npy file\r\n",
+      "    ibuf_normal = np.load(\"input.npy\")\r\n",
+      "    # ensure that shape is as expected\r\n",
+      "    assert ibuf_normal.shape == ishape_normal\r\n",
+      "    return ibuf_normal\r\n",
       "\r\n",
-      "# declare input/output types and shapes for the accelerator\r\n",
-      "# input FINN DataType\r\n",
-      "idt = DataType.BINARY\r\n",
-      "# normal, folded and packed input shapes\r\n",
-      "ishape_normal = (1, 784)\r\n",
-      "ishape_folded = (1, 49, 16)\r\n",
-      "ishape_packed = (1, 49, 2)\r\n",
-      "# output FINN DataType\r\n",
-      "odt = DataType.UINT32\r\n",
-      "# normal, folded and packed output shapes\r\n",
-      "oshape_normal = (1, 10)\r\n",
-      "oshape_folded = (1, 1, 10)\r\n",
-      "oshape_packed = (1, 1, 40)\r\n",
+      "def pack_input(ibuf_normal, N):\r\n",
+      "    # input FINN DataType\r\n",
+      "    idt = DataType.BINARY\r\n",
+      "    ishape_folded = (N, 49, 16)\r\n",
+      "    # convert to folded form\r\n",
+      "    ibuf_folded = ibuf_normal.reshape(ishape_folded)\r\n",
+      "    # pack the input buffer, reversing both SIMD dim and endianness\r\n",
+      "    ibuf_packed = finnpy_to_packed_bytearray(\r\n",
+      "        ibuf_folded, idt, reverse_endian=True, reverse_inner=True\r\n",
+      "    )\r\n",
+      "    return ibuf_packed\r\n",
       "\r\n",
-      "# load desired input .npy file\r\n",
-      "ibuf_normal = np.load(\"input.npy\")\r\n",
-      "# ensure that shape is as expected\r\n",
-      "assert ibuf_normal.shape == ishape_normal\r\n",
-      "# convert to folded form\r\n",
-      "ibuf_folded = ibuf_normal.reshape(ishape_folded)\r\n",
+      "def unpack_output(obuf_packed, N):\r\n",
+      "    # output FINN DataType\r\n",
+      "    odt = DataType.UINT32\r\n",
+      "    oshape_folded = (N, 1, 10)\r\n",
+      "    # unpack the packed output buffer from accelerator\r\n",
+      "    obuf_folded = packed_bytearray_to_finnpy(\r\n",
+      "        obuf_packed, odt, oshape_folded, reverse_endian=True, reverse_inner=True\r\n",
+      "    )\r\n",
+      "    return obuf_folded\r\n",
       "\r\n",
-      "# pack the input buffer, reversing both SIMD dim and endianness\r\n",
-      "ibuf_packed = finnpy_to_packed_bytearray(\r\n",
-      "    ibuf_folded, idt, reverse_endian=True, reverse_inner=True\r\n",
-      ")\r\n",
-      "# allocate a PYNQ buffer for the packed input buffer\r\n",
-      "ibuf_packed_device = allocate(shape=ishape_packed, dtype=np.uint8)\r\n",
-      "# copy the packed data into the PYNQ buffer\r\n",
-      "# TODO optimization: pack directly into the PYNQ buffer?\r\n",
-      "np.copyto(ibuf_packed_device, ibuf_packed)\r\n",
+      "def save_output(obuf_folded, N):\r\n",
+      "    # convert to normal reshape and save\r\n",
+      "    oshape_normal = (N, 10)\r\n",
+      "    obuf_normal = obuf_folded.reshape(oshape_normal)\r\n",
+      "    np.save(\"output.npy\", obuf_normal)\r\n",
       "\r\n",
-      "# allocate a PYNQ buffer for the returned packed output buffer\r\n",
-      "obuf_packed = allocate(shape=oshape_packed, dtype=np.uint8)\r\n",
+      "if __name__ == \"__main__\":\r\n",
+      "    parser = argparse.ArgumentParser(description='Please select functional verification (\"remote_pynq\") or throughput test (\"throughput_test\")')\r\n",
+      "    parser.add_argument('exec_mode', help='metadata prop exec_mode as string')\r\n",
+      "    args = parser.parse_args()\r\n",
+      "    exec_mode = args.exec_mode\r\n",
       "\r\n",
-      "# set up the DMA and wait until all transfers complete\r\n",
-      "dma.sendchannel.transfer(ibuf_packed_device)\r\n",
-      "dma.recvchannel.transfer(obuf_packed)\r\n",
-      "dma.sendchannel.wait()\r\n",
-      "dma.recvchannel.wait()\r\n",
+      "    bitfile_path = \"resizer.bit\"\r\n",
+      "    ol = Overlay(bitfile_path)\r\n",
+      "    dma=ol.axi_dma_0\r\n",
+      "    ctrl_regs=ol.resize_accel_0\r\n",
+      "    # AXI lite register offset for number of iterations\r\n",
+      "    # used by TLastMarker to signal end of transmission for AXI CDMA\r\n",
+      "    REG_OFFSET_NUM_ITERS = 0x10\r\n",
       "\r\n",
-      "# unpack the packed output buffer from accelerator\r\n",
-      "obuf_folded = packed_bytearray_to_finnpy(\r\n",
-      "    obuf_packed, odt, oshape_folded, reverse_endian=True, reverse_inner=True\r\n",
-      ")\r\n",
-      "# convert to normal reshape and save\r\n",
-      "obuf_normal = obuf_folded.reshape(oshape_normal)\r\n",
-      "np.save(\"output.npy\", obuf_normal)\r\n"
+      "    # number of samples for inference\r\n",
+      "    if exec_mode == \"remote_pynq\":\r\n",
+      "        N = 1\r\n",
+      "    elif exec_mode == \"throughput_test\":\r\n",
+      "        res={}\r\n",
+      "        N = 1000\r\n",
+      "    else:\r\n",
+      "        raise Exception(\"Exec mode has to be set to remote_pynq or throughput_test\")\r\n",
+      "\r\n",
+      "    # declare input/output types and shapes for the accelerator\r\n",
+      "    ishape_packed = (N, 49, 2)\r\n",
+      "    oshape_packed = (N, 1, 40)\r\n",
+      "    \r\n",
+      "    if exec_mode == \"remote_pynq\":\r\n",
+      "        ibuf_normal = load_input(N)\r\n",
+      "        ibuf_packed = pack_input(ibuf_normal, N)\r\n",
+      "    elif exec_mode == \"throughput_test\":\r\n",
+      "        ibuf_packed = np.asarray(np.random.uniform(low=0, high=1, size=tuple(ishape_packed)), dtype=np.uint8)\r\n",
+      "\r\n",
+      "    # set up TLastMarker with correct num. samples\r\n",
+      "    ctrl_regs.write(REG_OFFSET_NUM_ITERS, N)\r\n",
+      "\r\n",
+      "    # allocate a PYNQ buffer for the packed input buffer\r\n",
+      "    ibuf_packed_device = allocate(shape=ishape_packed, dtype=np.uint8)\r\n",
+      "    # copy the packed data into the PYNQ buffer\r\n",
+      "    # TODO optimization: pack directly into the PYNQ buffer?\r\n",
+      "    np.copyto(ibuf_packed_device, ibuf_packed)\r\n",
+      "\r\n",
+      "    # allocate a PYNQ buffer for the returned packed output buffer\r\n",
+      "    obuf_packed = allocate(shape=oshape_packed, dtype=np.uint8)\r\n",
+      "\r\n",
+      "    if exec_mode == \"throughput_test\":\r\n",
+      "        # measure runtime of network\r\n",
+      "        start = time.time()\r\n",
+      "\r\n",
+      "    # set up the DMA and wait until all transfers complete\r\n",
+      "    dma.sendchannel.transfer(ibuf_packed_device)\r\n",
+      "    dma.recvchannel.transfer(obuf_packed)\r\n",
+      "    dma.sendchannel.wait()\r\n",
+      "    dma.recvchannel.wait()\r\n",
+      "\r\n",
+      "\r\n",
+      "    if exec_mode == \"throughput_test\":\r\n",
+      "        end = time.time()\r\n",
+      "        runtime = end - start\r\n",
+      "        res[\"runtime[ms]\"] = runtime*1000\r\n",
+      "        res[\"throughput[images/s]\"] = N / runtime\r\n",
+      "        file = open(\"nw_metrics.txt\", \"w\")\r\n",
+      "        file.write(str(res))\r\n",
+      "        file.close()\r\n",
+      "\r\n",
+      "    else:\r\n",
+      "        obuf_folded = unpack_output(obuf_packed, N)\r\n",
+      "        save_output(obuf_folded, N)\r\n",
+      "\r\n"
      ]
     }
    ],
@@ -1493,16 +1574,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 57,
    "metadata": {},
    "outputs": [],
    "source": [
     "from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ\n",
-    "ip = \"192.168.3.1\"\n",
+    "ip = \"51.37.26.64\"\n",
+    "port = \"23\"\n",
     "username = \"xilinx\"\n",
-    "password = \"xilinx\"\n",
+    "password = \"x1l1nx_f1nn\"\n",
     "target_dir = \"/home/xilinx/finn_tfc_end2end_example\"\n",
-    "model = model.transform(DeployToPYNQ(ip, username, password, target_dir))\n",
+    "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n",
     "model.save(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")"
    ]
   },
@@ -1515,42 +1597,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 58,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_stitch_proj_tqp4ib4j/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        ", key: \"vivado_pynq_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j\"\n",
+       ", key: \"vivado_synth_rpt\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/synth_report.xml\"\n",
        ", key: \"vivado_pynq_bitfile\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo/resizer.bit\"\n",
+       "value: \"/tmp/finn_jakobap/vivado_pynq_proj_gkwfg31j/resizer.bit\"\n",
        ", key: \"pynq_driver_dir\"\n",
-       "value: \"/tmp/finn_maltanar/pynq_driver_25t8u9sd\"\n",
+       "value: \"/tmp/finn_jakobap/pynq_driver_1r1_0kz6\"\n",
        ", key: \"pynq_ip\"\n",
-       "value: \"192.168.3.1\"\n",
+       "value: \"51.37.26.64\"\n",
+       ", key: \"pynq_port\"\n",
+       "value: \"23\"\n",
        ", key: \"pynq_username\"\n",
        "value: \"xilinx\"\n",
        ", key: \"pynq_password\"\n",
-       "value: \"xilinx\"\n",
+       "value: \"x1l1nx_f1nn\"\n",
        ", key: \"pynq_target_dir\"\n",
        "value: \"/home/xilinx/finn_tfc_end2end_example\"\n",
        ", key: \"pynq_deployment_dir\"\n",
-       "value: \"/tmp/finn_maltanar/pynq_deployment_mpyziv7h\"\n",
+       "value: \"/tmp/finn_jakobap/pynq_deployment_kvurnk0c\"\n",
        ", key: \"pynq_deploy_dir\"\n",
-       "value: \"/tmp/finn_maltanar/pynq_deployment_mpyziv7h\"\n",
+       "value: \"/tmp/finn_jakobap/pynq_deployment_kvurnk0c\"\n",
        ", key: \"exec_mode\"\n",
        "value: \"remote_pynq\"\n",
        "]"
       ]
      },
-     "execution_count": 42,
+     "execution_count": 58,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1561,34 +1647,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 59,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/home/xilinx/finn_tfc_end2end_example/pynq_deployment_1oyo7x66:\r\n",
-      "total 5820\r\n",
-      "-rw-r--r-- 1 xilinx xilinx    1934 Feb 13 13:36 driver.py\r\n",
-      "drwxr-xr-x 4 xilinx xilinx    4096 Feb 13 13:36 finn\r\n",
-      "-rw-r--r-- 1 xilinx xilinx    3264 Feb 13 14:24 input.npy\r\n",
-      "-rw-r--r-- 1 root   root       120 Feb 13 14:24 output.npy\r\n",
-      "-rw-r--r-- 1 xilinx xilinx 5568787 Feb 13 13:36 resizer.bit\r\n",
-      "-rw-r--r-- 1 xilinx xilinx  368173 Feb 13 13:36 resizer.hwh\r\n",
-      "-rw-r--r-- 1 root   root        32 Feb 13 14:24 sds_trace_data.dat\r\n",
-      "\r\n",
-      "/home/xilinx/finn_tfc_end2end_example/pynq_deployment_mpyziv7h:\r\n",
-      "total 5808\r\n",
-      "-rw-r--r-- 1 xilinx xilinx    1934 Feb 28 16:09 driver.py\r\n",
-      "drwxr-xr-x 4 xilinx xilinx    4096 Feb 28 16:09 finn\r\n",
-      "-rw-r--r-- 1 xilinx xilinx 5568787 Feb 28 16:09 resizer.bit\r\n",
-      "-rw-r--r-- 1 xilinx xilinx  368173 Feb 28 16:09 resizer.hwh\r\n"
+      "total 4284\r\n",
+      "-rw-r--r-- 1 xilinx xilinx    3861 Apr 27 12:36 driver.py\r\n",
+      "drwxr-xr-x 4 xilinx xilinx    4096 Apr 27 12:37 finn\r\n",
+      "-rw-r--r-- 1 xilinx xilinx 4045675 Apr 27 12:36 resizer.bit\r\n",
+      "-rw-r--r-- 1 xilinx xilinx  329531 Apr 27 12:36 resizer.hwh\r\n"
      ]
     }
    ],
    "source": [
-    "! sshpass -p {password} ssh {username}@{ip} 'ls -l {target_dir}/*'"
+    "! sshpass -p {password} ssh {username}@{ip} -p {port} 'ls -l {target_dir}/*'"
    ]
   },
   {
@@ -1600,16 +1675,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 60,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<matplotlib.image.AxesImage at 0x7f17e0a82e10>"
+       "<matplotlib.image.AxesImage at 0x7f4277550ef0>"
       ]
      },
-     "execution_count": 44,
+     "execution_count": 60,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1633,7 +1708,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 61,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1653,7 +1728,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 62,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1675,7 +1750,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 63,
    "metadata": {},
    "outputs": [
     {
@@ -1684,13 +1759,13 @@
        "<BarContainer object of 10 artists>"
       ]
      },
-     "execution_count": 49,
+     "execution_count": 63,
      "metadata": {},
      "output_type": "execute_result"
     },
     {
      "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAMoUlEQVR4nO3cf6jd913H8edryercD1sxV9AkLgEzNQyl5dJVC1pshbSV5A9FGqjoKMs/y6yuKJlKHfWfzcn8gXUa5xzO2azWIcFGI7iKILbkdp11SYxcstrcrNK7rtYfQ7Pg2z/uiZzd3ptzkp57T/u+zwcEzvf7/XC+75ObPDn3e36kqpAkvfa9btoDSJImw6BLUhMGXZKaMOiS1IRBl6QmNk/rxFu2bKkdO3ZM6/SS9Jr05JNPfqmqZlY6NrWg79ixg7m5uWmdXpJek5L8y2rHvOQiSU0YdElqwqBLUhMjg57kY0meT/L5VY4nyW8mmU/ydJIbJj+mJGmUcZ6hfxzYc5njtwO7Bn8OAB955WNJkq7UyKBX1d8CX77Mkn3AH9aSx4HrknzLpAaUJI1nEtfQtwLnhrYXBvteJsmBJHNJ5hYXFydwaknSJev6omhVHa6q2aqanZlZ8X3xkqSrNImgnwe2D21vG+yTJK2jSXxS9ChwMMkR4B3AS1X13ATuV8vsOPTomp/jmQ/cuebnkLQ2RgY9yUPALcCWJAvALwGvB6iq3wGOAXcA88BXgHeu1bCSpNWNDHpV7R9xvIB3T2wiSdJV8ZOiktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1MRYQU+yJ8mZJPNJDq1w/NuSPJbkqSRPJ7lj8qNKki5nZNCTbAIeBG4HdgP7k+xetuwXgYer6nrgLuC3Jz2oJOnyxnmGfiMwX1Vnq+oCcATYt2xNAd8wuH0t8MXJjShJGsc4Qd8KnBvaXhjsG/Z+4O4kC8Ax4D0r3VGSA0nmkswtLi5exbiSpNVM6kXR/cDHq2obcAfwiSQvu++qOlxVs1U1OzMzM6FTS5JgvKCfB7YPbW8b7Bt2D/AwQFX9PfAGYMskBpQkjWecoJ8AdiXZmeQall70PLpszbPArQBJvouloHtNRZLW0cigV9VF4CBwHDjN0rtZTiZ5IMnewbL7gHcl+QfgIeAnq6rWamhJ0sttHmdRVR1j6cXO4X33D90+Bdw82dEkSVfCT4pKUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSE2MFPcmeJGeSzCc5tMqaH0tyKsnJJH882TElSaNsHrUgySbgQeCHgAXgRJKjVXVqaM0u4H3AzVX1YpJvXquBJUkrG+cZ+o3AfFWdraoLwBFg37I17wIerKoXAarq+cmOKUkaZZygbwXODW0vDPYNexvwtiR/l+TxJHtWuqMkB5LMJZlbXFy8uoklSSua1Iuim4FdwC3AfuD3kly3fFFVHa6q2aqanZmZmdCpJUkwXtDPA9uHtrcN9g1bAI5W1Ver6gvAP7MUeEnSOhkn6CeAXUl2JrkGuAs4umzNn7H07JwkW1i6BHN2gnNKkkYYGfSquggcBI4Dp4GHq+pkkgeS7B0sOw68kOQU8Bjws1X1wloNLUl6uZFvWwSoqmPAsWX77h+6XcB7B38kSVPgJ0UlqQm
DLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpibGCnmRPkjNJ5pMcusy6H0lSSWYnN6IkaRwjg55kE/AgcDuwG9ifZPcK694C3As8MekhJUmjjfMM/UZgvqrOVtUF4Aiwb4V1vwx8EPjvCc4nSRrTOEHfCpwb2l4Y7Pt/SW4AtlfVo5e7oyQHkswlmVtcXLziYSVJq3vFL4omeR3wYeC+UWur6nBVzVbV7MzMzCs9tSRpyDhBPw9sH9reNth3yVuAtwN/k+QZ4CbgqC+MStL6GifoJ4BdSXYmuQa4Czh66WBVvVRVW6pqR1XtAB4H9lbV3JpMLEla0cigV9VF4CBwHDgNPFxVJ5M8kGTvWg8oSRrP5nEWVdUx4NiyffevsvaWVz6WJOlK+UlRSWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJamKsoCfZk+RMkvkkh1Y4/t4kp5I8neSvk7x18qNKki5nZNCTbAIeBG4HdgP7k+xetuwpYLaqvht4BPiVSQ8qSbq8cZ6h3wjMV9XZqroAHAH2DS+oqseq6iuDzceBbZMdU5I0yjhB3wqcG9peGOxbzT3AX6x0IMmBJHNJ5hYXF8efUpI00kRfFE1yNzALfGil41V1uKpmq2p2ZmZmkqeWpA1v8xhrzgPbh7a3DfZ9jSS3Ab8A/EBV/c9kxpMkjWucZ+gngF1Jdia5BrgLODq8IMn1wO8Ce6vq+cmPKUkaZWTQq+oicBA4DpwGHq6qk0keSLJ3sOxDwJuBP0nyuSRHV7k7SdIaGeeSC1V1DDi2bN/9Q7dvm/BckqQr5CdFJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqYmxgp5kT5IzSeaTHFrh+Ncl+dTg+BNJdkx6UEnS5Y0MepJNwIPA7cBuYH+S3cuW3QO8WFXfDvwa8MFJDypJurzNY6y5EZivqrMASY4A+4BTQ2v2Ae8f3H4E+K0kqaqa4Kyaoh2HHl3zczzzgTvX/ByvNWv99+7feS/jBH0rcG5oewF4x2prqupikpeAbwK+NLwoyQHgwGDzP5OcuZqhr9KW5fNsEFf0uDPF360mfG5/3mOY5s97wjbSz/utqx0YJ+gTU1WHgcPrec5LksxV1ew0zj1NPu6Nxce9sY3zouh5YPvQ9rbBvhXXJNkMXAu8MIkBJUnjGSfoJ4BdSXYmuQa4Czi6bM1R4CcGt38U+IzXzyVpfY285DK4Jn4QOA5sAj5WVSeTPADMVdVR4PeBTySZB77MUvRfbaZyqedVwMe9sfi4N7D4RFqSevCTopLUhEGXpCbaB33U1xZ0lGR7kseSnEpyMsm9055pPSXZlOSpJH8+7VnWU5LrkjyS5J+SnE7yvdOeaT0k+ZnBv/PPJ3koyRumPdO0tA76mF9b0NFF4L6q2g3cBLx7gzzuS+4FTk97iCn4DeAvq+o7ge9hA/wdJNkK/BQwW1VvZ+mNG6/GN2Wsi9ZBZ+hrC6rqAnDpawtaq6rnquqzg9v/wdJ/7K3TnWp9JNkG3Al8dNqzrKck1wLfz9I7zqiqC1X
1b9Odat1sBr5+8BmYNwJfnPI8U9M96Ct9bcGGCNslg2++vB54YrqTrJtfB34O+N9pD7LOdgKLwB8MLjd9NMmbpj3UWquq88CvAs8CzwEvVdVfTXeq6eke9A0tyZuBPwV+uqr+fdrzrLUkPww8X1VPTnuWKdgM3AB8pKquB/4LaP+aUZJvZOm37p3AtwJvSnL3dKeanu5BH+drC1pK8nqWYv7Jqvr0tOdZJzcDe5M8w9LltR9M8kfTHWndLAALVXXpN7FHWAp8d7cBX6iqxar6KvBp4PumPNPUdA/6OF9b0E6SsHQt9XRVfXja86yXqnpfVW2rqh0s/aw/U1Ub4tlaVf0rcC7Jdwx23crXfsV1V88CNyV54+Df/a1sgBeDV7Ou37a43lb72oIpj7UebgZ+HPjHJJ8b7Pv5qjo2xZm09t4DfHLw5OUs8M4pz7PmquqJJI8An2Xp3V1PsYG/BsCP/ktSE90vuUjShmHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUxP8BwjHuoBhu1y0AAAAASUVORK5CYII=\n",
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAMp0lEQVR4nO3cf6zdd13H8eeL1qoMgia7f2jbcRttMA2iI9cyJUHDZtJlpjVhJl0CYQbSmFCZQqKdmv1R/4Fhpv7RGJoxQxQsOPnj4qrVCP7hHyy9+xGgq43XOtdWDHeAYDRaGt7+0VNyvLvt/XY79572fZ+PZMn5fr+f3O/7bN0z336/95xUFZKkm9+rpj2AJGkyDLokNWHQJakJgy5JTRh0SWpi87ROfOutt9bs7Oy0Ti9JN6WnnnrqxaqaWenY1II+OzvLwsLCtE4vSTelJP96tWPecpGkJgy6JDVh0CWpCYMuSU0MCnqSPUnOJFlMcmiF4/cnWUry7Oif905+VEnStaz6Wy5JNgFHgJ8HzgMnk8xX1XPLln6qqg6uwYySpAGGXKHvBhar6mxVXQSOAfvWdixJ0vUaEvStwLmx7fOjfcu9I8kXkzyeZPtKPyjJgSQLSRaWlpZexriSpKuZ1EPRzwKzVfUm4G+Bj6+0qKqOVtVcVc3NzKz4QSdJ0ss05JOiF4DxK+5to33fVVVfG9t8FHj4lY+m5WYPPbHm53j+Q/es+TkkrY0hV+gngZ1JdiTZAuwH5scXJPmhsc29wOnJjShJGmLVK/SqupTkIHAC2AQ8VlWnkhwGFqpqHnh/kr3AJeDrwP1rOLMkaQWDvpyrqo4Dx5fte2js9YPAg5MdTZJ0PfykqCQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgYFPcmeJGeSLCY5dI1170hSSeYmN6IkaYhVg55kE3AEuBvYBdyXZNcK614LPAA8OekhJUmrG3KFvhtYrKqzVXUROAbsW2Hd7wIfBv5ngvNJkgYaEvStwLmx7fOjfd+V5M3A9qp6YoKzSZKuwyt+KJrkVcAjwAcHrD2QZCHJwtLS0is9tSRpzJCgXwC2j21vG+274rXAG4G/T/I8cAcwv9KD0ao6WlVzVTU3MzPz8qeWJL3EkKCfBHYm2ZFkC7AfmL9ysKq+WVW3VtVsVc0CXwD2VtXCmkwsSVrRqkGvqkvAQeAEcBr4dFWdSnI4yd61HlCSNMzmIYuq6jhwfNm+h66y9ude+ViSpOvlJ0UlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpiUFBT7InyZkki0kOrXD8V5J8KcmzSf4hya7JjypJupZVg55kE3AEuBvYBdy3QrA/WVU/XlU/CTwMPDLxSSVJ1zTkCn03sFhVZ6vqInAM2De+oKq+NbZ5C1CTG1GSNMTmAWu2AufGts8Db1m+KMn7gA8AW4C3r/SDkhwADgDcdttt1zurJOkaJvZQtKqOVNWPAL8J/M5V1hytqrmqmpuZmZnUqSVJDAv6BWD72Pa20b6rOQb84isZSpJ0/YYE/SSwM8mOJFuA/cD8+IIkO8c27wH+aXIjSpKGWPUeelVdSnIQOAFsAh6rqlNJDgMLVTUPHExyF/Bt4BvAu9dyaEnSSw15KEpVHQeOL9v30NjrByY8lyT
pOvlJUUlqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWpiUNCT7ElyJslikkMrHP9AkueSfDHJ3yV5/eRHlSRdy6pBT7IJOALcDewC7kuya9myZ4C5qnoT8Djw8KQHlSRd25Ar9N3AYlWdraqLwDFg3/iCqvp8Vf33aPMLwLbJjilJWs2QoG8Fzo1tnx/tu5r3AH+10oEkB5IsJFlYWloaPqUkaVUTfSia5J3AHPCRlY5X1dGqmququZmZmUmeWpI2vM0D1lwAto9tbxvt+3+S3AX8NvCzVfW/kxlPkjTUkCv0k8DOJDuSbAH2A/PjC5LcDnwU2FtVX538mJKk1awa9Kq6BBwETgCngU9X1akkh5PsHS37CPAa4M+TPJtk/io/TpK0RobccqGqjgPHl+17aOz1XROeS5J0nfykqCQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNWHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUhEGXpCYMuiQ1YdAlqQmDLklNGHRJasKgS1ITBl2SmjDoktSEQZekJgy6JDUxKOhJ9iQ5k2QxyaEVjr8tydNJLiW5d/JjSpJWs2rQk2wCjgB3A7uA+5LsWrbsBeB+4JOTHlCSNMzmAWt2A4tVdRYgyTFgH/DclQVV9fzo2HfWYEZJ0gBDbrlsBc6NbZ8f7btuSQ4kWUiysLS09HJ+hCTpKtb1oWhVHa2quaqam5mZWc9TS1J7Q4J+Adg+tr1ttE+SdAMZEvSTwM4kO5JsAfYD82s7liTpeq0a9Kq6BBwETgCngU9X1akkh5PsBUjyU0nOA78EfDTJqbUcWpL0UkN+y4WqOg4cX7bvobHXJ7l8K0aSNCV+UlSSmjDoktSEQZekJgy6JDUx6KGoJK2n2UNPrPk5nv/QPWt+jvVm0DWI/4NJNz5vuUhSEzflFbpXi5L0Ul6hS1ITBl2SmjDoktTETXkPXdLa81nVzceg66aw1nExLOrAWy6S1IRBl6QmvOUi3cC81aTrYdClVRhV3Sy85SJJTRh0SWrCoEtSE95Dv05+2ELSjcqgS9KYm/mizVsuktSEQZekJgy6JDVh0CWpCYMuSU0YdElqwqBLUhMGXZKaMOiS1IRBl6QmDLokNTEo6En2JDmTZDHJoRWOf2+ST42OP5lkdtKDSpKubdWgJ9kEHAHuBnYB9yXZtWzZe4BvVNWPAr8PfHjSg0qSrm3IFfpuYLGqzlbVReAYsG/Zmn3Ax0evHwfuTJLJjSlJWk2q6toLknuBPVX13tH2u4C3VNXBsTVfHq05P9r+59GaF5f9rAPAgdHmG4Azk3ojA9wKvLjqqn583xuL77u/11fVzEoH1vX70KvqKHB0Pc95RZKFqpqbxrmnyfe9sfi+N7Yht1wuANvHtreN9q24Jslm4HXA1yYxoCRpmCFBPwnsTLIjyRZgPzC/bM088O7R63uBz9Vq93IkSRO16i2XqrqU5CBwAtgEPFZVp5IcBhaqah74GPAnSRaBr3M5+jeaqdzquQH4vjcW3/cGtupDUUnSzcFPikpSEwZdkppoH/TVvragoyTbk3w+yXNJTiV5YNozrackm5I8k+Qvpz3LekryA0keT/KPSU4n+elpz7Qekvz66M/5l5P8WZLvm/ZM09I66AO/tqCjS8AHq2oXcAfwvg3yvq94ADg97SGm4A+Bv66qHwN+gg3w7yDJVuD9wFxVvZHLv7hxI/5SxrpoHXSGfW1BO1X1lap6evT6P7n8P/bW6U61PpJsA+4BHp32LOspyeuAt3H5N86oqot
V9R/TnWrdbAa+f/QZmFcD/zbleaame9C3AufGts+zQcJ2xeibL28HnpzuJOvmD4DfAL4z7UHW2Q5gCfjj0e2mR5PcMu2h1lpVXQB+D3gB+Arwzar6m+lONT3dg76hJXkN8BfAr1XVt6Y9z1pL8gvAV6vqqWnPMgWbgTcDf1RVtwP/BbR/ZpTkB7n8t+4dwA8DtyR553Snmp7uQR/ytQUtJfkeLsf8E1X1mWnPs07eCuxN8jyXb6+9PcmfTnekdXMeOF9VV/4m9jiXA9/dXcC/VNVSVX0b+AzwM1OeaWq6B33I1xa0M/rq4o8Bp6vqkWnPs16q6sGq2lZVs1z+b/25qtoQV2tV9e/AuSRvGO26E3huiiOtlxeAO5K8evTn/k42wMPgq1nXb1tcb1f72oIpj7Ue3gq8C/hSkmdH+36rqo5PcSatvV8FPjG6eDkL/PKU51lzVfVkkseBp7n8213PsIG/BsCP/ktSE91vuUjShmHQJakJgy5JTRh0SWrCoEtSEwZdkpow6JLUxP8B9uoCk0KMtNwAAAAASUVORK5CYII=\n",
       "text/plain": [
        "<Figure size 432x288 with 1 Axes>"
       ]
@@ -1720,6 +1795,37 @@
     "We see that the network correctly predicts this as a digit 2 with high probability. This concludes our tutorial on how to take a simple fully-connected BNN all the way down to hardware with FINN, and execute it remotely on a PYNQ board."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Throughput Test on PYNQ Board <a id='throughput'></a>\n",
+    "In addition to the functional verification, FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done using the core function `throughput_test`. In the next section we import the function and execute it.\n",
+    "First we extract the `remote_exec_model` again and pass it to the function. The function returns the metrics of the network as dictionary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Network metrics: \n",
+      "{'runtime[ms]': 3.5953521728515625, 'throughput[images/s]': 278136.8700265252}\n"
+     ]
+    }
+   ],
+   "source": [
+    "from finn.core.throughput_test import throughput_test\n",
+    "\n",
+    "child_model = ModelWrapper(getCustomOp(sdp_node).get_nodeattr(\"model\"))\n",
+    "res = throughput_test(child_model)\n",
+    "print(\"Network metrics: \\n\" + str(res))"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index c7db5b1d9d22ea89740f4c82633c96746a6fa5ee..958890f9e6a84d796ecb4a817dbf740c117ede0b 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -25,7 +25,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
+import warnings
 import os
 import xml.etree.ElementTree as ET
 
@@ -50,9 +50,16 @@ def hls_synth_res_estimation(model):
                 inst = registry.custom_op[op_type](node)
                 code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
                 if code_gen_dir == "":
-                    raise Exception(
-                        """Please run "CodeGen_ipgen" transformation and
-                            "HLSSynth_IPGen" first to generate the report files"""
+                    res_dict[node.name] = dict()
+                    res_dict[node.name]["BRAM_18K"] = 0
+                    res_dict[node.name]["FF"] = 0
+                    res_dict[node.name]["LUT"] = 0
+                    res_dict[node.name]["DSP48E"] = 0
+                    res_dict[node.name]["URAM"] = 0
+                    warnings.warn(
+                        """Could not find report files, values will be set to zero
+                        for this node. Please run "CodeGen_ipgen" transformation and
+                        "HLSSynth_IPGen" first to generate the report files"""
                     )
                 else:
                     xmlfile = "{}/project_{}/sol1/syn/report/{}_csynth.xml".format(
@@ -67,9 +74,16 @@ def hls_synth_res_estimation(model):
                             for child in item:
                                 res_dict[node.name][child.tag] = child.text
                     else:
-                        raise Exception(
-                            """Please run "HLSSynth_IPGen" first
-                                to generate the report files"""
+                        res_dict[node.name] = dict()
+                        res_dict[node.name]["BRAM_18K"] = 0
+                        res_dict[node.name]["FF"] = 0
+                        res_dict[node.name]["LUT"] = 0
+                        res_dict[node.name]["DSP48E"] = 0
+                        res_dict[node.name]["URAM"] = 0
+                        warnings.warn(
+                            """Could not find report files, values will be set to zero
+                            for this node. Please run "HLSSynth_IPGen" first
+                            to generate the report files"""
                         )
 
     return res_dict
diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
index eff9cea291b106d69e99055d5b6e2af448fb7517..5c330e29a52ab73343fbf7fd1858020b3d0cdd30 100644
--- a/src/finn/core/remote_exec.py
+++ b/src/finn/core/remote_exec.py
@@ -38,6 +38,7 @@ def remote_exec(model, execution_context):
     input values."""
     # TODO fix for multi input-output
     pynq_ip = model.get_metadata_prop("pynq_ip")
+    pynq_port = model.get_metadata_prop("pynq_port")
     pynq_username = model.get_metadata_prop("pynq_username")
     pynq_password = model.get_metadata_prop("pynq_password")
     pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
@@ -49,8 +50,9 @@ def remote_exec(model, execution_context):
     # extracting last folder of absolute path (deployment_dir)
     deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
     # copy input to PYNQ board
-    cmd = "sshpass -p {} scp -r {}/input.npy {}@{}:{}/{}".format(
+    cmd = "sshpass -p {} scp -P{} -r {}/input.npy {}@{}:{}/{}".format(
         pynq_password,
+        pynq_port,
         deployment_dir,
         pynq_username,
         pynq_ip,
@@ -60,13 +62,15 @@ def remote_exec(model, execution_context):
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
-
     cmd = (
-        "sshpass -p {} ssh {}@{} " '"cd {}/{}; echo "{}" | sudo -S python3.6 driver.py"'
+        "sshpass -p {} ssh {}@{} -p {} "
+        '"cd {}/{}; echo "{}" | '
+        'sudo -S python3.6 driver.py remote_pynq 1 resizer.bit input.npy output.npy"'
     ).format(
         pynq_password,
         pynq_username,
         pynq_ip,
+        pynq_port,
         pynq_target_dir,
         deployment_folder,
         pynq_password,
@@ -74,9 +78,9 @@ def remote_exec(model, execution_context):
     bash_command = ["/bin/bash", "-c", cmd]
     process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
     process_compile.communicate()
-
-    cmd = "sshpass -p {} scp {}@{}:{}/{}/output.npy {}".format(
+    cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/output.npy {}".format(
         pynq_password,
+        pynq_port,
         pynq_username,
         pynq_ip,
         pynq_target_dir,
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dda30b638255562e0dd1795ae5dc7586177308dc
--- /dev/null
+++ b/src/finn/core/throughput_test.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import subprocess
+
+
+def throughput_test(model):
+    """Runs the throughput test for the given model remotely on the pynq board.
+    The metadata properties related to the pynq board have to be set.
+    Returns a dictionary with results of the throughput test"""
+
+    pynq_ip = model.get_metadata_prop("pynq_ip")
+    pynq_port = model.get_metadata_prop("pynq_port")
+    pynq_username = model.get_metadata_prop("pynq_username")
+    pynq_password = model.get_metadata_prop("pynq_password")
+    pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
+    deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
+    # extracting last folder of absolute path (deployment_dir)
+    deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
+
+    cmd = (
+        "sshpass -p {} ssh {}@{} -p {} "
+        '"cd {}/{}; echo "{}" | '
+        "sudo -S python3.6 driver.py throughput_test 1000 "
+        'resizer.bit input.npy output.npy"'
+    ).format(
+        pynq_password,
+        pynq_username,
+        pynq_ip,
+        pynq_port,
+        pynq_target_dir,
+        deployment_folder,
+        pynq_password,
+    )
+    bash_command = ["/bin/bash", "-c", cmd]
+    process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+    process_compile.communicate()
+
+    cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format(
+        pynq_password,
+        pynq_port,
+        pynq_username,
+        pynq_ip,
+        pynq_target_dir,
+        deployment_folder,
+        deployment_dir,
+    )
+    bash_command = ["/bin/bash", "-c", cmd]
+    process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+    process_compile.communicate()
+
+    with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
+        res = eval(file.read())
+
+    return res
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 2500b1f03b917225d92b00de033299f20e3d9f5d..db787cf2b4bd01a8b3ee6fd4aac37fbfd4667cee 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -490,11 +490,11 @@ compilation transformations?
         """Returns folded output shape (according to neuron folding), if implemented."""
         raise Exception("get_folded_output_shape not implemented for this op")
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         """Returns input stream width, if implemented."""
         raise Exception("get_instream_width not implemented for this op")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         """Returns output stream width, if implemented."""
         raise Exception("get_outstream_width not implemented for this op")
 
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index e05b2dcea7e17231617f9d3880b778d1978b4ead..3c16e8dabeca6848daf595a6b12e14595a38581d 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -34,6 +34,7 @@ from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.custom_op.im2col import compute_conv_output_dim
 from onnx import TensorProto, helper
+from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator:
@@ -135,20 +136,23 @@ class ConvolutionInputGenerator(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         """Returns stream width, input and output stream width are equal for
         the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
         assert simd == ifm_ch, "SWG currently requires SIMD=IFM"
-        return simd * ibits
+        in_width = simd * ibits
+        if axi_strm_padding is True:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         """Returns stream width, input and output stream width are equal for
         the sliding window function, so the function to determine the input
         stream width can be reused."""
-        return self.get_instream_width()
+        return self.get_instream_width(axi_strm_padding)
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 6a4070528ee50d97e62881d00b57355d2a2baf2d..ce4f883fa029225a5748c08463858e3bf1bfd35c 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -32,6 +32,7 @@ import numpy as np
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
+from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 # does not do anything at the ONNX node-by-node level, and input-output
@@ -150,11 +151,17 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         folded_ishape = self.get_folded_input_shape()
         return np.prod(folded_ishape[:-1])
 
-    def get_instream_width(self):
-        return self.get_nodeattr("inWidth")
-
-    def get_outstream_width(self):
-        return self.get_nodeattr("outWidth")
+    def get_instream_width(self, axi_strm_padding=False):
+        in_width = self.get_nodeattr("inWidth")
+        if axi_strm_padding is True:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
+
+    def get_outstream_width(self, axi_strm_padding=False):
+        out_width = self.get_nodeattr("outWidth")
+        if axi_strm_padding is True:
+            out_width = roundup_to_integer_multiple(out_width, 8)
+        return out_width
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index eab3decc696cb86622bbdd8f22f015515ea936d5..015db8ccf005aa34f9e82ad31d386f9d0e91733e 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -282,19 +282,28 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         i_bits = self.get_input_datatype().bitwidth()
-        return i_bits * self.get_nodeattr("SIMD")
+        in_width = i_bits * self.get_nodeattr("SIMD")
+        if axi_strm_padding is True:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         o_bits = self.get_output_datatype().bitwidth()
-        return o_bits * self.get_nodeattr("PE")
+        out_width = o_bits * self.get_nodeattr("PE")
+        if axi_strm_padding is True:
+            out_width = roundup_to_integer_multiple(out_width, 8)
+        return out_width
 
-    def get_weightstream_width(self):
+    def get_weightstream_width(self, axi_strm_padding=False):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
         wp = self.get_weight_datatype().bitwidth()
-        return pe * simd * wp
+        w_width = pe * simd * wp
+        if axi_strm_padding is True:
+            w_width = roundup_to_integer_multiple(w_width, 8)
+        return w_width
 
     def get_ap_int_max_w(self):
         temp_value = super().get_ap_int_max_w()
@@ -979,7 +988,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             in_width = roundup_to_integer_multiple(self.get_instream_width(), 8)
             self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
             self.code_gen_dict["$OUT_RANGE$"] = [
-                "[{}:0]".format(self.get_outstream_width() - 1)
+                "[{}:0]".format(self.get_outstream_width(axi_strm_padding=True) - 1)
             ]
             # make weight stream width a multiple of 8 for AXI stream interface
             weight_width = roundup_to_integer_multiple(self.get_weightstream_width(), 8)
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fcb1fe43a3927a7d49b6e041727a54cc384942f
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -0,0 +1,293 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import os
+import numpy as np
+from shutil import copy
+import subprocess
+
+from pyverilator import PyVerilator
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.core.datatype import DataType
+from onnx import TensorProto, helper
+from finn.util.basic import roundup_to_integer_multiple
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+from . import templates
+
+
+class StreamingFIFO(HLSCustomOp):
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+        self.strm_fifo_wrapper = templates.strm_fifo_wrapper
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # FIFO depth
+            "depth": ("i", True, 0),
+            # folded shape of input/output
+            "folded_shape": ("ints", True, []),
+            # FINN DataTypes for inputs/outputs
+            "dataType": ("s", True, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+
+        return my_attrs
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == tuple(exp_ishape), "Unexpected input shape for StreamingFIFO."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # copy Q_srl.v from finn-rtllib to code gen directory
+        memstream_dir = "/workspace/finn/finn-rtllib/memstream/hdl/"
+        Q_file = os.path.join(memstream_dir, "Q_srl.v")
+        copy(Q_file, code_gen_dir)
+
+        # empty code gen dictionary for new entries
+        self.code_gen_dict.clear()
+        self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)]
+        self.code_gen_dict["$LAYER_NAME$"] = [
+            "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
+        ]
+        # make instream width a multiple of 8 for axi interface
+        in_width = self.get_instream_width(axi_strm_padding=True)
+        self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
+        self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)]
+        self.code_gen_dict["$WIDTH$"] = [str(in_width)]
+        self.code_gen_dict["$DEPTH$"] = [str(self.get_nodeattr("depth"))]
+
+        template = self.strm_fifo_wrapper
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        f = open(os.path.join(code_gen_dir, "{}.v".format(self.onnx_node.name)), "w",)
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+    def ipgen_singlenode_code(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # prepare the IP packaging tcl template
+        template = templates.ip_package_tcl
+        self.code_gen_dict.clear()
+        self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)]
+        self.code_gen_dict["$VERILOG_DIR$"] = [code_gen_dir]
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        f = open(os.path.join(code_gen_dir, "package_ip.tcl"), "w")
+        f.write(template)
+        f.close()
+        # create a shell script and call Vivado to invoke the IP pkg script
+        make_project_sh = code_gen_dir + "/make_ip.sh"
+        working_dir = os.environ["PWD"]
+        with open(make_project_sh, "w") as f:
+            f.write("#!/bin/bash \n")
+            f.write("cd {}\n".format(code_gen_dir))
+            f.write("vivado -mode batch -source package_ip.tcl\n")
+            f.write("cd {}\n".format(working_dir))
+        bash_command = ["bash", make_project_sh]
+        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+        process_compile.communicate()
+        # set ipgen_path and ip_path to point to the new packaged IP
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+        vlnv = "xilinx.com:hls:%s:1.0" % (self.onnx_node.name)
+        self.set_nodeattr("ip_vlnv", vlnv)
+        self.code_gen_dict.clear()
+
+    def get_normal_input_shape(self):
+        depth = self.get_nodeattr("depth")
+        assert (
+            depth >= 2
+        ), """Depth is too low. Please set node attribute "depth" to a value
+        between 2 and 256"""
+        assert (
+            depth <= 256
+        ), """Depth is too high. Please set node attribute "depth" to a value
+        between 2 and 256"""
+        folded_shape = self.get_nodeattr("folded_shape")
+        inner_dim = folded_shape[-1]
+        folding_factor = folded_shape[-2] * inner_dim
+        normal_ishape = []
+        for i in range(len(folded_shape) - 2):
+            normal_ishape.append(folded_shape[i])
+        normal_ishape.append(folding_factor)
+
+        return normal_ishape
+
+    def get_normal_output_shape(self):
+        return self.get_normal_input_shape()
+
+    def get_folded_input_shape(self):
+        return self.get_nodeattr("folded_shape")
+
+    def get_folded_output_shape(self):
+        return self.get_nodeattr("folded_shape")
+
+    def get_instream_width(self, axi_strm_padding=False):
+        dtype = DataType[self.get_nodeattr("dataType")]
+        folded_shape = self.get_nodeattr("folded_shape")
+        in_width = folded_shape[-1] * dtype.bitwidth()
+        if axi_strm_padding is True:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
+
+    def get_outstream_width(self, axi_strm_padding=False):
+        dtype = DataType[self.get_nodeattr("dataType")]
+        folded_shape = self.get_nodeattr("folded_shape")
+        in_width = folded_shape[-1] * dtype.bitwidth()
+        if axi_strm_padding is True:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        inp = context[node.input[0]]
+        exp_shape = self.get_normal_input_shape()
+
+        if mode == "npysim":
+            output = inp
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_shape)
+            context[node.output[0]] = output
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            # create a npy file for the input of the node
+            assert (
+                str(inp.dtype) == "float32"
+            ), """Input datatype is
+                not float32 as expected."""
+            expected_inp_shape = self.get_folded_input_shape()
+            reshaped_input = inp.reshape(expected_inp_shape)
+            if DataType[self.get_nodeattr("dataType")] == DataType.BIPOLAR:
+                # store bipolar activations as binary
+                reshaped_input = (reshaped_input + 1) / 2
+                export_idt = DataType.BINARY
+            else:
+                export_idt = DataType[self.get_nodeattr("dataType")]
+            # make copy before saving the array
+            reshaped_input = reshaped_input.copy()
+            np.save(
+                os.path.join(code_gen_dir, "input_0.npy"), reshaped_input,
+            )
+            verilog_file = os.path.join(
+                code_gen_dir, "{}.v".format(self.onnx_node.name)
+            )
+            if os.path.isfile(verilog_file):
+                nbits = self.get_instream_width(axi_strm_padding=True)
+                inp = npy_to_rtlsim_input(
+                    "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+                )
+                sim = PyVerilator.build(verilog_file, verilog_path=[code_gen_dir],)
+                super().reset_rtlsim(sim)
+                super().toggle_clk(sim)
+                output = self.rtlsim(sim, inp)
+                odt = DataType[self.get_nodeattr("dataType")]
+                target_bits = odt.bitwidth()
+                packed_bits = self.get_outstream_width(axi_strm_padding=True)
+                out_npy_path = "{}/output.npy".format(code_gen_dir)
+                out_shape = self.get_folded_output_shape()
+                rtlsim_output_to_npy(
+                    output, out_npy_path, odt, out_shape, packed_bits, target_bits
+                )
+
+                # load and reshape output
+                output = np.load(out_npy_path)
+                oshape = self.get_normal_output_shape()
+                output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+                context[node.output[0]] = output
+
+            else:
+                raise Exception(
+                    """Found no verilog files for this node,
+                    did you run the codegen_ipgen transformation?"""
+                )
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def get_number_input_values(self):
+        folded_ishape = self.get_folded_input_shape()
+        return np.prod(folded_ishape[:-1])
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index f370d417aa0ac1ce5d62af878575332941e2c1d0..ef1a5ee1bdc0bbe5c773aa375bf4402a8cb16ddb 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -33,6 +33,7 @@ from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.custom_op.im2col import compute_conv_output_dim
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
+from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -87,14 +88,17 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         dt_bits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
-        return int(dt_bits * ifm_ch)
+        in_width = int(dt_bits * ifm_ch)
+        if axi_strm_padding is True:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         """For streaming maxpool out stream with is the same as in stream width"""
-        return self.get_instream_width()
+        return self.get_instream_width(axi_strm_padding)
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index bfa90ebeda06e55ffaa9b8ea5b40369ed246ba86..6313bb79c21231c4be5b242558da5ac40fb2aa78 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -402,3 +402,43 @@ ipx::create_xgui_files [ipx::current_core]
 ipx::update_checksums [ipx::current_core]
 ipx::save_core [ipx::current_core]
 """
+
+strm_fifo_wrapper = """
+module $TOPNAME$(
+ap_clk,
+ap_rst_n,
+in0_V_V_TDATA,
+in0_V_V_TVALID,
+in0_V_V_TREADY,
+out_V_V_TDATA,
+out_V_V_TVALID,
+out_V_V_TREADY
+);
+
+input   ap_clk;
+input   ap_rst_n;
+input  $IN_RANGE$ in0_V_V_TDATA;
+input   in0_V_V_TVALID;
+output   in0_V_V_TREADY;
+output  $OUT_RANGE$ out_V_V_TDATA;
+output   out_V_V_TVALID;
+input   out_V_V_TREADY;
+
+Q_srl #(
+.depth($DEPTH$),
+.width($WIDTH$)
+)
+$LAYER_NAME$
+(
+ .clock(ap_clk),
+ .reset(!ap_rst_n),
+ .i_d(in0_V_V_TDATA),
+ .i_v(in0_V_V_TVALID),
+ .i_r(in0_V_V_TREADY),
+ .o_d(out_V_V_TDATA),
+ .o_v(out_V_V_TVALID),
+ .o_r(out_V_V_TREADY)
+);
+
+endmodule
+"""
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 4d4dee6506f04909c53cd05e4898a7ad77e4a83a..e5a5fed6c9d5d31fbf0082707879480e0c0a2dc7 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.basic import roundup_to_integer_multiple
 
 
 class TLastMarker(HLSCustomOp):
@@ -82,7 +83,7 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$DEFINES$"] = [
             "#define StreamWidth %d" % stream_width,
             "#define OutDType %s" % out_stream_dtype,
-            "#define NumIters %d" % self.get_nodeattr("NumIters"),
+            "#define NumItersPerImg %d" % self.get_nodeattr("NumIters"),
         ]
 
     def read_npy_data(self):
@@ -90,12 +91,23 @@ class TLastMarker(HLSCustomOp):
 
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            "for(int i=0; i<NumIters; i++) {",
-            "#pragma HLS PIPELINE II=1",
+            "unsigned int n = 1;",
             "OutDType t;",
-            "t.set_data(in0.read());",
             "t.set_keep(-1);",
-            "t.set_last(i==(NumIters-1));",
+            "io_section: { // start of cycle accurate region",
+            "#pragma HLS protocol fixed",
+            "// do a first read from stream before we decide on numIters",
+            "// giving software a chance to set up the numIters prior to startup",
+            "t.set_data(in0.read());",
+            "n = (numIters == 0 ? NumItersPerImg : numIters);",
+            "t.set_last(n==1);",
+            "out.write(t);",
+            "} // end of cycle accurate region",
+            "// do one less iteration than spec since we already did one",
+            "for(unsigned int i=1; i<n; i++) {",
+            "#pragma HLS PIPELINE II=1",
+            "t.set_data(in0.read());",
+            "t.set_last(i==(n-1));",
             "out.write(t);",
             "}",
         ]
@@ -109,13 +121,16 @@ class TLastMarker(HLSCustomOp):
     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             """void %s(hls::stream<ap_uint<StreamWidth> > &in0,
-                hls::stream<OutDType> &out)"""
+                hls::stream<OutDType> &out, unsigned int numIters)"""
             % self.onnx_node.name
         ]
 
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE s_axilite port=numIters bundle=control"
+        )
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
@@ -133,12 +148,16 @@ class TLastMarker(HLSCustomOp):
     def get_folded_output_shape(self):
         return self.get_folded_input_shape()
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         stream_width = self.get_nodeattr("StreamWidth")
+        if axi_strm_padding is True:
+            stream_width = roundup_to_integer_multiple(stream_width, 8)
         return stream_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         stream_width = self.get_nodeattr("StreamWidth")
+        if axi_strm_padding is True:
+            stream_width = roundup_to_integer_multiple(stream_width, 8)
         return stream_width
 
     def strm_decl(self):
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index c797affff9dbf1310c413db0847e0e2dae222a97..411311c2b9def953ee5ac6d03adfafb81704c177 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -33,6 +33,7 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
 )
 from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
+from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
 from finn.custom_op.im2col import Im2Col
 from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
 from finn.custom_op.multithreshold import MultiThreshold
@@ -56,6 +57,7 @@ custom_op["TLastMarker"] = TLastMarker
 custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
 custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
+custom_op["StreamingFIFO"] = StreamingFIFO
 
 
 def getCustomOp(node):
diff --git a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
index f482db793018933883a068bb16fd99ece671064b..ace8dfaf682d9d55637586fc55f621352ce5bc21 100644
--- a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
+++ b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
@@ -121,6 +121,11 @@ class CodeGen_ipstitch(Transformation):
                 connect_cmds.append(
                     "make_bd_intf_pins_external [get_bd_intf_pins %s/out_r]" % inst_name
                 )
+                # make AXI lite IF external
+                connect_cmds.append(
+                    "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi_control]"
+                    % inst_name
+                )
 
         # create a temporary folder for the project
         prjname = "finn_vivado_stitch_proj"
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7efb95c8df4fbe83c210f7a3f0832f3e2a3d18d
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -0,0 +1,149 @@
+from onnx import TensorProto
+from onnx import helper as oh
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+def _is_fifo_node(node):
+    if node.op_type == "StreamingFIFO":
+        return True
+    else:
+        return False
+
+
+def _suitable_node(node):
+    if node is not None:
+        if is_fpgadataflow_node(node) is True:
+            if _is_fifo_node(node) is False:
+                return True
+            else:
+                return False
+        else:
+            return False
+    else:
+        return False
+
+
+class InsertFIFO(Transformation):
+    """Ensure that the graph is terminated with a TLastMarker node, inserting
+    one if necessary."""
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        # default depth for FIFOs
+        default_depth = 2
+        graph = model.graph
+        node_ind = -1
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if _suitable_node(n):
+                n_output = n.output[0]
+                consumer = model.find_consumer(n_output)
+                if _suitable_node(consumer) is True:
+                    graph_modified = True
+                    n0 = getCustomOp(n)
+                    # determine fifo node attributes
+                    fld_shape = n0.get_folded_output_shape()
+                    dtype = n0.get_output_datatype()
+
+                    # create fifo node
+                    fifo_output_tensor = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        n0.get_normal_output_shape(),
+                    )
+                    graph.value_info.append(fifo_output_tensor)
+
+                    fifo_node = oh.make_node(
+                        "StreamingFIFO",
+                        [n_output],
+                        [fifo_output_tensor.name],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        depth=default_depth,
+                        folded_shape=fld_shape,
+                        dataType=str(dtype.name),
+                    )
+                    # insert fifo
+                    graph.node.insert(node_ind + 1, fifo_node)
+
+                    # set fifo output tensor as new input tensor of second node
+                    consumer.input[0] = fifo_output_tensor.name
+
+        if graph_modified is False:
+            # insert FIFO as first node
+            if graph.node[0].op_type != "StreamingFIFO":
+                n = graph.node[0]
+                n_input = n.input[0]
+                n0 = getCustomOp(n)
+                # determine fifo node attributes
+                fld_shape = n0.get_folded_input_shape()
+                dtype = n0.get_input_datatype()
+
+                # create fifo node
+                fifo_output_tensor = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    n0.get_normal_input_shape(),
+                )
+                graph.value_info.append(fifo_output_tensor)
+
+                fifo_node = oh.make_node(
+                    "StreamingFIFO",
+                    [n_input],
+                    [fifo_output_tensor.name],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    depth=default_depth,
+                    folded_shape=fld_shape,
+                    dataType=str(dtype.name),
+                )
+                # insert fifo
+                graph.node.insert(0, fifo_node)
+
+                # feed the FIFO's output into the (previously first) node
+                n.input[0] = fifo_output_tensor.name
+
+            # insert FIFO as last node
+            if graph.node[-1].op_type != "StreamingFIFO":
+                n = graph.node[-1]
+                assert (
+                    n.op_type != "TLastMarker"
+                ), """Insert tlast marker should be done
+                    after inserting the FIFOs"""
+                graph_out_name = graph.output[0].name
+                n0 = getCustomOp(n)
+                # determine fifo node attributes
+                fld_shape = n0.get_folded_output_shape()
+                dtype = n0.get_output_datatype()
+
+                # create fifo node
+                fifo_input_tensor = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    n0.get_normal_output_shape(),
+                )
+                graph.value_info.append(fifo_input_tensor)
+
+                fifo_node = oh.make_node(
+                    "StreamingFIFO",
+                    [fifo_input_tensor.name],
+                    [graph_out_name],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    depth=default_depth,
+                    folded_shape=fld_shape,
+                    dataType=str(dtype.name),
+                )
+                # insert fifo
+                graph.node.append(fifo_node)
+
+                # redirect the last node's output into the FIFO's input tensor
+                n.output[0] = fifo_input_tensor.name
+
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py
index d797773fe540e930267839c5926269a73736f354..28b6b2c34ac0e18ab4e4d92a82d03249f5126d0c 100644
--- a/src/finn/transformation/fpgadataflow/make_deployment.py
+++ b/src/finn/transformation/fpgadataflow/make_deployment.py
@@ -42,9 +42,10 @@ class DeployToPYNQ(Transformation):
     IP address of board, username and password for board and target directory where
     the files are stored on the board"""
 
-    def __init__(self, ip, username, password, target_dir):
+    def __init__(self, ip, port, username, password, target_dir):
         super().__init__()
         self.ip = ip
+        self.port = port
         self.username = username
         self.password = password
         self.target_dir = target_dir
@@ -52,6 +53,7 @@ class DeployToPYNQ(Transformation):
     def apply(self, model):
         # set metadata properties accordingly to user input specifications
         model.set_metadata_prop("pynq_ip", self.ip)
+        model.set_metadata_prop("pynq_port", self.port)
         model.set_metadata_prop("pynq_username", self.username)
         model.set_metadata_prop("pynq_password", self.password)
         model.set_metadata_prop("pynq_target_dir", self.target_dir)
@@ -76,18 +78,21 @@ class DeployToPYNQ(Transformation):
         copy_tree(pynq_driver_dir, deployment_dir)
         model.set_metadata_prop("pynq_deploy_dir", deployment_dir)
         model.set_metadata_prop("exec_mode", "remote_pynq")
-
         # create target directory on PYNQ board
-        cmd = 'sshpass -p {} ssh {}@{} "mkdir -p {}"'.format(
-            self.password, self.username, self.ip, self.target_dir
+        cmd = 'sshpass -p {} ssh {}@{} -p {} "mkdir -p {}"'.format(
+            self.password, self.username, self.ip, self.port, self.target_dir
         )
         bash_command = ["/bin/bash", "-c", cmd]
         process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
         process_compile.communicate()
-
         # copy directory to PYNQ board using scp and sshpass
-        cmd = "sshpass -p {} scp -r {} {}@{}:{}".format(
-            self.password, deployment_dir, self.username, self.ip, self.target_dir
+        cmd = "sshpass -p {} scp -P{} -r {} {}@{}:{}".format(
+            self.password,
+            self.port,
+            deployment_dir,
+            self.username,
+            self.ip,
+            self.target_dir,
         )
         bash_command = ["/bin/bash", "-c", cmd]
         process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 0bde02fa2c330748a718f6debf931b7d83ac7814..c5b8d35dba1069ac749e0a0d92060c8216ada507 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -87,14 +87,25 @@ class MakePYNQDriver(Transformation):
         # fill in the driver template
         driver_py = pynq_driver_dir + "/driver.py"
         driver = templates.pynq_driver_template
+
+        def mss(x, batch_var_name="N"):
+            # "make shape string"
+            # for a shape like (1, ...) emit a string (N, ...)
+            # where N is the default value for batch_var_name
+            # this lets the driver work with a batch of samples at once
+            ret = str(x)
+            ret = ret.replace("(1,", "(%s," % batch_var_name)
+            ret = ret.replace("[1,", "[%s," % batch_var_name)
+            return ret
+
         driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
-        driver = driver.replace("$INPUT_SHAPE_NORMAL$", str(i_tensor_shape_normal))
-        driver = driver.replace("$INPUT_SHAPE_FOLDED$", str(i_tensor_shape_folded))
-        driver = driver.replace("$INPUT_SHAPE_PACKED$", str(i_tensor_shape_packed))
+        driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal))
+        driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded))
+        driver = driver.replace("$INPUT_SHAPE_PACKED$", mss(i_tensor_shape_packed))
         driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt))
-        driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", str(o_tensor_shape_normal))
-        driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", str(o_tensor_shape_folded))
-        driver = driver.replace("$OUTPUT_SHAPE_PACKED$", str(o_tensor_shape_packed))
+        driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
+        driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
+        driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
 
         with open(driver_py, "w") as f:
             f.write(driver)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
index 9921ce7caf2aaffd197f9bc863ab77502a963647..1e479f24d9fc192f5a95a6cdbfc122e581f7a136 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
@@ -108,6 +108,7 @@ class MakePYNQProject(Transformation):
         out_if_name = "out_r_0"
         clk_name = "ap_clk_0"
         nrst_name = "ap_rst_n_0"
+        axi_lite_if_name = "s_axi_control_0"
         vivado_ip_cache = os.getenv("VIVADO_IP_CACHE", default="")
 
         # create a temporary folder for the project
@@ -129,6 +130,7 @@ class MakePYNQProject(Transformation):
             out_if_name,
             clk_name,
             nrst_name,
+            axi_lite_if_name,
             vivado_ip_cache,
         )
 
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 81cb954bb4503c8daf18bad5881661018e9d17b7..6ff4a4f0b484530ae0785f74b7c5e83da76c81fe 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -35,6 +35,7 @@ variable config_ip_bytes_out
 variable config_ip_axis_name_in
 variable config_ip_axis_name_out
 variable config_ip_use_axilite
+variable config_ip_axilite_name
 variable config_ip_project_dir
 variable config_output_products_dir
 variable config_remote_cache
@@ -67,7 +68,9 @@ set config_ip_clk_name %s
 # the name of the active-low reset signal
 set config_ip_nrst_name %s
 # whether the IP needs an AXI Lite interface for control
-set config_ip_use_axilite 0
+set config_ip_use_axilite 1
+# name of AXI Lite interface
+set config_ip_axilite_name %s
 # Vivado OOC IP cache
 set config_remote_cache "%s"
 """
@@ -82,64 +85,154 @@ cd %s
 """
 
 pynq_driver_template = """
+import argparse
+
 from pynq import Overlay
 import numpy as np
 from pynq import allocate
+import time
 from finn.util.data_packing import (
     finnpy_to_packed_bytearray,
     packed_bytearray_to_finnpy
 )
 from finn.core.datatype import DataType
 
-bitfile_path = "resizer.bit"
-ol = Overlay(bitfile_path)
-dma=ol.axi_dma_0
-
-# declare input/output types and shapes for the accelerator
-# input FINN DataType
-idt = $INPUT_FINN_DATATYPE$
-# normal, folded and packed input shapes
-ishape_normal = $INPUT_SHAPE_NORMAL$
-ishape_folded = $INPUT_SHAPE_FOLDED$
-ishape_packed = $INPUT_SHAPE_PACKED$
-# output FINN DataType
-odt = $OUTPUT_FINN_DATATYPE$
-# normal, folded and packed output shapes
-oshape_normal = $OUTPUT_SHAPE_NORMAL$
-oshape_folded = $OUTPUT_SHAPE_FOLDED$
-oshape_packed = $OUTPUT_SHAPE_PACKED$
-
-# load desired input .npy file
-ibuf_normal = np.load("input.npy")
-# ensure that shape is as expected
-assert ibuf_normal.shape == ishape_normal
-# convert to folded form
-ibuf_folded = ibuf_normal.reshape(ishape_folded)
-
-# pack the input buffer, reversing both SIMD dim and endianness
-ibuf_packed = finnpy_to_packed_bytearray(
-    ibuf_folded, idt, reverse_endian=True, reverse_inner=True
-)
-# allocate a PYNQ buffer for the packed input buffer
-ibuf_packed_device = allocate(shape=ishape_packed, dtype=np.uint8)
-# copy the packed data into the PYNQ buffer
-# TODO optimization: pack directly into the PYNQ buffer?
-np.copyto(ibuf_packed_device, ibuf_packed)
-
-# allocate a PYNQ buffer for the returned packed output buffer
-obuf_packed = allocate(shape=oshape_packed, dtype=np.uint8)
-
-# set up the DMA and wait until all transfers complete
-dma.sendchannel.transfer(ibuf_packed_device)
-dma.recvchannel.transfer(obuf_packed)
-dma.sendchannel.wait()
-dma.recvchannel.wait()
-
-# unpack the packed output buffer from accelerator
-obuf_folded = packed_bytearray_to_finnpy(
-    obuf_packed, odt, oshape_folded, reverse_endian=True, reverse_inner=True
-)
-# convert to normal reshape and save
-obuf_normal = obuf_folded.reshape(oshape_normal)
-np.save("output.npy", obuf_normal)
+class RemoteTest():
+    def __init__(
+        self,
+        exec_mode,
+        N,
+        bitfile="resizer.bit",
+        inputfile="input.npy",
+        outputfile="output.npy"):
+
+        self.exec_mode = exec_mode
+        self.N = N
+        self.inputfile = inputfile
+        self.outputfile = outputfile
+        self.ol = Overlay(bitfile)
+        self.dma = self.ol.axi_dma_0
+        self.ctrl_regs = self.ol.resize_accel_0
+        self.ishape_packed = $INPUT_SHAPE_PACKED$
+        self.oshape_packed = $OUTPUT_SHAPE_PACKED$
+        # AXI lite register offset for number of iterations
+        # used by TLastMarker to signal end of transmission for AXI CDMA
+        self.REG_OFFSET_NUM_ITERS = 0x10
+
+    def load_input(self):
+        N = self.N
+        ishape_normal = $INPUT_SHAPE_NORMAL$
+        # load desired input .npy file
+        ibuf_normal = np.load(self.inputfile)
+        # ensure that shape is as expected
+        assert ibuf_normal.shape == ishape_normal
+        return ibuf_normal
+
+    def pack_input(self, ibuf_normal):
+        N = self.N
+        # input FINN DataType
+        idt = $INPUT_FINN_DATATYPE$
+        ishape_folded = $INPUT_SHAPE_FOLDED$
+        # convert to folded form
+        ibuf_folded = ibuf_normal.reshape(ishape_folded)
+        # pack the input buffer, reversing both SIMD dim and endianness
+        ibuf_packed = finnpy_to_packed_bytearray(
+            ibuf_folded, idt, reverse_endian=True, reverse_inner=True
+        )
+        return ibuf_packed
+
+    def unpack_output(self, obuf_packed):
+        N = self.N
+        # output FINN DataType
+        odt = $OUTPUT_FINN_DATATYPE$
+        oshape_folded = $OUTPUT_SHAPE_FOLDED$
+        # unpack the packed output buffer from accelerator
+        obuf_folded = packed_bytearray_to_finnpy(
+            obuf_packed, odt, oshape_folded, reverse_endian=True, reverse_inner=True
+        )
+        return obuf_folded
+
+    def save_output(self, obuf_folded):
+        N = self.N
+        # convert to normal reshape and save
+        oshape_normal = $OUTPUT_SHAPE_NORMAL$
+        obuf_normal = obuf_folded.reshape(oshape_normal)
+        np.save(self.outputfile, obuf_normal)
+
+    def allocate_pynqbuffer(self, shape, data=None):
+        buf_device = allocate(shape=shape, dtype=np.uint8)
+
+        # if necessary copy the packed data into the PYNQ buffer
+        # TODO optimization: pack directly into the PYNQ buffer?
+        if data is not None:
+            np.copyto(buf_device, data)
+
+        return buf_device
+
+
+    def run_nw(self):
+        exec_mode = self.exec_mode
+        if exec_mode == "remote_pynq":
+            ibuf_normal = self.load_input()
+            ibuf_packed = self.pack_input(ibuf_normal)
+        elif exec_mode != "throughput_test":
+            raise Exception("Exec mode has to be set to remote_pynq or throughput_test")
+
+        # set up TLastMarker with correct num. samples
+        self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, self.N)
+
+        # allocate a PYNQ buffer for the packed input buffer
+        if exec_mode == "remote_pynq":
+            ibuf_packed_device = self.allocate_pynqbuffer(self.ishape_packed, ibuf_packed)
+        else:
+            ibuf_packed_device = self.allocate_pynqbuffer(self.ishape_packed)
+
+        # allocate a PYNQ buffer for the returned packed output buffer
+        obuf_packed = self.allocate_pynqbuffer(self.oshape_packed)
+
+        if exec_mode == "throughput_test":
+            # measure runtime of network
+            start = time.time()
+            res={}
+
+        # set up the DMA and wait until all transfers complete
+        dma = self.dma
+        dma.sendchannel.transfer(ibuf_packed_device)
+        dma.recvchannel.transfer(obuf_packed)
+        dma.sendchannel.wait()
+        dma.recvchannel.wait()
+
+
+        if exec_mode == "throughput_test":
+            end = time.time()
+            runtime = end - start
+            res["runtime[ms]"] = runtime*1000
+            res["throughput[images/s]"] = self.N / runtime
+            res["DRAM_in_bandwidth[Mb/s]"] = np.prod(self.ishape_packed)*0.000001 / runtime
+            res["DRAM_out_bandwidth[Mb/s]"] = np.prod(self.oshape_packed)*0.000001 / runtime
+            file = open("nw_metrics.txt", "w")
+            file.write(str(res))
+            file.close()
+        else:
+            obuf_folded = self.unpack_output(obuf_packed)
+            self.save_output(obuf_folded)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Set exec mode, batchsize N, bitfile name, inputfile name and outputfile name')
+    parser.add_argument('exec_mode', help='Please select functional verification ("remote_pynq") or throughput test ("throughput_test")')
+    parser.add_argument('N', help='number of samples for inference', type=int)
+    parser.add_argument('bitfile', nargs='?', default="resizer.bit")
+    parser.add_argument('inputfile', nargs='?', default="input.npy")
+    parser.add_argument('outputfile', nargs='?', default="output.npy")
+    args = parser.parse_args()
+    exec_mode = args.exec_mode
+    N = args.N
+    bitfile = args.bitfile
+    inputfile = args.inputfile
+    outputfile = args.outputfile
+
+    Test = RemoteTest(exec_mode, N, bitfile, inputfile, outputfile)
+    Test.run_nw()
+
 """
diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py
index 53f34d4d772a458eed3d417cdeb8a962338b099c..d0571aec42fe2afc55c21758433a6f16f8e77fa4 100644
--- a/tests/end2end/test_end2end_cnv_w1a1.py
+++ b/tests/end2end/test_end2end_cnv_w1a1.py
@@ -288,8 +288,9 @@ def test_end2end_cnv_w1a1_deploy_on_pynq():
             pytest.skip("PYNQ board IP address not specified")
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         # save the model to be able to link it to the parent
         model.save(build_dir + "/end2end_cnv_w1a1_pynq_deploy.onnx")
     except KeyError:
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
similarity index 91%
rename from tests/end2end/test_end2end_tfc_w1a1.py
rename to tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
index 8a670fce2e7e6585c98efa9e4a6e27a660edf925..989379d80aec70a5a0a53339184aeaf5dbc3b38f 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
@@ -42,6 +42,7 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.streamline.absorb as absorb
 from finn.core.modelwrapper import ModelWrapper
 from finn.core.onnx_exec import execute_onnx
+from finn.core.throughput_test import throughput_test
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
 from finn.transformation.fold_constants import FoldConstants
@@ -55,6 +56,7 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
@@ -133,24 +135,41 @@ def test_end2end_tfc_w1a1_fold_and_tlastmarker():
     fc1w = getCustomOp(fc_layers[1])
     fc2w = getCustomOp(fc_layers[2])
     fc3w = getCustomOp(fc_layers[3])
-    fc0w.set_nodeattr("inFIFODepth", 50)
-    fc0w.set_nodeattr("SIMD", 16)
+    fc0w.set_nodeattr("inFIFODepth", 256)
+    fc0w.set_nodeattr("SIMD", 196)
     fc0w.set_nodeattr("PE", 16)
-    fc0w.set_nodeattr("outFIFODepth", 4)
+    fc0w.set_nodeattr("outFIFODepth", 64)
     fc0w.set_nodeattr("ram_style", "block")
-    fc1w.set_nodeattr("SIMD", 8)
-    fc1w.set_nodeattr("PE", 8)
-    fc1w.set_nodeattr("outFIFODepth", 4)
+    fc1w.set_nodeattr("SIMD", 16)
+    fc1w.set_nodeattr("PE", 16)
+    fc1w.set_nodeattr("outFIFODepth", 64)
     fc2w.set_nodeattr("SIMD", 16)
     fc2w.set_nodeattr("PE", 16)
-    fc2w.set_nodeattr("outFIFODepth", 4)
+    fc2w.set_nodeattr("outFIFODepth", 64)
     fc3w.set_nodeattr("SIMD", 16)
     fc3w.set_nodeattr("PE", 10)
     fc3w.set_nodeattr("outFIFODepth", 50)
     fc3w.set_nodeattr("ram_style", "distributed")
+    fc3w.set_nodeattr("outFIFODepth", 10)
+    fc3w.set_nodeattr("ram_style", "distributed")
     model = model.transform(InsertDWC())
+    model = model.transform(InsertFIFO())
     model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
+    fifos = []
+    for n in model.graph.node:
+        if n.op_type == "StreamingFIFO":
+            fifos.append(n)
+    fifo0 = getCustomOp(fifos[0])
+    fifo1 = getCustomOp(fifos[1])
+    fifo2 = getCustomOp(fifos[2])
+    fifo3 = getCustomOp(fifos[3])
+    fifo4 = getCustomOp(fifos[4])
+    fifo0.set_nodeattr("depth", 256)
+    fifo1.set_nodeattr("depth", 64)
+    fifo2.set_nodeattr("depth", 64)
+    fifo3.set_nodeattr("depth", 64)
+    fifo4.set_nodeattr("depth", 10)
     model = model.transform(AnnotateResources("estimate"))
     model.save(build_dir + "/end2end_tfc_w1a1_folded.onnx")
 
@@ -199,7 +218,7 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
     res_rtlsim_whole = ret_rtlsim_whole[out_name]
     assert np.isclose(res_npysim, res_rtlsim_nodebynode).all()
-    assert np.isclose(res_npysim, res_rtlsim_whole).all()
+    assert np.isclose(res_rtlsim_nodebynode, res_rtlsim_whole).all()
 
 
 def test_end2end_tfc_w1a1_verify_all():
@@ -268,8 +287,9 @@ def test_end2end_tfc_w1a1_deploy_on_pynq():
             pytest.skip("PYNQ board IP address not specified")
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         # save the model to be able to link it to the parent
         model.save(build_dir + "/end2end_tfc_w1a1_pynq_deploy.onnx")
     except KeyError:
@@ -304,6 +324,9 @@ def test_end2end_tfc_w1a1_run_on_pynq():
         ret = execute_onnx(parent_model, {iname: x}, True)
         y = ret[oname]
         assert np.isclose(y, y_golden).all()
+        child_model = ModelWrapper(sdp_node.get_nodeattr("model"))
+        res = throughput_test(child_model)
+        assert res is not None
 
     except KeyError:
         pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/end2end/test_end2end_tfc_w1a2.py b/tests/end2end/test_end2end_tfc_w1a2.py
index b55d985e07ac40fc875c49ba201c9552fd62c411..f3dd8382609e88658fbd0a86105dd068e7e03ce6 100644
--- a/tests/end2end/test_end2end_tfc_w1a2.py
+++ b/tests/end2end/test_end2end_tfc_w1a2.py
@@ -257,8 +257,9 @@ def test_end2end_tfc_w1a2_deploy_on_pynq():
             pytest.skip("PYNQ board IP address not specified")
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         # save the model to be able to link it to the parent
         model.save(build_dir + "/end2end_tfc_w1a2_pynq_deploy.onnx")
     except KeyError:
diff --git a/tests/end2end/test_end2end_tfc_w2a2.py b/tests/end2end/test_end2end_tfc_w2a2.py
index 92b8b18bc0253a07eec988c2bace9a9178682147..3aa1fd031a52f4ae48fff707b0dcf6cdbda5486a 100644
--- a/tests/end2end/test_end2end_tfc_w2a2.py
+++ b/tests/end2end/test_end2end_tfc_w2a2.py
@@ -257,8 +257,9 @@ def test_end2end_tfc_w2a2_deploy_on_pynq():
             pytest.skip("PYNQ board IP address not specified")
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         # save the model to be able to link it to the parent
         model.save(build_dir + "/end2end_tfc_w2a2_pynq_deploy.onnx")
     except KeyError:
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef456142ddf214bb3599235549215b6168377517
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -0,0 +1,108 @@
+import pytest
+import os
+
+from onnx import TensorProto, helper
+
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
+from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch
+
+from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
+
+# from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+
+# from finn.util.basic import gen_finn_dt_tensor
+
+# import finn.core.onnx_exec as oxe
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+from finn.util.basic import pynq_part_map
+
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 5
+
+
+def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype):
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape)
+
+    FIFO_node = helper.make_node(
+        "StreamingFIFO",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        depth=Depth,
+        folded_shape=fld_shape,
+        dataType=str(finn_dtype.name),
+    )
+
+    graph = helper.make_graph(
+        nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="fifo-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", finn_dtype)
+    model.set_tensor_datatype("outp", finn_dtype)
+
+    return model
+
+
+def prepare_inputs(input_tensor, dt):
+    return {"inp": input_tensor}
+
+
+# shape
+@pytest.mark.parametrize("Shape", [[1, 128]])
+# inWidth
+@pytest.mark.parametrize("folded_shape", [[1, 1, 128]])
+# outWidth
+@pytest.mark.parametrize("depth", [256])
+# finn_dtype
+@pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR])  # , DataType.INT2])
+def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):
+
+    # generate input data
+    # x = gen_finn_dt_tensor(finn_dtype, Shape)
+    #    input_dict = prepare_inputs(x, finn_dtype)
+
+    model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype)
+
+    # model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(InsertTLastMarker())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(CodeGen_ipgen(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynth_IPGen())
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(CodeGen_ipstitch(test_fpga_part))
+    model = model.transform(MakePYNQProject(test_pynq_board))
+    model = model.transform(SynthPYNQProject())
+    model = model.transform(MakePYNQDriver())
+    ip = os.environ["PYNQ_IP"]
+    username = os.getenv("PYNQ_USERNAME", "xilinx")
+    password = os.getenv("PYNQ_PASSWORD", "xilinx")
+    port = os.getenv("PYNQ_PORT", 22)
+    target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+    model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+
+    # y = oxe.execute_onnx(model, input_dict)["outp"]
+
+    # assert (
+    #    y == x
+    # ).all(), """The output values are not the same as the
+    #    input values anymore."""
+    # assert y.shape == tuple(Shape), """The output shape is incorrect."""
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
index 1c5ae02e4c662f48be4f7f70b9de24a1f9f72ecf..89b01e4f2c9b80bbed6e0285be8756c65f11cb7c 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
@@ -295,8 +295,9 @@ def test_fpgadataflow_ipstitch_pynq_deployment_folder():
         )
         username = os.getenv("PYNQ_USERNAME", "xilinx")
         password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
         target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-        model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
         pynq_ip = model.get_metadata_prop("pynq_ip")
         pynq_username = model.get_metadata_prop("pynq_username")
         pynq_password = model.get_metadata_prop("pynq_password")