diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index 80b015f6d4eb69df36831b25262cda3539ac8ae9..6c619c51ceb4a99a077fc61c52ce81763cfd27f5 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -193,7 +193,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
       if (shift_en_) begin
 	 // synthesis loop_limit 256
 	 for (a_=depth-2; a_>0; a_=a_-1) begin
-	    srl[a_] <= srl[a_-1];
+	    srl[a_] = srl[a_-1];
 	 end
 	 srl[0] <= i_d;
       end
diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/tfc_end2end_example.ipynb
index 64a87571afcc52aa081344f9579f4ba74111a1a9..5d30e830842c2549c1d8b197e3cdc7939b01b9ec 100644
--- a/notebooks/end2end_example/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/tfc_end2end_example.ipynb
@@ -42,7 +42,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -85,9 +85,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/brevitas_cnv_lfc/training_scripts/models/TFC.py:85: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
+      "  x = 2.0 * x - torch.tensor([1.0]).to(self.device)\n"
+     ]
+    }
+   ],
    "source": [
     "import onnx\n",
     "from finn.util.test import get_test_model_trained\n",
@@ -107,15 +116,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
-      "Stopping http://0.0.0.0:8081\n",
       "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
      ]
     },
@@ -133,10 +140,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f186ccfbe10>"
+       "<IPython.lib.display.IFrame at 0x7fcf2b7bf828>"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -154,7 +161,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -239,7 +246,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -266,7 +273,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -292,10 +299,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f186e386240>"
+       "<IPython.lib.display.IFrame at 0x7fcf3e4739e8>"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -316,7 +323,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -329,11 +336,14 @@
       "    def apply(self, model):\n",
       "        streamline_transformations = [\n",
       "            ConvertSubToAdd(),\n",
+      "            ConvertDivToMul(),\n",
       "            BatchNormToAffine(),\n",
       "            ConvertSignToThres(),\n",
       "            MoveAddPastMul(),\n",
       "            MoveScalarAddPastMatMul(),\n",
+      "            MoveScalarAddPastConv(),\n",
       "            MoveScalarMulPastMatMul(),\n",
+      "            MoveScalarMulPastConv(),\n",
       "            MoveAddPastMul(),\n",
       "            CollapseRepeatedAdd(),\n",
       "            CollapseRepeatedMul(),\n",
@@ -341,6 +351,7 @@
       "            FactorOutMulSignMagnitude(),\n",
       "            AbsorbMulIntoMultiThreshold(),\n",
       "            Absorb1BitMulIntoMatMul(),\n",
+      "            Absorb1BitMulIntoConv(),\n",
       "            RoundAndClipThresholds(),\n",
       "        ]\n",
       "        for trn in streamline_transformations:\n",
@@ -369,7 +380,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -395,10 +406,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f186cd470b8>"
+       "<IPython.lib.display.IFrame at 0x7fcec56f1be0>"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -423,7 +434,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -449,10 +460,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f17f04bbc18>"
+       "<IPython.lib.display.IFrame at 0x7fcf3ef258d0>"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -490,7 +501,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 10,
    "metadata": {
     "scrolled": false
    },
@@ -518,10 +529,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f1868061eb8>"
+       "<IPython.lib.display.IFrame at 0x7fcf2a6540f0>"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -552,7 +563,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -578,10 +589,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f186cc55e48>"
+       "<IPython.lib.display.IFrame at 0x7fcf2a654080>"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -604,7 +615,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -613,7 +624,7 @@
      "text": [
       "\n",
       "Stopping http://0.0.0.0:8081\n",
-      "Serving '/tmp/finn_maltanar/dataflow_partition_h1c4i5gn/df_model.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/tmp/finn_dev_jakobap/dataflow_partition_dkjtsnwj/df_model.onnx' at http://0.0.0.0:8081\n"
      ]
     },
     {
@@ -630,10 +641,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f17f04c70f0>"
+       "<IPython.lib.display.IFrame at 0x7fcec56f1978>"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -654,7 +665,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -674,7 +685,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -708,7 +719,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -732,19 +743,28 @@
        " 'outputDataType': ('s', True, ''),\n",
        " 'binaryXnorMode': ('i', False, 0),\n",
        " 'noActivation': ('i', False, 0),\n",
-       " 'inFIFODepth': ('i', False, 0),\n",
-       " 'outFIFODepth': ('i', False, 0),\n",
+       " 'numInputVectors': ('ints', False, [1]),\n",
+       " 'mem_mode': ('s', False, 'const'),\n",
+       " 'ram_style': ('s', False, 'auto'),\n",
        " 'backend': ('s', True, 'fpgadataflow'),\n",
        " 'code_gen_dir_npysim': ('s', False, ''),\n",
        " 'code_gen_dir_ipgen': ('s', False, ''),\n",
        " 'executable_path': ('s', False, ''),\n",
        " 'ipgen_path': ('s', False, ''),\n",
+       " 'ip_path': ('s', False, ''),\n",
+       " 'ip_vlnv': ('s', False, ''),\n",
        " 'exec_mode': ('s', False, ''),\n",
        " 'sim_cycles': ('i', False, 0),\n",
-       " 'rtlsim_trace': ('s', False, '')}"
+       " 'rtlsim_trace': ('s', False, ''),\n",
+       " 'res_estimate': ('s', False, ''),\n",
+       " 'res_hls': ('s', False, ''),\n",
+       " 'res_synth': ('s', False, ''),\n",
+       " 'rtlsim_so': ('s', False, ''),\n",
+       " 'inFIFODepth': ('i', False, 2),\n",
+       " 'outFIFODepth': ('i', False, 2)}"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -770,7 +790,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -782,29 +802,49 @@
     "fc0w.set_nodeattr(\"PE\", 16)\n",
     "fc0w.set_nodeattr(\"outFIFODepth\", 4)\n",
     "\n",
+    "fc1w.set_nodeattr(\"inFIFODepth\", 4)\n",
     "fc1w.set_nodeattr(\"SIMD\", 16)\n",
     "fc1w.set_nodeattr(\"PE\", 16)\n",
     "fc1w.set_nodeattr(\"outFIFODepth\", 4)\n",
     "\n",
+    "fc2w.set_nodeattr(\"inFIFODepth\", 4)\n",
     "fc2w.set_nodeattr(\"SIMD\", 16)\n",
     "fc2w.set_nodeattr(\"PE\", 16)\n",
     "fc2w.set_nodeattr(\"outFIFODepth\", 4)\n",
     "\n",
+    "fc3w.set_nodeattr(\"inFIFODepth\", 4)\n",
     "fc3w.set_nodeattr(\"SIMD\", 16)\n",
     "fc3w.set_nodeattr(\"PE\", 10)\n",
-    "fc3w.set_nodeattr(\"outFIFODepth\", 50)"
+    "fc3w.set_nodeattr(\"outFIFODepth\", 50)\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Finally, we will run the `InsertTLastMarker` transformation to get a `TLastMarker` node at the output of this graph, which is necessary to run the DMA engines correctly. "
+    "After setting the FIFO node attributes, we can insert FIFO nodes inbetween the fpgadataflow nodes and in the beginning and end of the graph. This can be done using the transformation `InsertFIFO`."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO\n",
+    "model = model.transform(InsertFIFO())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we will run the `InsertTLastMarker` transformation to get a `TLastMarker` node at the output of this graph, which is necessary to run the DMA engines correctly. Using netron we can observe that now the nodes contain the set folding, inbetween the nodes are FIFOs inserted and the last node is the `TLastMarker` node we insert in the following."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -830,10 +870,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f1868061d30>"
+       "<IPython.lib.display.IFrame at 0x7fcec5707978>"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -866,14 +906,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "dict_keys(['Ultra96', 'Pynq-Z1'])\n"
+      "dict_keys(['Ultra96', 'Pynq-Z1', 'Pynq-Z2', 'ZCU104'])\n"
      ]
     }
    ],
@@ -885,7 +925,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -916,7 +956,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -938,7 +978,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -957,7 +997,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -983,10 +1023,10 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f17f04c9470>"
+       "<IPython.lib.display.IFrame at 0x7fcec56c4f98>"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1008,17 +1048,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "hls_syn_StreamingFCLayer_Batch_0.tcl  thresh.h\r\n",
-      "ipgen.sh\t\t\t      top_StreamingFCLayer_Batch_0.cpp\r\n",
-      "params.h\t\t\t      vivado_hls.log\r\n",
-      "project_StreamingFCLayer_Batch_0\r\n"
+      "project_StreamingFIFO_0\r\n"
      ]
     }
    ],
@@ -1037,17 +1074,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "#!/bin/bash \r\n",
-      "cd /tmp/finn_maltanar/code_gen_ipgen_StreamingFCLayer_Batch_5f0hmok_\r\n",
-      "vivado_hls /tmp/finn_maltanar/code_gen_ipgen_StreamingFCLayer_Batch_5f0hmok_/hls_syn_StreamingFCLayer_Batch_0.tcl\r\n",
-      "cd /workspace/finn\r\n"
+      "cat: /tmp/finn_dev_jakobap/code_gen_ipgen_StreamingFIFO_0_ruu9s3g8/ipgen.sh: No such file or directory\r\n"
      ]
     }
    ],
@@ -1067,39 +1101,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\r\n",
-      "set config_proj_name project_StreamingFCLayer_Batch_0\r\n",
-      "puts \"HLS project: $config_proj_name\"\r\n",
-      "set config_hwsrcdir \"/tmp/finn_maltanar/code_gen_ipgen_StreamingFCLayer_Batch_5f0hmok_\"\r\n",
-      "puts \"HW source dir: $config_hwsrcdir\"\r\n",
-      "set config_proj_part \"xczu3eg-sbva484-1-e\"\r\n",
-      "\r\n",
-      "set config_bnnlibdir \"/workspace/finn-hlslib\"\r\n",
-      "\r\n",
-      "set config_toplevelfxn \"StreamingFCLayer_Batch_0\"\r\n",
-      "set config_clkperiod 5\r\n",
-      "\r\n",
-      "open_project $config_proj_name\r\n",
-      "add_files $config_hwsrcdir/top_StreamingFCLayer_Batch_0.cpp -cflags \"-std=c++0x -I$config_bnnlibdir\"\r\n",
-      "\r\n",
-      "set_top $config_toplevelfxn\r\n",
-      "open_solution sol1\r\n",
-      "set_part $config_proj_part\r\n",
-      "\r\n",
-      "config_interface -m_axi_addr64\r\n",
-      "config_rtl -auto_prefix\r\n",
-      "\r\n",
-      "create_clock -period $config_clkperiod -name default\r\n",
-      "csynth_design\r\n",
-      "export_design -format ip_catalog\r\n",
-      "exit 0\r\n"
+      "cat: /tmp/finn_dev_jakobap/code_gen_ipgen_StreamingFIFO_0_ruu9s3g8/hls_syn_StreamingFCLayer_Batch_0.tcl: No such file or directory\r\n"
      ]
     }
    ],
@@ -1128,7 +1137,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1148,22 +1157,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo\"\n",
+       "value: \"/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        "]"
       ]
      },
-     "execution_count": 31,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1174,16 +1183,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo'"
+       "'/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt'"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1208,7 +1217,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1248,7 +1257,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 31,
    "metadata": {
     "scrolled": true
    },
@@ -1257,17 +1266,19 @@
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo\"\n",
+       "value: \"/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
        ", key: \"wrapper_filename\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_oo2lpoeo/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
+       "value: \"/tmp/finn_dev_jakobap/vivado_stitch_proj_b75moqdt/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        ", key: \"vivado_pynq_proj\"\n",
-       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_hq9mfroo\"\n",
+       "value: \"/tmp/finn_dev_jakobap/vivado_pynq_proj_dz1m1usu\"\n",
+       ", key: \"vivado_synth_rpt\"\n",
+       "value: \"/tmp/finn_dev_jakobap/vivado_pynq_proj_dz1m1usu/synth_report.xml\"\n",
        "]"
       ]
      },
-     "execution_count": 34,
+     "execution_count": 31,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index 7b4ca37cd78c6299fa824ecfc16d79ae013bab37..78fc2ccfc92f9b7ca3ae6beafe7d24bdbfada2bc 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -25,7 +25,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
+import warnings
 import os
 import xml.etree.ElementTree as ET
 
@@ -41,13 +41,21 @@ def hls_synth_res_estimation(model):
     res_dict = {}
     for node in model.graph.node:
         if is_fpgadataflow_node(node) is True:
+            # init values to zero
+            res_dict[node.name] = dict()
+            res_dict[node.name]["BRAM_18K"] = 0
+            res_dict[node.name]["FF"] = 0
+            res_dict[node.name]["LUT"] = 0
+            res_dict[node.name]["DSP48E"] = 0
+            res_dict[node.name]["URAM"] = 0
             op_type = node.op_type
             inst = registry.custom_op[op_type](node)
             code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
             if code_gen_dir == "":
-                raise Exception(
-                    """Please run "CodeGen_ipgen" transformation and
-                        "HLSSynth_IPGen" first to generate the report files"""
+                warnings.warn(
+                    """Could not find report files, values will be set to zero
+                    for this node. Please run "CodeGen_ipgen" transformation and
+                    "HLSSynth_IPGen" first to generate the report files"""
                 )
             else:
                 xmlfile = "{}/project_{}/sol1/syn/report/{}_csynth.xml".format(
@@ -55,15 +63,15 @@ def hls_synth_res_estimation(model):
                 )
 
                 if os.path.isfile(xmlfile):
-                    res_dict[node.name] = dict()
                     tree = ET.parse(xmlfile)
                     root = tree.getroot()
                     for item in root.findall("AreaEstimates/Resources"):
                         for child in item:
                             res_dict[node.name][child.tag] = child.text
                 else:
-                    raise Exception(
-                        """Please run "HLSSynth_IPGen" first
-                            to generate the report files"""
+                    warnings.warn(
+                        """Could not find report files, values will be set to zero
+                        for this node. Please run "CodeGen_ipgen" transformation and
+                        "HLSSynth_IPGen" first to generate the report files"""
                     )
     return res_dict
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 2500b1f03b917225d92b00de033299f20e3d9f5d..9a6f66087fafff3745e239da4cb9f05c4ec73451 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -31,7 +31,7 @@ import numpy as np
 import os
 import subprocess
 from finn.custom_op import CustomOp
-from finn.util.basic import CppBuilder, make_build_dir
+from finn.util.basic import CppBuilder, make_build_dir, roundup_to_integer_multiple
 from finn.util.fpgadataflow import (
     IPGenBuilder,
     pyverilate_get_liveness_threshold_cycles,
@@ -82,6 +82,9 @@ class HLSCustomOp(CustomOp):
             "res_hls": ("s", False, ""),
             "res_synth": ("s", False, ""),
             "rtlsim_so": ("s", False, ""),
+            # input and output FIFO depths
+            "inFIFODepth": ("i", False, 2),
+            "outFIFODepth": ("i", False, 2),
         }
 
     def get_verilog_top_module_name(self):
@@ -498,7 +501,20 @@ compilation transformations?
         """Returns output stream width, if implemented."""
         raise Exception("get_outstream_width not implemented for this op")
 
+    def get_instream_width_padded(self):
+        """Returns input stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec."""
+        in_width = self.get_instream_width()
+        return roundup_to_integer_multiple(in_width, 8)
+
+    def get_outstream_width_padded(self):
+        """Returns output stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec."""
+        out_width = self.get_outstream_width()
+        return roundup_to_integer_multiple(out_width, 8)
+
     def get_ap_int_max_w(self):
+        "Return the maximum width of any ap_int used in this module."
         instream = self.get_instream_width()
         outstream = self.get_outstream_width()
         return max([instream, outstream])
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index e05b2dcea7e17231617f9d3880b778d1978b4ead..55b9a2753b50f76c57fb08c7a24b29b49d82c8b8 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -142,7 +142,8 @@ class ConvolutionInputGenerator(HLSCustomOp):
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
         assert simd == ifm_ch, "SWG currently requires SIMD=IFM"
-        return simd * ibits
+        in_width = simd * ibits
+        return in_width
 
     def get_outstream_width(self):
         """Returns stream width, input and output stream width are equal for
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 6a4070528ee50d97e62881d00b57355d2a2baf2d..f30871909b1c70f3b5df148f1b6eae22fdbadc25 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -151,10 +151,12 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         return np.prod(folded_ishape[:-1])
 
     def get_instream_width(self):
-        return self.get_nodeattr("inWidth")
+        in_width = self.get_nodeattr("inWidth")
+        return in_width
 
     def get_outstream_width(self):
-        return self.get_nodeattr("outWidth")
+        out_width = self.get_nodeattr("outWidth")
+        return out_width
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index eab3decc696cb86622bbdd8f22f015515ea936d5..46920711e13057178be9fca5fe3a18ce3e14feda 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -80,9 +80,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "binaryXnorMode": ("i", False, 0),
             # no-activation mode (produce accumulators)
             "noActivation": ("i", False, 0),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 0),
-            "outFIFODepth": ("i", False, 0),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -284,17 +281,24 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def get_instream_width(self):
         i_bits = self.get_input_datatype().bitwidth()
-        return i_bits * self.get_nodeattr("SIMD")
+        in_width = i_bits * self.get_nodeattr("SIMD")
+        return in_width
 
     def get_outstream_width(self):
         o_bits = self.get_output_datatype().bitwidth()
-        return o_bits * self.get_nodeattr("PE")
+        out_width = o_bits * self.get_nodeattr("PE")
+        return out_width
 
     def get_weightstream_width(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
         wp = self.get_weight_datatype().bitwidth()
-        return pe * simd * wp
+        w_width = pe * simd * wp
+        return w_width
+
+    def get_weightstream_width_padded(self):
+        weight_width = self.get_weightstream_width()
+        return roundup_to_integer_multiple(weight_width, 8)
 
     def get_ap_int_max_w(self):
         temp_value = super().get_ap_int_max_w()
@@ -976,13 +980,13 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
             ]
             # make instream width a multiple of 8 for AXI stream interface
-            in_width = roundup_to_integer_multiple(self.get_instream_width(), 8)
+            in_width = self.get_instream_width_padded()
             self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
             self.code_gen_dict["$OUT_RANGE$"] = [
-                "[{}:0]".format(self.get_outstream_width() - 1)
+                "[{}:0]".format(self.get_outstream_width_padded() - 1)
             ]
             # make weight stream width a multiple of 8 for AXI stream interface
-            weight_width = roundup_to_integer_multiple(self.get_weightstream_width(), 8)
+            weight_width = self.get_weightstream_width_padded()
             self.code_gen_dict["$WEIGHT_RANGE$"] = ["[{}:0]".format(weight_width - 1)]
             self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
             self.code_gen_dict["$WSTREAM_DEPTH$"] = [str(self.calc_wmem())]
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb96c6c04eb0b7b83c3f925e10f86b17ec399e42
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -0,0 +1,307 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import os
+import numpy as np
+from shutil import copy
+import subprocess
+
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.core.datatype import DataType
+from onnx import TensorProto, helper
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+from . import templates
+
+
+class StreamingFIFO(HLSCustomOp):
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+        self.strm_fifo_wrapper = templates.strm_fifo_wrapper
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # FIFO depth
+            "depth": ("i", True, 0),
+            # folded shape of input/output
+            "folded_shape": ("ints", True, []),
+            # FINN DataTypes for inputs/outputs
+            "dataType": ("s", True, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+
+        return my_attrs
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == tuple(exp_ishape), "Unexpected input shape for StreamingFIFO."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_verilog_top_module_name(self):
+        "Return the Verilog top module name for this node."
+
+        node = self.onnx_node
+        prefixed_top_name = "%s" % (node.name)
+        return prefixed_top_name
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_dir = "{}/project_{}/sol1/impl/verilog".format(
+            code_gen_dir, self.onnx_node.name
+        )
+        os.makedirs(verilog_dir)
+        # copy Q_srl.v from finn-rtllib to verilog directory
+        memstream_dir = "/workspace/finn/finn-rtllib/memstream/hdl/"
+        Q_file = os.path.join(memstream_dir, "Q_srl.v")
+        copy(Q_file, verilog_dir)
+
+        # empty code gen dictionary for new entries
+        self.code_gen_dict.clear()
+        self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)]
+        self.code_gen_dict["$LAYER_NAME$"] = [
+            "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
+        ]
+        # make instream width a multiple of 8 for axi interface
+        in_width = self.get_instream_width_padded()
+        self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
+        self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)]
+        self.code_gen_dict["$WIDTH$"] = [str(in_width)]
+        self.code_gen_dict["$DEPTH$"] = [str(self.get_nodeattr("depth"))]
+
+        template = self.strm_fifo_wrapper
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        f = open(os.path.join(verilog_dir, "{}.v".format(self.onnx_node.name,)), "w",)
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+    def ipgen_singlenode_code(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_dir = "{}/project_{}/sol1/impl/verilog".format(
+            code_gen_dir, self.onnx_node.name
+        )
+        # prepare the IP packaging tcl template
+        template = templates.ip_package_tcl
+        self.code_gen_dict.clear()
+        self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)]
+        self.code_gen_dict["$VERILOG_DIR$"] = [verilog_dir]
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        f = open(os.path.join(verilog_dir, "package_ip.tcl"), "w")
+        f.write(template)
+        f.close()
+        # create a shell script and call Vivado to invoke the IP pkg script
+        make_project_sh = verilog_dir + "/make_ip.sh"
+        working_dir = os.environ["PWD"]
+        with open(make_project_sh, "w") as f:
+            f.write("#!/bin/bash \n")
+            f.write("cd {}\n".format(verilog_dir))
+            f.write("vivado -mode batch -source package_ip.tcl\n")
+            f.write("cd {}\n".format(working_dir))
+        bash_command = ["bash", make_project_sh]
+        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+        process_compile.communicate()
+        # set ipgen_path and ip_path to point to the new packaged IP
+        self.set_nodeattr("ipgen_path", verilog_dir)
+        self.set_nodeattr("ip_path", verilog_dir)
+        vlnv = "xilinx.com:hls:%s:1.0" % (self.onnx_node.name)
+        self.set_nodeattr("ip_vlnv", vlnv)
+        self.code_gen_dict.clear()
+
+    def get_normal_input_shape(self):
+        depth = self.get_nodeattr("depth")
+        # depth has to be between 2 and 256 with the current
+        # StreamingFIFO implementation
+        assert (
+            depth >= 2
+        ), """Depth is too low. Please set node attribute "depth" to a value
+        between 2 and 256"""
+        assert (
+            depth <= 256
+        ), """Depth is too high. Please set node attribute "depth" to a value
+        between 2 and 256"""
+        # derive normal shape from folded shape
+        # StreamingFIFOs are inserted in between fpgadataflow nodes
+        # the folded shape could be for example (1, nf, pe)
+        # with nf (neuron folding): mh // pe
+        # the normal input shape is in this case (1, mh)
+        # so to achieve this the two inner dimensions are multiplied
+        # and together with all previous dimensions
+        # this gives the normal input shape
+
+        folded_shape = self.get_nodeattr("folded_shape")
+        # extract inner dimension
+        inner_dim = folded_shape[-1]
+        # multiply with the next inner dimension
+        folding_factor = folded_shape[-2] * inner_dim
+        normal_ishape = []
+        # create the normal_ishape
+        for i in range(len(folded_shape) - 2):
+            normal_ishape.append(folded_shape[i])
+        normal_ishape.append(folding_factor)
+
+        return normal_ishape
+
+    def get_normal_output_shape(self):
+        return self.get_normal_input_shape()
+
+    def get_folded_input_shape(self):
+        return self.get_nodeattr("folded_shape")
+
+    def get_folded_output_shape(self):
+        return self.get_nodeattr("folded_shape")
+
+    def get_instream_width(self):
+        dtype = DataType[self.get_nodeattr("dataType")]
+        folded_shape = self.get_nodeattr("folded_shape")
+        in_width = folded_shape[-1] * dtype.bitwidth()
+        return in_width
+
+    def get_outstream_width(self):
+        dtype = DataType[self.get_nodeattr("dataType")]
+        folded_shape = self.get_nodeattr("folded_shape")
+        in_width = folded_shape[-1] * dtype.bitwidth()
+        return in_width
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        inp = context[node.input[0]]
+        exp_shape = self.get_normal_input_shape()
+
+        if mode == "npysim":
+            output = inp
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_shape)
+            context[node.output[0]] = output
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            # create a npy file for the input of the node
+            assert (
+                str(inp.dtype) == "float32"
+            ), """Input datatype is
+                not float32 as expected."""
+            expected_inp_shape = self.get_folded_input_shape()
+            reshaped_input = inp.reshape(expected_inp_shape)
+            if DataType[self.get_nodeattr("dataType")] == DataType.BIPOLAR:
+                # store bipolar activations as binary
+                reshaped_input = (reshaped_input + 1) / 2
+                export_idt = DataType.BINARY
+            else:
+                export_idt = DataType[self.get_nodeattr("dataType")]
+            # make copy before saving the array
+            reshaped_input = reshaped_input.copy()
+            np.save(
+                os.path.join(code_gen_dir, "input_0.npy"), reshaped_input,
+            )
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = DataType[self.get_nodeattr("dataType")]
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+
+        else:
+            raise Exception("""Invalid value for attribute exec_mode! Is currently set to: {} has to be set to one of the following value ("npysim", "rtlsim")""".format(mode))
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def get_number_input_values(self):
+        folded_ishape = self.get_folded_input_shape()
+        return np.prod(folded_ishape[:-1])
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index f370d417aa0ac1ce5d62af878575332941e2c1d0..83bc19030ebba66907e08c5b1e52d7c0ff9207a6 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -90,7 +90,8 @@ class StreamingMaxPool_Batch(HLSCustomOp):
     def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
-        return int(dt_bits * ifm_ch)
+        in_width = int(dt_bits * ifm_ch)
+        return in_width
 
     def get_outstream_width(self):
         """For streaming maxpool out stream with is the same as in stream width"""
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index bfa90ebeda06e55ffaa9b8ea5b40369ed246ba86..6313bb79c21231c4be5b242558da5ac40fb2aa78 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -402,3 +402,43 @@ ipx::create_xgui_files [ipx::current_core]
 ipx::update_checksums [ipx::current_core]
 ipx::save_core [ipx::current_core]
 """
+
+strm_fifo_wrapper = """
+module $TOPNAME$(
+ap_clk,
+ap_rst_n,
+in0_V_V_TDATA,
+in0_V_V_TVALID,
+in0_V_V_TREADY,
+out_V_V_TDATA,
+out_V_V_TVALID,
+out_V_V_TREADY
+);
+
+input   ap_clk;
+input   ap_rst_n;
+input  $IN_RANGE$ in0_V_V_TDATA;
+input   in0_V_V_TVALID;
+output   in0_V_V_TREADY;
+output  $OUT_RANGE$ out_V_V_TDATA;
+output   out_V_V_TVALID;
+input   out_V_V_TREADY;
+
+Q_srl #(
+.depth($DEPTH$),
+.width($WIDTH$)
+)
+$LAYER_NAME$
+(
+ .clock(ap_clk),
+ .reset(!ap_rst_n),
+ .i_d(in0_V_V_TDATA),
+ .i_v(in0_V_V_TVALID),
+ .i_r(in0_V_V_TREADY),
+ .o_d(out_V_V_TDATA),
+ .o_v(out_V_V_TVALID),
+ .o_r(out_V_V_TREADY)
+);
+
+endmodule
+"""
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index c797affff9dbf1310c413db0847e0e2dae222a97..411311c2b9def953ee5ac6d03adfafb81704c177 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -33,6 +33,7 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
 )
 from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
+from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
 from finn.custom_op.im2col import Im2Col
 from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
 from finn.custom_op.multithreshold import MultiThreshold
@@ -56,6 +57,7 @@ custom_op["TLastMarker"] = TLastMarker
 custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
 custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
+custom_op["StreamingFIFO"] = StreamingFIFO
 
 
 def getCustomOp(node):
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..f66d0dc087ecbdd112422484ee1e01cb5ceef1c0
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -0,0 +1,180 @@
+from onnx import TensorProto
+from onnx import helper as oh
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+def _is_fifo_node(node):
+    if node.op_type == "StreamingFIFO":
+        return True
+    else:
+        return False
+
+
+def _suitable_node(node):
+    if node is not None:
+        if is_fpgadataflow_node(node) is True:
+            if _is_fifo_node(node) is False:
+                return True
+            else:
+                return False
+        else:
+            return False
+    else:
+        return False
+
+
+class InsertFIFO(Transformation):
+    """Inserting FIFOs in the beginning and end of the graph as well as
+    between fpgadataflow nodes.
+
+    Takes the setting for the depth from the surrounding nodes by extracting
+    node attribute 'outFIFODepth' of the previous and node attribute 'inFIFODepth'
+    of the subsequent node. max() of these two values sets the FIFO depth.
+
+    The other node attributes necessary to create a FIFO node are taken from the
+    node the FIFO node is inserted after: 'folded_shape' and 'dataType'."""
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = -1
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if _suitable_node(n):
+                n_output = n.output[0]
+                consumer = model.find_consumer(n_output)
+                if _suitable_node(consumer) is True:
+                    graph_modified = True
+                    n0 = getCustomOp(n)
+                    # determine fifo node attributes
+                    fld_shape = n0.get_folded_output_shape()
+                    dtype = n0.get_output_datatype()
+
+                    # check if folded_shape of output of first node and
+                    # input of the second node is equal
+                    n1 = getCustomOp(consumer)
+                    assert (
+                        fld_shape == n1.get_folded_input_shape()
+                    ), """The
+                    folded output shape of the first node is not the same as the
+                    folded input shape of the second node. A streaming fifo can't
+                    be implemented in between these nodes."""
+
+                    # check if outFIFOdepth attribute of first node
+                    # and inFIFOdepth attribute of consumer node is equal
+                    n0_depth = n0.get_nodeattr("outFIFODepth")
+                    n1_depth = n1.get_nodeattr("inFIFODepth")
+                    if n0_depth == n1_depth:
+                        fifo_depth = n0_depth
+                    elif n0_depth != n1_depth:
+                        fifo_depth = max(n0_depth, n1_depth)
+                        n0.set_nodeattr("outFIFODepth", fifo_depth)
+                        n1.set_nodeattr("inFIFODepth", fifo_depth)
+
+                    # create fifo node
+                    fifo_output_tensor = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        n0.get_normal_output_shape(),
+                    )
+                    graph.value_info.append(fifo_output_tensor)
+                    model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+
+                    fifo_node = oh.make_node(
+                        "StreamingFIFO",
+                        [n_output],
+                        [fifo_output_tensor.name],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        depth=fifo_depth,
+                        folded_shape=fld_shape,
+                        dataType=str(dtype.name),
+                    )
+                    # insert fifo
+                    graph.node.insert(node_ind + 1, fifo_node)
+
+                    # set fifo output tensor as new input tensor of second node
+                    consumer.input[0] = fifo_output_tensor.name
+
+        if graph_modified is False:
+            # insert FIFO as first node
+            if graph.node[0].op_type != "StreamingFIFO":
+                n = graph.node[0]
+                n_input = n.input[0]
+                n0 = getCustomOp(n)
+                # determine fifo node attributes
+                fld_shape = n0.get_folded_input_shape()
+                dtype = n0.get_input_datatype()
+                fifo_depth = n0.get_nodeattr("inFIFODepth")
+
+                # create fifo node
+                fifo_output_tensor = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    n0.get_normal_input_shape(),
+                )
+                graph.value_info.append(fifo_output_tensor)
+                model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+
+                fifo_node = oh.make_node(
+                    "StreamingFIFO",
+                    [n_input],
+                    [fifo_output_tensor.name],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    depth=fifo_depth,
+                    folded_shape=fld_shape,
+                    dataType=str(dtype.name),
+                )
+                # insert fifo
+                graph.node.insert(0, fifo_node)
+
+                # set fifo output tensor as new input tensor of the first node
+                n.input[0] = fifo_output_tensor.name
+
+            # insert FIFO as last node
+            if graph.node[-1].op_type != "StreamingFIFO":
+                n = graph.node[-1]
+                assert (
+                    n.op_type != "TLastMarker"
+                ), """Insert tlast marker should be done
+                    after inserting the FIFOs"""
+                graph_out_name = graph.output[0].name
+                n0 = getCustomOp(n)
+                # determine fifo node attributes
+                fld_shape = n0.get_folded_output_shape()
+                dtype = n0.get_output_datatype()
+                fifo_depth = n0.get_nodeattr("outFIFODepth")
+
+                # create fifo node
+                fifo_input_tensor = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    n0.get_normal_output_shape(),
+                )
+                graph.value_info.append(fifo_input_tensor)
+                model.set_tensor_datatype(fifo_input_tensor.name, dtype)
+
+                fifo_node = oh.make_node(
+                    "StreamingFIFO",
+                    [fifo_input_tensor.name],
+                    [graph_out_name],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    depth=fifo_depth,
+                    folded_shape=fld_shape,
+                    dataType=str(dtype.name),
+                )
+                # insert fifo
+                graph.node.append(fifo_node)
+
+                # set fifo input tensor as new output tensor of the last node
+                n.output[0] = fifo_input_tensor.name
+
+        return (model, graph_modified)
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1.py
index 8a670fce2e7e6585c98efa9e4a6e27a660edf925..c30fd8dc04642be7fe59d24a2dfd10dbf29e8488 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1.py
@@ -55,6 +55,7 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
@@ -138,17 +139,21 @@ def test_end2end_tfc_w1a1_fold_and_tlastmarker():
     fc0w.set_nodeattr("PE", 16)
     fc0w.set_nodeattr("outFIFODepth", 4)
     fc0w.set_nodeattr("ram_style", "block")
+    fc1w.set_nodeattr("inFIFODepth", 4)
     fc1w.set_nodeattr("SIMD", 8)
     fc1w.set_nodeattr("PE", 8)
     fc1w.set_nodeattr("outFIFODepth", 4)
+    fc2w.set_nodeattr("inFIFODepth", 4)
     fc2w.set_nodeattr("SIMD", 16)
     fc2w.set_nodeattr("PE", 16)
     fc2w.set_nodeattr("outFIFODepth", 4)
+    fc3w.set_nodeattr("inFIFODepth", 4)
     fc3w.set_nodeattr("SIMD", 16)
     fc3w.set_nodeattr("PE", 10)
     fc3w.set_nodeattr("outFIFODepth", 50)
     fc3w.set_nodeattr("ram_style", "distributed")
     model = model.transform(InsertDWC())
+    model = model.transform(InsertFIFO())
     model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(AnnotateResources("estimate"))
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa80f0050f0cc687c20a8e1007ed67b63989b977
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -0,0 +1,75 @@
+import pytest
+
+from onnx import TensorProto, helper
+
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
+
+from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.util.basic import gen_finn_dt_tensor
+import finn.core.onnx_exec as oxe
+
+
+def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype):
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape)
+
+    FIFO_node = helper.make_node(
+        "StreamingFIFO",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        depth=Depth,
+        folded_shape=fld_shape,
+        dataType=str(finn_dtype.name),
+    )
+
+    graph = helper.make_graph(
+        nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="fifo-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", finn_dtype)
+    model.set_tensor_datatype("outp", finn_dtype)
+
+    return model
+
+
+def prepare_inputs(input_tensor, dt):
+    return {"inp": input_tensor}
+
+
+# shape
+@pytest.mark.parametrize("Shape", [[1, 4]])
+# inWidth
+@pytest.mark.parametrize("folded_shape", [[1, 1, 4]])
+# outWidth
+@pytest.mark.parametrize("depth", [2])
+# finn_dtype
+@pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR, DataType.INT2])
+def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):
+
+    # generate input data
+    x = gen_finn_dt_tensor(finn_dtype, Shape)
+    input_dict = prepare_inputs(x, finn_dtype)
+
+    model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype)
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynth_IPGen())
+    y = oxe.execute_onnx(model, input_dict)["outp"]
+
+    assert (
+        y == x
+    ).all(), """The output values are not the same as the
+        input values anymore."""
+    assert y.shape == tuple(Shape), """The output shape is incorrect."""