diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index 80b015f6d4eb69df36831b25262cda3539ac8ae9..6c619c51ceb4a99a077fc61c52ce81763cfd27f5 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -193,7 +193,8 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
       if (shift_en_) begin
 	 // synthesis loop_limit 256
 	 for (a_=depth-2; a_>0; a_=a_-1) begin
-	    srl[a_] <= srl[a_-1];
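+	    // blocking assignment: Verilator (used for rtlsim via PyVerilator)
+	    // does not support non-blocking assignments to array elements in loops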
+	    srl[a_] = srl[a_-1];
 	 end
 	 srl[0] <= i_d;
       end
diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index c7db5b1d9d22ea89740f4c82633c96746a6fa5ee..958890f9e6a84d796ecb4a817dbf740c117ede0b 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -25,7 +25,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
 import os
+import warnings
 import xml.etree.ElementTree as ET
 
@@ -50,9 +50,16 @@ def hls_synth_res_estimation(model):
                 inst = registry.custom_op[op_type](node)
                 code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
                 if code_gen_dir == "":
-                    raise Exception(
-                        """Please run "CodeGen_ipgen" transformation and
-                            "HLSSynth_IPGen" first to generate the report files"""
+                    res_dict[node.name] = dict()
+                    res_dict[node.name]["BRAM_18K"] = 0
+                    res_dict[node.name]["FF"] = 0
+                    res_dict[node.name]["LUT"] = 0
+                    res_dict[node.name]["DSP48E"] = 0
+                    res_dict[node.name]["URAM"] = 0
+                    warnings.warn(
+                        """Could not find report files, values will be set to zero
+                        for this node. Please run "CodeGen_ipgen" transformation and
+                        "HLSSynth_IPGen" first to generate the report files"""
                     )
                 else:
                     xmlfile = "{}/project_{}/sol1/syn/report/{}_csynth.xml".format(
@@ -67,9 +74,16 @@ def hls_synth_res_estimation(model):
                             for child in item:
                                 res_dict[node.name][child.tag] = child.text
                     else:
-                        raise Exception(
-                            """Please run "HLSSynth_IPGen" first
-                                to generate the report files"""
+                        res_dict[node.name] = dict()
+                        res_dict[node.name]["BRAM_18K"] = 0
+                        res_dict[node.name]["FF"] = 0
+                        res_dict[node.name]["LUT"] = 0
+                        res_dict[node.name]["DSP48E"] = 0
+                        res_dict[node.name]["URAM"] = 0
+                        warnings.warn(
+                            """Could not find report files, values will be set to zero
+                            for this node. Please run "HLSSynth_IPGen" first
+                            to generate the report files"""
                         )
 
     return res_dict
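
Note: with this change the analysis pass degrades gracefully on nodes that have not
been synthesized yet instead of aborting. A minimal usage sketch (assuming "model" is
a ModelWrapper; XML-derived values are strings while the new fallbacks are ints, so
both are cast):

    res = hls_synth_res_estimation(model)
    total_lut = sum(int(v["LUT"]) for v in res.values())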
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 4fc69985f7cdf09298f79055e159f63b2eabaf97..8d8d64b708117d48dda7a6bff8b35f4208e00dd1 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -428,11 +428,11 @@ compilation transformations?
         """Returns folded output shape (according to neuron folding), if implemented."""
         raise Exception("get_folded_output_shape not implemented for this op")
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         """Returns input stream width, if implemented."""
         raise Exception("get_instream_width not implemented for this op")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         """Returns output stream width, if implemented."""
         raise Exception("get_outstream_width not implemented for this op")
 
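
Note: AXI Stream TDATA widths must be a whole number of bytes, which is what the new
axi_strm_padding flag accounts for in the subclasses. A sketch of what the helper is
expected to compute, mirroring finn.util.basic.roundup_to_integer_multiple:

    import math

    def roundup_to_integer_multiple(x, factor):
        # smallest multiple of factor that is >= x
        return int(math.ceil(x / factor) * factor)

    assert roundup_to_integer_multiple(10, 8) == 16
    assert roundup_to_integer_multiple(16, 8) == 16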
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 2ef5d350fb972e448b9a3745eb8c98197ab87d94..a695fe6df209bb3810664c2ce7af5410e03a077c 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -39,6 +39,7 @@ from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.custom_op.im2col import compute_conv_output_dim
 from onnx import TensorProto, helper
+from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator:
@@ -140,20 +141,23 @@ class ConvolutionInputGenerator(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         """Returns stream width, input and output stream width are equal for
         the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
         assert simd == ifm_ch, "SWG currently requires SIMD=IFM"
-        return simd * ibits
+        in_width = simd * ibits
+        if axi_strm_padding:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         """Returns stream width, input and output stream width are equal for
         the sliding window function, so the function to determine the input
         stream width can be reused."""
-        return self.get_instream_width()
+        return self.get_instream_width(axi_strm_padding)
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 5e4c99aa41216b05f66da8341870269c620c6c40..1a9ee1118596a95b624258d3ee8fe4c37a71edde 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -36,6 +36,7 @@ except ModuleNotFoundError:
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
+from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 # does not do anything at the ONNX node-by-node level, and input-output
@@ -154,11 +155,17 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         folded_ishape = self.get_folded_input_shape()
         return np.prod(folded_ishape[:-1])
 
-    def get_instream_width(self):
-        return self.get_nodeattr("inWidth")
-
-    def get_outstream_width(self):
-        return self.get_nodeattr("outWidth")
+    def get_instream_width(self, axi_strm_padding=False):
+        in_width = self.get_nodeattr("inWidth")
+        if axi_strm_padding:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
+
+    def get_outstream_width(self, axi_strm_padding=False):
+        out_width = self.get_nodeattr("outWidth")
+        if axi_strm_padding:
+            out_width = roundup_to_integer_multiple(out_width, 8)
+        return out_width
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 7784024aae102989338df9b040fcfc1f9dc36983..f00c19ff1a7d2758af0e3320677b32b87279082a 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -40,7 +40,10 @@ except ModuleNotFoundError:
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.basic import (
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
@@ -260,19 +263,28 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         i_bits = self.get_input_datatype().bitwidth()
-        return i_bits * self.get_nodeattr("SIMD")
+        in_width = i_bits * self.get_nodeattr("SIMD")
+        if axi_strm_padding:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         o_bits = self.get_output_datatype().bitwidth()
-        return o_bits * self.get_nodeattr("PE")
+        out_width = o_bits * self.get_nodeattr("PE")
+        if axi_strm_padding:
+            out_width = roundup_to_integer_multiple(out_width, 8)
+        return out_width
 
-    def get_weightstream_width(self):
+    def get_weightstream_width(self, axi_strm_padding=False):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
         wp = self.get_weight_datatype().bitwidth()
-        return pe * simd * wp
+        w_width = pe * simd * wp
+        if axi_strm_padding:
+            w_width = roundup_to_integer_multiple(w_width, 8)
+        return w_width
 
     def get_ap_int_max_w(self):
         temp_value = super().get_ap_int_max_w()
@@ -981,18 +993,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             self.code_gen_dict["$LAYER_NAME$"] = [
                 "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
             ]
-            # make instream width a multiple of 8 for axi interface
-            in_width = self.get_instream_width()
-            if in_width % 8 != 0:
-                in_width = math.floor(in_width / 8) + 8
+            in_width = self.get_instream_width(axi_strm_padding=True)
             self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
             self.code_gen_dict["$OUT_RANGE$"] = [
-                "[{}:0]".format(self.get_outstream_width() - 1)
+                "[{}:0]".format(self.get_outstream_width(axi_strm_padding=True) - 1)
             ]
-            # make weight stream width a multiple of 8 for axi interface
-            weight_width = self.get_weightstream_width()
-            if weight_width % 8 != 0:
-                weight_width = math.floor(weight_width / 8) + 8
+            weight_width = self.get_weightstream_width(axi_strm_padding=True)
             self.code_gen_dict["$WEIGHT_RANGE$"] = ["[{}:0]".format(weight_width - 1)]
             self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
             mw = self.get_nodeattr("MW")
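
Note: besides centralizing the padding logic, this replaces an inline computation
that did not actually round up to a byte multiple:

    import math

    in_width = 9
    math.floor(in_width / 8) + 8              # == 9, not a multiple of 8
    roundup_to_integer_multiple(in_width, 8)  # == 16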
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fcb1fe43a3927a7d49b6e041727a54cc384942f
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -0,0 +1,301 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import os
+import numpy as np
+from shutil import copy
+import subprocess
+
+from pyverilator import PyVerilator
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.core.datatype import DataType
+from onnx import TensorProto, helper
+from finn.util.basic import roundup_to_integer_multiple
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+from . import templates
+
+
+class StreamingFIFO(HLSCustomOp):
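+    """Standalone FIFO between two fpgadataflow nodes, implemented by wrapping
+    the Q_srl.v shift-register FIFO from finn-rtllib."""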
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+        self.strm_fifo_wrapper = templates.strm_fifo_wrapper
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # FIFO depth
+            "depth": ("i", True, 0),
+            # folded shape of input/output
+            "folded_shape": ("ints", True, []),
+            # FINN DataTypes for inputs/outputs
+            "dataType": ("s", True, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+
+        return my_attrs
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == tuple(exp_ishape), "Unexpected input shape for StreamingFIFO."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # copy Q_srl.v from finn-rtllib to code gen directory
+        memstream_dir = "/workspace/finn/finn-rtllib/memstream/hdl/"
+        Q_file = os.path.join(memstream_dir, "Q_srl.v")
+        copy(Q_file, code_gen_dir)
+
+        # empty code gen dictionary for new entries
+        self.code_gen_dict.clear()
+        self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)]
+        self.code_gen_dict["$LAYER_NAME$"] = [
+            "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
+        ]
+        # make instream width a multiple of 8 for axi interface
+        in_width = self.get_instream_width(axi_strm_padding=True)
+        self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
+        self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)]
+        self.code_gen_dict["$WIDTH$"] = [str(in_width)]
+        self.code_gen_dict["$DEPTH$"] = [str(self.get_nodeattr("depth"))]
+
+        template = self.strm_fifo_wrapper
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        f = open(os.path.join(code_gen_dir, "{}.v".format(self.onnx_node.name)), "w",)
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+    def ipgen_singlenode_code(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # prepare the IP packaging tcl template
+        template = templates.ip_package_tcl
+        self.code_gen_dict.clear()
+        self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)]
+        self.code_gen_dict["$VERILOG_DIR$"] = [code_gen_dir]
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        f = open(os.path.join(code_gen_dir, "package_ip.tcl"), "w")
+        f.write(template)
+        f.close()
+        # create a shell script and call Vivado to invoke the IP pkg script
+        make_project_sh = code_gen_dir + "/make_ip.sh"
+        working_dir = os.environ["PWD"]
+        with open(make_project_sh, "w") as f:
+            f.write("#!/bin/bash \n")
+            f.write("cd {}\n".format(code_gen_dir))
+            f.write("vivado -mode batch -source package_ip.tcl\n")
+            f.write("cd {}\n".format(working_dir))
+        bash_command = ["bash", make_project_sh]
+        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+        process_compile.communicate()
+        # set ipgen_path and ip_path to point to the new packaged IP
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+        vlnv = "xilinx.com:hls:%s:1.0" % (self.onnx_node.name)
+        self.set_nodeattr("ip_vlnv", vlnv)
+        self.code_gen_dict.clear()
+
+    def get_normal_input_shape(self):
+        depth = self.get_nodeattr("depth")
+        assert (
+            depth >= 2
+        ), """Depth is too low. Please set node attribute "depth" to a value
+        between 2 and 256"""
+        assert (
+            depth <= 256
+        ), """Depth is too high. Please set node attribute "depth" to a value
+        between 2 and 256"""
+        folded_shape = self.get_nodeattr("folded_shape")
+        inner_dim = folded_shape[-1]
+        folding_factor = folded_shape[-2] * inner_dim
+        normal_ishape = []
+        for i in range(len(folded_shape) - 2):
+            normal_ishape.append(folded_shape[i])
+        normal_ishape.append(folding_factor)
+
+        return normal_ishape
+
+    def get_normal_output_shape(self):
+        return self.get_normal_input_shape()
+
+    def get_folded_input_shape(self):
+        return self.get_nodeattr("folded_shape")
+
+    def get_folded_output_shape(self):
+        return self.get_nodeattr("folded_shape")
+
+    def get_instream_width(self, axi_strm_padding=False):
+        dtype = DataType[self.get_nodeattr("dataType")]
+        folded_shape = self.get_nodeattr("folded_shape")
+        in_width = folded_shape[-1] * dtype.bitwidth()
+        if axi_strm_padding:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
+
+    def get_outstream_width(self, axi_strm_padding=False):
+        dtype = DataType[self.get_nodeattr("dataType")]
+        folded_shape = self.get_nodeattr("folded_shape")
+        out_width = folded_shape[-1] * dtype.bitwidth()
+        if axi_strm_padding:
+            out_width = roundup_to_integer_multiple(out_width, 8)
+        return out_width
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        inp = context[node.input[0]]
+        exp_shape = self.get_normal_input_shape()
+
+        if mode == "npysim":
+            output = inp
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_shape)
+            context[node.output[0]] = output
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            # create a npy file for the input of the node
+            assert (
+                str(inp.dtype) == "float32"
+            ), """Input datatype is
+                not float32 as expected."""
+            expected_inp_shape = self.get_folded_input_shape()
+            reshaped_input = inp.reshape(expected_inp_shape)
+            if DataType[self.get_nodeattr("dataType")] == DataType.BIPOLAR:
+                # store bipolar activations as binary
+                reshaped_input = (reshaped_input + 1) / 2
+                export_idt = DataType.BINARY
+            else:
+                export_idt = DataType[self.get_nodeattr("dataType")]
+            # make copy before saving the array
+            reshaped_input = reshaped_input.copy()
+            np.save(
+                os.path.join(code_gen_dir, "input_0.npy"), reshaped_input,
+            )
+            verilog_file = os.path.join(
+                code_gen_dir, "{}.v".format(self.onnx_node.name)
+            )
+            if os.path.isfile(verilog_file):
+                nbits = self.get_instream_width(axi_strm_padding=True)
+                inp = npy_to_rtlsim_input(
+                    "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+                )
+                sim = PyVerilator.build(verilog_file, verilog_path=[code_gen_dir],)
+                super().reset_rtlsim(sim)
+                super().toggle_clk(sim)
+                output = self.rtlsim(sim, inp)
+                odt = DataType[self.get_nodeattr("dataType")]
+                target_bits = odt.bitwidth()
+                packed_bits = self.get_outstream_width(axi_strm_padding=True)
+                out_npy_path = "{}/output.npy".format(code_gen_dir)
+                out_shape = self.get_folded_output_shape()
+                rtlsim_output_to_npy(
+                    output, out_npy_path, odt, out_shape, packed_bits, target_bits
+                )
+
+                # load and reshape output
+                output = np.load(out_npy_path)
+                oshape = self.get_normal_output_shape()
+                output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+                context[node.output[0]] = output
+
+            else:
+                raise Exception(
+                    """Found no verilog files for this node,
+                    did you run the codegen_ipgen transformation?"""
+                )
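+        else:
+            # fail loudly on an unknown exec_mode instead of silently doing nothing
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+                has to be set to "npysim" or "rtlsim" """.format(mode)
+            )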
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def get_number_input_values(self):
+        folded_ishape = self.get_folded_input_shape()
+        return np.prod(folded_ishape[:-1])
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index a7c2d5166b6af41327abcfeaa5cb5ae25fd23856..5e77a60de07e0b6de5c001f6e889476f496db50f 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -37,6 +37,7 @@ from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.custom_op.im2col import compute_conv_output_dim
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
+from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -91,14 +92,17 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         dt_bits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
-        return int(dt_bits * ifm_ch)
+        in_width = int(dt_bits * ifm_ch)
+        if axi_strm_padding is True:
+            in_width = roundup_to_integer_multiple(in_width, 8)
+        return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         """For streaming maxpool out stream with is the same as in stream width"""
-        return self.get_instream_width()
+        return self.get_instream_width(axi_strm_padding)
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 5323aac2e344fb8b3c1166e695753e68a435b08f..c53a17aafc496a2ffb6dd8009f8bbf7358b90737 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -415,3 +415,43 @@ ipx::create_xgui_files [ipx::current_core]
 ipx::update_checksums [ipx::current_core]
 ipx::save_core [ipx::current_core]
 """
+
+strm_fifo_wrapper = """
+module $TOPNAME$(
+ap_clk,
+ap_rst_n,
+in0_V_V_TDATA,
+in0_V_V_TVALID,
+in0_V_V_TREADY,
+out_V_V_TDATA,
+out_V_V_TVALID,
+out_V_V_TREADY
+);
+
+input   ap_clk;
+input   ap_rst_n;
+input  $IN_RANGE$ in0_V_V_TDATA;
+input   in0_V_V_TVALID;
+output   in0_V_V_TREADY;
+output  $OUT_RANGE$ out_V_V_TDATA;
+output   out_V_V_TVALID;
+input   out_V_V_TREADY;
+
+Q_srl #(
+.depth($DEPTH$),
+.width($WIDTH$)
+)
+$LAYER_NAME$
+(
+ .clock(ap_clk),
+ .reset(!ap_rst_n),
+ .i_d(in0_V_V_TDATA),
+ .i_v(in0_V_V_TVALID),
+ .i_r(in0_V_V_TREADY),
+ .o_d(out_V_V_TDATA),
+ .o_v(out_V_V_TVALID),
+ .o_r(out_V_V_TREADY)
+);
+
+endmodule
+"""
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 4d4dee6506f04909c53cd05e4898a7ad77e4a83a..a04b2a886984f3f98bd765ce617be6ca7c0170a8 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.basic import roundup_to_integer_multiple
 
 
 class TLastMarker(HLSCustomOp):
@@ -133,12 +134,16 @@ class TLastMarker(HLSCustomOp):
     def get_folded_output_shape(self):
         return self.get_folded_input_shape()
 
-    def get_instream_width(self):
+    def get_instream_width(self, axi_strm_padding=False):
         stream_width = self.get_nodeattr("StreamWidth")
+        if axi_strm_padding is True:
+            stream_width = roundup_to_integer_multiple(stream_width, 8)
         return stream_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, axi_strm_padding=False):
         stream_width = self.get_nodeattr("StreamWidth")
+        if axi_strm_padding is True:
+            stream_width = roundup_to_integer_multiple(stream_width, 8)
         return stream_width
 
     def strm_decl(self):
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index c797affff9dbf1310c413db0847e0e2dae222a97..411311c2b9def953ee5ac6d03adfafb81704c177 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -33,6 +33,7 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
 )
 from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
+from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
 from finn.custom_op.im2col import Im2Col
 from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
 from finn.custom_op.multithreshold import MultiThreshold
@@ -56,6 +57,7 @@ custom_op["TLastMarker"] = TLastMarker
 custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
 custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
+custom_op["StreamingFIFO"] = StreamingFIFO
 
 
 def getCustomOp(node):
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7efb95c8df4fbe83c210f7a3f0832f3e2a3d18d
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -0,0 +1,141 @@
+from onnx import TensorProto
+from onnx import helper as oh
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+def _is_fifo_node(node):
+    return node.op_type == "StreamingFIFO"
+
+
+def _suitable_node(node):
+    return (
+        node is not None
+        and is_fpgadataflow_node(node)
+        and not _is_fifo_node(node)
+    )
+
+
+class InsertFIFO(Transformation):
+    """Ensure that the graph is terminated with a TLastMarker node, inserting
+    one if necessary."""
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        # default depth for FIFOs
+        default_depth = 2
+        graph = model.graph
+        node_ind = -1
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if _suitable_node(n):
+                n_output = n.output[0]
+                consumer = model.find_consumer(n_output)
+                if _suitable_node(consumer):
+                    graph_modified = True
+                    n0 = getCustomOp(n)
+                    # determine fifo node attributes
+                    fld_shape = n0.get_folded_output_shape()
+                    dtype = n0.get_output_datatype()
+
+                    # create fifo node
+                    fifo_output_tensor = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        n0.get_normal_output_shape(),
+                    )
+                    graph.value_info.append(fifo_output_tensor)
+
+                    fifo_node = oh.make_node(
+                        "StreamingFIFO",
+                        [n_output],
+                        [fifo_output_tensor.name],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        depth=default_depth,
+                        folded_shape=fld_shape,
+                        dataType=str(dtype.name),
+                    )
+                    # insert fifo
+                    graph.node.insert(node_ind + 1, fifo_node)
+
+                    # set fifo output tensor as new input tensor of second node
+                    consumer.input[0] = fifo_output_tensor.name
+
+        if not graph_modified:
+            # insert FIFO as first node
+            if graph.node[0].op_type != "StreamingFIFO":
+                n = graph.node[0]
+                n_input = n.input[0]
+                n0 = getCustomOp(n)
+                # determine fifo node attributes
+                fld_shape = n0.get_folded_input_shape()
+                dtype = n0.get_input_datatype()
+
+                # create fifo node
+                fifo_output_tensor = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    n0.get_normal_input_shape(),
+                )
+                graph.value_info.append(fifo_output_tensor)
+
+                fifo_node = oh.make_node(
+                    "StreamingFIFO",
+                    [n_input],
+                    [fifo_output_tensor.name],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    depth=default_depth,
+                    folded_shape=fld_shape,
+                    dataType=str(dtype.name),
+                )
+                # insert fifo
+                graph.node.insert(0, fifo_node)
+
+                # set fifo output tensor as new input tensor of the first node
+                n.input[0] = fifo_output_tensor.name
+
+            # insert FIFO as last node
+            if graph.node[-1].op_type != "StreamingFIFO":
+                n = graph.node[-1]
+                assert (
+                    n.op_type != "TLastMarker"
+                ), """Insert tlast marker should be done
+                    after inserting the FIFOs"""
+                graph_out_name = graph.output[0].name
+                n0 = getCustomOp(n)
+                # determine fifo node attributes
+                fld_shape = n0.get_folded_output_shape()
+                dtype = n0.get_output_datatype()
+
+                # create fifo node
+                fifo_input_tensor = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    n0.get_normal_output_shape(),
+                )
+                graph.value_info.append(fifo_input_tensor)
+
+                fifo_node = oh.make_node(
+                    "StreamingFIFO",
+                    [fifo_input_tensor.name],
+                    [graph_out_name],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    depth=default_depth,
+                    folded_shape=fld_shape,
+                    dataType=str(dtype.name),
+                )
+                # insert fifo
+                graph.node.append(fifo_node)
+
+                # rewire the last node's output to feed the FIFO's input tensor
+                n.output[0] = fifo_input_tensor.name
+
+        return (model, graph_modified)
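
Note: apply() reports graph_modified=True whenever an internal FIFO was inserted, so
ModelWrapper.transform re-runs it until nothing changes; the FIFOs at the graph input
and output are only added on that final pass. Typical usage:

    model = model.transform(InsertFIFO())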
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 81cb954bb4503c8daf18bad5881661018e9d17b7..4a7845ee4f6f43edb067351352925d6c8bcb4fce 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -85,6 +85,7 @@ pynq_driver_template = """
 from pynq import Overlay
 import numpy as np
 from pynq import allocate
+import time
 from finn.util.data_packing import (
     finnpy_to_packed_bytearray,
     packed_bytearray_to_finnpy
@@ -129,12 +130,20 @@
 # allocate a PYNQ buffer for the returned packed output buffer
 obuf_packed = allocate(shape=oshape_packed, dtype=np.uint8)
 
+# measure runtime of network
+start = time.time()
+
 # set up the DMA and wait until all transfers complete
 dma.sendchannel.transfer(ibuf_packed_device)
 dma.recvchannel.transfer(obuf_packed)
 dma.sendchannel.wait()
 dma.recvchannel.wait()
 
+end = time.time()
+runtime = end - start
+file = open("nw_runtime.txt", "w")
+file.write(str(runtime))
+file.close()
+
 # unpack the packed output buffer from accelerator
 obuf_folded = packed_bytearray_to_finnpy(
     obuf_packed, odt, oshape_folded, reverse_endian=True, reverse_inner=True
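
Note: the driver only records the raw end-to-end transfer time. A sketch of deriving
throughput from it afterwards (N here stands for the batch size the driver was run
with; it is an assumption, not a variable defined by the template):

    with open("nw_runtime.txt") as f:
        runtime_s = float(f.read())
    throughput_fps = N / runtime_s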
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
similarity index 93%
rename from tests/end2end/test_end2end_tfc_w1a1.py
rename to tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
index 03d6f92f1c148ce444f08fd65a867ad9390a18fd..946e84f2ab386f2046cc4756d37a2438ed05238b 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
@@ -55,6 +55,7 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
@@ -132,22 +133,29 @@
     fc1w = getCustomOp(fc_layers[1])
     fc2w = getCustomOp(fc_layers[2])
     fc3w = getCustomOp(fc_layers[3])
-    fc0w.set_nodeattr("inFIFODepth", 50)
-    fc0w.set_nodeattr("SIMD", 16)
+    fc0w.set_nodeattr("inFIFODepth", 256)
+    fc0w.set_nodeattr("SIMD", 196)
     fc0w.set_nodeattr("PE", 16)
-    fc0w.set_nodeattr("outFIFODepth", 4)
-    fc1w.set_nodeattr("SIMD", 8)
-    fc1w.set_nodeattr("PE", 8)
-    fc1w.set_nodeattr("outFIFODepth", 4)
+    fc0w.set_nodeattr("outFIFODepth", 64)
+    fc1w.set_nodeattr("SIMD", 16)
+    fc1w.set_nodeattr("PE", 16)
+    fc1w.set_nodeattr("outFIFODepth", 64)
     fc2w.set_nodeattr("SIMD", 16)
     fc2w.set_nodeattr("PE", 16)
-    fc2w.set_nodeattr("outFIFODepth", 4)
+    fc2w.set_nodeattr("outFIFODepth", 64)
     fc3w.set_nodeattr("SIMD", 16)
     fc3w.set_nodeattr("PE", 10)
-    fc3w.set_nodeattr("outFIFODepth", 50)
+    fc3w.set_nodeattr("outFIFODepth", 10)
     model = model.transform(InsertDWC())
+    model = model.transform(InsertFIFO())
     model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
+    # set the depth of each inserted StreamingFIFO node
+    fifos = [n for n in model.graph.node if n.op_type == "StreamingFIFO"]
+    assert len(fifos) == 5
+    fifo_depths = [256, 64, 64, 64, 10]
+    for fifo, depth in zip(fifos, fifo_depths):
+        getCustomOp(fifo).set_nodeattr("depth", depth)
     model = model.transform(AnnotateResources("estimate"))
     model.save(build_dir + "/end2end_tfc_w1a1_folded.onnx")
 
@@ -195,7 +211,7 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
     res_rtlsim_whole = ret_rtlsim_whole[out_name]
     assert np.isclose(res_npysim, res_rtlsim_nodebynode).all()
-    assert np.isclose(res_npysim, res_rtlsim_whole).all()
+    assert np.isclose(res_rtlsim_nodebynode, res_rtlsim_whole).all()
 
 
 def test_end2end_tfc_w1a1_verify_all():
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
new file mode 100644
index 0000000000000000000000000000000000000000..b561e8cc2f851b7f7a2a61b245d05bb98afc3f2e
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -0,0 +1,102 @@
+import pytest
+import os
+
+from onnx import TensorProto, helper
+
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
+from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch
+
+from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
+from finn.transformation.general import GiveUniqueNodeNames
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+from finn.util.basic import gen_finn_dt_tensor, pynq_part_map
+
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 5
+
+
+def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype):
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape)
+
+    FIFO_node = helper.make_node(
+        "StreamingFIFO",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        depth=Depth,
+        folded_shape=fld_shape,
+        dataType=str(finn_dtype.name),
+    )
+
+    graph = helper.make_graph(
+        nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="fifo-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", finn_dtype)
+    model.set_tensor_datatype("outp", finn_dtype)
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# shape
+@pytest.mark.parametrize("Shape", [[1, 128]])
+# folded shape
+@pytest.mark.parametrize("folded_shape", [[1, 1, 128]])
+# FIFO depth
+@pytest.mark.parametrize("depth", [256])
+# finn_dtype
+@pytest.mark.parametrize("finn_dtype", [DataType.BIPOLAR])  # , DataType.INT2])
+def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):
+
+    # generate input data
+    x = gen_finn_dt_tensor(finn_dtype, Shape)
+    input_dict = prepare_inputs(x)
+
+    model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype)
+
+    model = model.transform(InsertTLastMarker())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(CodeGen_ipgen(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynth_IPGen())
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(CodeGen_ipstitch(test_fpga_part))
+    model = model.transform(MakePYNQProject(test_pynq_board))
+    model = model.transform(SynthPYNQProject())
+    model = model.transform(MakePYNQDriver())
+    ip = os.environ["PYNQ_IP"]
+    username = os.getenv("PYNQ_USERNAME", "xilinx")
+    password = os.getenv("PYNQ_PASSWORD", "xilinx")
+    target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+    model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+
+    y = oxe.execute_onnx(model, input_dict)["outp"]
+
+    assert (
+        y == x
+    ).all(), """The output values are not the same as the
+        input values anymore."""
+    assert y.shape == tuple(Shape), """The output shape is incorrect."""