diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index 71e80d7da18c82d8f69a6c37829c4c6fc58b398d..78fc2ccfc92f9b7ca3ae6beafe7d24bdbfada2bc 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -74,5 +74,4 @@ def hls_synth_res_estimation(model):
                         for this node. Please run "CodeGen_ipgen" transformation and
                         "HLSSynth_IPGen" first to generate the report files"""
                     )
-
     return res_dict
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 38c2ed6638bef5cd709a7da83218145de91dce0a..9a6f66087fafff3745e239da4cb9f05c4ec73451 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -31,7 +31,7 @@ import numpy as np
 import os
 import subprocess
 from finn.custom_op import CustomOp
-from finn.util.basic import CppBuilder, make_build_dir
+from finn.util.basic import CppBuilder, make_build_dir, roundup_to_integer_multiple
 from finn.util.fpgadataflow import (
     IPGenBuilder,
     pyverilate_get_liveness_threshold_cycles,
@@ -493,15 +493,28 @@ compilation transformations?
         """Returns folded output shape (according to neuron folding), if implemented."""
         raise Exception("get_folded_output_shape not implemented for this op")
 
-    def get_instream_width(self, axi_strm_padding=False):
+    def get_instream_width(self):
         """Returns input stream width, if implemented."""
         raise Exception("get_instream_width not implemented for this op")
 
-    def get_outstream_width(self, axi_strm_padding=False):
+    def get_outstream_width(self):
         """Returns output stream width, if implemented."""
         raise Exception("get_outstream_width not implemented for this op")
 
+    def get_instream_width_padded(self):
+        """Returns input stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec."""
+        in_width = self.get_instream_width()
+        return roundup_to_integer_multiple(in_width, 8)
+
+    def get_outstream_width_padded(self):
+        """Returns output stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec."""
+        out_width = self.get_outstream_width()
+        return roundup_to_integer_multiple(out_width, 8)
+
     def get_ap_int_max_w(self):
+        """Return the maximum width of any ap_int used in this module."""
         instream = self.get_instream_width()
         outstream = self.get_outstream_width()
         return max([instream, outstream])
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 0c8fe863fef9f2be97ab3d29e00d72b877d8108a..66daa9f7b408a1d17ee3cca6aea5ab4a843f7e4f 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -34,7 +34,6 @@ from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.custom_op.im2col import compute_conv_output_dim
 from onnx import TensorProto, helper
-from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator:
@@ -142,7 +141,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self, axi_strm_padding=False):
+    def get_instream_width(self):
         """Returns stream width, input and output stream width are equal for
         the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
@@ -150,15 +149,13 @@ class ConvolutionInputGenerator(HLSCustomOp):
         ifm_ch = self.get_nodeattr("IFMChannels")
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
         in_width = simd * ibits
-        if axi_strm_padding is True:
-            in_width = roundup_to_integer_multiple(in_width, 8)
         return in_width
 
-    def get_outstream_width(self, axi_strm_padding=False):
+    def get_outstream_width(self):
         """Returns stream width, input and output stream width are equal for
         the sliding window function, so the function to determine the input
         stream width can be reused."""
-        return self.get_instream_width(axi_strm_padding)
+        return self.get_instream_width()
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index ce4f883fa029225a5748c08463858e3bf1bfd35c..f30871909b1c70f3b5df148f1b6eae22fdbadc25 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -32,7 +32,6 @@ import numpy as np
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
-from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 # does not do anything at the ONNX node-by-node level, and input-output
@@ -151,16 +150,12 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         folded_ishape = self.get_folded_input_shape()
         return np.prod(folded_ishape[:-1])
 
-    def get_instream_width(self, axi_strm_padding=False):
+    def get_instream_width(self):
         in_width = self.get_nodeattr("inWidth")
-        if axi_strm_padding is True:
-            in_width = roundup_to_integer_multiple(in_width, 8)
         return in_width
 
-    def get_outstream_width(self, axi_strm_padding=False):
+    def get_outstream_width(self):
         out_width = self.get_nodeattr("outWidth")
-        if axi_strm_padding is True:
-            out_width = roundup_to_integer_multiple(out_width, 8)
         return out_width
 
     def make_shape_compatible_op(self, model):
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index f04ee7ca7830760f4ed2804b8b71f8fe5d29325f..46920711e13057178be9fca5fe3a18ce3e14feda 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -279,29 +279,27 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self, axi_strm_padding=False):
+    def get_instream_width(self):
         i_bits = self.get_input_datatype().bitwidth()
         in_width = i_bits * self.get_nodeattr("SIMD")
-        if axi_strm_padding is True:
-            in_width = roundup_to_integer_multiple(in_width, 8)
         return in_width
 
-    def get_outstream_width(self, axi_strm_padding=False):
+    def get_outstream_width(self):
         o_bits = self.get_output_datatype().bitwidth()
         out_width = o_bits * self.get_nodeattr("PE")
-        if axi_strm_padding is True:
-            out_width = roundup_to_integer_multiple(out_width, 8)
         return out_width
 
-    def get_weightstream_width(self, axi_strm_padding=False):
+    def get_weightstream_width(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
         wp = self.get_weight_datatype().bitwidth()
         w_width = pe * simd * wp
-        if axi_strm_padding is True:
-            w_width = roundup_to_integer_multiple(w_width, 8)
         return w_width
 
+    def get_weightstream_width_padded(self):
+        weight_width = self.get_weightstream_width()
+        return roundup_to_integer_multiple(weight_width, 8)
+
     def get_ap_int_max_w(self):
         temp_value = super().get_ap_int_max_w()
         weightstream = self.get_weightstream_width()
@@ -982,13 +980,13 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
             ]
             # make instream width a multiple of 8 for AXI stream interface
-            in_width = roundup_to_integer_multiple(self.get_instream_width(), 8)
+            in_width = self.get_instream_width_padded()
             self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
             self.code_gen_dict["$OUT_RANGE$"] = [
-                "[{}:0]".format(self.get_outstream_width(axi_strm_padding=True) - 1)
+                "[{}:0]".format(self.get_outstream_width_padded() - 1)
             ]
             # make weight stream width a multiple of 8 for AXI stream interface
-            weight_width = roundup_to_integer_multiple(self.get_weightstream_width(), 8)
+            weight_width = self.get_weightstream_width_padded()
             self.code_gen_dict["$WEIGHT_RANGE$"] = ["[{}:0]".format(weight_width - 1)]
             self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
             self.code_gen_dict["$WSTREAM_DEPTH$"] = [str(self.calc_wmem())]
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 6e004c47b1e13d95efa356b6b8984688f54027cc..eb96c6c04eb0b7b83c3f925e10f86b17ec399e42 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -33,7 +33,6 @@ import subprocess
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
-from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 from . import templates
@@ -110,7 +109,7 @@ class StreamingFIFO(HLSCustomOp):
             "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
         ]
         # make instream width a multiple of 8 for axi interface
-        in_width = self.get_instream_width(axi_strm_padding=True)
+        in_width = self.get_instream_width_padded()
         self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
         self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)]
         self.code_gen_dict["$WIDTH$"] = [str(in_width)]
@@ -164,6 +163,8 @@ class StreamingFIFO(HLSCustomOp):
 
     def get_normal_input_shape(self):
         depth = self.get_nodeattr("depth")
+        # depth has to be between 2 and 256 with the current
+        # StreamingFIFO implementation
         assert (
             depth >= 2
         ), """Depth is too low. Please set node attribute "depth" to a value
@@ -172,10 +173,22 @@ class StreamingFIFO(HLSCustomOp):
             depth <= 256
         ), """Depth is too high. Please set node attribute "depth" to a value
         between 2 and 256"""
+        # derive normal shape from folded shape
+        # StreamingFIFOs are inserted in between fpgadataflow nodes
+        # the folded shape could be for example (1, nf, pe)
+        # with nf (neuron folding): mh // pe
+        # the normal input shape is in this case (1, mh)
+        # so to achieve this the two inner dimensions are multiplied
+        # and together with all previous dimensions
+        # this gives the normal input shape
+
         folded_shape = self.get_nodeattr("folded_shape")
+        # extract inner dimension
         inner_dim = folded_shape[-1]
+        # multiply with the next inner dimension
         folding_factor = folded_shape[-2] * inner_dim
         normal_ishape = []
+        # create the normal_ishape
         for i in range(len(folded_shape) - 2):
             normal_ishape.append(folded_shape[i])
         normal_ishape.append(folding_factor)
@@ -191,20 +204,16 @@ class StreamingFIFO(HLSCustomOp):
     def get_folded_output_shape(self):
         return self.get_nodeattr("folded_shape")
 
-    def get_instream_width(self, axi_strm_padding=False):
+    def get_instream_width(self):
         dtype = DataType[self.get_nodeattr("dataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
-        if axi_strm_padding is True:
-            in_width = roundup_to_integer_multiple(in_width, 8)
         return in_width
 
-    def get_outstream_width(self, axi_strm_padding=False):
+    def get_outstream_width(self):
         dtype = DataType[self.get_nodeattr("dataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
-        if axi_strm_padding is True:
-            in_width = roundup_to_integer_multiple(in_width, 8)
         return in_width
 
     def execute_node(self, context, graph):
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index ef1a5ee1bdc0bbe5c773aa375bf4402a8cb16ddb..83bc19030ebba66907e08c5b1e52d7c0ff9207a6 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -33,7 +33,6 @@ from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.custom_op.im2col import compute_conv_output_dim
 from finn.core.datatype import DataType
 from onnx import TensorProto, helper
-from finn.util.basic import roundup_to_integer_multiple
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -88,17 +87,15 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
-    def get_instream_width(self, axi_strm_padding=False):
+    def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
         in_width = int(dt_bits * ifm_ch)
-        if axi_strm_padding is True:
-            in_width = roundup_to_integer_multiple(in_width, 8)
         return in_width
 
-    def get_outstream_width(self, axi_strm_padding=False):
+    def get_outstream_width(self):
         """For streaming maxpool out stream with is the same as in stream width"""
-        return self.get_instream_width(axi_strm_padding)
+        return self.get_instream_width()
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index e5a5fed6c9d5d31fbf0082707879480e0c0a2dc7..25ea05e3607a52731ae1b64de421837bf137ee2b 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -27,7 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.util.basic import roundup_to_integer_multiple
 
 
 class TLastMarker(HLSCustomOp):
@@ -148,16 +147,12 @@ class TLastMarker(HLSCustomOp):
     def get_folded_output_shape(self):
         return self.get_folded_input_shape()
 
-    def get_instream_width(self, axi_strm_padding=False):
+    def get_instream_width(self):
         stream_width = self.get_nodeattr("StreamWidth")
-        if axi_strm_padding is True:
-            stream_width = roundup_to_integer_multiple(stream_width, 8)
         return stream_width
 
-    def get_outstream_width(self, axi_strm_padding=False):
+    def get_outstream_width(self):
         stream_width = self.get_nodeattr("StreamWidth")
-        if axi_strm_padding is True:
-            stream_width = roundup_to_integer_multiple(stream_width, 8)
         return stream_width
 
     def strm_decl(self):
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index b80f2fbc1d9893d304f28c3494c44a69a1db052e..f66d0dc087ecbdd112422484ee1e01cb5ceef1c0 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -27,14 +27,20 @@ def _suitable_node(node):
 
 
 class InsertFIFO(Transformation):
-    """Ensure that the graph is terminated with a TLastMarker node, inserting
-    one if necessary."""
+    """Insert FIFOs at the beginning and end of the graph, as well as
+    between fpgadataflow nodes.
+
+    The FIFO depth is determined from the surrounding nodes by extracting
+    the 'outFIFODepth' attribute of the preceding node and the 'inFIFODepth'
+    attribute of the subsequent node; max() of the two values sets the depth.
+
+    The other node attributes necessary to create a FIFO node are taken from the
+    node the FIFO node is inserted after: 'folded_shape' and 'dtype'"""
 
     def __init__(self):
         super().__init__()
 
     def apply(self, model):
-        # default depth for FIFOs
         graph = model.graph
         node_ind = -1
         graph_modified = False
@@ -50,10 +56,19 @@ class InsertFIFO(Transformation):
                     fld_shape = n0.get_folded_output_shape()
                     dtype = n0.get_output_datatype()
 
+                    # check if folded_shape of output of first node and
+                    # input of the second node is equal
+                    n1 = getCustomOp(consumer)
+                    assert (
+                        fld_shape == n1.get_folded_input_shape()
+                    ), """The
+                    folded output shape of the first node is not the same as the
+                    folded input shape of the second node. A streaming fifo can't
+                    be implemented in between these nodes."""
+
                     # check if outFIFOdepth attribute of first node
                     # and inFIFOdepth attribute of consumer node is equal
                     n0_depth = n0.get_nodeattr("outFIFODepth")
-                    n1 = getCustomOp(consumer)
                     n1_depth = n1.get_nodeattr("inFIFODepth")
                     if n0_depth == n1_depth:
                         fifo_depth = n0_depth
@@ -69,6 +84,7 @@ class InsertFIFO(Transformation):
                         n0.get_normal_output_shape(),
                     )
                     graph.value_info.append(fifo_output_tensor)
+                    model.set_tensor_datatype(fifo_output_tensor.name, dtype)
 
                     fifo_node = oh.make_node(
                         "StreamingFIFO",
@@ -104,6 +120,7 @@ class InsertFIFO(Transformation):
                     n0.get_normal_input_shape(),
                 )
                 graph.value_info.append(fifo_output_tensor)
+                model.set_tensor_datatype(fifo_output_tensor.name, dtype)
 
                 fifo_node = oh.make_node(
                     "StreamingFIFO",
@@ -142,6 +159,7 @@ class InsertFIFO(Transformation):
                     n0.get_normal_output_shape(),
                 )
                 graph.value_info.append(fifo_input_tensor)
+                model.set_tensor_datatype(fifo_input_tensor.name, dtype)
 
                 fifo_node = oh.make_node(
                     "StreamingFIFO",
diff --git a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
index 26fd247aaf517158078a8cdbb577a55f1fdae6fa..6d72d9983d7a99d495b4e03e5ff0b5b633ee16ae 100644
--- a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
+++ b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
@@ -219,7 +219,7 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
     res_rtlsim_whole = ret_rtlsim_whole[out_name]
     assert np.isclose(res_npysim, res_rtlsim_nodebynode).all()
-    assert np.isclose(res_rtlsim_nodebynode, res_rtlsim_whole).all()
+    assert np.isclose(res_npysim, res_rtlsim_whole).all()
 
 
 def test_end2end_tfc_w1a1_verify_all():