diff --git a/fetch-repos.sh b/fetch-repos.sh
index 5d03c259d22d052628384ecca5189d71986db03e..16960c71e31671b042dcfb4c31208aaaf8e29906 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,7 +27,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-QONNX_COMMIT="7d50273a4dcccb445fb06f57f6bedc17b3707b35"
+QONNX_COMMIT="f14d7dc92a6baeffa2bef811e902abb121a6f696"
 FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366"
 BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
 PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index e9ad39961410a283865f3e4520a21353fbdf1cae..b0f7b6ec6cada69d402af9089c66636248150b19 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -528,7 +528,9 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
             model = model.transform(DeriveFIFOSizes())
             model = model.transform(
                 InsertFIFO(
-                    vivado_ram_style=cfg.large_fifo_mem_style, max_qsrl_depth=256
+                    vivado_ram_style=cfg.large_fifo_mem_style,
+                    max_qsrl_depth=256,
+                    create_shallow_fifos=True,
                 )
             )
             model = model.transform(GiveUniqueNodeNames())
@@ -550,6 +552,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
                     force_python_sim=force_python_sim,
                 )
             )
+            # InsertAndSetFIFODepths internally removes any shallow FIFOs
+            # so no need to call RemoveShallowFIFOs here
         else:
             assert "Unsupported auto_fifo_strategy: " + cfg.auto_fifo_strategy
     else:
@@ -574,6 +578,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         "resType",
         "mem_mode",
         "runtime_writeable_weights",
+        "inFIFODepths",
+        "outFIFODepths",
     ]
     extract_model_config_to_json(
         model, cfg.output_dir + "/final_hw_config.json", hw_attrs
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index c71e8ffe323b1f2bb459a0f982e63d881a7ae58d..522305327ff7c5f1356aad4fdf6b9e0a942eca72 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -72,6 +72,9 @@ class StreamingFIFO(HLSCustomOp):
                 ),
                 # whether depth monitoring is enabled (impl_style=rtl only)
                 "depth_monitor": ("i", False, 0),
+                # a StreamingFIFO never gets FIFOs of its own, so depths default to 0
+                "inFIFODepths": ("ints", False, [0]),
+                "outFIFODepths": ("ints", False, [0]),
             }
         )
 
diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py
index f783f7ae711739cf4e011315c6714ad95d3c7919..67eb96995ef3312dff72799c905216b82b7ef8ee 100644
--- a/src/finn/transformation/fpgadataflow/derive_characteristic.py
+++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py
@@ -134,8 +134,9 @@ class DeriveFIFOSizes(NodeLocalTransformation):
       NodeLocalTransformation for more details.
     """
 
-    def __init__(self, num_workers=None):
+    def __init__(self, num_workers=None, io_fifo_depth=32):
         super().__init__(num_workers=num_workers)
+        self.io_fifo_depth = io_fifo_depth
 
     def applyNodeLocal(self, node):
         op_type = node.op_type
@@ -161,7 +162,7 @@ class DeriveFIFOSizes(NodeLocalTransformation):
                     if cons_node is None:
                         # could be final node, will be overridden if so
                         # need an entry in the list anyway
-                        out_fifo_depths.append(2)
+                        out_fifo_depths.append(self.io_fifo_depth)
                         continue
                     cons = registry.getCustomOp(cons_node)
                     cons_chrc = cons.get_nodeattr("io_chrc_in")[0]
@@ -182,6 +183,14 @@ class DeriveFIFOSizes(NodeLocalTransformation):
                 # for each tensor
                 prod.set_nodeattr("outFIFODepths", out_fifo_depths)
 
+                # finally, check node inputs to ensure FIFOs are added to
+                # any top-level inputs (at least self.io_fifo_depth deep)
+                in_fifo_depths = prod.get_nodeattr("inFIFODepths")
+                for (i, input_name) in enumerate(node.input):
+                    if input_name in [x.name for x in model.graph.input]:
+                        in_fifo_depths[i] = max(self.io_fifo_depth, in_fifo_depths[i])
+                prod.set_nodeattr("inFIFODepths", in_fifo_depths)
+
             except KeyError:
                 # exception if op_type is not supported
                 raise Exception(
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 0546643d1220603d40651c45a0c4032dcf5cfaaf..50da9cdf1666c21f99a66e1d27e134b914738cb1 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -177,14 +177,9 @@ class InsertFIFO(Transformation):
                             for idx, inp in enumerate(consumer.input):
                                 if inp == output_name:
                                     consumer.input[idx] = fifo_output_tensor.name
-                            # ensure created FIFO depth is reflected on both sides
-                            odepths = n0.get_nodeattr("outFIFODepths")
-                            odepths[idx_out] = fifo_depth
-                            n0.set_nodeattr("outFIFODepths", odepths)
-                            idepths = n1.get_nodeattr("inFIFODepths")
-                            idepths[idx_inp] = fifo_depth
-                            n1.set_nodeattr("inFIFODepths", idepths)
-
+                            # removed setting of node attributes based on created
+                            # FIFO sizes here, better to preserve original attrs
+                            # as they are.
                             graph_modified = True
 
         if graph_modified is False:
@@ -204,41 +199,48 @@ class InsertFIFO(Transformation):
                     dtype = n0.get_input_datatype(inp_ind)
                     fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind]
 
-                    if fifo_depth <= 2:
-                        warnings.warn("Overriding input FIFO depth to 32")
-                        fifo_depth = 32
-
-                    # create fifo node
-                    fifo_output_tensor = oh.make_tensor_value_info(
-                        model.make_new_valueinfo_name(),
-                        TensorProto.FLOAT,
-                        n0.get_normal_input_shape(),
-                    )
-                    graph.value_info.append(fifo_output_tensor)
-                    model.set_tensor_datatype(fifo_output_tensor.name, dtype)
-
-                    if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth:
-                        impl_style = "rtl"
+                    if fifo_depth > 2 or self.create_shallow_fifos:
+                        # create fifo node
+                        fifo_output_tensor = oh.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            n0.get_normal_input_shape(),
+                        )
+                        graph.value_info.append(fifo_output_tensor)
+                        model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+
+                        if (
+                            self.max_qsrl_depth is None
+                            or fifo_depth <= self.max_qsrl_depth
+                        ):
+                            impl_style = "rtl"
+                        else:
+                            impl_style = "vivado"
+
+                        fifo_node = oh.make_node(
+                            "StreamingFIFO",
+                            [n_input],
+                            [fifo_output_tensor.name],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            depth=fifo_depth,
+                            folded_shape=fld_shape,
+                            dataType=str(dtype.name),
+                            impl_style=impl_style,
+                            ram_style=self.vivado_ram_style,
+                        )
+                        # insert fifo
+                        graph.node.insert(0, fifo_node)
+
+                        # set fifo output tensor as new input tensor of second node
+                        first_node.input[inp_ind] = fifo_output_tensor.name
                     else:
-                        impl_style = "vivado"
-
-                    fifo_node = oh.make_node(
-                        "StreamingFIFO",
-                        [n_input],
-                        [fifo_output_tensor.name],
-                        domain="finn.custom_op.fpgadataflow",
-                        backend="fpgadataflow",
-                        depth=fifo_depth,
-                        folded_shape=fld_shape,
-                        dataType=str(dtype.name),
-                        impl_style=impl_style,
-                        ram_style=self.vivado_ram_style,
-                    )
-                    # insert fifo
-                    graph.node.insert(0, fifo_node)
-
-                    # set fifo output tensor as new input tensor of second node
-                    first_node.input[inp_ind] = fifo_output_tensor.name
+                        warnings.warn(
+                            """Input FIFO for %s has depth %d and won't
+                        be created. This may cause RTL simulation issues.
+                        """
+                            % (graph_in_name, fifo_depth)
+                        )
 
             # insert FIFO as last node, except when last node is DMA
             graph_out_names = [x.name for x in model.graph.output]
@@ -259,40 +261,47 @@ class InsertFIFO(Transformation):
                     dtype = n0.get_output_datatype(out_ind)
                     fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind]
 
-                    if fifo_depth <= 2:
-                        warnings.warn("Overriding output FIFO depth to 32")
-                        fifo_depth = 32
-
-                    # create fifo node
-                    fifo_input_tensor = oh.make_tensor_value_info(
-                        model.make_new_valueinfo_name(),
-                        TensorProto.FLOAT,
-                        n0.get_normal_output_shape(),
-                    )
-                    graph.value_info.append(fifo_input_tensor)
-                    model.set_tensor_datatype(fifo_input_tensor.name, dtype)
-
-                    if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth:
-                        impl_style = "rtl"
+                    if fifo_depth > 2 or self.create_shallow_fifos:
+                        # create fifo node
+                        fifo_input_tensor = oh.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            n0.get_normal_output_shape(),
+                        )
+                        graph.value_info.append(fifo_input_tensor)
+                        model.set_tensor_datatype(fifo_input_tensor.name, dtype)
+
+                        if (
+                            self.max_qsrl_depth is None
+                            or fifo_depth <= self.max_qsrl_depth
+                        ):
+                            impl_style = "rtl"
+                        else:
+                            impl_style = "vivado"
+
+                        fifo_node = oh.make_node(
+                            "StreamingFIFO",
+                            [fifo_input_tensor.name],
+                            [graph_out_name],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            depth=fifo_depth,
+                            folded_shape=fld_shape,
+                            dataType=str(dtype.name),
+                            impl_style=impl_style,
+                            ram_style=self.vivado_ram_style,
+                        )
+                        # insert fifo
+                        graph.node.append(fifo_node)
+
+                        # set fifo input tensor as new output tensor of final node
+                        final_node.output[0] = fifo_input_tensor.name
                     else:
-                        impl_style = "vivado"
-
-                    fifo_node = oh.make_node(
-                        "StreamingFIFO",
-                        [fifo_input_tensor.name],
-                        [graph_out_name],
-                        domain="finn.custom_op.fpgadataflow",
-                        backend="fpgadataflow",
-                        depth=fifo_depth,
-                        folded_shape=fld_shape,
-                        dataType=str(dtype.name),
-                        impl_style=impl_style,
-                        ram_style=self.vivado_ram_style,
-                    )
-                    # insert fifo
-                    graph.node.append(fifo_node)
-
-                    # set fifo output tensor as new input tensor of second node
-                    final_node.output[0] = fifo_input_tensor.name
+                        warnings.warn(
+                            """Output FIFO for %s has depth %d and won't
+                        be created. This may cause RTL simulation issues.
+                        """
+                            % (graph_out_name, fifo_depth)
+                        )
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 9ac1000468d72c49a3d6d19556dd8b96fb5fe7a4..2619557edfb92059f0ac0d824f7e9c289b282612 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -230,7 +230,7 @@ class InsertAndSetFIFODepths(Transformation):
     - run through rtlsim with stream of multiple random input images (to fill pipeline)
     - keep track of observed maximum occupancy for each FIFO during rtlsim
     - when sim finished, update each FIFO depth to maximum observed occupancy
-      and set inFIFODepths/outFIFODepths attrs to 0 on relevant nodes
+      and set inFIFODepths/outFIFODepths attrs to that depth as well
 
     """
 
@@ -295,7 +295,7 @@ class InsertAndSetFIFODepths(Transformation):
 
         # insert stream infrastructure (DWC/FIFO)
         model = model.transform(InsertDWC())
-        model = model.transform(InsertFIFO())
+        model = model.transform(InsertFIFO(create_shallow_fifos=True))
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(GiveReadableTensorNames())
 
@@ -388,9 +388,9 @@ class InsertAndSetFIFODepths(Transformation):
                 # nodes as # inputs to drive the imulation
                 n_inputs = int(len(model.graph.node) / 2)
             else:
-                # convnet, single input is typically enough to fill entire
+                # convnet, two inputs are typically enough to fill entire
                 # layer pipeline due to overlaps
-                n_inputs = 1
+                n_inputs = 2
             sim = verilator_fifosim(model, n_inputs)
 
         for ind, node in enumerate(fifo_nodes):
@@ -422,11 +422,7 @@ class InsertAndSetFIFODepths(Transformation):
                 reset_implementation(node_inst)
                 del fifos[node.name]
             else:
-                inst = getCustomOp(node)
-                ifd = inst.get_nodeattr("inFIFODepths")
-                ofd = inst.get_nodeattr("outFIFODepths")
-                inst.set_nodeattr("inFIFODepths", [0] * len(ifd))
-                inst.set_nodeattr("outFIFODepths", [0] * len(ofd))
+                # (removed setting of node FIFO size attributes to 0 here)
                 # for every extw node we changed from external to decoupled,
                 # change back and reset implementation
                 if node.op_type in extw_optypes:
@@ -448,6 +444,51 @@ class InsertAndSetFIFODepths(Transformation):
         # remove shallow FIFOs
         model = model.transform(RemoveShallowFIFOs())
 
+        # reflect final values in attributes
+        for node in model.graph.node:
+            if node.op_type != "StreamingFIFO":
+                node_inst = getCustomOp(node)
+                fifodepth_in = []
+                for node_inp in node.input:
+                    prod = model.find_producer(node_inp)
+                    if prod is None:
+                        # no producer for this input
+                        if node_inp in [x.name for x in model.graph.input]:
+                            # top-level input with no FIFO
+                            fifodepth_in.append(0)
+                        else:
+                            # FIFO depth attr applies only to dynamic inputs
+                            pass
+                    else:
+                        # there is a producer for this input
+                        if prod.op_type == "StreamingFIFO":
+                            prod_inst = getCustomOp(prod)
+                            fifodepth_in.append(prod_inst.get_nodeattr("depth"))
+                        else:
+                            # explicitly no FIFO on this dynamic input
+                            fifodepth_in.append(0)
+                fifodepth_out = []
+                for node_out in node.output:
+                    cons = model.find_consumer(node_out)
+                    if cons is None:
+                        # no consumer for this output
+                        if node_out in [x.name for x in model.graph.output]:
+                            # top-level output with no FIFO
+                            fifodepth_out.append(0)
+                        else:
+                            # FIFO depth attr applies only to dynamic outputs
+                            pass
+                    else:
+                        # there is a consumer for this output
+                        if cons.op_type == "StreamingFIFO":
+                            cons_inst = getCustomOp(cons)
+                            fifodepth_out.append(cons_inst.get_nodeattr("depth"))
+                        else:
+                            # explicitly no FIFO on this dynamic output
+                            fifodepth_out.append(0)
+                node_inst.set_nodeattr("inFIFODepths", fifodepth_in)
+                node_inst.set_nodeattr("outFIFODepths", fifodepth_out)
+
         return (model, False)
 
 
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 79cfafa22d670f168c3c03a5ef01a51256912a8c..858363d6d31c7c17803bffdb87e7b168dec4b76d 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -564,12 +564,6 @@ class TestEnd2End:
         model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
         fifo_layers = model.get_nodes_by_op_type("StreamingFIFO")
         assert len(fifo_layers) > 0
-        hls_layers = model.get_finn_nodes()
-        for node in hls_layers:
-            if node.op_type != "StreamingFIFO":
-                op_inst = getCustomOp(node)
-                assert op_inst.get_nodeattr("inFIFODepths") == [0]
-                assert op_inst.get_nodeattr("outFIFODepths") == [0]
         model.save(
             get_checkpoint_name(
                 topology, wbits, abits, QONNX_export, "fifodepth_" + kind
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
index 6b78d399eb100686277a92f1e35b9a98b433444b..f4f2b8dbfff0d720ec4eb901704581b096c0ea40 100644
--- a/tests/fpgadataflow/test_fifosizing.py
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -32,6 +32,8 @@ import pytest
 import json
 import shutil
 from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
 
 import finn.builder.build_dataflow as build
 import finn.builder.build_dataflow_config as build_cfg
@@ -53,15 +55,16 @@ def fetch_test_model(topology, wbits=2, abits=2):
 @pytest.mark.parametrize(
     "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"]
 )
-def test_fifosizing_linear(method):
+@pytest.mark.parametrize("topology", ["tfc"])
+def test_fifosizing_linear(method, topology):
     force_python_rtlsim = "python" in method
     method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize"
-    tmp_output_dir = fetch_test_model("tfc")
+    tmp_output_dir = fetch_test_model(topology)
     cfg = build_cfg.DataflowBuildConfig(
         output_dir=tmp_output_dir,
         auto_fifo_depths=True,
         auto_fifo_strategy=method_key,
-        target_fps=10000,
+        target_fps=10000 if topology == "tfc" else 1000,
         force_python_rtlsim=force_python_rtlsim,
         synth_clk_period_ns=10.0,
         board="Pynq-Z1",
@@ -84,4 +87,32 @@ def test_fifosizing_linear(method):
         / float(est_data["estimated_throughput_fps"])
         > 0.9
     )
+    # now run the same build using the generated folding and FIFO config
+    tmp_output_dir_cmp = fetch_test_model(topology)
+    cfg_cmp = cfg
+    cfg_cmp.output_dir = tmp_output_dir_cmp
+    cfg_cmp.auto_fifo_depths = False
+    cfg_cmp.target_fps = None
+    cfg_cmp.generate_outputs = [build_cfg.DataflowOutputType.STITCHED_IP]
+    cfg_cmp.folding_config_file = tmp_output_dir + "/final_hw_config.json"
+    build.build_dataflow_cfg(tmp_output_dir_cmp + "/model.onnx", cfg_cmp)
+
+    model0 = ModelWrapper(
+        tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx"
+    )
+    model1 = ModelWrapper(
+        tmp_output_dir_cmp + "/intermediate_models/step_create_stitched_ip.onnx"
+    )
+
+    assert len(model0.graph.node) == len(model1.graph.node)
+    for i in range(len(model0.graph.node)):
+        node0 = model0.graph.node[i]
+        node1 = model1.graph.node[i]
+        assert node0.op_type == node1.op_type
+        if node0.op_type == "StreamingFIFO":
+            node0_inst = getCustomOp(node0)
+            node1_inst = getCustomOp(node1)
+            assert node0_inst.get_nodeattr("depth") == node1_inst.get_nodeattr("depth")
+
     shutil.rmtree(tmp_output_dir)
+    shutil.rmtree(tmp_output_dir_cmp)