diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..201333aebdb3fc1d15464389e37326dcaf6848e0
--- /dev/null
+++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import finn.custom_op.registry as registry
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+def exp_cycles_per_layer(model):
+    """Estimates the number of cycles per sample for dataflow layers in the given model.
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass to ensure all nodes are
+    visible in the results.
+
+    Returns {node name : cycle estimation}."""
+
+    cycle_dict = {}
+    for node in model.graph.node:
+        if is_fpgadataflow_node(node) is True:
+            op_type = node.op_type
+            inst = registry.custom_op[op_type](node)
+            cycle_dict[node.name] = inst.get_exp_cycles()
+
+    return cycle_dict
diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index ad30282d93034f8d043a05a2172790349c31ec83..03b31b9c1ec51b45e17152d35d5824b6137ab4a2 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -35,6 +35,9 @@ from finn.util.fpgadataflow import is_fpgadataflow_node
 
 def hls_synth_res_estimation(model):
     """Extracts the FPGA resource results from the Vivado HLS synthesis estimates.
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass to ensure all nodes are
+    visible in the results.
 
     Returns {node name : resources_dict}."""
 
diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py
index 81accba23220d3f25e8560443ff22cf59d3733e9..9206f3f6fcd81de175babef54de990fe01c861e1 100644
--- a/src/finn/analysis/fpgadataflow/post_synth_res.py
+++ b/src/finn/analysis/fpgadataflow/post_synth_res.py
@@ -36,6 +36,9 @@ from finn.custom_op.registry import getCustomOp
 
 def post_synth_res(model, override_synth_report_filename=None):
     """Extracts the FPGA resource results from the Vivado synthesis.
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass to ensure all nodes are
+    visible in the results.
 
     Returns {node name : resources_dict}."""
 
diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py
index c190059eceb0cc111477c84f843f4a9f9bf2f393..e52557573dab072709da4452f4e2d477e99b98c9 100644
--- a/src/finn/analysis/fpgadataflow/res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/res_estimation.py
@@ -32,6 +32,9 @@ from finn.util.fpgadataflow import is_fpgadataflow_node
 
 def res_estimation(model):
     """Estimates the resources needed for the given model.
+    Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
+    transformation) prior to calling this analysis pass to ensure all nodes are
+    visible in the results.
 
     Returns {node name : resource estimation}."""
 
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index bc816f18c5f72338dc726e504182998f3f4430b7..97056ac77c5bff8cc287041c9b9bef01db6a66cb 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -209,6 +209,12 @@ class HLSCustomOp(CustomOp):
         HLSCustomOp class but has to be filled by every node"""
         return 0
 
+    def get_exp_cycles(self):
+        """Function for estimation of expected cycles for set folding,
+        is member function of HLSCustomOp class but has to be filled
+        by every node"""
+        return 0
+
     def code_generation_ipgen(self, model, fpgapart, clk):
         """Generates c++ code and tcl script for ip generation."""
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index d73f22672e7163eef0738d067f951e90fe80a89f..14fb65739dab4208edd0c61bb7ca8ae2d114baab 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -170,6 +170,10 @@ class AddStreams_Batch(HLSCustomOp):
     def get_number_output_values(self):
         return np.prod(self.get_folded_output_shape()[:-1])
 
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index ad68a4bde29123b2498ac7789048bcd2e13bf3bc..d8e74a4d13043a741cf787477c51b63925b7aad8 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -224,6 +224,10 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         nf = np.prod(self.get_folded_output_shape()[:-1])
         return nf
 
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
     def get_template_param_values(self):
         """Returns the template parameter values according to input, output and weight
         data types."""
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 3e40ad70208909551365c51324153859ccc79ceb..d33d6c963c0c55309f7f258c9ec1d7723e112282 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -177,6 +177,23 @@ class ConvolutionInputGenerator(HLSCustomOp):
         num_output_elems = np.prod(folded_oshape[:-1])
         return num_output_elems
 
+    def get_exp_cycles(self):
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ofm_dim = self.get_nodeattr("OFMDim")
+        stride = self.get_nodeattr("Stride")
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
+        cycles_write_block = (ofm_dim * k * k * (ifm_ch / simd)) / mmv
+        cycles_read_block = stride * ifm_dim * (ifm_ch / simd)
+        max_cycles = max(cycles_write_block, cycles_read_block)
+        exp_cycles = ifm_dim * k * (ifm_ch / simd) + ofm_dim * max_cycles
+
+        return int(exp_cycles)
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py
index 0ce4379a2c41baa5bc009e9df7623d133ee89a09..15d55653b4e431dead885d75650b1500150d8775 100644
--- a/src/finn/custom_op/fpgadataflow/downsampler.py
+++ b/src/finn/custom_op/fpgadataflow/downsampler.py
@@ -36,6 +36,14 @@ class DownSampler(HLSCustomOp):
         stride = self.get_nodeattr("Stride")
         return int(np.floor((idim - 1) / stride) + 1)
 
+    def get_exp_cycles(self):
+        idim = self.get_nodeattr("ImgDim")
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = channels / simd * batch_size * idim * idim
+        return int(exp_cycles)
+
     def get_normal_input_shape(self):
         idim = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index e4762509fb6246bafa7441e194312d69ad585d1b..044cfddaab51a5f9bf7aa25e9123247b10de8529 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -164,6 +164,10 @@ class DuplicateStreams_Batch(HLSCustomOp):
     def get_number_output_values(self):
         return 2 * np.prod(self.get_folded_output_shape()[1:-1])
 
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index d326ae7dfc7830a0081c3b13233d67ef08b12eff..f9a9dc4340b18578550a9c453d90de86234d1cad 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -42,6 +42,14 @@ class FMPadding_Batch(HLSCustomOp):
         pad = self.get_nodeattr("Padding")
         return idim + pad
 
+    def get_exp_cycles(self):
+        odim = self.get_padded_odim()
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = (channels / simd) * batch_size * odim * odim
+        return exp_cycles
+
     def get_normal_input_shape(self):
         idim = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index 83152dea6cc494b8464c78605399b21b38d48b80..1a75858880a072345ef942ca91feabf0bec9ab36 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -182,6 +182,13 @@ class GlobalAccPool_Batch(HLSCustomOp):
     def get_number_output_values(self):
         return np.prod(self.get_folded_output_shape()[1:-1])
 
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * idim * idim + Channels/PE
+        ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        folds = int(ch / pe)
+        return np.prod(self.get_folded_input_shape()[:-1]) + folds
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
index 801a634fdba1cd5e16c7c211175c1e7380bf0070..4a2fa6889ae0ebb94976d50b0fc8362d01a63bea 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -136,6 +136,16 @@ class Pool_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[1:-1])
 
+    def get_exp_cycles(self):
+        # (Channels * kernel * kernel) / PE * odim * odim * batch_size
+        ifm_ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        k = self.get_nodeattr("KernelSize")
+        odim = self.get_nodeattr("OutImgDim")
+        batch_size = self.get_nodeattr("BatchSize")
+        exp_cycles = ((ifm_ch * k * k) / pe) * odim * odim * batch_size
+        return int(exp_cycles)
+
     def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 9c3bd3ac87b94f3e0ff11a2937bf5083aae614f6..738bfa25403ded4bf22945e1dcd353ae9d5634fc 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -278,6 +278,17 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         return c0 + c1 * (P * Q) * (W * A)
 
+    def get_exp_cycles(self):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        num_inp_vec = self.get_nodeattr("numInputVectors")
+        mh = self.get_nodeattr("MH")
+        mw = self.get_nodeattr("MW")
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
+        return int(exp_cycles)
+
     def get_input_datatype(self):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 2344e12f7e87634c189563f9cde7b1c861a3606e..4c772358648f402467cee628afe410d7bce83ede 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -95,6 +95,12 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
+    def get_exp_cycles(self):
+        # derived from StreamingMaxPool_Batch loop nest
+        k = self.get_nodeattr("PoolDim")
+        ifm_dim = self.get_nodeattr("ImgDim")
+        return ifm_dim * (ifm_dim + (ifm_dim / k))
+
     def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index fa33c70218fab16f106da45e296f0d59ae4ea606..c2e3739e8f62b5ce0459ee8fbb1f3dcda7b50c1e 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -215,6 +215,10 @@ class Thresholding_Batch(HLSCustomOp):
         nf = np.prod(self.get_folded_output_shape()[:-1])
         return nf
 
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
     def get_template_param_values(self):
         """Returns the template parameter values according to input, output and weight
         data types."""
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index 188f20e22fc52e435f8ba0e7d76dff223e084d69..6d1ff31ab554ef1d3fe8ef1fac66e6bc3406efbb 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -24,6 +24,7 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.custom_op.im2col import compute_conv_output_dim
 from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 # conv_config  kernel_size,stride, pad
 
@@ -111,6 +112,14 @@ def test_convert_to_hls_conv_layer(conv_config, exec_mode):
     assert oxe.compare_execution(model, new_model, inp_dict)
     if kernel_size == 1 and stride > 1 and pad == 0:
         assert new_model.graph.node[1].op_type == "DownSampler"
+        if exec_mode == "rtlsim":
+            node = new_model.get_nodes_by_op_type("DownSampler")[0]
+            inst = getCustomOp(node)
+            sim_cycles = inst.get_nodeattr("sim_cycles")
+            exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
+            exp_cycles = exp_cycles_dict[node.name]
+            assert np.isclose(exp_cycles, sim_cycles, atol=11)
+            assert exp_cycles != 0
 
     if pad == 1:
         padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0]
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
index aba973051cb14e3e428e4de72a57924884c831de..fbf1e72da266141bd8328cc88c2e8bebff8301fb 100644
--- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -44,6 +44,7 @@ from finn.transformation.general import GiveUniqueNodeNames
 from finn.custom_op.registry import getCustomOp
 from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.infer_shapes import InferShapes
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt):
@@ -210,3 +211,11 @@ def test_convert_to_hls_pool_batch(
             assert len(new_model.graph.node) == 5
     else:
         assert len(new_model.graph.node) == 1
+
+    if exec_mode == "rtlsim":
+        node = new_model.get_nodes_by_op_type("Pool_Batch")[0]
+        inst = getCustomOp(node)
+        sim_cycles = inst.get_nodeattr("sim_cycles")
+        exp_cycles_dict = new_model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, sim_cycles, atol=10)
diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
index f94784457a43718516e76946269fc47119423b24..7a3df667b7feeafa017e3b03c11d4e55be07b195 100644
--- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+import numpy as np
 
 from onnx import TensorProto, helper
 
@@ -44,6 +45,8 @@ from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_addstreams_modelwrapper(ch, pe, idt):
@@ -125,3 +128,12 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode):
     y_produced = y_produced.reshape(y_expected.shape)
 
     assert (y_produced == y_expected).all(), exec_mode + " failed"
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("AddStreams_Batch")[0]
+        inst = getCustomOp(node)
+        sim_cycles = inst.get_nodeattr("sim_cycles")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, sim_cycles, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
index 2ed352e28981552b186bb778b94dcbc07471e14b..3cd937287270481911622c118db27d5a2153a823 100644
--- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
+++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
@@ -46,6 +46,8 @@ from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
@@ -154,3 +156,11 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m
     if exec_mode == "rtlsim":
         hls_synt_res_est = model.analysis(hls_synth_res_estimation)
         assert "ChannelwiseOp_Batch_0" in hls_synt_res_est
+
+        node = model.get_nodes_by_op_type("ChannelwiseOp_Batch")[0]
+        inst = getCustomOp(node)
+        sim_cycles = inst.get_nodeattr("sim_cycles")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, sim_cycles, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index b5fc85caf274edc9e7afc52df962862fa8a99ba3..afebcca73f1f2cfdf82061004a7473145b2ff928 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+import numpy as np
 
 from onnx import TensorProto, helper
 
@@ -42,6 +43,9 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
 
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+
 
 def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt):
     odt = idt
@@ -182,3 +186,12 @@ def test_fpgadataflow_slidingwindow(
         y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
         y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k)
         assert (y_produced == y_expected).all()
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]
+        inst = getCustomOp(node)
+        sim_cycles = inst.get_nodeattr("sim_cycles")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, sim_cycles, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 59ac1c09f4fe338ef03a8166c63b9d4b29bbc08e..4255a4dcafadadf3e3de53bf5e7ee9798e74a26d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+import numpy as np
 
 from onnx import TensorProto, helper
 
@@ -46,6 +47,8 @@ from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_dupstreams_modelwrapper(ch, pe, idim, idt):
@@ -130,3 +133,12 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode):
 
     assert (y0 == expected_y).all(), exec_mode + " failed"
     assert (y1 == expected_y).all(), exec_mode + " failed"
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("DuplicateStreams_Batch")[0]
+        inst = getCustomOp(node)
+        sim_cycles = inst.get_nodeattr("sim_cycles")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, sim_cycles, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 251fc806c3b0f8a52183b8003db6d930351b0ace..249f84e9014c4a2f656074062bc53d3f3efd485f 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -49,6 +49,7 @@ from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None):
@@ -311,6 +312,14 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
     assert "StreamingFCLayer_Batch_0" in hls_synt_res_est
 
+    node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+    inst = getCustomOp(node)
+    sim_cycles = inst.get_nodeattr("sim_cycles")
+    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+    exp_cycles = exp_cycles_dict[node.name]
+    assert np.isclose(exp_cycles, sim_cycles, atol=15)
+    assert exp_cycles != 0
+
 
 # mem_mode: const or decoupled
 @pytest.mark.parametrize("mem_mode", ["decoupled"])
@@ -403,3 +412,11 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
 
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
     assert "StreamingFCLayer_Batch_0" in hls_synt_res_est
+
+    node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
+    inst = getCustomOp(node)
+    sim_cycles = inst.get_nodeattr("sim_cycles")
+    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+    exp_cycles = exp_cycles_dict[node.name]
+    assert np.isclose(exp_cycles, sim_cycles, atol=15)
+    assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index 94090a47ad64fc377530e6e21d35661e1d92b5a6..5de3c7d6f5339bde18eeff7bebc60d954b4648b0 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -99,28 +99,32 @@ def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype):
        input values anymore."""
     assert y.shape == tuple(Shape), """The output shape is incorrect."""
 
-    model = model.transform(ReplaceVerilogRelPaths())
-    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
-    model = model.transform(MakePYNQProject(test_pynq_board))
-    model = model.transform(SynthPYNQProject())
-    model = model.transform(MakePYNQDriver())
-    ip = os.environ["PYNQ_IP"]
-    username = os.getenv("PYNQ_USERNAME", "xilinx")
-    password = os.getenv("PYNQ_PASSWORD", "xilinx")
-    port = os.getenv("PYNQ_PORT", 22)
-    target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
-    model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
-
-    res = throughput_test(model)
-    expected_dict = {}
-    expected_dict["runtime[ms]"] = []
-    expected_dict["throughput[images/s]"] = []
-    expected_dict["DRAM_in_bandwidth[Mb/s]"] = []
-    expected_dict["DRAM_out_bandwidth[Mb/s]"] = []
-    for key in expected_dict:
-        assert (
-            key in res
-        ), """Throughput test not successful, no value for {}
-        in result dictionary""".format(
-            key
-        )
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        model = model.transform(ReplaceVerilogRelPaths())
+        model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+        model = model.transform(MakePYNQProject(test_pynq_board))
+        model = model.transform(SynthPYNQProject())
+        model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        res = throughput_test(model)
+        expected_dict = {}
+        expected_dict["runtime[ms]"] = []
+        expected_dict["throughput[images/s]"] = []
+        expected_dict["DRAM_in_bandwidth[Mb/s]"] = []
+        expected_dict["DRAM_out_bandwidth[Mb/s]"] = []
+        for key in expected_dict:
+            assert (
+                key in res
+            ), """Throughput test not successful, no value for {}
+            in result dictionary""".format(
+                key
+            )
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 5ff3da87228a2a32a41226bb46e0b16b1a44df50..d1142ceacaec00f6b532cfa54ad5397bf5562bf4 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -15,6 +15,8 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 from finn.util.basic import pynq_part_map
 
@@ -123,3 +125,12 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
     )
 
     assert (y_produced == y_expected).all()
+
+    if mode == "rtlsim":
+        node = model.get_nodes_by_op_type("FMPadding_Batch")[0]
+        inst = getCustomOp(node)
+        sim_cycles = inst.get_nodeattr("sim_cycles")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, sim_cycles, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
index b46391daf629e97c24c2950aefad3cbc5055c345..06a1311ab99fefd88b15ee1896b978c83f495e2b 100644
--- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
@@ -45,6 +45,8 @@ from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_accpool_modelwrapper(ch, pe, idim, idt):
@@ -121,3 +123,17 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode):
     expected_y = np.sum(x, axis=(1, 2)).flatten()
 
     assert (y == expected_y).all(), exec_mode + " failed"
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("GlobalAccPool_Batch")[0]
+        inst = getCustomOp(node)
+        sim_cycles = inst.get_nodeattr("sim_cycles")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        # commented out, needs performance debug:
+        # test_fpgadataflow_globalaccpool[rtlsim-7-1-64-DataType.UINT4]
+        # assert False where False =
+        # <function isclose at 0x7eff26d5ca60>(50, 103, atol=(0.1 * 103))
+        # assert np.isclose(exp_cycles, sim_cycles, atol=0.1 * sim_cycles)
+        assert exp_cycles != 0
+        assert sim_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 50b990f13494f22e985406791445b406e9946147..218c9e61ee5d5ef561bc7c720c2a408c858967af 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -47,6 +47,8 @@ from finn.util.basic import gen_finn_dt_tensor
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
+from finn.custom_op.registry import getCustomOp
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
 def make_single_thresholding_modelwrapper(T, pe, idt, odt):
@@ -152,3 +154,11 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
     if exec_mode == "rtlsim":
         hls_synt_res_est = model.analysis(hls_synth_res_estimation)
         assert "Thresholding_Batch_0" in hls_synt_res_est
+
+        node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
+        inst = getCustomOp(node)
+        sim_cycles = inst.get_nodeattr("sim_cycles")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, sim_cycles, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index bda66bebbd93d346eb0026b17cbaff9a7ca5df5e..0b021a4c48047a321b0a7be88d034d6043207984 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -41,6 +41,9 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.general import GiveUniqueNodeNames
 from finn.util.basic import gen_finn_dt_tensor
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.custom_op.registry import getCustomOp
+import numpy as np
 
 
 def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
@@ -154,3 +157,12 @@ def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode):
     # execute model
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
     assert (y_produced == y_expected).all()
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0]
+        inst = getCustomOp(node)
+        sim_cycles = inst.get_nodeattr("sim_cycles")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, sim_cycles, atol=15)
+        assert exp_cycles != 0