diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 2ee898bc7d50822f962b6a70cf86b2893e0937b7..6e07a541e3d462b159792482dae4777999921a2c 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -668,7 +668,6 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
         rtlsim_bs = int(cfg.rtlsim_batch_size)
         orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
         if force_python_rtlsim:
-            # run with single input to get latency
             assert rtlsim_bs > 0, "rtlsim batch size must be >0"
             if cfg.verify_save_rtlsim_waveforms:
                 # set depth to 3 for layer-by-layer visibility
@@ -680,9 +679,11 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
                 "extra_verilator_args", str(["-CFLAGS", "-O3"])
+            # run with single input to get latency
+            rtlsim_latency_dict = throughput_test_rtlsim(rtlsim_model, 1)
+            # run with batch to get stable-state throughput
             rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
-            rtlsim_latency = rtlsim_perf_dict["cycles"]
-            rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
+            rtlsim_perf_dict["latency_cycles"] = rtlsim_latency_dict["cycles"]
             rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs)
             # keep keys consistent between the Python and C++-styles
@@ -696,6 +697,19 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
             for (key, val) in rtlsim_perf_dict.items():
                 if "max_count" in key:
                     del rtlsim_perf_dict[key]
+        # estimate stable-state throughput based on latency+throughput
+        if rtlsim_bs == 1:
+            rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_perf_dict[
+                "throughput[images/s]"
+            ]
+        else:
+            total_cycles = rtlsim_perf_dict["cycles"]
+            latency_cycles = rtlsim_perf_dict["latency_cycles"]
+            stablestate_cycles = total_cycles - latency_cycles
+            clk_ns = float(model.get_metadata_prop("clk_ns"))
+            fclk_mhz = 1 / (clk_ns * 0.001)
+            runtime_s = (stablestate_cycles * clk_ns) * (10**-9)
+            rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_bs / runtime_s
         with open(report_dir + "/rtlsim_performance.json", "w") as f:
             json.dump(rtlsim_perf_dict, f, indent=2)
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 632d1f813b4d2509407930bc9294f7531d4c90af..cff8b602674fec41a1e6fd1d467acdc989b4afe2 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -81,15 +81,11 @@ class InsertDWC(Transformation):
                             dwc_in_width = n0.get_outstream_width()
                             # determine dwc outwidth
                             dwc_out_width = n1.get_instream_width()
-                            larger_width = max(dwc_in_width, dwc_out_width)
-                            smaller_width = min(dwc_in_width, dwc_out_width)
-                            both_8bit_aligned = (larger_width % 8 == 0) and (
-                                smaller_width % 8 == 0
-                            )
-                            if both_8bit_aligned:
-                                impl_style = "vivado"
-                            else:
-                                impl_style = "hls"
+                            # use hls mode by default since it supports more configs
+                            # vivado mode can be manually enabled by user, but does not
+                            # support e.g. node-by-node rtlsim needed for
+                            # characterization-based FIFO sizing
+                            impl_style = "hls"
                             # determine shape for dwc
                             dwc_shape = n0.get_normal_output_shape()
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
index f4f2b8dbfff0d720ec4eb901704581b096c0ea40..9399fbe3949a5d0052ba80b24b1a9e0c44c5597c 100644
--- a/tests/fpgadataflow/test_fifosizing.py
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -55,7 +55,7 @@ def fetch_test_model(topology, wbits=2, abits=2):
     "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"]
-@pytest.mark.parametrize("topology", ["tfc"])
+@pytest.mark.parametrize("topology", ["tfc", "cnv"])
 def test_fifosizing_linear(method, topology):
     force_python_rtlsim = "python" in method
     method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize"
@@ -68,7 +68,7 @@ def test_fifosizing_linear(method, topology):
-        rtlsim_batch_size=100,
+        rtlsim_batch_size=100 if topology == "tfc" else 2,
@@ -83,7 +83,7 @@ def test_fifosizing_linear(method, topology):
     with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
         sim_data = json.load(f)
     assert (
-        float(sim_data["throughput[images/s]"])
+        float(sim_data["stable_throughput[images/s]"])
         / float(est_data["estimated_throughput_fps"])
         > 0.9