Merge pull request #750 from Xilinx/feature/test_fifosizing_cnv

Add cnv-w2a2 to FIFO sizing test

Merge pull request #750 from Xilinx/feature/test_fifosizing_cnv
Add cnv-w2a2 to FIFO sizing test
e352acdf · auphelia · GitHub · e79ea42f · f3c4d2f4 · e352acdf
Unverified Commit e352acdf authored 2 years ago by auphelia Committed by GitHub 2 years ago
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -668,7 +668,6 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
        rtlsim_bs = int(cfg.rtlsim_batch_size)
        orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
        if force_python_rtlsim:
-            # run with single input to get latency
            assert rtlsim_bs > 0, "rtlsim batch size must be >0"
            if cfg.verify_save_rtlsim_waveforms:
                # set depth to 3 for layer-by-layer visibility
@@ -680,9 +679,11 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
            rtlsim_model.set_metadata_prop(
                "extra_verilator_args", str(["-CFLAGS", "-O3"])
            )
+            # run with single input to get latency
+            rtlsim_latency_dict = throughput_test_rtlsim(rtlsim_model, 1)
+            # run with batch to get stable-state throughput
            rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
-            rtlsim_latency = rtlsim_perf_dict["cycles"]
-            rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
+            rtlsim_perf_dict["latency_cycles"] = rtlsim_latency_dict["cycles"]
        else:
            rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs)
            # keep keys consistent between the Python and C++-styles
@@ -696,6 +697,19 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
            for (key, val) in rtlsim_perf_dict.items():
                if "max_count" in key:
                    del rtlsim_perf_dict[key]
+        # estimate stable-state throughput based on latency+throughput
+        if rtlsim_bs == 1:
+            rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_perf_dict[
+                "throughput[images/s]"
+            ]
+        else:
+            total_cycles = rtlsim_perf_dict["cycles"]
+            latency_cycles = rtlsim_perf_dict["latency_cycles"]
+            stablestate_cycles = total_cycles - latency_cycles
+            clk_ns = float(model.get_metadata_prop("clk_ns"))
+            fclk_mhz = 1 / (clk_ns * 0.001)
+            runtime_s = (stablestate_cycles * clk_ns) * (10**-9)
+            rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_bs / runtime_s

        with open(report_dir + "/rtlsim_performance.json", "w") as f:
            json.dump(rtlsim_perf_dict, f, indent=2)

--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -81,15 +81,11 @@ class InsertDWC(Transformation):
                            dwc_in_width = n0.get_outstream_width()
                            # determine dwc outwidth
                            dwc_out_width = n1.get_instream_width()
-                            larger_width = max(dwc_in_width, dwc_out_width)
-                            smaller_width = min(dwc_in_width, dwc_out_width)
-                            both_8bit_aligned = (larger_width % 8 == 0) and (
-                                smaller_width % 8 == 0
-                            )
-                            if both_8bit_aligned:
-                                impl_style = "vivado"
-                            else:
-                                impl_style = "hls"
+                            # use hls mode by default since it supports more configs
+                            # vivado mode can be manually enabled by user, but does not
+                            # support e.g. node-by-node rtlsim needed for
+                            # characterization-based FIFO sizing
+                            impl_style = "hls"

                            # determine shape for dwc
                            dwc_shape = n0.get_normal_output_shape()

--- a/tests/fpgadataflow/test_fifosizing.py
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -55,7 +55,7 @@ def fetch_test_model(topology, wbits=2, abits=2):
 @pytest.mark.parametrize(
    "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"]
 )
-@pytest.mark.parametrize("topology", ["tfc"])
+@pytest.mark.parametrize("topology", ["tfc", "cnv"])
 def test_fifosizing_linear(method, topology):
    force_python_rtlsim = "python" in method
    method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize"
@@ -68,7 +68,7 @@ def test_fifosizing_linear(method, topology):
        force_python_rtlsim=force_python_rtlsim,
        synth_clk_period_ns=10.0,
        board="Pynq-Z1",
-        rtlsim_batch_size=100,
+        rtlsim_batch_size=100 if topology == "tfc" else 2,
        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
        generate_outputs=[
            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
@@ -83,7 +83,7 @@ def test_fifosizing_linear(method, topology):
    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
        sim_data = json.load(f)
    assert (
-        float(sim_data["throughput[images/s]"])
+        float(sim_data["stable_throughput[images/s]"])
        / float(est_data["estimated_throughput_fps"])
        > 0.9
    )