Commit 5be0e52f authored by Yaman Umuroglu

Merge branch 'feature/throughput_test_rtlsim' into dev

parents 165d4c97 b5c9ee07
@@ -66,6 +66,11 @@ def rtlsim_exec(model, execution_context):
i_stream_w = first_node.get_instream_width()
# convert input into time multiplexed shape
i_folded_shape = first_node.get_folded_input_shape()
batchsize = i_tensor.shape[0]
# override batch size for input
i_folded_shape = list(i_folded_shape)
i_folded_shape[0] = batchsize
i_folded_shape = tuple(i_folded_shape)
# TODO any other layout transformations need to happen here!
i_tensor = i_tensor.reshape(i_folded_shape)
# extract output shape
@@ -74,12 +79,20 @@ def rtlsim_exec(model, execution_context):
o_dt = model.get_tensor_datatype(o_name)
last_node = getCustomOp(model.find_producer(o_name))
o_folded_shape = last_node.get_folded_output_shape()
# override batch size from actual input
o_shape = list(o_shape)
o_shape[0] = batchsize
o_shape = tuple(o_shape)
o_folded_shape = list(o_folded_shape)
o_folded_shape[0] = batchsize
o_folded_shape = tuple(o_folded_shape)
o_stream_w = last_node.get_outstream_width()
packedBits = o_stream_w
targetBits = o_dt.bitwidth()
# pack input
packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
num_out_values = last_node.get_number_output_values()
num_out_values *= batchsize
# prepare pyverilator model
rtlsim_so = model.get_metadata_prop("rtlsim_so")
if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
......
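To make the batch-size override in the hunk above concrete, a minimal sketch; the folded shape values here are assumed for illustration and not taken from the source:

# illustration only: shape values are assumed, not from an actual model
i_folded_shape = (1, 49, 16)  # (batch, stream words per image, elems per word)
batchsize = 10
i_folded_shape = (batchsize,) + i_folded_shape[1:]  # -> (10, 49, 16)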
@@ -28,6 +28,10 @@
import os
import subprocess
import numpy as np
from finn.util.basic import gen_finn_dt_tensor
from finn.core.rtlsim_exec import rtlsim_exec
def throughput_test(model, batchsize=1000):
@@ -88,3 +92,50 @@ def throughput_test(model, batchsize=1000):
return res
except FileNotFoundError:
return None
def throughput_test_rtlsim(model, batchsize=100):
"""Runs a throughput test for the given IP-stitched model. When combined
with tracing, useful to determine bottlenecks and required FIFO sizes."""
assert (
model.get_metadata_prop("exec_mode") == "rtlsim"
), """Top-level exec_mode
metadata_prop must be set to rtlsim"""
# create random input
iname = model.graph.input[0].name
ishape = model.get_tensor_shape(iname)
ishape_batch = list(ishape)  # copy, so the model's shape list is not mutated
ishape_batch[0] = batchsize
idt = model.get_tensor_datatype(iname)
dummy_input = gen_finn_dt_tensor(idt, ishape_batch)
# compute input/output sizes
oname = model.graph.output[0].name
oshape = model.get_tensor_shape(oname)
oshape_batch = list(oshape)  # copy, so the model's shape list is not mutated
oshape_batch[0] = batchsize
odt = model.get_tensor_datatype(oname)
i_bytes = (np.prod(ishape_batch) * idt.bitwidth()) / 8
o_bytes = (np.prod(oshape_batch) * odt.bitwidth()) / 8
# make empty exec context and insert input
ctx = model.make_empty_exec_context()
ctx[iname] = dummy_input
# remove liveness threshold, launch rtlsim
os.environ["LIVENESS_THRESHOLD"] = "-1"
rtlsim_exec(model, ctx)
# extract metrics
cycles = int(model.get_metadata_prop("sim_cycles"))
clk_ns = float(model.get_metadata_prop("clk_ns"))
fclk_mhz = 1 / (clk_ns * 0.001)
runtime_s = (cycles * clk_ns) * (10 ** -9)
res = dict()
res["cycles"] = cycles
res["runtime[ms]"] = runtime_s * 1000
res["throughput[images/s]"] = batchsize / runtime_s
res["DRAM_in_bandwidth[Mb/s]"] = i_bytes * 0.000001 / runtime_s
res["DRAM_out_bandwidth[Mb/s]"] = o_bytes * 0.000001 / runtime_s
res["fclk[mhz]"] = fclk_mhz
res["N"] = batchsize
return res
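A minimal usage sketch for throughput_test_rtlsim, assuming an IP-stitched rtlsim checkpoint already exists; the .onnx path below is hypothetical:

from finn.core.modelwrapper import ModelWrapper
from finn.core.throughput_test import throughput_test_rtlsim

# hypothetical checkpoint path; must be an IP-stitched model
model = ModelWrapper("/tmp/ipstitch_whole_rtlsim.onnx")
model.set_metadata_prop("exec_mode", "rtlsim")
res = throughput_test_rtlsim(model, batchsize=10)
print(res["cycles"], res["throughput[images/s]"])

As a rough sanity check on the metrics: with cycles = 7234 at batchsize = 100 (the values asserted in the tfc-w1a1 test below) and an assumed clk_ns of 5.0, runtime_s is about 3.6e-5, giving roughly 2.8 million images/s.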
@@ -70,6 +70,16 @@ def get_rtlsim_trace_depth():
return 1
def get_remote_vivado():
"""Return the address of the remote Vivado synthesis server as set by the,
REMOTE_VIVADO environment variable, otherwise return None"""
try:
return os.environ["REMOTE_VIVADO"]
except KeyError:
return None
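Since out_of_context_synth (further below) takes remote_server=get_remote_vivado() as a default argument, enabling the remote flow is a matter of setting the environment variable; the hostname here is hypothetical:

import os
os.environ["REMOTE_VIVADO"] = "user@vivado-host"  # hypothetical server

Note that Python evaluates default arguments once, at function definition time, so REMOTE_VIVADO must be set before the module defining out_of_context_synth is imported.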
def get_num_default_workers():
"""Return the number of workers for parallel transformations. Controllable
via the NUM_DEFAULT_WORKERS environment variable. If the env.var. is
......
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
from finn.core.modelwrapper import ModelWrapper
from onnx import TensorProto, helper
from finn.core.datatype import DataType
from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
def hls_random_mlp_maker(layer_spec):
"""Create an MLP of given specification using HLSCustomOp instances.
Generate random weights/thresholds of appropriate size."""
ret = []
for l in layer_spec:
idt = l["idt"]
wdt = l["wdt"]
mw = l["mw"]
mh = l["mh"]
act = l["act"]
l["W"] = gen_finn_dt_tensor(wdt, (mw, mh))
if act is None:
# no activation, produce accumulators
T = None
tdt = None
if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
odt = DataType.UINT32
else:
odt = DataType.INT32
else:
odt = act
(min_val, max_val) = calculate_signed_dot_prod_range(idt, wdt, mw)
n_steps = act.get_num_possible_values() - 1
T = np.random.randint(min_val, max_val - 1, (mh, n_steps)).astype(np.float32)
# provide non-decreasing thresholds
T = np.sort(T, axis=1)
# generate thresholds for activation
if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
tdt = DataType.UINT32
# bias thresholds to be positive
T = np.ceil((T + mw) / 2)
assert (T >= 0).all()
else:
tdt = DataType.INT32
l["T"] = T
l["tdt"] = tdt
l["odt"] = odt
ret.append(l)
return hls_mlp_maker(ret)
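A worked check of the bipolar threshold bias above; mw and the threshold values are assumed for illustration. With mw = 4 the signed dot product lies in [-4, 4], and ceil((T + mw) / 2) maps thresholds into the nonnegative accumulator domain used when both weights and inputs are BIPOLAR:

import numpy as np
mw = 4                            # assumed number of synapses per neuron
T = np.array([[-2.0, 0.0, 3.0]])  # assumed signed-domain thresholds
T_biased = np.ceil((T + mw) / 2)  # -> [[1., 2., 4.]], all nonnegative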
def hls_mlp_maker(layer_spec):
"""Create an MLP of given specification using HLSCustomOp instances."""
current_in_name = ""
current_out_name = ""
i = 0
graph = helper.make_graph(nodes=[], name="mlp", inputs=[], outputs=[])
model = helper.make_model(graph, producer_name="finn")
model = ModelWrapper(model)
for l in layer_spec:
current_W_name = "W_%d" % i
current_T_name = "T_%d" % i
current_in_name = "act_%d" % i
current_out_name = "act_%d" % (i + 1)
W = l["W"]
(mw, mh) = W.shape
T = l["T"]
pe = l["pe"]
simd = l["simd"]
wdt = l["wdt"]
idt = l["idt"]
tdt = l["tdt"]
odt = l["odt"]
if i == 0:
global_in = helper.make_tensor_value_info(
current_in_name, TensorProto.FLOAT, [1, mw]
)
model.graph.input.append(global_in)
if i == len(layer_spec) - 1:
global_out = helper.make_tensor_value_info(
current_out_name, TensorProto.FLOAT, [1, mh]
)
model.graph.output.append(global_out)
# there are two ways to implement bipolar weights and inputs for
# StreamingFC:
# - specify their datatypes as such
# - specify their datatypes as BINARY and use binaryXnorMode
if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
# we'll internally convert weights/inputs to binary and specify the
# datatypes as such, and also set the binaryXnorMode attribute to 1
export_wdt = DataType.BINARY
export_idt = DataType.BINARY
binary_xnor_mode = 1
else:
export_wdt = wdt
export_idt = idt
binary_xnor_mode = 0
if T is not None:
no_act = 0
node_inp_list = [current_in_name, current_W_name, current_T_name]
if odt == DataType.BIPOLAR:
actval = 0
else:
actval = odt.min()
else:
# no thresholds
node_inp_list = [current_in_name, current_W_name]
actval = 0
no_act = 1
FCLayer_node = helper.make_node(
"StreamingFCLayer_Batch",
node_inp_list,
[current_out_name],
domain="finn",
backend="fpgadataflow",
resType="ap_resource_lut()",
MW=mw,
MH=mh,
SIMD=simd,
PE=pe,
inputDataType=export_idt.name,
weightDataType=export_wdt.name,
outputDataType=odt.name,
ActVal=actval,
binaryXnorMode=binary_xnor_mode,
noActivation=no_act,
)
model.graph.node.append(FCLayer_node)
model.set_tensor_datatype(current_in_name, idt)
model.set_tensor_datatype(current_out_name, odt)
model.set_tensor_datatype(current_W_name, wdt)
if binary_xnor_mode:
# convert bipolar to binary
model.set_initializer(current_W_name, (W + 1) / 2)
else:
model.set_initializer(current_W_name, W)
if T is not None:
model.set_tensor_datatype(current_T_name, tdt)
model.set_initializer(current_T_name, T)
i += 1
return model
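A minimal sketch of calling hls_mlp_maker directly (rather than hls_random_mlp_maker), with a single layer and no activation; all values are assumed for illustration:

import numpy as np
from finn.core.datatype import DataType
import finn.util.create as create

layer_spec = [
    {
        "W": np.ones((4, 4), dtype=np.float32),  # assumed weights, shape (mw, mh)
        "T": None,    # no thresholds -> noActivation=1
        "tdt": None,
        "idt": DataType.INT2,
        "wdt": DataType.INT2,
        "odt": DataType.INT32,
        "pe": 4,
        "simd": 4,
    }
]
model = create.hls_mlp_maker(layer_spec)
assert len(model.graph.node) == 1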
@@ -28,6 +28,7 @@
import os
import subprocess
from finn.util.basic import get_remote_vivado
def which(program):
@@ -57,6 +58,7 @@ def out_of_context_synth(
fpga_part="xczu3eg-sbva484-1-e",
clk_name="ap_clk_0",
clk_period_ns=5.0,
remote_server=get_remote_vivado(),
):
"Run out-of-context Vivado synthesis, return resources and slack."
@@ -67,7 +69,10 @@ def out_of_context_synth(
if which("vivado") is None:
raise Exception("vivado is not in PATH, ensure settings64.sh is sourced.")
omx_path = os.environ["OHMYXILINX"]
script = "vivadocompile.sh"
if remote_server is None:
script = "vivadocompile.sh"
else:
script = "vivadoprojgen.sh"
# vivadocompile.sh/vivadoprojgen.sh <top-level-entity> <clock-name (optional)> <fpga-part (optional)>
call_omx = "zsh %s/%s %s %s %s %f" % (
omx_path,
@@ -85,6 +90,37 @@
vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name)
res_counts_path = vivado_proj_folder + "/res.txt"
if remote_server is not None:
run_synth = """
#!/bin/bash
which vivado;
cd %s;
vivado -mode tcl -source %s.tcl -tclargs %s;
cat %s
""" % (
vivado_proj_folder,
top_name,
top_name,
res_counts_path,
)
with open(vivado_proj_folder + "/run.sh", "w") as f:
f.write(run_synth)
# note that this assumes the same temp folder can be created on the
# remote server
remote_server_uri = remote_server + ":" + verilog_dir
copy_files = "rsync -avz %s %s" % (verilog_dir + "/", remote_server_uri + "/")
copy_files = copy_files.split()
proc = subprocess.Popen(copy_files, cwd=verilog_dir, env=os.environ)
proc.communicate()
vivado_cmd = "bash %s/run.sh" % vivado_proj_folder
run_vivado = ["ssh", "-t", remote_server, vivado_cmd]
proc = subprocess.Popen(run_vivado, cwd=verilog_dir, env=os.environ)
proc.communicate()
remote_server_result = remote_server + ":" + res_counts_path
copy_results = "rsync -avz %s %s" % (remote_server_result, res_counts_path)
copy_results = copy_results.split()
proc = subprocess.Popen(copy_results, cwd=verilog_dir, env=os.environ)
proc.communicate()
with open(res_counts_path, "r") as myfile:
res_data = myfile.read().split("\n")
......
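In effect, the remote flow above is a three-step protocol: rsync the out-of-context project folder to the remote host, ssh in and execute the generated run.sh under bash, then rsync the resulting res.txt back so it can be parsed locally. As the comment in the code notes, this assumes the same temporary folder path can be created on the remote server.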
@@ -72,6 +72,7 @@ from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
from finn.core.throughput_test import throughput_test_rtlsim
build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -142,15 +143,15 @@ def test_end2end_cnv_w1a1_fold_and_tlastmarker():
fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
# each tuple is (PE, SIMD, in_fifo_depth) for a layer
folding = [
(16, 3, 256),
(32, 32, 256),
(16, 32, 256),
(16, 32, 256),
(4, 32, 214),
(1, 32, 2),
(1, 4, 2),
(1, 4, 126),
(1, 8, 62),
(5, 1, 6),
]
for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):
fcl_inst = getCustomOp(fcl)
@@ -159,10 +160,12 @@ def test_end2end_cnv_w1a1_fold_and_tlastmarker():
fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
swg_idepth = [2, 51, 9, 106, 2, 2]
for i in range(len(swg_layers)):
swg_inst = getCustomOp(swg_layers[i])
simd = folding[i][1]
swg_inst.set_nodeattr("SIMD", simd)
swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i])
model = model.transform(InsertDWC())
model = model.transform(InsertFIFO())
@@ -221,6 +224,20 @@ def test_end2end_cnv_w1a1_verify_dataflow_part():
assert np.isclose(res_cppsim, res_rtlsim_whole).all()
@pytest.mark.vivado
def test_end2end_cnv_w1a1_throughput_test_rtlsim():
model = load_test_checkpoint_or_skip(
build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx"
)
model.set_metadata_prop("rtlsim_trace", "rtlsim_trace.vcd")
# os.environ["RTLSIM_TRACE_DEPTH"] = "4"
# run through IP-stitched rtlsim and check the number of cycles
# it takes to execute
ret = throughput_test_rtlsim(model, 10)
# TODO check for expected performance
assert ret["cycles"] > 0
@pytest.mark.vivado
def test_end2end_cnv_w1a1_verify_all():
# use the streamlined model as the "golden" model for right answers
......
@@ -72,6 +72,7 @@ from finn.util.basic import pynq_part_map
from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.core.throughput_test import throughput_test_rtlsim
import finn.util.vcd as vcd
build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
@@ -225,6 +226,21 @@ def test_end2end_tfc_w1a1_verify_fifo_fullness():
)
@pytest.mark.vivado
def test_end2end_tfc_w1a1_throughput_test_rtlsim():
model = load_test_checkpoint_or_skip(
build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx"
)
# run through IP-stitched rtlsim with increasing batch sizes and
# check the number of cycles it takes to execute
ret = throughput_test_rtlsim(model, 1)
assert ret["cycles"] == 205
ret = throughput_test_rtlsim(model, 10)
assert ret["cycles"] == 844
ret = throughput_test_rtlsim(model, 100)
assert ret["cycles"] == 7234
@pytest.mark.vivado
def test_end2end_tfc_w1a1_verify_all():
# use the streamlined model as the "golden" model for right answers
......
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import pytest
import finn.util.create as create
from finn.core.datatype import DataType
@pytest.mark.parametrize("bitwidth", [DataType.BIPOLAR, DataType.INT2, DataType.INT4])
def test_hls_random_mlp_maker(bitwidth):
w = bitwidth
a = bitwidth
layer_spec = [
{
"mw": 185,
"mh": 100,
"simd": 185,
"pe": 100,
"idt": DataType.BIPOLAR,
"wdt": w,
"act": a,
},
{"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
{"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
{"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
{
"mw": 100,
"mh": 1,
"simd": 100,
"pe": 1,
"idt": a,
"wdt": w,
"act": DataType.BIPOLAR,
},
]
ret = create.hls_random_mlp_maker(layer_spec)
assert len(ret.graph.node) == 5
ret.save("mlp-%s.onnx" % str(bitwidth))