Commit 5be0e52f authored by Yaman Umuroglu

Merge branch 'feature/throughput_test_rtlsim' into dev

parents 165d4c97 b5c9ee07
@@ -66,6 +66,11 @@ def rtlsim_exec(model, execution_context):
i_stream_w = first_node.get_instream_width()
# convert input into time multiplexed shape
i_folded_shape = first_node.get_folded_input_shape()
batchsize = i_tensor.shape[0]
# override batch size for input
i_folded_shape = list(i_folded_shape)
i_folded_shape[0] = batchsize
i_folded_shape = tuple(i_folded_shape)
# TODO any other layout transformations need to happen here!
i_tensor = i_tensor.reshape(i_folded_shape)
# extract output shape
@@ -74,12 +79,20 @@ def rtlsim_exec(model, execution_context):
o_dt = model.get_tensor_datatype(o_name)
last_node = getCustomOp(model.find_producer(o_name))
o_folded_shape = last_node.get_folded_output_shape()
# override batch size from actual input
o_shape = list(o_shape)
o_shape[0] = batchsize
o_shape = tuple(o_shape)
o_folded_shape = list(o_folded_shape)
o_folded_shape[0] = batchsize
o_folded_shape = tuple(o_folded_shape)
o_stream_w = last_node.get_outstream_width()
packedBits = o_stream_w
targetBits = o_dt.bitwidth()
# pack input
packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
num_out_values = last_node.get_number_output_values()
num_out_values *= batchsize
# prepare pyverilator model
rtlsim_so = model.get_metadata_prop("rtlsim_so")
if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
......
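To make the batch-size override in the hunk above concrete, a minimal sketch; the folded shape values here are assumed for illustration and not taken from the source:

# illustration only: shape values are assumed, not from an actual model
i_folded_shape = (1, 49, 16)  # (batch, stream words per image, elems per word)
batchsize = 10
i_folded_shape = (batchsize,) + i_folded_shape[1:]  # -> (10, 49, 16)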
@@ -28,6 +28,10 @@
import os
import subprocess
import numpy as np
from finn.util.basic import gen_finn_dt_tensor
from finn.core.rtlsim_exec import rtlsim_exec
def throughput_test(model, batchsize=1000):
@@ -88,3 +92,50 @@ def throughput_test(model, batchsize=1000):
return res
except FileNotFoundError:
return None
def throughput_test_rtlsim(model, batchsize=100):
"""Runs a throughput test for the given IP-stitched model. When combined
with tracing, useful to determine bottlenecks and required FIFO sizes."""
assert (
model.get_metadata_prop("exec_mode") == "rtlsim"
), """Top-level exec_mode
metadata_prop must be set to rtlsim"""
# create random input
iname = model.graph.input[0].name
ishape = model.get_tensor_shape(iname)
ishape_batch = list(ishape)  # copy, so the model's shape list is not mutated
ishape_batch[0] = batchsize
idt = model.get_tensor_datatype(iname)
dummy_input = gen_finn_dt_tensor(idt, ishape_batch)
# compute input/output sizes
oname = model.graph.output[0].name
oshape = model.get_tensor_shape(oname)
oshape_batch = list(oshape)  # copy, so the model's shape list is not mutated
oshape_batch[0] = batchsize
odt = model.get_tensor_datatype(oname)
i_bytes = (np.prod(ishape_batch) * idt.bitwidth()) / 8
o_bytes = (np.prod(oshape_batch) * odt.bitwidth()) / 8
# make empty exec context and insert input
ctx = model.make_empty_exec_context()
ctx[iname] = dummy_input
# remove liveness threshold, launch rtlsim
os.environ["LIVENESS_THRESHOLD"] = "-1"
rtlsim_exec(model, ctx)
# extract metrics
cycles = int(model.get_metadata_prop("sim_cycles"))
clk_ns = float(model.get_metadata_prop("clk_ns"))
fclk_mhz = 1 / (clk_ns * 0.001)
runtime_s = (cycles * clk_ns) * (10 ** -9)
res = dict()
res["cycles"] = cycles
res["runtime[ms]"] = runtime_s * 1000
res["throughput[images/s]"] = batchsize / runtime_s
res["DRAM_in_bandwidth[Mb/s]"] = i_bytes * 0.000001 / runtime_s
res["DRAM_out_bandwidth[Mb/s]"] = o_bytes * 0.000001 / runtime_s
res["fclk[mhz]"] = fclk_mhz
res["N"] = batchsize
return res
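A minimal usage sketch for throughput_test_rtlsim, assuming an IP-stitched rtlsim checkpoint already exists; the .onnx path below is hypothetical:

from finn.core.modelwrapper import ModelWrapper
from finn.core.throughput_test import throughput_test_rtlsim

# hypothetical checkpoint path; must be an IP-stitched model
model = ModelWrapper("/tmp/ipstitch_whole_rtlsim.onnx")
model.set_metadata_prop("exec_mode", "rtlsim")
res = throughput_test_rtlsim(model, batchsize=10)
print(res["cycles"], res["throughput[images/s]"])

As a rough sanity check on the metrics: with cycles = 7234 at batchsize = 100 (the values asserted in the tfc-w1a1 test below) and an assumed clk_ns of 5.0, runtime_s is about 3.6e-5, giving roughly 2.8 million images/s.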
@@ -70,6 +70,16 @@ def get_rtlsim_trace_depth():
return 1
def get_remote_vivado():
"""Return the address of the remote Vivado synthesis server as set by the,
REMOTE_VIVADO environment variable, otherwise return None"""
try:
return os.environ["REMOTE_VIVADO"]
except KeyError:
return None
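Since out_of_context_synth (further below) takes remote_server=get_remote_vivado() as a default argument, enabling the remote flow is a matter of setting the environment variable; the hostname here is hypothetical:

import os
os.environ["REMOTE_VIVADO"] = "user@vivado-host"  # hypothetical server

Note that Python evaluates default arguments once, at function definition time, so REMOTE_VIVADO must be set before the module defining out_of_context_synth is imported.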
def get_num_default_workers():
"""Return the number of workers for parallel transformations. Controllable
via the NUM_DEFAULT_WORKERS environment variable. If the env.var. is
......
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
from finn.core.modelwrapper import ModelWrapper
from onnx import TensorProto, helper
from finn.core.datatype import DataType
from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
def hls_random_mlp_maker(layer_spec):
"""Create an MLP of given specification using HLSCustomOp instances.
Generate random weights/thresholds of appropriate size."""
ret = []
for l in layer_spec:
idt = l["idt"]
wdt = l["wdt"]
mw = l["mw"]
mh = l["mh"]
act = l["act"]
l["W"] = gen_finn_dt_tensor(wdt, (mw, mh))
if act is None:
# no activation, produce accumulators
T = None
tdt = None
if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
odt = DataType.UINT32
else:
odt = DataType.INT32
else:
odt = act
(min_val, max_val) = calculate_signed_dot_prod_range(idt, wdt, mw)
n_steps = act.get_num_possible_values() - 1
T = np.random.randint(min_val, max_val - 1, (mh, n_steps)).astype(np.float32)
# provide non-decreasing thresholds
T = np.sort(T, axis=1)
# generate thresholds for activation
if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
tdt = DataType.UINT32
# bias thresholds to be positive
T = np.ceil((T + mw) / 2)
assert (T >= 0).all()
else:
tdt = DataType.INT32
l["T"] = T
l["tdt"] = tdt
l["odt"] = odt
ret.append(l)
return hls_mlp_maker(ret)
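A worked check of the bipolar threshold bias above; mw and the threshold values are assumed for illustration. With mw = 4 the signed dot product lies in [-4, 4], and ceil((T + mw) / 2) maps thresholds into the nonnegative accumulator domain used when both weights and inputs are BIPOLAR:

import numpy as np
mw = 4                            # assumed number of synapses per neuron
T = np.array([[-2.0, 0.0, 3.0]])  # assumed signed-domain thresholds
T_biased = np.ceil((T + mw) / 2)  # -> [[1., 2., 4.]], all nonnegative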
def hls_mlp_maker(layer_spec):
"""Create an MLP of given specification using HLSCustomOp instances."""
current_in_name = ""
current_out_name = ""
i = 0
graph = helper.make_graph(nodes=[], name="mlp", inputs=[], outputs=[])
model = helper.make_model(graph, producer_name="finn")
model = ModelWrapper(model)
for l in layer_spec:
current_W_name = "W_%d" % i
current_T_name = "T_%d" % i
current_in_name = "act_%d" % i
current_out_name = "act_%d" % (i + 1)
W = l["W"]
(mw, mh) = W.shape
T = l["T"]
pe = l["pe"]
simd = l["simd"]
wdt = l["wdt"]
idt = l["idt"]
tdt = l["tdt"]
odt = l["odt"]
if i == 0:
global_in = helper.make_tensor_value_info(
current_in_name, TensorProto.FLOAT, [1, mw]
)
model.graph.input.append(global_in)
if i == len(layer_spec) - 1:
global_out = helper.make_tensor_value_info(
current_out_name, TensorProto.FLOAT, [1, mh]
)
model.graph.output.append(global_out)
# there are two ways to implement bipolar weights and inputs for
# StreamingFC:
# - specify their datatypes as such
# - specify their datatypes as BINARY and use binaryXnorMode
if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
# we'll internally convert weights/inputs to binary and specify the
# datatypes as such, and also set the binaryXnorMode attribute to 1
export_wdt = DataType.BINARY
export_idt = DataType.BINARY
binary_xnor_mode = 1
else:
export_wdt = wdt
export_idt = idt
binary_xnor_mode = 0
if T is not None:
no_act = 0
node_inp_list = [current_in_name, current_W_name, current_T_name]
if odt == DataType.BIPOLAR:
actval = 0
else:
actval = odt.min()
else:
# no thresholds
node_inp_list = [current_in_name, current_W_name]
actval = 0
no_act = 1
FCLayer_node = helper.make_node(
"StreamingFCLayer_Batch",
node_inp_list,
[current_out_name],
domain="finn",
backend="fpgadataflow",
resType="ap_resource_lut()",
MW=mw,
MH=mh,
SIMD=simd,
PE=pe,
inputDataType=export_idt.name,
weightDataType=export_wdt.name,
outputDataType=odt.name,
ActVal=actval,
binaryXnorMode=binary_xnor_mode,
noActivation=no_act,
)
model.graph.node.append(FCLayer_node)
model.set_tensor_datatype(current_in_name, idt)
model.set_tensor_datatype(current_out_name, odt)
model.set_tensor_datatype(current_W_name, wdt)
if binary_xnor_mode:
# convert bipolar to binary
model.set_initializer(current_W_name, (W + 1) / 2)
else:
model.set_initializer(current_W_name, W)
if T is not None:
model.set_tensor_datatype(current_T_name, tdt)
model.set_initializer(current_T_name, T)
i += 1
return model
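A minimal sketch of calling hls_mlp_maker directly (rather than hls_random_mlp_maker), with a single layer and no activation; all values are assumed for illustration:

import numpy as np
from finn.core.datatype import DataType
import finn.util.create as create

layer_spec = [
    {
        "W": np.ones((4, 4), dtype=np.float32),  # assumed weights, shape (mw, mh)
        "T": None,    # no thresholds -> noActivation=1
        "tdt": None,
        "idt": DataType.INT2,
        "wdt": DataType.INT2,
        "odt": DataType.INT32,
        "pe": 4,
        "simd": 4,
    }
]
model = create.hls_mlp_maker(layer_spec)
assert len(model.graph.node) == 1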
@@ -28,6 +28,7 @@
import os
import subprocess
from finn.util.basic import get_remote_vivado
def which(program):
@@ -57,6 +58,7 @@ def out_of_context_synth(
fpga_part="xczu3eg-sbva484-1-e",
clk_name="ap_clk_0",
clk_period_ns=5.0,
remote_server=get_remote_vivado(),
):
"Run out-of-context Vivado synthesis, return resources and slack."
@@ -67,7 +69,10 @@ def out_of_context_synth(
if which("vivado") is None:
raise Exception("vivado is not in PATH, ensure settings64.sh is sourced.")
omx_path = os.environ["OHMYXILINX"]
script = "vivadocompile.sh"
if remote_server is None:
script = "vivadocompile.sh"
else:
script = "vivadoprojgen.sh"
# vivadocompile.sh/vivadoprojgen.sh <top-level-entity> <clock-name (optional)> <fpga-part (optional)>
call_omx = "zsh %s/%s %s %s %s %f" % (
omx_path,
@@ -85,6 +90,37 @@
vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name)
res_counts_path = vivado_proj_folder + "/res.txt"
if remote_server is not None:
run_synth = """
#!/bin/bash
which vivado;
cd %s;
vivado -mode tcl -source %s.tcl -tclargs %s;
cat %s
""" % (
vivado_proj_folder,
top_name,
top_name,
res_counts_path,
)
with open(vivado_proj_folder + "/run.sh", "w") as f:
f.write(run_synth)
# note that this assumes the same temp folder can be created on the
# remote server
remote_server_uri = remote_server + ":" + verilog_dir
copy_files = "rsync -avz %s %s" % (verilog_dir + "/", remote_server_uri + "/")
copy_files = copy_files.split()
proc = subprocess.Popen(copy_files, cwd=verilog_dir, env=os.environ)
proc.communicate()
vivado_cmd = "bash %s/run.sh" % vivado_proj_folder
run_vivado = ["ssh", "-t", remote_server, vivado_cmd]
proc = subprocess.Popen(run_vivado, cwd=verilog_dir, env=os.environ)
proc.communicate()
remote_server_result = remote_server + ":" + res_counts_path
copy_results = "rsync -avz %s %s" % (remote_server_result, res_counts_path)
copy_results = copy_results.split()
proc = subprocess.Popen(copy_results, cwd=verilog_dir, env=os.environ)
proc.communicate()
with open(res_counts_path, "r") as myfile:
res_data = myfile.read().split("\n")
......
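In effect, the remote flow above is a three-step protocol: rsync the out-of-context project folder to the remote host, ssh in and execute the generated run.sh under bash, then rsync the resulting res.txt back so it can be parsed locally. As the comment in the code notes, this assumes the same temporary folder path can be created on the remote server.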
@@ -72,6 +72,7 @@ from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
from finn.core.throughput_test import throughput_test_rtlsim
build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -142,15 +143,15 @@ def test_end2end_cnv_w1a1_fold_and_tlastmarker():
fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
# each tuple is (PE, SIMD, in_fifo_depth) for a layer
folding = [
(16, 3, 256),
(32, 32, 256),
(16, 32, 256),
(16, 32, 256),
(4, 32, 214),
(1, 32, 2),
(1, 4, 2),
(1, 4, 126),
(1, 8, 62),
(5, 1, 6),
]
for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):
fcl_inst = getCustomOp(fcl)
@@ -159,10 +160,12 @@ def test_end2end_cnv_w1a1_fold_and_tlastmarker():
fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
swg_idepth = [2, 51, 9, 106, 2, 2]
for i in range(len(swg_layers)):
swg_inst = getCustomOp(swg_layers[i])
simd = folding[i][1]
swg_inst.set_nodeattr("SIMD", simd)
swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i])
model = model.transform(InsertDWC())
model = model.transform(InsertFIFO())
@@ -221,6 +224,20 @@ def test_end2end_cnv_w1a1_verify_dataflow_part():
assert np.isclose(res_cppsim, res_rtlsim_whole).all()
@pytest.mark.vivado
def test_end2end_cnv_w1a1_throughput_test_rtlsim():
model = load_test_checkpoint_or_skip(
build_dir + "/end2end_cnv_w1a1_ipstitch_whole_rtlsim.onnx"
)
model.set_metadata_prop("rtlsim_trace", "rtlsim_trace.vcd")
# os.environ["RTLSIM_TRACE_DEPTH"] = "4"
# run through IP-stitched rtlsim and check the number of cycles
# it takes to execute
ret = throughput_test_rtlsim(model, 10)
# TODO check for expected performance
assert ret["cycles"] > 0
@pytest.mark.vivado
def test_end2end_cnv_w1a1_verify_all():
# use the streamlined model as the "golden" model for right answers
......
@@ -72,6 +72,7 @@ from finn.util.basic import pynq_part_map
from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.core.throughput_test import throughput_test_rtlsim
import finn.util.vcd as vcd
build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
@@ -225,6 +226,21 @@ def test_end2end_tfc_w1a1_verify_fifo_fullness():
)
@pytest.mark.vivado
def test_end2end_tfc_w1a1_throughput_test_rtlsim():
model = load_test_checkpoint_or_skip(
build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx"
)
# run through IP-stitched rtlsim with increasing batch sizes and
# check the number of cycles it takes to execute
ret = throughput_test_rtlsim(model, 1)
assert ret["cycles"] == 205
ret = throughput_test_rtlsim(model, 10)
assert ret["cycles"] == 844
ret = throughput_test_rtlsim(model, 100)
assert ret["cycles"] == 7234
@pytest.mark.vivado
def test_end2end_tfc_w1a1_verify_all():
# use the streamlined model as the "golden" model for right answers
......
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import pytest
import finn.util.create as create
from finn.core.datatype import DataType
@pytest.mark.parametrize("bitwidth", [DataType.BIPOLAR, DataType.INT2, DataType.INT4])
def test_hls_random_mlp_maker(bitwidth):
w = bitwidth
a = bitwidth
layer_spec = [
{
"mw": 185,
"mh": 100,
"simd": 185,
"pe": 100,
"idt": DataType.BIPOLAR,
"wdt": w,
"act": a,
},
{"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
{"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
{"mw": 100, "mh": 100, "simd": 100, "pe": 100, "idt": a, "wdt": w, "act": a},
{
"mw": 100,
"mh": 1,
"simd": 100,
"pe": 1,
"idt": a,
"wdt": w,
"act": DataType.BIPOLAR,
},
]
ret = create.hls_random_mlp_maker(layer_spec)
assert len(ret.graph.node) == 5
ret.save("mlp-%s.onnx" % str(bitwidth))