diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py index 335dfec04e4abee41f914c5d912ce291a0d31a91..a533e4d36629f57f7c4a576570d75a1e051de5be 100644 --- a/src/finn/core/remote_exec.py +++ b/src/finn/core/remote_exec.py @@ -79,6 +79,12 @@ def remote_exec(model, execution_context): bash_command = ["/bin/bash", "-c", cmd] process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) process_compile.communicate() + # remove stale output file from local dir, if any + try: + os.remove("{}/output.npy".format(deployment_dir)) + except FileNotFoundError: + pass + # copy generated output to local cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/output.npy {}".format( pynq_password, pynq_port, diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py index c82d540e29fc59b92a22bf011e823a9f8c076843..8d3dabcf8af51327d5d951464c6d9b36e2f67497 100644 --- a/src/finn/core/throughput_test.py +++ b/src/finn/core/throughput_test.py @@ -30,10 +30,11 @@ import os import subprocess -def throughput_test(model): +def throughput_test(model, batchsize=1000): """Runs the throughput test for the given model remotely on the pynq board. The metadata properties related to the pynq board have to be set. - Returns a dictionary with results of the throughput test""" + Returns a dictionary with results of the throughput test. Returns None + if the test fails.""" pynq_ip = model.get_metadata_prop("pynq_ip") pynq_port = int(model.get_metadata_prop("pynq_port")) @@ -47,7 +48,8 @@ def throughput_test(model): cmd = ( "sshpass -p {} ssh {}@{} -p {} " '"cd {}/{}; echo "{}" | ' - 'sudo -S python3.6 driver.py --exec_mode="throughput_test" --batchsize=1000"' + 'sudo -S python3.6 driver.py --exec_mode="throughput_test" --batchsize=%d"' + % batchsize ).format( pynq_password, pynq_username, @@ -61,6 +63,12 @@ def throughput_test(model): process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) process_compile.communicate() + # remove any pre-existing metrics file + try: + os.remove("{}/nw_metrics.txt".format(deployment_dir)) + except FileNotFoundError: + pass + cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format( pynq_password, pynq_port, @@ -74,7 +82,9 @@ def throughput_test(model): process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) process_compile.communicate() - with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file: - res = eval(file.read()) - - return res + try: + with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file: + res = eval(file.read()) + return res + except FileNotFoundError: + return None diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 049ede5064d252bd6391184c4227e5367a8c1e2b..18d3db18da089a5dda4dbb6d97180dd4a20613b5 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -107,6 +107,13 @@ class MakePYNQDriver(Transformation): driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded)) driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed)) + # clock settings for driver + clk_ns = float(model.get_metadata_prop("clk_ns")) + fclk_mhz = 1 / (clk_ns * 0.001) + # TODO change according to PYNQ board? + driver = driver.replace("$CLK_NAME$", "fclk0_mhz") + driver = driver.replace("$CLOCK_FREQ_MHZ$", str(fclk_mhz)) + with open(driver_py, "w") as f: f.write(driver) # copy all the dependencies into the driver folder diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index 55ecb57decd2ac4fa08331b5ebbcb7fd2f0cd5c6..ab9fd03251819aee72f74cc0c1fa17b99b1e05a4 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -91,7 +91,7 @@ cd %s pynq_driver_template = """ import argparse - +import os from pynq import Overlay import numpy as np from pynq import allocate @@ -101,6 +101,7 @@ from finn.util.data_packing import ( packed_bytearray_to_finnpy ) from finn.core.datatype import DataType +from pynq.ps import Clocks class FINNAccelDriver(): def __init__(self, N, bitfile): @@ -118,8 +119,12 @@ class FINNAccelDriver(): self.oshape_folded = $OUTPUT_SHAPE_FOLDED$ self.ishape_packed = $INPUT_SHAPE_PACKED$ # datatype np.uint8 self.oshape_packed = $OUTPUT_SHAPE_PACKED$ # datatype np.uint8 + # clock frequency + self.fclk_mhz = $CLOCK_FREQ_MHZ$ # load bitfile and set up accelerator self.ol = Overlay(bitfile) + # set the clock frequency as specified by user during transformations + Clocks.$CLK_NAME$ = self.fclk_mhz self.dma = self.ol.axi_dma_0 self.ctrl_regs = self.ol.resize_accel_0 # neuron folding factor of output = iterations per sample @@ -202,6 +207,12 @@ if __name__ == "__main__": # for the remote execution the data from the input npy file has to be loaded, # packed and copied to the PYNQ buffer if exec_mode == "execute": + # remove old output file to prevent reusing old output + # in case execution fails + try: + os.remove(outputfile) + except FileNotFoundError: + pass # load desired input .npy file ibuf_normal = np.load(inputfile) ibuf_folded = finnDriver.fold_input(ibuf_normal) @@ -212,10 +223,15 @@ if __name__ == "__main__": # for the throughput test the runtime of the network has to be measured if exec_mode == "throughput_test": - # measure runtime of network - start = time.time() + # remove old metrics file + try: + os.remove("nw_metrics.txt") + except FileNotFoundError: + pass # dictionary for results of throughput test res={} + # measure runtime of network + start = time.time() # execute accelerator finnDriver.execute() @@ -228,6 +244,8 @@ if __name__ == "__main__": res["throughput[images/s]"] = N / runtime res["DRAM_in_bandwidth[Mb/s]"] = np.prod(finnDriver.ishape_packed)*0.000001 / runtime res["DRAM_out_bandwidth[Mb/s]"] = np.prod(finnDriver.oshape_packed)*0.000001 / runtime + res["fclk[mhz]"] = Clocks.fclk0_mhz + res["N"] = N file = open("nw_metrics.txt", "w") file.write(str(res)) file.close() diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index bc413bf665e96be1d58a5de13b0744fd6a80f855..3880bb9591e27af5fe9d063dba2485d304e4db54 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -43,6 +43,13 @@ pynq_part_map["Pynq-Z1"] = "xc7z020clg400-1" pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1" pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e" +# native AXI HP port width (in bits) for PYNQ boards +pynq_native_port_width = dict() +pynq_native_port_width["Pynq-Z1"] = 64 +pynq_native_port_width["Pynq-Z2"] = 64 +pynq_native_port_width["Ultra96"] = 128 +pynq_native_port_width["ZCU104"] = 128 + def get_rtlsim_trace_depth(): """Return the trace depth for rtlsim via PyVerilator. Controllable diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py index e6d1fc4efd61c01654ee88638698215d23a82eb3..c3359dcc82650bf0e9e8a5bc5276f5ca770ee96c 100644 --- a/tests/end2end/test_end2end_cnv_w1a1.py +++ b/tests/end2end/test_end2end_cnv_w1a1.py @@ -76,7 +76,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") test_fpga_part = pynq_part_map[test_pynq_board] -target_clk_ns = 5 +target_clk_ns = 10 mem_mode = "decoupled" diff --git a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py b/tests/end2end/test_end2end_tfc_w1a1.py similarity index 98% rename from tests/end2end/test_end2end_tfc_w1a1_throughput_test.py rename to tests/end2end/test_end2end_tfc_w1a1.py index 1ba149687bb80a0f977115bd380a09f70eef23f1..15c1c41b006c6f87d79a0e7eb6a4458838de5fd2 100644 --- a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py +++ b/tests/end2end/test_end2end_tfc_w1a1.py @@ -41,7 +41,6 @@ import onnx.numpy_helper as nph import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls import finn.transformation.streamline.absorb as absorb from finn.core.onnx_exec import execute_onnx -from finn.core.throughput_test import throughput_test from finn.custom_op.registry import getCustomOp from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount from finn.transformation.fold_constants import FoldConstants @@ -332,9 +331,6 @@ def test_end2end_tfc_w1a1_run_on_pynq(): ret = execute_onnx(parent_model, {iname: x}, True) y = ret[oname] assert np.isclose(y, y_golden).all() - child_model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) - res = throughput_test(child_model) - assert res is not None except KeyError: pytest.skip("PYNQ board IP address not specified") diff --git a/tests/pynq/test_pynq_performance_end2end.py b/tests/pynq/test_pynq_performance_end2end.py new file mode 100644 index 0000000000000000000000000000000000000000..66a93a190061e0142637be19bb2ea841d192745a --- /dev/null +++ b/tests/pynq/test_pynq_performance_end2end.py @@ -0,0 +1,65 @@ +import os + +import pytest +import numpy as np +from scipy.stats import linregress +import warnings +from finn.util.test import load_test_checkpoint_or_skip +from finn.core.throughput_test import throughput_test + +build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] + + +@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1"]) +@pytest.mark.slow +def test_pynq_performance_end2end(end2end_example): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_%s_pynq_deploy.onnx" % end2end_example + ) + try: + ip = os.environ["PYNQ_IP"] # NOQA + board = os.environ["PYNQ_BOARD"] # NOQA + if ip == "" or board == "": + pytest.skip("PYNQ board or IP address not specified") + ret = dict() + # try a range of batch sizes, some may fail due to insufficient DMA + # buffers + bsize_range_in = [2 ** i for i in range(16)] + bsize_range = [] + for bsize in bsize_range_in: + res = throughput_test(model, bsize) + if res is not None: + ret[bsize] = res + bsize_range.append(bsize) + else: + # assume we reached largest possible N + break + + y = [ret[key]["runtime[ms]"] for key in bsize_range] + lrret = linregress(bsize_range, y) + ret_str = "" + ret_str += "\n" + "%s Throughput Test Results" % end2end_example + ret_str += "\n" + "-----------------------------" + ret_str += "\n" + "From linear regression:" + ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept + ret_str += "\n" + "Time per sample: %f ms" % lrret.slope + ret_str += "\n" + "Raw data:" + + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]" + ) + for k in bsize_range: + v = ret[k] + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + k, + np.round(v["runtime[ms]"], 4), + v["fclk[mhz]"], + np.round(v["throughput[images/s]"], 2), + np.round(v["DRAM_in_bandwidth[Mb/s]"], 2), + np.round(v["DRAM_out_bandwidth[Mb/s]"], 2), + ) + ret_str += "\n" + "-----------------------------" + warnings.warn(ret_str) + + except KeyError: + pytest.skip("PYNQ board or IP address not specified") diff --git a/tests/pynq/test_pynq_performance_fifo.py b/tests/pynq/test_pynq_performance_fifo.py new file mode 100644 index 0000000000000000000000000000000000000000..1d4542473c4b58d3baa62f4123fd0f2f76954d95 --- /dev/null +++ b/tests/pynq/test_pynq_performance_fifo.py @@ -0,0 +1,128 @@ +import os + +import pytest + +import numpy as np +from onnx import TensorProto, helper + +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker +from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ +from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject +from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject +import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp +from finn.transformation.general import GiveUniqueNodeNames +from finn.util.basic import pynq_part_map, pynq_native_port_width +from finn.core.throughput_test import throughput_test +from scipy.stats import linregress +import warnings + + +def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype): + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape) + + FIFO_node = helper.make_node( + "StreamingFIFO", + ["inp"], + ["outp"], + domain="finn", + backend="fpgadataflow", + depth=Depth, + folded_shape=fld_shape, + dataType=str(finn_dtype.name), + ) + + graph = helper.make_graph( + nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="fifo-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", finn_dtype) + model.set_tensor_datatype("outp", finn_dtype) + + return model + + +@pytest.mark.vivado +@pytest.mark.slow +def test_pynq_performance_fifo(): + try: + ip = os.environ["PYNQ_IP"] # NOQA + board = os.environ["PYNQ_BOARD"] # NOQA + if ip == "" or board == "": + pytest.skip("PYNQ board or IP address not specified") + fifo_width = pynq_native_port_width[board] + shape = (1, fifo_width) + folded_shape = (1, 1, fifo_width) + depth = 16 + clk_ns = 10 + dtype = DataType.BIPOLAR + fpga_part = pynq_part_map[board] + username = os.getenv("PYNQ_USERNAME", "xilinx") + password = os.getenv("PYNQ_PASSWORD", "xilinx") + port = os.getenv("PYNQ_PORT", 22) + target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") + + model = make_single_fifo_modelwrapper(shape, depth, folded_shape, dtype) + model = model.transform(InsertTLastMarker()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(fpga_part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(rvp.ReplaceVerilogRelPaths()) + model = model.transform(CreateStitchedIP(fpga_part, clk_ns)) + model = model.transform(MakePYNQProject(board)) + model = model.transform(SynthPYNQProject()) + model = model.transform(MakePYNQDriver()) + model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir)) + + ret = dict() + # try a range of batch sizes, some may fail due to insufficient DMA + # buffers + bsize_range_in = [2 ** i for i in range(20)] + bsize_range = [] + for bsize in bsize_range_in: + res = throughput_test(model, bsize) + if res is not None: + ret[bsize] = res + bsize_range.append(bsize) + else: + # assume we reached largest possible N + break + + y = [ret[key]["runtime[ms]"] for key in bsize_range] + lrret = linregress(bsize_range, y) + ret_str = "" + ret_str += "\n" + "FIFO Throughput Test Results" + ret_str += "\n" + "-----------------------------" + ret_str += "\n" + "From linear regression:" + ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept + ret_str += "\n" + "Time per sample: %f ms" % lrret.slope + ret_str += "\n" + "Raw data:" + + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]" + ) + for k in bsize_range: + v = ret[k] + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + k, + np.round(v["runtime[ms]"], 4), + v["fclk[mhz]"], + np.round(v["throughput[images/s]"], 2), + np.round(v["DRAM_in_bandwidth[Mb/s]"], 2), + np.round(v["DRAM_out_bandwidth[Mb/s]"], 2), + ) + ret_str += "\n" + "-----------------------------" + warnings.warn(ret_str) + + except KeyError: + pytest.skip("PYNQ board or IP address not specified")