diff --git a/.gitignore b/.gitignore
index d7ee7e014a0c175a8a88060f2aa320efeb501ddc..0c1bbd84fe24be46446a7d714dd708d601813e53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -84,3 +84,10 @@ MANIFEST

 # PYNQ board files
 /board_files/
+
+# datasets for testing
+/dataset/
+/data/
+
+# Google Drive key for dashboard
+/gdrive-key/
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 27028a0fa3e64d0396ec8e69ab2ad725eccca75f..fac168d55edd565b1cf84c4d9b556c51feb4e526 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -59,6 +59,7 @@ RUN apt update; apt install nano
 RUN pip install pytest-dependency
 RUN pip install pytest-xdist
 RUN pip install pytest-parallel
+RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading

 ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index a84cd7be48578d72cd931672298d5695a5fc8268..89cf2c6747b5adbf89f4dc8563e817965cec3394 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -57,6 +57,7 @@ RUN pip install sphinx_rtd_theme==0.5.0
 RUN pip install pytest-xdist==2.0.0
 RUN pip install pytest-parallel==0.1.0
 RUN pip install netron
+RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading

 # switch user
 RUN groupadd -g $GID $GNAME
diff --git a/requirements.txt b/requirements.txt
index 4aa1cbe3484a3447851879d7da9ce9d48b066592..ba7bc716b741911820e67f1455aeca4c05e6e005 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 bitstring==3.1.7
 docrep==0.2.7
 future==0.18.2
+gspread==3.6.0
 numpy==1.18.0
 onnx==1.6.0
 onnxruntime==1.2.0
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 15e2a69cd3f61c59287264b76b2229a7da3a3734..85b52c0f33baac609b4dad4df59f8442f737ffc2 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -42,7 +42,7 @@ import finn.analysis.topology as ta
 from finn.util.basic import sanitize_quant_values, get_sanitize_quant_tensors


-def execute_node(node, context, graph):
+def execute_node(node, context, graph, return_full_exec_context=False):
     """Executes a single node by using onnxruntime, with custom function or
     if dataflow partition by using remote execution or rtlsim.
@@ -59,16 +59,21 @@ def execute_node(node, context, graph):
             if old_iname != new_iname:
                 inp_ctx[new_iname] = inp_ctx[old_iname]
                 del inp_ctx[old_iname]
-        ret = execute_onnx(model, inp_ctx, False)
+        ret = execute_onnx(model, inp_ctx, return_full_exec_context)
         # if the model was in ip-stitched rtlsim mode, may get annotation
         # for numbet of elapsed cycles, save again
         if model.get_metadata_prop("exec_mode") == "rtlsim":
             model.save(sdp_node.get_nodeattr("model"))
         # output may have been renamed in partition
-        assert len(ret) == 1
+        assert len(model.graph.output) == 1
         node_oname = node.output[0]
         model_oname = model.graph.output[0].name
         context[node_oname] = ret[model_oname]
+        # prefix and insert exec context entries
+        if return_full_exec_context:
+            for tname in ret.keys():
+                if tname != model_oname:
+                    context[node.name + "_" + tname] = ret[tname]
     else:
         if node.domain == "finn":
@@ -198,7 +203,7 @@ def execute_onnx(
                 execution_context = sanitize_quant_values(
                     model, node.input, execution_context
                 )
-            execute_node(node, execution_context, graph)
+            execute_node(node, execution_context, graph, return_full_exec_context)
             if get_sanitize_quant_tensors() != 0:
                 # round output values to quantization annotation
                 execution_context = sanitize_quant_values(
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index c6598a30ec3c1b50bdd5c532cefc071d422c40ab..6e206d2058076802a48b69f4c69cccf744489f31 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -34,6 +34,7 @@ from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from onnx import TensorProto, helper
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from finn.util.basic import roundup_to_integer_multiple


 class LabelSelect_Batch(HLSCustomOp):
@@ -46,6 +47,10 @@ class LabelSelect_Batch(HLSCustomOp):
             # If not provided compute min size
             labels = self.get_nodeattr("Labels")
             odt = DataType.get_smallest_possible(labels - 1)
+            # ensure a datatype whose bitwidth is divisible by 8 in case this is the last node
+            bw = roundup_to_integer_multiple(odt.bitwidth(), 8)
+            new_odt_name = odt.name.replace(str(odt.bitwidth()), str(bw))
+            odt = DataType[new_odt_name]
             odt_name = odt.name
             self.set_nodeattr("outputDataType", odt_name)
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 562bab0f18990096f7364b3a4e2bcbbbf4ce2b58..2429bf6190f822fb4a6c988fcbb34152d5a338e0 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -73,6 +73,8 @@ class Thresholding_Batch(HLSCustomOp):
             # [4] is four vectors (like a FC layer with batch=4)
             # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
             "numInputVectors": ("ints", False, [1]),
+            # initialization value for the thresholding accumulator
+            "ActVal": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -321,7 +323,7 @@ class Thresholding_Batch(HLSCustomOp):
                 threshold_tensor.shape[-1],
                 tdt_hls,
                 odt_hls,
-                export_odt.min(),
+                self.get_nodeattr("ActVal"),
                 "std::less_equal<%s>" % tdt_hls,
             )
         )
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index c6edd6104e48e88a9233777a29e41b60fbb588ca..d4d5b006493b8db1da0184e98ba35493d3e6ccbd 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -820,7 +820,19 @@ class InferThresholdingLayer(Transformation):
                 assert ifc % pe == 0, "Requirement IFC divisable by PE is violated."

                 odt = model.get_tensor_datatype(thl_output)
-                # create and insert new StreamingFCLayer node
+                scale = getCustomOp(node).get_nodeattr("out_scale")
+                assert (
+                    scale == 1.0
+                ), "MultiThreshold out_scale must be equal to 1.0 for HLS conversion."
+                actval = getCustomOp(node).get_nodeattr("out_bias")
+                assert (
+                    int(actval) == actval
+                ), "MultiThreshold out_bias must be integer for HLS conversion."
+                actval = int(actval)
+                assert (not odt.signed()) or (
+                    actval < 0
+                ), "Signed output requires actval < 0"
+                # create and insert new Thresholding_Batch node
                 new_node = helper.make_node(
                     "Thresholding_Batch",
                     [thl_input, thl_threshold],
@@ -832,6 +844,7 @@ class InferThresholdingLayer(Transformation):
                     inputDataType=idt.name,
                     outputDataType=odt.name,
                     numInputVectors=list(thl_in_shape[:-1]),
+                    ActVal=actval,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 334507affba51e948bc5a907af3003152821a3f9..0def25d8429f5d3f6c02a9db656650bc1baba6ee 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -54,7 +54,7 @@ class CreateStitchedIP(Transformation):
     The packaged block design IP can be found under the ip subdirectory.
     """

-    def __init__(self, fpgapart, clk_ns=10.0, ip_name="finn_design", vitis=False):
+    def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False):
         super().__init__()
         self.fpgapart = fpgapart
         self.clk_ns = clk_ns
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 0e50213ee6feee5f45c18f87cb31a5faf5fb1c50..813b40698d1beec54e6ba3fa5344a8d0bb715a00 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -124,6 +124,13 @@ class MakePYNQDriver(Transformation):
         with open(driver_py, "w") as f:
             f.write(driver)
+
+        # add validate.py to run full top-1 test (only for suitable networks)
+        validate_py = pynq_driver_dir + "/validate.py"
+        validate_src = templates.pynq_validation_template
+        with open(validate_py, "w") as f:
+            f.write(validate_src)
+
         # copy all the dependencies into the driver folder
         shutil.copytree(
             get_finn_root() + "/src/finn/util", pynq_driver_dir + "/finn/util"
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 66580c70d23a2d2b19bfe8d94fefcd39a3208bcb..2b3789dc21cb4fe3f62d6b2a2ea0888329c9db66 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -436,3 +436,55 @@ open_project $VITIS_PROJ_PATH$/_x/link/vivado/vpl/prj/prj.xpr
 open_run impl_1
 report_utilization -hierarchical -hierarchical_depth 5 -file $VITIS_PROJ_PATH$/synth_report.xml -format xml
 """
+
+pynq_validation_template = """
+import argparse
+from driver import FINNAccelDriver
+import numpy as np
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Validate top-1 accuracy for FINN accelerator')
+    parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=100)
+    parser.add_argument('--dataset', help='dataset to use (mnist or cifar10)', required=True)
+    # parse arguments
+    args = parser.parse_args()
+    bsize = args.batchsize
+    dataset = args.dataset
+
+    if dataset == "mnist":
+        from dataset_loading import mnist
+        trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data("/tmp", download=True, one_hot=False)
+    elif dataset == "cifar10":
+        from dataset_loading import cifar
+        trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data("/tmp", download=True, one_hot=False)
+    else:
+        raise Exception("Unrecognized dataset")
+
+    test_imgs = testx
+    test_labels = testy
+
+    ok = 0
+    nok = 0
+    total = test_imgs.shape[0]
+    driver = FINNAccelDriver(bsize, "resizer.bit", "zynq-iodma")
+
+    n_batches = int(total / bsize)
+
+    test_imgs = test_imgs.reshape(n_batches, bsize, -1)
+    test_labels = test_labels.reshape(n_batches, bsize)
+
+    for i in range(n_batches):
+        ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
+        exp = test_labels[i]
+        driver.copy_input_data_to_device(ibuf_normal)
+        driver.execute()
+        obuf_normal = np.empty_like(driver.obuf_packed_device)
+        driver.copy_output_data_from_device(obuf_normal)
+        # minlength=2 guards against batches where all predictions agree
+        ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2)
+        nok += ret[0]
+        ok += ret[1]
+        print("batch %d / %d : total OK %d NOK %d" % (i, n_batches, ok, nok))
+
+    acc = 100.0 * ok / total
+    print("Final accuracy: %f" % acc)
+"""
diff --git a/src/finn/transformation/infer_data_layouts.py b/src/finn/transformation/infer_data_layouts.py
index e7a6b88239a1735d5379e165333f8356ae6f88a1..d07162fa049bd016e91b8c5b01ea56eda6267655 100644
--- a/src/finn/transformation/infer_data_layouts.py
+++ b/src/finn/transformation/infer_data_layouts.py
@@ -75,6 +75,17 @@ def _infer_node_data_layout(model, node):
         inp_layout = model.get_tensor_layout(node.input[0])
         out_layout = [inp_layout[i] for i in perm]
         model.set_tensor_layout(node.output[0], out_layout)
+    elif node.op_type == "Unsqueeze":
+        inp_layout = model.get_tensor_layout(node.input[0])
+        # add dummy dimension at the output
+        out_layout = inp_layout + ["x"]
+        model.set_tensor_layout(node.output[0], out_layout)
+    elif node.op_type == "Squeeze":
+        inp_layout = model.get_tensor_layout(node.input[0])
+        assert inp_layout[-1] == "x"
+        # remove dummy dimension
+        out_layout = inp_layout[:-1]
+        model.set_tensor_layout(node.output[0], out_layout)
     else:
         # try to guess based on number of output dims
         for o in node.output:
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index 8398a277443530e84632d26fbfca6d90ea4b0b9e..0f2c5525d91263b44002677b505087d38408333a 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -424,8 +424,9 @@ class AbsorbTransposeIntoFlatten(Transformation):
         return (model, graph_modified)


-class AbsorbScalarMulIntoTopK(Transformation):
-    """Absorb a mul node into a suceeding topk node if the mul is scalar."""
+class AbsorbScalarMulAddIntoTopK(Transformation):
+    """Remove mul/add node prior to topk node if the op is scalar. Note that
+    the TopK output probabilities will change, but the indices won't."""

     def apply(self, model):
         graph = model.graph
@@ -435,14 +436,17 @@
             node_ind += 1
             if n.op_type == "TopK":
                 prod = model.find_producer(n.input[0])
-                if prod is not None and prod.op_type == "Mul":
+                if prod is not None and (prod.op_type in ["Mul", "Add"]):
                     prod_input = prod.input[0]
                     param_name = prod.input[1]
                     A = model.get_initializer(param_name)
                     if A is None:
                         warnings.warn("Param is not constant, skipping")
                         continue
-                    if all(x == 1 for x in A.shape) and A > 0:
+                    is_scalar = all(x == 1 for x in A.shape)
+                    is_scalar_pos_mul = is_scalar and (prod.op_type == "Mul") and A > 0
+                    is_scalar_add = is_scalar and (prod.op_type == "Add")
+                    if is_scalar_pos_mul or is_scalar_add:
                         # if the mul is scalar and positive, we can just delete the
                         # mul node and rewire the top k node. Because the top k node
                         # works with probabilities and their relation to each other
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index b47f269dd6f2671c3d98c9316954483c0e72f14f..f4c1dc1306b67e5807c25cfb08c961729dbfbdf6 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -533,7 +533,7 @@ class MoveScalarLinearPastInvariants(Transformation):
                 if prod0 is None:
                     continue

-                if prod0.op_type == "Mul" or prod0.op_type == "Add":
+                if prod0.op_type in ["Mul", "Add", "Div"]:
                     # check if second input of producer is an initializer
                     init0 = model.get_initializer(prod0.input[1])
                     # if either initializer is None, skip
diff --git a/src/finn/util/gdrive.py b/src/finn/util/gdrive.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2d9b89e354e42849a82b563fe391b9f6e603f4e
--- /dev/null
+++ b/src/finn/util/gdrive.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import gspread
+import os
+import warnings
+from datetime import datetime
+
+
+def upload_to_end2end_dashboard(data_dict):
+    gdrive_key = "/workspace/finn/gdrive-key/service_account.json"
+    if not os.path.isfile(gdrive_key):
+        warnings.warn("Google Drive key not found, skipping dashboard upload")
+        return
+    gc = gspread.service_account(filename=gdrive_key)
+    spreadsheet = gc.open("finn-end2end-dashboard")
+    worksheet = spreadsheet.get_worksheet(0)
+    keys = list(data_dict.keys())
+    vals = list(data_dict.values())
+    # check against existing header
+    existing_keys = worksheet.row_values(1)
+    if existing_keys != keys:
+        # create new worksheet
+        dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        worksheet = spreadsheet.add_worksheet(
+            title="Dashboard " + dtstr, rows=10, cols=len(keys), index=0
+        )
+        # create header row with keys
+        worksheet.update("A1:1", [keys])
+        # freeze and make header bold
+        worksheet.freeze(rows=1)
+        worksheet.format("A1:1", {"textFormat": {"bold": True}})
+    # insert values into new row
+    worksheet.insert_row([], index=2)
+    worksheet.update("A2:2", [vals])
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index 3cd4248c5fbf438ac7dd7974adb38d251d389a07..32c6a0a3a3bb19b95590181dbe447e82cf9966a2 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -77,6 +77,11 @@ def get_test_model_untrained(netname, wbits, abits):
     return get_test_model(netname, wbits, abits, pretrained=False)


+def get_topk(vec, k):
+    "Return indices of the top-k values in given array vec (treated as 1D)."
+    return np.flip(vec.flatten().argsort())[:k]
+
+
 def soft_verify_topk(invec, idxvec, k):
     """Check that the topK indices provided actually point to the topK largest
     values in the input vector"""
@@ -140,7 +145,6 @@ def get_example_input(topology):
     elif topology == "cnv":
         fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
         input_tensor = np.load(fn)["arr_0"].astype(np.float32)
-        input_tensor = input_tensor / 255
         return input_tensor
     else:
         raise Exception("Unknown topology, can't return example input")
@@ -158,7 +162,7 @@ def get_trained_network_and_ishape(topology, wbits, abits):
     return (model, ishape)


-def execute_parent(parent_path, child_path, input_tensor_npy):
+def execute_parent(parent_path, child_path, input_tensor_npy, return_full_ctx=False):
     """Execute parent model containing a single StreamingDataflowPartition by
     replacing it with the model at child_path and return result."""
@@ -169,5 +173,7 @@ def execute_parent(parent_path, child_path, input_tensor_npy):
     sdp_node = getCustomOp(sdp_node)
     sdp_node.set_nodeattr("model", child_path)
     ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
-    y = ret[oname]
-    return y
+    if return_full_ctx:
+        return ret
+    else:
+        return ret[oname]
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 29ecb2c7e49444cecade6d3321aaba3b9add4b9c..4eed1a260974e4f842e9e93756caff135c5fbdde 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -63,12 +63,16 @@ from finn.util.test import (
     get_example_input,
     get_trained_network_and_ishape,
     execute_parent,
+    get_topk,
 )
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
 from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
+from finn.transformation.streamline.reorder import (
+    MakeMaxPoolNHWC,
+    MoveScalarLinearPastInvariants,
+)
 import warnings
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
@@ -84,6 +88,15 @@ from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.core.modelwrapper import ModelWrapper
 from scipy.stats import linregress
 from finn.core.throughput_test import throughput_test_remote, throughput_test_rtlsim
+from finn.util.pytorch import ToTensor
+from finn.transformation.merge_onnx_models import MergeONNXModels
+from finn.transformation.insert_topk import InsertTopK
+from finn.core.datatype import DataType
+from dataset_loading import mnist, cifar
+from datetime import datetime
+import subprocess
+from finn.util.gdrive import upload_to_end2end_dashboard
+from collections import OrderedDict

 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 target_clk_ns = 10
@@ -95,6 +108,24 @@ def get_checkpoint_name(topology, wbits, abits, step):
     return build_dir + "/end2end_%s_w%da%d_%s.onnx" % (topology, wbits, abits, step)


+def get_dashboard_data(topology, wbits, abits):
+    stats_file = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits)
+    stats_dict = OrderedDict()
+    if os.path.isfile(stats_file):
+        with open(stats_file, "r") as f:
+            stats_dict_txt = f.read()
+        stats_dict = eval(stats_dict_txt)
+    return stats_dict
+
+
+def update_dashboard_data(topology, wbits, abits, key, val):
+    stats_dict = get_dashboard_data(topology, wbits, abits)
+    stats_dict[key] = val
+    stats_file = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits)
+    with open(stats_file, "w") as f:
+        f.write(str(stats_dict))
+
+
 def fold_tfc(model):
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
     # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
@@ -111,6 +142,10 @@ def fold_tfc(model):
         fcl_inst.set_nodeattr("inFIFODepth", ififo)
         fcl_inst.set_nodeattr("outFIFODepth", ofifo)
         fcl_inst.set_nodeattr("ram_style", ramstyle)
+    # set parallelism for input quantizer to be same as first layer's SIMD
+    inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
+    inp_qnt = getCustomOp(inp_qnt_node)
+    inp_qnt.set_nodeattr("PE", 49)
     return model


@@ -187,14 +222,71 @@ def get_folding_function(topology, wbits, abits):
         raise Exception("Unknown topology/quantization combo for predefined folding")


-def get_golden_io_pair(topology, wbits, abits):
+def get_golden_io_pair(topology, wbits, abits, preproc=ToTensor(), return_topk=None):
     (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
     input_tensor_npy = get_example_input(topology)
     input_tensor_torch = torch.from_numpy(input_tensor_npy).float()
+    if preproc is not None:
+        input_tensor_torch = preproc.forward(input_tensor_torch).detach()
     output_tensor_npy = model.forward(input_tensor_torch).detach().numpy()
+    if return_topk is not None:
+        output_tensor_npy = get_topk(output_tensor_npy, k=return_topk)
     return (input_tensor_npy, output_tensor_npy)


+def measure_top1_accuracy(model_chkpt, dataset, parent_chkpt=None):
+    if dataset == "cifar10":
+        trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data(
+            "/workspace/finn/dataset", download=True, one_hot=False
+        )
+    elif dataset == "mnist":
+        trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data(
+            "/workspace/finn/dataset", download=True, one_hot=False
+        )
+    else:
+        raise Exception("Unrecognized dataset")
+    # move from dataset_loader layout to ONNX layout: NHWC -> NCHW
+    testx = testx.transpose(0, 3, 1, 2)
+    model = ModelWrapper(model_chkpt)
+    iname = model.graph.input[0].name
+    oname = model.graph.output[0].name
+    if parent_chkpt is None:
+        ishape = model.get_tensor_shape(iname)
+    else:
+        parent_model = ModelWrapper(parent_chkpt)
+        parent_iname = parent_model.graph.input[0].name
+        ishape = parent_model.get_tensor_shape(parent_iname)
+    ok = 0
+    nok = 0
+    n_batches = testx.shape[0]
+    for i in range(n_batches):
+        tdata = testx[i].reshape(ishape).astype(np.float32)
+        exp = testy[i].item()
+        if parent_chkpt is not None:
+            y = execute_parent(parent_chkpt, model_chkpt, tdata)
+        else:
+            y = execute_onnx(model, {iname: tdata}, False)[oname]
+        ret = y.item()
+        if ret == exp:
+            ok += 1
+        else:
+            nok += 1
+        if i % 10 == 0:
+            print("%d : OK %d NOK %d " % (i, ok, nok))
+    acc_top1 = ok * 100.0 / (ok + nok)
+    warnings.warn("Final OK %d NOK %d top-1 %f" % (ok, nok, acc_top1))
+    return acc_top1
+
+
+def topology2dataset(topology):
+    if "fc" in topology:
+        return "mnist"
+    elif "cnv" in topology:
+        return "cifar10"
+    else:
+        raise Exception("Unrecognized topology")
+
+
 @pytest.mark.parametrize("wbits", [1, 2])
 @pytest.mark.parametrize("abits", [1, 2])
 @pytest.mark.parametrize("topology", ["tfc", "cnv"])
@@ -205,6 +297,15 @@ class TestEnd2End:
         (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
         chkpt_name = get_checkpoint_name(topology, wbits, abits, "export")
         bo.export_finn_onnx(model, ishape, chkpt_name)
+        nname = "%s_w%da%d" % (topology, wbits, abits)
+        update_dashboard_data(topology, wbits, abits, "network", nname)
+        dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        update_dashboard_data(topology, wbits, abits, "datetime", dtstr)
+        finn_commit = subprocess.check_output(
+            ["git", "rev-parse", "HEAD"], cwd="/workspace/finn"
+        )
+        finn_commit = finn_commit.decode("utf-8").strip()
+        update_dashboard_data(topology, wbits, abits, "finn-commit", finn_commit)
         assert os.path.isfile(chkpt_name)

     def test_import_and_tidy(self, topology, wbits, abits):
@@ -216,11 +317,43 @@
         model = model.transform(GiveReadableTensorNames())
         model = model.transform(InferDataTypes())
         model = model.transform(RemoveStaticGraphInputs())
-        model.save(get_checkpoint_name(topology, wbits, abits, "import_and_tidy"))
+        chkpt = get_checkpoint_name(topology, wbits, abits, "import_and_tidy")
+        model.save(chkpt)

-    def test_streamline(self, topology, wbits, abits):
+    def test_add_pre_and_postproc(self, topology, wbits, abits):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "import_and_tidy")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        global_inp_name = model.graph.input[0].name
+        ishape = model.get_tensor_shape(global_inp_name)
+        # preprocessing: torchvision's ToTensor divides uint8 inputs by 255
+        totensor_pyt = ToTensor()
+        chkpt_preproc_name = get_checkpoint_name(topology, wbits, abits, "preproc")
+        bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)
+        assert os.path.isfile(chkpt_preproc_name)
+        # join preprocessing and core model
+        pre_model = ModelWrapper(chkpt_preproc_name)
+        model = model.transform(MergeONNXModels(pre_model))
+        # add input quantization annotation: UINT8 for all BNN-PYNQ models
+        global_inp_name = model.graph.input[0].name
+        model.set_tensor_datatype(global_inp_name, DataType.UINT8)
+        # postprocessing: insert Top-1 node at the end
+        model = model.transform(InsertTopK(k=1))
+        chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post")
+        # tidy-up again
+        model = model.transform(InferShapes())
+        model = model.transform(FoldConstants())
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(GiveReadableTensorNames())
+        model = model.transform(InferDataTypes())
+        model = model.transform(RemoveStaticGraphInputs())
+        model.save(chkpt_name)
+        assert os.path.isfile(chkpt_name)
+
+    def test_streamline(self, topology, wbits, abits):
+        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post")
+        model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        # move past any reshapes to be able to streamline input scaling
+        model = model.transform(MoveScalarLinearPastInvariants())
         model = model.transform(Streamline())
         if "fc" not in topology:
             model = model.transform(LowerConvsToMatMul())
@@ -228,6 +361,9 @@
             model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
         model = model.transform(ConvertBipolarMatMulToXnorPopcount())
         model = model.transform(Streamline())
+        # absorb final add-mul nodes into TopK
+        model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
+        model = model.transform(InferDataLayouts())
         model = model.transform(RemoveUnusedTensors())
         model.save(get_checkpoint_name(topology, wbits, abits, "streamline"))
@@ -238,11 +374,17 @@
         model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
         # needed for non-bipolar MatMul layers
         model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+        # TopK to LabelSelect
+        model = model.transform(to_hls.InferLabelSelectLayer())
+        # input quantization (if any) to standalone thresholding
+        model = model.transform(to_hls.InferThresholdingLayer())
         # needed for convolutions
         if "fc" not in topology:
             model = model.transform(to_hls.InferConvInpGen())
             model = model.transform(to_hls.InferStreamingMaxPool())
             model = model.transform(RemoveCNVtoFCFlatten())
+        # get rid of Transpose -> Transpose identity seq
+        model = model.transform(absorb.AbsorbConsecutiveTransposes())
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(InferDataLayouts())
         model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hls_layers"))
@@ -285,7 +427,7 @@ class TestEnd2End:
         model.save(cppsim_chkpt)
         parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
-            topology, wbits, abits
+            topology, wbits, abits, return_topk=1
         )
         y = execute_parent(parent_chkpt, cppsim_chkpt, input_tensor_npy)
         assert np.isclose(y, output_tensor_npy).all()
@@ -294,6 +436,8 @@
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
     def test_ipgen(self, topology, wbits, abits, kind):
+        if kind == "alveo" and ("VITIS_PATH" not in os.environ):
+            pytest.skip("VITIS_PATH not set")
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold")
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(kind, target_clk_ns)["part"]
@@ -304,7 +448,7 @@
     @pytest.mark.slow
     @pytest.mark.vivado
-    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
+    @pytest.mark.parametrize("kind", ["zynq"])
     def test_ipstitch_rtlsim(self, topology, wbits, abits, kind):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
@@ -326,23 +470,32 @@
             "rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits)
         )
         os.environ["RTLSIM_TRACE_DEPTH"] = "3"
-        rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + kind)
"ipstitch_rtlsim_" + kind) + rtlsim_chkpt = get_checkpoint_name( + topology, wbits, abits, "ipstitch_rtlsim_" + kind + ) model.save(rtlsim_chkpt) parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( - topology, wbits, abits + topology, wbits, abits, return_topk=1 ) y = execute_parent(parent_chkpt, rtlsim_chkpt, input_tensor_npy) model = ModelWrapper(rtlsim_chkpt) perf["cycles_rtlsim"] = model.get_metadata_prop("cycles_rtlsim") - warnings.warn("Estimated & rtlsim performance: " + str(perf)) + # warnings.warn("Estimated & rtlsim performance: " + str(perf)) + # for (k, v) in perf.items(): + # update_dashboard_data(topology, wbits, abits, k, v) + update_dashboard_data( + topology, wbits, abits, "cycles_rtlsim", perf["cycles_rtlsim"] + ) assert np.isclose(y, output_tensor_npy).all() @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) + @pytest.mark.parametrize("kind", ["zynq"]) def test_throughput_rtlsim(self, topology, wbits, abits, kind): - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + kind) + prev_chkpt_name = get_checkpoint_name( + topology, wbits, abits, "ipstitch_rtlsim_" + kind + ) model = load_test_checkpoint_or_skip(prev_chkpt_name) n_nodes = len(model.graph.node) perf_est = model.analysis(dataflow_performance) @@ -354,6 +507,25 @@ class TestEnd2End: est_cycles = latency + cycles_per_sample_est * batchsize assert (abs(res_cycles - est_cycles) / res_cycles) < 0.15 + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.parametrize("kind", ["zynq"]) + def test_validate_top1(self, topology, wbits, abits, kind): + if "TEST_END2END_VALIDATE_TOP1" not in os.environ: + pytest.skip("TEST_END2END_VALIDATE_TOP1 not set") + prepostproc_chkpt = get_checkpoint_name(topology, wbits, abits, "pre_post") + streamline_chkpt = get_checkpoint_name(topology, wbits, abits, "streamline") + parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") + cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim") + rtlsim_chkpt = get_checkpoint_name( + topology, wbits, abits, "ipstitch_rtlsim_" + kind + ) + dataset = topology2dataset(topology) + assert measure_top1_accuracy(prepostproc_chkpt, dataset) > 80 + assert measure_top1_accuracy(streamline_chkpt, dataset) > 80 + assert measure_top1_accuracy(cppsim_chkpt, dataset, parent_chkpt) > 80 + assert measure_top1_accuracy(rtlsim_chkpt, dataset, parent_chkpt) > 80 + @pytest.mark.slow @pytest.mark.vivado @pytest.mark.vitis @@ -366,14 +538,10 @@ class TestEnd2End: cfg = get_build_env(kind, target_clk_ns) model = model.transform(cfg["build_fxn"]) model = model.transform(AnnotateResources("synth")) - warnings.warn( - "Post-synthesis resources (excluding shell): " - + model.get_metadata_prop("res_total_synth") - ) - warnings.warn( - "Post-synthesis resources (all inclusive): " - + model.get_metadata_prop("res_total_top_synth") - ) + synth_dct = eval(model.get_metadata_prop("res_total_top_synth")) + for (k, v) in synth_dct.items(): + update_dashboard_data(topology, wbits, abits, k, v) + update_dashboard_data(topology, wbits, abits, "board", cfg["board"]) model.save(get_checkpoint_name(topology, wbits, abits, "build_" + kind)) @pytest.mark.parametrize("kind", ["zynq", "alveo"]) @@ -396,14 +564,14 @@ class TestEnd2End: model.save(get_checkpoint_name(topology, wbits, abits, "deploy_" + kind)) @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_run_on_pynq(self, topology, 
+    def test_run_on_hw(self, topology, wbits, abits, kind):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "deploy_" + kind)
         model = load_test_checkpoint_or_skip(prev_chkpt_name)  # NOQA
         cfg = get_build_env(kind, target_clk_ns)
         if cfg["ip"] == "":
             pytest.skip("PYNQ board IP address not specified")
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
-            topology, wbits, abits
+            topology, wbits, abits, return_topk=1
         )
         parent_model = load_test_checkpoint_or_skip(
             get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
         )
@@ -412,9 +580,7 @@
         iname = parent_model.graph.input[0].name
         oname = parent_model.graph.output[0].name
         sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
         sdp_node = getCustomOp(sdp_node)
-        sdp_chkpt = get_checkpoint_name(topology, wbits, abits, "deploy")
-        load_test_checkpoint_or_skip(sdp_chkpt)
-        sdp_node.set_nodeattr("model", sdp_chkpt)
+        sdp_node.set_nodeattr("model", prev_chkpt_name)
         ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
         y = ret[oname]
         assert np.isclose(y, output_tensor_npy).all()
@@ -465,3 +631,21 @@
         )
         ret_str += "\n" + "-----------------------------"
         warnings.warn(ret_str)
+        largest_bsize = bsize_range[-1]
+        update_dashboard_data(
+            topology, wbits, abits, "fclk[mhz]", ret[largest_bsize]["fclk[mhz]"]
+        )
+        update_dashboard_data(
+            topology,
+            wbits,
+            abits,
+            "throughput[images/s]",
+            ret[largest_bsize]["throughput[images/s]"],
+        )
+
+    def test_upload_results_to_dashboard(self, topology, wbits, abits):
+        dashboard_data = get_dashboard_data(topology, wbits, abits)
+        if len(dashboard_data.keys()) > 0:
+            upload_to_end2end_dashboard(dashboard_data)
+        else:
+            pytest.skip("No data to upload to dashboard")
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
index 3c8da5de1d8629b3692646e1aa18120ffcc30b99..86875d2ac7f37e697c5de198e15aa3045a9e3d42 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
@@ -52,7 +52,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.streamline.absorb import (
-    AbsorbScalarMulIntoTopK,
+    AbsorbScalarMulAddIntoTopK,
     AbsorbConsecutiveTransposes,
 )
 from finn.transformation.streamline.collapse_repeated import (
@@ -192,7 +192,7 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt):
     model = model.transform(to_hls.InferGlobalAccPoolLayer())
     model = model.transform(MoveScalarLinearPastInvariants())
     model = model.transform(InsertTopK())
-    model = model.transform(AbsorbScalarMulIntoTopK())
+    model = model.transform(AbsorbScalarMulAddIntoTopK())
     model = model.transform(InferDataTypes())
     model = model.transform(to_hls.InferLabelSelectLayer())
     model = model.transform(AbsorbConsecutiveTransposes())
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index ad69b89d8a159f6eb423c0739bdb4c0dd5103792..75fa625ff00ad6d367e2d6c94d98705f391fb9be 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -48,7 +48,7 @@ from finn.custom_op.registry import getCustomOp
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer


-def make_single_thresholding_modelwrapper(T, pe, idt, odt):
+def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval):
     NumChannels = T.shape[0]

     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, NumChannels])
@@ -66,6 +66,7 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt):
         PE=pe,
         inputDataType=idt.name,
         outputDataType=odt.name,
+        ActVal=actval,
     )
     graph = helper.make_graph(
         nodes=[Thresholding_node],
@@ -112,7 +113,12 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
     # provide non-decreasing thresholds
     T = np.sort(T, axis=1)

-    model = make_single_thresholding_modelwrapper(T, pe, idt, odt)
+    if odt == DataType.BIPOLAR:
+        actval = 0
+    else:
+        actval = odt.min()
+
+    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval)

     if exec_mode == "cppsim":
         model = model.transform(PrepareCppSim())
diff --git a/tests/transformation/test_absorb_mul_into_topk.py b/tests/transformation/test_absorb_mul_into_topk.py
index 1394220f7c336ccea8fe9c494734c4175bf2e847..d0a089f9e5f894a5da635672eb58af1d8ddef3ef 100644
--- a/tests/transformation/test_absorb_mul_into_topk.py
+++ b/tests/transformation/test_absorb_mul_into_topk.py
@@ -35,7 +35,7 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
 from finn.transformation.insert_topk import InsertTopK
-from finn.transformation.streamline.absorb import AbsorbScalarMulIntoTopK
+from finn.transformation.streamline.absorb import AbsorbScalarMulAddIntoTopK
 import finn.core.onnx_exec as oxe

 # parameter to indicate if mul parameter is negative or positive
@@ -49,20 +49,24 @@ def test_absorb_mul_into_topk(mul_positive, scalar):
     shape = [1, 1, 1, 1000]
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 1, 1, 1000])
     a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, shape)
+    b0 = helper.make_tensor_value_info("b0", TensorProto.FLOAT, [1, 1, 1, 1000])
+    c0 = helper.make_tensor_value_info("c0", TensorProto.FLOAT, shape)
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, 1000])

-    mul_node = helper.make_node("Mul", ["inp", "a0"], ["outp"])
+    mul_node = helper.make_node("Mul", ["inp", "a0"], ["b0"])
+    add_node = helper.make_node("Add", ["b0", "c0"], ["outp"])
     mul_graph = helper.make_graph(
-        nodes=[mul_node],
+        nodes=[mul_node, add_node],
         name="mul-graph",
         inputs=[inp],
         outputs=[outp],
-        value_info=[a0],
+        value_info=[a0, b0, c0],
     )

     model = helper.make_model(mul_graph, producer_name="mul_model")
     model = ModelWrapper(model)
     # initialize values
+    # for mul
     if mul_positive is True:
         a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype(
             np.float32
@@ -72,12 +76,17 @@
         )
     else:
         a0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype(
             np.float32
         )
     model.set_initializer("a0", a0_values)
+    # for add
+    c0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype(
+        np.float32
+    )
+    model.set_initializer("c0", c0_values)
     model = model.transform(InsertTopK())
     model = model.transform(InferShapes())
     model = model.transform(InferDataTypes())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
-    model_transformed = model.transform(AbsorbScalarMulIntoTopK())
+    model_transformed = model.transform(AbsorbScalarMulAddIntoTopK())

     # compare execution results
     inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype(
@@ -100,9 +109,5 @@ def test_absorb_mul_into_topk(mul_positive, scalar):
         # check for new order
         assert model.graph != model_transformed.graph
-        assert len(model.graph.node) - 1 == len(model_transformed.graph.node)
+        assert len(model.graph.node) - 2 == len(model_transformed.graph.node)
         assert model_transformed.graph.node[0].op_type == "TopK"
-
-    else:
-        assert (y_values == y_tr_values).all()
-        assert model.graph == model_transformed.graph
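
Usage note (not part of the patch): the new return_full_exec_context flag threads through execute_onnx and execute_node, so intermediate tensors computed inside a StreamingDataflowPartition child model are copied back into the parent execution context, prefixed with the partition node's name. A minimal sketch of how this could be used for debugging, assuming a parent checkpoint produced by the end2end flow (the filename below is hypothetical):

    from finn.core.modelwrapper import ModelWrapper
    from finn.core.onnx_exec import execute_onnx
    from finn.util.test import get_example_input

    # hypothetical checkpoint from the end2end test flow
    parent_model = ModelWrapper("end2end_tfc_w1a1_dataflow_parent.onnx")
    iname = parent_model.graph.input[0].name
    input_tensor_npy = get_example_input("tfc")
    # third positional argument is return_full_exec_context
    ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
    # intermediate tensors from inside the partition appear with a
    # "<partition node name>_" prefix, e.g. "StreamingDataflowPartition_0_..."
    for tname in ret.keys():
        print(tname, ret[tname].shape)

On the board, the generated validate.py is invoked as e.g. python validate.py --dataset mnist --batchsize 1000; the batch size should divide the test set size evenly, since the script reshapes the test set into whole batches.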