From e202bca29297a4c24ad54b253cfda93fbfc7a35d Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu <yamanu@xilinx.com> Date: Sun, 13 Dec 2020 22:01:42 +0100 Subject: [PATCH] Driver data packing + improvements (#261) * [Driver] add driver_base.py as own template file + comments * [Driver] also move validation to own template + use in transform * [Driver] more comments * [Driver] suggested updates from PYNQ team + async mode exec_on_buffers * [Driver] allow smaller batchsize in execute_on_buffers * [Driver] optimize buffer alloc a bit * [Driver] wait condition fix * [Deps] update finn-base * [Deps] update finn-base * [Driver] enable fast_mode, expose more benchmarks * [Deps] update finn-base --- docker/finn_entrypoint.sh | 2 +- .../qnn-data/templates/driver/driver_base.py | 381 ++++++++++++++++++ .../qnn-data/templates/driver/validate.py | 109 +++++ .../fpgadataflow/make_pynq_driver.py | 14 +- .../fpgadataflow/template_driver.py | 379 ----------------- 5 files changed, 500 insertions(+), 385 deletions(-) create mode 100644 src/finn/qnn-data/templates/driver/driver_base.py create mode 100644 src/finn/qnn-data/templates/driver/validate.py diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 939189e17..76d522a0e 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -12,7 +12,7 @@ gecho () { # checkout the correct dependency repo commits # the repos themselves are cloned in the Dockerfile -FINN_BASE_COMMIT=ee0a7df4de00958cf4d71624a56e8a1acf4fd085 +FINN_BASE_COMMIT=efcc0324fbca2476af25f7d3c060d51d5270f09a BREVITAS_COMMIT=aff49758ec445d77c75721c7de3091a2a1797ca8 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py new file mode 100644 index 000000000..267c5e692 --- /dev/null +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -0,0 +1,381 @@ +# Copyright (c) 2020 
Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import time +import os +from pynq import Overlay, allocate +from pynq.ps import Clocks + +from finn.util.data_packing import ( + finnpy_to_packed_bytearray, + packed_bytearray_to_finnpy, +) + +# Driver base class for FINN-generated dataflow accelerators. +# The particulars of the generated accelerator are specified via the +# io_shape_dict (generated by the MakePYNQDriver transformation). 
+ + +class FINNExampleOverlay(Overlay): + def __init__( + self, + bitfile_name, + platform, + io_shape_dict, + batch_size=1, + fclk_mhz=100.0, + device=None, + download=True, + runtime_weight_dir="runtime_weights/", + ): + """Initialize the FINN accelerator. + + Parameters + ---------- + bitfile_name: str + Path to accelerator .bit/.xclbin file + platform: str + FINN platform type, either "alveo" or "zynq-iodma" + io_shape_dict: dict + Dictionary with particulars of the generated accelerator + batch_size: int + Maximum batch size in driver (hardware batchsize is always 1) + fclk_mhz: float + Override the clock frequency, only possible for Zynq. + device: pynq.Device + Which PYNQ device to use, None for default. + download: bool + Whether to flash the bitstream. + runtime_weight_dir: str + Path to runtime weights folder. + """ + super().__init__(bitfile_name, download=download, device=device) + self.runtime_weight_dir = runtime_weight_dir + self._io_shape_dict = io_shape_dict + self.ibuf_packed_device = None + self.obuf_packed_device = None + self.platform = platform + self.batch_size = batch_size + self.fclk_mhz = fclk_mhz + if self.platform == "alveo": + self.idma = self.idma0 + self.odma = self.odma0 + self.odma_handle = None + elif self.platform == "zynq-iodma": + self.idma = self.idma0 + self.odma = self.odma0 + # set the clock frequency as specified by user during transformations + if self.fclk_mhz > 0: + Clocks.fclk0_mhz = self.fclk_mhz + else: + raise ValueError("Supported platforms are zynq-iodma alveo") + # load any runtime weights + self.load_runtime_weights() + + def load_runtime_weights(self, flush_accel=True, verify=True): + """Load any existing runtime weights from the specified dir into the + appropriate layer of the accelerator. Note that this must be enabled + during the accelerator build process. The runtime weights directory + is specified as the class member ``runtime_weight_dir``. 
+ + Parameters + ---------- + flush_accel: bool + Run the accelerator with dummy input after weights are written to + flush any stale weight data in the weight streamer FIFOs. + verify: bool + Whether the written weights will be re-read and verified. + """ + w_filenames = [] + for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir): + w_filenames.extend(filenames) + rt_weight_dict = {} + for w_filename in w_filenames: + if w_filename.endswith(".dat"): + with open(self.runtime_weight_dir + "/" + w_filename, "r") as f: + dat = f.read() + layer_w = np.fromiter( + [int(x, 16) for x in dat.strip().split()], dtype=np.uint32 + ) + layer_ind = int(w_filename.split("_")[0]) + rt_weight_dict[layer_ind] = layer_w + for layer_ind in rt_weight_dict.keys(): + cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind + if cand_if_name in self.ip_dict.keys(): + layer_mmio = getattr( + self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind + ).mmio + layer_w = rt_weight_dict[layer_ind] + layer_mmio.write_mm(0, layer_w.tobytes()) + if verify: + new_w = np.copy(layer_mmio.array[: layer_w.shape[0]]) + assert (layer_w == new_w).all() + if flush_accel: + # run accelerator to flush any stale weights from weight streamer FIFOs + self.execute_on_buffers() + + @property + def idt(self): + return self._io_shape_dict["idt"] + + @property + def odt(self): + return self._io_shape_dict["odt"] + + @property + def ishape_normal(self): + ret = list(self._io_shape_dict["ishape_normal"]) + ret[0] = self.batch_size + return tuple(ret) + + @property + def oshape_normal(self): + ret = list(self._io_shape_dict["oshape_normal"]) + ret[0] = self.batch_size + return tuple(ret) + + @property + def ishape_folded(self): + ret = list(self._io_shape_dict["ishape_folded"]) + ret[0] = self.batch_size + return tuple(ret) + + @property + def oshape_folded(self): + ret = list(self._io_shape_dict["oshape_folded"]) + ret[0] = self.batch_size + return tuple(ret) + + @property + def 
ishape_packed(self): + ret = list(self._io_shape_dict["ishape_packed"]) + ret[0] = self.batch_size + return tuple(ret) + + @property + def oshape_packed(self): + ret = list(self._io_shape_dict["oshape_packed"]) + ret[0] = self.batch_size + return tuple(ret) + + @property + def batch_size(self): + return self._batch_size + + @batch_size.setter + def batch_size(self, value): + self._batch_size = value + # free the old buffers by setting to None + # (reference counting should take care of it) + if self.ibuf_packed_device is not None: + self.ibuf_packed_device = None + if self.obuf_packed_device is not None: + self.obuf_packed_device = None + if self.platform == "alveo": + self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8) + self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8) + else: + self.ibuf_packed_device = allocate( + shape=self.ishape_packed, dtype=np.uint8, cacheable=True + ) + self.obuf_packed_device = allocate( + shape=self.oshape_packed, dtype=np.uint8, cacheable=True + ) + self.obuf_packed = np.empty_like(self.obuf_packed_device) + + def fold_input(self, ibuf_normal): + """Reshapes input in desired shape. + Gets input data (ibuf_normal), checks if data is in expected normal shape. + Returns folded input.""" + # ensure that shape is as expected + assert ibuf_normal.shape == self.ishape_normal + # convert to folded form + ibuf_folded = ibuf_normal.reshape(self.ishape_folded) + return ibuf_folded + + def pack_input(self, ibuf_folded): + """Packs folded input and reverses both SIMD dim and endianness. + Gets input data in folded shape and returns packed input data.""" + ibuf_packed = finnpy_to_packed_bytearray( + ibuf_folded, + self.idt, + reverse_endian=True, + reverse_inner=True, + fast_mode=True, + ) + return ibuf_packed + + def unpack_output(self, obuf_packed): + """Unpacks the packed output buffer from accelerator. 
+ Gets packed output and returns output data in folded shape.""" + obuf_folded = packed_bytearray_to_finnpy( + obuf_packed, + self.odt, + self.oshape_folded, + reverse_endian=True, + reverse_inner=True, + fast_mode=True, + ) + return obuf_folded + + def unfold_output(self, obuf_folded): + """Unfolds output data to normal shape. + Gets folded output data and returns output data in normal shape.""" + obuf_normal = obuf_folded.reshape(self.oshape_normal) + return obuf_normal + + def copy_input_data_to_device(self, data): + """Copies given input data to PYNQ buffer.""" + np.copyto(self.ibuf_packed_device, data) + self.ibuf_packed_device.flush() + + def copy_output_data_from_device(self, data): + """Copies PYNQ output buffer from device.""" + self.obuf_packed_device.invalidate() + np.copyto(data, self.obuf_packed_device) + + def execute_on_buffers(self, asynch=False, batch_size=None): + """Executes accelerator by setting up the DMA(s) on pre-allocated buffers. + Blocking behavior depends on the asynch parameter: + * ``asynch=False`` will block until all transfers are complete. + * ``asynch=True`` won't block, use ``wait_until_finished()`` to wait for + completion + + The optional batch_size parameter can be used to execute on a smaller + batch than the initialized ``self.batch_size``. + """ + if batch_size is None: + batch_size = self.batch_size + assert batch_size <= self.batch_size, "Specified batch_size is too large." 
+ if self.platform == "zynq-iodma": + assert self.odma.read(0x00) & 0x4 != 0, "Output DMA is not idle" + # manually launch IODMAs since signatures are missing + self.idma.write(0x10, self.ibuf_packed_device.device_address) + self.idma.write(0x1C, batch_size) + self.odma.write(0x10, self.obuf_packed_device.device_address) + self.odma.write(0x1C, batch_size) + self.idma.write(0x00, 1) + self.odma.write(0x00, 1) + elif self.platform == "alveo": + assert self.odma_handle is None, "Output DMA is already running" + self.idma.start(self.ibuf_packed_device, batch_size) + self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size) + else: + raise Exception("Unrecognized platform: %s" % self.platform) + # blocking behavior depends on asynch parameter + if asynch is False: + self.wait_until_finished() + + def wait_until_finished(self): + "Block until the output DMA has finished writing." + if self.platform == "zynq-iodma": + # check if output IODMA is finished via register reads + status = self.odma.read(0x00) + while status & 0x2 == 0: + status = self.odma.read(0x00) + elif self.platform == "alveo": + assert self.odma_handle is not None, "No odma_handle to wait on" + self.odma_handle.wait() + self.odma_handle = None + else: + raise Exception("Unrecognized platform: %s" % self.platform) + + def execute(self, input_npy): + """Given input numpy array, first perform necessary packing and copying + to device buffers, execute on accelerator, then unpack output and return + output numpy array from accelerator.""" + ibuf_folded = self.fold_input(input_npy) + ibuf_packed = self.pack_input(ibuf_folded) + self.copy_input_data_to_device(ibuf_packed) + self.execute_on_buffers() + self.copy_output_data_from_device(self.obuf_packed) + obuf_folded = self.unpack_output(self.obuf_packed) + obuf_normal = self.unfold_output(obuf_folded) + return obuf_normal + + def throughput_test(self): + """Run accelerator with empty inputs to measure throughput and other metrics. 
+ Returns dictionary with various metrics.""" + # dictionary for results of throughput test + res = {} + start = time.time() + self.execute_on_buffers() + end = time.time() + runtime = end - start + res["runtime[ms]"] = runtime * 1000 + res["throughput[images/s]"] = self.batch_size / runtime + res["DRAM_in_bandwidth[Mb/s]"] = ( + np.prod(self.ishape_packed) * 0.000001 / runtime + ) + res["DRAM_out_bandwidth[Mb/s]"] = ( + np.prod(self.oshape_packed) * 0.000001 / runtime + ) + if self.platform != "alveo": + res["fclk[mhz]"] = Clocks.fclk0_mhz + else: + res["fclk[mhz]"] = self.fclk_mhz + res["batch_size"] = self.batch_size + # also benchmark driver-related overheads + input_npy = np.zeros(self.ishape_normal, dtype=self.idt.to_numpy_dt()) + start = time.time() + ibuf_folded = self.fold_input(input_npy) + end = time.time() + runtime = end - start + res["fold_input[ms]"] = runtime + + start = time.time() + ibuf_packed = self.pack_input(ibuf_folded) + end = time.time() + runtime = end - start + res["pack_input[ms]"] = runtime + + start = time.time() + self.copy_input_data_to_device(ibuf_packed) + end = time.time() + runtime = end - start + res["copy_input_data_to_device[ms]"] = runtime + + start = time.time() + self.copy_output_data_from_device(self.obuf_packed) + end = time.time() + runtime = end - start + res["copy_output_data_from_device[ms]"] = runtime + + start = time.time() + obuf_folded = self.unpack_output(self.obuf_packed) + end = time.time() + runtime = end - start + res["unpack_output[ms]"] = runtime + + start = time.time() + self.unfold_output(obuf_folded) + end = time.time() + runtime = end - start + res["unfold_output[ms]"] = runtime + return res diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py new file mode 100644 index 000000000..4aa7d67aa --- /dev/null +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -0,0 +1,109 @@ +# Copyright (c) 2020 Xilinx, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +from driver import io_shape_dict +from driver_base import FINNExampleOverlay +import numpy as np + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Validate top-1 accuracy for FINN-generated accelerator" + ) + parser.add_argument( + "--batchsize", help="number of samples for inference", type=int, default=100 + ) + parser.add_argument( + "--dataset", help="dataset to use (mnist of cifar10)", required=True + ) + parser.add_argument( + "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma" + ) + parser.add_argument( + "--bitfile", help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit" + ) + parser.add_argument( + "--dataset_root", help="dataset root dir for download/reuse", default="/tmp" + ) + # parse arguments + args = parser.parse_args() + bsize = args.batchsize + dataset = args.dataset + bitfile = args.bitfile + platform = args.platform + dataset_root = args.dataset_root + + if dataset == "mnist": + from dataset_loading import mnist + + trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data( + dataset_root, download=True, one_hot=False + ) + elif dataset == "cifar10": + from dataset_loading import cifar + + trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data( + dataset_root, download=True, one_hot=False + ) + else: + raise Exception("Unrecognized dataset") + + test_imgs = testx + test_labels = testy + + ok = 0 + nok = 0 + total = test_imgs.shape[0] + + driver = FINNExampleOverlay( + bitfile_name=bitfile, + platform=platform, + io_shape_dict=io_shape_dict, + batch_size=bsize, + runtime_weight_dir="runtime_weights/", + ) + + n_batches = int(total / bsize) + + test_imgs = test_imgs.reshape(n_batches, bsize, -1) + test_labels = test_labels.reshape(n_batches, bsize) + + for i in range(n_batches): + ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape) + exp = test_labels[i] + driver.copy_input_data_to_device(ibuf_normal) + driver.execute_on_buffers() 
+ obuf_normal = np.empty_like(driver.obuf_packed_device) + driver.copy_output_data_from_device(obuf_normal) + ret = np.bincount(obuf_normal.flatten() == exp.flatten()) + nok += ret[0] + ok += ret[1] + print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok)) + + acc = 100.0 * ok / (total) + print("Final accuracy: %f" % acc) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 27708b635..84dc01e53 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -35,6 +35,7 @@ import finn.core.datatype as dtp from finn.custom_op.registry import getCustomOp import os import warnings +import pkg_resources as pk from . import template_driver @@ -62,9 +63,11 @@ class MakePYNQDriver(Transformation): model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) # create the base FINN driver -- same for all accels + driver_base_template = pk.resource_filename( + "finn.qnn-data", "templates/driver/driver_base.py" + ) driver_base_py = pynq_driver_dir + "/driver_base.py" - with open(driver_base_py, "w") as f: - f.write(template_driver.driver_base) + shutil.copy(driver_base_template, driver_base_py) # extract input-output shapes from the graph # TODO convert this to an analysis pass? 
@@ -125,9 +128,10 @@ class MakePYNQDriver(Transformation): # add validate.py to run full top-1 test (only for suitable networks) validate_py = pynq_driver_dir + "/validate.py" - validate_src = template_driver.pynq_validation_template - with open(validate_py, "w") as f: - f.write(validate_src) + validate_template = pk.resource_filename( + "finn.qnn-data", "templates/driver/validate.py" + ) + shutil.copy(validate_template, validate_py) # copy all the dependencies into the driver folder # driver imports utils/data_packing and core/datatype diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py index eec79a24c..b59520571 100644 --- a/src/finn/transformation/fpgadataflow/template_driver.py +++ b/src/finn/transformation/fpgadataflow/template_driver.py @@ -28,292 +28,6 @@ # flake8: noqa -driver_base = ''' -# Copyright (c) 2020 Xilinx, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of Xilinx nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import time -import os -from pynq import Overlay, allocate -from pynq.ps import Clocks - -from finn.util.data_packing import ( - finnpy_to_packed_bytearray, - packed_bytearray_to_finnpy, -) - - -class FINNExampleOverlay(Overlay): - def __init__( - self, - bitfile_name, - platform, - io_shape_dict, - batch_size=1, - fclk_mhz=100.0, - download=None, - runtime_weight_dir="runtime_weights/" - ): - super().__init__(bitfile_name, download) - self.runtime_weight_dir = runtime_weight_dir - self._io_shape_dict = io_shape_dict - self.ibuf_packed_device = None - self.obuf_packed_device = None - self.platform = platform - self.batch_size = batch_size - self.fclk_mhz = fclk_mhz - if self.platform == "alveo": - self.idma = self.idma0 - self.odma = self.odma0 - elif self.platform == "zynq-iodma": - self.idma = self.idma0 - self.odma = self.odma0 - # set the clock frequency as specified by user during transformations - if self.fclk_mhz > 0: - Clocks.fclk0_mhz = self.fclk_mhz - else: - raise ValueError("Supported platforms are zynq-iodma alveo") - - # allocate a PYNQ buffer for the packed input and buffer - if self.platform == "alveo": - self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8) - self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8) - else: - self.ibuf_packed_device = allocate( - shape=self.ishape_packed, dtype=np.uint8, cacheable=True - ) - self.obuf_packed_device = allocate( - 
shape=self.oshape_packed, dtype=np.uint8, cacheable=True - ) - # load any runtime weights - self.load_runtime_weights() - - def load_runtime_weights(self, flush_accel=True, verify=True): - w_filenames = [] - for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir): - w_filenames.extend(filenames) - rt_weight_dict = {} - for w_filename in w_filenames: - if w_filename.endswith(".dat"): - with open(self.runtime_weight_dir + "/" + w_filename, "r") as f: - dat = f.read() - layer_w=np.fromiter([int(x,16) for x in dat.strip().split()], dtype=np.uint32) - layer_ind=int(w_filename.split("_")[0]) - rt_weight_dict[layer_ind] = layer_w - for layer_ind in rt_weight_dict.keys(): - cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind - if cand_if_name in self.ip_dict.keys(): - layer_mmio = getattr(self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind).mmio - layer_w = rt_weight_dict[layer_ind] - layer_mmio.write_mm(0, layer_w.tobytes()) - if verify: - new_w = np.copy(layer_mmio.array[:layer_w.shape[0]]) - assert (layer_w == new_w).all() - if flush_accel: - # run accelerator to flush any stale weights from weight streamer FIFOs - self.execute_on_buffers() - - @property - def idt(self): - return self._io_shape_dict["idt"] - - @property - def odt(self): - return self._io_shape_dict["odt"] - - @property - def ishape_normal(self): - ret = list(self._io_shape_dict["ishape_normal"]) - ret[0] = self.batch_size - return tuple(ret) - - @property - def oshape_normal(self): - ret = list(self._io_shape_dict["oshape_normal"]) - ret[0] = self.batch_size - return tuple(ret) - - @property - def ishape_folded(self): - ret = list(self._io_shape_dict["ishape_folded"]) - ret[0] = self.batch_size - return tuple(ret) - - @property - def oshape_folded(self): - ret = list(self._io_shape_dict["oshape_folded"]) - ret[0] = self.batch_size - return tuple(ret) - - @property - def ishape_packed(self): - ret = list(self._io_shape_dict["ishape_packed"]) - ret[0] = 
self.batch_size - return tuple(ret) - - @property - def oshape_packed(self): - ret = list(self._io_shape_dict["oshape_packed"]) - ret[0] = self.batch_size - return tuple(ret) - - @property - def batch_size(self): - return self._batch_size - - @batch_size.setter - def batch_size(self, value): - self._batch_size = value - # free the old buffers - if self.ibuf_packed_device is not None: - self.ibuf_packed_device.freebuffer() - if self.obuf_packed_device is not None: - self.obuf_packed_device.freebuffer() - if self.platform == "alveo": - self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8) - self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8) - else: - self.ibuf_packed_device = allocate( - shape=self.ishape_packed, dtype=np.uint8, cacheable=True - ) - self.obuf_packed_device = allocate( - shape=self.oshape_packed, dtype=np.uint8, cacheable=True - ) - - def fold_input(self, ibuf_normal): - """Reshapes input in desired shape. - Gets input data (ibuf_normal), checks if data is in expected normal shape. - Returns folded input.""" - # ensure that shape is as expected - assert ibuf_normal.shape == self.ishape_normal - # convert to folded form - ibuf_folded = ibuf_normal.reshape(self.ishape_folded) - return ibuf_folded - - def pack_input(self, ibuf_folded): - """Packs folded input and reverses both SIMD dim and endianness. - Gets input data in folded shape and returns packed input data.""" - ibuf_packed = finnpy_to_packed_bytearray( - ibuf_folded, self.idt, reverse_endian=True, reverse_inner=True - ) - return ibuf_packed - - def unpack_output(self, obuf_packed): - """Unpacks the packed output buffer from accelerator. - Gets packed output and returns output data in folded shape.""" - obuf_folded = packed_bytearray_to_finnpy( - obuf_packed, - self.odt, - self.oshape_folded, - reverse_endian=True, - reverse_inner=True, - ) - return obuf_folded - - def unfold_output(self, obuf_folded): - """Unfolds output data to normal shape. 
- Gets folded output data and returns output data in normal shape.""" - obuf_normal = obuf_folded.reshape(self.oshape_normal) - return obuf_normal - - def copy_input_data_to_device(self, data): - """Copies given input data to PYNQ buffer.""" - np.copyto(self.ibuf_packed_device, data) - self.ibuf_packed_device.flush() - - def copy_output_data_from_device(self, data): - """Copies PYNQ output buffer from device.""" - self.obuf_packed_device.invalidate() - np.copyto(data, self.obuf_packed_device) - - def execute_on_buffers(self): - """Executes accelerator by setting up the DMA(s) and - waiting until all transfers/calls complete. Uses only member variables and - returns nothing.""" - if self.platform == "zynq-iodma": - # manually launch IODMAs since signatures are missing - self.idma.write(0x10, self.ibuf_packed_device.device_address) - self.idma.write(0x1C, self.batch_size) - self.odma.write(0x10, self.obuf_packed_device.device_address) - self.odma.write(0x1C, self.batch_size) - self.idma.write(0x00, 1) - self.odma.write(0x00, 1) - # wait until output IODMA is finished - status = self.odma.read(0x00) - while status & 0x2 == 0: - status = self.odma.read(0x00) - elif self.platform == "alveo": - self.idma.start_sw(self.ibuf_packed_device, self.batch_size) - odma_handle = self.odma.start_sw(self.obuf_packed_device, self.batch_size) - odma_handle.wait() - else: - raise Exception("Unrecognized platform: %s" % self.platform) - - def execute(self, input_npy): - """Given input numpy array, first perform necessary packing and copying - to device buffers, execute on accelerator, then unpack output and return - output numpy array from accelerator.""" - ibuf_folded = self.fold_input(input_npy) - ibuf_packed = self.pack_input(ibuf_folded) - self.copy_input_data_to_device(ibuf_packed) - self.execute_on_buffers() - obuf_packed = np.empty_like(self.obuf_packed_device) - self.copy_output_data_from_device(obuf_packed) - obuf_folded = self.unpack_output(obuf_packed) - obuf_normal = 
self.unfold_output(obuf_folded) - return obuf_normal - - def throughput_test(self): - "Run accelerator with empty inputs to measure throughput and other metrics." - # dictionary for results of throughput test - res = {} - start = time.time() - self.execute_on_buffers() - end = time.time() - runtime = end - start - res["runtime[ms]"] = runtime * 1000 - res["throughput[images/s]"] = self.batch_size / runtime - res["DRAM_in_bandwidth[Mb/s]"] = ( - np.prod(self.ishape_packed) * 0.000001 / runtime - ) - res["DRAM_out_bandwidth[Mb/s]"] = ( - np.prod(self.oshape_packed) * 0.000001 / runtime - ) - if self.platform != "alveo": - res["fclk[mhz]"] = Clocks.fclk0_mhz - else: - res["fclk[mhz]"] = self.fclk_mhz - res["batch_size"] = self.batch_size - return res -''' - - pynq_driver_template = """ # Copyright (c) 2020 Xilinx, Inc. # All rights reserved. @@ -419,96 +133,3 @@ if __name__ == "__main__": else: raise Exception("Exec mode has to be set to remote_pynq or throughput_test") """ - -pynq_validation_template = """ -# Copyright (c) 2020 Xilinx, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of Xilinx nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import argparse -from driver import io_shape_dict -from driver_base import FINNExampleOverlay -import numpy as np - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Validate top-1 accuracy for FINN-generated accelerator') - parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=100) - parser.add_argument('--dataset', help='dataset to use (mnist of cifar10)', required=True) - parser.add_argument('--platform', help='Target platform: zynq-iodma alveo', default="zynq-iodma") - parser.add_argument('--bitfile', help='name of bitfile (i.e. 
"resizer.bit")', default="resizer.bit") - parser.add_argument('--dataset_root', help='dataset root dir for download/reuse', default="/tmp") - # parse arguments - args = parser.parse_args() - bsize = args.batchsize - dataset = args.dataset - bitfile = args.bitfile - platform = args.platform - dataset_root = args.dataset_root - - - if dataset == "mnist": - from dataset_loading import mnist - trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data(dataset_root, download=True, one_hot=False) - elif dataset == "cifar10": - from dataset_loading import cifar - trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data(dataset_root, download=True, one_hot=False) - else: - raise Exception("Unrecognized dataset") - - test_imgs = testx - test_labels = testy - - ok = 0 - nok = 0 - total = test_imgs.shape[0] - - driver = FINNExampleOverlay( - bitfile_name = bitfile, platform = platform, - io_shape_dict = io_shape_dict, batch_size = bsize, - runtime_weight_dir = "runtime_weights/" - ) - - n_batches = int(total / bsize) - - test_imgs = test_imgs.reshape(n_batches, bsize, -1) - test_labels = test_labels.reshape(n_batches, bsize) - - for i in range(n_batches): - ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape) - exp = test_labels[i] - driver.copy_input_data_to_device(ibuf_normal) - driver.execute_on_buffers() - obuf_normal = np.empty_like(driver.obuf_packed_device) - driver.copy_output_data_from_device(obuf_normal) - ret = np.bincount(obuf_normal.flatten() == exp.flatten()) - nok += ret[0] - ok += ret[1] - print("batch %d / %d : total OK %d NOK %d" % (i+1, n_batches, ok, nok)) - - acc = 100.0 * ok / (total) - print("Final accuracy: %f" % acc) -""" -- GitLab