Driver data packing + improvements (#261)

* [Driver] add driver_base.py as own template file + comments * [Driver] also move validation to own template + use in transform * [Driver] more comments * [Driver] suggested updates from PYNQ team + async mode exec_on_buffers * [Driver] allow smaller batchsize in execute_on_buffers * [Driver] optimize buffer alloc a bit * [Driver] wait condition fix * [Deps] update finn-base * [Deps] update finn-base * [Driver] enable fast_mode, expose more benchmarks * [Deps] update finn-base

Driver data packing + improvements (#261)
* [Driver] add driver_base.py as own template file + comments * [Driver] also move validation to own template + use in transform * [Driver] more comments * [Driver] suggested updates from PYNQ team + async mode exec_on_buffers * [Driver] allow smaller batchsize in execute_on_buffers * [Driver] optimize buffer alloc a bit * [Driver] wait condition fix * [Deps] update finn-base * [Deps] update finn-base * [Driver] enable fast_mode, expose more benchmarks * [Deps] update finn-base
e202bca2 · Yaman Umuroglu · GitHub · 72b11ca6 · e202bca2 · e202bca2
Unverified Commit e202bca2 authored 4 years ago by Yaman Umuroglu Committed by GitHub 4 years ago
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -12,7 +12,7 @@ gecho () {
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=ee0a7df4de00958cf4d71624a56e8a1acf4fd085
+FINN_BASE_COMMIT=efcc0324fbca2476af25f7d3c060d51d5270f09a
 BREVITAS_COMMIT=aff49758ec445d77c75721c7de3091a2a1797ca8
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
 HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e

--- a/src/finn/qnn-data/templates/driver/driver_base.py
+++ b/src/finn/qnn-data/templates/driver/driver_base.py
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import numpy as np
+import time
+import os
+from pynq import Overlay, allocate
+from pynq.ps import Clocks
+from finn.util.data_packing import (
+    finnpy_to_packed_bytearray,
+    packed_bytearray_to_finnpy,
+)
+# Driver base class for FINN-generated dataflow accelerators.
+# The particulars of the generated accelerator are specified via the
+# io_shape_dict (generated by the MakePYNQDriver transformation).
+class FINNExampleOverlay(Overlay):
+    """PYNQ overlay wrapper that drives a FINN-generated dataflow accelerator.
+    Covers input folding/packing, device buffer handling, DMA launch and output
+    unpacking. Shapes and datatypes are read from ``io_shape_dict``; the first
+    dimension of every shape is replaced by the driver ``batch_size``.
+    """
+    def __init__(
+        self,
+        bitfile_name,
+        platform,
+        io_shape_dict,
+        batch_size=1,
+        fclk_mhz=100.0,
+        device=None,
+        download=True,
+        runtime_weight_dir="runtime_weights/",
+    ):
+        """Initialize the FINN accelerator.
+        Parameters
+        ----------
+        bitfile_name: str
+            Path to accelerator .bit/.xclbin file
+        platform: str
+            FINN platform type, either "alveo" or "zynq-iodma"
+        io_shape_dict: dict
+            Dictionary with particulars of the generated accelerator
+        batch_size: int
+            Maximum batch size in driver (hardware batchsize is always 1)
+        fclk_mhz: float
+            Override the clock frequency, only possible for Zynq.
+        device: pynq.Device
+            Which PYNQ device to use, None for default.
+        download: bool
+            Whether to flash the bitstream.
+        runtime_weight_dir: str
+            Path to runtime weights folder.
+        Raises
+        ------
+        ValueError
+            If ``platform`` is neither "alveo" nor "zynq-iodma".
+        """
+        # fail fast: reject unknown platforms before the bitstream is flashed
+        # or any device buffer is allocated
+        if platform not in ("alveo", "zynq-iodma"):
+            raise ValueError("Supported platforms are zynq-iodma alveo")
+        super().__init__(bitfile_name, download=download, device=device)
+        self.runtime_weight_dir = runtime_weight_dir
+        self._io_shape_dict = io_shape_dict
+        # no device buffers yet; assigning batch_size below allocates them
+        self.ibuf_packed_device = None
+        self.obuf_packed_device = None
+        # platform must be set first, the batch_size setter reads it
+        self.platform = platform
+        self.batch_size = batch_size
+        self.fclk_mhz = fclk_mhz
+        self.idma = self.idma0
+        self.odma = self.odma0
+        if self.platform == "alveo":
+            # handle of the output DMA call in flight, None while idle
+            self.odma_handle = None
+        elif self.fclk_mhz > 0:
+            # zynq-iodma only:
+            # set the clock frequency as specified by user during transformations
+            Clocks.fclk0_mhz = self.fclk_mhz
+        # load any runtime weights
+        self.load_runtime_weights()
+    def load_runtime_weights(self, flush_accel=True, verify=True):
+        """Load any existing runtime weights from the specified dir into the
+        appropriate layer of the accelerator. Note that this must be enabled
+        during the accelerator build process. The runtime weights directory
+        is specified as the class member ``runtime_weight_dir``.
+        Only files ending in ``.dat`` are considered; each is read as
+        whitespace-separated hexadecimal words, and the integer before the
+        first underscore of the file name selects the target layer.
+        Parameters
+        ----------
+        flush_accel: bool
+            Run the accelerator with dummy input after weights are written to
+            flush any stale weight data in the weight streamer FIFOs.
+        verify: bool
+            Whether the written weights will be re-read and verified.
+        Raises
+        ------
+        ValueError
+            If a ``.dat`` file name does not start with an integer layer index
+            or its content is not valid hexadecimal.
+        AssertionError
+            If ``verify`` is set and the read-back weights differ.
+        """
+        rt_weight_dict = {}
+        for dirpath, _dirnames, filenames in os.walk(self.runtime_weight_dir):
+            for w_filename in filenames:
+                # skip unrelated files instead of re-parsing stale contents
+                if not w_filename.endswith(".dat"):
+                    continue
+                # join with dirpath so files in sub-directories resolve too
+                with open(os.path.join(dirpath, w_filename), "r") as f:
+                    dat = f.read()
+                layer_w = np.fromiter(
+                    [int(x, 16) for x in dat.strip().split()], dtype=np.uint32
+                )
+                layer_ind = int(w_filename.split("_")[0])
+                rt_weight_dict[layer_ind] = layer_w
+        for layer_ind, layer_w in rt_weight_dict.items():
+            cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind
+            # weights for layers without a matching AXI-lite interface are ignored
+            if cand_if_name not in self.ip_dict:
+                continue
+            layer_mmio = getattr(
+                self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind
+            ).mmio
+            layer_mmio.write_mm(0, layer_w.tobytes())
+            if verify:
+                new_w = np.copy(layer_mmio.array[: layer_w.shape[0]])
+                assert (layer_w == new_w).all(), (
+                    "Runtime weight readback mismatch for layer %d" % layer_ind
+                )
+        if flush_accel:
+            # run accelerator to flush any stale weights from weight streamer FIFOs
+            self.execute_on_buffers()
+    def _batched_shape(self, key):
+        """Return the ``io_shape_dict`` shape stored under ``key`` with its
+        first dimension replaced by the current driver batch size."""
+        shape = list(self._io_shape_dict[key])
+        shape[0] = self.batch_size
+        return tuple(shape)
+    @property
+    def idt(self):
+        """Input datatype, as stored under ``idt`` in ``io_shape_dict``."""
+        return self._io_shape_dict["idt"]
+    @property
+    def odt(self):
+        """Output datatype, as stored under ``odt`` in ``io_shape_dict``."""
+        return self._io_shape_dict["odt"]
+    @property
+    def ishape_normal(self):
+        """Normal (unfolded) input shape for the current batch size."""
+        return self._batched_shape("ishape_normal")
+    @property
+    def oshape_normal(self):
+        """Normal (unfolded) output shape for the current batch size."""
+        return self._batched_shape("oshape_normal")
+    @property
+    def ishape_folded(self):
+        """Folded input shape for the current batch size."""
+        return self._batched_shape("ishape_folded")
+    @property
+    def oshape_folded(self):
+        """Folded output shape for the current batch size."""
+        return self._batched_shape("oshape_folded")
+    @property
+    def ishape_packed(self):
+        """Packed input shape (shape of the input device buffer)."""
+        return self._batched_shape("ishape_packed")
+    @property
+    def oshape_packed(self):
+        """Packed output shape (shape of the output device buffer)."""
+        return self._batched_shape("oshape_packed")
+    @property
+    def batch_size(self):
+        """Maximum number of samples processed per accelerator execution."""
+        return self._batch_size
+    @batch_size.setter
+    def batch_size(self, value):
+        """Set the batch size and re-allocate all packed buffers to match."""
+        self._batch_size = value
+        # free the old buffers by setting to None
+        # (reference counting should take care of it)
+        self.ibuf_packed_device = None
+        self.obuf_packed_device = None
+        # buffers are requested as cacheable on every platform except alveo
+        alloc_kwargs = {} if self.platform == "alveo" else {"cacheable": True}
+        self.ibuf_packed_device = allocate(
+            shape=self.ishape_packed, dtype=np.uint8, **alloc_kwargs
+        )
+        self.obuf_packed_device = allocate(
+            shape=self.oshape_packed, dtype=np.uint8, **alloc_kwargs
+        )
+        # staging copy of the packed output, reused by execute()
+        self.obuf_packed = np.empty_like(self.obuf_packed_device)
+    def fold_input(self, ibuf_normal):
+        """Reshapes input in desired shape.
+        Gets input data (ibuf_normal), checks if data is in expected normal shape.
+        Returns folded input."""
+        # ensure that shape is as expected
+        assert ibuf_normal.shape == self.ishape_normal
+        # convert to folded form
+        return ibuf_normal.reshape(self.ishape_folded)
+    def pack_input(self, ibuf_folded):
+        """Packs folded input and reverses both SIMD dim and endianness.
+        Gets input data in folded shape and returns packed input data."""
+        return finnpy_to_packed_bytearray(
+            ibuf_folded,
+            self.idt,
+            reverse_endian=True,
+            reverse_inner=True,
+            fast_mode=True,
+        )
+    def unpack_output(self, obuf_packed):
+        """Unpacks the packed output buffer from accelerator.
+        Gets packed output and returns output data in folded shape."""
+        return packed_bytearray_to_finnpy(
+            obuf_packed,
+            self.odt,
+            self.oshape_folded,
+            reverse_endian=True,
+            reverse_inner=True,
+            fast_mode=True,
+        )
+    def unfold_output(self, obuf_folded):
+        """Unfolds output data to normal shape.
+        Gets folded output data and returns output data in normal shape."""
+        return obuf_folded.reshape(self.oshape_normal)
+    def copy_input_data_to_device(self, data):
+        """Copies given input data to PYNQ buffer."""
+        np.copyto(self.ibuf_packed_device, data)
+        # flush after the write so the data reaches the device-visible memory
+        self.ibuf_packed_device.flush()
+    def copy_output_data_from_device(self, data):
+        """Copies PYNQ output buffer from device."""
+        # invalidate before the read so stale cached contents are not copied
+        self.obuf_packed_device.invalidate()
+        np.copyto(data, self.obuf_packed_device)
+    def execute_on_buffers(self, asynch=False, batch_size=None):
+        """Executes accelerator by setting up the DMA(s) on pre-allocated buffers.
+        Blocking behavior depends on the asynch parameter:
+        * ``asynch=False`` (default) will block until all transfers are complete.
+        * ``asynch=True`` won't block, use ``wait_until_finished()`` to wait
+          for completion
+        The optional batch_size parameter can be used to execute on a smaller
+        batch than the initialized ``self.batch_size``.
+        """
+        if batch_size is None:
+            batch_size = self.batch_size
+        assert batch_size <= self.batch_size, "Specified batch_size is too large."
+        if self.platform == "zynq-iodma":
+            assert (self.odma.read(0x00) & 0x4) != 0, "Output DMA is not idle"
+            # manually launch IODMAs since signatures are missing
+            self.idma.write(0x10, self.ibuf_packed_device.device_address)
+            self.idma.write(0x1C, batch_size)
+            self.odma.write(0x10, self.obuf_packed_device.device_address)
+            self.odma.write(0x1C, batch_size)
+            self.idma.write(0x00, 1)
+            self.odma.write(0x00, 1)
+        elif self.platform == "alveo":
+            assert self.odma_handle is None, "Output DMA is already running"
+            self.idma.start(self.ibuf_packed_device, batch_size)
+            self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size)
+        else:
+            raise Exception("Unrecognized platform: %s" % self.platform)
+        # blocking behavior depends on asynch parameter
+        if not asynch:
+            self.wait_until_finished()
+    def wait_until_finished(self):
+        """Block until the output DMA has finished writing."""
+        if self.platform == "zynq-iodma":
+            # check if output IODMA is finished via register reads
+            # (busy-waits on bit 0x2 of the register at offset 0x00)
+            while (self.odma.read(0x00) & 0x2) == 0:
+                pass
+        elif self.platform == "alveo":
+            assert self.odma_handle is not None, "No odma_handle to wait on"
+            self.odma_handle.wait()
+            # mark the output DMA as idle again so a new execution can start
+            self.odma_handle = None
+        else:
+            raise Exception("Unrecognized platform: %s" % self.platform)
+    def execute(self, input_npy):
+        """Given input numpy array, first perform necessary packing and copying
+        to device buffers, execute on accelerator, then unpack output and return
+        output numpy array from accelerator."""
+        ibuf_folded = self.fold_input(input_npy)
+        ibuf_packed = self.pack_input(ibuf_folded)
+        self.copy_input_data_to_device(ibuf_packed)
+        self.execute_on_buffers()
+        self.copy_output_data_from_device(self.obuf_packed)
+        obuf_folded = self.unpack_output(self.obuf_packed)
+        return self.unfold_output(obuf_folded)
+    @staticmethod
+    def _elapsed_ms(func, *args):
+        """Call ``func(*args)`` once and return ``(result, wall time in ms)``."""
+        start = time.time()
+        ret = func(*args)
+        return ret, (time.time() - start) * 1000
+    def throughput_test(self):
+        """Run accelerator with empty inputs to measure throughput and other metrics.
+        Returns dictionary with various metrics. All keys ending in ``[ms]``
+        hold milliseconds."""
+        # dictionary for results of throughput test
+        res = {}
+        start = time.time()
+        self.execute_on_buffers()
+        runtime = time.time() - start
+        res["runtime[ms]"] = runtime * 1000
+        res["throughput[images/s]"] = self.batch_size / runtime
+        # packed buffers are uint8, so the element count equals the byte count;
+        # the values below are therefore 1e6 bytes per second
+        res["DRAM_in_bandwidth[Mb/s]"] = (
+            np.prod(self.ishape_packed) * 0.000001 / runtime
+        )
+        res["DRAM_out_bandwidth[Mb/s]"] = (
+            np.prod(self.oshape_packed) * 0.000001 / runtime
+        )
+        if self.platform != "alveo":
+            res["fclk[mhz]"] = Clocks.fclk0_mhz
+        else:
+            res["fclk[mhz]"] = self.fclk_mhz
+        res["batch_size"] = self.batch_size
+        # also benchmark driver-related overheads
+        input_npy = np.zeros(self.ishape_normal, dtype=self.idt.to_numpy_dt())
+        ibuf_folded, res["fold_input[ms]"] = self._elapsed_ms(
+            self.fold_input, input_npy
+        )
+        ibuf_packed, res["pack_input[ms]"] = self._elapsed_ms(
+            self.pack_input, ibuf_folded
+        )
+        _, res["copy_input_data_to_device[ms]"] = self._elapsed_ms(
+            self.copy_input_data_to_device, ibuf_packed
+        )
+        _, res["copy_output_data_from_device[ms]"] = self._elapsed_ms(
+            self.copy_output_data_from_device, self.obuf_packed
+        )
+        obuf_folded, res["unpack_output[ms]"] = self._elapsed_ms(
+            self.unpack_output, self.obuf_packed
+        )
+        _, res["unfold_output[ms]"] = self._elapsed_ms(
+            self.unfold_output, obuf_folded
+        )
+        return res
--- a/src/finn/qnn-data/templates/driver/validate.py
+++ b/src/finn/qnn-data/templates/driver/validate.py
+# Copyright (c) 2020 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import argparse
+from driver import io_shape_dict
+from driver_base import FINNExampleOverlay
+import numpy as np
+# Script entry point: measures top-1 accuracy of the accelerator on the test
+# split of the selected dataset, one driver batch at a time.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Validate top-1 accuracy for FINN-generated accelerator"
+    )
+    parser.add_argument(
+        "--batchsize", help="number of samples for inference", type=int, default=100
+    )
+    parser.add_argument(
+        "--dataset", help="dataset to use (mnist or cifar10)", required=True
+    )
+    parser.add_argument(
+        "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma"
+    )
+    parser.add_argument(
+        "--bitfile", help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit"
+    )
+    parser.add_argument(
+        "--dataset_root", help="dataset root dir for download/reuse", default="/tmp"
+    )
+    # parse arguments
+    args = parser.parse_args()
+    bsize = args.batchsize
+    dataset = args.dataset
+    bitfile = args.bitfile
+    platform = args.platform
+    dataset_root = args.dataset_root
+    if bsize < 1:
+        parser.error("--batchsize must be a positive integer")
+    # dataset loaders are imported lazily so only the requested one is needed
+    if dataset == "mnist":
+        from dataset_loading import mnist
+        trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data(
+            dataset_root, download=True, one_hot=False
+        )
+    elif dataset == "cifar10":
+        from dataset_loading import cifar
+        trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data(
+            dataset_root, download=True, one_hot=False
+        )
+    else:
+        raise Exception("Unrecognized dataset")
+    test_imgs = testx
+    test_labels = testy
+    ok = 0
+    nok = 0
+    total = test_imgs.shape[0]
+    # only whole batches can be run; check this before touching the hardware
+    n_batches = total // bsize
+    if n_batches == 0:
+        raise ValueError(
+            "batchsize %d exceeds the %d available test samples" % (bsize, total)
+        )
+    n_used = n_batches * bsize
+    if n_used != total:
+        print("Ignoring the last %d test samples (partial batch)" % (total - n_used))
+    driver = FINNExampleOverlay(
+        bitfile_name=bitfile,
+        platform=platform,
+        io_shape_dict=io_shape_dict,
+        batch_size=bsize,
+        runtime_weight_dir="runtime_weights/",
+    )
+    test_imgs = test_imgs[:n_used].reshape(n_batches, bsize, -1)
+    test_labels = test_labels[:n_used].reshape(n_batches, bsize)
+    # host-side copy of the packed output, reused for every batch
+    obuf_packed = np.empty_like(driver.obuf_packed_device)
+    for i in range(n_batches):
+        # images are written to the packed input buffer as-is, without the
+        # driver's fold/pack steps
+        # NOTE(review): presumably valid because the raw pixels already match
+        # the packed byte layout -- confirm for new networks
+        ibuf_packed = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
+        expected = test_labels[i]
+        driver.copy_input_data_to_device(ibuf_packed)
+        driver.execute_on_buffers()
+        driver.copy_output_data_from_device(obuf_packed)
+        # the packed output is compared to the labels directly
+        # NOTE(review): assumes one class-index byte per sample -- confirm
+        # minlength=2 keeps ret[1] valid when a batch has no correct prediction
+        ret = np.bincount(obuf_packed.flatten() == expected.flatten(), minlength=2)
+        nok += ret[0]
+        ok += ret[1]
+        print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok))
+    # accuracy over the samples that were actually evaluated
+    acc = 100.0 * ok / n_used
+    print("Final accuracy: %f" % acc)
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -35,6 +35,7 @@ import finn.core.datatype as dtp
 from finn.custom_op.registry import getCustomOp
 import os
 import warnings
+import pkg_resources as pk
 from . import template_driver
@@ -62,9 +63,11 @@ class MakePYNQDriver(Transformation):
        model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
        # create the base FINN driver -- same for all accels
+        driver_base_template = pk.resource_filename(
+            "finn.qnn-data", "templates/driver/driver_base.py"
+        )
        driver_base_py = pynq_driver_dir + "/driver_base.py"
-        with open(driver_base_py, "w") as f:
+        shutil.copy(driver_base_template, driver_base_py)
-            f.write(template_driver.driver_base)
        # extract input-output shapes from the graph
        # TODO convert this to an analysis pass?
@@ -125,9 +128,10 @@ class MakePYNQDriver(Transformation):
        # add validate.py to run full top-1 test (only for suitable networks)
        validate_py = pynq_driver_dir + "/validate.py"
-        validate_src = template_driver.pynq_validation_template
+        validate_template = pk.resource_filename(
-        with open(validate_py, "w") as f:
+            "finn.qnn-data", "templates/driver/validate.py"
-            f.write(validate_src)
+        )
+        shutil.copy(validate_template, validate_py)
        # copy all the dependencies into the driver folder
        # driver imports utils/data_packing and core/datatype

--- a/src/finn/transformation/fpgadataflow/template_driver.py
+++ b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -28,292 +28,6 @@
 # flake8: noqa
-driver_base = '''
-# Copyright (c) 2020 Xilinx, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of Xilinx nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import numpy as np
-import time
-import os
-from pynq import Overlay, allocate
-from pynq.ps import Clocks
-from finn.util.data_packing import (
-    finnpy_to_packed_bytearray,
-    packed_bytearray_to_finnpy,
-)
-class FINNExampleOverlay(Overlay):
-    def __init__(
-        self,
-        bitfile_name,
-        platform,
-        io_shape_dict,
-        batch_size=1,
-        fclk_mhz=100.0,
-        download=None,
-        runtime_weight_dir="runtime_weights/"
-    ):
-        super().__init__(bitfile_name, download)
-        self.runtime_weight_dir = runtime_weight_dir
-        self._io_shape_dict = io_shape_dict
-        self.ibuf_packed_device = None
-        self.obuf_packed_device = None
-        self.platform = platform
-        self.batch_size = batch_size
-        self.fclk_mhz = fclk_mhz
-        if self.platform == "alveo":
-            self.idma = self.idma0
-            self.odma = self.odma0
-        elif self.platform == "zynq-iodma":
-            self.idma = self.idma0
-            self.odma = self.odma0
-            # set the clock frequency as specified by user during transformations
-            if self.fclk_mhz > 0:
-                Clocks.fclk0_mhz = self.fclk_mhz
-        else:
-            raise ValueError("Supported platforms are zynq-iodma alveo")
-        # allocate a PYNQ buffer for the packed input and buffer
-        if self.platform == "alveo":
-            self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
-            self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8)
-        else:
-            self.ibuf_packed_device = allocate(
-                shape=self.ishape_packed, dtype=np.uint8, cacheable=True
-            )
-            self.obuf_packed_device = allocate(
-                shape=self.oshape_packed, dtype=np.uint8, cacheable=True
-            )
-        # load any runtime weights
-        self.load_runtime_weights()
-    def load_runtime_weights(self, flush_accel=True, verify=True):
-        w_filenames = []
-        for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir):
-            w_filenames.extend(filenames)
-        rt_weight_dict = {}
-        for w_filename in w_filenames:
-            if w_filename.endswith(".dat"):
-                with open(self.runtime_weight_dir + "/" + w_filename, "r") as f:
-                    dat = f.read()
-            layer_w=np.fromiter([int(x,16) for x in dat.strip().split()], dtype=np.uint32)
-            layer_ind=int(w_filename.split("_")[0])
-            rt_weight_dict[layer_ind] = layer_w
-        for layer_ind in rt_weight_dict.keys():
-            cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind
-            if cand_if_name in self.ip_dict.keys():
-                layer_mmio = getattr(self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind).mmio
-                layer_w = rt_weight_dict[layer_ind]
-                layer_mmio.write_mm(0, layer_w.tobytes())
-                if verify:
-                    new_w = np.copy(layer_mmio.array[:layer_w.shape[0]])
-                    assert (layer_w == new_w).all()
-        if flush_accel:
-            # run accelerator to flush any stale weights from weight streamer FIFOs
-            self.execute_on_buffers()
-    @property
-    def idt(self):
-        return self._io_shape_dict["idt"]
-    @property
-    def odt(self):
-        return self._io_shape_dict["odt"]
-    @property
-    def ishape_normal(self):
-        ret = list(self._io_shape_dict["ishape_normal"])
-        ret[0] = self.batch_size
-        return tuple(ret)
-    @property
-    def oshape_normal(self):
-        ret = list(self._io_shape_dict["oshape_normal"])
-        ret[0] = self.batch_size
-        return tuple(ret)
-    @property
-    def ishape_folded(self):
-        ret = list(self._io_shape_dict["ishape_folded"])
-        ret[0] = self.batch_size
-        return tuple(ret)
-    @property
-    def oshape_folded(self):
-        ret = list(self._io_shape_dict["oshape_folded"])
-        ret[0] = self.batch_size
-        return tuple(ret)
-    @property
-    def ishape_packed(self):
-        ret = list(self._io_shape_dict["ishape_packed"])
-        ret[0] = self.batch_size
-        return tuple(ret)
-    @property
-    def oshape_packed(self):
-        ret = list(self._io_shape_dict["oshape_packed"])
-        ret[0] = self.batch_size
-        return tuple(ret)
-    @property
-    def batch_size(self):
-        return self._batch_size
-    @batch_size.setter
-    def batch_size(self, value):
-        self._batch_size = value
-        # free the old buffers
-        if self.ibuf_packed_device is not None:
-            self.ibuf_packed_device.freebuffer()
-        if self.obuf_packed_device is not None:
-            self.obuf_packed_device.freebuffer()
-        if self.platform == "alveo":
-            self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
-            self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8)
-        else:
-            self.ibuf_packed_device = allocate(
-                shape=self.ishape_packed, dtype=np.uint8, cacheable=True
-            )
-            self.obuf_packed_device = allocate(
-                shape=self.oshape_packed, dtype=np.uint8, cacheable=True
-            )
-    def fold_input(self, ibuf_normal):
-        """Reshapes input in desired shape.
-        Gets input data (ibuf_normal), checks if data is in expected normal shape.
-        Returns folded input."""
-        # ensure that shape is as expected
-        assert ibuf_normal.shape == self.ishape_normal
-        # convert to folded form
-        ibuf_folded = ibuf_normal.reshape(self.ishape_folded)
-        return ibuf_folded
-    def pack_input(self, ibuf_folded):
-        """Packs folded input and reverses both SIMD dim and endianness.
-        Gets input data in folded shape and returns packed input data."""
-        ibuf_packed = finnpy_to_packed_bytearray(
-            ibuf_folded, self.idt, reverse_endian=True, reverse_inner=True
-        )
-        return ibuf_packed
-    def unpack_output(self, obuf_packed):
-        """Unpacks the packed output buffer from accelerator.
-        Gets packed output and returns output data in folded shape."""
-        obuf_folded = packed_bytearray_to_finnpy(
-            obuf_packed,
-            self.odt,
-            self.oshape_folded,
-            reverse_endian=True,
-            reverse_inner=True,
-        )
-        return obuf_folded
-    def unfold_output(self, obuf_folded):
-        """Unfolds output data to normal shape.
-        Gets folded output data and returns output data in normal shape."""
-        obuf_normal = obuf_folded.reshape(self.oshape_normal)
-        return obuf_normal
-    def copy_input_data_to_device(self, data):
-        """Copies given input data to PYNQ buffer."""
-        np.copyto(self.ibuf_packed_device, data)
-        self.ibuf_packed_device.flush()
-    def copy_output_data_from_device(self, data):
-        """Copies PYNQ output buffer from device."""
-        self.obuf_packed_device.invalidate()
-        np.copyto(data, self.obuf_packed_device)
-    def execute_on_buffers(self):
-        """Executes accelerator by setting up the DMA(s) and
-        waiting until all transfers/calls complete. Uses only member variables and
-        returns nothing."""
-        if self.platform == "zynq-iodma":
-            # manually launch IODMAs since signatures are missing
-            self.idma.write(0x10, self.ibuf_packed_device.device_address)
-            self.idma.write(0x1C, self.batch_size)
-            self.odma.write(0x10, self.obuf_packed_device.device_address)
-            self.odma.write(0x1C, self.batch_size)
-            self.idma.write(0x00, 1)
-            self.odma.write(0x00, 1)
-            # wait until output IODMA is finished
-            status = self.odma.read(0x00)
-            while status & 0x2 == 0:
-                status = self.odma.read(0x00)
-        elif self.platform == "alveo":
-            self.idma.start_sw(self.ibuf_packed_device, self.batch_size)
-            odma_handle = self.odma.start_sw(self.obuf_packed_device, self.batch_size)
-            odma_handle.wait()
-        else:
-            raise Exception("Unrecognized platform: %s" % self.platform)
-    def execute(self, input_npy):
-        """Given input numpy array, first perform necessary packing and copying
-        to device buffers, execute on accelerator, then unpack output and return
-        output numpy array from accelerator."""
-        ibuf_folded = self.fold_input(input_npy)
-        ibuf_packed = self.pack_input(ibuf_folded)
-        self.copy_input_data_to_device(ibuf_packed)
-        self.execute_on_buffers()
-        obuf_packed = np.empty_like(self.obuf_packed_device)
-        self.copy_output_data_from_device(obuf_packed)
-        obuf_folded = self.unpack_output(obuf_packed)
-        obuf_normal = self.unfold_output(obuf_folded)
-        return obuf_normal
-    def throughput_test(self):
-        "Run accelerator with empty inputs to measure throughput and other metrics."
-        # dictionary for results of throughput test
-        res = {}
-        start = time.time()
-        self.execute_on_buffers()
-        end = time.time()
-        runtime = end - start
-        res["runtime[ms]"] = runtime * 1000
-        res["throughput[images/s]"] = self.batch_size / runtime
-        res["DRAM_in_bandwidth[Mb/s]"] = (
-            np.prod(self.ishape_packed) * 0.000001 / runtime
-        )
-        res["DRAM_out_bandwidth[Mb/s]"] = (
-            np.prod(self.oshape_packed) * 0.000001 / runtime
-        )
-        if self.platform != "alveo":
-            res["fclk[mhz]"] = Clocks.fclk0_mhz
-        else:
-            res["fclk[mhz]"] = self.fclk_mhz
-        res["batch_size"] = self.batch_size
-        return res
-'''
 pynq_driver_template = """
 # Copyright (c) 2020 Xilinx, Inc.
 # All rights reserved.
@@ -419,96 +133,3 @@ if __name__ == "__main__":
    else:
        raise Exception("Exec mode has to be set to remote_pynq or throughput_test")
 """
-# Template for a standalone script that measures top-1 accuracy of a
-# FINN-generated accelerator on the MNIST or CIFAR-10 test set.
-pynq_validation_template = """
-# Copyright (c) 2020 Xilinx, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of Xilinx nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import argparse
-from driver import io_shape_dict
-from driver_base import FINNExampleOverlay
-import numpy as np
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser(description='Validate top-1 accuracy for FINN-generated accelerator')
-  parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=100)
-  parser.add_argument('--dataset', help='dataset to use (mnist of cifar10)', required=True)
-  parser.add_argument('--platform', help='Target platform: zynq-iodma alveo', default="zynq-iodma")
-  parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit")
-  parser.add_argument('--dataset_root', help='dataset root dir for download/reuse', default="/tmp")
-  # parse arguments
-  args = parser.parse_args()
-  bsize = args.batchsize
-  dataset = args.dataset
-  bitfile = args.bitfile
-  platform = args.platform
-  dataset_root = args.dataset_root
-  # load the requested dataset; only the test split is used below
-  if dataset == "mnist":
-    from dataset_loading import mnist
-    trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data(dataset_root, download=True, one_hot=False)
-  elif dataset == "cifar10":
-    from dataset_loading import cifar
-    trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data(dataset_root, download=True, one_hot=False)
-  else:
-    raise Exception("Unrecognized dataset")
-  test_imgs = testx
-  test_labels = testy
-  ok = 0
-  nok = 0
-  total = test_imgs.shape[0]
-  driver = FINNExampleOverlay(
-      bitfile_name = bitfile, platform = platform,
-      io_shape_dict = io_shape_dict, batch_size = bsize,
-      runtime_weight_dir = "runtime_weights/"
-  )
-  n_batches = int(total / bsize)
-  if n_batches < 1:
-    raise Exception("Batch size %d is larger than the test set (%d samples)" % (bsize, total))
-  # only whole batches are run: trailing samples that do not fill a batch are
-  # dropped so that the reshape below works for any batch size
-  n_used = n_batches * bsize
-  test_imgs = test_imgs[:n_used].reshape(n_batches, bsize, -1)
-  test_labels = test_labels[:n_used].reshape(n_batches, bsize)
-  for i in range(n_batches):
-    ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
-    exp = test_labels[i]
-    driver.copy_input_data_to_device(ibuf_normal)
-    driver.execute_on_buffers()
-    obuf_normal = np.empty_like(driver.obuf_packed_device)
-    driver.copy_output_data_from_device(obuf_normal)
-    # minlength=2 guarantees both a False (wrong) and a True (correct) bin,
-    # even when a batch has no correct predictions at all
-    ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2)
-    nok += ret[0]
-    ok += ret[1]
-    print("batch %d / %d : total OK %d NOK %d" % (i+1, n_batches, ok, nok))
-  # accuracy over the samples that were actually evaluated
-  acc = 100.0 * ok / (n_used)
-  print("Final accuracy: %f" % acc)
-"""