Skip to content
Snippets Groups Projects
Unverified Commit e202bca2 authored by Yaman Umuroglu's avatar Yaman Umuroglu Committed by GitHub
Browse files

Driver data packing + improvements (#261)

* [Driver] add driver_base.py as own template file + comments

* [Driver] also move validation to own template + use in transform

* [Driver] more comments

* [Driver] suggested updates from PYNQ team + async mode exec_on_buffers

* [Driver] allow smaller batchsize in execute_on_buffers

* [Driver] optimize buffer alloc a bit

* [Driver] wait condition fix

* [Deps] update finn-base

* [Deps] update finn-base

* [Driver] enable fast_mode, expose more benchmarks

* [Deps] update finn-base
parent 72b11ca6
No related merge requests found
...@@ -12,7 +12,7 @@ gecho () { ...@@ -12,7 +12,7 @@ gecho () {
# checkout the correct dependency repo commits # checkout the correct dependency repo commits
# the repos themselves are cloned in the Dockerfile # the repos themselves are cloned in the Dockerfile
FINN_BASE_COMMIT=ee0a7df4de00958cf4d71624a56e8a1acf4fd085 FINN_BASE_COMMIT=efcc0324fbca2476af25f7d3c060d51d5270f09a
BREVITAS_COMMIT=aff49758ec445d77c75721c7de3091a2a1797ca8 BREVITAS_COMMIT=aff49758ec445d77c75721c7de3091a2a1797ca8
CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e
......
# Copyright (c) 2020 Xilinx, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of Xilinx nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
import time
import os
from pynq import Overlay, allocate
from pynq.ps import Clocks
from finn.util.data_packing import (
finnpy_to_packed_bytearray,
packed_bytearray_to_finnpy,
)
# Driver base class for FINN-generated dataflow accelerators.
# The particulars of the generated accelerator are specified via the
# io_shape_dict (generated by the MakePYNQDriver transformation).
class FINNExampleOverlay(Overlay):
    """PYNQ overlay that drives a FINN-generated dataflow accelerator.

    The accelerator-specific details (datatypes and the normal/folded/packed
    input and output shapes) are read from ``io_shape_dict``. Device buffers
    for the packed input and output are (re)allocated whenever
    ``batch_size`` is assigned.
    """

    def __init__(
        self,
        bitfile_name,
        platform,
        io_shape_dict,
        batch_size=1,
        fclk_mhz=100.0,
        device=None,
        download=True,
        runtime_weight_dir="runtime_weights/",
    ):
        """Initialize the FINN accelerator.

        Parameters
        ----------
        bitfile_name: str
            Path to accelerator .bit/.xclbin file
        platform: str
            FINN platform type, either "alveo" or "zynq-iodma"
        io_shape_dict: dict
            Dictionary with particulars of the generated accelerator
        batch_size: int
            Maximum batch size in driver (hardware batchsize is always 1)
        fclk_mhz: float
            Override the clock frequency, only possible for Zynq.
        device: pynq.Device
            Which PYNQ device to use, None for default.
        download: bool
            Whether to flash the bitstream.
        runtime_weight_dir: str
            Path to runtime weights folder.

        Raises
        ------
        ValueError
            If ``platform`` is not one of the supported platforms.
        """
        # reject unsupported platforms before the bitstream is loaded and
        # before any device buffers are allocated
        if platform not in ("alveo", "zynq-iodma"):
            raise ValueError("Supported platforms are zynq-iodma alveo")
        super().__init__(bitfile_name, download=download, device=device)
        self.runtime_weight_dir = runtime_weight_dir
        self._io_shape_dict = io_shape_dict
        self.ibuf_packed_device = None
        self.obuf_packed_device = None
        # platform must be set before batch_size: the batch_size setter
        # picks the buffer allocation mode based on it
        self.platform = platform
        self.batch_size = batch_size
        self.fclk_mhz = fclk_mhz
        self.idma = self.idma0
        self.odma = self.odma0
        if self.platform == "alveo":
            # handle of the in-flight output DMA call, None when idle
            self.odma_handle = None
        else:
            # set the clock frequency as specified by user during transformations
            if self.fclk_mhz > 0:
                Clocks.fclk0_mhz = self.fclk_mhz
        # load any runtime weights
        self.load_runtime_weights()

    def load_runtime_weights(self, flush_accel=True, verify=True):
        """Load any existing runtime weights from the specified dir into the
        appropriate layer of the accelerator. Note that this must be enabled
        during the accelerator build process. The runtime weights directory
        is specified as the class member ``runtime_weight_dir``.

        Weight files are ``.dat`` files containing whitespace-separated hex
        words; the layer index is the integer before the first ``_`` in the
        file name.

        Parameters
        ----------
        flush_accel: bool
            Run the accelerator with dummy input after weights are written to
            flush any stale weight data in the weight streamer FIFOs.
        verify: bool
            Whether the written weights will be re-read and verified.
        """
        # keep the directory each file was found in, so that files located in
        # sub-directories are opened from their actual location
        w_paths = []
        for dirpath, _dirnames, filenames in os.walk(self.runtime_weight_dir):
            w_paths.extend(
                os.path.join(dirpath, fname)
                for fname in filenames
                if fname.endswith(".dat")
            )
        rt_weight_dict = {}
        for w_path in w_paths:
            with open(w_path, "r") as f:
                dat = f.read()
            layer_w = np.fromiter(
                (int(x, 16) for x in dat.strip().split()), dtype=np.uint32
            )
            layer_ind = int(os.path.basename(w_path).split("_")[0])
            rt_weight_dict[layer_ind] = layer_w
        for layer_ind, layer_w in rt_weight_dict.items():
            cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind
            # only layers that expose an AXI-lite interface can take weights
            if cand_if_name in self.ip_dict:
                layer_mmio = getattr(
                    self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind
                ).mmio
                layer_mmio.write_mm(0, layer_w.tobytes())
                if verify:
                    new_w = np.copy(layer_mmio.array[: layer_w.shape[0]])
                    assert (layer_w == new_w).all()
        if flush_accel:
            # run accelerator to flush any stale weights from weight streamer FIFOs
            self.execute_on_buffers()

    def _batched_shape(self, key):
        """Return the shape stored under ``key`` in the io_shape_dict with its
        first entry replaced by the current batch size."""
        ret = list(self._io_shape_dict[key])
        ret[0] = self.batch_size
        return tuple(ret)

    @property
    def idt(self):
        """Input datatype, as given in the io_shape_dict."""
        return self._io_shape_dict["idt"]

    @property
    def odt(self):
        """Output datatype, as given in the io_shape_dict."""
        return self._io_shape_dict["odt"]

    @property
    def ishape_normal(self):
        """Normal input shape for the current batch size."""
        return self._batched_shape("ishape_normal")

    @property
    def oshape_normal(self):
        """Normal output shape for the current batch size."""
        return self._batched_shape("oshape_normal")

    @property
    def ishape_folded(self):
        """Folded input shape for the current batch size."""
        return self._batched_shape("ishape_folded")

    @property
    def oshape_folded(self):
        """Folded output shape for the current batch size."""
        return self._batched_shape("oshape_folded")

    @property
    def ishape_packed(self):
        """Packed input shape for the current batch size."""
        return self._batched_shape("ishape_packed")

    @property
    def oshape_packed(self):
        """Packed output shape for the current batch size."""
        return self._batched_shape("oshape_packed")

    @property
    def batch_size(self):
        """Maximum batch size the driver buffers are currently sized for."""
        return self._batch_size

    @batch_size.setter
    def batch_size(self, value):
        self._batch_size = value
        # free the old buffers by dropping our references to them
        # (reference counting should take care of it)
        self.ibuf_packed_device = None
        self.obuf_packed_device = None
        # non-alveo platforms request cacheable buffers
        alloc_kwargs = {} if self.platform == "alveo" else {"cacheable": True}
        self.ibuf_packed_device = allocate(
            shape=self.ishape_packed, dtype=np.uint8, **alloc_kwargs
        )
        self.obuf_packed_device = allocate(
            shape=self.oshape_packed, dtype=np.uint8, **alloc_kwargs
        )
        # host-side array that execute() copies the packed output into
        self.obuf_packed = np.empty_like(self.obuf_packed_device)

    def fold_input(self, ibuf_normal):
        """Reshapes input in desired shape.
        Gets input data (ibuf_normal), checks if data is in expected normal shape.
        Returns folded input."""
        # ensure that shape is as expected
        assert ibuf_normal.shape == self.ishape_normal
        # convert to folded form
        return ibuf_normal.reshape(self.ishape_folded)

    def pack_input(self, ibuf_folded):
        """Packs folded input and reverses both SIMD dim and endianness.
        Gets input data in folded shape and returns packed input data."""
        return finnpy_to_packed_bytearray(
            ibuf_folded,
            self.idt,
            reverse_endian=True,
            reverse_inner=True,
            fast_mode=True,
        )

    def unpack_output(self, obuf_packed):
        """Unpacks the packed output buffer from accelerator.
        Gets packed output and returns output data in folded shape."""
        return packed_bytearray_to_finnpy(
            obuf_packed,
            self.odt,
            self.oshape_folded,
            reverse_endian=True,
            reverse_inner=True,
            fast_mode=True,
        )

    def unfold_output(self, obuf_folded):
        """Unfolds output data to normal shape.
        Gets folded output data and returns output data in normal shape."""
        return obuf_folded.reshape(self.oshape_normal)

    def copy_input_data_to_device(self, data):
        """Copies given input data to PYNQ buffer."""
        np.copyto(self.ibuf_packed_device, data)
        self.ibuf_packed_device.flush()

    def copy_output_data_from_device(self, data):
        """Copies PYNQ output buffer from device."""
        self.obuf_packed_device.invalidate()
        np.copyto(data, self.obuf_packed_device)

    def execute_on_buffers(self, asynch=False, batch_size=None):
        """Executes accelerator by setting up the DMA(s) on pre-allocated buffers.
        Blocking behavior depends on the asynch parameter:
        * ``asynch=False`` will block until all transfers are complete.
        * ``asynch=True`` won't block, use ``wait_until_finished()`` to check
        completion

        The optional batch_size parameter can be used to execute on a smaller
        batch than the initialized ``self.batch_size``.
        """
        if batch_size is None:
            batch_size = self.batch_size
        assert batch_size <= self.batch_size, "Specified batch_size is too large."
        if self.platform == "zynq-iodma":
            # bit 0x4 of register 0x00 is checked as the idle flag
            assert self.odma.read(0x00) & 0x4 != 0, "Output DMA is not idle"
            # manually launch IODMAs since signatures are missing
            self.idma.write(0x10, self.ibuf_packed_device.device_address)
            self.idma.write(0x1C, batch_size)
            self.odma.write(0x10, self.obuf_packed_device.device_address)
            self.odma.write(0x1C, batch_size)
            self.idma.write(0x00, 1)
            self.odma.write(0x00, 1)
        elif self.platform == "alveo":
            assert self.odma_handle is None, "Output DMA is already running"
            self.idma.start(self.ibuf_packed_device, batch_size)
            self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size)
        else:
            raise Exception("Unrecognized platform: %s" % self.platform)
        # blocking behavior depends on asynch parameter
        if not asynch:
            self.wait_until_finished()

    def wait_until_finished(self):
        "Block until the output DMA has finished writing."
        if self.platform == "zynq-iodma":
            # check if output IODMA is finished via register reads
            # (busy-polls until bit 0x2 of register 0x00 is set)
            status = self.odma.read(0x00)
            while status & 0x2 == 0:
                status = self.odma.read(0x00)
        elif self.platform == "alveo":
            assert self.odma_handle is not None, "No odma_handle to wait on"
            self.odma_handle.wait()
            self.odma_handle = None
        else:
            raise Exception("Unrecognized platform: %s" % self.platform)

    def execute(self, input_npy):
        """Given input numpy array, first perform necessary packing and copying
        to device buffers, execute on accelerator, then unpack output and return
        output numpy array from accelerator."""
        ibuf_folded = self.fold_input(input_npy)
        ibuf_packed = self.pack_input(ibuf_folded)
        self.copy_input_data_to_device(ibuf_packed)
        self.execute_on_buffers()
        self.copy_output_data_from_device(self.obuf_packed)
        obuf_folded = self.unpack_output(self.obuf_packed)
        return self.unfold_output(obuf_folded)

    def throughput_test(self):
        """Run accelerator with empty inputs to measure throughput and other metrics.
        Returns dictionary with various metrics. All entries whose key ends in
        ``[ms]`` are reported in milliseconds."""
        # dictionary for results of throughput test
        res = {}
        start = time.time()
        self.execute_on_buffers()
        end = time.time()
        runtime = end - start
        res["runtime[ms]"] = runtime * 1000
        res["throughput[images/s]"] = self.batch_size / runtime
        # NOTE: the packed buffers hold uint8 elements, so these two values are
        # 1e6 bytes per second; the key names are kept for compatibility
        res["DRAM_in_bandwidth[Mb/s]"] = (
            np.prod(self.ishape_packed) * 0.000001 / runtime
        )
        res["DRAM_out_bandwidth[Mb/s]"] = (
            np.prod(self.oshape_packed) * 0.000001 / runtime
        )
        if self.platform != "alveo":
            res["fclk[mhz]"] = Clocks.fclk0_mhz
        else:
            res["fclk[mhz]"] = self.fclk_mhz
        res["batch_size"] = self.batch_size

        def timed(key, fn, *args):
            # run fn(*args), record its wall-clock duration in ms under key
            t_start = time.time()
            ret = fn(*args)
            res[key] = (time.time() - t_start) * 1000
            return ret

        # also benchmark driver-related overheads
        input_npy = np.zeros(self.ishape_normal, dtype=self.idt.to_numpy_dt())
        ibuf_folded = timed("fold_input[ms]", self.fold_input, input_npy)
        ibuf_packed = timed("pack_input[ms]", self.pack_input, ibuf_folded)
        timed(
            "copy_input_data_to_device[ms]",
            self.copy_input_data_to_device,
            ibuf_packed,
        )
        timed(
            "copy_output_data_from_device[ms]",
            self.copy_output_data_from_device,
            self.obuf_packed,
        )
        obuf_folded = timed("unpack_output[ms]", self.unpack_output, self.obuf_packed)
        timed("unfold_output[ms]", self.unfold_output, obuf_folded)
        return res
# Copyright (c) 2020 Xilinx, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of Xilinx nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
from driver import io_shape_dict
from driver_base import FINNExampleOverlay
import numpy as np
def count_ok_nok(predicted, expected):
    """Count matching and non-matching entries between two label arrays.

    Both inputs are flattened before the element-wise comparison. Returns a
    tuple ``(ok, nok)`` of plain ints. Unlike ``np.bincount`` on the boolean
    match array, this also works when every entry mismatches.
    """
    matches = np.asarray(
        np.asarray(predicted).flatten() == np.asarray(expected).flatten()
    )
    ok = int(np.count_nonzero(matches))
    return ok, int(matches.size) - ok


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Validate top-1 accuracy for FINN-generated accelerator"
    )
    parser.add_argument(
        "--batchsize", help="number of samples for inference", type=int, default=100
    )
    parser.add_argument(
        "--dataset", help="dataset to use (mnist or cifar10)", required=True
    )
    parser.add_argument(
        "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma"
    )
    parser.add_argument(
        "--bitfile", help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit"
    )
    parser.add_argument(
        "--dataset_root", help="dataset root dir for download/reuse", default="/tmp"
    )
    # parse arguments
    args = parser.parse_args()
    bsize = args.batchsize
    dataset = args.dataset
    bitfile = args.bitfile
    platform = args.platform
    dataset_root = args.dataset_root

    if dataset == "mnist":
        from dataset_loading import mnist

        trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data(
            dataset_root, download=True, one_hot=False
        )
    elif dataset == "cifar10":
        from dataset_loading import cifar

        trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data(
            dataset_root, download=True, one_hot=False
        )
    else:
        raise Exception("Unrecognized dataset")

    test_imgs = testx
    test_labels = testy
    total = test_imgs.shape[0]

    # fail early, before the bitfile is loaded, if no full batch can be formed
    if not 0 < bsize <= total:
        raise ValueError(
            "batchsize must be between 1 and the test set size (%d)" % total
        )
    # only full batches are run; a trailing partial batch is skipped so that
    # the reshapes below work for any batch size
    n_batches = total // bsize
    n_used = n_batches * bsize
    if n_used != total:
        print(
            "Skipping last %d samples (test set size is not a multiple of batchsize)"
            % (total - n_used)
        )

    driver = FINNExampleOverlay(
        bitfile_name=bitfile,
        platform=platform,
        io_shape_dict=io_shape_dict,
        batch_size=bsize,
        runtime_weight_dir="runtime_weights/",
    )

    test_imgs = test_imgs[:n_used].reshape(n_batches, bsize, -1)
    test_labels = test_labels[:n_used].reshape(n_batches, bsize)

    ok = 0
    nok = 0
    # host-side copy of the device output buffer, reused for every batch
    obuf = np.empty_like(driver.obuf_packed_device)
    for i in range(n_batches):
        # the images are written to the packed device buffer as-is
        ibuf = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
        exp = test_labels[i]
        driver.copy_input_data_to_device(ibuf)
        driver.execute_on_buffers()
        driver.copy_output_data_from_device(obuf)
        batch_ok, batch_nok = count_ok_nok(obuf, exp)
        ok += batch_ok
        nok += batch_nok
        print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok))

    # accuracy over the samples that were actually evaluated
    acc = 100.0 * ok / n_used
    print("Final accuracy: %f" % acc)
...@@ -35,6 +35,7 @@ import finn.core.datatype as dtp ...@@ -35,6 +35,7 @@ import finn.core.datatype as dtp
from finn.custom_op.registry import getCustomOp from finn.custom_op.registry import getCustomOp
import os import os
import warnings import warnings
import pkg_resources as pk
from . import template_driver from . import template_driver
...@@ -62,9 +63,11 @@ class MakePYNQDriver(Transformation): ...@@ -62,9 +63,11 @@ class MakePYNQDriver(Transformation):
model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
# create the base FINN driver -- same for all accels # create the base FINN driver -- same for all accels
driver_base_template = pk.resource_filename(
"finn.qnn-data", "templates/driver/driver_base.py"
)
driver_base_py = pynq_driver_dir + "/driver_base.py" driver_base_py = pynq_driver_dir + "/driver_base.py"
with open(driver_base_py, "w") as f: shutil.copy(driver_base_template, driver_base_py)
f.write(template_driver.driver_base)
# extract input-output shapes from the graph # extract input-output shapes from the graph
# TODO convert this to an analysis pass? # TODO convert this to an analysis pass?
...@@ -125,9 +128,10 @@ class MakePYNQDriver(Transformation): ...@@ -125,9 +128,10 @@ class MakePYNQDriver(Transformation):
# add validate.py to run full top-1 test (only for suitable networks) # add validate.py to run full top-1 test (only for suitable networks)
validate_py = pynq_driver_dir + "/validate.py" validate_py = pynq_driver_dir + "/validate.py"
validate_src = template_driver.pynq_validation_template validate_template = pk.resource_filename(
with open(validate_py, "w") as f: "finn.qnn-data", "templates/driver/validate.py"
f.write(validate_src) )
shutil.copy(validate_template, validate_py)
# copy all the dependencies into the driver folder # copy all the dependencies into the driver folder
# driver imports utils/data_packing and core/datatype # driver imports utils/data_packing and core/datatype
......
...@@ -28,292 +28,6 @@ ...@@ -28,292 +28,6 @@
# flake8: noqa # flake8: noqa
driver_base = '''
# Copyright (c) 2020 Xilinx, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of Xilinx nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
import time
import os
from pynq import Overlay, allocate
from pynq.ps import Clocks
from finn.util.data_packing import (
finnpy_to_packed_bytearray,
packed_bytearray_to_finnpy,
)
class FINNExampleOverlay(Overlay):
def __init__(
self,
bitfile_name,
platform,
io_shape_dict,
batch_size=1,
fclk_mhz=100.0,
download=None,
runtime_weight_dir="runtime_weights/"
):
super().__init__(bitfile_name, download)
self.runtime_weight_dir = runtime_weight_dir
self._io_shape_dict = io_shape_dict
self.ibuf_packed_device = None
self.obuf_packed_device = None
self.platform = platform
self.batch_size = batch_size
self.fclk_mhz = fclk_mhz
if self.platform == "alveo":
self.idma = self.idma0
self.odma = self.odma0
elif self.platform == "zynq-iodma":
self.idma = self.idma0
self.odma = self.odma0
# set the clock frequency as specified by user during transformations
if self.fclk_mhz > 0:
Clocks.fclk0_mhz = self.fclk_mhz
else:
raise ValueError("Supported platforms are zynq-iodma alveo")
# allocate a PYNQ buffer for the packed input and buffer
if self.platform == "alveo":
self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8)
else:
self.ibuf_packed_device = allocate(
shape=self.ishape_packed, dtype=np.uint8, cacheable=True
)
self.obuf_packed_device = allocate(
shape=self.oshape_packed, dtype=np.uint8, cacheable=True
)
# load any runtime weights
self.load_runtime_weights()
def load_runtime_weights(self, flush_accel=True, verify=True):
w_filenames = []
for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir):
w_filenames.extend(filenames)
rt_weight_dict = {}
for w_filename in w_filenames:
if w_filename.endswith(".dat"):
with open(self.runtime_weight_dir + "/" + w_filename, "r") as f:
dat = f.read()
layer_w=np.fromiter([int(x,16) for x in dat.strip().split()], dtype=np.uint32)
layer_ind=int(w_filename.split("_")[0])
rt_weight_dict[layer_ind] = layer_w
for layer_ind in rt_weight_dict.keys():
cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind
if cand_if_name in self.ip_dict.keys():
layer_mmio = getattr(self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind).mmio
layer_w = rt_weight_dict[layer_ind]
layer_mmio.write_mm(0, layer_w.tobytes())
if verify:
new_w = np.copy(layer_mmio.array[:layer_w.shape[0]])
assert (layer_w == new_w).all()
if flush_accel:
# run accelerator to flush any stale weights from weight streamer FIFOs
self.execute_on_buffers()
@property
def idt(self):
return self._io_shape_dict["idt"]
@property
def odt(self):
return self._io_shape_dict["odt"]
@property
def ishape_normal(self):
ret = list(self._io_shape_dict["ishape_normal"])
ret[0] = self.batch_size
return tuple(ret)
@property
def oshape_normal(self):
ret = list(self._io_shape_dict["oshape_normal"])
ret[0] = self.batch_size
return tuple(ret)
@property
def ishape_folded(self):
ret = list(self._io_shape_dict["ishape_folded"])
ret[0] = self.batch_size
return tuple(ret)
@property
def oshape_folded(self):
ret = list(self._io_shape_dict["oshape_folded"])
ret[0] = self.batch_size
return tuple(ret)
@property
def ishape_packed(self):
ret = list(self._io_shape_dict["ishape_packed"])
ret[0] = self.batch_size
return tuple(ret)
@property
def oshape_packed(self):
ret = list(self._io_shape_dict["oshape_packed"])
ret[0] = self.batch_size
return tuple(ret)
@property
def batch_size(self):
return self._batch_size
@batch_size.setter
def batch_size(self, value):
self._batch_size = value
# free the old buffers
if self.ibuf_packed_device is not None:
self.ibuf_packed_device.freebuffer()
if self.obuf_packed_device is not None:
self.obuf_packed_device.freebuffer()
if self.platform == "alveo":
self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8)
else:
self.ibuf_packed_device = allocate(
shape=self.ishape_packed, dtype=np.uint8, cacheable=True
)
self.obuf_packed_device = allocate(
shape=self.oshape_packed, dtype=np.uint8, cacheable=True
)
def fold_input(self, ibuf_normal):
"""Reshapes input in desired shape.
Gets input data (ibuf_normal), checks if data is in expected normal shape.
Returns folded input."""
# ensure that shape is as expected
assert ibuf_normal.shape == self.ishape_normal
# convert to folded form
ibuf_folded = ibuf_normal.reshape(self.ishape_folded)
return ibuf_folded
def pack_input(self, ibuf_folded):
"""Packs folded input and reverses both SIMD dim and endianness.
Gets input data in folded shape and returns packed input data."""
ibuf_packed = finnpy_to_packed_bytearray(
ibuf_folded, self.idt, reverse_endian=True, reverse_inner=True
)
return ibuf_packed
def unpack_output(self, obuf_packed):
"""Unpacks the packed output buffer from accelerator.
Gets packed output and returns output data in folded shape."""
obuf_folded = packed_bytearray_to_finnpy(
obuf_packed,
self.odt,
self.oshape_folded,
reverse_endian=True,
reverse_inner=True,
)
return obuf_folded
def unfold_output(self, obuf_folded):
"""Unfolds output data to normal shape.
Gets folded output data and returns output data in normal shape."""
obuf_normal = obuf_folded.reshape(self.oshape_normal)
return obuf_normal
def copy_input_data_to_device(self, data):
"""Copies given input data to PYNQ buffer."""
np.copyto(self.ibuf_packed_device, data)
self.ibuf_packed_device.flush()
def copy_output_data_from_device(self, data):
"""Copies PYNQ output buffer from device."""
self.obuf_packed_device.invalidate()
np.copyto(data, self.obuf_packed_device)
def execute_on_buffers(self):
"""Executes accelerator by setting up the DMA(s) and
waiting until all transfers/calls complete. Uses only member variables and
returns nothing."""
if self.platform == "zynq-iodma":
# manually launch IODMAs since signatures are missing
self.idma.write(0x10, self.ibuf_packed_device.device_address)
self.idma.write(0x1C, self.batch_size)
self.odma.write(0x10, self.obuf_packed_device.device_address)
self.odma.write(0x1C, self.batch_size)
self.idma.write(0x00, 1)
self.odma.write(0x00, 1)
# wait until output IODMA is finished
status = self.odma.read(0x00)
while status & 0x2 == 0:
status = self.odma.read(0x00)
elif self.platform == "alveo":
self.idma.start_sw(self.ibuf_packed_device, self.batch_size)
odma_handle = self.odma.start_sw(self.obuf_packed_device, self.batch_size)
odma_handle.wait()
else:
raise Exception("Unrecognized platform: %s" % self.platform)
def execute(self, input_npy):
"""Given input numpy array, first perform necessary packing and copying
to device buffers, execute on accelerator, then unpack output and return
output numpy array from accelerator."""
ibuf_folded = self.fold_input(input_npy)
ibuf_packed = self.pack_input(ibuf_folded)
self.copy_input_data_to_device(ibuf_packed)
self.execute_on_buffers()
obuf_packed = np.empty_like(self.obuf_packed_device)
self.copy_output_data_from_device(obuf_packed)
obuf_folded = self.unpack_output(obuf_packed)
obuf_normal = self.unfold_output(obuf_folded)
return obuf_normal
def throughput_test(self):
"Run accelerator with empty inputs to measure throughput and other metrics."
# dictionary for results of throughput test
res = {}
start = time.time()
self.execute_on_buffers()
end = time.time()
runtime = end - start
res["runtime[ms]"] = runtime * 1000
res["throughput[images/s]"] = self.batch_size / runtime
res["DRAM_in_bandwidth[Mb/s]"] = (
np.prod(self.ishape_packed) * 0.000001 / runtime
)
res["DRAM_out_bandwidth[Mb/s]"] = (
np.prod(self.oshape_packed) * 0.000001 / runtime
)
if self.platform != "alveo":
res["fclk[mhz]"] = Clocks.fclk0_mhz
else:
res["fclk[mhz]"] = self.fclk_mhz
res["batch_size"] = self.batch_size
return res
'''
pynq_driver_template = """ pynq_driver_template = """
# Copyright (c) 2020 Xilinx, Inc. # Copyright (c) 2020 Xilinx, Inc.
# All rights reserved. # All rights reserved.
...@@ -419,96 +133,3 @@ if __name__ == "__main__": ...@@ -419,96 +133,3 @@ if __name__ == "__main__":
else: else:
raise Exception("Exec mode has to be set to remote_pynq or throughput_test") raise Exception("Exec mode has to be set to remote_pynq or throughput_test")
""" """
pynq_validation_template = """
# Copyright (c) 2020 Xilinx, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of Xilinx nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
from driver import io_shape_dict
from driver_base import FINNExampleOverlay
import numpy as np
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Validate top-1 accuracy for FINN-generated accelerator')
parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=100)
parser.add_argument('--dataset', help='dataset to use (mnist of cifar10)', required=True)
parser.add_argument('--platform', help='Target platform: zynq-iodma alveo', default="zynq-iodma")
parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit")
parser.add_argument('--dataset_root', help='dataset root dir for download/reuse', default="/tmp")
# parse arguments
args = parser.parse_args()
bsize = args.batchsize
dataset = args.dataset
bitfile = args.bitfile
platform = args.platform
dataset_root = args.dataset_root
if dataset == "mnist":
from dataset_loading import mnist
trainx, trainy, testx, testy, valx, valy = mnist.load_mnist_data(dataset_root, download=True, one_hot=False)
elif dataset == "cifar10":
from dataset_loading import cifar
trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data(dataset_root, download=True, one_hot=False)
else:
raise Exception("Unrecognized dataset")
test_imgs = testx
test_labels = testy
ok = 0
nok = 0
total = test_imgs.shape[0]
driver = FINNExampleOverlay(
bitfile_name = bitfile, platform = platform,
io_shape_dict = io_shape_dict, batch_size = bsize,
runtime_weight_dir = "runtime_weights/"
)
n_batches = int(total / bsize)
test_imgs = test_imgs.reshape(n_batches, bsize, -1)
test_labels = test_labels.reshape(n_batches, bsize)
for i in range(n_batches):
ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device.shape)
exp = test_labels[i]
driver.copy_input_data_to_device(ibuf_normal)
driver.execute_on_buffers()
obuf_normal = np.empty_like(driver.obuf_packed_device)
driver.copy_output_data_from_device(obuf_normal)
ret = np.bincount(obuf_normal.flatten() == exp.flatten())
nok += ret[0]
ok += ret[1]
print("batch %d / %d : total OK %d NOK %d" % (i+1, n_batches, ok, nok))
acc = 100.0 * ok / (total)
print("Final accuracy: %f" % acc)
"""
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment