Commit 3a14c324 authored by Mirzam98

Merge branch 'feature/vector_vector_activate_batch' into feature/1d_convolution_support

parents 6b9a094d 8786f4b5
@@ -26,9 +26,9 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def get_nodeattr_types(self):
         my_attrs = {
             "PE": ("i", True, 0),
-            "Dim": ("i", True, 0),
+            "Dim": ("ints", True, []),  # [H, W]
             "Channels": ("i", True, 0),
-            "Kernel": ("i", True, 0),
+            "Kernel": ("ints", True, []),  # [H, W]
             "resType": ("s", False, "auto", {"auto", "lut", "dsp"}),
             "ActVal": ("i", False, 0),
             # FINN DataTypes for inputs, weights, outputs
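With this change, Dim and Kernel become "ints" (integer-list) attributes holding [H, W], so a 1D feature map or kernel is expressed by setting the width entry to 1. A minimal sketch of writing and reading the new attributes (illustrative values; vvau_inst is a hypothetical getCustomOp handle, not code from this commit):

    # hypothetical usage of the new [H, W] attributes
    vvau_inst.set_nodeattr("Dim", [10, 1])    # H=10, W=1: 1D feature map
    vvau_inst.set_nodeattr("Kernel", [3, 1])  # 3x1 kernel for 1D convolution
    dim_h, dim_w = vvau_inst.get_nodeattr("Dim")
    k_h, k_w = vvau_inst.get_nodeattr("Kernel")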
@@ -45,10 +45,10 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def minimize_accumulator_width(self, model):
         weights = model.get_initializer(self.onnx_node.input[1])
-        k = self.get_nodeattr("Kernel")
+        k_h, k_w = self.get_nodeattr("Kernel")
         fm = self.get_nodeattr("Channels")
         # put weights into the shape expected by calculate_matvec_accumulator_range
-        weights = weights.reshape(fm, k * k).transpose()
+        weights = weights.reshape(fm, k_h * k_w).transpose()
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
         else:
             thresholds = None
@@ -85,9 +85,11 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
                 tdt = DataType.get_smallest_possible(0 - tdt_max)
             else:
                 tdt = DataType.get_smallest_possible(tdt_max)
-            assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
-                "Thresholds in %s can't be expressed with type %s"
-                % (self.onnx_node.name, str(tdt))
+            assert np.vectorize(tdt.allowed)(
+                threshold_tensor
+            ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                self.onnx_node.name,
+                str(tdt),
             )
             self.set_nodeattr("accDataType", tdt.name)
         else:
@@ -110,9 +112,9 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def calc_wmem(self):
         """Calculates and returns WMEM."""
         ch = self.get_nodeattr("Channels")
-        k = self.get_nodeattr("Kernel")
+        k_h, k_w = self.get_nodeattr("Kernel")
         pe = self.get_nodeattr("PE")
-        wmem = k * k * ch // pe
+        wmem = k_h * k_w * ch // pe
         return wmem

     def calc_tmem(self):
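For intuition, the generalized WMEM computation with illustrative numbers (assumed values, not from the commit):

    # worked example: k_h=3, k_w=1, ch=4, pe=2
    # wmem = k_h * k_w * ch // pe = 3 * 1 * 4 // 2 = 6 weights per PE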
@@ -181,34 +183,34 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         return out_width

     def get_folded_input_shape(self):
-        k = self.get_nodeattr("Kernel")
-        sf = k * k
-        dim = self.get_nodeattr("Dim")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        sf = k_h * k_w
+        dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         nf = ch // pe
-        folded_input_shape = tuple([1, dim, dim, sf * nf, pe])
+        folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe])
         return folded_input_shape

     def get_folded_output_shape(self):
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         nf = ch // pe
-        dim = self.get_nodeattr("Dim")
-        folded_output_shape = tuple([1, dim, dim, nf, pe])
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        folded_output_shape = tuple([1, dim_h, dim_w, nf, pe])
         return folded_output_shape

     def get_normal_input_shape(self):
-        dim = self.get_nodeattr("Dim")
+        dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
-        k = self.get_nodeattr("Kernel")
-        normal_input_shape = tuple([1, dim, dim, k * k * ch])
+        k_h, k_w = self.get_nodeattr("Kernel")
+        normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch])
         return normal_input_shape

     def get_normal_output_shape(self):
         ch = self.get_nodeattr("Channels")
-        dim = self.get_nodeattr("Dim")
-        normal_output_shape = tuple([1, dim, dim, ch])
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        normal_output_shape = tuple([1, dim_h, dim_w, ch])
         return normal_output_shape

     def get_number_output_values(self):
@@ -218,13 +220,13 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         ch = self.get_nodeattr("Channels")
-        dim = self.get_nodeattr("Dim")
-        k = self.get_nodeattr("Kernel")
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        k_h, k_w = self.get_nodeattr("Kernel")
         # currently FINN only supports a batch size of 1 for vvau
         batch_size = 1
         # mmv != 1 is not supported yet, so mmv is fixed to 1 for now
         mmv = 1
-        exp_cycles = ((ch * k * k) / pe) * batch_size * (dim * dim) / mmv
+        exp_cycles = ((ch * k_h * k_w) / pe) * batch_size * (dim_h * dim_w) / mmv
         return int(exp_cycles)

     def get_template_param_values(self):
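A quick sanity check of the generalized cycle estimate, with illustrative values (not from the commit):

    # ch=4, k_h=3, k_w=1, pe=2, dim_h=10, dim_w=1, batch_size=1, mmv=1
    # exp_cycles = ((4 * 3 * 1) / 2) * 1 * (10 * 1) / 1 = 60 cycles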
@@ -251,17 +253,17 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
     def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
         pe = self.get_nodeattr("PE")
         ch = self.get_nodeattr("Channels")
-        k = self.get_nodeattr("Kernel")
+        k_h, k_w = self.get_nodeattr("Kernel")
         wmem = self.calc_wmem()
         assert orig_weight_matrix.shape == (
             ch,
             1,
-            k,
-            k,
+            k_h,
+            k_w,
         ), """Weights matrix doesn't
         have expected shape (channels, 1, kernel_size, kernel_size)"""
         ret = orig_weight_matrix
-        ret = ret.reshape(ch, k * k)
+        ret = ret.reshape(ch, k_h * k_w)
         # distribute rows between PEs
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         ret = ret.reshape(1, pe, wmem, 1)
@@ -338,9 +340,11 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
             threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
             # get computed threshold datatype from attribute
             tdt = DataType[self.get_nodeattr("accDataType")]
-            assert np.vectorize(tdt.allowed)(threshold_tensor).all(), (
-                "Thresholds in %s can't be expressed with type %s"
-                % (self.onnx_node.name, str(tdt))
+            assert np.vectorize(tdt.allowed)(
+                threshold_tensor
+            ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                self.onnx_node.name,
+                str(tdt),
             )
             thresholds_hls_code = numpy_to_hls_code(
                 threshold_tensor, tdt, "thresholds", False, True
@@ -455,10 +459,10 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
             self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']

     def defines(self, var):
-        dim = self.get_nodeattr("Dim")
-        numReps = 1 * dim * dim
-        kernel = self.get_nodeattr("Kernel")
-        innerProdDim = kernel * kernel
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        numReps = 1 * dim_h * dim_w
+        k_h, k_w = self.get_nodeattr("Kernel")
+        innerProdDim = k_h * k_w
         self.code_gen_dict["$DEFINES$"] = [
             """#define Channels1 {}\n #define InnerProdDim {}\n
             #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format(
@@ -664,8 +668,8 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         else:
             mult_luts = (2 * math.ceil((W + A) / 6) - 1) * (W + A)
         # accumulator
-        k = self.get_nodeattr("Kernel")
-        acc_bits = W + A + math.ceil(math.log(k * k, 2))
+        k_h, k_w = self.get_nodeattr("Kernel")
+        acc_bits = W + A + math.ceil(math.log(k_h * k_w, 2))
         acc_luts = acc_bits
         # thresholds and threshold comparators
         thr_luts = 0
@@ -694,20 +698,20 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         return int(mult_dsp)

     def get_op_and_param_counts(self):
-        k = self.get_nodeattr("Kernel")
+        k_h, k_w = self.get_nodeattr("Kernel")
         fm = self.get_nodeattr("Channels")
-        dim = self.get_nodeattr("Dim")
+        dim_h, dim_w = self.get_nodeattr("Dim")
         weight_bits = self.get_weight_datatype().bitwidth()
         inp_bits = self.get_input_datatype().bitwidth()
-        num_repetitions = int(dim * dim)
-        mac_count = k * k * fm * num_repetitions
+        num_repetitions = int(dim_h * dim_w)
+        mac_count = k_h * k_w * fm * num_repetitions
         # canonicalize op type: highest bitwidth operand first s.t.
         # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
         bw1 = min(inp_bits, weight_bits)
         bw2 = max(inp_bits, weight_bits)
         mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
         weight_param_type = "param_weight_%db" % (weight_bits)
-        weight_count = k * k * fm
+        weight_count = k_h * k_w * fm
         ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
         if self.get_nodeattr("noActivation") == 0:
             tdt = DataType[self.get_nodeattr("accDataType")]
(diff truncated)

New test file added by this commit:
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import pytest
import numpy as np
from onnx import TensorProto, helper
import finn.core.onnx_exec as oxe
from finn.core.datatype import DataType
from finn.core.modelwrapper import ModelWrapper
from finn.util.basic import gen_finn_dt_tensor
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.general import GiveUniqueNodeNames
from finn.custom_op.general.multithreshold import multithreshold
from finn.custom_op.registry import getCustomOp
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels):
    """Expands depthwise weights of shape (channels, 1, k_h, k_w) into the
    equivalent sparse matmul weight matrix of shape (k_h*k_w*channels, channels)."""
    W_sparse = np.zeros((channels, channels, k_h, k_w))
    for ch in range(channels):
        W_sparse[ch][ch] = W_conv[ch][0]
    W_conv = W_sparse.astype(np.float32)
    W_matmul = W_conv.transpose(0, 2, 3, 1)
    W_matmul = W_matmul.reshape(channels, channels * k_h * k_w)
    W_matmul = W_matmul.T
    return W_matmul
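Since a depthwise convolution processes each channel independently, the equivalent dense matmul weight matrix is block-diagonal: only the (ch, ch) kernel blocks are nonzero. A quick shape check under assumed sizes (2 channels, 3x1 kernel; illustrative, not part of the test):

    # depthwise weights: (channels, 1, k_h, k_w) = (2, 1, 3, 1)
    W = gen_finn_dt_tensor(DataType.INT4, (2, 1, 3, 1))
    W_matmul = _infer_sparse_weight_tensor(W, 3, 1, 2)
    assert W_matmul.shape == (2 * 3 * 1, 2)  # (k_h*k_w*channels, channels)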
def _calculate_dot_prod_range(dt_a, dt_b, length):
    """Returns the (min, max) values a dot product between two (un)signed
    vectors of types dt_a and dt_b with length elements can take."""
    # "length" instead of "len" to avoid shadowing the builtin
    min_prod = 2 ** 30
    max_prod = -(2 ** 30)
    for a_val in [dt_a.min(), dt_a.max()]:
        for b_val in [dt_b.min(), dt_b.max()]:
            prod = a_val * b_val * length
            if prod < min_prod:
                min_prod = prod
            if prod > max_prod:
                max_prod = prod
    return (min_prod, max_prod)
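For example (illustrative): for dt_a = UINT4 (0..15), dt_b = INT4 (-8..7) and a length of 9, the extreme candidate products are 15 * (-8) * 9 = -1080 and 15 * 7 * 9 = 945:

    # _calculate_dot_prod_range(DataType.UINT4, DataType.INT4, 9) -> (-1080, 945)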
def _make_single_vvau_modelwrapper(
    W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T=None, tdt=None
):
    in_shape = [1, dim_h, dim_w, k_h * k_w * channels]  # [N, H, W, K*K*CH]
    out_shape = [
        1,
        dim_h,
        dim_w,
        channels,
    ]  # [N, H, W, OFM_CH] (OFM_CH=IFM_CH because depthwise convolution)

    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, in_shape)
    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, out_shape)

    if T is not None:
        no_act = 0
        node_inp_list = ["inp", "weights", "thresh"]
        actval = odt.min()
    else:
        no_act = 1
        node_inp_list = ["inp", "weights"]
        actval = 0

    VVAU_node = helper.make_node(
        "Vector_Vector_Activate_Batch",
        node_inp_list,
        ["outp"],
        domain="finn.custom_op.fpgadataflow",
        backend="fpgadataflow",
        PE=pe,
        Dim=[dim_h, dim_w],
        Channels=channels,
        Kernel=[k_h, k_w],
        resType="lut",
        ActVal=actval,
        inputDataType=idt.name,
        weightDataType=wdt.name,
        outputDataType=odt.name,
        noActivation=no_act,
    )

    graph = helper.make_graph(
        nodes=[VVAU_node], name="vvau_graph", inputs=[inp], outputs=[outp]
    )

    model = helper.make_model(graph, producer_name="vvau-model")
    model = ModelWrapper(model)

    model.set_tensor_datatype("inp", idt)
    model.set_tensor_datatype("outp", odt)
    model.set_tensor_datatype("weights", wdt)

    model.set_initializer("weights", W)
    model.set_tensor_shape("weights", (channels, 1, k_h, k_w))

    if T is not None:
        model.set_tensor_datatype("thresh", tdt)
        model.set_initializer("thresh", T)

    return model
def prepare_inputs(input_tensor):
    return {"inp": input_tensor}
# input datatype
@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.UINT8])
# weight datatype
@pytest.mark.parametrize("wdt", [DataType.INT4])
# activation: None or DataType
@pytest.mark.parametrize("act", [DataType.UINT4, None])
# PE
@pytest.mark.parametrize("pe", [1, "channels"])
# Input image shape
@pytest.mark.parametrize("dim_h", [10])
@pytest.mark.parametrize("dim_w", [10, 1])
# Kernel shape
@pytest.mark.parametrize("k_h", [3])
@pytest.mark.parametrize("k_w", [3, 1])
# Number of input and output channels
@pytest.mark.parametrize("channels", [3, 4])
# execution mode
@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_vvau(
    idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, exec_mode
):
    if pe == "channels":
        pe = channels

    if dim_w == 1 and k_w != 1:
        pytest.skip("1D image requires 1D kernel, skipping.")

    if channels % pe != 0:
        pytest.skip("Requirement Channels divisible by PE is violated.")

    # Generate weights in expected shape for ONNX and HLS node
    W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))  # shape: [channels, 1, k_h, k_w]
    W_onnx = _infer_sparse_weight_tensor(
        W, k_h, k_w, channels
    )  # shape: [k_h*k_w*channels, channels]

    # Generate inputs in expected format for ONNX and HLS node
    x = gen_finn_dt_tensor(idt, (1, dim_h, dim_w, k_h * k_w * channels))
    x_vvau = x.reshape(1, dim_h, dim_w, k_h * k_w, channels // pe, pe)
    x_vvau = x_vvau.transpose(0, 1, 2, 4, 3, 5)
    x_vvau = x_vvau.reshape(1, dim_h, dim_w, channels * k_h * k_w)

    if act is None:
        T = None
        tdt = None
        odt = DataType.INT32
    else:
        odt = act
        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels)
        n_steps = act.get_num_possible_values() - 1
        T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
        T = np.sort(T, axis=1)
        tdt = DataType.INT32

    model = _make_single_vvau_modelwrapper(
        W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt
    )

    if exec_mode == "cppsim":
        model = model.transform(SetExecMode("cppsim"))
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode in test_fpgadataflow_vvau")

    input_dict = prepare_inputs(x_vvau)

    # Calculate expected output
    y_expected = np.matmul(x, W_onnx)  # Y is in [N, H, W, C] format
    if T is not None:
        # Reshape Y, as multithreshold expects Y to be in [N, C, H, W] format
        y_expected = np.transpose(y_expected, (0, 3, 1, 2))
        y_expected = multithreshold(y_expected, T)
        y_expected = np.transpose(y_expected, (0, 2, 3, 1))
        # signed offset
        y_expected += act.min()

    y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)[
        "outp"
    ]
    assert (y_produced == y_expected).all(), "incorrect result in %s" % exec_mode

    if exec_mode == "rtlsim":
        node = model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0