diff --git a/fetch-repos.sh b/fetch-repos.sh
index 86a2176c7549ae1debcae6365137ac30374de7cb..1e01a058ff368dfe227fdf33bd2d2866aa4f13c9 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -32,7 +32,7 @@ FINN_EXP_COMMIT="0aa7e1c44b20cf085b6fe42cff360f0a832afd2c"
 BREVITAS_COMMIT="c65f9c13dc124971f14739349531bbcda5c2a4aa"
 PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="4ddfa00b07275a3f1de1c13409e6acb489115fe2"
+HLSLIB_COMMIT="c17aa478ae574971d115afa9fa4d9c215857d1ac"
 OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 40f625093b62c6f18282066d018a08ed2e587c81..d6285a6f699c774d18998c4c7426ac52362e9dec 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -709,13 +709,6 @@ class MatrixVectorActivation(HLSCustomOp):
         # ensure all thresholds are integer
         assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
         ret = orig_thres_matrix
-        # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0 and n_thres_steps == 1:
-            ret = np.copy(ret)
-            ret[0][0] = 1
-            warnings.warn(
-                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
-            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index ce8c31ee9a6cf335dadacff95cf3dbd5cd7590f7..292f70941a3c18a594d904eb4922dc4bce550d82 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -319,13 +319,6 @@ class Thresholding_Batch(HLSCustomOp):
             np.mod(orig_thres_matrix, 1), 0
         ).all(), "Need int threshold tensor"
         ret = orig_thres_matrix
-        # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0 and n_thres_steps == 1:
-            ret = np.copy(ret)
-            ret[0][0] = 1
-            warnings.warn(
-                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
-            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))
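The HLSLIB_COMMIT bump above pulls in the hlslib-side fix for the vivado_hls thresholding bug, which is what makes the 0-valued-first-threshold patch-up removable here and in the VVAU below. The removed workaround silently changed the layer's quantization function; a minimal numpy sketch with illustrative values:

```python
import numpy as np

# The removed workaround rewrote a 0-valued first threshold to 1 whenever
# there was a single threshold step -- but "x >= 0" and "x >= 1" classify
# x == 0 differently, so the patched layer computed a different function.
T = np.zeros((1, 1), dtype=np.float32)  # one channel, one threshold step
patched = np.copy(T)
if patched[0][0] == 0 and T.shape[1] == 1:
    patched[0][0] = 1
x = 0
assert (x >= T[0][0]) != (x >= patched[0][0])
```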
self.get_nodeattr("PE") + in_width = i_bits * simd * pe return in_width def get_outstream_width(self, ind=0): @@ -260,15 +264,21 @@ class VectorVectorActivation(HLSCustomOp): def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") - sf = k_h * k_w dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") + simd = self.get_nodeattr("SIMD") pe = self.get_nodeattr("PE") + kernel_2 = k_h * k_w + assert ( + kernel_2 % simd == 0 + ), "Requirement kernel (k_h * k_w) divisable by SIMD is violated." + sf = kernel_2 // simd + assert ch % pe == 0, "Requirement Channels divisable by PE is violated." nf = ch // pe if ind == 0: # calculate shape of input 0 - folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) + folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe]) elif ind == 1 and self.get_nodeattr("mem_mode") == "external": # calculate shape of input 1 (weights) folded_input_shape = tuple([1, sf * nf, pe]) @@ -304,6 +314,7 @@ class VectorVectorActivation(HLSCustomOp): def get_exp_cycles(self): pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") ch = self.get_nodeattr("Channels") dim_h, dim_w = self.get_nodeattr("Dim") k_h, k_w = self.get_nodeattr("Kernel") @@ -311,7 +322,7 @@ class VectorVectorActivation(HLSCustomOp): batch_size = 1 # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 - exp_cycles = ((ch * k_h * k_w) / pe) * batch_size * (dim_h * dim_w) / mmv + exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv return int(exp_cycles) def get_template_param_values(self): @@ -355,6 +366,7 @@ class VectorVectorActivation(HLSCustomOp): def get_hls_compatible_weight_tensor(self, orig_weight_matrix): pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") ch = self.get_nodeattr("Channels") k_h, k_w = self.get_nodeattr("Kernel") wmem = self.calc_wmem() @@ -366,10 +378,13 @@ class VectorVectorActivation(HLSCustomOp): ), """Weights matrix doesn't have expected shape (channels, 1, kernel_size, kernel_size)""" ret = orig_weight_matrix + if self.get_weight_datatype() == DataType["BIPOLAR"]: + # convert bipolar to binary + ret = (ret + 1) / 2 ret = ret.reshape(ch, k_h * k_w) # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - ret = ret.reshape(1, pe, wmem, 1) + ret = ret.reshape(1, pe, wmem, simd) return ret def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): @@ -403,13 +418,6 @@ class VectorVectorActivation(HLSCustomOp): # ensure all thresholds are integer assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() ret = orig_thres_matrix - # workaround for vivado_hls threshold bug - if ret[0][0] == 0 and n_thres_steps == 1: - ret = np.copy(ret) - ret[0][0] = 1 - warnings.warn( - "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" - ) # ensure channels = mh , duplicating if necessary if ret.shape[0] == 1: ret = np.tile(ret, (ch, 1)) @@ -460,7 +468,8 @@ class VectorVectorActivation(HLSCustomOp): f_weights = open(weight_file_name, "w") if export_wdt.bitwidth() != 1: f_weights.write( - "const FixedPointWeights<1,{},{},{}> weights = ".format( + "const FixedPointWeights<{},{},{},{}> weights = ".format( + self.get_nodeattr("SIMD"), export_wdt.get_hls_datatype_str(), self.get_nodeattr("PE"), self.calc_wmem(), @@ -468,7 +477,8 @@ class VectorVectorActivation(HLSCustomOp): ) else: f_weights.write( - "const BinaryWeights<1,{},{}> weights = ".format( + "const BinaryWeights<{},{},{}> weights = ".format( + 
self.get_nodeattr("SIMD"), self.get_nodeattr("PE"), self.calc_wmem(), ) @@ -485,7 +495,7 @@ class VectorVectorActivation(HLSCustomOp): weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) # reshape weight tensor (simd_flipped and pe_flipped) to desired shape pe = self.get_nodeattr("PE") - simd = 1 + simd = self.get_nodeattr("SIMD") # simd_flipped weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( 1, -1, pe * simd @@ -664,6 +674,12 @@ class VectorVectorActivation(HLSCustomOp): not float32 as expected.""" expected_inp_shape = self.get_folded_input_shape() reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() # make copy before saving the array reshaped_input = reshaped_input.copy() np.save( @@ -679,14 +695,20 @@ class VectorVectorActivation(HLSCustomOp): super().exec_precompiled_singlenode_model() # load output npy file super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out assert ( context[node.output[0]].shape == self.get_normal_output_shape() ), "cppsim did not produce expected output shape" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() - idt = self.get_input_datatype() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits) + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) super().reset_rtlsim(sim) super().toggle_clk(sim) @@ -754,9 +776,10 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$DEFINES$"] = [ """#define Channels1 {}\n #define InnerProdDim {}\n - #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format( + #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( self.get_nodeattr("Channels"), innerProdDim, + self.get_nodeattr("SIMD"), self.get_nodeattr("PE"), numReps, ) @@ -770,6 +793,9 @@ class VectorVectorActivation(HLSCustomOp): def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] elem_bits = dtype.bitwidth() packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits @@ -867,6 +893,9 @@ class VectorVectorActivation(HLSCustomOp): def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] elem_bits = dtype.bitwidth() packed_bits = self.get_outstream_width() packed_hls_type = "ap_uint<%d>" % packed_bits @@ -1108,7 +1137,7 @@ class VectorVectorActivation(HLSCustomOp): def uram_estimation(self): P = self.get_nodeattr("PE") - Q = 1 + Q = self.get_nodeattr("SIMD") wdt = self.get_weight_datatype() W = wdt.bitwidth() omega = self.calc_wmem() @@ -1117,7 +1146,7 @@ class VectorVectorActivation(HLSCustomOp): mstyle = self.get_nodeattr("ram_style") if ( (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "const") or (mmode == "external") ): return 0 @@ -1129,9 +1158,11 @@ class VectorVectorActivation(HLSCustomOp): """Calculates 
resource estimation for BRAM""" # TODO add in/out FIFO contributions P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") wdt = self.get_weight_datatype() W = wdt.bitwidth() omega = self.calc_wmem() + mem_width = Q * W * P # assuming SDP mode RAMB18s (see UG573 Table 1-10) # since this is HLS memory, not using the full width of a BRAM # assuming memories up to 128 deep get implemented in LUTs @@ -1139,23 +1170,24 @@ class VectorVectorActivation(HLSCustomOp): mstyle = self.get_nodeattr("ram_style") if ( (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mstyle == "auto" and self.calc_wmem() <= 128) or (mmode == "const" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 - if W == 1: - return math.ceil(omega / 16384) * P - elif W == 2: - return math.ceil(omega / 8192) * P - elif W <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(W / 4)) * P - elif W <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(W / 8)) * P - elif W <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(W / 16)) * P + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) else: - return (math.ceil(omega / 512)) * (math.ceil(W / 32)) * P + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) def bram_efficiency_estimation(self): P = self.get_nodeattr("PE") @@ -1179,6 +1211,7 @@ class VectorVectorActivation(HLSCustomOp): """ # TODO add in/out FIFO contributions P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") wdt = self.get_weight_datatype() W = wdt.bitwidth() # determine tdt with input and weight data types @@ -1193,14 +1226,16 @@ class VectorVectorActivation(HLSCustomOp): if (mmode == "decoupled" and mstyle == "distributed") or ( mmode == "const" and self.calc_wmem() <= 128 ): - c2 = (P * W) * math.ceil(self.calc_wmem() / 64) + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) # multiplication res_type = self.get_nodeattr("resType") if res_type == "dsp": mult_luts = 0 else: - mult_luts = (2 * math.ceil((W + A) / 6) - 1) * (W + A) + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) # accumulator acc_datatype = self.get_accumulator_datatype() acc_bits = acc_datatype.bitwidth() @@ -1223,10 +1258,14 @@ class VectorVectorActivation(HLSCustomOp): if noact == 0: odt = self.get_output_datatype() B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) + thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 comp_luts = (2**B - 1) * acc_bits - return int(c0 + c1 * (P * (mult_luts + acc_luts + thr_luts + comp_luts)) + c2) + return int( + c0 + + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + + c2 + ) def dsp_estimation(self): # multiplication @@ -1248,9 +1287,10 @@ class VectorVectorActivation(HLSCustomOp): self.get_nodeattr("mem_mode") == "decoupled" or self.get_nodeattr("mem_mode") == "external" ): + simd = self.get_nodeattr("SIMD") pe = self.get_nodeattr("PE") wp = self.get_weight_datatype().bitwidth() - w_width = pe * wp + w_width = simd * pe * wp return w_width else: return 0 diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py index 
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 797dad32a2cfeb3e00e224f264d91b5ee0e9247b..3602b1bdd5d013ee8ce2f6cf156490478f0cc74e 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -220,7 +220,7 @@ def unpack_innermost_dim_from_hex_string(
     if conv_dtype == DataType["BIPOLAR"]:
         ar_list = [2 * x - 1 for x in ar_list]
     # interpret values as signed values
-    elif conv_dtype.name.startswith("INT"):
+    elif dtype.signed():
         mask = 2 ** (conv_dtype.bitwidth() - 1)
         ar_list = [-(x & mask) + (x & ~mask) for x in ar_list]

diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 96cd69c3453793c1634f132cb159f0cc8a94a28c..445afdf458c0e99d4a4dcc4eeb2f6fb305dbac4a 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -132,10 +132,6 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
     odt = act
     n_steps = act.get_num_possible_values() - 1
     T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
-    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
-    # threshold of first channel is zero, while using BIPOLAR output)
-    if act == DataType["BIPOLAR"]:
-        T[0][0] = 0
     # provide non-decreasing thresholds
     T = np.sort(T, axis=1)
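The data_packing change replaces name matching ("INT...") with a query of the datatype itself, so every signed conv_dtype takes the two's-complement reinterpretation path. The mask expression it relies on splits the raw unsigned word into sign bit and remainder; a standalone sketch:

```python
# Two's-complement reinterpretation used above: for a b-bit raw value x,
# subtract the weight of the sign bit if it is set, keep the rest as-is.
def to_signed(x, bits):
    mask = 2 ** (bits - 1)
    return -(x & mask) + (x & ~mask)

assert to_signed(0b1111, 4) == -1
assert to_signed(0b1000, 4) == -8
assert to_signed(0b0111, 4) == 7
```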
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
index abf8ba0b9efde67c77711abc8451475887430cae..be1ada59a1ef50cf8d1c9ea26b31ce4956d9f3db 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -35,12 +35,17 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model

 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+    MinimizeAccumulatorWidth,
+)
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
@@ -77,6 +82,7 @@ def _calculate_dot_prod_range(dt_a, dt_b, len):
 def _make_single_vvau_modelwrapper(
     W,
     pe,
+    simd,
     k_h,
     k_w,
     channels,
@@ -103,7 +109,10 @@
     if T is not None:
         no_act = 0
         node_inp_list = ["inp", "weights", "thresh"]
-        actval = odt.min()
+        if odt == DataType["BIPOLAR"]:
+            actval = 0
+        else:
+            actval = odt.min()
     else:
         no_act = 1
         node_inp_list = ["inp", "weights"]
@@ -116,6 +125,7 @@
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         PE=pe,
+        SIMD=simd,
         Dim=[dim_h, dim_w],
         Channels=channels,
         Kernel=[k_h, k_w],
@@ -146,6 +156,11 @@
     model.set_tensor_datatype("thresh", tdt)
     model.set_initializer("thresh", T)

+    # Minimize accumulator width to obtain realistic HLS reports
+    model = model.transform(MinimizeAccumulatorWidth())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
     return model


@@ -154,13 +169,15 @@ def prepare_inputs(input_tensor):


 # input datatype
-@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType["INT4"]])
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["UINT4"]])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [DataType["UINT4"], None])
+@pytest.mark.parametrize("act", [DataType["BIPOLAR"], DataType["UINT4"], None])
 # PE
-@pytest.mark.parametrize("pe", [1, "channels"])
+@pytest.mark.parametrize("pe", [1, 3, 6])
+# SIMD
+@pytest.mark.parametrize("simd", [1, 9])
 # Input image shape
 @pytest.mark.parametrize("dim_h", [10])
 @pytest.mark.parametrize("dim_w", [10, 1])
@@ -168,7 +185,7 @@ # kernel size
 @pytest.mark.parametrize("k_h", [3])
 @pytest.mark.parametrize("k_w", [3, 1])
 # Number of input and output channels
-@pytest.mark.parametrize("channels", [3, 4])
+@pytest.mark.parametrize("channels", [3, 6])
 # memory mode
 @pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # execution mode
@@ -177,17 +194,17 @@
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_vvau(
-    idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode
+    idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode
 ):
-    if pe == "channels":
-        pe = channels
-
     if dim_w == 1 and k_w != 1:
         pytest.skip("1D image requires 1D kernel, skipping.")

     if channels % pe != 0:
         pytest.skip("Requirement Channels divisable by PE is violated.")

+    if (k_h * k_w) % simd != 0:
+        pytest.skip("Requirement kernel (k_h * k_w) divisable by SIMD is violated.")
+
     # Generate weights in expected shape for ONNX and HLS node
     W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))  # shape: [channels, 1, k, k]
     W_onnx = _infer_sparse_weight_tensor(
@@ -203,17 +220,26 @@
     if act is None:
         T = None
         tdt = None
-        odt = DataType["INT32"]
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
+        else:
+            odt = DataType["INT32"]
     else:
         odt = act
-        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels)
+        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w)
         n_steps = act.get_num_possible_values() - 1
         T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
         T = np.sort(T, axis=1)
-        tdt = DataType["INT32"]
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
+            # bias thresholds to be positive
+            T = np.ceil((T + (k_h * k_w)) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType["INT32"]

     model = _make_single_vvau_modelwrapper(
-        W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
+        W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
     )

     if exec_mode == "cppsim":
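The UINT32 threshold biasing above follows the XNOR-popcount identity: for bipolar vectors of length N = k_h * k_w, the +/-1 dot product d and the binary popcount accumulation p satisfy d = 2p - N, so thresholds move into the non-negative range via T_bin = ceil((T + N) / 2). A quick numpy check of the identity:

```python
import numpy as np

# XNOR-popcount identity: with p = number of positions where a == b,
# the bipolar dot product is d = p - (N - p) = 2p - N.
rng = np.random.default_rng(0)
N = 9  # k_h * k_w for a 3x3 kernel
a = rng.choice([-1, 1], N)
b = rng.choice([-1, 1], N)
d = np.dot(a, b)
# count matches: both +1 plus both -1
p = np.dot((a + 1) / 2, (b + 1) / 2) + np.dot((1 - a) / 2, (1 - b) / 2)
assert d == 2 * p - N
```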
@@ -232,20 +258,31 @@
     input_dict = prepare_inputs(x_vvau)

     # Calculate output
-    y_expected = np.matmul(x, W_onnx)  # Y is in [N, H, W, C] format
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        # Simulate XNOR-popcount matrix multiplication, see
+        # qonnx.custom_op.general.xnorpopcount (not usable due to sparse W)
+        y_expected = np.matmul(x, W_onnx)
+        y_expected = (y_expected + (k_h * k_w)) / 2
+    else:
+        y_expected = np.matmul(x, W_onnx)  # Y is in [N, H, W, C] format
+
     if T is not None:
         # Reshape Y, as multithreshold expects Y to be in [N, C, H, W] format
         y_expected = np.transpose(y_expected, (0, 3, 1, 2))
         y_expected = multithreshold(y_expected, T)
         y_expected = np.transpose(y_expected, (0, 2, 3, 1))
-        # signed offset
-        y_expected += act.min()
+        if act == DataType["BIPOLAR"]:
+            # binary to bipolar
+            y_expected = 2 * y_expected - 1
+        else:
+            # signed offset
+            y_expected += act.min()

     y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)[
         "outp"
     ]
-    assert (y_produced == y_expected).all(), "cppsim failed"
+    assert (y_produced == y_expected).all(), "incorrect result"

     if exec_mode == "rtlsim":
         node = model.get_nodes_by_op_type("VectorVectorActivation")[0]
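For reference, a hypothetical snippet showing how the new attribute is driven from Python, mirroring what the updated test exercises (the model variable and folding values are assumed, not taken from the diff):

```python
from qonnx.custom_op.registry import getCustomOp

# Hypothetical usage: "model" is a ModelWrapper containing a VVAU node.
vvau = getCustomOp(model.get_nodes_by_op_type("VectorVectorActivation")[0])
vvau.set_nodeattr("PE", 3)    # must divide Channels
vvau.set_nodeattr("SIMD", 9)  # must divide k_h * k_w
print(vvau.get_folded_input_shape())  # innermost dim is now SIMD * PE
print(vvau.get_exp_cycles())          # scales down by the SIMD factor
```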