diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 27b23dd32835c265759a8cabfd2a3412844077ca..9d0a9ee5203a048d50d2832dd7155b7e224e954f 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -29,6 +29,7 @@ import math import numpy as np import os +import textwrap import warnings from qonnx.core.datatype import DataType from qonnx.util.basic import ( @@ -41,6 +42,7 @@ from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, + pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) @@ -67,6 +69,33 @@ class VectorVectorActivation(HLSCustomOp): "accDataType": ("s", False, "INT32"), # no-activation mode (produce accumulators) "noActivation": ("i", False, 0, {0, 1}), + # memory mode for the layer weights + # const -- embedded weights, default, long compile/synth times + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer + "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + # (mem_mode = decoupled only) whether weights will be writable through + # an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # FPGA resource type for memories in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -198,14 +227,23 @@ class VectorVectorActivation(HLSCustomOp): out_width = o_bits * self.get_nodeattr("PE") return out_width - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") sf = k_h * k_w dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") nf = ch // pe - folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple([1, sf * nf, pe]) + else: + raise Exception("Undefined input shape for requested input") + return folded_input_shape def get_folded_output_shape(self): @@ -319,43 +357,173 @@ class VectorVectorActivation(HLSCustomOp): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def generate_params(self, model, path): - # weights - weights = model.get_initializer(self.onnx_node.input[1]) + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights in appropriate format for this + layer. This file can be used for either synthesis or run-time reconfig + of weights. + + Arguments: + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) - wdt = self.get_weight_datatype() - code_gen_dir = path - - """Saves weights into params.h""" - weight_hls_code = numpy_to_hls_code(weight_tensor, wdt, "weights", True, True) - # write weights into params.h - f_weights = open("{}/params.h".format(code_gen_dir), "w") - - if wdt.bitwidth() != 1: - f_weights.write( - "const FixedPointWeights<1,{},{},{}> weights = ".format( - wdt.get_hls_datatype_str(), - self.get_nodeattr("PE"), - self.calc_wmem(), + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + if weight_file_mode == "hls_header": + weight_hls_code = numpy_to_hls_code( + weight_tensor, export_wdt, "weights", True, True + ) + # write weights into C++ header file as dictated by finn-hlslib + f_weights = open(weight_file_name, "w") + if export_wdt.bitwidth() != 1: + f_weights.write( + "const FixedPointWeights<1,{},{},{}> weights = ".format( + export_wdt.get_hls_datatype_str(), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) ) + else: + f_weights.write( + "const BinaryWeights<1,{},{}> weights = ".format( + self.get_nodeattr("PE"), + self.calc_wmem(), + ) + ) + f_weights.write(weight_hls_code) + f_weights.close() + elif "decoupled" in weight_file_mode: + # create a weight stream for various flavors of decoupled mode: + # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) + weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) + # reverse SIMD flip for saving weights in .npy + weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) + # PE flip for saving weights in .dat + weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # reshape weight tensor (simd_flipped and pe_flipped) to desired shape + pe = self.get_nodeattr("PE") + simd = 1 + # simd_flipped + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() + # flipped + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( + 1, -1, pe * simd ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + # add zeroes to pad out file to 1024 entries + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + weight_width_padded = words_per_memwidth * 32 + # first, pack and ensure padding to 32 bits + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + # split into groups of 8 hex digits (= 32 bits) + words_32b = textwrap.wrap(val, 8) + words_32b.reverse() + for word_32b in words_32b: + f.write(word_32b + "\n") + else: + raise Exception("Unknown weight_file_mode") + else: - f_weights.write( - "const BinaryWeights<1,{},{}> weights = ".format( - self.get_nodeattr("PE"), self.calc_wmem() + raise Exception("Unknown weight_file_mode") + + def generate_params(self, model, path): + mem_mode = self.get_nodeattr("mem_mode") + code_gen_dir = path + # weights, if not external + weights = model.get_initializer(self.onnx_node.input[1]) + if mem_mode == "const": + # save hlslib-compatible weights in params.h + weight_filename = "{}/params.h".format(code_gen_dir) + self.make_weight_file(weights, "hls_header", weight_filename) + elif mem_mode == "decoupled" or mem_mode == "external": + weight_filename_sim = "{}/weights.npy".format(code_gen_dir) + # save decoupled weights for cppsim + self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) + if mem_mode == "decoupled": + # also save weights as Verilog .dat file + # note that we provide two different .dat files, one for synth + # and one for synthesis. this is because URAM-based weights always + # need zero weights for synthesis, otherwise they get inferred + # as BRAM + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( + code_gen_dir + ) + weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) + # sim weights are always the true weights + self.make_weight_file( + weights, "decoupled_verilog_dat", weight_filename_rtl_sim + ) + ram_style = self.get_nodeattr("ram_style") + if ram_style == "ultra": + # UltraRAM must have no memory initializer, or only zeroes + # otherwise BRAM will be inferred instead of URAM + # as a workaround we provide a zero-weight init here + synth_weights = np.zeros_like(weights, dtype=np.float32) + else: + synth_weights = weights + self.make_weight_file( + synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) - f_weights.write(weight_hls_code) - f_weights.close() # save thresholds in thresh.h if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + # use UINT32 threshold export for bipolar times bipolar + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) # get computed threshold datatype from attribute tdt = DataType[self.get_nodeattr("accDataType")] + assert np.vectorize(tdt.allowed)( threshold_tensor ).all(), "Thresholds in %s can't be expressed with type %s" % ( @@ -368,8 +536,11 @@ class VectorVectorActivation(HLSCustomOp): # write thresholds into thresh.h f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") tdt_hls = tdt.get_hls_datatype_str() - odt = self.get_output_datatype() - odt_hls = odt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType["BIPOLAR"]: + export_odt = DataType["BINARY"] + odt_hls = export_odt.get_hls_datatype_str() f_thresh.write( "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ = ".format( @@ -387,6 +558,7 @@ class VectorVectorActivation(HLSCustomOp): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node # TODO ensure codegen dir exists @@ -440,7 +612,26 @@ class VectorVectorActivation(HLSCustomOp): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) - output = self.rtlsim(sim, inp) + + if mem_mode == "external" or mem_mode == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = 1 + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -466,6 +657,12 @@ class VectorVectorActivation(HLSCustomOp): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) if self.calc_tmem() != 0: self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] @@ -474,6 +671,8 @@ class VectorVectorActivation(HLSCustomOp): numReps = 1 * dim_h * dim_w k_h, k_w = self.get_nodeattr("Kernel") innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$DEFINES$"] = [ """#define Channels1 {}\n #define InnerProdDim {}\n #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format( @@ -483,6 +682,11 @@ class VectorVectorActivation(HLSCustomOp): numReps, ) ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append( + "#define WP1 {}\n".format(wdt.bitwidth()) + ) def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -500,7 +704,23 @@ class VectorVectorActivation(HLSCustomOp): % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) ) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) @@ -508,8 +728,15 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) ) + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> weights ("weights");'.format( + self.get_weightstream_width() + ) + ) def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") map_to_hls_mult_style = { "auto": "ap_resource_dflt()", "lut": "ap_resource_lut()", @@ -521,16 +748,42 @@ class VectorVectorActivation(HLSCustomOp): threshs = "PassThroughActivation<%s>()" % odtype_hls_str else: threshs = "threshs" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}> - (in0, out, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], + + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}> + (in0, out, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}, {}> + (in0, out, weights, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" ) - ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -561,17 +814,38 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream<ap_uint<{}>> &in0, - hls::stream<ap_uint<{}>> &out - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_outstream_width(), + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_outstream_width(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &weights, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_weightstream_width(), + self.get_outstream_width(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" ) - ] def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") self.code_gen_dict["$PRAGMAS$"] = [ "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() ] @@ -593,12 +867,30 @@ class VectorVectorActivation(HLSCustomOp): "#pragma HLS INTERFACE ap_ctrl_none port=return" ) - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint<ch*prec> [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint<ch*prec> [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=weights.m_weights " + "complete dim=1" + ) + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights name=weights_" + + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS stream depth=8 variable=weights" + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: # TODO find a better way of checking for no pregenerated thresholds self.code_gen_dict["$PRAGMAS$"].append( @@ -614,6 +906,157 @@ class VectorVectorActivation(HLSCustomOp): ) ) + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append( + ("weights_" + sname, self.get_weightstream_width_padded()) + ) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if self.get_nodeattr("ram_style") == "ultra": + assert ( + runtime_writable == 1 + ), "Layer with URAM weights must have runtime_writeable_weights=1" + node_name = self.onnx_node.name + sname = self.hls_sname() + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.NSTREAMS {1} " + "CONFIG.MEM_DEPTH {%d} " + "CONFIG.MEM_WIDTH {%d} " + "CONFIG.MEM_INIT {%s} " + "CONFIG.RAM_STYLE {%s} " + "CONFIG.STRM0_DEPTH {%d} " + "CONFIG.STRM0_WIDTH {%d} " + "CONFIG.STRM0_OFFSET {0} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_wmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("ram_style"), + self.calc_wmem(), + self.get_weightstream_width_padded(), + node_name, + strm_inst, + ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "const" or mem_mode == "external": + # base class impl sufficient for const/external modes + return super().code_generation_ipi() + else: + raise Exception("Unrecognized mem_mode for VectorVectorActivation") + return cmd + + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = 1 + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier + def bram_estimation(self): """Calculates resource estimation for BRAM""" # TODO add in/out FIFO contributions @@ -624,7 +1067,13 @@ class VectorVectorActivation(HLSCustomOp): # assuming SDP mode RAMB18s (see UG573 Table 1-10) # since this is HLS memory, not using the full width of a BRAM # assuming memories up to 128 deep get implemented in LUTs - if self.calc_wmem() <= 128: + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): return 0 if W == 1: @@ -671,8 +1120,12 @@ class VectorVectorActivation(HLSCustomOp): c0 = 300 c1 = 1.1 c2 = 0 - if self.calc_wmem() <= 128: - c2 = P * W * math.ceil(self.calc_wmem() / 64) + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * W) * math.ceil(self.calc_wmem() / 64) # multiplication res_type = self.get_nodeattr("resType") @@ -710,6 +1163,19 @@ class VectorVectorActivation(HLSCustomOp): mult_dsp = 0 return int(mult_dsp) + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = pe * wp + return w_width + else: + return 0 + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") fm = self.get_nodeattr("Channels")