diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 67cce8675681be47036ffaf3a3428b8c43284215..a0ca34ed0a6838dfe9c680cc6c16961ac7f897ed 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -354,3 +354,56 @@ $LAYER_NAME$ endmodule """ + +decoupled_thresholding_template = """ +template < + unsigned ImgDim, unsigned NumChannels, unsigned PE, + typename TSrcI = Identity, typename TDstI = Identity, + int ActVal=0, typename TT, unsigned int NumSteps, + typename TI, typename TO> +void Thresholding_Stream_Batch(hls::stream<TI> &in, + hls::stream<TO> &out, + hls::stream<ap_uint<PE*NumSteps*TT::width>> &weight, + int const reps) +{ + + // how many different rows each neuron will compute + // alternatively: number of vertical matrix chunks + unsigned const NF = NumChannels / PE; + + ThresholdsActivation<1, PE, NumSteps, TT, TO, ActVal, std::less_equal<TT>> internal_thr; + #pragma HLS ARRAY_PARTITION variable=internal_thr.m_thresholds complete dim=0 + + // everything merged into a common iteration space (one "big" loop instead + // of smaller nested loops) to get the pipelinening the way we want + for (unsigned i = 0; i < reps * ImgDim * ImgDim * NF; i++) + { + #pragma HLS PIPELINE II=1 + + ap_uint<PE*NumSteps*TT::width> packed_thr; + packed_thr = weight.read(); + // slicer to get 1 PE's worth of thresholds + auto const pe_slicer = Slice<ap_uint<NumSteps*TT::width>>()(packed_thr); + + TI inElem; + inElem = in.read(); + auto outElem = TDstI().template operator()<TO>(); + + for (unsigned pe = 0; pe < PE; pe++) + { +#pragma HLS UNROLL + // slicer to get individual thresholds + auto const thr_slicer = Slice<TT>()(pe_slicer(pe, 0)); + for (unsigned nt = 0; nt < NumSteps; nt++) + { + #pragma HLS UNROLL + internal_thr.m_thresholds[pe][0][nt] = thr_slicer(nt, 0); + } + + auto const act = TSrcI()(inElem); + outElem(pe,0,1) = internal_thr.activate(0, pe, act(pe,0)); + } + out.write(outElem); + } +} +""" diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index 2429bf6190f822fb4a6c988fcbb34152d5a338e0..ccb065f62a8340b916bfa5f6cf96c23c65d19d12 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -26,7 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from math import ceil +from math import ceil, log2 +import textwrap import os import numpy as np @@ -34,11 +35,15 @@ import numpy as np from onnx import TensorProto, helper from finn.core.datatype import DataType from finn.custom_op.fpgadataflow import HLSCustomOp -from finn.util.basic import interleave_matrix_outer_dim_from_partitions +from finn.util.basic import ( + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, rtlsim_output_to_npy, + pack_innermost_dim_as_hex_string, ) from . 
import templates @@ -58,12 +63,17 @@ class Thresholding_Batch(HLSCustomOp): def get_nodeattr_types(self): my_attrs = { + # parallelization; channels thresholded per cycle "PE": ("i", True, 0), + # number of channels (each may have different thresholds) "NumChannels": ("i", True, 0), + # number of steps in thresholding function + "numSteps": ("i", True, 1), # string defining memory type "ram_style": ("s", False, "distributed"), - # FINN DataTypes for inputs, weights, outputs + # FINN DataTypes for inputs, outputs "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), "outputDataType": ("s", True, ""), # input and output FIFO depths "inFIFODepth": ("i", False, 0), @@ -75,6 +85,20 @@ class Thresholding_Batch(HLSCustomOp): "numInputVectors": ("ints", False, [1]), # initialization value for the thresholding accumulator "ActVal": ("i", False, 0), + # memory mode for the thresholds + # const -- embedded thresholds, default + # decoupled -- streaming thresholds with streamer packaged inside IP + "mem_mode": ("s", False, "const"), + # (mem_mode = decoupled only) whether weights (thresholds) will be + # writable through an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. + "runtime_writeable_weights": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -183,6 +207,34 @@ class Thresholding_Batch(HLSCustomOp): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] + def get_weight_datatype(self): + """Returns FINN DataType of thresholds, here called weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def minimize_accumulator_width(self, model): + "Minimize threshold width ('accumulator width' here due to convention)" + thresholds = model.get_initializer(self.onnx_node.input[1]) + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + min_input = self.get_input_datatype().min() + max_input = self.get_input_datatype().max() + # get range required by threshold values + tdt_min = min(min_input, min_threshold) + tdt_max = max(max_input, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: + tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(0 - tdt_max - 1) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + self.set_nodeattr("weightDataType", tdt.name) + return DataType[self.get_nodeattr("weightDataType")] + def get_instream_width(self): i_bits = self.get_input_datatype().bitwidth() return i_bits * self.get_nodeattr("PE") @@ -191,6 +243,28 @@ class Thresholding_Batch(HLSCustomOp): o_bits = self.get_output_datatype().bitwidth() return o_bits * self.get_nodeattr("PE") + def get_weightstream_width(self): + """Returns weight stream width. 
Used only in decoupled mode.""" + if self.get_nodeattr("mem_mode") == "decoupled": + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + n_thres_steps = self.get_nodeattr("numSteps") + w_width = pe * wp * n_thres_steps + return w_width + else: + return 0 + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + + def get_ap_int_max_w(self): + temp_value = super().get_ap_int_max_w() + weightstream = self.get_weightstream_width() + return max([weightstream, temp_value]) + def get_folded_input_shape(self): ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") @@ -251,6 +325,9 @@ class Thresholding_Batch(HLSCustomOp): ), """Threshold matrix dimension is not as expected (2).""" n_thres_steps = orig_thres_matrix.shape[1] + assert n_thres_steps == self.get_nodeattr( + "numSteps" + ), "Mismatch in threshold steps" if not self.get_input_datatype().signed(): # ensure all thresholds are nonnegative assert (orig_thres_matrix >= 0).all() @@ -279,56 +356,126 @@ class Thresholding_Batch(HLSCustomOp): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def generate_params(self, model, path): - code_gen_dir = path - # save thresholds in thresh.h - thresholds = model.get_initializer(self.onnx_node.input[1]) - - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights (thresholds) in appropriate + format for this layer. This file can be used for either synthesis or + run-time reconfig of weights. 
- min_threshold = thresholds.min() - max_threshold = thresholds.max() - min_input = self.get_input_datatype().min() - max_input = self.get_input_datatype().max() - # get range required by threshold values - tdt_min = min(min_input, min_threshold) - tdt_max = max(max_input, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(0 - tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) + Arguments: + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + """ + threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) + tdt = self.get_weight_datatype() assert np.vectorize(tdt.allowed)( threshold_tensor ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + if weight_file_mode == "hls_header": + # save thresholds in thresh.h + thresholds_hls_code = numpy_to_hls_code( + threshold_tensor, tdt, "thresholds", False, True + ) + # write thresholds into thresh.h + f_thresh = open(weight_file_name, "w") + tdt_hls = tdt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType.BIPOLAR: + export_odt = DataType.BINARY + odt_hls = export_odt.get_hls_datatype_str() + f_thresh.write( + "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ + = ".format( + self.calc_tmem(), + self.get_nodeattr("PE"), + threshold_tensor.shape[-1], + tdt_hls, + odt_hls, + self.get_nodeattr("ActVal"), + "std::less_equal<%s>" % tdt_hls, + ) + ) + f_thresh.write(thresholds_hls_code) + f_thresh.close() + elif "decoupled" in weight_file_mode: + # streaming thresholds need to be organized differently + # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps) + decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3)) + # TODO add flips/reversals as needed here + # (1, tmem, pe, n_thres_steps) -(1, tmem, pe * n_thres_steps) + pe = self.get_nodeattr("PE") + n_thres_steps = self.get_nodeattr("numSteps") + decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2) + decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps) + decoupled_thres = decoupled_thres.copy() + decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape( + 1, -1, pe * n_thres_steps + ) + decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy() + + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, decoupled_thres) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + words_per_memwidth = 2 ** ceil(log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + 
weight_width_padded = words_per_memwidth * 32 + # first, pack and ensure padding to 32 bits + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + # split into groups of 8 hex digits (= 32 bits) + words_32b = textwrap.wrap(val, 8) + words_32b.reverse() + for word_32b in words_32b: + f.write(word_32b + "\n") + else: + raise Exception("Decoupled weight export not yet implemented") + else: + raise Exception("Unknown weight_file_mode") - thresholds_hls_code = numpy_to_hls_code( - threshold_tensor, tdt, "thresholds", False, True - ) - # write thresholds into thresh.h - f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") - tdt_hls = tdt.get_hls_datatype_str() - # use binary to export bipolar activations - export_odt = self.get_output_datatype() - if self.get_output_datatype() == DataType.BIPOLAR: - export_odt = DataType.BINARY - odt_hls = export_odt.get_hls_datatype_str() - f_thresh.write( - "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ - = ".format( - self.calc_tmem(), - self.get_nodeattr("PE"), - threshold_tensor.shape[-1], - tdt_hls, - odt_hls, - self.get_nodeattr("ActVal"), - "std::less_equal<%s>" % tdt_hls, + def generate_params(self, model, path): + code_gen_dir = path + thresholds = model.get_initializer(self.onnx_node.input[1]) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + # save thresholds in thresh.h + weight_filename = "{}/thresh.h".format(code_gen_dir) + self.make_weight_file(thresholds, "hls_header", weight_filename) + elif mem_mode == "decoupled": + # save decoupled weights for cppsim + weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) + self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) + # also save weights as Verilog .dat file + weight_filename_rtl = "{}/memblock_0.dat".format(code_gen_dir) + self.make_weight_file( + thresholds, "decoupled_verilog_dat", weight_filename_rtl ) - ) - f_thresh.write(thresholds_hls_code) - f_thresh.close() + else: + raise Exception("Unrecognized mem_mode") def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") @@ -373,7 +520,7 @@ class Thresholding_Batch(HLSCustomOp): reshaped_input, ) elif in_ind > 2: - raise Exception("Unexpected input found for StreamingFCLayer") + raise Exception("Unexpected input found for Thresholding_Batch") in_ind += 1 if mode == "cppsim": @@ -400,7 +547,23 @@ class Thresholding_Batch(HLSCustomOp): ) super().reset_rtlsim(sim) super().toggle_clk(sim) - output = self.rtlsim(sim, inp) + if self.get_nodeattr("mem_mode") == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + elif self.get_nodeattr("mem_mode") == "const": + output = self.rtlsim(sim, inp) + else: + raise Exception("Unrecognized mem_mode") odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -425,7 +588,8 @@ class Thresholding_Batch(HLSCustomOp): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = 
['#include "activations.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + if self.get_nodeattr("mem_mode") == "const": + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] # TODO check and add whatever missing def defines(self, var): @@ -436,6 +600,21 @@ class Thresholding_Batch(HLSCustomOp): self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps, ) ] + if self.get_nodeattr("mem_mode") == "decoupled": + self.code_gen_dict["$DEFINES$"].append( + "#define ActVal1 %d" % self.get_nodeattr("ActVal") + ) + self.code_gen_dict["$DEFINES$"].append( + "#define ThresType1 %s" + % self.get_weight_datatype().get_hls_datatype_str() + ) + self.code_gen_dict["$DEFINES$"].append( + "#define NumSteps1 %d" % self.get_nodeattr("numSteps") + ) + # TODO remove once Thresholding_Stream_Batch is in hlslib: + self.code_gen_dict["$DEFINES$"].append( + templates.decoupled_thresholding_template + ) def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -452,6 +631,20 @@ class Thresholding_Batch(HLSCustomOp): 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) ) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + tdt = self.get_weight_datatype() + elem_bits = tdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = tdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/thresholds.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] @@ -461,6 +654,13 @@ class Thresholding_Batch(HLSCustomOp): self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) ) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> weights ("weights");'.format( + self.get_weightstream_width() + ) + ) def docompute(self): tmpl_args = self.get_template_param_values() @@ -474,12 +674,26 @@ class Thresholding_Batch(HLSCustomOp): imgdim = ishape[1] else: raise Exception("""Unexpeted input shape""") - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, NumChannels1, PE1, {}, {}> - (in0, out, threshs, numReps);""".format( - node.op_type, imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"], - ) - ] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<{}, NumChannels1, PE1, {}, {}> + (in0, out, threshs, numReps);""".format( + node.op_type, imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"], + ) + ] + elif mem_mode == "decoupled": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1> + (in0, out, weights, numReps);""".format( + "Thresholding_Stream_Batch", + imgdim, + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + ) + ] + else: + raise Exception("Unrecognized mem_mode") def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -513,15 +727,30 @@ class Thresholding_Batch(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream<ap_uint<{}>> &in0, - hls::stream<ap_uint<{}>> &out - )""".format( - self.onnx_node.name, - 
self.get_instream_width(), - self.get_outstream_width(), - ) - ] + if self.get_nodeattr("mem_mode") == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_outstream_width(), + ) + ] + elif self.get_nodeattr("mem_mode") == "decoupled": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &weights, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_weightstream_width(), + self.get_outstream_width(), + ) + ] + else: + raise Exception("Unrecognized mem_mode") def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] @@ -530,46 +759,173 @@ class Thresholding_Batch(HLSCustomOp): "#pragma HLS INTERFACE ap_ctrl_none port=return" ) - # the threshold tensor is acc_type [PE][TMEM][N_THRES] - # partition for parallel access along PE and N_THRES - # dimensions (dims 1 and 3) - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=1" + if self.get_nodeattr("mem_mode") == "const": + # the threshold tensor is acc_type [PE][TMEM][N_THRES] + # partition for parallel access along PE and N_THRES + # dimensions (dims 1 and 3) + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " + "complete dim=1" + ) ) - ) - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=3" + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " + "complete dim=3" + ) ) - ) - # set resource type - ram_style = self.get_nodeattr("ram_style") - pe = self.get_nodeattr("PE") - ich = self.get_nodeattr("NumChannels") - # if PE less than NumChannels, assign cores according to ram_style; - # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs - if pe < ich: - if ram_style == "distributed": - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_LUTRAM" + # set resource type + ram_style = self.get_nodeattr("ram_style") + pe = self.get_nodeattr("PE") + ich = self.get_nodeattr("NumChannels") + # if PE less than NumChannels, assign cores according to ram_style; + # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs + if pe < ich: + if ram_style == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS RESOURCE variable=threshs.m_thresholds " + "core=ROM_2P_LUTRAM" + ) ) - ) - elif ram_style == "block": - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_BRAM" + elif ram_style == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS RESOURCE variable=threshs.m_thresholds " + "core=ROM_2P_BRAM" + ) ) - ) - else: - raise Exception( - """Invalid value for attribute ram_style! Is currently set to: {} - has to be set to one of ("block", "distributed")""".format( - ram_style + else: + raise Exception( + """Invalid value for attribute ram_style! 
Is currently set to: {} + has to be set to one of ("block", "distributed")""".format( + ram_style + ) ) + elif self.get_nodeattr("mem_mode") == "decoupled": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights" + ) + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + node_name = self.onnx_node.name + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.NSTREAMS {1} " + "CONFIG.MEM_DEPTH {%d} " + "CONFIG.MEM_WIDTH {%d} " + "CONFIG.MEM_INIT {%s} " + "CONFIG.RAM_STYLE {%s} " + "CONFIG.STRM0_DEPTH {%d} " + "CONFIG.STRM0_WIDTH {%d} " + "CONFIG.STRM0_OFFSET {0} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_tmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("ram_style"), + self.calc_tmem(), + self.get_weightstream_width_padded(), + node_name, + strm_inst, ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_V_V]" + % (node_name, strm_inst, node_name, node_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) + ) + cmd.append( + 
"connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "const": + # base class impl sufficient for const mode + return super().code_generation_ipi() + else: + raise Exception("Unrecognized mem_mode for Thresholding_Batch") + return cmd + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index d6f2e04f762a6b67ace989fa802829fd3e5a6fb5..f27ebc645dbee20ff97b64aa942e375250f60cbd 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -780,6 +780,10 @@ class InferVVAU(Transformation): class InferThresholdingLayer(Transformation): """Convert any MultiThreshold into a standalone thresholding HLS layer.""" + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + def apply(self, model): graph = model.graph node_ind = 0 @@ -791,6 +795,7 @@ class InferThresholdingLayer(Transformation): thl_threshold = node.input[1] thl_output = node.output[0] thl_in_shape = model.get_tensor_shape(thl_input) + thl_thres_shape = model.get_tensor_shape(thl_threshold) idt = model.get_tensor_datatype(thl_input) # skip conversion for layers with float input @@ -841,10 +846,13 @@ class InferThresholdingLayer(Transformation): backend="fpgadataflow", NumChannels=ifc, PE=pe, + numSteps=thl_thres_shape[1], inputDataType=idt.name, + weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth outputDataType=odt.name, numInputVectors=list(thl_in_shape[:-1]), ActVal=actval, + mem_mode=self.mem_mode, ) graph.node.insert(insert_point, new_node) # remove old node @@ -852,6 +860,7 @@ class InferThresholdingLayer(Transformation): graph_modified = True if graph_modified: + model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 75fa625ff00ad6d367e2d6c94d98705f391fb9be..8461efd15576fc04906b7f48b2629ad83835de38 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -46,9 +46,16 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.util.basic import gen_finn_dt_tensor from finn.custom_op.registry import getCustomOp from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +import os +from finn.util.pyverilator import axilite_read, axilite_write +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.core.rtlsim_exec import rtlsim_exec +test_fpga_part = "xc7z020clg400-1" +target_clk_ns = 5 -def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval): + +def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, 
mem_mode): NumChannels = T.shape[0] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, NumChannels]) @@ -64,9 +71,12 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval): backend="fpgadataflow", NumChannels=NumChannels, PE=pe, + numSteps=T.shape[1], inputDataType=idt.name, + weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth outputDataType=odt.name, ActVal=actval, + mem_mode=mem_mode, ) graph = helper.make_graph( nodes=[Thresholding_node], @@ -96,9 +106,11 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval): @pytest.mark.parametrize("ich", [16]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# memory mode +@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) @pytest.mark.vivado @pytest.mark.slow -def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode): +def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): if nf == -1: nf = ich pe = ich // nf @@ -118,7 +130,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode): else: actval = odt.min() - model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval) + model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -127,7 +139,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode): elif exec_mode == "rtlsim": model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) else: @@ -164,3 +176,102 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode): exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) assert exp_cycles != 0 + + +@pytest.mark.vivado +def test_runtime_thresholds_single_layer(): + mem_mode = "decoupled" + act = DataType.INT4 + idt = DataType.INT16 + nf = 8 + ich = 16 + pe = ich // nf + assert ich % pe == 0 + + # generate input data + in_tensor = gen_finn_dt_tensor(idt, (1, ich)) + + odt = act + n_steps = act.get_num_possible_values() - 1 + T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T = np.sort(T, axis=1) + + if odt == DataType.BIPOLAR: + actval = 0 + else: + actval = odt.min() + + model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode) + op_inst = getCustomOp(model.graph.node[0]) + op_inst.set_nodeattr("runtime_writeable_weights", 1) + op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat") + with open("old_weights.dat", "r") as f: + old_weight_stream = f.read().strip() + os.remove("old_weights.dat") + old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) + old_weight_stream = list(old_weight_stream) + # need to create stitched IP for runtime weight testing + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model = model.transform(PrepareRTLSim()) + model.set_metadata_prop("exec_mode", "rtlsim") + # add two copies of the input tensor as the first one is just used to + # "flush out" the pipeline (as mvau already starts receiving old weights while + # we read/write new 
ones and reads seem to cause a disturbance too)
+    in_tensor = np.tile(in_tensor, (2, 1))
+    exec_ctx = {"inp": in_tensor}
+    extracted_weight_stream = []
+
+    def read_weights(sim):
+        addr = 0
+        for i in range(len(old_weight_stream)):
+            extracted_weight_stream.append(
+                axilite_read(sim, addr, basename="s_axilite_0_")
+            )
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
+    assert extracted_weight_stream == old_weight_stream
+    # only use second batch element in output; first will be invalid due to
+    # old weights (see above)
+    y = exec_ctx["outp"][1]
+    expected = multithreshold(in_tensor, T)[1]
+    if act == DataType.BIPOLAR:
+        # binary to bipolar
+        expected = 2 * expected - 1
+    else:
+        # signed offset
+        expected += act.min()
+    assert (y == expected).all()
+
+    new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(
+        np.float32
+    )
+    # provide non-decreasing thresholds
+    new_weights = np.sort(new_weights, axis=1)
+    op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat")
+    with open("new_weights.dat", "r") as f:
+        new_weight_stream = f.read().strip()
+    os.remove("new_weights.dat")
+    new_weight_stream = map(lambda x: int(x, 16), new_weight_stream.split("\n"))
+    new_weight_stream = list(new_weight_stream)
+
+    def write_weights(sim):
+        addr = 0
+        for nw in new_weight_stream:
+            axilite_write(sim, addr, nw, basename="s_axilite_0_")
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
+    y = exec_ctx["outp"][1]
+    expected = multithreshold(in_tensor, new_weights)[1]
+    if act == DataType.BIPOLAR:
+        # binary to bipolar
+        expected = 2 * expected - 1
+    else:
+        # signed offset
+        expected += act.min()
+    assert (y == expected).all()
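
The decoupled weight stream widths used above follow directly from the layer attributes. Below is a minimal sketch of that arithmetic, assuming the PE=2, INT16-threshold, 15-step configuration exercised by test_runtime_thresholds_single_layer; roundup_to_integer_multiple is replaced by inline arithmetic so the sketch runs without FINN installed.

```python
# Sketch of get_weightstream_width / get_weightstream_width_padded and the
# words_per_memwidth computation in the decoupled_runtime branch of
# make_weight_file. The concrete values (pe=2, 16-bit thresholds, 15 steps)
# come from the runtime-weights test; any other values are illustrative.
from math import ceil, log2

pe = 2               # channels thresholded per cycle
wp = 16              # threshold bit width (weightDataType = INT16)
n_thres_steps = 15   # numSteps for an INT4 output (2**4 - 1)

weight_width = pe * wp * n_thres_steps        # 480 bits per memory line
# AXI Stream requires a byte multiple (mirrors roundup_to_integer_multiple(w, 8))
axi_padded = -(-weight_width // 8) * 8        # -> 480

# decoupled_runtime maps each memory line to a power-of-two number of
# 32-bit AXI-lite words
words_per_memwidth = max(1, 2 ** ceil(log2(weight_width / 32)))  # -> 16 words
runtime_padded = words_per_memwidth * 32      # -> 512 bits per .dat line

print(weight_width, axi_padded, words_per_memwidth, runtime_padded)
```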
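
The decoupled_runtime branch splits each padded memory line into 32-bit words, least-significant word first, and the test then reads/writes those words at consecutive 32-bit-aligned AXI-lite addresses. A small sketch of that mapping with an illustrative 512-bit line (the hex value itself is made up for demonstration):

```python
# One hypothetical 512-bit memory line as a 128-digit hex string, as
# pack_innermost_dim_as_hex_string would produce it in make_weight_file.
import textwrap

line = "0" * 112 + "deadbeef" + "cafebabe"   # illustrative value only
words_32b = textwrap.wrap(line, 8)           # split into 8-hex-digit (32-bit) words
words_32b.reverse()                          # least-significant word first

# The .dat file is consumed word by word at byte addresses 0, 4, 8, ...
# which is what read_weights()/write_weights() in the test mirror with
# axilite_read/axilite_write and addr += 4.
for addr, word in enumerate(int(w, 16) for w in words_32b):
    print(hex(addr * 4), hex(word))
```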
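
For context, a possible end-to-end usage sketch of the new options: InferThresholdingLayer with mem_mode="decoupled" plus the runtime_writeable_weights attribute. Here `model` is assumed to be a FINN ModelWrapper that already contains MultiThreshold nodes with annotated integer input datatypes; the calls themselves are the ones introduced in this diff.

```python
from finn.transformation.fpgadataflow.convert_to_hls_layers import (
    InferThresholdingLayer,
)
from finn.custom_op.registry import getCustomOp

# convert MultiThreshold nodes to standalone decoupled-mode HLS layers
model = model.transform(InferThresholdingLayer(mem_mode="decoupled"))

for node in model.get_nodes_by_op_type("Thresholding_Batch"):
    inst = getCustomOp(node)
    # expose the memstream AXI-lite interface for runtime threshold updates;
    # remember to flush the accelerator with a dummy input after each
    # read/write (see the runtime_writeable_weights attribute docs above)
    inst.set_nodeattr("runtime_writeable_weights", 1)
```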