diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
index 190bb857ad6e5448d49a6b742adc888f2bca79d2..e78f07b9f1097ee6e1042846a91c2a0ff80d12d0 100644
--- a/src/finn/core/remote_exec.py
+++ b/src/finn/core/remote_exec.py
@@ -43,6 +43,8 @@ def remote_exec(model, execution_context):
     pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
     deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
     inp = execution_context[model.graph.input[0].name]
+    # make a copy of the array before saving it
+    inp = inp.copy()
     np.save(os.path.join(deployment_dir, "input.npy"), inp)
     # extracting last folder of absolute path (deployment_dir)
     deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 7f13b43d57d9fe2f6de5e5ed9bb52214611f1098..ef784b8ac29ca9e937fcd4ea22a8dfd6e1a7a470 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -207,9 +207,11 @@ Found no codegen dir for this node, did you run the codegen_npysim transformation?
         # assuming dynamic inputs start from 0
         for in_ind in range(count):
             current_input_name = node.input[in_ind]
+            # make a copy before saving the array
+            input_array = context[current_input_name].copy()
             np.save(
                 os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
-                context[current_input_name],
+                input_array,
             )
 
     def npy_to_dynamic_output(self, context):
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 55daff5f72feddeb467d194ef50e4efe4d509110..463896b4331cf68337b0070e257f27fab36b0031 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -123,6 +123,8 @@ class ConvolutionInputGenerator(HLSCustomOp):
         ), """Input shape doesn't match
         expected shape (1, ifm_ch, ifm_dim, ifm_dim)."""
         reshaped_inp = inp.transpose(0, 2, 3, 1)
+        # make a copy before saving the array
+        reshaped_inp = reshaped_inp.copy()
         np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_inp)
         # execute the precompiled model
         super().exec_precompiled_singlenode_model()
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 39386b00b729d2d06678dd48b1566f0d39aea5ff..84cffade62578914a70be93a697052abb94c9bee 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -80,6 +80,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # [4] is four vectors (like a FC layer with batch=4)
             # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
             "numInputVectors": ("ints", False, [1]),
+            # memory mode for the FC weights
+            # const -- embedded weights, default, long compile/synth times
+            # decoupled -- streaming weights
+            "mem_mode": ("s", False, "const"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -235,6 +239,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
+    def get_weightstream_width(self):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wp = self.get_weight_datatype().bitwidth()
+        return pe * simd * wp
+
     def get_folded_input_shape(self):
         mw = self.get_nodeattr("MW")
         simd = self.get_nodeattr("SIMD")
@@ -300,8 +310,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         elif (not inp_is_bipolar) and (not wt_is_bipolar):
             ret["TSrcI"] = "Slice<%s>" % inp_hls_str
             ret["TWeightI"] = "Identity"
+
         # fill in TDstI
         ret["TDstI"] = "Slice<%s>" % out_hls_str
+
         return ret
 
     def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
@@ -395,43 +407,66 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
     def generate_params(self, model, path):
-        """Saves weights into params.h and if existing thresholds into thresh.h."""
-        code_gen_dir = path
+        mem_mode = self.get_nodeattr("mem_mode")
         # weights
         weights = model.get_initializer(self.onnx_node.input[1])
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
         export_wdt = self.get_weight_datatype()
-        # we have converted bipolar weights to binary for export,
-        # so use it as such for weight generation
-        if self.get_weight_datatype() == DataType.BIPOLAR:
-            export_wdt = DataType.BINARY
-        weight_hls_code = numpy_to_hls_code(
-            weight_tensor, export_wdt, "weights", True, True
-        )
-        # write weights into params.h
-        # code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
-        f_weights = open("{}/params.h".format(code_gen_dir), "w")
-
-        if export_wdt.bitwidth() != 1:
-            f_weights.write(
-                "static FixedPointWeights<{},{},{},{}> weights = ".format(
-                    self.get_nodeattr("SIMD"),
-                    export_wdt.get_hls_datatype_str(),
-                    self.get_nodeattr("PE"),
-                    self.calc_wmem(),
-                )
+        code_gen_dir = path
+
+        if mem_mode == "const":
+            # save weights into params.h
+            # we have converted bipolar weights to binary for export,
+            # so use it as such for weight generation
+            if self.get_weight_datatype() == DataType.BIPOLAR:
+                export_wdt = DataType.BINARY
+            weight_hls_code = numpy_to_hls_code(
+                weight_tensor, export_wdt, "weights", True, True
             )
-        else:
-            f_weights.write(
-                "static BinaryWeights<{},{},{}> weights = ".format(
-                    self.get_nodeattr("SIMD"), self.get_nodeattr("PE"), self.calc_wmem()
+            # write weights into params.h
+            f_weights = open("{}/params.h".format(code_gen_dir), "w")
+
+            if export_wdt.bitwidth() != 1:
+                f_weights.write(
+                    "static FixedPointWeights<{},{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
+                        export_wdt.get_hls_datatype_str(),
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
                 )
+            else:
+                f_weights.write(
+                    "static BinaryWeights<{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
+                )
+            f_weights.write(weight_hls_code)
+            f_weights.close()
+
+        elif mem_mode == "decoupled":
+            # save weights into a .npy file
+            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
+            weight_tensor = np.transpose(weight_tensor, (0, 2, 1, 3))
+            # flip the PE and SIMD dimensions
+            weight_tensor = np.flip(weight_tensor, axis=-2)
+            weight_tensor = np.flip(weight_tensor, axis=-1)
+            # reshape weight tensor to desired shape
+            pe = self.get_nodeattr("PE")
+            simd = self.get_nodeattr("SIMD")
+            weight_tensor = weight_tensor.reshape(1, -1, pe * simd)
+            weight_tensor = weight_tensor.copy()
+            np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor)
+        else:
+            raise Exception(
+                """Please set mem_mode to "const" or "decoupled", currently no other
+                parameter value is supported!"""
             )
-        f_weights.write(weight_hls_code)
-        f_weights.close()
-        # thresholds
+        # save thresholds in thresh.h
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
             if thresholds is not None:
@@ -452,7 +487,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     threshold_tensor, tdt, "thresholds", False, True
                 )
                 # write thresholds into thresh.h
-                # code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
                 f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
                 tdt_hls = tdt.get_hls_datatype_str()
                 # use binary to export bipolar activations
@@ -462,7 +496,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 odt_hls = export_odt.get_hls_datatype_str()
                 f_thresh.write(
                     "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
-                = ".format(
+                    = ".format(
                         self.calc_tmem(),
                         self.get_nodeattr("PE"),
                         threshold_tensor.shape[-1],
@@ -511,6 +545,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             export_idt = DataType.BINARY
         else:
             export_idt = self.get_input_datatype()
+        # make a copy before saving the array
+        reshaped_input = reshaped_input.copy()
         np.save(
             os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
             reshaped_input,
@@ -589,12 +625,23 @@ class StreamingFCLayer_Batch(HLSCustomOp):
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
         self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
-        self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
+
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
+        elif mem_mode == "decoupled":
+            self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
+        else:
+            raise Exception(
+                """Please set mem_mode to "const" or "decoupled", currently no other
+                parameter value is supported!"""
+            )
         if self.calc_tmem() != 0:
             # TODO find a better way of checking for no pregenerated thresholds
             self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
 
     def defines(self, var):
+        mem_mode = self.get_nodeattr("mem_mode")
         numReps = 1
         self.code_gen_dict["$DEFINES$"] = [
             """#define MW1 {}\n #define MH1 {}\n #define SIMD1 {}\n
@@ -613,6 +660,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         self.code_gen_dict["$DEFINES$"].append("#define PRAGMA_SUB(x) _Pragma (#x)")
         self.code_gen_dict["$DEFINES$"].append("#define DO_PRAGMA(x) PRAGMA_SUB(x)")
 
+        if mem_mode == "decoupled":
+            wdt = self.get_weight_datatype()
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define WP1 {}\n".format(wdt.bitwidth())
+            )
+
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
         dtype = self.get_input_datatype()
@@ -632,7 +685,23 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
         )
 
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            wdt = self.get_weight_datatype()
+            elem_bits = wdt.bitwidth()
+            packed_bits = self.get_weightstream_width()
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            elem_hls_type = wdt.get_hls_datatype_str()
+            npy_type = "float"
+            npy_in = "%s/weights.npy" % code_gen_dir
+
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false);'
+                % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            )
+
     def strm_decl(self):
+        mem_mode = self.get_nodeattr("mem_mode")
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
@@ -641,25 +710,58 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
 
+        if mem_mode == "decoupled":
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> weights ("weights");'.format(
+                    self.get_weightstream_width()
+                )
+            )
+
     def docompute(self):
-        node = self.onnx_node
+        mem_mode = self.get_nodeattr("mem_mode")
         tmpl_args = self.get_template_param_values()
         if self.calc_tmem() == 0:
             odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
             threshs = "PassThroughActivation<%s>()" % odtype_hls_str
         else:
             threshs = "threshs"
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<MW1, MH1, SIMD1, PE1, {}, {}, {}>
-            (in0, out, weights, {}, numReps, {});""".format(
-                node.op_type,
-                tmpl_args["TSrcI"],
-                tmpl_args["TDstI"],
-                tmpl_args["TWeightI"],
-                threshs,
-                self.get_nodeattr("resType"),
+        if mem_mode == "const":
+            node = self.onnx_node
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<MW1, MH1, SIMD1, PE1, {}, {}, {}>
+                (in0, out, weights, {}, numReps, {});""".format(
+                    node.op_type,
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    threshs,
+                    self.get_nodeattr("resType"),
+                )
+            ]
+        elif mem_mode == "decoupled":
+            wdt = self.get_weight_datatype()
+            if wdt == DataType.BIPOLAR:
+                export_wdt = DataType.BINARY
+            else:
+                export_wdt = wdt
+            wdtype_hls_str = export_wdt.get_hls_datatype_str()
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Matrix_Vector_Activate_Stream_Batch<MW1, MH1, SIMD1, PE1, {}, {}, {}, {} >
+                (in0, out, weights, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    wdtype_hls_str,
+                    threshs,
+                    self.get_nodeattr("resType"),
+                )
+            ]
+
+        else:
+            raise Exception(
+                """Please set mem_mode to "const" or "decoupled", currently no other
+                parameter value is supported!"""
             )
-        ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
@@ -693,54 +795,71 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0,
-            hls::stream<ap_uint<{}>> &out
-            )""".format(
-                self.onnx_node.name,
-                self.get_instream_width(),
-                self.get_outstream_width(),
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0,
+                hls::stream<ap_uint<{}>> &out
+                )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", currently no other
+                parameter value is supported!"""
             )
-        ]
 
     def pragmas(self):
-        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
-        in_fifo_depth = self.get_nodeattr("inFIFODepth")
-        out_fifo_depth = self.get_nodeattr("outFIFODepth")
-        # insert depth pragmas only if specified
-        if in_fifo_depth != 0:
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
             self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
+                "#pragma HLS INTERFACE axis port=out"
             )
-        if out_fifo_depth != 0:
+            in_fifo_depth = self.get_nodeattr("inFIFODepth")
+            out_fifo_depth = self.get_nodeattr("outFIFODepth")
+            # insert depth pragmas only if specified
+            if in_fifo_depth != 0:
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
+                )
+            if out_fifo_depth != 0:
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
+                )
             self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
-            )
-
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE ap_ctrl_none port=return"
-        )
-        # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
-        # partition for parallel access along the PE dimension (dim 1)
-        self.code_gen_dict["$PRAGMAS$"].append(
-            (
-                "DO_PRAGMA(HLS ARRAY_PARTITION "
-                "variable=weights.m_weights complete dim=1)"
+                "#pragma HLS INTERFACE ap_ctrl_none port=return"
             )
-        )
-        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
-        # partition for parallel access along PE and N_THRES dimensions (dims 1 and 3)
-        if self.calc_tmem() != 0:
-            # TODO find a better way of checking for no pregenerated thresholds
+            # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
+            # partition for parallel access along the PE dimension (dim 1)
             self.code_gen_dict["$PRAGMAS$"].append(
                 (
-                    "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                    "complete dim=1)"
+                    "DO_PRAGMA(HLS ARRAY_PARTITION "
+                    "variable=weights.m_weights complete dim=1)"
                 )
             )
-        self.code_gen_dict["$PRAGMAS$"].append(
-            (
-                "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                "complete dim=3)"
+            # the threshold tensor is acc_type [PE][TMEM][N_THRES]
+            # partition for parallel access along PE and N_THRES
+            # dimensions (dims 1 and 3)
+            if self.calc_tmem() != 0:
+                # TODO find a better way of checking for no pregenerated thresholds
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                        "complete dim=1)"
+                    )
                 )
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                        "complete dim=3)"
+                    )
+                )
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", currently no other
+                parameter value is supported!"""
             )
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index ae98d312e7c5923a572f918430aececf29f3e094..1c08ac4fc7a9eedcc45deab824d7ec036941f808 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -297,6 +297,8 @@ def rtlsim_output_to_npy(
     out_array = unpack_innermost_dim_from_hex_string(
         output, dtype, shape, packedBits=packedBits, reverse_inner=reverse_inner
     )
+    # make a copy before saving the array
+    out_array = out_array.copy()
     np.save(path, out_array)
     return out_array
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 93bf0750776331af236154d8f2a005913cbb1c33..4201f64c963cb506305fd9b9a9fa32f66ae74226 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -31,6 +31,7 @@ import pytest
 import numpy as np
 from onnx import TensorProto, helper
 
+from finn.custom_op.registry import getCustomOp
 import finn.core.onnx_exec as oxe
 import finn.custom_op.xnorpopcount as xp
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
@@ -128,6 +129,8 @@ def prepare_inputs(input_tensor, idt, wdt):
     return {"inp": input_tensor}
 
 
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT2])
 # weight datatype
@@ -135,14 +138,14 @@ def prepare_inputs(input_tensor, idt, wdt):
 # input datatype
 @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
 # neuron folding, -1 is maximum possible
-@pytest.mark.parametrize("nf", [-1, 1])
+@pytest.mark.parametrize("nf", [-1, 2, 1])
 # synapse folding, -1 is maximum possible
-@pytest.mark.parametrize("sf", [-1, 1])
+@pytest.mark.parametrize("sf", [-1, 2, 1])
 # HLS matrix width (input features)
 @pytest.mark.parametrize("mw", [4])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [4])
-def test_fpgadataflow_fclayer_npysim(idt, wdt, act, nf, sf, mw, mh):
+def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     if nf == -1:
         nf = mh
     if sf == -1:
@@ -179,6 +182,10 @@ def test_fpgadataflow_fclayer_npysim(idt, wdt, act, nf, sf, mw, mh):
     else:
         tdt = DataType.INT32
     model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # look up op_type in the registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
     model = model.transform(SetExecMode("npysim"))
     model = model.transform(CodeGen_npysim())
     model = model.transform(Compile())
@@ -201,7 +208,9 @@ def test_fpgadataflow_fclayer_npysim(idt, wdt, act, nf, sf, mw, mh):
     y_expected = y.reshape(oshape)
     # execute model
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
-    assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "npysim failed"
+
+    y_produced = y_produced.reshape(y_expected.shape)
+    assert (y_produced == y_expected).all(), "npysim failed"
 
 
 # activation: None or DataType
diff --git a/tests/util/test_data_packing.py b/tests/util/test_data_packing.py
index 495ec60966ef67f3bf7b99c63cc70e133859d087..28f1d56d0dbc5451ccad3d36b4b1d4c6bed4f63e 100644
--- a/tests/util/test_data_packing.py
+++ b/tests/util/test_data_packing.py
@@ -104,6 +104,8 @@ g++ -o test_npy2apintstream test.cpp /workspace/cnpy/cnpy.cpp \
     compile = subprocess.Popen(
         ["sh", "compile.sh"], stdout=subprocess.PIPE, cwd=test_dir
     )
     (stdout, stderr) = compile.communicate()
+    # make a copy before saving the array
+    ndarray = ndarray.copy()
    np.save(npy_in, ndarray)
    execute = subprocess.Popen(
        "./test_npy2apintstream", stdout=subprocess.PIPE, cwd=test_dir
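
For reviewers tracing the new "decoupled" branch of generate_params(): the weight layout transformation can be replayed on a toy tensor with plain NumPy. The shape values below are made up for illustration; only the transpose/flip/reshape calls mirror the patch.

    import numpy as np

    pe, simd, wmem = 2, 2, 4
    # toy tensor in the (1, PE, WMEM, SIMD) layout that
    # get_hls_compatible_weight_tensor() produces
    w = np.arange(pe * wmem * simd).reshape(1, pe, wmem, simd)

    # same steps as the "decoupled" branch of generate_params():
    w = np.transpose(w, (0, 2, 1, 3))  # -> (1, WMEM, PE, SIMD)
    w = np.flip(w, axis=-2)            # reverse the PE order
    w = np.flip(w, axis=-1)            # reverse the SIMD order
    w = w.reshape(1, -1, pe * simd)    # -> (1, WMEM, PE*SIMD)
    print(w.shape)                     # (1, 4, 4)

Each row of the innermost dimension is one weight-stream word, which is why the new get_weightstream_width() returns pe * simd * wp bits.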
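On the recurring .copy()-before-np.save pattern: a plausible motivation (an assumption on my part, not stated in the patch) is memory layout. np.save records fortran_order=True for F-contiguous views such as transpose results, and a C++ reader that assumes C order, e.g. a minimal cnpy-based loader like the one behind npy2apintstream, would then see a scrambled layout; ndarray.copy() defaults to C order, so the saved file is always C-ordered. The NumPy behavior itself can be checked directly:

    import numpy as np

    a = np.arange(12, dtype=np.float32).reshape(3, 4)
    view = a.transpose(1, 0)  # F-contiguous view, no data movement

    hdr = np.lib.format.header_data_from_array_1_0
    print(hdr(view)["fortran_order"])         # True  -> would be saved in Fortran order
    print(hdr(view.copy())["fortran_order"])  # False -> the copy is C-ordered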