From 1dc6a212b19a8d0a263c660f3ad49ed9d93e2142 Mon Sep 17 00:00:00 2001 From: auphelia <jakobapk@web.de> Date: Fri, 22 Nov 2019 11:24:25 +0000 Subject: [PATCH] [Code gen & test] Finished running version of fclayer code generation and execution --- .../fpgadataflow/streamingfclayer_batch.py | 163 +++++++------- tests/TESTexecute_StreamingFCLayer_Batch.cpp | 198 ------------------ tests/test_layer_streaming_fclayer_batch.py | 53 ++--- 3 files changed, 108 insertions(+), 306 deletions(-) delete mode 100644 tests/TESTexecute_StreamingFCLayer_Batch.cpp diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 4fbc2cde9..481e5f15d 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -1,17 +1,20 @@ -import sys import os -import numpy as np import subprocess +import numpy as np + import finn.core.utils as utils -from finn.custom_op.fpgadataflow import HLSCustomOp -from finn.core.datatype import DataType from finn.backend.fpgadataflow.utils import numpy_to_hls_code - - +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp class StreamingFCLayer_Batch(HLSCustomOp): + def __init__(self): + super().__init__() + self.WMEM = 0 + self.TMEM = 0 + def make_shape_compatible_op(self, node): pass @@ -28,34 +31,44 @@ class StreamingFCLayer_Batch(HLSCustomOp): temp_files.append("input_{}.npy".format(in_ind)) elif in_ind == 1: weights = context[inputs] - WMEM = weights.shape[2] - weights = np.transpose(weights, (1,2,0)) - weights = numpy_to_hls_code(weights, DataType.BINARY, "weights", True) - - f_weights = open("params.h","w") - f_weights.write("static BinaryWeights<{},{},{}> weights = {{{{\n".format(self.SIMD, self.PE, WMEM)) - for i in range(weights.shape[0]): - f_weights.write("{") - for j in range(weights.shape[1]): - f_weights.write(weights[i][j]) - if j < weights.shape[1]-1: - f_weights.write(", ") - if i < weights.shape[0]-1: - f_weights.write("}, ") - else: - f_weights.write("}") - f_weights.write("}}") + self.WMEM = weights.shape[2] + weights = np.transpose(weights, (1, 2, 0)) + weights = np.expand_dims(weights, 0) + weights = numpy_to_hls_code( + weights, DataType.BINARY, "weights", True, True + ) + + f_weights = open("params.h", "w") + f_weights.write( + "static BinaryWeights<{},{},{}> weights = ".format( + self.SIMD, self.PE, self.WMEM + ) + ) + f_weights.write(weights) f_weights.close() - + temp_files.append("params.h") + else: thresholds = context[inputs] - TMEM = thresholds.shape[0] - - #print(thresholds.shape) + self.TMEM = thresholds.shape[0] + thresholds = np.transpose(thresholds, (1, 0, 2)) + thresholds = np.expand_dims(thresholds, 0) + thresholds = numpy_to_hls_code( + thresholds, DataType.BINARY, "thresholds", True, True + ) + f_thresh = open("thresh.h", "w") + f_thresh.write( + """static ThresholdsActivation<{},{},1,ap_uint<16>, + ap_uint<1>> threshs = """.format( + self.TMEM, self.PE + ) + ) + f_thresh.write(thresholds) + f_thresh.close() + temp_files.append("thresh.h") in_ind += 1 - sys.exit(0) self.code_generation(node) temp_files.append("execute_{}.cpp".format(node.op_type)) bash_compile = """g++ -o execute_{} execute_{}.cpp @@ -77,86 +90,74 @@ class StreamingFCLayer_Batch(HLSCustomOp): for temp_file in temp_files: os.remove(temp_file) - def get_attributes(self, node): self.resType = utils.get_by_name(node.attribute, "resType").s.decode("utf-8") self.MW = utils.get_by_name(node.attribute, "MW").i self.MH = utils.get_by_name(node.attribute, "MH").i self.SIMD = utils.get_by_name(node.attribute, "SIMD").i self.PE = utils.get_by_name(node.attribute, "PE").i - self.resDataType = utils.get_by_name(node.attribute, "resDataType").s.decode("utf-8") + self.resDataType = utils.get_by_name(node.attribute, "resDataType").s.decode( + "utf-8" + ) def global_includes(self, node): - self.code_gen_dict["$GLOBALS$"] = ['// no additional includes necessary'] + self.code_gen_dict["$GLOBALS$"] = [ + """#include "weights.hpp" \n#include "activations.hpp" \n + #include "params.h" \n#include "thresh.h" """ + ] def defines(self, node): numReps = 2 self.code_gen_dict["$DEFINES$"] = [ - """#define MW {}\n #define MH {}\n - #define SIMD {}\n #define PE {}\n #define numReps {}""".format( - self.MW, self.MH, self.SIMD, self.PE, numReps + """#define MW1 {}\n #define MH1 {}\n #define SIMD1 {}\n + #define PE1 {}\n #define WMEM1 {}\n #define TMEM1 {}\n + #define numReps {}""".format( + self.MW, self.MH, self.SIMD, self.PE, self.WMEM, self.TMEM, numReps ) ] def read_npy_data(self, node): self.code_gen_dict["$READNPYDATA$"] = [] - input_ind = 0 - input_file_names = [] - for inputs in node.input: - input_file_names.append("input_{}.npy".format(input_ind)) - input_ind += 1 + self.code_gen_dict["$READNPYDATA$"].append( + """cnpy::NpyArray arr0 = cnpy::npy_load("input_0.npy");\n + float* loaded_data0 = arr0.data<float>();""" + ) - input_ind = 0 - for input_file in input_file_names: + self.code_gen_dict["$READNPYDATA$"].append( + """int num_values0 = 1; \n + for(int i = 0; i < arr0.shape.size(); i++){{\n + num_values0 *= arr0.shape[i]; \n }}""" + ) + self.code_gen_dict["$READNPYDATA$"].append( + "ap_uint<{}> dat0;".format(self.SIMD) + ) + self.code_gen_dict["$READNPYDATA$"].append( + "for(int i=0; i < num_values0/{}; i++){{".format(self.SIMD) + ) + for line in range(self.SIMD): self.code_gen_dict["$READNPYDATA$"].append( - """cnpy::NpyArray arr{} = cnpy::npy_load("{}");\n - float* loaded_data{} = arr{}.data<float>();""".format( - input_ind, input_file, input_ind, input_ind + "dat0.range({},{}) = loaded_data0[i+((num_values0/{})*{})];".format( + line, line, self.SIMD, line ) ) - if input_ind == 0: - self.code_gen_dict["$READNPYDATA$"].append( - """int num_values{} = 1; \n - for(int i = 0; i < arr{}.shape.size(); i++){{\n - num_values{} *= arr{}.shape[i]; \n }}""".format( - input_ind, input_ind, input_ind, input_ind - ) - ) - self.code_gen_dict["$READNPYDATA$"].append( - "ap_uint<{}> dat{};".format(self.SIMD, input_ind) - ) - self.code_gen_dict["$READNPYDATA$"].append( - "for(int i=0; i < num_values{}/{}; i++){{".format(input_ind, self.SIMD) - ) - for line in range(self.SIMD): - self.code_gen_dict["$READNPYDATA$"].append( - "dat{}.range({},{}) = loaded_data{}[i+((num_values{}/{})*{})];".format( - input_ind, line, line, input_ind, input_ind, self.SIMD, line - ) - ) - self.code_gen_dict["$READNPYDATA$"].append("in{} << dat{};".format(input_ind, input_ind)) - self.code_gen_dict["$READNPYDATA$"].append("}") - input_ind += 1 - + self.code_gen_dict["$READNPYDATA$"].append("in0 << dat0;") + self.code_gen_dict["$READNPYDATA$"].append("}") def strm_decl(self, node): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - input_ind = 0 - for inputs in node.input: - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<ap_uint<{}>> in{} ("in{}");'.format( - self.SIMD, input_ind, input_ind - ) - ) - input_ind += 1 + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.SIMD) + ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<ap_uint<{}>> out ("out");'.format(self.PE) ) - def docompute(self, node): self.code_gen_dict["$DOCOMPUTE$"] = [ - "{}<MW, MH, SIMD, PE, {}>(in0, loaded_data1, loaded_data2, out, numReps, {});".format(node.op_type, self.resDataType, self.resType) + """{}<MW1, MH1, SIMD1, PE1, {}> + (in0, out, weights, threshs, numReps, {});""".format( + node.op_type, self.resDataType, self.resType + ) ] def dataoutstrm(self, node): @@ -189,14 +190,10 @@ class StreamingFCLayer_Batch(HLSCustomOp): ) self.code_gen_dict["$DATAOUTSTREAM$"].append("}") - def save_as_npy(self, node): - numReps = 2 self.code_gen_dict["$SAVEASCNPY$"] = [ """cnpy::npy_save("output.npy",&output_data_vector[0], {{1,{},{}}},"w");""".format( - self.PE, - self.PE, + int(self.MH / self.PE), int(self.PE), ) ] - diff --git a/tests/TESTexecute_StreamingFCLayer_Batch.cpp b/tests/TESTexecute_StreamingFCLayer_Batch.cpp deleted file mode 100644 index 2bf6705a2..000000000 --- a/tests/TESTexecute_StreamingFCLayer_Batch.cpp +++ /dev/null @@ -1,198 +0,0 @@ - -#include "cnpy.h" -#include <vector> -#include "bnn-library.h" - -// includes for network parameters -#include "weights.hpp" -#include "activations.hpp" -#include "interpret.hpp" -#include "mvau.hpp" -#include "utils.hpp" -#include "params.h" -// defines for network parameters -#define MW1 832 -#define MH1 1024 -#define SIMD1 64 -#define PE1 32 -#define WMEM1 416 -#define TMEM1 32 - -//static BinaryWeights<SIMD1, PE1, WMEM1> weights; -static ThresholdsActivation<TMEM1,PE1,1,ap_int<16>,ap_uint<1>> threshs; - -int main(){ - - hls::stream<ap_uint<64> > in0 ("in0"); - hls::stream<ap_uint<32> > out ("out"); - - cnpy::NpyArray arr0 = cnpy::npy_load("input_0.npy"); - float* loaded_data0 = arr0.data<float>(); - int num_values0 = 1; - - for(int i = 0; i < arr0.shape.size(); i++){ - - num_values0 *= arr0.shape[i]; - } - - ap_uint<64> dat0; - - for(int i=0; i < num_values0/64; i++){ - dat0.range(0,0) = loaded_data0[i+((num_values0/64)*0)]; - dat0.range(1,1) = loaded_data0[i+((num_values0/64)*1)]; - dat0.range(2,2) = loaded_data0[i+((num_values0/64)*2)]; - dat0.range(3,3) = loaded_data0[i+((num_values0/64)*3)]; - dat0.range(4,4) = loaded_data0[i+((num_values0/64)*4)]; - dat0.range(5,5) = loaded_data0[i+((num_values0/64)*5)]; - dat0.range(6,6) = loaded_data0[i+((num_values0/64)*6)]; - dat0.range(7,7) = loaded_data0[i+((num_values0/64)*7)]; - dat0.range(8,8) = loaded_data0[i+((num_values0/64)*8)]; - dat0.range(9,9) = loaded_data0[i+((num_values0/64)*9)]; - dat0.range(10,10) = loaded_data0[i+((num_values0/64)*10)]; - dat0.range(11,11) = loaded_data0[i+((num_values0/64)*11)]; - dat0.range(12,12) = loaded_data0[i+((num_values0/64)*12)]; - dat0.range(13,13) = loaded_data0[i+((num_values0/64)*13)]; - dat0.range(14,14) = loaded_data0[i+((num_values0/64)*14)]; - dat0.range(15,15) = loaded_data0[i+((num_values0/64)*15)]; - dat0.range(16,16) = loaded_data0[i+((num_values0/64)*16)]; - dat0.range(17,17) = loaded_data0[i+((num_values0/64)*17)]; - dat0.range(18,18) = loaded_data0[i+((num_values0/64)*18)]; - dat0.range(19,19) = loaded_data0[i+((num_values0/64)*19)]; - dat0.range(20,20) = loaded_data0[i+((num_values0/64)*20)]; - dat0.range(21,21) = loaded_data0[i+((num_values0/64)*21)]; - dat0.range(22,22) = loaded_data0[i+((num_values0/64)*22)]; - dat0.range(23,23) = loaded_data0[i+((num_values0/64)*23)]; - dat0.range(24,24) = loaded_data0[i+((num_values0/64)*24)]; - dat0.range(25,25) = loaded_data0[i+((num_values0/64)*25)]; - dat0.range(26,26) = loaded_data0[i+((num_values0/64)*26)]; - dat0.range(27,27) = loaded_data0[i+((num_values0/64)*27)]; - dat0.range(28,28) = loaded_data0[i+((num_values0/64)*28)]; - dat0.range(29,29) = loaded_data0[i+((num_values0/64)*29)]; - dat0.range(30,30) = loaded_data0[i+((num_values0/64)*30)]; - dat0.range(31,31) = loaded_data0[i+((num_values0/64)*31)]; - dat0.range(32,32) = loaded_data0[i+((num_values0/64)*32)]; - dat0.range(33,33) = loaded_data0[i+((num_values0/64)*33)]; -dat0.range(34,34) = loaded_data0[i+((num_values0/64)*34)]; -dat0.range(35,35) = loaded_data0[i+((num_values0/64)*35)]; -dat0.range(36,36) = loaded_data0[i+((num_values0/64)*36)]; -dat0.range(37,37) = loaded_data0[i+((num_values0/64)*37)]; -dat0.range(38,38) = loaded_data0[i+((num_values0/64)*38)]; -dat0.range(39,39) = loaded_data0[i+((num_values0/64)*39)]; -dat0.range(40,40) = loaded_data0[i+((num_values0/64)*40)]; -dat0.range(41,41) = loaded_data0[i+((num_values0/64)*41)]; -dat0.range(42,42) = loaded_data0[i+((num_values0/64)*42)]; -dat0.range(43,43) = loaded_data0[i+((num_values0/64)*43)]; -dat0.range(44,44) = loaded_data0[i+((num_values0/64)*44)]; -dat0.range(45,45) = loaded_data0[i+((num_values0/64)*45)]; -dat0.range(46,46) = loaded_data0[i+((num_values0/64)*46)]; -dat0.range(47,47) = loaded_data0[i+((num_values0/64)*47)]; -dat0.range(48,48) = loaded_data0[i+((num_values0/64)*48)]; -dat0.range(49,49) = loaded_data0[i+((num_values0/64)*49)]; -dat0.range(50,50) = loaded_data0[i+((num_values0/64)*50)]; -dat0.range(51,51) = loaded_data0[i+((num_values0/64)*51)]; -dat0.range(52,52) = loaded_data0[i+((num_values0/64)*52)]; -dat0.range(53,53) = loaded_data0[i+((num_values0/64)*53)]; -dat0.range(54,54) = loaded_data0[i+((num_values0/64)*54)]; -dat0.range(55,55) = loaded_data0[i+((num_values0/64)*55)]; -dat0.range(56,56) = loaded_data0[i+((num_values0/64)*56)]; -dat0.range(57,57) = loaded_data0[i+((num_values0/64)*57)]; -dat0.range(58,58) = loaded_data0[i+((num_values0/64)*58)]; -dat0.range(59,59) = loaded_data0[i+((num_values0/64)*59)]; -dat0.range(60,60) = loaded_data0[i+((num_values0/64)*60)]; -dat0.range(61,61) = loaded_data0[i+((num_values0/64)*61)]; -dat0.range(62,62) = loaded_data0[i+((num_values0/64)*62)]; -dat0.range(63,63) = loaded_data0[i+((num_values0/64)*63)]; -in0 << dat0; -} - - //cnpy::NpyArray arr1 = cnpy::npy_load("input_1.npy"); - //float* loaded_data1 = arr1.data<float>(); - - //cnpy::NpyArray arr2 = cnpy::npy_load("input_2.npy"); - //float* loaded_data2 = arr2.data<float>(); - - - - //for(int i=0; i < PE1; i++){ - // for(int k; k < WMEM1; k++){ - // ap_uint<64> dat1; - // for(int j; j < SIMD1; j++){ - // if(i == 0){ - // dat1.range(j,j) = loaded_data1[j+(k-1)*64]; - // } - // else{ - // dat1.range(j,j) = loaded_data1[j+i*(k-1)*64]; - // } -// -// } -// weights.m_weights[i][k] = dat1; -// } -// } - - for(int i=0; i < PE1; i++){ - for(int k; k < TMEM1; k++){ - ap_uint<64> dat2; - for(int j; j < 64; j++){ - if(i == 0){ - dat2.range(j,j) = loaded_data2[j+(k-1)*64]; - } - else{ - dat2.range(j,j) = loaded_data2[j+i*(k-1)*64]; - } - } - threshs.m_thresholds[i][k][0] = dat2; - } - } - int numReps = 2; - - StreamingFCLayer_Batch<MW1, MH1, SIMD1, PE1, Recast<XnorMul>>(in0, out, weights, threshs, numReps, ap_resource_lut()); - - ap_uint<32> out_data; - std::vector<ap_uint<32>> out_data_vector; -while(out.read_nb(out_data)){ -out_data_vector.push_back(out_data); -} -std::vector<float> output_data_vector; -for(std::vector<ap_uint<32>>::iterator it = out_data_vector.begin(); - it != out_data_vector.end(); ++it){ -ap_uint<32> output_data = *it; -output_data_vector.push_back(output_data.range(0,0)); -output_data_vector.push_back(output_data.range(1,1)); -output_data_vector.push_back(output_data.range(2,2)); -output_data_vector.push_back(output_data.range(3,3)); -output_data_vector.push_back(output_data.range(4,4)); -output_data_vector.push_back(output_data.range(5,5)); -output_data_vector.push_back(output_data.range(6,6)); -output_data_vector.push_back(output_data.range(7,7)); -output_data_vector.push_back(output_data.range(8,8)); -output_data_vector.push_back(output_data.range(9,9)); -output_data_vector.push_back(output_data.range(10,10)); -output_data_vector.push_back(output_data.range(11,11)); -output_data_vector.push_back(output_data.range(12,12)); -output_data_vector.push_back(output_data.range(13,13)); -output_data_vector.push_back(output_data.range(14,14)); -output_data_vector.push_back(output_data.range(15,15)); -output_data_vector.push_back(output_data.range(16,16)); -output_data_vector.push_back(output_data.range(17,17)); -output_data_vector.push_back(output_data.range(18,18)); -output_data_vector.push_back(output_data.range(19,19)); -output_data_vector.push_back(output_data.range(20,20)); -output_data_vector.push_back(output_data.range(21,21)); -output_data_vector.push_back(output_data.range(22,22)); -output_data_vector.push_back(output_data.range(23,23)); -output_data_vector.push_back(output_data.range(24,24)); -output_data_vector.push_back(output_data.range(25,25)); -output_data_vector.push_back(output_data.range(26,26)); -output_data_vector.push_back(output_data.range(27,27)); -output_data_vector.push_back(output_data.range(28,28)); -output_data_vector.push_back(output_data.range(29,29)); -output_data_vector.push_back(output_data.range(30,30)); -output_data_vector.push_back(output_data.range(31,31)); -} - - cnpy::npy_save("output.npy",&output_data_vector[0], - {1,32,32},"w"); - - } - - diff --git a/tests/test_layer_streaming_fclayer_batch.py b/tests/test_layer_streaming_fclayer_batch.py index 9de83d8ab..06fb9a8f1 100644 --- a/tests/test_layer_streaming_fclayer_batch.py +++ b/tests/test_layer_streaming_fclayer_batch.py @@ -1,6 +1,6 @@ -import onnx -from onnx import TensorProto, helper +# import onnx import numpy as np +from onnx import TensorProto, helper import finn.core.onnx_exec as oxe from finn.core.datatype import DataType @@ -8,27 +8,32 @@ from finn.core.modelwrapper import ModelWrapper def test_fclayer_batch(): - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 13, 64]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 32, 32]) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 2, 8]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 4, 4]) FCLayer_node = helper.make_node( "StreamingFCLayer_Batch", ["inp", "weights", "thresh"], ["outp"], - domain='finn', - backend='fpgadataflow', + domain="finn", + backend="fpgadataflow", resType="ap_resource_lut()", - MW=832, - MH=1024, - SIMD=64, - PE=32, + MW=16, + MH=16, + SIMD=8, + PE=4, resDataType="Recast<XnorMul>", ) graph = helper.make_graph( - nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp], value_info=[ - helper.make_tensor_value_info("weights", TensorProto.FLOAT, [64, 32, 416]), - helper.make_tensor_value_info("thresh", TensorProto.FLOAT, [32, 32, 1, 16, 1])] + nodes=[FCLayer_node], + name="fclayer_graph", + inputs=[inp], + outputs=[outp], + value_info=[ + helper.make_tensor_value_info("weights", TensorProto.FLOAT, [8, 4, 16]), + helper.make_tensor_value_info("thresh", TensorProto.FLOAT, [16, 4, 3]), + ], ) model = helper.make_model(graph, producer_name="fclayer-model") @@ -40,24 +45,22 @@ def test_fclayer_batch(): for tensor in graph.output: model.set_tensor_datatype(tensor.name, DataType["BIPOLAR"]) - onnx.save(model.model, "fclayer-model.onnx") + # onnx.save(model.model, "fclayer-model.onnx") # generate input data - input_tensor = np.random.randint(2, size=832) - input_tensor = (np.asarray(input_tensor, dtype=np.float32)).reshape(1,13,64) - input_dict = {"inp" : input_tensor} + input_tensor = np.random.randint(2, size=16) + input_tensor = (np.asarray(input_tensor, dtype=np.float32)).reshape(1, 2, 8) + input_dict = {"inp": input_tensor} # generate weights - weights_tensor = np.random.randint(2, size=851968) - weights_tensor = (np.asarray(weights_tensor, dtype=np.float32)).reshape(64,32,416) + weights_tensor = np.random.randint(2, size=512) + weights_tensor = (np.asarray(weights_tensor, dtype=np.float32)).reshape(8, 4, 16) input_dict["weights"] = weights_tensor # generate threshold activation - thresh_tensor = np.random.randint(2, size=16384) - thresh_tensor = (np.asarray(thresh_tensor, dtype=np.float32)).reshape(32,32,1,16,1) + thresh_tensor = np.random.randint(2, size=192) + thresh_tensor = (np.asarray(thresh_tensor, dtype=np.float32)).reshape(16, 4, 3) input_dict["thresh"] = thresh_tensor - output_dict = oxe.execute_onnx(model, input_dict) - - - + output_dict = oxe.execute_onnx(model, input_dict) + print(output_dict) -- GitLab