Commit 1dc6a212 authored by auphelia
[Code gen & test] Finished running version of fclayer code generation and execution

parent 5c5dc983
import sys
import os
import numpy as np
import subprocess

import finn.core.utils as utils
from finn.backend.fpgadataflow.utils import numpy_to_hls_code
from finn.core.datatype import DataType
from finn.custom_op.fpgadataflow import HLSCustomOp
class StreamingFCLayer_Batch(HLSCustomOp):
def __init__(self):
super().__init__()
self.WMEM = 0
self.TMEM = 0
def make_shape_compatible_op(self, node):
pass
@@ -28,34 +31,44 @@ class StreamingFCLayer_Batch(HLSCustomOp):
temp_files.append("input_{}.npy".format(in_ind))
elif in_ind == 1:
weights = context[inputs]
self.WMEM = weights.shape[2]
weights = np.transpose(weights, (1, 2, 0))
weights = np.expand_dims(weights, 0)
weights = numpy_to_hls_code(
weights, DataType.BINARY, "weights", True, True
)
f_weights = open("params.h", "w")
f_weights.write(
"static BinaryWeights<{},{},{}> weights = ".format(
self.SIMD, self.PE, self.WMEM
)
)
f_weights.write(weights)
f_weights.close()
temp_files.append("params.h")
else:
thresholds = context[inputs]
self.TMEM = thresholds.shape[0]
thresholds = np.transpose(thresholds, (1, 0, 2))
thresholds = np.expand_dims(thresholds, 0)
thresholds = numpy_to_hls_code(
thresholds, DataType.BINARY, "thresholds", True, True
)
f_thresh = open("thresh.h", "w")
f_thresh.write(
"""static ThresholdsActivation<{},{},1,ap_uint<16>,
ap_uint<1>> threshs = """.format(
self.TMEM, self.PE
)
)
f_thresh.write(thresholds)
f_thresh.close()
temp_files.append("thresh.h")
in_ind += 1
self.code_generation(node)
temp_files.append("execute_{}.cpp".format(node.op_type))
bash_compile = """g++ -o execute_{} execute_{}.cpp
@@ -77,86 +90,74 @@ class StreamingFCLayer_Batch(HLSCustomOp):
for temp_file in temp_files:
os.remove(temp_file)
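For reference, a minimal numpy sketch of the reshaping done in execute_node above before the parameters are dumped to params.h and thresh.h. The toy shapes are taken from the test at the bottom of this page; the snippet itself is illustrative only and not repository code.

import numpy as np

# Toy shapes from the test below: weights (SIMD, PE, WMEM) = (8, 4, 16),
# thresholds (TMEM, PE, n_thres) = (16, 4, 3).
weights = np.random.randint(2, size=(8, 4, 16)).astype(np.float32)
thresholds = np.random.randint(2, size=(16, 4, 3)).astype(np.float32)

# Same transpose/expand_dims steps as in execute_node above.
weights = np.expand_dims(np.transpose(weights, (1, 2, 0)), 0)
thresholds = np.expand_dims(np.transpose(thresholds, (1, 0, 2)), 0)

assert weights.shape == (1, 4, 16, 8)      # (1, PE, WMEM, SIMD)
assert thresholds.shape == (1, 4, 16, 3)   # (1, PE, TMEM, n_thres)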
def get_attributes(self, node):
self.resType = utils.get_by_name(node.attribute, "resType").s.decode("utf-8")
self.MW = utils.get_by_name(node.attribute, "MW").i
self.MH = utils.get_by_name(node.attribute, "MH").i
self.SIMD = utils.get_by_name(node.attribute, "SIMD").i
self.PE = utils.get_by_name(node.attribute, "PE").i
self.resDataType = utils.get_by_name(node.attribute, "resDataType").s.decode(
"utf-8"
)
def global_includes(self, node):
self.code_gen_dict["$GLOBALS$"] = ['// no additional includes necessary']
self.code_gen_dict["$GLOBALS$"] = [
"""#include "weights.hpp" \n#include "activations.hpp" \n
#include "params.h" \n#include "thresh.h" """
]
def defines(self, node):
numReps = 2
self.code_gen_dict["$DEFINES$"] = [
"""#define MW {}\n #define MH {}\n
#define SIMD {}\n #define PE {}\n #define numReps {}""".format(
self.MW, self.MH, self.SIMD, self.PE, numReps
"""#define MW1 {}\n #define MH1 {}\n #define SIMD1 {}\n
#define PE1 {}\n #define WMEM1 {}\n #define TMEM1 {}\n
#define numReps {}""".format(
self.MW, self.MH, self.SIMD, self.PE, self.WMEM, self.TMEM, numReps
)
]
def read_npy_data(self, node):
self.code_gen_dict["$READNPYDATA$"] = []
self.code_gen_dict["$READNPYDATA$"].append(
"""cnpy::NpyArray arr0 = cnpy::npy_load("input_0.npy");\n
float* loaded_data0 = arr0.data<float>();"""
)
self.code_gen_dict["$READNPYDATA$"].append(
"""int num_values0 = 1; \n
for(int i = 0; i < arr0.shape.size(); i++){\n
num_values0 *= arr0.shape[i]; \n }"""
)
self.code_gen_dict["$READNPYDATA$"].append(
"ap_uint<{}> dat0;".format(self.SIMD)
)
self.code_gen_dict["$READNPYDATA$"].append(
"for(int i=0; i < num_values0/{}; i++){{".format(self.SIMD)
)
for line in range(self.SIMD):
self.code_gen_dict["$READNPYDATA$"].append(
"dat0.range({},{}) = loaded_data0[i+((num_values0/{})*{})];".format(
line, line, self.SIMD, line
)
)
self.code_gen_dict["$READNPYDATA$"].append("in0 << dat0;")
self.code_gen_dict["$READNPYDATA$"].append("}")
def strm_decl(self, node):
self.code_gen_dict["$STREAMDECLARATIONS$"] = []
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.SIMD)
)
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
'hls::stream<ap_uint<{}>> out ("out");'.format(self.PE)
)
def docompute(self, node):
self.code_gen_dict["$DOCOMPUTE$"] = [
"{}<MW, MH, SIMD, PE, {}>(in0, loaded_data1, loaded_data2, out, numReps, {});".format(node.op_type, self.resDataType, self.resType)
"""{}<MW1, MH1, SIMD1, PE1, {}>
(in0, out, weights, threshs, numReps, {});""".format(
node.op_type, self.resDataType, self.resType
)
]
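As an illustration (a standalone snippet, with the attribute values taken from the test at the bottom of this page), the $DOCOMPUTE$ entry above expands to the call that appears in the generated C++ file further down:

call = """{}<MW1, MH1, SIMD1, PE1, {}>
(in0, out, weights, threshs, numReps, {});""".format(
    "StreamingFCLayer_Batch", "Recast<XnorMul>", "ap_resource_lut()"
)
print(call)
# StreamingFCLayer_Batch<MW1, MH1, SIMD1, PE1, Recast<XnorMul>>
# (in0, out, weights, threshs, numReps, ap_resource_lut());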
def dataoutstrm(self, node):
@@ -189,14 +190,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
)
self.code_gen_dict["$DATAOUTSTREAM$"].append("}")
def save_as_npy(self, node):
self.code_gen_dict["$SAVEASCNPY$"] = [
"""cnpy::npy_save("output.npy",&output_data_vector[0],
{{1,{},{}}},"w");""".format(
int(self.MH / self.PE), int(self.PE),
)
]
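The generated execute_*.cpp shown next is produced by substituting the $...$ entries collected above into a C++ template. The template and code_generation() itself are not part of this diff, so the following is only a hypothetical sketch of that substitution step:

def fill_template(template: str, code_gen_dict: dict) -> str:
    # Replace each placeholder (e.g. "$DEFINES$") with its collected code lines.
    cpp = template
    for placeholder, lines in code_gen_dict.items():
        cpp = cpp.replace(placeholder, "\n".join(lines))
    return cpp

# e.g. open("execute_StreamingFCLayer_Batch.cpp", "w").write(fill_template(tmpl, d))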
#include "cnpy.h"
#include <vector>
#include "bnn-library.h"
// includes for network parameters
#include "weights.hpp"
#include "activations.hpp"
#include "interpret.hpp"
#include "mvau.hpp"
#include "utils.hpp"
#include "params.h"
// defines for network parameters
#define MW1 832
#define MH1 1024
#define SIMD1 64
#define PE1 32
#define WMEM1 416
#define TMEM1 32
//static BinaryWeights<SIMD1, PE1, WMEM1> weights;
static ThresholdsActivation<TMEM1,PE1,1,ap_int<16>,ap_uint<1>> threshs;
int main(){
hls::stream<ap_uint<64> > in0 ("in0");
hls::stream<ap_uint<32> > out ("out");
cnpy::NpyArray arr0 = cnpy::npy_load("input_0.npy");
float* loaded_data0 = arr0.data<float>();
int num_values0 = 1;
for(int i = 0; i < arr0.shape.size(); i++){
num_values0 *= arr0.shape[i];
}
ap_uint<64> dat0;
for(int i=0; i < num_values0/64; i++){
dat0.range(0,0) = loaded_data0[i+((num_values0/64)*0)];
dat0.range(1,1) = loaded_data0[i+((num_values0/64)*1)];
dat0.range(2,2) = loaded_data0[i+((num_values0/64)*2)];
dat0.range(3,3) = loaded_data0[i+((num_values0/64)*3)];
dat0.range(4,4) = loaded_data0[i+((num_values0/64)*4)];
dat0.range(5,5) = loaded_data0[i+((num_values0/64)*5)];
dat0.range(6,6) = loaded_data0[i+((num_values0/64)*6)];
dat0.range(7,7) = loaded_data0[i+((num_values0/64)*7)];
dat0.range(8,8) = loaded_data0[i+((num_values0/64)*8)];
dat0.range(9,9) = loaded_data0[i+((num_values0/64)*9)];
dat0.range(10,10) = loaded_data0[i+((num_values0/64)*10)];
dat0.range(11,11) = loaded_data0[i+((num_values0/64)*11)];
dat0.range(12,12) = loaded_data0[i+((num_values0/64)*12)];
dat0.range(13,13) = loaded_data0[i+((num_values0/64)*13)];
dat0.range(14,14) = loaded_data0[i+((num_values0/64)*14)];
dat0.range(15,15) = loaded_data0[i+((num_values0/64)*15)];
dat0.range(16,16) = loaded_data0[i+((num_values0/64)*16)];
dat0.range(17,17) = loaded_data0[i+((num_values0/64)*17)];
dat0.range(18,18) = loaded_data0[i+((num_values0/64)*18)];
dat0.range(19,19) = loaded_data0[i+((num_values0/64)*19)];
dat0.range(20,20) = loaded_data0[i+((num_values0/64)*20)];
dat0.range(21,21) = loaded_data0[i+((num_values0/64)*21)];
dat0.range(22,22) = loaded_data0[i+((num_values0/64)*22)];
dat0.range(23,23) = loaded_data0[i+((num_values0/64)*23)];
dat0.range(24,24) = loaded_data0[i+((num_values0/64)*24)];
dat0.range(25,25) = loaded_data0[i+((num_values0/64)*25)];
dat0.range(26,26) = loaded_data0[i+((num_values0/64)*26)];
dat0.range(27,27) = loaded_data0[i+((num_values0/64)*27)];
dat0.range(28,28) = loaded_data0[i+((num_values0/64)*28)];
dat0.range(29,29) = loaded_data0[i+((num_values0/64)*29)];
dat0.range(30,30) = loaded_data0[i+((num_values0/64)*30)];
dat0.range(31,31) = loaded_data0[i+((num_values0/64)*31)];
dat0.range(32,32) = loaded_data0[i+((num_values0/64)*32)];
dat0.range(33,33) = loaded_data0[i+((num_values0/64)*33)];
dat0.range(34,34) = loaded_data0[i+((num_values0/64)*34)];
dat0.range(35,35) = loaded_data0[i+((num_values0/64)*35)];
dat0.range(36,36) = loaded_data0[i+((num_values0/64)*36)];
dat0.range(37,37) = loaded_data0[i+((num_values0/64)*37)];
dat0.range(38,38) = loaded_data0[i+((num_values0/64)*38)];
dat0.range(39,39) = loaded_data0[i+((num_values0/64)*39)];
dat0.range(40,40) = loaded_data0[i+((num_values0/64)*40)];
dat0.range(41,41) = loaded_data0[i+((num_values0/64)*41)];
dat0.range(42,42) = loaded_data0[i+((num_values0/64)*42)];
dat0.range(43,43) = loaded_data0[i+((num_values0/64)*43)];
dat0.range(44,44) = loaded_data0[i+((num_values0/64)*44)];
dat0.range(45,45) = loaded_data0[i+((num_values0/64)*45)];
dat0.range(46,46) = loaded_data0[i+((num_values0/64)*46)];
dat0.range(47,47) = loaded_data0[i+((num_values0/64)*47)];
dat0.range(48,48) = loaded_data0[i+((num_values0/64)*48)];
dat0.range(49,49) = loaded_data0[i+((num_values0/64)*49)];
dat0.range(50,50) = loaded_data0[i+((num_values0/64)*50)];
dat0.range(51,51) = loaded_data0[i+((num_values0/64)*51)];
dat0.range(52,52) = loaded_data0[i+((num_values0/64)*52)];
dat0.range(53,53) = loaded_data0[i+((num_values0/64)*53)];
dat0.range(54,54) = loaded_data0[i+((num_values0/64)*54)];
dat0.range(55,55) = loaded_data0[i+((num_values0/64)*55)];
dat0.range(56,56) = loaded_data0[i+((num_values0/64)*56)];
dat0.range(57,57) = loaded_data0[i+((num_values0/64)*57)];
dat0.range(58,58) = loaded_data0[i+((num_values0/64)*58)];
dat0.range(59,59) = loaded_data0[i+((num_values0/64)*59)];
dat0.range(60,60) = loaded_data0[i+((num_values0/64)*60)];
dat0.range(61,61) = loaded_data0[i+((num_values0/64)*61)];
dat0.range(62,62) = loaded_data0[i+((num_values0/64)*62)];
dat0.range(63,63) = loaded_data0[i+((num_values0/64)*63)];
in0 << dat0;
}
//cnpy::NpyArray arr1 = cnpy::npy_load("input_1.npy");
//float* loaded_data1 = arr1.data<float>();
//cnpy::NpyArray arr2 = cnpy::npy_load("input_2.npy");
//float* loaded_data2 = arr2.data<float>();
//for(int i=0; i < PE1; i++){
// for(int k; k < WMEM1; k++){
// ap_uint<64> dat1;
// for(int j; j < SIMD1; j++){
// if(i == 0){
// dat1.range(j,j) = loaded_data1[j+(k-1)*64];
// }
// else{
// dat1.range(j,j) = loaded_data1[j+i*(k-1)*64];
// }
//
// }
// weights.m_weights[i][k] = dat1;
// }
// }
for(int i=0; i < PE1; i++){
for(int k = 0; k < TMEM1; k++){
ap_uint<64> dat2;
for(int j = 0; j < 64; j++){
if(i == 0){
dat2.range(j,j) = loaded_data2[j+(k-1)*64];
}
else{
dat2.range(j,j) = loaded_data2[j+i*(k-1)*64];
}
}
threshs.m_thresholds[i][k][0] = dat2;
}
}
int numReps = 2;
StreamingFCLayer_Batch<MW1, MH1, SIMD1, PE1, Recast<XnorMul>>(in0, out, weights, threshs, numReps, ap_resource_lut());
ap_uint<32> out_data;
std::vector<ap_uint<32>> out_data_vector;
while(out.read_nb(out_data)){
out_data_vector.push_back(out_data);
}
std::vector<float> output_data_vector;
for(std::vector<ap_uint<32>>::iterator it = out_data_vector.begin();
it != out_data_vector.end(); ++it){
ap_uint<32> output_data = *it;
output_data_vector.push_back(output_data.range(0,0));
output_data_vector.push_back(output_data.range(1,1));
output_data_vector.push_back(output_data.range(2,2));
output_data_vector.push_back(output_data.range(3,3));
output_data_vector.push_back(output_data.range(4,4));
output_data_vector.push_back(output_data.range(5,5));
output_data_vector.push_back(output_data.range(6,6));
output_data_vector.push_back(output_data.range(7,7));
output_data_vector.push_back(output_data.range(8,8));
output_data_vector.push_back(output_data.range(9,9));
output_data_vector.push_back(output_data.range(10,10));
output_data_vector.push_back(output_data.range(11,11));
output_data_vector.push_back(output_data.range(12,12));
output_data_vector.push_back(output_data.range(13,13));
output_data_vector.push_back(output_data.range(14,14));
output_data_vector.push_back(output_data.range(15,15));
output_data_vector.push_back(output_data.range(16,16));
output_data_vector.push_back(output_data.range(17,17));
output_data_vector.push_back(output_data.range(18,18));
output_data_vector.push_back(output_data.range(19,19));
output_data_vector.push_back(output_data.range(20,20));
output_data_vector.push_back(output_data.range(21,21));
output_data_vector.push_back(output_data.range(22,22));
output_data_vector.push_back(output_data.range(23,23));
output_data_vector.push_back(output_data.range(24,24));
output_data_vector.push_back(output_data.range(25,25));
output_data_vector.push_back(output_data.range(26,26));
output_data_vector.push_back(output_data.range(27,27));
output_data_vector.push_back(output_data.range(28,28));
output_data_vector.push_back(output_data.range(29,29));
output_data_vector.push_back(output_data.range(30,30));
output_data_vector.push_back(output_data.range(31,31));
}
cnpy::npy_save("output.npy",&output_data_vector[0],
{1,32,32},"w");
}
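The long block of dat0.range(...) assignments above packs 64 float values into one 64-bit word per loop iteration, reading the flattened input in a bit-transposed order: the input is split into 64 contiguous chunks and chunk b supplies bit b of every word. A small numpy equivalent (illustrative only, not repository code):

import numpy as np

N = 128                                      # toy number of binary input values
loaded_data0 = np.random.randint(2, size=N)  # stands in for the loaded npy data
words = []
for i in range(N // 64):
    w = 0
    for b in range(64):
        # mirrors: dat0.range(b, b) = loaded_data0[i + (num_values0/64)*b]
        w |= int(loaded_data0[i + (N // 64) * b]) << b
    words.append(w)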
# import onnx
import numpy as np
from onnx import TensorProto, helper
import finn.core.onnx_exec as oxe
from finn.core.datatype import DataType
@@ -8,27 +8,32 @@ from finn.core.modelwrapper import ModelWrapper
def test_fclayer_batch():
inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 2, 8])
outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 4, 4])
FCLayer_node = helper.make_node(
"StreamingFCLayer_Batch",
["inp", "weights", "thresh"],
["outp"],
domain="finn",
backend="fpgadataflow",
resType="ap_resource_lut()",
MW=16,
MH=16,
SIMD=8,
PE=4,
resDataType="Recast<XnorMul>",
)
graph = helper.make_graph(
nodes=[FCLayer_node],
name="fclayer_graph",
inputs=[inp],
outputs=[outp],
value_info=[
helper.make_tensor_value_info("weights", TensorProto.FLOAT, [8, 4, 16]),
helper.make_tensor_value_info("thresh", TensorProto.FLOAT, [16, 4, 3]),
],
)
model = helper.make_model(graph, producer_name="fclayer-model")
@@ -40,24 +45,22 @@ def test_fclayer_batch():
for tensor in graph.output:
model.set_tensor_datatype(tensor.name, DataType["BIPOLAR"])
# onnx.save(model.model, "fclayer-model.onnx")
# generate input data
input_tensor = np.random.randint(2, size=16)
input_tensor = (np.asarray(input_tensor, dtype=np.float32)).reshape(1, 2, 8)
input_dict = {"inp": input_tensor}
# generate weights
weights_tensor = np.random.randint(2, size=512)
weights_tensor = (np.asarray(weights_tensor, dtype=np.float32)).reshape(8, 4, 16)
input_dict["weights"] = weights_tensor
# generate threshold activation
thresh_tensor = np.random.randint(2, size=192)
thresh_tensor = (np.asarray(thresh_tensor, dtype=np.float32)).reshape(16, 4, 3)
input_dict["thresh"] = thresh_tensor
output_dict = oxe.execute_onnx(model, input_dict)
print(output_dict)
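A possible sanity check on the result (not part of the commit): with the graph declared above, output_dict should hold an "outp" tensor whose shape matches the declared value_info.

produced = output_dict["outp"]
assert produced.shape == (1, 4, 4)   # matches the "outp" value_info above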