diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 3b0fa55b0065e6ceeb8ad2eb7282a413adf443d7..08e48f435f7bf86e538c1b67ead2ec0f555eb727 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -29,7 +29,6 @@
 import numpy as np
 import os
 import warnings
-from onnx import TensorProto, helper
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
@@ -46,6 +45,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
         my_attrs = {
             "NumChannels": ("i", True, 0),
             "PE": ("i", True, 0),
+            # how many duplicated output streams to create
+            "NumOutputStreams": ("i", True, 0),
             # FINN DataTypes for input
             "inputDataType": ("s", True, ""),
             # number of input vectors, examples:
@@ -57,6 +58,9 @@ class DuplicateStreams_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
+    def get_num_output_streams(self):
+        return self.get_nodeattr("NumOutputStreams")
+
     def get_normal_input_shape(self):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -82,26 +86,13 @@ class DuplicateStreams_Batch(HLSCustomOp):
         exp_ishape = self.get_normal_input_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape."
+        num_out = self.get_num_output_streams()
+        assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs"
 
         oshape = self.get_normal_output_shape()
-        values = np.zeros(oshape).astype(np.float32)
-        split_input = np.concatenate((values, values), axis=0)
-
-        split_in = helper.make_tensor_value_info(
-            model.make_new_valueinfo_name(), TensorProto.FLOAT, oshape
-        )
-
-        model.graph.value_info.append(split_in)  # requires clean up
-        model.set_initializer(split_in.name, split_input)
-
-        shape_comp_node = helper.make_node(
-            "Split",
-            inputs=[split_in.name],
-            outputs=[self.onnx_node.output[0], self.onnx_node.output[1]],
-            axis=0,
-        )
-
-        return shape_comp_node
+        ret = super().make_const_shape_op(oshape)
+        ret.output[:] = self.onnx_node.output
+        return ret
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -115,8 +106,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
             warnings.warn(warn_str)
         self.set_nodeattr("inputDataType", idt.name)
         odt = self.get_output_datatype()
-        model.set_tensor_datatype(self.onnx_node.output[0], odt)
-        model.set_tensor_datatype(self.onnx_node.output[1], odt)
+        for my_out in self.onnx_node.output:
+            model.set_tensor_datatype(my_out, odt)
 
     def verify_node(self):
         info_messages = []
@@ -133,6 +124,7 @@ class DuplicateStreams_Batch(HLSCustomOp):
             self.get_nodeattr("executable_path")
             self.get_nodeattr("NumChannels")
             self.get_nodeattr("PE")
+            self.get_nodeattr("NumOutputStreams")
             self.get_nodeattr("inputDataType")
             info_messages.append("All necessary attributes exist")
         except Exception:
@@ -165,12 +157,46 @@ class DuplicateStreams_Batch(HLSCustomOp):
         return out_width
 
     def get_number_output_values(self):
-        return 2 * np.prod(self.get_folded_output_shape()[1:-1])
+        return self.get_num_output_streams() * np.prod(
+            self.get_folded_output_shape()[1:-1]
+        )
 
     def get_exp_cycles(self):
         # Channels/PE * batch size * fmdim * fmdim
         return np.prod(self.get_folded_output_shape()[:-1])
 
+    def generate_params(self, model, path):
+        n_outputs = self.get_num_output_streams()
+        inp_streams = []
+        commands = []
+        o_stream_w = self.get_outstream_width()
+        i_stream_w = self.get_instream_width()
+        in_stream = "hls::stream<ap_uint<%d> > &in0" % (i_stream_w)
+        inp_streams.append(in_stream)
+        commands.append("ap_uint<%d> e = in0.read();" % i_stream_w)
+        iters = self.get_number_output_values() // self.get_num_output_streams()
+        for i in range(n_outputs):
+            out_stream = "hls::stream<ap_uint<%d> > &out%d" % (o_stream_w, i)
+            inp_streams.append(out_stream)
+            cmd = "out%d.write(e);" % i
+            commands.append(cmd)
+
+        impl_hls_code = []
+        impl_hls_code.append("void DuplicateStreamsCustom(")
+        impl_hls_code.append(",".join(inp_streams))
+        impl_hls_code.append(") {")
+        impl_hls_code.append("for(unsigned int i = 0; i < %d; i++) {" % iters)
+        impl_hls_code.append("#pragma HLS PIPELINE II=1")
+        impl_hls_code.append("\n".join(commands))
+        impl_hls_code.append("}")
+        impl_hls_code.append("}")
+        impl_hls_code = "\n".join(impl_hls_code)
+
+        impl_filename = "{}/duplicate_impl.hpp".format(path)
+        f_impl = open(impl_filename, "w")
+        f_impl.write(impl_hls_code)
+        f_impl.close()
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
@@ -178,6 +204,7 @@ class DuplicateStreams_Batch(HLSCustomOp):
         exp_oshape = self.get_normal_output_shape()
         folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
+        n_outputs = self.get_num_output_streams()
 
         if mode == "cppsim":
             code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -205,17 +232,15 @@ class DuplicateStreams_Batch(HLSCustomOp):
             # execute the precompiled model
             super().exec_precompiled_singlenode_model()
             # load output npy file
-            super().npy_to_dynamic_outputs(context, ["output0.npy", "output1.npy"])
-            assert (
-                context[node.output[0]].shape == folded_oshape
-            ), "cppsim \
-            did not produce expected ofolded utput shape"
-            assert (
-                context[node.output[1]].shape == folded_oshape
-            ), "cppsim \
-            did not produce expected ofolded utput shape"
-            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
-            context[node.output[1]] = context[node.output[1]].reshape(*exp_oshape)
+            super().npy_to_dynamic_outputs(
+                context, ["output%d.npy" % i for i in range(n_outputs)]
+            )
+            for i in range(n_outputs):
+                assert (
+                    context[node.output[i]].shape == folded_oshape
+                ), "cppsim \
+                did not produce expected folded output shape"
+                context[node.output[i]] = context[node.output[i]].reshape(*exp_oshape)
         elif mode == "rtlsim":
             sim = self.get_rtlsim()
             nbits = self.get_instream_width()
@@ -226,41 +251,30 @@ class DuplicateStreams_Batch(HLSCustomOp):
             super().toggle_clk(sim)
             rtlsim_dict = {
                 "inputs": {"in0": rtlsim_inp},
-                "outputs": {"out0": [], "out1": []},
+                "outputs": {},
             }
+            for i in range(n_outputs):
+                rtlsim_dict["outputs"]["out%d" % i] = []
             self.rtlsim_multi_io(sim, rtlsim_dict)
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
             out_shape = self.get_folded_output_shape()
+            for i in range(n_outputs):
+                out_npy_path = "%s/output%d.npy" % (code_gen_dir, i)
+                rtlsim_output_to_npy(
+                    rtlsim_dict["outputs"]["out%d" % i],
+                    out_npy_path,
+                    odt,
+                    out_shape,
+                    packed_bits,
+                    target_bits,
+                )
+                # load and reshape output
+                output = np.load(out_npy_path)
+                output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+                context[node.output[i]] = output
 
-            out_npy_path = "{}/output0.npy".format(code_gen_dir)
-            rtlsim_output_to_npy(
-                rtlsim_dict["outputs"]["out0"],
-                out_npy_path,
-                odt,
-                out_shape,
-                packed_bits,
-                target_bits,
-            )
-            # load and reshape output 0
-            output = np.load(out_npy_path)
-            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
-            context[node.output[0]] = output
-
-            out_npy_path = "{}/output1.npy".format(code_gen_dir)
-            rtlsim_output_to_npy(
-                rtlsim_dict["outputs"]["out1"],
-                out_npy_path,
-                odt,
-                out_shape,
-                packed_bits,
-                target_bits,
-            )
-            # load and reshape output 1
-            output = np.load(out_npy_path)
-            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
-            context[node.output[1]] = output
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
@@ -277,7 +291,7 @@ class DuplicateStreams_Batch(HLSCustomOp):
         ), """Output1 shape doesn't match expected shape."""
 
     def global_includes(self):
-        self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+        self.code_gen_dict["$GLOBALS$"] = ['#include "duplicate_impl.hpp"']
 
     def defines(self, var):
         self.code_gen_dict["$DEFINES$"] = []
@@ -298,24 +312,23 @@ class DuplicateStreams_Batch(HLSCustomOp):
         )
 
     def strm_decl(self):
+        n_outputs = self.get_num_output_streams()
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
         )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out0 ("out0");'.format(self.get_outstream_width())
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out1 ("out1");'.format(self.get_outstream_width())
-        )
+        for i in range(n_outputs):
+            out_name = "out%d" % i
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<%d>> %s ("%s");'
+                % (self.get_outstream_width(), out_name, out_name)
+            )
 
     def docompute(self):
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format(
-                self.get_outstream_width(),
-                self.get_number_output_values() // 2,
-            )
-        ]
+        n_outputs = self.get_num_output_streams()
+        ostreams = ["out%d" % x for x in range(n_outputs)]
+        dc = "DuplicateStreamsCustom(in0, %s);" % (",".join(ostreams))
+        self.code_gen_dict["$DOCOMPUTE$"] = [dc]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -325,62 +338,67 @@ class DuplicateStreams_Batch(HLSCustomOp):
         packed_hls_type = "ap_uint<%d>" % packed_bits
         elem_hls_type = dtype.get_hls_datatype_str()
         npy_type = "float"
-        npy_out = "%s/output0.npy" % code_gen_dir
-        npy_out1 = "%s/output1.npy" % code_gen_dir
+        n_outputs = self.get_num_output_streams()
         oshape = self.get_folded_output_shape()
         oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out0, %s, "%s");'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                oshape_cpp_str,
-                npy_out,
+        outstrm_code = []
+
+        for i in range(n_outputs):
+            out_name = "out%d" % i
+            npy_out = "%s/output%d.npy" % (code_gen_dir, i)
+            outstrm_code.append(
+                'apintstream2npy<%s, %s, %d, %s>(%s, %s, "%s");'
+                % (
+                    packed_hls_type,
+                    elem_hls_type,
+                    elem_bits,
+                    npy_type,
+                    out_name,
+                    oshape_cpp_str,
+                    npy_out,
+                )
             )
-        ]
-        self.code_gen_dict["$DATAOUTSTREAM$"] += [
-            'apintstream2npy<%s, %s, %d, %s>(out1, %s, "%s");'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                oshape_cpp_str,
-                npy_out1,
-            )
-        ]
+        self.code_gen_dict["$DATAOUTSTREAM$"] = outstrm_code
 
     def save_as_npy(self):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
+        n_outputs = self.get_num_output_streams()
+        inp_streams = []
+        o_stream_w = self.get_outstream_width()
+        i_stream_w = self.get_instream_width()
+        in_stream = "hls::stream<ap_uint<%d> > &in0" % (i_stream_w)
+        inp_streams.append(in_stream)
+        for i in range(n_outputs):
+            out_stream = "hls::stream<ap_uint<%d> > &out%d" % (o_stream_w, i)
+            inp_streams.append(out_stream)
+
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0,
-                hls::stream<ap_uint<{}>> &out0,
-                hls::stream<ap_uint<{}>> &out1)""".format(
+            """void {}({})""".format(
                 self.onnx_node.name,
-                self.get_instream_width(),
-                self.get_outstream_width(),
-                self.get_outstream_width(),
+                ",".join(inp_streams),
            )
         ]
 
     def pragmas(self):
+        n_outputs = self.get_num_output_streams()
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out0")
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out1")
+        for i in range(n_outputs):
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=out%d" % i
+            )
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["m_axis"] = [
-            ("out0_V_V", self.get_outstream_width_padded()),
-            ("out1_V_V", self.get_outstream_width_padded()),
-        ]
+        n_outputs = self.get_num_output_streams()
+        intf_names["m_axis"] = []
+        for i in range(n_outputs):
+            intf_names["m_axis"].append(
+                ("out%d_V_V" % i, self.get_outstream_width_padded())
+            )
         return intf_names
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 73bf1165afa9418be0c89f77797de538275fd220..4592fc2f8a2e908a91dd74fef58fb03467f9e7c0 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -61,6 +61,7 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt):
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         NumChannels=ch,
+        NumOutputStreams=2,
         PE=pe,
         inputDataType=idt.name,
         numInputVectors=[1, idim, idim],
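
For reference, generate_params now writes a node-specific duplicate_impl.hpp instead of the fixed two-output DuplicateStreams_Batch call from streamtools.h that the old docompute used. Assuming an illustrative configuration of NumOutputStreams=3, 32-bit input and output streams and 64 output values per stream (these numbers are examples, not taken from the diff), the generated header would look roughly like the sketch below; the generator emits the whole argument list on one line, it is wrapped here for readability:

    void DuplicateStreamsCustom(
        hls::stream<ap_uint<32> > &in0,
        hls::stream<ap_uint<32> > &out0,
        hls::stream<ap_uint<32> > &out1,
        hls::stream<ap_uint<32> > &out2
    ) {
        // single pipelined loop: read one input word and write it to every output stream
        for(unsigned int i = 0; i < 64; i++) {
    #pragma HLS PIPELINE II=1
            ap_uint<32> e = in0.read();
            out0.write(e);
            out1.write(e);
            out2.write(e);
        }
    }

This is the function that the DuplicateStreamsCustom(in0, out0,out1,out2); call emitted by docompute binds to, pulled in through the new '#include "duplicate_impl.hpp"' in global_includes.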