diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py new file mode 100644 index 0000000000000000000000000000000000000000..9b718ecbbc490610790b68871080de23a54f4891 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -0,0 +1,346 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +import math +from onnx import TensorProto, helper +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp + + +# the IODMA interfaces a memory-mapped AXI interface with an AXI stream +# direction "in": pulls data from AXI-MM to AXI stream +# direction "out": pushes data from AXI stream to AXI-MM + +# DMA Addressing +# - burst mode can be "wrap" or "increment" +# - "increment" bursts will increment the address when moving to the next image +# - "wrap" bursts will reinitialize the address to the start address, +# and are useful for e.g. streaming weights, where the same buffer is +# repeatedly read into the FPGA +# - no additional alignment restrictions beyond anything specified in the AXI spec + +# Interfaces +# - AXI-MM name specified by intfName unless this is set to "" (empty, the default) +# in which case output AXI-MM are named "out" and input AXI-MM are named "in0" +# - AXI-MM interface width (in bits) is specified by intfWidth +# - AXI-Stream interface width (in bits) is specified by streamWidth +# - If intfWidth and streamWidth are not equal, the DMA core performs +# width conversion by going up to the least common multiple of bitwidths +# e.g. intfWidth=32b -> 96b -> streamWidth=24b +# - transfers occur in multiples of the AXI-MM interface width, therefore +# the total number of bits in the tensor must be a multiple of intfWidth +# - transfers occur in multiples of the AXI-Stream interface width, therefore +# the total number of bits in the tensor must be a multiple of streamWidth +# - both interface widths must be a multiple of 8b (AXI protocol requirement) +# - in most systems, intfWidth is also restricted to a power of 2 (e.g. 
#   Vitis) but this is not universal so we don't check here explicitly

# Input/output tensor shapes
# - The data being moved is a tensor of shape numInputVectors+[NumChannels]
# - The data type of the tensor elements is specified by dataType
# - on the stream side
#   - the normal shape is the same as the ONNX tensor attached to it
#   - the folded shape is computed from the stream width and normal shape
# - on the AXI-MM side
#   - the normal shape is the same as the one on the stream side
#   - the folded shape is not defined


class IODMA(HLSCustomOp):
    """Class that corresponds to finn-hlslib DMA function(s).

    The node moves a tensor of shape ``numInputVectors + [NumChannels]``
    between a memory-mapped AXI interface (``intfWidth`` bits wide) and an
    AXI stream (``streamWidth`` bits wide). With ``direction="in"`` the
    memory-mapped side is the node input and the stream is the node output;
    with ``direction="out"`` the roles are swapped.
    """

    def __init__(self, onnx_node):
        super().__init__(onnx_node)

    def get_nodeattr_types(self):
        """Return the attribute table of this op merged with the base class one.

        Each entry maps an attribute name to the tuple used by the base
        class; entries coming from the base class override same-named
        entries defined here.
        """
        my_attrs = {
            "NumChannels": ("i", True, 0),
            # FINN input datatype
            "dataType": ("s", True, ""),
            # Stream parameters
            "streamWidth": ("i", False, 32),
            # DMA-specific parameters
            "intfWidth": ("i", False, 32),
            "burstMode": ("s", False, "increment"),
            "direction": ("s", False, "in"),
            # shape describing input vecs per execution
            "numInputVectors": ("ints", False, [1]),
            # name of axi-mm interface
            "intfName": ("s", False, ""),
        }
        my_attrs.update(super().get_nodeattr_types())
        return my_attrs

    def get_normal_input_shape(self):
        """Return the unfolded input shape: numInputVectors + [NumChannels]."""
        vecs = list(self.get_nodeattr("numInputVectors"))
        num_ch = self.get_nodeattr("NumChannels")
        return tuple(vecs + [num_ch])

    def get_normal_output_shape(self):
        """Return the unfolded output shape (identical to the input shape)."""
        return self.get_normal_input_shape()

    def _fold_by_stream_width(self, normal_shape, dtype_bits, side):
        """Fold the innermost dimension of normal_shape by the stream width.

        The last dimension is split into (fold depth, elements per stream
        word), where elements per word is streamWidth // dtype_bits.

        normal_shape -- unfolded tensor shape
        dtype_bits   -- bit width of one tensor element
        side         -- "Input" or "Output"; only used in the assert message

        Raises AssertionError if streamWidth is not a multiple of dtype_bits
        or the innermost dimension is not a multiple of the elements per word.
        """
        shape = list(normal_shape)
        strmw = self.get_nodeattr("streamWidth")
        assert (
            strmw % dtype_bits == 0
        ), "%s stream width must be a multiple of datatype bits" % side
        elems_per_word = strmw // dtype_bits
        assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
        shape[-1] = shape[-1] // elems_per_word
        shape.append(elems_per_word)
        return tuple(shape)

    def get_folded_input_shape(self):
        """Return the folded shape of the input stream.

        Raises ValueError when direction is "in": the input is then the
        memory-mapped side, for which no folded shape is defined.
        """
        if self.get_nodeattr("direction") == "in":
            raise ValueError("Folded input shape not defined for input IODMA")
        return self._fold_by_stream_width(
            self.get_normal_input_shape(),
            self.get_input_datatype().bitwidth(),
            "Input",
        )

    def get_folded_output_shape(self):
        """Return the folded shape of the output stream.

        Raises ValueError when direction is "out": the output is then the
        memory-mapped side, for which no folded shape is defined.
        """
        if self.get_nodeattr("direction") == "out":
            raise ValueError("Folded output shape not defined for output IODMA")
        # the message names the output stream here (the original text said
        # "Input", copied from get_folded_input_shape)
        return self._fold_by_stream_width(
            self.get_normal_output_shape(),
            self.get_output_datatype().bitwidth(),
            "Output",
        )

    def make_shape_compatible_op(self, model):
        """Return a Constant node producing a tensor of this node's output shape.

        Asserts that the shape of the first input tensor in model matches
        get_normal_input_shape().
        """
        exp_ishape = self.get_normal_input_shape()
        oshape = self.get_normal_output_shape()
        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
        assert ishape == exp_ishape, "Unexpected input shape."
        # implement tensor with correct shape
        values = np.random.randn(*oshape).astype(np.float32)
        return helper.make_node(
            "Constant",
            inputs=[],
            outputs=[self.onnx_node.output[0]],
            value=helper.make_tensor(
                name="const_tensor",
                data_type=TensorProto.FLOAT,
                dims=values.shape,
                vals=values.flatten().astype(float),
            ),
        )

    def infer_node_datatype(self, model):
        """Copy the input tensor datatype to the output tensor.

        Asserts that the input tensor datatype matches the dataType attribute.
        """
        node = self.onnx_node
        # data type stays the same
        dtype = model.get_tensor_datatype(node.input[0])
        exp_idtype = self.get_input_datatype()
        assert dtype == exp_idtype, "Unexpected datatype."
        model.set_tensor_datatype(node.output[0], dtype)

    def verify_node(self):
        """No verification is performed for this op."""
        pass

    def get_input_datatype(self):
        """Returns FINN DataType of input."""
        return DataType[self.get_nodeattr("dataType")]

    def get_output_datatype(self):
        """Returns FINN DataType of output. (Same as input datatype)"""
        return self.get_input_datatype()

    def get_instream_width(self):
        """Return the input width in bits: intfWidth for "in", streamWidth for "out".

        Raises ValueError for any other direction.
        """
        direction = self.get_nodeattr("direction")
        if direction == "in":
            return self.get_nodeattr("intfWidth")
        elif direction == "out":
            return self.get_nodeattr("streamWidth")
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")

    def get_outstream_width(self):
        """Return the output width in bits: intfWidth for "out", streamWidth for "in".

        Raises ValueError for any other direction.
        """
        direction = self.get_nodeattr("direction")
        if direction == "out":
            return self.get_nodeattr("intfWidth")
        elif direction == "in":
            return self.get_nodeattr("streamWidth")
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")

    def get_number_output_values(self):
        """Return the number of intfWidth-bit words in one tensor transfer.

        Raises AssertionError if the tensor size in bits is not a multiple
        of intfWidth.
        """
        oshape = self.get_normal_output_shape()
        itype_bits = self.get_input_datatype().bitwidth()
        intfw = self.get_nodeattr("intfWidth")
        # int() turns the numpy scalar from np.prod into a plain Python int
        nbits = int(np.prod(oshape)) * itype_bits
        assert nbits % intfw == 0, "DMA: total transfer size must be word multiple"
        return nbits // intfw

    def global_includes(self):
        """Set the $GLOBALS$ code-gen entry to the required HLS headers."""
        self.code_gen_dict["$GLOBALS$"] = [
            '#include "dma.h"',
            '#include "streamtools.h"',
        ]

    def defines(self, var):
        """Set the $DEFINES$ entry: transfer size in bytes and AXI-MM width.

        var is not used by this op. Raises AssertionError if the tensor size
        in bits is not a multiple of 8.
        """
        itype_bits = self.get_input_datatype().bitwidth()
        total_bits = itype_bits * int(np.prod(self.get_normal_input_shape()))
        assert total_bits % 8 == 0, "DMA input not a multiple of 1 Byte"
        total_bytes = total_bits // 8
        self.code_gen_dict["$DEFINES$"] = [
            """#define NumBytes1 {}\n#define DataWidth1 {}\n""".format(
                total_bytes, self.get_nodeattr("intfWidth")
            )
        ]

    def get_ap_int_max_w(self):
        """Return the maximum width of any ap_int used in this module.

        This is the least common multiple of the input and output widths.
        """
        instream = self.get_instream_width()
        outstream = self.get_outstream_width()
        return (instream * outstream) // math.gcd(instream, outstream)

    def docompute(self):
        """Set the $DOCOMPUTE$ entry: two width converters plus the DMA call.

        Raises ValueError if direction is neither "in" nor "out".
        """
        direction = self.get_nodeattr("direction")
        mode = self.get_nodeattr("burstMode")
        if direction == "in":
            if mode == "wrap":
                func = "Mem2Stream_Batch_external_wmem"
            else:
                func = "Mem2Stream_Batch"
            dwc_func = "WidthAdjustedOutputStream"
        elif direction == "out":
            func = "Stream2Mem_Batch"
            dwc_func = "WidthAdjustedInputStream"
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")
        # define templates for instantiation
        dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);"
        dwc_inst_template = dwc_func + "<%d, %d, %d> %s(%s, numReps);"
        # do stream infrastructure and instantiations
        intfw = self.get_nodeattr("intfWidth")
        strmw = self.get_nodeattr("streamWidth")
        width_lcm = (strmw * intfw) // math.gcd(strmw, intfw)
        # we always instantiate two width converters: one between the stream
        # width and width_lcm, and one between width_lcm and intfw
        dtype_bits = self.get_input_datatype().bitwidth()
        total_bits = dtype_bits * int(np.prod(self.get_normal_input_shape()))
        if direction == "in":
            self.code_gen_dict["$DOCOMPUTE$"] = [
                dwc_inst_template
                % (width_lcm, strmw, total_bits // width_lcm, "dwc_lcm", "out"),
                dwc_inst_template
                % (intfw, width_lcm, total_bits // intfw, "dwc_intfw", "dwc_lcm"),
                dma_inst_template % ("in0", "dwc_intfw"),
            ]
        else:
            self.code_gen_dict["$DOCOMPUTE$"] = [
                dwc_inst_template
                % (strmw, width_lcm, total_bits // strmw, "dwc_lcm", "in0"),
                dwc_inst_template
                % (width_lcm, intfw, total_bits // width_lcm, "dwc_intfw", "dwc_lcm"),
                dma_inst_template % ("dwc_intfw", "out"),
            ]

    def blackboxfunction(self):
        """Set the $BLACKBOXFUNCTION$ entry to the top-level HLS signature.

        The memory-mapped side is a pointer argument, the stream side an
        hls::stream reference. Raises ValueError on an invalid direction.
        """
        packed_ibits = self.get_instream_width()
        packed_hls_type_in = "ap_uint<%d>" % packed_ibits
        packed_obits = self.get_outstream_width()
        packed_hls_type_out = "ap_uint<%d>" % packed_obits
        direction = self.get_nodeattr("direction")
        if direction == "in":
            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)"
                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
            ]
        elif direction == "out":
            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)"
                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
            ]
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")

    def pragmas(self):
        """Set the $PRAGMAS$ entry with the HLS interface pragmas.

        The m_axi pragma uses intfName as port name when that attribute is
        non-empty, otherwise "in0" (direction "in") or "out" (direction
        "out"). Raises ValueError on an invalid direction.
        """
        pragmas = [
            "#pragma HLS INTERFACE s_axilite port=numReps bundle=control",
            "#pragma HLS INTERFACE s_axilite port=return bundle=control",
        ]
        # stored before the direction check, so the entry exists even if the
        # check below raises
        self.code_gen_dict["$PRAGMAS$"] = pragmas
        direction = self.get_nodeattr("direction")
        intfname = self.get_nodeattr("intfName")
        if direction == "in":
            mm_port = "in0" if intfname == "" else intfname
            pragmas.append(
                "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (mm_port)
            )
            pragmas.append("#pragma HLS INTERFACE s_axilite port=in0 bundle=control")
            pragmas.append("#pragma HLS INTERFACE axis port=out")
        elif direction == "out":
            mm_port = "out" if intfname == "" else intfname
            pragmas.append("#pragma HLS INTERFACE axis port=in0")
            pragmas.append(
                "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (mm_port)
            )
            pragmas.append("#pragma HLS INTERFACE s_axilite port=out bundle=control")
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")
        pragmas.append("#pragma HLS DATAFLOW")

    def execute_node(self, context, graph):
        """Does nothing for this op."""
        pass

    def dataoutstrm(self):
        """Does nothing for this op."""
        pass

    def read_npy_data(self):
        """Does nothing for this op."""
        pass

    def save_as_npy(self):
        """Does nothing for this op."""
        pass

    def strm_decl(self):
        """Does nothing for this op."""
        pass
from onnx import TensorProto
from onnx import helper as oh

from finn.util.basic import get_by_name
from finn.custom_op.registry import getCustomOp
from finn.transformation import Transformation
from finn.transformation.general import SortGraph
import finn.core.data_layout as DataLayout
import math
import numpy as np


class InsertIODMA(Transformation):
    """Insert DMA nodes on all inputs and outputs.

    An IODMA node is added after the graph input, before the graph output
    and in front of the weight input of every StreamingFCLayer_Batch node
    whose mem_mode is "external" and whose weight tensor has no producer.

    max_intfwidth -- upper bound (in bits) for the AXI-MM interface width of
                     the inserted nodes; must be a power of 2.
    """

    def __init__(self, max_intfwidth=32):
        super().__init__()
        # Exact integer test. The former 2 ** log2(x) == x comparison is done
        # in floating point and is not guaranteed to reject non-powers of 2.
        assert (
            max_intfwidth > 0 and (max_intfwidth & (max_intfwidth - 1)) == 0
        ), "max_intfwidth must be a power of 2"
        self.max_intfwidth = max_intfwidth

    @staticmethod
    def _is_fpgadataflow(node):
        """Return True if node has a backend attribute equal to "fpgadataflow"."""
        backend = get_by_name(node.attribute, "backend")
        return backend is not None and backend.s.decode("UTF-8") == "fpgadataflow"

    @staticmethod
    def _needs_weight_dma(model, node):
        """Return True for an external-weight FC layer with no weight producer."""
        if node.op_type != "StreamingFCLayer_Batch":
            return False
        mem_mode = get_by_name(node.attribute, "mem_mode")
        if mem_mode is None or mem_mode.s.decode("UTF-8") != "external":
            return False
        return model.find_producer(node.input[1]) is None

    def _feasible_intf_width(self, shape, dtype):
        """Return the widest interface width dividing the transfer size.

        The result is gcd(total tensor bits, max_intfwidth). Raises
        AssertionError if that width is not a multiple of 8 bits.
        """
        transfer_bits = int(np.prod(shape)) * dtype.bitwidth()
        intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
        assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
        return intfwidth

    @staticmethod
    def _make_buffer(model, shape, dtype):
        """Register a new value_info of the given shape/datatype and return it."""
        buf = oh.make_tensor_value_info(
            model.make_new_valueinfo_name(), TensorProto.FLOAT, shape
        )
        model.graph.value_info.append(buf)
        model.set_tensor_datatype(buf.name, dtype)
        return buf

    @staticmethod
    def _make_dma_node(
        in_name, out_name, shape, dtype, intfwidth, streamwidth, direction,
        burst_mode=None,
    ):
        """Build an IODMA node; burstMode is only set when burst_mode is given."""
        attrs = dict(
            numInputVectors=shape[:-1],
            NumChannels=shape[-1],
            dataType=str(dtype.name),
            intfWidth=intfwidth,
            streamWidth=streamwidth,
            direction=direction,
            domain="finn",
            backend="fpgadataflow",
        )
        if burst_mode is not None:
            attrs["burstMode"] = burst_mode
        return oh.make_node("IODMA", [in_name], [out_name], **attrs)

    def apply(self, model):
        """Insert the missing IODMA nodes into model.

        Returns (model, False) when the graph input consumer and the graph
        output producer are already IODMA nodes and no external-weight layer
        needs one; otherwise returns (transformed model, True).

        Raises AssertionError if a node is not an fpgadataflow node, if a
        tensor layout is unsupported, or if no byte-aligned interface width
        exists for a transfer.
        """
        # only makes sense for a pure fpgadataflow graph -- so we check!
        all_nodes = list(model.graph.node)
        assert all(
            self._is_fpgadataflow(x) for x in all_nodes
        ), "InsertIODMA requires a pure fpgadataflow graph"
        # parse streamingfclayers looking for external weights with no attached IODMA
        fc_extw_nodes = [x for x in all_nodes if self._needs_weight_dma(model, x)]
        graph_in_name = model.graph.input[0].name
        first_node = model.find_consumer(graph_in_name)
        graph_out_name = model.graph.output[0].name
        final_node = model.find_producer(graph_out_name)
        if (
            final_node.op_type == "IODMA"
            and first_node.op_type == "IODMA"
            and len(fc_extw_nodes) == 0
        ):
            # TODO maybe check the correctness of properties
            return (model, False)

        if final_node.op_type != "IODMA":
            # check the layout of the OUTPUT tensor (the NC alternative used
            # to be checked against the graph input by mistake)
            out_layout = model.get_tensor_layout(graph_out_name)
            assert (
                out_layout == DataLayout.NHWC or out_layout == DataLayout.NC
            ), "Data layout of tensors must be NHWC or NC"
            out_shape = model.get_tensor_shape(graph_out_name)
            out_dtype = model.get_tensor_datatype(graph_out_name)
            # determine the feasible interface width
            intfwidth = self._feasible_intf_width(out_shape, out_dtype)
            # get width of stream input to DMA
            streamWidth = getCustomOp(final_node).get_outstream_width()
            # make new buffer
            final_node_out = self._make_buffer(model, out_shape, out_dtype)
            # reroute final node output to the new buffer
            final_node.output[0] = final_node_out.name
            dma_node = self._make_dma_node(
                final_node_out.name,
                graph_out_name,
                out_shape,
                out_dtype,
                intfwidth,
                streamWidth,
                "out",
            )
            model.graph.node.append(dma_node)

        if first_node.op_type != "IODMA":
            in_layout = model.get_tensor_layout(graph_in_name)
            assert (
                in_layout == DataLayout.NHWC or in_layout == DataLayout.NC
            ), "Data layout of tensors must be NHWC or NC"
            in_shape = model.get_tensor_shape(graph_in_name)
            in_dtype = model.get_tensor_datatype(graph_in_name)
            # determine the feasible interface width
            intfwidth = self._feasible_intf_width(in_shape, in_dtype)
            # get width of stream output from DMA
            streamWidth = getCustomOp(first_node).get_instream_width()
            # make new buffer
            first_node_in = self._make_buffer(model, in_shape, in_dtype)
            # reroute first node input to the new buffer
            first_node.input[0] = first_node_in.name
            dma_node = self._make_dma_node(
                graph_in_name,
                first_node_in.name,
                in_shape,
                in_dtype,
                intfwidth,
                streamWidth,
                "in",
            )
            model.graph.node.insert(0, dma_node)

        for fc_node in fc_extw_nodes:
            fc_w_name = fc_node.input[1]
            # NOTE(review): the NC alternative is evaluated on the graph
            # input, not on the weight tensor. This looks like a copy-paste
            # slip, but it is kept as-is because weight tensors may carry no
            # layout annotation -- confirm before tightening.
            assert (
                model.get_tensor_layout(fc_w_name) == DataLayout.NHWC
                or model.get_tensor_layout(graph_in_name) == DataLayout.NC
            ), "Data layout of tensors must be NHWC or NC"
            w_shape = model.get_tensor_shape(fc_w_name)
            w_dtype = model.get_tensor_datatype(fc_w_name)
            # determine the feasible interface width
            intfwidth = self._feasible_intf_width(w_shape, w_dtype)
            # calculate width of stream output from DMA
            pe = get_by_name(fc_node.attribute, "PE").i
            simd = get_by_name(fc_node.attribute, "SIMD").i
            streamWidth = simd * pe * w_dtype.bitwidth()
            # make new buffer
            fc_node_in = self._make_buffer(model, w_shape, w_dtype)
            dma_node = self._make_dma_node(
                fc_w_name,
                fc_node_in.name,
                w_shape,
                w_dtype,
                intfwidth,
                streamWidth,
                "in",
                burst_mode="wrap",
            )
            fc_node.input[1] = fc_node_in.name
            model.graph.node.insert(0, dma_node)

        # nodes were appended/prepended out of order; restore topological order
        model = model.transform(SortGraph())
        return (model, True)