Skip to content
Snippets Groups Projects
Unverified Commit ccbc3d2f authored by Yaman Umuroglu's avatar Yaman Umuroglu Committed by GitHub
Browse files

Merge pull request #146 from quetric/feature_iodma

Added custom hls op for DMA node, and transform to insert it
parents 01fdd9c5 715da32c
No related branches found
No related tags found
No related merge requests found
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
import math
from onnx import TensorProto, helper
from finn.core.datatype import DataType
from finn.custom_op.fpgadataflow import HLSCustomOp
# the IODMA interfaces between a memory-mapped AXI interface and an AXI stream
# direction "in": pulls data from AXI-MM to AXI stream
# direction "out": pushes data from AXI stream to AXI-MM
# DMA Addressing
# - burst mode can be "wrap" or "increment"
# - "increment" bursts will increment the address when moving to the next image
# - "wrap" bursts will reinitialize the address to the start address,
# and are useful for e.g. streaming weights, where the same buffer is
# repeatedly read into the FPGA
# - no additional alignment restrictions beyond anything specified in the AXI spec
# Interfaces
# - AXI-MM name specified by intfName unless this is set to "" (empty, the default)
# in which case output AXI-MM are named "out" and input AXI-MM are named "in0"
# - AXI-MM interface width (in bits) is specified by intfWidth
# - AXI-Stream interface width (in bits) is specified by streamWidth
# - If intfWidth and streamWidth are not equal, the DMA core performs
# width conversion by going up to the least common multiple of bitwidths
# e.g. intfWidth=32b -> 96b -> streamWidth=24b
# - transfers occur in multiples of the AXI-MM interface width, therefore
# the total number of bits in the tensor must be a multiple of intfWidth
# - transfers occur in multiples of the AXI-Stream interface width, therefore
# the total number of bits in the tensor must be a multiple of streamWidth
# - both interface widths must be a multiple of 8b (AXI protocol requirement)
# - in most systems, intfWidth is also restricted to a power of 2 (e.g. Vitis)
# but this is not universal so we don't check here explicitly
# Input/output tensor shapes
# - The data being moved is a tensor of shape numInputVectors+[NumChannels]
# - The data type of the tensor elements is specified by dataType
# - on the stream side
# -the normal shape is the same as the ONNX tensor attached to it
# -the folded shape is computed from the stream width and normal shape
# - on the AXI-MM side
# -the normal shape is the same as the one on the stream side
# -the folded shape is not defined
class IODMA(HLSCustomOp):
    """Class that corresponds to finn-hlslib DMA function(s).

    Moves a tensor between a memory-mapped AXI (AXI-MM) interface and an
    AXI stream. direction="in" pulls data from AXI-MM into the stream;
    direction="out" pushes data from the stream to AXI-MM. If intfWidth
    and streamWidth differ, width converters going through the least
    common multiple of the two widths are instantiated around the DMA.
    See the module-level comments above for the full interface contract.
    """

    def __init__(self, onnx_node):
        super().__init__(onnx_node)

    def get_nodeattr_types(self):
        """Return the dict of node attributes (name -> (type, required, default))."""
        my_attrs = {
            "NumChannels": ("i", True, 0),
            # FINN input datatype
            "dataType": ("s", True, ""),
            # Stream parameters
            "streamWidth": ("i", False, 32),
            # DMA-specific parameters
            "intfWidth": ("i", False, 32),
            "burstMode": ("s", False, "increment"),
            "direction": ("s", False, "in"),
            # shape describing input vecs per execution
            "numInputVectors": ("ints", False, [1]),
            # name of axi-mm interface
            "intfName": ("s", False, ""),
        }
        my_attrs.update(super().get_nodeattr_types())
        return my_attrs

    def get_normal_input_shape(self):
        """Return the unfolded tensor shape: numInputVectors + [NumChannels]."""
        vecs = list(self.get_nodeattr("numInputVectors"))
        num_ch = self.get_nodeattr("NumChannels")
        ishape = tuple(vecs + [num_ch])
        return ishape

    def get_normal_output_shape(self):
        """Return the unfolded output shape (identical to the input shape)."""
        return self.get_normal_input_shape()

    def get_folded_input_shape(self):
        """Return the folded shape of the stream-side input.

        Only defined for direction=="out" (where the input is the AXI
        stream); for direction=="in" the input is the AXI-MM side, whose
        folded shape is not defined, so a ValueError is raised.
        """
        if self.get_nodeattr("direction") == "in":
            raise ValueError("Folded input shape not defined for input IODMA")
        else:
            shape = list(self.get_normal_input_shape())
            itype_bits = self.get_input_datatype().bitwidth()
            intfw = self.get_nodeattr("streamWidth")
            assert (
                intfw % itype_bits == 0
            ), "Input stream width must be a multiple of datatype bits"
            # fold the innermost (channel) dim by elements-per-stream-word
            elems_per_word = intfw // itype_bits
            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
            fold_depth = shape[-1] // elems_per_word
            shape[-1] = fold_depth
            shape.append(elems_per_word)
            return tuple(shape)

    def get_folded_output_shape(self):
        """Return the folded shape of the stream-side output.

        Only defined for direction=="in" (where the output is the AXI
        stream); for direction=="out" the output is the AXI-MM side,
        whose folded shape is not defined, so a ValueError is raised.
        """
        if self.get_nodeattr("direction") == "out":
            raise ValueError("Folded output shape not defined for output IODMA")
        else:
            shape = list(self.get_normal_output_shape())
            itype_bits = self.get_output_datatype().bitwidth()
            intfw = self.get_nodeattr("streamWidth")
            # NOTE(review): assert message says "Input" but this guards the
            # output stream width -- message looks copy-pasted; confirm.
            assert (
                intfw % itype_bits == 0
            ), "Input stream width must be a multiple of datatype bits"
            elems_per_word = intfw // itype_bits
            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
            fold_depth = shape[-1] // elems_per_word
            shape[-1] = fold_depth
            shape.append(elems_per_word)
            return tuple(shape)

    def make_shape_compatible_op(self, model):
        """Return a shape-compatible ONNX Constant node standing in for this op."""
        exp_ishape = self.get_normal_input_shape()
        oshape = self.get_normal_output_shape()
        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
        assert ishape == exp_ishape, "Unexpected input shape."
        # implement tensor with correct shape
        values = np.random.randn(*oshape).astype(np.float32)
        return helper.make_node(
            "Constant",
            inputs=[],
            outputs=[self.onnx_node.output[0]],
            value=helper.make_tensor(
                name="const_tensor",
                data_type=TensorProto.FLOAT,
                dims=values.shape,
                vals=values.flatten().astype(float),
            ),
        )

    def infer_node_datatype(self, model):
        """Propagate the (unchanged) FINN datatype from input to output."""
        node = self.onnx_node
        # data type stays the same
        dtype = model.get_tensor_datatype(node.input[0])
        exp_idtype = self.get_input_datatype()
        assert dtype == exp_idtype, "Unexpected datatype."
        model.set_tensor_datatype(node.output[0], dtype)

    def verify_node(self):
        # no extra verification implemented for IODMA
        pass

    def get_input_datatype(self):
        """Returns FINN DataType of input."""
        return DataType[self.get_nodeattr("dataType")]

    def get_output_datatype(self):
        """Returns FINN DataType of output. (Same as input datatype)"""
        return self.get_input_datatype()

    def get_instream_width(self):
        """Return the input width in bits: intfWidth for direction "in"
        (input arrives over AXI-MM), streamWidth for direction "out"."""
        if self.get_nodeattr("direction") == "in":
            return self.get_nodeattr("intfWidth")
        elif self.get_nodeattr("direction") == "out":
            return self.get_nodeattr("streamWidth")
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")

    def get_outstream_width(self):
        """Return the output width in bits: intfWidth for direction "out"
        (output leaves over AXI-MM), streamWidth for direction "in"."""
        if self.get_nodeattr("direction") == "out":
            return self.get_nodeattr("intfWidth")
        elif self.get_nodeattr("direction") == "in":
            return self.get_nodeattr("streamWidth")
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")

    def get_number_output_values(self):
        """Return the number of AXI-MM words (intfWidth bits each) in one transfer."""
        oshape = self.get_normal_output_shape()
        itype_bits = self.get_input_datatype().bitwidth()
        intfw = self.get_nodeattr("intfWidth")
        nelems = np.prod(oshape)
        nbits = nelems * itype_bits
        assert nbits % intfw == 0, "DMA: total transfer size must be word multiple"
        ovalues = nbits // intfw
        return ovalues

    def global_includes(self):
        """Emit the finn-hlslib headers needed by the generated C++."""
        self.code_gen_dict["$GLOBALS$"] = ['#include "dma.h"']
        self.code_gen_dict["$GLOBALS$"].append('#include "streamtools.h"')

    def defines(self, var):
        """Emit #defines for the total byte count and AXI-MM data width."""
        itype_bits = self.get_input_datatype().bitwidth()
        total_bits = itype_bits * np.prod(self.get_normal_input_shape())
        assert total_bits % 8 == 0, "DMA input not a multiple of 1 Byte"
        total_bytes = total_bits // 8
        self.code_gen_dict["$DEFINES$"] = [
            """#define NumBytes1 {}\n#define DataWidth1 {}\n""".format(
                total_bytes, self.get_nodeattr("intfWidth")
            )
        ]

    def get_ap_int_max_w(self):
        "Return the maximum width of any ap_int used in this module."
        instream = self.get_instream_width()
        outstream = self.get_outstream_width()
        # widest internal stream is the LCM of the two interface widths
        width_lcm = (instream * outstream) // math.gcd(instream, outstream)
        return width_lcm

    def docompute(self):
        """Emit the HLS body: a DMA call plus two width converters that
        go AXI-MM width <-> LCM width <-> stream width."""
        direction = self.get_nodeattr("direction")
        mode = self.get_nodeattr("burstMode")
        if direction == "in":
            if mode == "wrap":
                func = "Mem2Stream_Batch_external_wmem"
            else:
                func = "Mem2Stream_Batch"
            dwc_func = "WidthAdjustedOutputStream"
        elif direction == "out":
            func = "Stream2Mem_Batch"
            dwc_func = "WidthAdjustedInputStream"
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")
        # define templates for instantiation
        dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);"
        dwc_inst_template = dwc_func + "<%d, %d, %d> %s(%s, numReps);"
        # do stream infrastructure and instantiations
        intfw = self.get_nodeattr("intfWidth")
        strmw = self.get_nodeattr("streamWidth")
        width_lcm = (strmw * intfw) // math.gcd(strmw, intfw)
        # we always need two streams: one of width_lcm, and one of intfw width
        # because we use WidthAdjustedInputStream,
        dtype_bits = self.get_input_datatype().bitwidth()
        total_bits = dtype_bits * np.prod(self.get_normal_input_shape())
        if direction == "in":
            # in0 (AXI-MM) -> DMA -> dwc_intfw -> dwc_lcm -> out (stream)
            self.code_gen_dict["$DOCOMPUTE$"] = [
                dwc_inst_template
                % (width_lcm, strmw, total_bits // width_lcm, "dwc_lcm", "out"),
                dwc_inst_template
                % (intfw, width_lcm, total_bits // intfw, "dwc_intfw", "dwc_lcm"),
                dma_inst_template % ("in0", "dwc_intfw"),
            ]
        else:
            # in0 (stream) -> dwc_lcm -> dwc_intfw -> DMA -> out (AXI-MM)
            self.code_gen_dict["$DOCOMPUTE$"] = [
                dwc_inst_template
                % (strmw, width_lcm, total_bits // strmw, "dwc_lcm", "in0"),
                dwc_inst_template
                % (width_lcm, intfw, total_bits // width_lcm, "dwc_intfw", "dwc_lcm"),
                dma_inst_template % ("dwc_intfw", "out"),
            ]

    def blackboxfunction(self):
        """Emit the top-level C++ function signature: pointer for the
        AXI-MM side, hls::stream for the stream side."""
        packed_ibits = self.get_instream_width()
        packed_hls_type_in = "ap_uint<%d>" % packed_ibits
        packed_obits = self.get_outstream_width()
        packed_hls_type_out = "ap_uint<%d>" % packed_obits
        direction = self.get_nodeattr("direction")
        if direction == "in":
            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)"
                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
            ]
        elif direction == "out":
            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)"
                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
            ]
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")

    def pragmas(self):
        """Emit HLS interface pragmas: m_axi + s_axilite for the AXI-MM
        port (named per intfName if set), axis for the stream port, and
        DATAFLOW for the whole function."""
        self.code_gen_dict["$PRAGMAS$"] = [
            "#pragma HLS INTERFACE s_axilite port=numReps bundle=control"
        ]
        self.code_gen_dict["$PRAGMAS$"].append(
            "#pragma HLS INTERFACE s_axilite port=return bundle=control"
        )
        direction = self.get_nodeattr("direction")
        intfname = self.get_nodeattr("intfName")
        if direction == "in":
            if intfname == "":
                self.code_gen_dict["$PRAGMAS$"].append(
                    "#pragma HLS INTERFACE m_axi offset=slave port=in0"
                )
            else:
                self.code_gen_dict["$PRAGMAS$"].append(
                    "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname)
                )
            self.code_gen_dict["$PRAGMAS$"].append(
                "#pragma HLS INTERFACE s_axilite port=in0 bundle=control"
            )
            self.code_gen_dict["$PRAGMAS$"].append(
                "#pragma HLS INTERFACE axis port=out"
            )
        elif direction == "out":
            self.code_gen_dict["$PRAGMAS$"].append(
                "#pragma HLS INTERFACE axis port=in0"
            )
            if intfname == "":
                self.code_gen_dict["$PRAGMAS$"].append(
                    "#pragma HLS INTERFACE m_axi offset=slave port=out"
                )
            else:
                self.code_gen_dict["$PRAGMAS$"].append(
                    "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname)
                )
            self.code_gen_dict["$PRAGMAS$"].append(
                "#pragma HLS INTERFACE s_axilite port=out bundle=control"
            )
        else:
            raise ValueError("Invalid IODMA direction, please set to in or out")
        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW")

    def execute_node(self, context, graph):
        # simulation of DMA transfers is not implemented
        pass

    def dataoutstrm(self):
        # not needed: no npy output handling for this op
        pass

    def read_npy_data(self):
        # not needed: no npy input handling for this op
        pass

    def save_as_npy(self):
        # not needed: no npy output handling for this op
        pass

    def strm_decl(self):
        # not needed: streams are declared inline by docompute()
        pass
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from onnx import TensorProto
from onnx import helper as oh
from finn.util.basic import get_by_name
from finn.custom_op.registry import getCustomOp
from finn.transformation import Transformation
from finn.transformation.general import SortGraph
import finn.core.data_layout as DataLayout
import math
import numpy as np
class InsertIODMA(Transformation):
    """Insert DMA nodes on all inputs and outputs.

    Adds an IODMA node in front of the first node (graph input), after
    the final node (graph output), and before the weight input of every
    StreamingFCLayer_Batch with mem_mode="external" that does not
    already have a producer. Is a no-op (returns graph_modified=False)
    when all three are already served by IODMA nodes.

    Parameters
    ----------
    max_intfwidth : int
        Upper bound in bits for the AXI-MM interface width of inserted
        DMAs; must be a power of 2. The width actually used is
        gcd(total transfer bits, max_intfwidth).
    """

    def __init__(self, max_intfwidth=32):
        super().__init__()
        assert (
            2 ** math.log2(max_intfwidth) == max_intfwidth
        ), "max_intfwidth must be a power of 2"
        self.max_intfwidth = max_intfwidth

    def apply(self, model):
        """Insert DMA nodes where needed; return (model, graph_modified)."""
        # only makes sense for a pure fpgadataflow graph -- so we check!
        all_nodes = list(model.graph.node)
        assert all(
            get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
            for x in all_nodes
        )
        # parse streamingfclayers looking for external weights with no attached IODMA
        fc_extw_nodes = list(
            filter(
                lambda x: x.op_type == "StreamingFCLayer_Batch"
                and get_by_name(x.attribute, "mem_mode") is not None
                and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") == "external"
                and model.find_producer(x.input[1]) is None,
                all_nodes,
            )
        )
        graph_in_name = model.graph.input[0].name
        first_node = model.find_consumer(graph_in_name)
        graph_out_name = model.graph.output[0].name
        final_node = model.find_producer(graph_out_name)
        if (
            final_node.op_type == "IODMA"
            and first_node.op_type == "IODMA"
            and len(fc_extw_nodes) == 0
        ):
            # TODO maybe check the correctness of properties
            return (model, False)
        else:
            if final_node.op_type != "IODMA":
                # check if tensor is NHWC
                # fix: second condition previously tested graph_in_name,
                # which let a malformed output layout slip through
                assert (
                    model.get_tensor_layout(graph_out_name) == DataLayout.NHWC
                    or model.get_tensor_layout(graph_out_name) == DataLayout.NC
                ), "Data layout of tensors must be NHWC or NC"
                out_shape = model.get_tensor_shape(graph_out_name)
                out_dtype = model.get_tensor_datatype(graph_out_name)
                # determine the feasible interface width
                transfer_bits = np.prod(out_shape) * out_dtype.bitwidth()
                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
                assert (
                    intfwidth % 8 == 0
                ), "No feasible interface width for transfer size"
                # get width of stream input to DMA
                streamWidth = getCustomOp(final_node).get_outstream_width()
                # make new buffer
                final_node_out = oh.make_tensor_value_info(
                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
                )
                model.graph.value_info.append(final_node_out)
                model.set_tensor_datatype(final_node_out.name, out_dtype)
                # reroute final node output to final_node_out_name
                final_node.output[0] = final_node_out.name
                dma_node = oh.make_node(
                    "IODMA",
                    [final_node_out.name],
                    [graph_out_name],
                    numInputVectors=out_shape[:-1],
                    NumChannels=out_shape[-1],
                    dataType=str(out_dtype.name),
                    intfWidth=intfwidth,
                    streamWidth=streamWidth,
                    direction="out",
                    domain="finn",
                    backend="fpgadataflow",
                )
                model.graph.node.append(dma_node)
            if first_node.op_type != "IODMA":
                # check if tensor is NHWC
                assert (
                    model.get_tensor_layout(graph_in_name) == DataLayout.NHWC
                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
                ), "Data layout of tensors must be NHWC or NC"
                in_shape = model.get_tensor_shape(graph_in_name)
                in_dtype = model.get_tensor_datatype(graph_in_name)
                # determine the feasible interface width
                transfer_bits = np.prod(in_shape) * in_dtype.bitwidth()
                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
                assert (
                    intfwidth % 8 == 0
                ), "No feasible interface width for transfer size"
                # get width of stream output from DMA
                streamWidth = getCustomOp(first_node).get_instream_width()
                # make new buffer
                first_node_in = oh.make_tensor_value_info(
                    model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
                )
                model.graph.value_info.append(first_node_in)
                model.set_tensor_datatype(first_node_in.name, in_dtype)
                # reroute first node input to first_node_in_name
                first_node.input[0] = first_node_in.name
                dma_node = oh.make_node(
                    "IODMA",
                    [graph_in_name],
                    [first_node_in.name],
                    numInputVectors=in_shape[:-1],
                    NumChannels=in_shape[-1],
                    dataType=str(in_dtype.name),
                    intfWidth=intfwidth,
                    streamWidth=streamWidth,
                    direction="in",
                    domain="finn",
                    backend="fpgadataflow",
                )
                model.graph.node.insert(0, dma_node)
            for fc_node in fc_extw_nodes:
                fc_w_name = fc_node.input[1]
                # check if tensor is NHWC
                # fix: second condition previously tested graph_in_name
                # instead of the weight tensor itself
                assert (
                    model.get_tensor_layout(fc_w_name) == DataLayout.NHWC
                    or model.get_tensor_layout(fc_w_name) == DataLayout.NC
                ), "Data layout of tensors must be NHWC or NC"
                w_shape = model.get_tensor_shape(fc_w_name)
                w_dtype = model.get_tensor_datatype(fc_w_name)
                # determine the feasible interface width
                transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
                assert (
                    intfwidth % 8 == 0
                ), "No feasible interface width for transfer size"
                # calculate width of stream output from DMA:
                # one (PE x SIMD) weight tile per stream word
                pe = get_by_name(fc_node.attribute, "PE").i
                simd = get_by_name(fc_node.attribute, "SIMD").i
                streamWidth = simd * pe * w_dtype.bitwidth()
                # make new buffer
                fc_node_in = oh.make_tensor_value_info(
                    model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
                )
                model.graph.value_info.append(fc_node_in)
                model.set_tensor_datatype(fc_node_in.name, w_dtype)
                # weight DMAs use wrap bursts so the same buffer can be
                # re-read on every execution
                dma_node = oh.make_node(
                    "IODMA",
                    [fc_w_name],
                    [fc_node_in.name],
                    numInputVectors=w_shape[:-1],
                    NumChannels=w_shape[-1],
                    dataType=str(w_dtype.name),
                    intfWidth=intfwidth,
                    streamWidth=streamWidth,
                    direction="in",
                    burstMode="wrap",
                    domain="finn",
                    backend="fpgadataflow",
                )
                fc_node.input[1] = fc_node_in.name
                model.graph.node.insert(0, dma_node)
            # re-sort: inserted DMA nodes were prepended/appended out of order
            model = model.transform(SortGraph())
            return (model, True)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment