Commit eaf52087 authored by Lucian Petrica

Added documentation of features, reworked shapes, and added width conversion

parent 9e61c73e
# Copyright (c) 2020, Xilinx
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of FINN nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
import math
from onnx import TensorProto, helper

from finn.core.datatype import DataType
from finn.custom_op.fpgadataflow import HLSCustomOp

# the IODMA interfaces a memory-mapped AXI interface with an AXI stream
# direction "in": pulls data from AXI-MM to AXI stream
# direction "out": pushes data from AXI stream to AXI-MM
# DMA Addressing
# - burst mode can be "wrap" or "increment"
# - "increment" bursts will increment the address when moving to the next image
# - "wrap" bursts will reinitialize the address to the start address,
# and are useful for e.g. streaming weights, where the same buffer is
# repeatedly read into the FPGA
# - no additional alignment restrictions beyond anything specified in the AXI spec
# Interfaces
# - AXI-MM interface width (in bits) is specified by intfWidth
# - AXI-Stream interface width (in bits) is specified by streamWidth
# - If intfWidth and streamWidth are not equal, the DMA core performs
#   width conversion by going up to the least common multiple of bitwidths
#   e.g. intfWidth=32b -> 96b -> streamWidth=24b
# - transfers occur in multiples of the AXI-MM interface width, therefore
# the total number of bits in the tensor must be a multiple of intfWidth
# - transfers occur in multiples of the AXI-Stream interface width, therefore
# the total number of bits in the tensor must be a multiple of streamWidth
# - both interface widths must be a multiple of 8b (AXI protocol requirement)
# - in most systems, intfWidth is also restricted to a power of 2 (e.g. Vitis)
# but this is not universal so we don't check here explicitly
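# - width conversion example (illustrative values, not fixed by this code):
#   intfWidth=32b and streamWidth=24b convert via lcm(32, 24) = 96b; three
#   32b words are packed into one 96b word, which is then split into four
#   24b stream words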
# Input/output tensor shapes
# - The data being moved is a tensor of shape numInputVectors+[NumChannels]
# - The data type of the tensor elements is specified by dataType
# - on the stream side
#   - the normal shape is the same as the ONNX tensor attached to it
#   - the folded shape is computed from the stream width and normal shape
# - on the AXI-MM side
#   - the normal shape is the same as the one on the stream side
#   - the folded shape is not defined
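# - folded shape example (illustrative values): a tensor of normal shape
#   (1, 64) with 4-bit elements and streamWidth=32 packs 32/4 = 8 elements
#   per stream word, giving a folded shape of (1, 8, 8)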
class IODMA(HLSCustomOp):
    """Class that corresponds to finn-hlslib DMA function(s)."""
@@ -15,6 +81,8 @@ class IODMA(HLSCustomOp):
"NumChannels": ("i", True, 0), "NumChannels": ("i", True, 0),
# FINN input datatype # FINN input datatype
"dataType": ("s", True, ""), "dataType": ("s", True, ""),
# Stream parameters
"streamWidth": ("i", False, 32),
# DMA-specific parameters # DMA-specific parameters
"intfWidth": ("i", False, 32), "intfWidth": ("i", False, 32),
"burstMode": ("s", False, "increment"), "burstMode": ("s", False, "increment"),
@@ -35,17 +103,38 @@ class IODMA(HLSCustomOp):
        return self.get_normal_input_shape()

    def get_folded_input_shape(self):
        if self.get_nodeattr("direction") == "in":
            raise ValueError("Folded input shape not defined for input IODMA")
        else:
            shape = list(self.get_normal_input_shape())
            itype_bits = self.get_input_datatype().bitwidth()
            intfw = self.get_nodeattr("streamWidth")
            assert (
                intfw % itype_bits == 0
            ), "Input stream width must be a multiple of datatype bits"
            elems_per_word = intfw // itype_bits
            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
            fold_depth = shape[-1] // elems_per_word
            shape[-1] = fold_depth
            shape.append(elems_per_word)
            return tuple(shape)

    def get_folded_output_shape(self):
        if self.get_nodeattr("direction") == "out":
            raise ValueError("Folded output shape not defined for output IODMA")
        else:
            shape = list(self.get_normal_output_shape())
            itype_bits = self.get_output_datatype().bitwidth()
            intfw = self.get_nodeattr("streamWidth")
            assert (
                intfw % itype_bits == 0
            ), "Output stream width must be a multiple of datatype bits"
            elems_per_word = intfw // itype_bits
            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
            fold_depth = shape[-1] // elems_per_word
            shape[-1] = fold_depth
            shape.append(elems_per_word)
            return tuple(shape)

    def make_shape_compatible_op(self, model):
        exp_ishape = self.get_normal_input_shape()
@@ -86,10 +175,16 @@ class IODMA(HLSCustomOp):
        return self.get_input_datatype()

    def get_instream_width(self):
        if self.get_nodeattr("direction") == "in":
            return self.get_nodeattr("intfWidth")
        else:
            return self.get_nodeattr("streamWidth")

    def get_outstream_width(self):
        if self.get_nodeattr("direction") == "out":
            return self.get_nodeattr("intfWidth")
        else:
            return self.get_nodeattr("streamWidth")

    def get_number_output_values(self):
        oshape = self.get_normal_output_shape()
@@ -103,6 +198,7 @@ class IODMA(HLSCustomOp):
    def global_includes(self):
        self.code_gen_dict["$GLOBALS$"] = ['#include "dma.h"']
        self.code_gen_dict["$GLOBALS$"].append('#include "streamtools.h"')

    def defines(self, var):
        itype_bits = self.get_input_datatype().bitwidth()
@@ -115,6 +211,13 @@ class IODMA(HLSCustomOp):
            )
        ]

    def get_ap_int_max_w(self):
        "Return the maximum width of any ap_int used in this module."
        instream = self.get_instream_width()
        outstream = self.get_outstream_width()
        width_lcm = (instream * outstream) // math.gcd(instream, outstream)
        return width_lcm

    def docompute(self):
        direction = self.get_nodeattr("direction")
        mode = self.get_nodeattr("burstMode")
@@ -123,25 +226,53 @@ class IODMA(HLSCustomOp):
func = "Mem2Stream_Batch_external_wmem" func = "Mem2Stream_Batch_external_wmem"
else: else:
func = "Mem2Stream_Batch" func = "Mem2Stream_Batch"
dwc_func = "WidthAdjustedOutputStream"
else: else:
func = "Stream2Mem_Batch" func = "Stream2Mem_Batch"
self.code_gen_dict["$DOCOMPUTE$"] = [ dwc_func = "WidthAdjustedInputStream"
"""{}<DataWidth1, NumBytes1>(in0, out, numReps);""".format(func,) # define templates for instantiation
] dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);"
dwc_inst_template = dwc_func + "<%d, %d, %d> %s(%s, numReps);"
# do stream infrastructure and instantiations
intfw = self.get_nodeattr("intfWidth")
strmw = self.get_nodeattr("streamWidth")
width_lcm = (strmw * intfw) // math.gcd(strmw, intfw)
# we always need two streams: one of width_lcm, and one of intfw width
# because we use WidthAdjustedInputStream,
dtype_bits = self.get_input_datatype().bitwidth()
total_bits = dtype_bits * np.prod(self.get_normal_input_shape())
if direction == "in":
self.code_gen_dict["$DOCOMPUTE$"] = [
dwc_inst_template
% (width_lcm, strmw, total_bits // width_lcm, "dwc_lcm", "out"),
dwc_inst_template
% (intfw, width_lcm, total_bits // intfw, "dwc_intfw", "dwc_lcm"),
dma_inst_template % ("in0", "dwc_intfw"),
]
else:
self.code_gen_dict["$DOCOMPUTE$"] = [
dwc_inst_template
% (strmw, width_lcm, total_bits // strmw, "dwc_lcm", "in0"),
dwc_inst_template
% (width_lcm, intfw, total_bits // width_lcm, "dwc_intfw", "dwc_lcm"),
dma_inst_template % ("dwc_intfw", "out"),
]
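        # illustrative example (assumed values: direction="in", burstMode
        # "increment", intfWidth=32, streamWidth=24, total_bits=1536), the
        # generated HLS body would read roughly:
        #   WidthAdjustedOutputStream<96, 24, 16> dwc_lcm(out, numReps);
        #   WidthAdjustedOutputStream<32, 96, 48> dwc_intfw(dwc_lcm, numReps);
        #   Mem2Stream_Batch<DataWidth1, NumBytes1>(in0, dwc_intfw, numReps);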

    def blackboxfunction(self):
        packed_ibits = self.get_instream_width()
        packed_hls_type_in = "ap_uint<%d>" % packed_ibits
        packed_obits = self.get_outstream_width()
        packed_hls_type_out = "ap_uint<%d>" % packed_obits
        direction = self.get_nodeattr("direction")
        if direction == "in":
            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)"
                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
            ]
        else:
            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)"
                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
            ]
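        # illustrative example: for direction="in" with intfWidth=32 and
        # streamWidth=24 (assumed values), this generates
        #   void <node_name>(ap_uint<32> *in0, hls::stream<ap_uint<24> > &out,
        #                    unsigned int numReps)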

    def pragmas(self):
@@ -172,6 +303,7 @@ class IODMA(HLSCustomOp):
self.code_gen_dict["$PRAGMAS$"].append( self.code_gen_dict["$PRAGMAS$"].append(
"#pragma HLS INTERFACE s_axilite port=out bundle=control" "#pragma HLS INTERFACE s_axilite port=out bundle=control"
) )
self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW")

    def execute_node(self, context, graph):
        pass
...
@@ -29,7 +29,10 @@
from onnx import TensorProto
from onnx import helper as oh

from finn.util.basic import get_by_name
from finn.custom_op.registry import getCustomOp
from finn.transformation import Transformation
from finn.transformation.general import SortGraph

import math
import numpy as np
@@ -45,12 +48,30 @@ class InsertIODMA(Transformation):
        self.max_intfwidth = max_intfwidth

    def apply(self, model):
        # only makes sense for a pure fpgadataflow graph -- so we check!
        all_nodes = list(model.graph.node)
        assert all(
            get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
            for x in all_nodes
        )
        # parse streamingfclayers looking for external weights with no
        # attached IODMA
        fc_extw_nodes = list(
            filter(
                lambda x: x.op_type == "StreamingFCLayer_Batch"
                and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8")
                == "external"
                and model.find_producer(x.input[1]) is None,
                all_nodes,
            )
        )
        graph_in_name = model.graph.input[0].name
        first_node = model.find_consumer(graph_in_name)
        graph_out_name = model.graph.output[0].name
        final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA" and first_node.op_type == "IODMA": if (
final_node.op_type == "IODMA"
and first_node.op_type == "IODMA"
and len(fc_extw_nodes) == 0
):
# TODO maybe check the correctness of properties # TODO maybe check the correctness of properties
return (model, False) return (model, False)
else: else:
@@ -63,6 +84,8 @@ class InsertIODMA(Transformation):
            assert (
                intfwidth % 8 == 0
            ), "No feasible interface width for transfer size"
            # get width of stream input to DMA
            streamWidth = getCustomOp(final_node).get_outstream_width()
            # make new buffer
            final_node_out = oh.make_tensor_value_info(
                model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
@@ -79,6 +102,7 @@ class InsertIODMA(Transformation):
                NumChannels=out_shape[-1],
                dataType=str(out_dtype.name),
                intfWidth=intfwidth,
                streamWidth=streamWidth,
                direction="out",
                domain="finn",
                backend="fpgadataflow",
@@ -93,6 +117,8 @@ class InsertIODMA(Transformation):
            assert (
                intfwidth % 8 == 0
            ), "No feasible interface width for transfer size"
            # get width of stream output from DMA
            streamWidth = getCustomOp(first_node).get_instream_width()
            # make new buffer
            first_node_in = oh.make_tensor_value_info(
                model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
@@ -109,10 +135,47 @@ class InsertIODMA(Transformation):
                NumChannels=in_shape[-1],
                dataType=str(in_dtype.name),
                intfWidth=intfwidth,
                streamWidth=streamWidth,
                direction="in",
                domain="finn",
                backend="fpgadataflow",
            )
            model.graph.node.insert(0, dma_node)
            for fc_node in fc_extw_nodes:
                fc_w_name = fc_node.input[1]
                w_shape = model.get_tensor_shape(fc_w_name)
                w_dtype = model.get_tensor_datatype(fc_w_name)
                # determine the feasible interface width
                transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
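                # e.g. (illustrative) 16384 transfer bits with max_intfwidth=64
                # gives intfwidth = gcd(16384, 64) = 64, which is byte-aligned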
                assert (
                    intfwidth % 8 == 0
                ), "No feasible interface width for transfer size"
                # calculate width of stream output from DMA
                pe = get_by_name(fc_node.attribute, "PE").i
                simd = get_by_name(fc_node.attribute, "SIMD").i
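                # e.g. (illustrative) SIMD=8, PE=4, 2-bit weights
                # -> streamWidth = 8 * 4 * 2 = 64 bits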
                streamWidth = simd * pe * w_dtype.bitwidth()
                # make new buffer
                fc_node_in = oh.make_tensor_value_info(
                    model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
                )
                model.graph.value_info.append(fc_node_in)
                model.set_tensor_datatype(fc_node_in.name, w_dtype)
                dma_node = oh.make_node(
                    "IODMA",
                    [fc_w_name],
                    [fc_node_in.name],
                    numInputVectors=w_shape[:-1],
                    NumChannels=w_shape[-1],
                    dataType=str(w_dtype.name),
                    intfWidth=intfwidth,
                    streamWidth=streamWidth,
                    direction="in",
                    burstMode="wrap",
                    domain="finn",
                    backend="fpgadataflow",
                )
                fc_node.input[1] = fc_node_in.name
                model.graph.node.insert(0, dma_node)
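            # DMA nodes were inserted at graph position 0, so restore
            # topological node order before returning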
            model = model.transform(SortGraph())
            return (model, True)