Skip to content
Snippets Groups Projects
Commit b32ad623 authored by Yaman Umuroglu's avatar Yaman Umuroglu
Browse files

[IODMA] make IODMA insertion more configurable

parent 168a58a6
No related branches found
No related tags found
No related merge requests found
...@@ -37,10 +37,20 @@ from qonnx.util.basic import get_by_name ...@@ -37,10 +37,20 @@ from qonnx.util.basic import get_by_name
class InsertIODMA(Transformation):
    """Insert DMA nodes on inputs and outputs, or as specified by filters in
    the constructor."""

    def __init__(
        self,
        max_intfwidth=32,
        insert_input=True,
        insert_output=True,
        insert_extmemw=True,
    ):
        """Configure which IODMA nodes the transformation inserts.

        :param max_intfwidth: maximum AXI interface width in bits; must be a
            power of 2. The per-DMA width is the gcd of the transfer size
            and this value.
        :param insert_input: if True, insert IODMA nodes on graph inputs.
        :param insert_output: if True, insert IODMA nodes on graph outputs.
        :param insert_extmemw: if True, insert IODMA nodes feeding external
            weights of MatrixVectorActivation nodes (mem_mode "external").
        """
        super().__init__()
        self.insert_input = insert_input
        self.insert_output = insert_output
        self.insert_extmemw = insert_extmemw
        assert (
            2 ** math.log2(max_intfwidth) == max_intfwidth
        ), "max_intfwidth must be a power of 2"
        # NOTE(review): apply() reads self.max_intfwidth, so the constructor
        # must store it; the assignment sits just past the visible diff hunk
        # and is restored here -- confirm against the full file.
        self.max_intfwidth = max_intfwidth
...@@ -93,153 +103,163 @@ class InsertIODMA(Transformation): ...@@ -93,153 +103,163 @@ class InsertIODMA(Transformation):
get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow" get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
for x in all_nodes for x in all_nodes
) )
# parse matrixvectoractivation layers looking for external weights with no
# attached IODMA
fc_extw_nodes = list(
filter(
lambda x: x.op_type == "MatrixVectorActivation"
and getCustomOp(x).get_nodeattr("mem_mode") == "external"
and model.find_producer(x.input[1]) is None,
all_nodes,
)
)
# insert IODMAs for graph inputs # insert IODMAs for graph inputs
graph_in_names = [x.name for x in model.graph.input] if self.insert_input:
for graph_in_name in graph_in_names: graph_in_names = [x.name for x in model.graph.input]
first_node = model.find_consumer(graph_in_name) for graph_in_name in graph_in_names:
if first_node.op_type == "IODMA": first_node = model.find_consumer(graph_in_name)
# IODMA already inserted for this input if first_node.op_type == "IODMA":
continue # IODMA already inserted for this input
else: continue
in_shape = model.get_tensor_shape(graph_in_name) else:
in_dtype = model.get_tensor_datatype(graph_in_name) in_shape = model.get_tensor_shape(graph_in_name)
first_node_inst = getCustomOp(first_node) in_dtype = model.get_tensor_datatype(graph_in_name)
in_folded_shape = first_node_inst.get_folded_input_shape() first_node_inst = getCustomOp(first_node)
# take advantage of AXI stream width padding for DMA alignment in_folded_shape = first_node_inst.get_folded_input_shape()
# (AXI streams are always padded to 8 bits) # take advantage of AXI stream width padding for DMA alignment
# this is the width of stream output expected from the DMA # (AXI streams are always padded to 8 bits)
padded_instream_width = first_node_inst.get_instream_width_padded() # this is the width of stream output expected from the DMA
padded_instream_bytes = padded_instream_width // 8 padded_instream_width = first_node_inst.get_instream_width_padded()
padded_instream_bytes = padded_instream_width // 8
# determine the feasible interface width
transfer_bits = padded_instream_width * np.prod(
in_folded_shape[:-1]
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
first_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
)
model.graph.value_info.append(first_node_in)
model.set_tensor_datatype(first_node_in.name, in_dtype)
# reroute first node input
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
first_node.input[0] = first_node_in.name
dma_node = oh.make_node(
"IODMA",
[graph_in_name],
[first_node_in.name],
numInputVectors=in_folded_shape[:-1],
NumChannels=padded_instream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_instream_width,
direction="in",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.insert(0, dma_node)
modified = True
# insert IODMAs for graph outputs
if self.insert_output:
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA":
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
out_dtype = model.get_tensor_datatype(graph_out_name)
final_node_inst = getCustomOp(final_node)
out_folded_shape = final_node_inst.get_folded_output_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream input to DMA
padded_outstream_width = (
final_node_inst.get_outstream_width_padded()
)
padded_outstream_bytes = padded_outstream_width // 8
# determine the feasible interface width
transfer_bits = padded_outstream_width * np.prod(
out_folded_shape[:-1]
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
final_node_out = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
)
model.graph.value_info.append(final_node_out)
model.set_tensor_datatype(final_node_out.name, out_dtype)
# reroute final node output to final_node_out_name
final_node.output[0] = final_node_out.name
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
dma_node = oh.make_node(
"IODMA",
[final_node_out.name],
[graph_out_name],
numInputVectors=out_folded_shape[:-1],
NumChannels=padded_outstream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_outstream_width,
direction="out",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.append(dma_node)
modified = True
if self.insert_extmemw:
# parse matrixvectoractivation layers looking for external weights with no
# attached IODMA
fc_extw_nodes = list(
filter(
lambda x: x.op_type == "MatrixVectorActivation"
and getCustomOp(x).get_nodeattr("mem_mode") == "external"
and model.find_producer(x.input[1]) is None,
all_nodes,
)
)
for fc_node in fc_extw_nodes:
fc_inst = getCustomOp(fc_node)
fc_w_name = fc_node.input[1]
w_shape = model.get_tensor_shape(fc_w_name)
w_dtype = model.get_tensor_datatype(fc_w_name)
# determine the feasible interface width # determine the feasible interface width
transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1]) transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
intfwidth = math.gcd(transfer_bits, self.max_intfwidth) intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert ( assert (
intfwidth % 8 == 0 intfwidth % 8 == 0
), "No feasible interface width for transfer size" ), "No feasible interface width for transfer size"
# calculate width of stream output from DMA
pe = get_by_name(fc_node.attribute, "PE").i
simd = get_by_name(fc_node.attribute, "SIMD").i
streamWidth = fc_inst.get_weightstream_width_padded()
# make new buffer # make new buffer
first_node_in = oh.make_tensor_value_info( W = model.get_initializer(fc_w_name)
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape iodma_mem = self.get_mem_init(W, pe, simd)
model.set_initializer(fc_w_name, iodma_mem)
fc_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
) )
model.graph.value_info.append(first_node_in) model.graph.value_info.append(fc_node_in)
model.set_tensor_datatype(first_node_in.name, in_dtype) model.set_tensor_datatype(fc_node_in.name, w_dtype)
# reroute first node input model.set_initializer(fc_node_in.name, W)
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
first_node.input[0] = first_node_in.name
dma_node = oh.make_node( dma_node = oh.make_node(
"IODMA", "IODMA",
[graph_in_name], [fc_w_name],
[first_node_in.name], [fc_node_in.name],
numInputVectors=in_folded_shape[:-1], numInputVectors=[iodma_mem.shape[0]],
NumChannels=padded_instream_bytes, NumChannels=pe * simd,
dataType="UINT8", dataType=str(w_dtype.name),
intfWidth=intfwidth, intfWidth=intfwidth,
streamWidth=padded_instream_width, streamWidth=streamWidth,
direction="in", direction="in",
burstMode="wrap",
domain="finn.custom_op.fpgadataflow", domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow", backend="fpgadataflow",
) )
fc_node.input[1] = fc_node_in.name
model.graph.node.insert(0, dma_node) model.graph.node.insert(0, dma_node)
modified = True modified = True
# insert IODMAs for graph outputs
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA":
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
out_dtype = model.get_tensor_datatype(graph_out_name)
final_node_inst = getCustomOp(final_node)
out_folded_shape = final_node_inst.get_folded_output_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream input to DMA
padded_outstream_width = final_node_inst.get_outstream_width_padded()
padded_outstream_bytes = padded_outstream_width // 8
# determine the feasible interface width
transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
final_node_out = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
)
model.graph.value_info.append(final_node_out)
model.set_tensor_datatype(final_node_out.name, out_dtype)
# reroute final node output to final_node_out_name
final_node.output[0] = final_node_out.name
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
dma_node = oh.make_node(
"IODMA",
[final_node_out.name],
[graph_out_name],
numInputVectors=out_folded_shape[:-1],
NumChannels=padded_outstream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_outstream_width,
direction="out",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.append(dma_node)
modified = True
for fc_node in fc_extw_nodes:
fc_inst = getCustomOp(fc_node)
fc_w_name = fc_node.input[1]
w_shape = model.get_tensor_shape(fc_w_name)
w_dtype = model.get_tensor_datatype(fc_w_name)
# determine the feasible interface width
transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
# calculate width of stream output from DMA
pe = get_by_name(fc_node.attribute, "PE").i
simd = get_by_name(fc_node.attribute, "SIMD").i
streamWidth = fc_inst.get_weightstream_width_padded()
# make new buffer
W = model.get_initializer(fc_w_name)
iodma_mem = self.get_mem_init(W, pe, simd)
model.set_initializer(fc_w_name, iodma_mem)
fc_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
)
model.graph.value_info.append(fc_node_in)
model.set_tensor_datatype(fc_node_in.name, w_dtype)
model.set_initializer(fc_node_in.name, W)
dma_node = oh.make_node(
"IODMA",
[fc_w_name],
[fc_node_in.name],
numInputVectors=[iodma_mem.shape[0]],
NumChannels=pe * simd,
dataType=str(w_dtype.name),
intfWidth=intfwidth,
streamWidth=streamWidth,
direction="in",
burstMode="wrap",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
fc_node.input[1] = fc_node_in.name
model.graph.node.insert(0, dma_node)
modified = True
if modified: if modified:
model = model.transform(SortGraph()) model = model.transform(SortGraph())
return (model, modified) return (model, modified)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment