Skip to content
Snippets Groups Projects
Unverified Commit f678450e authored by auphelia's avatar auphelia Committed by GitHub
Browse files

Merge pull request #616 from Xilinx/fix/extmem_lookup_and_iodma

Fixes for Lookup external mode and IODMA insertion
parents 9944fd80 e67f4736
No related branches found
No related tags found
No related merge requests found
......@@ -471,6 +471,8 @@ class Lookup(HLSCustomOp):
def get_verilog_top_module_intf_names(self):
    """Return the top-level Verilog interface names for this Lookup layer.

    Extends the base-class interface dict: the AXI-lite control interface
    and the AXI memory-mapped interface are exposed ONLY when the layer
    runs with mem_mode == "external" (embedding table resides in external
    memory). For other mem_mode values the base-class names are returned
    unchanged, so no memory-mapped interfaces are advertised.
    """
    intf_names = super().get_verilog_top_module_intf_names()
    mem_mode = self.get_nodeattr("mem_mode")
    if mem_mode == "external":
        # control interface for base-address configuration plus the
        # AXI-MM master used to fetch embeddings from external memory
        intf_names["axilite"] = ["s_axi_control"]
        intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("ext_mem_width"))]
    return intf_names
......@@ -166,11 +166,12 @@ class CreateStitchedIP(Transformation):
"make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
% (inst_name, aximm_intf_name[0][0])
)
ext_if_name = "m_axi_gmem%d" % (len(self.intf_names["aximm"]))
self.connect_cmds.append(
"set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]"
"set_property name %s [get_bd_intf_ports m_axi_gmem_0]" % ext_if_name
)
self.connect_cmds.append("assign_bd_address")
seg_name = "%s/Data_m_axi_gmem/SEG_m_axi_gmem0_Reg" % (inst_name)
seg_name = "%s/Data_m_axi_gmem/SEG_%s_Reg" % (inst_name, ext_if_name)
self.connect_cmds.append(
"set_property offset 0 [get_bd_addr_segs {%s}]" % (seg_name)
)
......@@ -178,9 +179,7 @@ class CreateStitchedIP(Transformation):
self.connect_cmds.append(
"set_property range 4G [get_bd_addr_segs {%s}]" % (seg_name)
)
self.intf_names["aximm"] = [("m_axi_gmem0", aximm_intf_name[0][1])]
assert self.has_aximm is False, "Currently limited to one AXI-MM interface"
self.intf_names["aximm"] = [(ext_if_name, aximm_intf_name[0][1])]
self.has_aximm = True
def connect_m_axis_external(self, node, idx=None):
......
......@@ -37,10 +37,20 @@ from qonnx.util.basic import get_by_name
class InsertIODMA(Transformation):
"""Insert DMA nodes on all inputs and outputs."""
"""Insert DMA nodes on inputs and outputs, or as specified by filters in
the constructor."""
def __init__(
    self,
    max_intfwidth=32,
    insert_input=True,
    insert_output=True,
    insert_extmemw=True,
):
    """Configure which IODMA nodes the transformation inserts.

    :param max_intfwidth: upper bound (in bits) on the AXI memory-mapped
        interface width; must be a power of 2.
    :param insert_input: insert IODMAs for graph inputs.
    :param insert_output: insert IODMAs for graph outputs.
    :param insert_extmemw: insert IODMAs for external-memory weights
        (MatrixVectorActivation nodes with mem_mode == "external").
    """
    super().__init__()
    self.insert_input = insert_input
    self.insert_output = insert_output
    self.insert_extmemw = insert_extmemw
    # NOTE: math.log2 raises ValueError for non-positive values; for a
    # true power of 2 the round-trip through log2 is exact in floating
    # point, so the equality test is reliable here.
    assert (
        2 ** math.log2(max_intfwidth) == max_intfwidth
    ), "max_intfwidth must be a power of 2"
......@@ -93,153 +103,163 @@ class InsertIODMA(Transformation):
get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
for x in all_nodes
)
# parse matrixvectoractivation layers looking for external weights with no
# attached IODMA
fc_extw_nodes = list(
filter(
lambda x: x.op_type == "MatrixVectorActivation"
and getCustomOp(x).get_nodeattr("mem_mode") == "external"
and model.find_producer(x.input[1]) is None,
all_nodes,
)
)
# insert IODMAs for graph inputs
graph_in_names = [x.name for x in model.graph.input]
for graph_in_name in graph_in_names:
first_node = model.find_consumer(graph_in_name)
if first_node.op_type == "IODMA":
# IODMA already inserted for this input
continue
else:
in_shape = model.get_tensor_shape(graph_in_name)
in_dtype = model.get_tensor_datatype(graph_in_name)
first_node_inst = getCustomOp(first_node)
in_folded_shape = first_node_inst.get_folded_input_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream output expected from the DMA
padded_instream_width = first_node_inst.get_instream_width_padded()
padded_instream_bytes = padded_instream_width // 8
if self.insert_input:
graph_in_names = [x.name for x in model.graph.input]
for graph_in_name in graph_in_names:
first_node = model.find_consumer(graph_in_name)
if first_node.op_type == "IODMA":
# IODMA already inserted for this input
continue
else:
in_shape = model.get_tensor_shape(graph_in_name)
in_dtype = model.get_tensor_datatype(graph_in_name)
first_node_inst = getCustomOp(first_node)
in_folded_shape = first_node_inst.get_folded_input_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream output expected from the DMA
padded_instream_width = first_node_inst.get_instream_width_padded()
padded_instream_bytes = padded_instream_width // 8
# determine the feasible interface width
transfer_bits = padded_instream_width * np.prod(
in_folded_shape[:-1]
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
first_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
)
model.graph.value_info.append(first_node_in)
model.set_tensor_datatype(first_node_in.name, in_dtype)
# reroute first node input
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
first_node.input[0] = first_node_in.name
dma_node = oh.make_node(
"IODMA",
[graph_in_name],
[first_node_in.name],
numInputVectors=in_folded_shape[:-1],
NumChannels=padded_instream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_instream_width,
direction="in",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.insert(0, dma_node)
modified = True
# insert IODMAs for graph outputs
if self.insert_output:
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA":
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
out_dtype = model.get_tensor_datatype(graph_out_name)
final_node_inst = getCustomOp(final_node)
out_folded_shape = final_node_inst.get_folded_output_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream input to DMA
padded_outstream_width = (
final_node_inst.get_outstream_width_padded()
)
padded_outstream_bytes = padded_outstream_width // 8
# determine the feasible interface width
transfer_bits = padded_outstream_width * np.prod(
out_folded_shape[:-1]
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
final_node_out = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
)
model.graph.value_info.append(final_node_out)
model.set_tensor_datatype(final_node_out.name, out_dtype)
# reroute final node output to final_node_out_name
final_node.output[0] = final_node_out.name
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
dma_node = oh.make_node(
"IODMA",
[final_node_out.name],
[graph_out_name],
numInputVectors=out_folded_shape[:-1],
NumChannels=padded_outstream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_outstream_width,
direction="out",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.append(dma_node)
modified = True
if self.insert_extmemw:
# parse matrixvectoractivation layers looking for external weights with no
# attached IODMA
fc_extw_nodes = list(
filter(
lambda x: x.op_type == "MatrixVectorActivation"
and getCustomOp(x).get_nodeattr("mem_mode") == "external"
and model.find_producer(x.input[1]) is None,
all_nodes,
)
)
for fc_node in fc_extw_nodes:
fc_inst = getCustomOp(fc_node)
fc_w_name = fc_node.input[1]
w_shape = model.get_tensor_shape(fc_w_name)
w_dtype = model.get_tensor_datatype(fc_w_name)
# determine the feasible interface width
transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1])
transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# calculate width of stream output from DMA
pe = get_by_name(fc_node.attribute, "PE").i
simd = get_by_name(fc_node.attribute, "SIMD").i
streamWidth = fc_inst.get_weightstream_width_padded()
# make new buffer
first_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
W = model.get_initializer(fc_w_name)
iodma_mem = self.get_mem_init(W, pe, simd)
model.set_initializer(fc_w_name, iodma_mem)
fc_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
)
model.graph.value_info.append(first_node_in)
model.set_tensor_datatype(first_node_in.name, in_dtype)
# reroute first node input
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
first_node.input[0] = first_node_in.name
model.graph.value_info.append(fc_node_in)
model.set_tensor_datatype(fc_node_in.name, w_dtype)
model.set_initializer(fc_node_in.name, W)
dma_node = oh.make_node(
"IODMA",
[graph_in_name],
[first_node_in.name],
numInputVectors=in_folded_shape[:-1],
NumChannels=padded_instream_bytes,
dataType="UINT8",
[fc_w_name],
[fc_node_in.name],
numInputVectors=[iodma_mem.shape[0]],
NumChannels=pe * simd,
dataType=str(w_dtype.name),
intfWidth=intfwidth,
streamWidth=padded_instream_width,
streamWidth=streamWidth,
direction="in",
burstMode="wrap",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
fc_node.input[1] = fc_node_in.name
model.graph.node.insert(0, dma_node)
modified = True
# insert IODMAs for graph outputs
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA":
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
out_dtype = model.get_tensor_datatype(graph_out_name)
final_node_inst = getCustomOp(final_node)
out_folded_shape = final_node_inst.get_folded_output_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream input to DMA
padded_outstream_width = final_node_inst.get_outstream_width_padded()
padded_outstream_bytes = padded_outstream_width // 8
# determine the feasible interface width
transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
final_node_out = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
)
model.graph.value_info.append(final_node_out)
model.set_tensor_datatype(final_node_out.name, out_dtype)
# reroute final node output to final_node_out_name
final_node.output[0] = final_node_out.name
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
dma_node = oh.make_node(
"IODMA",
[final_node_out.name],
[graph_out_name],
numInputVectors=out_folded_shape[:-1],
NumChannels=padded_outstream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_outstream_width,
direction="out",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.append(dma_node)
modified = True
for fc_node in fc_extw_nodes:
fc_inst = getCustomOp(fc_node)
fc_w_name = fc_node.input[1]
w_shape = model.get_tensor_shape(fc_w_name)
w_dtype = model.get_tensor_datatype(fc_w_name)
# determine the feasible interface width
transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
# calculate width of stream output from DMA
pe = get_by_name(fc_node.attribute, "PE").i
simd = get_by_name(fc_node.attribute, "SIMD").i
streamWidth = fc_inst.get_weightstream_width_padded()
# make new buffer
W = model.get_initializer(fc_w_name)
iodma_mem = self.get_mem_init(W, pe, simd)
model.set_initializer(fc_w_name, iodma_mem)
fc_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
)
model.graph.value_info.append(fc_node_in)
model.set_tensor_datatype(fc_node_in.name, w_dtype)
model.set_initializer(fc_node_in.name, W)
dma_node = oh.make_node(
"IODMA",
[fc_w_name],
[fc_node_in.name],
numInputVectors=[iodma_mem.shape[0]],
NumChannels=pe * simd,
dataType=str(w_dtype.name),
intfWidth=intfwidth,
streamWidth=streamWidth,
direction="in",
burstMode="wrap",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
fc_node.input[1] = fc_node_in.name
model.graph.node.insert(0, dma_node)
modified = True
if modified:
model = model.transform(SortGraph())
return (model, modified)
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment