diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index d4b2a1032aeb305c85ffb535ac821692ce747c18..d0ef270816c362af730a75b59be71d0457e0b8e2 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -87,6 +87,7 @@ class InsertIODMA(Transformation):
         return reshaped_w
 
     def apply(self, model):
+        modified = False
         # only makes sense for a pure fpgadataflow graph -- so we check!
         all_nodes = list(model.graph.node)
         assert all(
@@ -102,59 +103,14 @@ class InsertIODMA(Transformation):
                 all_nodes,
             )
         )
-        graph_in_name = model.graph.input[0].name
-        first_node = model.find_consumer(graph_in_name)
-        graph_out_name = model.graph.output[0].name
-        final_node = model.find_producer(graph_out_name)
-        if (
-            final_node.op_type == "IODMA"
-            and first_node.op_type == "IODMA"
-            and len(fc_extw_nodes) == 0
-        ):
-            # TODO maybe check the correctness of properties
-            return (model, False)
-        else:
-            if final_node.op_type != "IODMA":
-                out_shape = model.get_tensor_shape(graph_out_name)
-                out_dtype = model.get_tensor_datatype(graph_out_name)
-                final_node_inst = getCustomOp(final_node)
-                out_folded_shape = final_node_inst.get_folded_output_shape()
-                # take advantage of AXI stream width padding for DMA alignment
-                # (AXI streams are always padded to 8 bits)
-                # this is the width of stream input to DMA
-                padded_outstream_width = final_node_inst.get_outstream_width_padded()
-                padded_outstream_bytes = padded_outstream_width // 8
-                # determine the feasible interface width
-                transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
-                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
-                assert (
-                    intfwidth % 8 == 0
-                ), "No feasible interface width for transfer size"
-                # make new buffer
-                final_node_out = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
-                )
-                model.graph.value_info.append(final_node_out)
-                model.set_tensor_datatype(final_node_out.name, out_dtype)
-                # reroute final node output to final_node_out_name
-                final_node.output[0] = final_node_out.name
-                # FIXME: currently always using 8-bit dtypes to work around the
-                # padding problems for i/o DMA
-                dma_node = oh.make_node(
-                    "IODMA",
-                    [final_node_out.name],
-                    [graph_out_name],
-                    numInputVectors=out_folded_shape[:-1],
-                    NumChannels=padded_outstream_bytes,
-                    dataType="UINT8",
-                    intfWidth=intfwidth,
-                    streamWidth=padded_outstream_width,
-                    direction="out",
-                    domain="finn.custom_op.fpgadataflow",
-                    backend="fpgadataflow",
-                )
-                model.graph.node.append(dma_node)
-            if first_node.op_type != "IODMA":
+        # insert IODMAs for graph inputs
+        graph_in_names = [x.name for x in model.graph.input]
+        for graph_in_name in graph_in_names:
+            first_node = model.find_consumer(graph_in_name)
+            if first_node.op_type == "IODMA":
+                # IODMA already inserted for this input
+                continue
+            else:
                 in_shape = model.get_tensor_shape(graph_in_name)
                 in_dtype = model.get_tensor_datatype(graph_in_name)
                 first_node_inst = getCustomOp(first_node)
@@ -194,47 +150,96 @@ class InsertIODMA(Transformation):
                     backend="fpgadataflow",
                 )
                 model.graph.node.insert(0, dma_node)
-            for fc_node in fc_extw_nodes:
-                fc_inst = getCustomOp(fc_node)
-                fc_w_name = fc_node.input[1]
-                w_shape = model.get_tensor_shape(fc_w_name)
-                w_dtype = model.get_tensor_datatype(fc_w_name)
+                modified = True
+        # insert IODMAs for graph outputs
+        graph_out_names = [x.name for x in model.graph.output]
+        for graph_out_name in graph_out_names:
+            final_node = model.find_producer(graph_out_name)
+            if final_node.op_type == "IODMA":
+                continue
+            else:
+                out_shape = model.get_tensor_shape(graph_out_name)
+                out_dtype = model.get_tensor_datatype(graph_out_name)
+                final_node_inst = getCustomOp(final_node)
+                out_folded_shape = final_node_inst.get_folded_output_shape()
+                # take advantage of AXI stream width padding for DMA alignment
+                # (AXI streams are always padded to 8 bits)
+                # this is the width of stream input to DMA
+                padded_outstream_width = final_node_inst.get_outstream_width_padded()
+                padded_outstream_bytes = padded_outstream_width // 8
                 # determine the feasible interface width
-                transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
+                transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
                 intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
                 assert (
                     intfwidth % 8 == 0
                 ), "No feasible interface width for transfer size"
-                # calculate width of stream output from DMA
-                pe = get_by_name(fc_node.attribute, "PE").i
-                simd = get_by_name(fc_node.attribute, "SIMD").i
-                streamWidth = fc_inst.get_weightstream_width_padded()
                 # make new buffer
-                W = model.get_initializer(fc_w_name)
-                iodma_mem = self.get_mem_init(W, pe, simd)
-                model.set_initializer(fc_w_name, iodma_mem)
-
-                fc_node_in = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
+                final_node_out = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
                 )
-                model.graph.value_info.append(fc_node_in)
-                model.set_tensor_datatype(fc_node_in.name, w_dtype)
-                model.set_initializer(fc_node_in.name, W)
+                model.graph.value_info.append(final_node_out)
+                model.set_tensor_datatype(final_node_out.name, out_dtype)
+                # reroute final node output to final_node_out_name
+                final_node.output[0] = final_node_out.name
+                # FIXME: currently always using 8-bit dtypes to work around the
+                # padding problems for i/o DMA
                 dma_node = oh.make_node(
                     "IODMA",
-                    [fc_w_name],
-                    [fc_node_in.name],
-                    numInputVectors=[iodma_mem.shape[0]],
-                    NumChannels=pe * simd,
-                    dataType=str(w_dtype.name),
+                    [final_node_out.name],
+                    [graph_out_name],
+                    numInputVectors=out_folded_shape[:-1],
+                    NumChannels=padded_outstream_bytes,
+                    dataType="UINT8",
                     intfWidth=intfwidth,
-                    streamWidth=streamWidth,
-                    direction="in",
-                    burstMode="wrap",
+                    streamWidth=padded_outstream_width,
+                    direction="out",
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
                 )
-                fc_node.input[1] = fc_node_in.name
-                model.graph.node.insert(0, dma_node)
+                model.graph.node.append(dma_node)
+                modified = True
+
+        for fc_node in fc_extw_nodes:
+            fc_inst = getCustomOp(fc_node)
+            fc_w_name = fc_node.input[1]
+            w_shape = model.get_tensor_shape(fc_w_name)
+            w_dtype = model.get_tensor_datatype(fc_w_name)
+            # determine the feasible interface width
+            transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
+            intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
+            assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
+            # calculate width of stream output from DMA
+            pe = get_by_name(fc_node.attribute, "PE").i
+            simd = get_by_name(fc_node.attribute, "SIMD").i
+            streamWidth = fc_inst.get_weightstream_width_padded()
+            # make new buffer
+            W = model.get_initializer(fc_w_name)
+            iodma_mem = self.get_mem_init(W, pe, simd)
+            model.set_initializer(fc_w_name, iodma_mem)
+
+            fc_node_in = oh.make_tensor_value_info(
+                model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
+            )
+            model.graph.value_info.append(fc_node_in)
+            model.set_tensor_datatype(fc_node_in.name, w_dtype)
+            model.set_initializer(fc_node_in.name, W)
+            dma_node = oh.make_node(
+                "IODMA",
+                [fc_w_name],
+                [fc_node_in.name],
+                numInputVectors=[iodma_mem.shape[0]],
+                NumChannels=pe * simd,
+                dataType=str(w_dtype.name),
+                intfWidth=intfwidth,
+                streamWidth=streamWidth,
+                direction="in",
+                burstMode="wrap",
+                domain="finn.custom_op.fpgadataflow",
+                backend="fpgadataflow",
+            )
+            fc_node.input[1] = fc_node_in.name
+            model.graph.node.insert(0, dma_node)
+            modified = True
+        if modified:
             model = model.transform(SortGraph())
-            return (model, True)
+        return (model, modified)
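Usage sketch (not part of the patch): the transformation is applied through ModelWrapper.transform()
like any other FINN transformation. The file names below are illustrative assumptions, and the import
path for ModelWrapper follows the FINN layout this patch targets; adjust if your tree differs.

    # hedged example: insert IODMA nodes into a dataflow-only model
    from finn.core.modelwrapper import ModelWrapper
    from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA

    model = ModelWrapper("dataflow_model.onnx")  # hypothetical input file
    # the constructor's interface-width cap is used via self.max_intfwidth in apply()
    model = model.transform(InsertIODMA())
    model.save("dataflow_model_with_iodma.onnx")  # hypothetical output file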