Skip to content
Snippets Groups Projects
Commit b32ad623 authored by Yaman Umuroglu's avatar Yaman Umuroglu
Browse files

[IODMA] make IODMA insertion more configurable

parent 168a58a6
No related branches found
No related tags found
No related merge requests found
...@@ -37,10 +37,20 @@ from qonnx.util.basic import get_by_name ...@@ -37,10 +37,20 @@ from qonnx.util.basic import get_by_name
class InsertIODMA(Transformation):
    """Insert DMA nodes on inputs and outputs, or as specified by filters in
    the constructor."""

    def __init__(
        self,
        max_intfwidth=32,
        insert_input=True,
        insert_output=True,
        insert_extmemw=True,
    ):
        """Configure which IODMA nodes the transformation inserts.

        :param max_intfwidth: maximum AXI interface width in bits; must be a
            power of 2. The per-DMA width is the gcd of the transfer size
            and this value.
        :param insert_input: if True, insert IODMA nodes on graph inputs.
        :param insert_output: if True, insert IODMA nodes on graph outputs.
        :param insert_extmemw: if True, insert IODMA nodes feeding external
            weights of MatrixVectorActivation nodes (mem_mode "external").
        """
        super().__init__()
        self.insert_input = insert_input
        self.insert_output = insert_output
        self.insert_extmemw = insert_extmemw
        assert (
            2 ** math.log2(max_intfwidth) == max_intfwidth
        ), "max_intfwidth must be a power of 2"
        # NOTE(review): apply() reads self.max_intfwidth, so the constructor
        # must store it; the assignment sits just past the visible diff hunk
        # and is restored here -- confirm against the full file.
        self.max_intfwidth = max_intfwidth
...@@ -93,153 +103,163 @@ class InsertIODMA(Transformation): ...@@ -93,153 +103,163 @@ class InsertIODMA(Transformation):
get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow" get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
for x in all_nodes for x in all_nodes
) )
# parse matrixvectoractivation layers looking for external weights with no
# attached IODMA
fc_extw_nodes = list(
filter(
lambda x: x.op_type == "MatrixVectorActivation"
and getCustomOp(x).get_nodeattr("mem_mode") == "external"
and model.find_producer(x.input[1]) is None,
all_nodes,
)
)
# insert IODMAs for graph inputs # insert IODMAs for graph inputs
graph_in_names = [x.name for x in model.graph.input] if self.insert_input:
for graph_in_name in graph_in_names: graph_in_names = [x.name for x in model.graph.input]
first_node = model.find_consumer(graph_in_name) for graph_in_name in graph_in_names:
if first_node.op_type == "IODMA": first_node = model.find_consumer(graph_in_name)
# IODMA already inserted for this input if first_node.op_type == "IODMA":
continue # IODMA already inserted for this input
else: continue
in_shape = model.get_tensor_shape(graph_in_name) else:
in_dtype = model.get_tensor_datatype(graph_in_name) in_shape = model.get_tensor_shape(graph_in_name)
first_node_inst = getCustomOp(first_node) in_dtype = model.get_tensor_datatype(graph_in_name)
in_folded_shape = first_node_inst.get_folded_input_shape() first_node_inst = getCustomOp(first_node)
# take advantage of AXI stream width padding for DMA alignment in_folded_shape = first_node_inst.get_folded_input_shape()
# (AXI streams are always padded to 8 bits) # take advantage of AXI stream width padding for DMA alignment
# this is the width of stream output expected from the DMA # (AXI streams are always padded to 8 bits)
padded_instream_width = first_node_inst.get_instream_width_padded() # this is the width of stream output expected from the DMA
padded_instream_bytes = padded_instream_width // 8 padded_instream_width = first_node_inst.get_instream_width_padded()
padded_instream_bytes = padded_instream_width // 8
# determine the feasible interface width
transfer_bits = padded_instream_width * np.prod(
in_folded_shape[:-1]
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
first_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
)
model.graph.value_info.append(first_node_in)
model.set_tensor_datatype(first_node_in.name, in_dtype)
# reroute first node input
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
first_node.input[0] = first_node_in.name
dma_node = oh.make_node(
"IODMA",
[graph_in_name],
[first_node_in.name],
numInputVectors=in_folded_shape[:-1],
NumChannels=padded_instream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_instream_width,
direction="in",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.insert(0, dma_node)
modified = True
# insert IODMAs for graph outputs
if self.insert_output:
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA":
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
out_dtype = model.get_tensor_datatype(graph_out_name)
final_node_inst = getCustomOp(final_node)
out_folded_shape = final_node_inst.get_folded_output_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream input to DMA
padded_outstream_width = (
final_node_inst.get_outstream_width_padded()
)
padded_outstream_bytes = padded_outstream_width // 8
# determine the feasible interface width
transfer_bits = padded_outstream_width * np.prod(
out_folded_shape[:-1]
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
final_node_out = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
)
model.graph.value_info.append(final_node_out)
model.set_tensor_datatype(final_node_out.name, out_dtype)
# reroute final node output to final_node_out_name
final_node.output[0] = final_node_out.name
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
dma_node = oh.make_node(
"IODMA",
[final_node_out.name],
[graph_out_name],
numInputVectors=out_folded_shape[:-1],
NumChannels=padded_outstream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_outstream_width,
direction="out",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.append(dma_node)
modified = True
if self.insert_extmemw:
# parse matrixvectoractivation layers looking for external weights with no
# attached IODMA
fc_extw_nodes = list(
filter(
lambda x: x.op_type == "MatrixVectorActivation"
and getCustomOp(x).get_nodeattr("mem_mode") == "external"
and model.find_producer(x.input[1]) is None,
all_nodes,
)
)
for fc_node in fc_extw_nodes:
fc_inst = getCustomOp(fc_node)
fc_w_name = fc_node.input[1]
w_shape = model.get_tensor_shape(fc_w_name)
w_dtype = model.get_tensor_datatype(fc_w_name)
# determine the feasible interface width # determine the feasible interface width
transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1]) transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
intfwidth = math.gcd(transfer_bits, self.max_intfwidth) intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert ( assert (
intfwidth % 8 == 0 intfwidth % 8 == 0
), "No feasible interface width for transfer size" ), "No feasible interface width for transfer size"
# calculate width of stream output from DMA
pe = get_by_name(fc_node.attribute, "PE").i
simd = get_by_name(fc_node.attribute, "SIMD").i
streamWidth = fc_inst.get_weightstream_width_padded()
# make new buffer # make new buffer
first_node_in = oh.make_tensor_value_info( W = model.get_initializer(fc_w_name)
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape iodma_mem = self.get_mem_init(W, pe, simd)
model.set_initializer(fc_w_name, iodma_mem)
fc_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
) )
model.graph.value_info.append(first_node_in) model.graph.value_info.append(fc_node_in)
model.set_tensor_datatype(first_node_in.name, in_dtype) model.set_tensor_datatype(fc_node_in.name, w_dtype)
# reroute first node input model.set_initializer(fc_node_in.name, W)
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
first_node.input[0] = first_node_in.name
dma_node = oh.make_node( dma_node = oh.make_node(
"IODMA", "IODMA",
[graph_in_name], [fc_w_name],
[first_node_in.name], [fc_node_in.name],
numInputVectors=in_folded_shape[:-1], numInputVectors=[iodma_mem.shape[0]],
NumChannels=padded_instream_bytes, NumChannels=pe * simd,
dataType="UINT8", dataType=str(w_dtype.name),
intfWidth=intfwidth, intfWidth=intfwidth,
streamWidth=padded_instream_width, streamWidth=streamWidth,
direction="in", direction="in",
burstMode="wrap",
domain="finn.custom_op.fpgadataflow", domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow", backend="fpgadataflow",
) )
fc_node.input[1] = fc_node_in.name
model.graph.node.insert(0, dma_node) model.graph.node.insert(0, dma_node)
modified = True modified = True
# insert IODMAs for graph outputs
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA":
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
out_dtype = model.get_tensor_datatype(graph_out_name)
final_node_inst = getCustomOp(final_node)
out_folded_shape = final_node_inst.get_folded_output_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream input to DMA
padded_outstream_width = final_node_inst.get_outstream_width_padded()
padded_outstream_bytes = padded_outstream_width // 8
# determine the feasible interface width
transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
final_node_out = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
)
model.graph.value_info.append(final_node_out)
model.set_tensor_datatype(final_node_out.name, out_dtype)
# reroute final node output to final_node_out_name
final_node.output[0] = final_node_out.name
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
dma_node = oh.make_node(
"IODMA",
[final_node_out.name],
[graph_out_name],
numInputVectors=out_folded_shape[:-1],
NumChannels=padded_outstream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_outstream_width,
direction="out",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.append(dma_node)
modified = True
for fc_node in fc_extw_nodes:
fc_inst = getCustomOp(fc_node)
fc_w_name = fc_node.input[1]
w_shape = model.get_tensor_shape(fc_w_name)
w_dtype = model.get_tensor_datatype(fc_w_name)
# determine the feasible interface width
transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
# calculate width of stream output from DMA
pe = get_by_name(fc_node.attribute, "PE").i
simd = get_by_name(fc_node.attribute, "SIMD").i
streamWidth = fc_inst.get_weightstream_width_padded()
# make new buffer
W = model.get_initializer(fc_w_name)
iodma_mem = self.get_mem_init(W, pe, simd)
model.set_initializer(fc_w_name, iodma_mem)
fc_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
)
model.graph.value_info.append(fc_node_in)
model.set_tensor_datatype(fc_node_in.name, w_dtype)
model.set_initializer(fc_node_in.name, W)
dma_node = oh.make_node(
"IODMA",
[fc_w_name],
[fc_node_in.name],
numInputVectors=[iodma_mem.shape[0]],
NumChannels=pe * simd,
dataType=str(w_dtype.name),
intfWidth=intfwidth,
streamWidth=streamWidth,
direction="in",
burstMode="wrap",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
fc_node.input[1] = fc_node_in.name
model.graph.node.insert(0, dma_node)
modified = True
if modified: if modified:
model = model.transform(SortGraph()) model = model.transform(SortGraph())
return (model, modified) return (model, modified)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment