Skip to content
Snippets Groups Projects
Unverified Commit f678450e authored by auphelia's avatar auphelia Committed by GitHub
Browse files

Merge pull request #616 from Xilinx/fix/extmem_lookup_and_iodma

Fixes for Lookup external mode and IODMA insertion
parents 9944fd80 e67f4736
No related branches found
No related tags found
No related merge requests found
......@@ -471,6 +471,8 @@ class Lookup(HLSCustomOp):
def get_verilog_top_module_intf_names(self):
    """Return the top-level Verilog interface names for this Lookup layer.

    Extends the base-class interface dict: the AXI-lite control interface
    and the AXI memory-mapped interface are exposed ONLY when the layer
    runs with mem_mode == "external" (embedding table resides in external
    memory). For other mem_mode values the base-class names are returned
    unchanged, so no memory-mapped interfaces are advertised.
    """
    intf_names = super().get_verilog_top_module_intf_names()
    mem_mode = self.get_nodeattr("mem_mode")
    if mem_mode == "external":
        # control interface for base-address configuration plus the
        # AXI-MM master used to fetch embeddings from external memory
        intf_names["axilite"] = ["s_axi_control"]
        intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("ext_mem_width"))]
    return intf_names
......@@ -166,11 +166,12 @@ class CreateStitchedIP(Transformation):
"make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
% (inst_name, aximm_intf_name[0][0])
)
ext_if_name = "m_axi_gmem%d" % (len(self.intf_names["aximm"]))
self.connect_cmds.append(
"set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]"
"set_property name %s [get_bd_intf_ports m_axi_gmem_0]" % ext_if_name
)
self.connect_cmds.append("assign_bd_address")
seg_name = "%s/Data_m_axi_gmem/SEG_m_axi_gmem0_Reg" % (inst_name)
seg_name = "%s/Data_m_axi_gmem/SEG_%s_Reg" % (inst_name, ext_if_name)
self.connect_cmds.append(
"set_property offset 0 [get_bd_addr_segs {%s}]" % (seg_name)
)
......@@ -178,9 +179,7 @@ class CreateStitchedIP(Transformation):
self.connect_cmds.append(
"set_property range 4G [get_bd_addr_segs {%s}]" % (seg_name)
)
self.intf_names["aximm"] = [("m_axi_gmem0", aximm_intf_name[0][1])]
assert self.has_aximm is False, "Currently limited to one AXI-MM interface"
self.intf_names["aximm"] = [(ext_if_name, aximm_intf_name[0][1])]
self.has_aximm = True
def connect_m_axis_external(self, node, idx=None):
......
......@@ -37,10 +37,20 @@ from qonnx.util.basic import get_by_name
class InsertIODMA(Transformation):
"""Insert DMA nodes on all inputs and outputs."""
"""Insert DMA nodes on inputs and outputs, or as specified by filters in
the constructor."""
def __init__(
    self,
    max_intfwidth=32,
    insert_input=True,
    insert_output=True,
    insert_extmemw=True,
):
    """Configure which IODMA nodes the transformation inserts.

    :param max_intfwidth: upper bound (in bits) on the AXI memory-mapped
        interface width; must be a power of 2.
    :param insert_input: insert IODMAs for graph inputs.
    :param insert_output: insert IODMAs for graph outputs.
    :param insert_extmemw: insert IODMAs for external-memory weights
        (MatrixVectorActivation nodes with mem_mode == "external").
    """
    super().__init__()
    self.insert_input = insert_input
    self.insert_output = insert_output
    self.insert_extmemw = insert_extmemw
    # NOTE: math.log2 raises ValueError for non-positive values; for a
    # true power of 2 the round-trip through log2 is exact in floating
    # point, so the equality test is reliable here.
    assert (
        2 ** math.log2(max_intfwidth) == max_intfwidth
    ), "max_intfwidth must be a power of 2"
......@@ -93,153 +103,163 @@ class InsertIODMA(Transformation):
get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
for x in all_nodes
)
# parse matrixvectoractivation layers looking for external weights with no
# attached IODMA
fc_extw_nodes = list(
filter(
lambda x: x.op_type == "MatrixVectorActivation"
and getCustomOp(x).get_nodeattr("mem_mode") == "external"
and model.find_producer(x.input[1]) is None,
all_nodes,
)
)
# insert IODMAs for graph inputs
graph_in_names = [x.name for x in model.graph.input]
for graph_in_name in graph_in_names:
first_node = model.find_consumer(graph_in_name)
if first_node.op_type == "IODMA":
# IODMA already inserted for this input
continue
else:
in_shape = model.get_tensor_shape(graph_in_name)
in_dtype = model.get_tensor_datatype(graph_in_name)
first_node_inst = getCustomOp(first_node)
in_folded_shape = first_node_inst.get_folded_input_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream output expected from the DMA
padded_instream_width = first_node_inst.get_instream_width_padded()
padded_instream_bytes = padded_instream_width // 8
if self.insert_input:
graph_in_names = [x.name for x in model.graph.input]
for graph_in_name in graph_in_names:
first_node = model.find_consumer(graph_in_name)
if first_node.op_type == "IODMA":
# IODMA already inserted for this input
continue
else:
in_shape = model.get_tensor_shape(graph_in_name)
in_dtype = model.get_tensor_datatype(graph_in_name)
first_node_inst = getCustomOp(first_node)
in_folded_shape = first_node_inst.get_folded_input_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream output expected from the DMA
padded_instream_width = first_node_inst.get_instream_width_padded()
padded_instream_bytes = padded_instream_width // 8
# determine the feasible interface width
transfer_bits = padded_instream_width * np.prod(
in_folded_shape[:-1]
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
first_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
)
model.graph.value_info.append(first_node_in)
model.set_tensor_datatype(first_node_in.name, in_dtype)
# reroute first node input
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
first_node.input[0] = first_node_in.name
dma_node = oh.make_node(
"IODMA",
[graph_in_name],
[first_node_in.name],
numInputVectors=in_folded_shape[:-1],
NumChannels=padded_instream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_instream_width,
direction="in",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.insert(0, dma_node)
modified = True
# insert IODMAs for graph outputs
if self.insert_output:
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA":
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
out_dtype = model.get_tensor_datatype(graph_out_name)
final_node_inst = getCustomOp(final_node)
out_folded_shape = final_node_inst.get_folded_output_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream input to DMA
padded_outstream_width = (
final_node_inst.get_outstream_width_padded()
)
padded_outstream_bytes = padded_outstream_width // 8
# determine the feasible interface width
transfer_bits = padded_outstream_width * np.prod(
out_folded_shape[:-1]
)
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
final_node_out = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
)
model.graph.value_info.append(final_node_out)
model.set_tensor_datatype(final_node_out.name, out_dtype)
# reroute final node output to final_node_out_name
final_node.output[0] = final_node_out.name
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
dma_node = oh.make_node(
"IODMA",
[final_node_out.name],
[graph_out_name],
numInputVectors=out_folded_shape[:-1],
NumChannels=padded_outstream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_outstream_width,
direction="out",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.append(dma_node)
modified = True
if self.insert_extmemw:
# parse matrixvectoractivation layers looking for external weights with no
# attached IODMA
fc_extw_nodes = list(
filter(
lambda x: x.op_type == "MatrixVectorActivation"
and getCustomOp(x).get_nodeattr("mem_mode") == "external"
and model.find_producer(x.input[1]) is None,
all_nodes,
)
)
for fc_node in fc_extw_nodes:
fc_inst = getCustomOp(fc_node)
fc_w_name = fc_node.input[1]
w_shape = model.get_tensor_shape(fc_w_name)
w_dtype = model.get_tensor_datatype(fc_w_name)
# determine the feasible interface width
transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1])
transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# calculate width of stream output from DMA
pe = get_by_name(fc_node.attribute, "PE").i
simd = get_by_name(fc_node.attribute, "SIMD").i
streamWidth = fc_inst.get_weightstream_width_padded()
# make new buffer
first_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
W = model.get_initializer(fc_w_name)
iodma_mem = self.get_mem_init(W, pe, simd)
model.set_initializer(fc_w_name, iodma_mem)
fc_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
)
model.graph.value_info.append(first_node_in)
model.set_tensor_datatype(first_node_in.name, in_dtype)
# reroute first node input
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
first_node.input[0] = first_node_in.name
model.graph.value_info.append(fc_node_in)
model.set_tensor_datatype(fc_node_in.name, w_dtype)
model.set_initializer(fc_node_in.name, W)
dma_node = oh.make_node(
"IODMA",
[graph_in_name],
[first_node_in.name],
numInputVectors=in_folded_shape[:-1],
NumChannels=padded_instream_bytes,
dataType="UINT8",
[fc_w_name],
[fc_node_in.name],
numInputVectors=[iodma_mem.shape[0]],
NumChannels=pe * simd,
dataType=str(w_dtype.name),
intfWidth=intfwidth,
streamWidth=padded_instream_width,
streamWidth=streamWidth,
direction="in",
burstMode="wrap",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
fc_node.input[1] = fc_node_in.name
model.graph.node.insert(0, dma_node)
modified = True
# insert IODMAs for graph outputs
graph_out_names = [x.name for x in model.graph.output]
for graph_out_name in graph_out_names:
final_node = model.find_producer(graph_out_name)
if final_node.op_type == "IODMA":
continue
else:
out_shape = model.get_tensor_shape(graph_out_name)
out_dtype = model.get_tensor_datatype(graph_out_name)
final_node_inst = getCustomOp(final_node)
out_folded_shape = final_node_inst.get_folded_output_shape()
# take advantage of AXI stream width padding for DMA alignment
# (AXI streams are always padded to 8 bits)
# this is the width of stream input to DMA
padded_outstream_width = final_node_inst.get_outstream_width_padded()
padded_outstream_bytes = padded_outstream_width // 8
# determine the feasible interface width
transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert (
intfwidth % 8 == 0
), "No feasible interface width for transfer size"
# make new buffer
final_node_out = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
)
model.graph.value_info.append(final_node_out)
model.set_tensor_datatype(final_node_out.name, out_dtype)
# reroute final node output to final_node_out_name
final_node.output[0] = final_node_out.name
# FIXME: currently always using 8-bit dtypes to work around the
# padding problems for i/o DMA
dma_node = oh.make_node(
"IODMA",
[final_node_out.name],
[graph_out_name],
numInputVectors=out_folded_shape[:-1],
NumChannels=padded_outstream_bytes,
dataType="UINT8",
intfWidth=intfwidth,
streamWidth=padded_outstream_width,
direction="out",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
model.graph.node.append(dma_node)
modified = True
for fc_node in fc_extw_nodes:
fc_inst = getCustomOp(fc_node)
fc_w_name = fc_node.input[1]
w_shape = model.get_tensor_shape(fc_w_name)
w_dtype = model.get_tensor_datatype(fc_w_name)
# determine the feasible interface width
transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
# calculate width of stream output from DMA
pe = get_by_name(fc_node.attribute, "PE").i
simd = get_by_name(fc_node.attribute, "SIMD").i
streamWidth = fc_inst.get_weightstream_width_padded()
# make new buffer
W = model.get_initializer(fc_w_name)
iodma_mem = self.get_mem_init(W, pe, simd)
model.set_initializer(fc_w_name, iodma_mem)
fc_node_in = oh.make_tensor_value_info(
model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
)
model.graph.value_info.append(fc_node_in)
model.set_tensor_datatype(fc_node_in.name, w_dtype)
model.set_initializer(fc_node_in.name, W)
dma_node = oh.make_node(
"IODMA",
[fc_w_name],
[fc_node_in.name],
numInputVectors=[iodma_mem.shape[0]],
NumChannels=pe * simd,
dataType=str(w_dtype.name),
intfWidth=intfwidth,
streamWidth=streamWidth,
direction="in",
burstMode="wrap",
domain="finn.custom_op.fpgadataflow",
backend="fpgadataflow",
)
fc_node.input[1] = fc_node_in.name
model.graph.node.insert(0, dma_node)
modified = True
if modified:
model = model.transform(SortGraph())
return (model, modified)
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment