Merge branch 'feature/weight_streamers_rtlsim' of...

Merge branch 'feature/weight_streamers_rtlsim' of https://github.com/Xilinx/finn into feature/cnv_w1a1_convert_to_hls_layers

Merge branch 'feature/weight_streamers_rtlsim' of...
Merge branch 'feature/weight_streamers_rtlsim' of https://github.com/Xilinx/finn into feature/cnv_w1a1_convert_to_hls_layers
8331c41c · Yaman Umuroglu · f2dd151b · fe7d83e8 · 8331c41c · 8331c41c
Commit 8331c41c authored 5 years ago by Yaman Umuroglu
--- a/finn-rtllib/memstream/hdl/ramb18.v
+++ b/finn-rtllib/memstream/hdl/ramb18.v
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -68,7 +68,8 @@ SCRIPTPATH=$(dirname "$SCRIPT")
 BREVITAS_REPO=https://github.com/Xilinx/brevitas.git
 EXAMPLES_REPO=https://github.com/maltanar/brevitas_cnv_lfc.git
 CNPY_REPO=https://github.com/rogersce/cnpy.git
-FINN_HLS_REPO=https://github.com/Xilinx/finn-hlslib.git
+#FINN_HLS_REPO=https://github.com/Xilinx/finn-hlslib.git
+FINN_HLS_REPO=https://github.com/Tobi-Alonso/finn-hlslib.git
 PYVERILATOR_REPO=https://github.com/maltanar/pyverilator
 PYNQSHELL_REPO=https://github.com/maltanar/PYNQ-HelloWorld.git

--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -28,6 +28,7 @@
 import math
 import os
+from shutil import copy
 import numpy as np
 from pyverilator import PyVerilator
@@ -39,7 +40,9 @@ from finn.util.data_packing import (
    npy_to_rtlsim_input,
    numpy_to_hls_code,
    rtlsim_output_to_npy,
+    pack_innermost_dim_as_hex_string,
 )
+from . import templates
 # ONNX i/o tensor shape assumptions for StreamingFCLayer:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
@@ -54,6 +57,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
    def __init__(self, onnx_node):
        super().__init__(onnx_node)
+        # keep the Verilog wrapper template (templates.decoupled_wrapper) on the
+        # instance; code_generation_ipgen fills it in for mem_mode "decoupled"
+        self.decoupled_wrapper = templates.decoupled_wrapper
    def get_nodeattr_types(self):
        my_attrs = {
@@ -413,14 +417,14 @@ class StreamingFCLayer_Batch(HLSCustomOp):
        # convert weights into hlslib-compatible format
        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
        export_wdt = self.get_weight_datatype()
+        # we have converted bipolar weights to binary for export,
+        # so use it as such for weight generation
+        if self.get_weight_datatype() == DataType.BIPOLAR:
+            export_wdt = DataType.BINARY
        code_gen_dir = path
        if mem_mode == "const":
            """Saves weights into params.h"""
-            # we have converted bipolar weights to binary for export,
-            # so use it as such for weight generation
-            if self.get_weight_datatype() == DataType.BIPOLAR:
-                export_wdt = DataType.BINARY
            weight_hls_code = numpy_to_hls_code(
                weight_tensor, export_wdt, "weights", True, True
            )
@@ -448,18 +452,48 @@ class StreamingFCLayer_Batch(HLSCustomOp):
            f_weights.close()
        elif mem_mode == "decoupled":
-            """Saves weights into .npy file"""
+            """Saves weights in corresponding file format for npysim or rtlsim"""
            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
-            weight_tensor = np.transpose(weight_tensor, (0, 2, 1, 3))
+            # and save as unflipped weight tensor to be able to differentiate between
-            # flip PE dimension
+            # flipped and unflipped weight tensor (has to be flipped for npysim)
-            weight_tensor = np.flip(weight_tensor, axis=-2)
-            weight_tensor = np.flip(weight_tensor, axis=-1)
+            weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
-            # reshape weight tensor to desired shape
+            # flip PE dimension and reverse SIMD flip for saving weights in .npy
+            weight_tensor_flipped = np.flip(weight_tensor_unflipped, axis=-2)
+            weight_tensor_flipped = np.flip(weight_tensor_flipped, axis=-1)
+            # reshape weight tensor (flipped and unflipped) to desired shape
            pe = self.get_nodeattr("PE")
            simd = self.get_nodeattr("SIMD")
-            weight_tensor = weight_tensor.reshape(1, -1, pe * simd)
+            # unflipped
-            weight_tensor = weight_tensor.copy()
+            weight_tensor_unflipped = weight_tensor_unflipped.reshape(1, -1, pe * simd)
-            np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor)
+            weight_tensor_unflipped = weight_tensor_unflipped.copy()
+            # flipped
+            weight_tensor_flipped = weight_tensor_flipped.reshape(1, -1, pe * simd)
+            weight_tensor_flipped = weight_tensor_flipped.copy()
+            """Saves weights into .npy file"""
+            np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor_flipped)
+            """Saves weights into .dat file"""
+            # convert weight values into hexstring
+            weight_width = self.get_weightstream_width()
+            weight_tensor_unflipped = pack_innermost_dim_as_hex_string(
+                weight_tensor_unflipped, export_wdt, weight_width
+            )
+            weight_pad = np.zeros((1024), int).astype(str)
+            weight_tensor_unflipped = weight_tensor_unflipped.flatten()
+            # delete "0x" in the beginning of the hexstring
+            for i in range(len(weight_tensor_unflipped)):
+                weight_tensor_unflipped[i] = weight_tensor_unflipped[i][2:]
+            weight_pad[: weight_tensor_unflipped.shape[0]] = weight_tensor_unflipped
+            weight_pad = weight_pad.copy()
+            f = open("{}/memblock_0.dat".format(code_gen_dir), "w+")
+            for val in weight_pad:
+                f.write(val + "\n")
+            f.close()
        else:
            raise Exception(
                """Please set mem_mode to "const"i or "decoupled", currently no other
@@ -572,7 +606,17 @@ class StreamingFCLayer_Batch(HLSCustomOp):
            oshape = self.get_normal_output_shape()
            context[node.output[0]] = context[node.output[0]].reshape(*oshape)
        elif mode == "rtlsim":
-            prefixed_top_name = "%s_%s" % (node.name, node.name)
+            # set top name depending on mem_mode
+            mem_mode = self.get_nodeattr("mem_mode")
+            if mem_mode == "const":
+                prefixed_top_name = "%s_%s" % (node.name, node.name)
+            elif mem_mode == "decoupled":
+                prefixed_top_name = "%s_memstream" % (node.name)
+            else:
+                raise Exception(
+                    """Please set mem_mode to "const" or "decoupled", currently no other
+                    parameter value is supported!"""
+                )
            # check if needed file exists
            verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
                code_gen_dir, node.name, prefixed_top_name
@@ -657,16 +701,16 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                numReps,
            )
        ]
-        if var == "ipgen":
-            self.code_gen_dict["$DEFINES$"].append("#define PRAGMA_SUB(x) _Pragma (#x)")
-            self.code_gen_dict["$DEFINES$"].append("#define DO_PRAGMA(x) PRAGMA_SUB(x)")
        if mem_mode == "decoupled":
            wdt = self.get_weight_datatype()
            self.code_gen_dict["$DEFINES$"].append(
                "#define WP1 {}\n".format(wdt.bitwidth())
            )
+        if var == "ipgen":
+            self.code_gen_dict["$DEFINES$"].append("#define PRAGMA_SUB(x) _Pragma (#x)")
+            self.code_gen_dict["$DEFINES$"].append("#define DO_PRAGMA(x) PRAGMA_SUB(x)")
    def read_npy_data(self):
        code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
        dtype = self.get_input_datatype()
@@ -807,33 +851,46 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                    self.get_outstream_width(),
                )
            ]
+        elif mem_mode == "decoupled":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(
+                    hls::stream<ap_uint<{}>> &in0,
+                    hls::stream<ap_uint<{}>> &weights,
+                    hls::stream<ap_uint<{}>> &out
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_weightstream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
        else:
            raise Exception(
-                """Please set mem_mode to "const", currently no other
+                """Please set mem_mode to "const" or "decoupled", currently no other
                    parameter value is supported!"""
            )
    def pragmas(self):
+        """Collect the HLS pragmas for this layer in self.code_gen_dict["$PRAGMAS$"].
+
+        in0/out become axis interfaces (with optional stream depths), weights are
+        either partitioned in place ("const") or exposed as an additional axis
+        stream ("decoupled"), and thresholds are partitioned when calc_tmem() != 0.
+
+        Raises an Exception for any other mem_mode value.
+        """
        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
-            self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        in_fifo_depth = self.get_nodeattr("inFIFODepth")
+        out_fifo_depth = self.get_nodeattr("outFIFODepth")
+        # insert depth pragmas only if specified
+        if in_fifo_depth != 0:
            self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS INTERFACE axis port=out"
+                "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
            )
-            in_fifo_depth = self.get_nodeattr("inFIFODepth")
+        if out_fifo_depth != 0:
-            out_fifo_depth = self.get_nodeattr("outFIFODepth")
-            # insert depth pragmas only if specified
-            if in_fifo_depth != 0:
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
-                )
-            if out_fifo_depth != 0:
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
-                )
            self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS INTERFACE ap_ctrl_none port=return"
+                "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
            )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+        if mem_mode == "const":
            # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
            # partition for parallel access along the PE dimension (dim 1)
            self.code_gen_dict["$PRAGMAS$"].append(
@@ -842,25 +899,111 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                    "variable=weights.m_weights complete dim=1)"
                )
            )
-            # the threshold tensor is acc_type [PE][TMEM][N_THRES]
+        elif mem_mode == "decoupled":
-            # partition for parallel access along PE and N_THRES
+            self.code_gen_dict["$PRAGMAS$"].append(
-            # dimensions (dims 1 and 3)
+                "#pragma HLS INTERFACE axis port=weights"
-            if self.calc_tmem() != 0:
+            )
-                # TODO find a better way of checking for no pregenerated thresholds
+            self.code_gen_dict["$PRAGMAS$"].append(
-                self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=8 variable=weights"
-                    (
+            )
-                        "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                        "complete dim=1)"
-                    )
-                )
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    (
-                        "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                        "complete dim=3)"
-                    )
-                )
        else:
            raise Exception(
-                """Please set mem_mode to "const", currently no other
+                """Please set mem_mode to "const" or "decoupled", currently no other
                    parameter value is supported!"""
            )
+        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
+        # partition for parallel access along PE and N_THRES
+        # dimensions (dims 1 and 3)
+        if self.calc_tmem() != 0:
+            # TODO find a better way of checking for no pregenerated thresholds
+            self.code_gen_dict["$PRAGMAS$"].append(
+                (
+                    "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=1)"
+                )
+            )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                (
+                    "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=3)"
+                )
+            )
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Generate the HLS sources and, in decoupled mode, the Verilog wrapper.
+
+        Code generation for the MVAU/FCLayer unit itself is delegated to the
+        parent class for every mem_mode. For mem_mode "decoupled" the
+        self.decoupled_wrapper template is additionally filled in and written
+        to <code_gen_dir_ipgen>/<node name>_memstream.v. self.code_gen_dict is
+        cleared before and after it is reused for the wrapper placeholders.
+        """
+        # generate code for all mem_mode of MVAU/FCLayer unit
+        super().code_generation_ipgen(model, fpgapart, clk)
+        # only mem_mode = "decoupled" needs the additional verilog wrapper
+        if self.get_nodeattr("mem_mode") != "decoupled":
+            return
+        node_name = self.onnx_node.name
+        # empty code gen dictionary for new entries
+        self.code_gen_dict.clear()
+        self.code_gen_dict["$TOPNAME$"] = ["{}_memstream".format(node_name)]
+        self.code_gen_dict["$LAYER_NAME$"] = ["{}_{}".format(node_name, node_name)]
+        # round the stream widths up to the next multiple of 8 for the axi
+        # interface; floor(width / 8) + 8 only did that for widths below 8
+        in_width = math.ceil(self.get_instream_width() / 8) * 8
+        self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
+        # NOTE(review): the output width is used as-is, not rounded up like the
+        # input and weight widths - confirm this matches the generated HLS port
+        self.code_gen_dict["$OUT_RANGE$"] = [
+            "[{}:0]".format(self.get_outstream_width() - 1)
+        ]
+        weight_width = math.ceil(self.get_weightstream_width() / 8) * 8
+        self.code_gen_dict["$WEIGHT_RANGE$"] = ["[{}:0]".format(weight_width - 1)]
+        self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
+        # NOTE(review): the depth is MW * MH, i.e. counted in single weights; if
+        # memstream expects stream words this presumably needs dividing by
+        # PE * SIMD - confirm against the memstream component
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        self.code_gen_dict["$WEIGHT_DEPTH$"] = [str(int(mw * mh))]
+        template = self.decoupled_wrapper
+        for key, lines in self.code_gen_dict.items():
+            # transform list into long string separated by '\n'
+            template = template.replace(key, "\n".join(lines))
+        wrapper_path = os.path.join(
+            self.get_nodeattr("code_gen_dir_ipgen"),
+            "{}_memstream.v".format(node_name),
+        )
+        # the context manager closes the file even if the write fails
+        with open(wrapper_path, "w") as f:
+            f.write(template)
+        self.code_gen_dict.clear()
+    def ipgen_singlenode_code(self):
+        """Build the IP block and, for mem_mode "decoupled", stage extra files.
+
+        The parent class generates the IP block for every mem_mode. In
+        decoupled mode the memstream Verilog sources from finn-rtllib, the
+        weight file memblock_0.dat and the generated <node name>_memstream.v
+        wrapper are then copied into
+        <code_gen_dir_ipgen>/project_<node name>/sol1/impl/verilog/.
+        """
+        super().ipgen_singlenode_code()
+        if self.get_nodeattr("mem_mode") != "decoupled":
+            return
+        gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        node_name = self.onnx_node.name
+        target_dir = "{}/project_{}/sol1/impl/verilog/".format(gen_dir, node_name)
+        rtllib_dir = "/workspace/finn/finn-rtllib/memstream/hdl/"
+        # memstream components from finn-rtllib first ...
+        sources = [
+            os.path.join(rtllib_dir, name)
+            for name in os.listdir(rtllib_dir)
+            if name.endswith(".v")
+        ]
+        # ... then the .dat file of weights and the verilog wrapper
+        sources.append("{}/memblock_0.dat".format(gen_dir))
+        sources.append("{}/{}_memstream.v".format(gen_dir, node_name))
+        for src in sources:
+            copy(src, target_dir)
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -102,3 +102,166 @@ csynth_design
 export_design -format ip_catalog
 exit 0
 """
+# verilog wrapper for decoupled mem mode: instantiates a memstream weight
+# streamer and the HLS-generated layer and feeds stream 0 into its weights port.
+# placeholders to be substituted before use: $TOPNAME$, $LAYER_NAME$,
+# $IN_RANGE$, $OUT_RANGE$, $WEIGHT_RANGE$, $WEIGHT_WIDTH$ and $WEIGHT_DEPTH$
+# m_axis_0_tready is driven by the layer instance's output port, so it is
+# declared as a wire (a module output cannot drive a reg in Verilog)
+decoupled_wrapper = """
+module $TOPNAME$(
+ap_clk,
+ap_rst_n,
+in0_V_V_TDATA,
+in0_V_V_TVALID,
+in0_V_V_TREADY,
+out_V_V_TDATA,
+out_V_V_TVALID,
+out_V_V_TREADY
+);
+input   ap_clk;
+input   ap_rst_n;
+input  $IN_RANGE$ in0_V_V_TDATA;
+input   in0_V_V_TVALID;
+output   in0_V_V_TREADY;
+output  $OUT_RANGE$ out_V_V_TDATA;
+output   out_V_V_TVALID;
+input   out_V_V_TREADY;
+reg [31:0] config_address = 0;
+reg config_ce = 0;
+reg config_we = 0;
+reg [31:0] config_d0 = 0;
+wire [31:0] config_q0;
+//multiple wire AXI Streams
+reg m_axis_0_afull = 0;
+wire m_axis_0_tready;
+wire m_axis_0_tvalid;
+wire $WEIGHT_RANGE$ m_axis_0_tdata;
+reg m_axis_1_afull = 0;
+reg m_axis_1_tready = 1;
+wire m_axis_1_tvalid;
+wire $WEIGHT_RANGE$ m_axis_1_tdata;
+reg m_axis_2_afull = 0;
+reg m_axis_2_tready = 1;
+wire m_axis_2_tvalid;
+wire $WEIGHT_RANGE$ m_axis_2_tdata;
+reg m_axis_3_afull = 0;
+reg m_axis_3_tready = 1;
+wire m_axis_3_tvalid;
+wire $WEIGHT_RANGE$ m_axis_3_tdata;
+reg m_axis_4_afull = 0;
+reg m_axis_4_tready = 1;
+wire m_axis_4_tvalid;
+wire $WEIGHT_RANGE$ m_axis_4_tdata;
+reg m_axis_5_afull = 0;
+reg m_axis_5_tready = 1;
+wire m_axis_5_tvalid;
+wire $WEIGHT_RANGE$ m_axis_5_tdata;
+//memstream component
+memstream
+#(
+//parameters to enable/disable axi-mm, set number of streams, set readmemh for
+// memory, set per-stream offsets in memory, set per-stream widths
+.CONFIG_EN(1),
+.NSTREAMS(1),
+.MEM_DEPTH(1024),
+.MEM_WIDTH($WEIGHT_WIDTH$),
+.MEM_INIT("./"),
+//widths per stream
+.STRM0_WIDTH($WEIGHT_WIDTH$),
+.STRM1_WIDTH($WEIGHT_WIDTH$),
+.STRM2_WIDTH($WEIGHT_WIDTH$),
+.STRM3_WIDTH($WEIGHT_WIDTH$),
+.STRM4_WIDTH($WEIGHT_WIDTH$),
+.STRM5_WIDTH($WEIGHT_WIDTH$),
+//depths per stream
+.STRM0_DEPTH($WEIGHT_DEPTH$),
+.STRM1_DEPTH(1),
+.STRM2_DEPTH(1),
+.STRM3_DEPTH(1),
+.STRM4_DEPTH(1),
+.STRM5_DEPTH(1),
+//offsets for each stream
+.STRM0_OFFSET(0),
+.STRM1_OFFSET(0),
+.STRM2_OFFSET(0),
+.STRM3_OFFSET(0),
+.STRM4_OFFSET(0),
+.STRM5_OFFSET(0)
+)
+mem
+(
+.aclk(ap_clk),
+.aresetn(ap_rst_n),
+//optional configuration interface compatible with ap_memory
+.config_address(config_address),
+.config_ce(config_ce),
+.config_we(config_we),
+.config_d0(config_d0),
+.config_q0(config_q0),
+//multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
+.m_axis_0_afull(m_axis_0_afull),
+.m_axis_0_tready(m_axis_0_tready),
+.m_axis_0_tvalid(m_axis_0_tvalid),
+.m_axis_0_tdata(m_axis_0_tdata),
+.m_axis_1_afull(m_axis_1_afull),
+.m_axis_1_tready(m_axis_1_tready),
+.m_axis_1_tvalid(m_axis_1_tvalid),
+.m_axis_1_tdata(m_axis_1_tdata),
+.m_axis_2_afull(m_axis_2_afull),
+.m_axis_2_tready(m_axis_2_tready),
+.m_axis_2_tvalid(m_axis_2_tvalid),
+.m_axis_2_tdata(m_axis_2_tdata),
+.m_axis_3_afull(m_axis_3_afull),
+.m_axis_3_tready(m_axis_3_tready),
+.m_axis_3_tvalid(m_axis_3_tvalid),
+.m_axis_3_tdata(m_axis_3_tdata),
+.m_axis_4_afull(m_axis_4_afull),
+.m_axis_4_tready(m_axis_4_tready),
+.m_axis_4_tvalid(m_axis_4_tvalid),
+.m_axis_4_tdata(m_axis_4_tdata),
+.m_axis_5_afull(m_axis_5_afull),
+.m_axis_5_tready(m_axis_5_tready),
+.m_axis_5_tvalid(m_axis_5_tvalid),
+.m_axis_5_tdata(m_axis_5_tdata)
+);
+//MVA_Stream_Unit
+$LAYER_NAME$
+MVA_Stream_U
+(
+.ap_clk(ap_clk),			//input
+.ap_rst_n(ap_rst_n), 			//input
+.in0_V_V_TDATA(in0_V_V_TDATA),		//$IN_RANGE$ input
+.in0_V_V_TVALID(in0_V_V_TVALID),  	//input
+.in0_V_V_TREADY(in0_V_V_TREADY),	//output
+.weights_V_V_TDATA(m_axis_0_tdata),	//$WEIGHT_RANGE$ input
+.weights_V_V_TVALID(m_axis_0_tvalid),	//input
+.weights_V_V_TREADY(m_axis_0_tready),	//output
+.out_V_V_TDATA(out_V_V_TDATA),		//$OUT_RANGE$ output
+.out_V_V_TVALID(out_V_V_TVALID),	//output
+.out_V_V_TREADY(out_V_V_TREADY)		//input
+);
+endmodule
+"""
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -210,9 +210,12 @@ def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    y_produced = y_produced.reshape(y_expected.shape)
    assert (y_produced == y_expected).all(), "npysim failed"
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT2])
 # weight datatype
@@ -227,7 +230,7 @@ def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 @pytest.mark.parametrize("mw", [4])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [4])
-def test_fpgadataflow_fclayer_rtlsim(idt, wdt, act, nf, sf, mw, mh):
+def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
    if nf == -1:
        nf = mh
    if sf == -1:
@@ -264,6 +267,11 @@ def test_fpgadataflow_fclayer_rtlsim(idt, wdt, act, nf, sf, mw, mh):
        else:
            tdt = DataType.INT32
    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
    # prepare input data
    input_dict = prepare_inputs(x, idt, wdt)
    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR: