diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
index 190bb857ad6e5448d49a6b742adc888f2bca79d2..e78f07b9f1097ee6e1042846a91c2a0ff80d12d0 100644
--- a/src/finn/core/remote_exec.py
+++ b/src/finn/core/remote_exec.py
@@ -43,6 +43,8 @@ def remote_exec(model, execution_context):
     pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
     deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
     inp = execution_context[model.graph.input[0].name]
+    # make copy of array before saving it
+    inp = inp.copy()
     np.save(os.path.join(deployment_dir, "input.npy"), inp)
     # extracting last folder of absolute path (deployment_dir)
     deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 7f13b43d57d9fe2f6de5e5ed9bb52214611f1098..ef784b8ac29ca9e937fcd4ea22a8dfd6e1a7a470 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -207,9 +207,11 @@ Found no codegen dir for this node, did you run the codegen_npysim transformatio
         # assuming dynamic inputs start from 0
         for in_ind in range(count):
             current_input_name = node.input[in_ind]
+            # make copy before saving array
+            input_array = context[current_input_name].copy()
             np.save(
                 os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
-                context[current_input_name],
+                input_array,
             )
 
     def npy_to_dynamic_output(self, context):
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 55daff5f72feddeb467d194ef50e4efe4d509110..463896b4331cf68337b0070e257f27fab36b0031 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -123,6 +123,8 @@ class ConvolutionInputGenerator(HLSCustomOp):
             ), """Input shape doesn't
             match expected shape (1, ifm_ch, ifm_dim, ifm_dim)."""
             reshaped_inp = inp.transpose(0, 2, 3, 1)
+            # make copy before saving array
+            reshaped_inp = reshaped_inp.copy()
             np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_inp)
             # execute the precompiled model
             super().exec_precompiled_singlenode_model()
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 39386b00b729d2d06678dd48b1566f0d39aea5ff..84cffade62578914a70be93a697052abb94c9bee 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -80,6 +80,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # [4] is four vectors (like a FC layer with batch=4)
             # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
             "numInputVectors": ("ints", False, [1]),
+            # memory mode for the FC weights
+            # const -- embedded weights, default, long compile/synth times
+            # decoupled -- streaming weights
+            "mem_mode": ("s", False, "const"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -235,6 +239,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
+    def get_weightstream_width(self):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wp = self.get_weight_datatype().bitwidth()
+        return pe * simd * wp
+
     def get_folded_input_shape(self):
         mw = self.get_nodeattr("MW")
         simd = self.get_nodeattr("SIMD")
@@ -300,8 +310,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         elif (not inp_is_bipolar) and (not wt_is_bipolar):
             ret["TSrcI"] = "Slice<%s>" % inp_hls_str
             ret["TWeightI"] = "Identity"
+
         # fill in TDstI
         ret["TDstI"] = "Slice<%s>" % out_hls_str
+
         return ret
 
     def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
@@ -395,43 +407,66 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
     def generate_params(self, model, path):
-        """Saves weights into params.h and if existing thresholds into thresh.h."""
-        code_gen_dir = path
+        mem_mode = self.get_nodeattr("mem_mode")
         # weights
         weights = model.get_initializer(self.onnx_node.input[1])
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
         export_wdt = self.get_weight_datatype()
-        # we have converted bipolar weights to binary for export,
-        # so use it as such for weight generation
-        if self.get_weight_datatype() == DataType.BIPOLAR:
-            export_wdt = DataType.BINARY
-        weight_hls_code = numpy_to_hls_code(
-            weight_tensor, export_wdt, "weights", True, True
-        )
-        # write weights into params.h
-        # code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
-        f_weights = open("{}/params.h".format(code_gen_dir), "w")
-
-        if export_wdt.bitwidth() != 1:
-            f_weights.write(
-                "static FixedPointWeights<{},{},{},{}> weights = ".format(
-                    self.get_nodeattr("SIMD"),
-                    export_wdt.get_hls_datatype_str(),
-                    self.get_nodeattr("PE"),
-                    self.calc_wmem(),
-                )
+        code_gen_dir = path
+
+        if mem_mode == "const":
+            # saves weights into params.h
+            # we have converted bipolar weights to binary for export,
+            # so use it as such for weight generation
+            if self.get_weight_datatype() == DataType.BIPOLAR:
+                export_wdt = DataType.BINARY
+            weight_hls_code = numpy_to_hls_code(
+                weight_tensor, export_wdt, "weights", True, True
             )
-        else:
-            f_weights.write(
-                "static BinaryWeights<{},{},{}> weights = ".format(
-                    self.get_nodeattr("SIMD"), self.get_nodeattr("PE"), self.calc_wmem()
+            # write weights into params.h
+            f_weights = open("{}/params.h".format(code_gen_dir), "w")
+
+            if export_wdt.bitwidth() != 1:
+                f_weights.write(
+                    "static FixedPointWeights<{},{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
+                        export_wdt.get_hls_datatype_str(),
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
                 )
+            else:
+                f_weights.write(
+                    "static BinaryWeights<{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
+                )
+            f_weights.write(weight_hls_code)
+            f_weights.close()
+
+        elif mem_mode == "decoupled":
+            # saves weights into .npy file
+            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
+            weight_tensor = np.transpose(weight_tensor, (0, 2, 1, 3))
+            # flip PE and SIMD dimensions
+            weight_tensor = np.flip(weight_tensor, axis=-2)
+            weight_tensor = np.flip(weight_tensor, axis=-1)
+            # reshape weight tensor to desired shape
+            pe = self.get_nodeattr("PE")
+            simd = self.get_nodeattr("SIMD")
+            weight_tensor = weight_tensor.reshape(1, -1, pe * simd)
+            weight_tensor = weight_tensor.copy()
+            np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor)
+        else:
+            raise Exception(
+                """Please set mem_mode to "const" or "decoupled", currently no other
+                    parameter value is supported!"""
             )
-        f_weights.write(weight_hls_code)
-        f_weights.close()
 
-        # thresholds
+        # save thresholds in thresh.h
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
             if thresholds is not None:
@@ -452,7 +487,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     threshold_tensor, tdt, "thresholds", False, True
                 )
                 # write thresholds into thresh.h
-                # code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
                 f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
                 tdt_hls = tdt.get_hls_datatype_str()
                 # use binary to export bipolar activations
@@ -462,7 +496,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 odt_hls = export_odt.get_hls_datatype_str()
                 f_thresh.write(
                     "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
-                     = ".format(
+                    = ".format(
                         self.calc_tmem(),
                         self.get_nodeattr("PE"),
                         threshold_tensor.shape[-1],
@@ -511,6 +545,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     export_idt = DataType.BINARY
                 else:
                     export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
                 np.save(
                     os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
                     reshaped_input,
@@ -589,12 +625,23 @@ class StreamingFCLayer_Batch(HLSCustomOp):
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
         self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
-        self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
+
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
+        elif mem_mode == "decoupled":
+            self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
+        else:
+            raise Exception(
+                """Please set mem_mode to "const" or "decoupled", currently no other
+                    parameter value is supported!"""
+            )
         if self.calc_tmem() != 0:
             # TODO find a better way of checking for no pregenerated thresholds
             self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
 
     def defines(self, var):
+        mem_mode = self.get_nodeattr("mem_mode")
         numReps = 1
         self.code_gen_dict["$DEFINES$"] = [
             """#define MW1 {}\n #define MH1 {}\n #define SIMD1 {}\n
@@ -613,6 +660,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             self.code_gen_dict["$DEFINES$"].append("#define PRAGMA_SUB(x) _Pragma (#x)")
             self.code_gen_dict["$DEFINES$"].append("#define DO_PRAGMA(x) PRAGMA_SUB(x)")
 
+        if mem_mode == "decoupled":
+            wdt = self.get_weight_datatype()
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define WP1 {}\n".format(wdt.bitwidth())
+            )
+
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
         dtype = self.get_input_datatype()
@@ -632,7 +685,23 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
         )
 
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            wdt = self.get_weight_datatype()
+            elem_bits = wdt.bitwidth()
+            packed_bits = self.get_weightstream_width()
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            elem_hls_type = wdt.get_hls_datatype_str()
+            npy_type = "float"
+            npy_in = "%s/weights.npy" % code_gen_dir
+
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false);'
+                % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            )
+
     def strm_decl(self):
+        mem_mode = self.get_nodeattr("mem_mode")
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
@@ -641,25 +710,58 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
 
+        if mem_mode == "decoupled":
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> weights ("weights");'.format(
+                    self.get_weightstream_width()
+                )
+            )
+
     def docompute(self):
-        node = self.onnx_node
+        mem_mode = self.get_nodeattr("mem_mode")
         tmpl_args = self.get_template_param_values()
         if self.calc_tmem() == 0:
             odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
             threshs = "PassThroughActivation<%s>()" % odtype_hls_str
         else:
             threshs = "threshs"
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<MW1, MH1, SIMD1, PE1, {}, {}, {}>
-            (in0, out, weights, {}, numReps, {});""".format(
-                node.op_type,
-                tmpl_args["TSrcI"],
-                tmpl_args["TDstI"],
-                tmpl_args["TWeightI"],
-                threshs,
-                self.get_nodeattr("resType"),
+        if mem_mode == "const":
+            node = self.onnx_node
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<MW1, MH1, SIMD1, PE1, {}, {}, {}>
+                (in0, out, weights, {}, numReps, {});""".format(
+                    node.op_type,
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    threshs,
+                    self.get_nodeattr("resType"),
+                )
+            ]
+        elif mem_mode == "decoupled":
+            wdt = self.get_weight_datatype()
+            if wdt == DataType.BIPOLAR:
+                export_wdt = DataType.BINARY
+            else:
+                export_wdt = wdt
+            wdtype_hls_str = export_wdt.get_hls_datatype_str()
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Matrix_Vector_Activate_Stream_Batch<MW1, MH1, SIMD1, PE1, {}, {}, {}, {} >
+                (in0, out, weights, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    wdtype_hls_str,
+                    threshs,
+                    self.get_nodeattr("resType"),
+                )
+            ]
+
+        else:
+            raise Exception(
+                """Please set mem_mode to "const" or "decoupled", currently no other
+                    parameter value is supported!"""
             )
-        ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_npysim")
@@ -693,54 +795,71 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0,
-                hls::stream<ap_uint<{}>> &out
-                )""".format(
-                self.onnx_node.name,
-                self.get_instream_width(),
-                self.get_outstream_width(),
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0,
+                    hls::stream<ap_uint<{}>> &out
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", currently no other
+                    parameter value is supported!"""
             )
-        ]
 
     def pragmas(self):
-        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
-        in_fifo_depth = self.get_nodeattr("inFIFODepth")
-        out_fifo_depth = self.get_nodeattr("outFIFODepth")
-        # insert depth pragmas only if specified
-        if in_fifo_depth != 0:
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
             self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
+                "#pragma HLS INTERFACE axis port=out"
             )
-        if out_fifo_depth != 0:
+            in_fifo_depth = self.get_nodeattr("inFIFODepth")
+            out_fifo_depth = self.get_nodeattr("outFIFODepth")
+            # insert depth pragmas only if specified
+            if in_fifo_depth != 0:
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
+                )
+            if out_fifo_depth != 0:
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
+                )
             self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
-            )
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE ap_ctrl_none port=return"
-        )
-        # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
-        # partition for parallel access along the PE dimension (dim 1)
-        self.code_gen_dict["$PRAGMAS$"].append(
-            (
-                "DO_PRAGMA(HLS ARRAY_PARTITION "
-                "variable=weights.m_weights complete dim=1)"
+                "#pragma HLS INTERFACE ap_ctrl_none port=return"
             )
-        )
-        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
-        # partition for parallel access along PE and N_THRES dimensions (dims 1 and 3)
-        if self.calc_tmem() != 0:
-            # TODO find a better way of checking for no pregenerated thresholds
+            # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
+            # partition for parallel access along the PE dimension (dim 1)
             self.code_gen_dict["$PRAGMAS$"].append(
                 (
-                    "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                    "complete dim=1)"
+                    "DO_PRAGMA(HLS ARRAY_PARTITION "
+                    "variable=weights.m_weights complete dim=1)"
                 )
             )
-            self.code_gen_dict["$PRAGMAS$"].append(
-                (
-                    "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                    "complete dim=3)"
+            # the threshold tensor is acc_type [PE][TMEM][N_THRES]
+            # partition for parallel access along PE and N_THRES
+            # dimensions (dims 1 and 3)
+            if self.calc_tmem() != 0:
+                # TODO find a better way of checking for no pregenerated thresholds
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                        "complete dim=1)"
+                    )
                 )
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                        "complete dim=3)"
+                    )
+                )
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", currently no other
+                    parameter value is supported!"""
             )
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index ae98d312e7c5923a572f918430aececf29f3e094..1c08ac4fc7a9eedcc45deab824d7ec036941f808 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -297,6 +297,8 @@ def rtlsim_output_to_npy(
     out_array = unpack_innermost_dim_from_hex_string(
         output, dtype, shape, packedBits=packedBits, reverse_inner=reverse_inner
     )
+    # make copy before saving the array
+    out_array = out_array.copy()
     np.save(path, out_array)
     return out_array
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 93bf0750776331af236154d8f2a005913cbb1c33..4201f64c963cb506305fd9b9a9fa32f66ae74226 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -31,6 +31,7 @@ import pytest
 import numpy as np
 from onnx import TensorProto, helper
 
+from finn.custom_op.registry import getCustomOp
 import finn.core.onnx_exec as oxe
 import finn.custom_op.xnorpopcount as xp
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
@@ -128,6 +129,8 @@ def prepare_inputs(input_tensor, idt, wdt):
         return {"inp": input_tensor}
 
 
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT2])
 # weight datatype
@@ -135,14 +138,14 @@ def prepare_inputs(input_tensor, idt, wdt):
 # input datatype
 @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
 # neuron folding, -1 is maximum possible
-@pytest.mark.parametrize("nf", [-1, 1])
+@pytest.mark.parametrize("nf", [-1, 2, 1])
 # synapse folding, -1 is maximum possible
-@pytest.mark.parametrize("sf", [-1, 1])
+@pytest.mark.parametrize("sf", [-1, 2, 1])
 # HLS matrix width (input features)
 @pytest.mark.parametrize("mw", [4])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [4])
-def test_fpgadataflow_fclayer_npysim(idt, wdt, act, nf, sf, mw, mh):
+def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     if nf == -1:
         nf = mh
     if sf == -1:
@@ -179,6 +182,10 @@ def test_fpgadataflow_fclayer_npysim(idt, wdt, act, nf, sf, mw, mh):
         else:
             tdt = DataType.INT32
     model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
     model = model.transform(SetExecMode("npysim"))
     model = model.transform(CodeGen_npysim())
     model = model.transform(Compile())
@@ -201,7 +208,9 @@ def test_fpgadataflow_fclayer_npysim(idt, wdt, act, nf, sf, mw, mh):
     y_expected = y.reshape(oshape)
     # execute model
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
-    assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "npysim failed"
+
+    y_produced = y_produced.reshape(y_expected.shape)
+    assert (y_produced == y_expected).all(), "npysim failed"
 
 
 # activation: None or DataType
diff --git a/tests/util/test_data_packing.py b/tests/util/test_data_packing.py
index 495ec60966ef67f3bf7b99c63cc70e133859d087..28f1d56d0dbc5451ccad3d36b4b1d4c6bed4f63e 100644
--- a/tests/util/test_data_packing.py
+++ b/tests/util/test_data_packing.py
@@ -104,6 +104,8 @@ g++ -o test_npy2apintstream test.cpp /workspace/cnpy/cnpy.cpp \
         ["sh", "compile.sh"], stdout=subprocess.PIPE, cwd=test_dir
     )
     (stdout, stderr) = compile.communicate()
+    # make copy before saving the array
+    ndarray = ndarray.copy()
     np.save(npy_in, ndarray)
     execute = subprocess.Popen(
         "./test_npy2apintstream", stdout=subprocess.PIPE, cwd=test_dir