diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 67cce8675681be47036ffaf3a3428b8c43284215..a0ca34ed0a6838dfe9c680cc6c16961ac7f897ed 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -354,3 +354,56 @@ $LAYER_NAME$
 
 endmodule
 """
+
+decoupled_thresholding_template = """
+template <
+    unsigned ImgDim, unsigned NumChannels, unsigned PE,
+    typename TSrcI = Identity, typename TDstI = Identity,
+    int ActVal=0, typename TT, unsigned int NumSteps,
+    typename TI, typename TO>
+void Thresholding_Stream_Batch(hls::stream<TI> &in,
+                        hls::stream<TO> &out,
+                        hls::stream<ap_uint<PE*NumSteps*TT::width>> &weight,
+                        int const reps)
+{
+
+  // how many folds the channels are split into
+  // (NF chunks of PE channels are processed per input pixel)
+  unsigned const NF = NumChannels / PE;
+
+  ThresholdsActivation<1, PE, NumSteps, TT, TO, ActVal, std::less_equal<TT>> internal_thr;
+  #pragma HLS ARRAY_PARTITION variable=internal_thr.m_thresholds complete dim=0
+
+  // everything merged into a common iteration space (one "big" loop instead
+  // of smaller nested loops) to get the pipelining the way we want
+  for (unsigned i = 0; i < reps * ImgDim * ImgDim * NF; i++)
+  {
+    #pragma HLS PIPELINE II=1
+
+    ap_uint<PE*NumSteps*TT::width> packed_thr;
+    packed_thr = weight.read();
+    // slicer to get 1 PE's worth of thresholds
+    auto const pe_slicer = Slice<ap_uint<NumSteps*TT::width>>()(packed_thr);
+
+    TI inElem;
+    inElem = in.read();
+    auto outElem = TDstI().template operator()<TO>();
+
+    for (unsigned pe = 0; pe < PE; pe++)
+    {
+      #pragma HLS UNROLL
+      // slicer to get individual thresholds
+      auto const thr_slicer = Slice<TT>()(pe_slicer(pe, 0));
+      for (unsigned nt = 0; nt < NumSteps; nt++)
+      {
+      #pragma HLS UNROLL
+        internal_thr.m_thresholds[pe][0][nt] = thr_slicer(nt, 0);
+      }
+
+      auto const act = TSrcI()(inElem);
+      outElem(pe,0,1) = internal_thr.activate(0, pe, act(pe,0));
+    }
+    out.write(outElem);
+  }
+}
+"""
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 2429bf6190f822fb4a6c988fcbb34152d5a338e0..ccb065f62a8340b916bfa5f6cf96c23c65d19d12 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -26,7 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from math import ceil
+from math import ceil, log2
+import textwrap
 import os
 
 import numpy as np
@@ -34,11 +35,15 @@ import numpy as np
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.basic import (
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
     rtlsim_output_to_npy,
+    pack_innermost_dim_as_hex_string,
 )
 from . import templates
 
@@ -58,12 +63,17 @@ class Thresholding_Batch(HLSCustomOp):
 
     def get_nodeattr_types(self):
         my_attrs = {
+            # parallelization; channels thresholded per cycle
             "PE": ("i", True, 0),
+            # number of channels (each may have different thresholds)
             "NumChannels": ("i", True, 0),
+            # number of steps in thresholding function
+            "numSteps": ("i", True, 1),
             # string defining memory type
             "ram_style": ("s", False, "distributed"),
-            # FINN DataTypes for inputs, weights, outputs
+            # FINN DataTypes for inputs, weights (thresholds), outputs
             "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
             # input and output FIFO depths
             "inFIFODepth": ("i", False, 0),
@@ -75,6 +85,20 @@ class Thresholding_Batch(HLSCustomOp):
             "numInputVectors": ("ints", False, [1]),
             # initialization value for the thresholding accumulator
             "ActVal": ("i", False, 0),
+            # memory mode for the thresholds
+            # const -- embedded thresholds, default
+            # decoupled -- streaming thresholds, with a streamer packaged inside the IP
+            "mem_mode": ("s", False, "const"),
+            # (mem_mode = decoupled only) whether weights (thresholds) will be
+            # writable through an AXI-lite interface during runtime:
+            # 1 for enabled, 0 for disabled.
+            # See finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights.
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
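
A hypothetical usage sketch for the two new attributes (assumes `model` is a ModelWrapper whose first node is a Thresholding_Batch; not part of this PR):

    from finn.custom_op.registry import getCustomOp

    inst = getCustomOp(model.graph.node[0])
    inst.set_nodeattr("mem_mode", "decoupled")
    inst.set_nodeattr("runtime_writeable_weights", 1)
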
@@ -183,6 +207,34 @@ class Thresholding_Batch(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
+    def get_weight_datatype(self):
+        """Returns FINN DataType of thresholds, here called weights."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def minimize_accumulator_width(self, model):
+        "Minimize threshold width ('accumulator width' here due to convention)"
+        thresholds = model.get_initializer(self.onnx_node.input[1])
+        threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+        min_threshold = thresholds.min()
+        max_threshold = thresholds.max()
+        min_input = self.get_input_datatype().min()
+        max_input = self.get_input_datatype().max()
+        # get range required by threshold values
+        tdt_min = min(min_input, min_threshold)
+        tdt_max = max(max_input, max_threshold)
+        if tdt_min < 0:
+            if abs(tdt_min) > tdt_max:
+                tdt = DataType.get_smallest_possible(tdt_min)
+            else:
+                tdt = DataType.get_smallest_possible(0 - tdt_max - 1)
+        else:
+            tdt = DataType.get_smallest_possible(tdt_max)
+        assert np.vectorize(tdt.allowed)(
+            threshold_tensor
+        ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+        self.set_nodeattr("weightDataType", tdt.name)
+        return DataType[self.get_nodeattr("weightDataType")]
+
     def get_instream_width(self):
         i_bits = self.get_input_datatype().bitwidth()
         return i_bits * self.get_nodeattr("PE")
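
A worked example of the datatype selection in minimize_accumulator_width above, restated as a standalone sketch (illustrative helper name, not part of this PR):

    from finn.core.datatype import DataType

    def smallest_threshold_dt(min_input, max_input, min_thr, max_thr):
        # mirrors the range logic of minimize_accumulator_width
        tdt_min = min(min_input, min_thr)
        tdt_max = max(max_input, max_thr)
        if tdt_min < 0:
            if abs(tdt_min) > tdt_max:
                return DataType.get_smallest_possible(tdt_min)
            # need a signed type whose positive range still covers tdt_max
            return DataType.get_smallest_possible(-tdt_max - 1)
        return DataType.get_smallest_possible(tdt_max)

    # e.g. INT8 inputs (-128..127) with thresholds in [-200, 50] -> INT9
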
@@ -191,6 +243,28 @@ class Thresholding_Batch(HLSCustomOp):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
+    def get_weightstream_width(self):
+        """Returns weight stream width. Used only in decoupled mode."""
+        if self.get_nodeattr("mem_mode") == "decoupled":
+            pe = self.get_nodeattr("PE")
+            wp = self.get_weight_datatype().bitwidth()
+            n_thres_steps = self.get_nodeattr("numSteps")
+            w_width = pe * wp * n_thres_steps
+            return w_width
+        else:
+            return 0
+
+    def get_weightstream_width_padded(self):
+        """Returns weight stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec. Used in decoupled mode."""
+        weight_width = self.get_weightstream_width()
+        return roundup_to_integer_multiple(weight_width, 8)
+
+    def get_ap_int_max_w(self):
+        temp_value = super().get_ap_int_max_w()
+        weightstream = self.get_weightstream_width()
+        return max([weightstream, temp_value])
+
     def get_folded_input_shape(self):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
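
Illustrative numbers for the stream-width helpers added above: with PE=1, 5-bit thresholds and numSteps=3 the raw weight stream is 15 bits wide, and get_weightstream_width_padded rounds that up to 16 for AXI Stream. A sketch of the same arithmetic (values are hypothetical):

    pe, wp, n_steps = 1, 5, 3        # illustrative values
    w_width = pe * wp * n_steps      # 15 bits on the raw weight stream
    w_padded = -(-w_width // 8) * 8  # 16, as roundup_to_integer_multiple(15, 8)
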
@@ -251,6 +325,9 @@ class Thresholding_Batch(HLSCustomOp):
         ), """Threshold matrix dimension is
         not as expected (2)."""
         n_thres_steps = orig_thres_matrix.shape[1]
+        assert n_thres_steps == self.get_nodeattr(
+            "numSteps"
+        ), "Mismatch in threshold steps"
         if not self.get_input_datatype().signed():
             # ensure all thresholds are nonnegative
             assert (orig_thres_matrix >= 0).all()
@@ -279,56 +356,126 @@ class Thresholding_Batch(HLSCustomOp):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
-    def generate_params(self, model, path):
-        code_gen_dir = path
-        # save thresholds in thresh.h
-        thresholds = model.get_initializer(self.onnx_node.input[1])
-
-        threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights (thresholds) in appropriate
+        format for this layer. This file can be used for either synthesis or
+        run-time reconfig of weights.
 
-        min_threshold = thresholds.min()
-        max_threshold = thresholds.max()
-        min_input = self.get_input_datatype().min()
-        max_input = self.get_input_datatype().max()
-        # get range required by threshold values
-        tdt_min = min(min_input, min_threshold)
-        tdt_max = max(max_input, max_threshold)
-        if tdt_min < 0:
-            if abs(tdt_min) > tdt_max:
-                tdt = DataType.get_smallest_possible(tdt_min)
-            else:
-                tdt = DataType.get_smallest_possible(0 - tdt_max - 1)
-        else:
-            tdt = DataType.get_smallest_possible(tdt_max)
+        Arguments:
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {hls_header, decoupled_npy,
+          decoupled_verilog_dat, decoupled_runtime}
+        * weight_file_name : filename for the weight file to be generated
+        """
+        threshold_tensor = self.get_hls_compatible_threshold_tensor(weights)
+        tdt = self.get_weight_datatype()
         assert np.vectorize(tdt.allowed)(
             threshold_tensor
         ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+        if weight_file_mode == "hls_header":
+            # save thresholds in thresh.h
+            thresholds_hls_code = numpy_to_hls_code(
+                threshold_tensor, tdt, "thresholds", False, True
+            )
+            # write thresholds into thresh.h
+            f_thresh = open(weight_file_name, "w")
+            tdt_hls = tdt.get_hls_datatype_str()
+            # use binary to export bipolar activations
+            export_odt = self.get_output_datatype()
+            if self.get_output_datatype() == DataType.BIPOLAR:
+                export_odt = DataType.BINARY
+            odt_hls = export_odt.get_hls_datatype_str()
+            f_thresh.write(
+                "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
+                = ".format(
+                    self.calc_tmem(),
+                    self.get_nodeattr("PE"),
+                    threshold_tensor.shape[-1],
+                    tdt_hls,
+                    odt_hls,
+                    self.get_nodeattr("ActVal"),
+                    "std::less_equal<%s>" % tdt_hls,
+                )
+            )
+            f_thresh.write(thresholds_hls_code)
+            f_thresh.close()
+        elif "decoupled" in weight_file_mode:
+            # streaming thresholds need to be organized differently
+            # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps)
+            decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3))
+            # TODO add flips/reversals as needed here
+            # (1, tmem, pe, n_thres_steps) -> (1, tmem, pe * n_thres_steps)
+            pe = self.get_nodeattr("PE")
+            n_thres_steps = self.get_nodeattr("numSteps")
+            decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2)
+            decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps)
+            decoupled_thres = decoupled_thres.copy()
+            decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape(
+                1, -1, pe * n_thres_steps
+            )
+            decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy()
+
+            if weight_file_mode == "decoupled_npy":
+                # save weight stream into npy for cppsim
+                np.save(weight_file_name, decoupled_thres)
+            elif weight_file_mode == "decoupled_verilog_dat":
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        f.write(val + "\n")
+            elif weight_file_mode == "decoupled_runtime":
+                # memstream axi-lite interface will map each mem line to
+                # one or multiple 32-bit words
+                weight_width = self.get_weightstream_width()
+                words_per_memwidth = 2 ** ceil(log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Decoupled weight export not yet implemented")
+        else:
+            raise Exception("Unknown weight_file_mode")
 
-        thresholds_hls_code = numpy_to_hls_code(
-            threshold_tensor, tdt, "thresholds", False, True
-        )
-        # write thresholds into thresh.h
-        f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
-        tdt_hls = tdt.get_hls_datatype_str()
-        # use binary to export bipolar activations
-        export_odt = self.get_output_datatype()
-        if self.get_output_datatype() == DataType.BIPOLAR:
-            export_odt = DataType.BINARY
-        odt_hls = export_odt.get_hls_datatype_str()
-        f_thresh.write(
-            "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
-            = ".format(
-                self.calc_tmem(),
-                self.get_nodeattr("PE"),
-                threshold_tensor.shape[-1],
-                tdt_hls,
-                odt_hls,
-                self.get_nodeattr("ActVal"),
-                "std::less_equal<%s>" % tdt_hls,
+    def generate_params(self, model, path):
+        code_gen_dir = path
+        thresholds = model.get_initializer(self.onnx_node.input[1])
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            # save thresholds in thresh.h
+            weight_filename = "{}/thresh.h".format(code_gen_dir)
+            self.make_weight_file(thresholds, "hls_header", weight_filename)
+        elif mem_mode == "decoupled":
+            # save decoupled weights for cppsim
+            weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir)
+            self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim)
+            # also save weights as Verilog .dat file
+            weight_filename_rtl = "{}/memblock_0.dat".format(code_gen_dir)
+            self.make_weight_file(
+                thresholds, "decoupled_verilog_dat", weight_filename_rtl
             )
-        )
-        f_thresh.write(thresholds_hls_code)
-        f_thresh.close()
+        else:
+            raise Exception("Unrecognized mem_mode")
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
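
To make the decoupled_runtime layout above concrete, a small sketch under the assumption of a hypothetical 40-bit weight stream: the stream pads to two 32-bit words per memory line, and each line of the .dat file holds one 32-bit word, least significant word first.

    import textwrap
    from math import ceil, log2

    weight_width = 40                       # hypothetical stream width
    words_per_memwidth = max(1, 2 ** ceil(log2(weight_width / 32)))  # -> 2
    line = "00000000d1c2b3a4"               # one 64-bit mem line, 16 hex digits
    words_32b = textwrap.wrap(line, 8)      # ["00000000", "d1c2b3a4"]
    words_32b.reverse()                     # write "d1c2b3a4" first
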
@@ -373,7 +520,7 @@ class Thresholding_Batch(HLSCustomOp):
                     reshaped_input,
                 )
             elif in_ind > 2:
-                raise Exception("Unexpected input found for StreamingFCLayer")
+                raise Exception("Unexpected input found for Thresholding_Batch")
             in_ind += 1
 
         if mode == "cppsim":
@@ -400,7 +547,23 @@ class Thresholding_Batch(HLSCustomOp):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            output = self.rtlsim(sim, inp)
+            if self.get_nodeattr("mem_mode") == "decoupled":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                wei = npy_to_rtlsim_input(
+                    "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            elif self.get_nodeattr("mem_mode") == "const":
+                output = self.rtlsim(sim, inp)
+            else:
+                raise Exception("Unrecognized mem_mode")
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
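
The list repetition in the decoupled rtlsim branch above replays the whole threshold stream once per input vector, since the layer consumes all of its weight words for every vector it processes. A toy illustration (hypothetical values):

    wei = [0x0123, 0x4567]      # one full pass of threshold words (TMEM = 2)
    num_w_reps = 3              # three input vectors to process
    stream = wei * num_w_reps   # 6 words: the full set replayed per vector
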
@@ -425,7 +588,8 @@ class Thresholding_Batch(HLSCustomOp):
 
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
-        self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+        if self.get_nodeattr("mem_mode") == "const":
+            self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
 
     # TODO check and add whatever missing
     def defines(self, var):
@@ -436,6 +600,21 @@ class Thresholding_Batch(HLSCustomOp):
                 self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps,
             )
         ]
+        if self.get_nodeattr("mem_mode") == "decoupled":
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define ActVal1 %d" % self.get_nodeattr("ActVal")
+            )
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define ThresType1 %s"
+                % self.get_weight_datatype().get_hls_datatype_str()
+            )
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define NumSteps1 %d" % self.get_nodeattr("numSteps")
+            )
+            # TODO remove once Thresholding_Stream_Batch is in hlslib:
+            self.code_gen_dict["$DEFINES$"].append(
+                templates.decoupled_thresholding_template
+            )
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -452,6 +631,20 @@ class Thresholding_Batch(HLSCustomOp):
             'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
             % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
         )
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            tdt = self.get_weight_datatype()
+            elem_bits = tdt.bitwidth()
+            packed_bits = self.get_weightstream_width()
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            elem_hls_type = tdt.get_hls_datatype_str()
+            npy_type = "float"
+            npy_in = "%s/thresholds.npy" % code_gen_dir
+
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);'
+                % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            )
 
     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
@@ -461,6 +654,13 @@ class Thresholding_Batch(HLSCustomOp):
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> weights ("weights");'.format(
+                    self.get_weightstream_width()
+                )
+            )
 
     def docompute(self):
         tmpl_args = self.get_template_param_values()
@@ -474,12 +674,26 @@ class Thresholding_Batch(HLSCustomOp):
             imgdim = ishape[1]
         else:
             raise Exception("""Unexpeted input shape""")
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<{}, NumChannels1, PE1, {}, {}>
-            (in0, out, threshs, numReps);""".format(
-                node.op_type, imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"],
-            )
-        ]
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<{}, NumChannels1, PE1, {}, {}>
+                (in0, out, threshs, numReps);""".format(
+                    node.op_type, imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"],
+                )
+            ]
+        elif mem_mode == "decoupled":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
+                (in0, out, weights, numReps);""".format(
+                    "Thresholding_Stream_Batch",
+                    imgdim,
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                )
+            ]
+        else:
+            raise Exception("Unrecognized mem_mode")
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -513,15 +727,30 @@ class Thresholding_Batch(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0,
-                hls::stream<ap_uint<{}>> &out
-                )""".format(
-                self.onnx_node.name,
-                self.get_instream_width(),
-                self.get_outstream_width(),
-            )
-        ]
+        if self.get_nodeattr("mem_mode") == "const":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0,
+                    hls::stream<ap_uint<{}>> &out
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        elif self.get_nodeattr("mem_mode") == "decoupled":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0,
+                    hls::stream<ap_uint<{}>> &weights,
+                    hls::stream<ap_uint<{}>> &out
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_weightstream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        else:
+            raise Exception("Unrecognized mem_mode")
 
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
@@ -530,46 +759,173 @@ class Thresholding_Batch(HLSCustomOp):
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
 
-        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
-        # partition for parallel access along PE and N_THRES
-        # dimensions (dims 1 and 3)
-        self.code_gen_dict["$PRAGMAS$"].append(
-            (
-                "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                "complete dim=1"
+        if self.get_nodeattr("mem_mode") == "const":
+            # the threshold tensor is acc_type [PE][TMEM][N_THRES]
+            # partition for parallel access along PE and N_THRES
+            # dimensions (dims 1 and 3)
+            self.code_gen_dict["$PRAGMAS$"].append(
+                (
+                    "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=1"
+                )
             )
-        )
-        self.code_gen_dict["$PRAGMAS$"].append(
-            (
-                "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
-                "complete dim=3"
+            self.code_gen_dict["$PRAGMAS$"].append(
+                (
+                    "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
+                    "complete dim=3"
+                )
             )
-        )
-        # set resource type
-        ram_style = self.get_nodeattr("ram_style")
-        pe = self.get_nodeattr("PE")
-        ich = self.get_nodeattr("NumChannels")
-        # if PE less than NumChannels, assign cores according to ram_style;
-        # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs
-        if pe < ich:
-            if ram_style == "distributed":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    (
-                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
-                        "core=ROM_2P_LUTRAM"
+            # set resource type
+            ram_style = self.get_nodeattr("ram_style")
+            pe = self.get_nodeattr("PE")
+            ich = self.get_nodeattr("NumChannels")
+            # if PE less than NumChannels, assign cores according to ram_style;
+            # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs
+            if pe < ich:
+                if ram_style == "distributed":
+                    self.code_gen_dict["$PRAGMAS$"].append(
+                        (
+                            "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                            "core=ROM_2P_LUTRAM"
+                        )
                     )
-                )
-            elif ram_style == "block":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    (
-                        "#pragma HLS RESOURCE variable=threshs.m_thresholds "
-                        "core=ROM_2P_BRAM"
+                elif ram_style == "block":
+                    self.code_gen_dict["$PRAGMAS$"].append(
+                        (
+                            "#pragma HLS RESOURCE variable=threshs.m_thresholds "
+                            "core=ROM_2P_BRAM"
+                        )
                     )
-                )
-            else:
-                raise Exception(
-                    """Invalid value for attribute ram_style! Is currently set to: {}
-                has to be set to one of ("block", "distributed")""".format(
-                        ram_style
+                else:
+                    raise Exception(
+                        """Invalid value for attribute ram_style! Is currently set to: {}
+                    has to be set to one of ("block", "distributed")""".format(
+                            ram_style
+                        )
                     )
+        elif self.get_nodeattr("mem_mode") == "decoupled":
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=weights"
+            )
+
+    def code_generation_ipi(self):
+        cmd = []
+        # add streamer if needed
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            node_name = self.onnx_node.name
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
+            )
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
+            )
+            # instantiate the hls ip
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
+            )
+            # instantiate a streamer and connect it to the HLS IP
+            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_inst = node_name + "_wstrm"
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (strm_vlnv, node_name, strm_inst)
+            )
+            cmd.append(
+                "set_property -dict [list "
+                "CONFIG.NSTREAMS {1} "
+                "CONFIG.MEM_DEPTH {%d} "
+                "CONFIG.MEM_WIDTH {%d} "
+                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.RAM_STYLE {%s} "
+                "CONFIG.STRM0_DEPTH {%d} "
+                "CONFIG.STRM0_WIDTH {%d} "
+                "CONFIG.STRM0_OFFSET {0} "
+                "] [get_bd_cells /%s/%s]"
+                % (
+                    self.calc_tmem(),
+                    self.get_weightstream_width_padded(),
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("ram_style"),
+                    self.calc_tmem(),
+                    self.get_weightstream_width_padded(),
+                    node_name,
+                    strm_inst,
                 )
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+                "[get_bd_intf_pins %s/%s/weights_V_V]"
+                % (node_name, strm_inst, node_name, node_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                % (node_name, rst_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                % (node_name, clk_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, rst_name, node_name, node_name, rst_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, clk_name, node_name, node_name, clk_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, din_name, node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, dout_name, node_name, node_name, dout_name)
+            )
+            if runtime_writable:
+                # expose axi lite interface for writeable weights
+                axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
+                cmd.append(
+                    "create_bd_intf_pin -mode Slave "
+                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
+                    % (node_name, axilite_name)
+                )
+                cmd.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                    "[get_bd_intf_pins %s/%s/%s]"
+                    % (node_name, axilite_name, node_name, strm_inst, axilite_name)
+                )
+                # TODO calculate and pass in segment size here
+                cmd.append("assign_bd_address")
+            cmd.append("save_bd_design")
+        elif mem_mode == "const":
+            # base class impl sufficient for const mode
+            return super().code_generation_ipi()
+        else:
+            raise Exception("Unrecognized mem_mode for Thresholding_Batch")
+        return cmd
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if runtime_writable:
+                intf_names["axilite"] = ["s_axilite"]
+        return intf_names
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index d6f2e04f762a6b67ace989fa802829fd3e5a6fb5..f27ebc645dbee20ff97b64aa942e375250f60cbd 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -780,6 +780,10 @@ class InferVVAU(Transformation):
 class InferThresholdingLayer(Transformation):
     """Convert any MultiThreshold into a standalone thresholding HLS layer."""
 
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
@@ -791,6 +795,7 @@ class InferThresholdingLayer(Transformation):
                 thl_threshold = node.input[1]
                 thl_output = node.output[0]
                 thl_in_shape = model.get_tensor_shape(thl_input)
+                thl_thres_shape = model.get_tensor_shape(thl_threshold)
                 idt = model.get_tensor_datatype(thl_input)
 
                 # skip conversion for layers with float input
@@ -841,10 +846,13 @@ class InferThresholdingLayer(Transformation):
                     backend="fpgadataflow",
                     NumChannels=ifc,
                     PE=pe,
+                    numSteps=thl_thres_shape[1],
                     inputDataType=idt.name,
+                    weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
                     outputDataType=odt.name,
                     numInputVectors=list(thl_in_shape[:-1]),
                     ActVal=actval,
+                    mem_mode=self.mem_mode,
                 )
                 graph.node.insert(insert_point, new_node)
                 # remove old node
@@ -852,6 +860,7 @@ class InferThresholdingLayer(Transformation):
                 graph_modified = True
 
         if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
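
A hypothetical conversion sketch using the new mem_mode argument (assumes `model` is a ModelWrapper containing MultiThreshold nodes):

    import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls

    model = model.transform(to_hls.InferThresholdingLayer(mem_mode="decoupled"))
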
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 75fa625ff00ad6d367e2d6c94d98705f391fb9be..8461efd15576fc04906b7f48b2629ad83835de38 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -46,9 +46,16 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.util.basic import gen_finn_dt_tensor
 from finn.custom_op.registry import getCustomOp
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+import os
+from finn.util.pyverilator import axilite_read, axilite_write
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.core.rtlsim_exec import rtlsim_exec
 
+test_fpga_part = "xc7z020clg400-1"
+target_clk_ns = 5
 
-def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval):
+
+def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode):
     NumChannels = T.shape[0]
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, NumChannels])
@@ -64,9 +71,12 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval):
         backend="fpgadataflow",
         NumChannels=NumChannels,
         PE=pe,
+        numSteps=T.shape[1],
         inputDataType=idt.name,
+        weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
         outputDataType=odt.name,
         ActVal=actval,
+        mem_mode=mem_mode,
     )
     graph = helper.make_graph(
         nodes=[Thresholding_node],
@@ -96,9 +106,11 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval):
 @pytest.mark.parametrize("ich", [16])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+# memory mode
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 @pytest.mark.vivado
 @pytest.mark.slow
-def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
+def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
     if nf == -1:
         nf = ich
     pe = ich // nf
@@ -118,7 +130,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
     else:
         actval = odt.min()
 
-    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval)
+    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode)
 
     if exec_mode == "cppsim":
         model = model.transform(PrepareCppSim())
@@ -127,7 +139,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
     elif exec_mode == "rtlsim":
         model = model.transform(SetExecMode("rtlsim"))
         model = model.transform(GiveUniqueNodeNames())
-        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
     else:
@@ -164,3 +176,102 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode):
         exp_cycles = exp_cycles_dict[node.name]
         assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
         assert exp_cycles != 0
+
+
+@pytest.mark.vivado
+def test_runtime_thresholds_single_layer():
+    mem_mode = "decoupled"
+    act = DataType.INT4
+    idt = DataType.INT16
+    nf = 8
+    ich = 16
+    pe = ich // nf
+    assert ich % pe == 0
+
+    # generate input data
+    in_tensor = gen_finn_dt_tensor(idt, (1, ich))
+
+    odt = act
+    n_steps = act.get_num_possible_values() - 1
+    T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
+    # provide non-decreasing thresholds
+    T = np.sort(T, axis=1)
+
+    if odt == DataType.BIPOLAR:
+        actval = 0
+    else:
+        actval = odt.min()
+
+    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode)
+    op_inst = getCustomOp(model.graph.node[0])
+    op_inst.set_nodeattr("runtime_writeable_weights", 1)
+    op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat")
+    with open("old_weights.dat", "r") as f:
+        old_weight_stream = f.read().strip()
+    os.remove("old_weights.dat")
+    old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n"))
+    old_weight_stream = list(old_weight_stream)
+    # need to create stitched IP for runtime weight testing
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    model = model.transform(PrepareRTLSim())
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    # add two copies of the input tensor; the first is only used to
+    # "flush out" the pipeline (the layer already starts consuming old weights
+    # while we read/write new ones, and reads seem to cause a disturbance too)
+    in_tensor = np.tile(in_tensor, (2, 1))
+    exec_ctx = {"inp": in_tensor}
+    extracted_weight_stream = []
+
+    def read_weights(sim):
+        addr = 0
+        for i in range(len(old_weight_stream)):
+            extracted_weight_stream.append(
+                axilite_read(sim, addr, basename="s_axilite_0_")
+            )
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
+    assert extracted_weight_stream == old_weight_stream
+    # only use second batch element in output; first will be invalid due to
+    # old weights (see above)
+    y = exec_ctx["outp"][1]
+    expected = multithreshold(in_tensor, T)[1]
+    if act == DataType.BIPOLAR:
+        # binary to bipolar
+        expected = 2 * expected - 1
+    else:
+        # signed offset
+        expected += act.min()
+    assert (y == expected).all()
+
+    new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(
+        np.float32
+    )
+    # provide non-decreasing thresholds
+    new_weights = np.sort(new_weights, axis=1)
+    op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat")
+    with open("new_weights.dat", "r") as f:
+        new_weight_stream = f.read().strip()
+    os.remove("new_weights.dat")
+    new_weight_stream = map(lambda x: int(x, 16), new_weight_stream.split("\n"))
+    new_weight_stream = list(new_weight_stream)
+
+    def write_weights(sim):
+        addr = 0
+        for nw in new_weight_stream:
+            axilite_write(sim, addr, nw, basename="s_axilite_0_")
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
+    y = exec_ctx["outp"][1]
+    expected = multithreshold(in_tensor, new_weights)[1]
+    if act == DataType.BIPOLAR:
+        # binary to bipolar
+        expected = 2 * expected - 1
+    else:
+        # signed offset
+        expected += act.min()
+    assert (y == expected).all()