diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
index 7b207fbd6db7c9d985ba3ed50d7fcd97612e07f5..4ddb7c6f1c62a015e0468cb651f1f4b19944d172 100644
--- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
+++ b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
@@ -37,7 +37,7 @@ module ramb18_wf_dualport
 )
 (
 	input clk,
-	
+
 	input wea,
 	input [AWIDTH-1:0] addra,
 	input [DWIDTH-1:0] wdataa,
@@ -53,7 +53,7 @@ module ramb18_wf_dualport
 reg [DWIDTH-1:0] rdataa;
 reg [DWIDTH-1:0] rdatab;
 
-reg [7:0] idx = ID;
+reg [15:0] idx;
 //initialize memory
 initial begin
     //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
@@ -63,10 +63,11 @@ initial begin
 	    $finish();
     end
 	//MEM_INIT path must be terminated by /
-	if (ID < 10)
-		$readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, 1023);
+	$sformat(idx,"%0d",ID);
+	if (ID < 10)
+		$readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, 1023);
 	else
-		$readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, 1023);
+		$readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, 1023);
 end
 
 //memory ports, with output pipeline register
@@ -83,4 +84,4 @@ always @(posedge clk) begin
     rdqb <= rdatab;
 end
 
-endmodule
\ No newline at end of file
+endmodule
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 7784024aae102989338df9b040fcfc1f9dc36983..e0c3348cf9926e8a7272688cca29ab18cbf3ad16 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -40,7 +40,10 @@ except ModuleNotFoundError:
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.basic import (
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
@@ -513,20 +516,23 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 weight_tensor_unflipped, export_wdt, weight_width, prefix=""
             )
             weight_stream_len = np.prod(weight_tensor_unflipped.shape)
-            assert (
-                weight_stream_len <= 1024
-            ), """Decoupled mem mode needs
-            weight stream length <= 1024 for now"""
+            factor = math.ceil(weight_stream_len / 1024)
             # add zeroes to pad out file to 1024 entries
             weight_stream = weight_tensor_unflipped.flatten()
-            pad_amt = 1024 - weight_stream_len
+            pad_amt = (factor * 1024) - weight_stream_len
             weight_stream = np.pad(
                 weight_stream, (0, pad_amt), mode="constant", constant_values="0"
             )
             weight_stream = weight_stream.copy()
-            with open("{}/memblock_0.dat".format(code_gen_dir), "w+") as f:
-                for val in weight_stream:
+            # write the padded weight stream out in chunks of 1024 entries,
+            # one memblock_<j>.dat file per chunk; open with "w+" (not "a+")
+            # so a re-run of code generation overwrites any stale .dat files
+            # instead of appending duplicate entries to them
+            for j in range(factor):
+                val = "\n".join(weight_stream[j * 1024 : (j + 1) * 1024])
+                with open("{}/memblock_{}.dat".format(code_gen_dir, j), "w+") as f:
                     f.write(val + "\n")
+            # padding above guarantees each file holds exactly 1024 lines
 
         else:
             raise Exception(
@@ -997,7 +1003,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
             mw = self.get_nodeattr("MW")
             mh = self.get_nodeattr("MH")
-            self.code_gen_dict["$WEIGHT_DEPTH$"] = [str(int(mw * mh))]
+            depth = int(mw * mh)
+            self.code_gen_dict["$WEIGHT_DEPTH$"] = [str(depth)]
+            self.code_gen_dict["$MEM_DEPTH$"] = [
+                str(roundup_to_integer_multiple(depth, 1024))
+            ]
 
             template = self.decoupled_wrapper
 
@@ -1034,9 +1044,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 if file.endswith(".v"):
                     verilog_file = os.path.join(memstream_dir, file)
                     copy(verilog_file, verilog_folder)
-            # copy .dat file of weights
-            dat_file = "{}/memblock_0.dat".format(code_gen_dir)
-            copy(dat_file, verilog_folder)
+            # copy .dat files of weights
+            for file in os.listdir(code_gen_dir):
+                if file.endswith(".dat"):
+                    dat_file = os.path.join(code_gen_dir, file)
+                    copy(dat_file, verilog_folder)
             # copy verilog wrapper
             verilog_wrapper = "{}/{}_memstream.v".format(
                 code_gen_dir, self.onnx_node.name
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 5323aac2e344fb8b3c1166e695753e68a435b08f..3e5205d9e8fc1abd5938f8a3dc4df489f81b9eb7 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -183,7 +183,7 @@ memstream
 // memory, set per-stream offsets in memory, set per-stream widths
 .CONFIG_EN(1),
 .NSTREAMS(1),
-.MEM_DEPTH(1024),
+.MEM_DEPTH($MEM_DEPTH$),
 .MEM_WIDTH($WEIGHT_WIDTH$),
 .MEM_INIT("./"),
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 80c9e84ba92c93e8a5d57ffaceb22b5abf188963..330d9a1aedcd6607ae0c150b0ea9ef439afcc1df 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -300,3 +300,90 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
     assert "StreamingFCLayer_Batch_0" in hls_synt_res_est
+
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["decoupled"])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType.INT4])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType.INT4])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT4])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [-1])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [128])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [128])
+def test_fpgadataflow_fclayer_large_depth_decoupled_mode(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, mw))
+    if act is None:
+        # no activation, produce accumulators
+        T = None
+        tdt = None
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            odt = DataType.UINT32
+        else:
+            odt = DataType.INT32
+    else:
+        odt = act
+        (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+        n_steps = act.get_num_possible_values() - 1
+        T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
+        # provide non-decreasing thresholds
+        T = np.sort(T, axis=1)
+        # generate thresholds for activation
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            tdt = DataType.UINT32
+            # bias thresholds to be positive
+            T = np.ceil((T + mw) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType.INT32
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
+
+    # prepare input data
+    input_dict = prepare_inputs(x, idt, wdt)
+    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+        # convert inputs to binary and use xnorpopcountmatmul
+        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
+    else:
+        y = np.matmul(x, W)
+    if T is not None:
+        y = multithreshold(y, T)
+        if act == DataType.BIPOLAR:
+            # binary to bipolar
+            y = 2 * y - 1
+        else:
+            # signed offset
+            y += act.min()
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+    # TODO split up into several dependent tests -- need to check how this
+    # works for parametrized tests...
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynth_IPGen())
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed"
+
+    hls_synt_res_est = model.analysis(hls_synth_res_estimation)
+    assert "StreamingFCLayer_Batch_0" in hls_synt_res_est