diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 7784024aae102989338df9b040fcfc1f9dc36983..692f67f715857e4455f33cb88c618dbfb2b2e356 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -40,7 +40,10 @@ except ModuleNotFoundError:
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
-from finn.util.basic import interleave_matrix_outer_dim_from_partitions
+from finn.util.basic import (
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
@@ -513,20 +516,37 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 weight_tensor_unflipped, export_wdt, weight_width, prefix=""
             )
             weight_stream_len = np.prod(weight_tensor_unflipped.shape)
-            assert (
-                weight_stream_len <= 1024
-            ), """Decoupled mem mode needs
-            weight stream length <= 1024 for now"""
+            factor = math.ceil(weight_stream_len / 1024)
             # add zeroes to pad out file to 1024 entries
             weight_stream = weight_tensor_unflipped.flatten()
-            pad_amt = 1024 - weight_stream_len
+            pad_amt = (factor * 1024) - weight_stream_len
             weight_stream = np.pad(
                 weight_stream, (0, pad_amt), mode="constant", constant_values="0"
             )
             weight_stream = weight_stream.copy()
-            with open("{}/memblock_0.dat".format(code_gen_dir), "w+") as f:
-                for val in weight_stream:
+            i = 0
+            j = 0
+            for val in weight_stream:
+                if i == 1024:
+                    i = 0
+                    j += 1
+                with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
                     f.write(val + "\n")
+                i += 1
+	    #assert (
+            #    weight_stream_len <= 1024
+            #), """Decoupled mem mode needs
+            #weight stream length <= 1024 for now"""
+            # add zeroes to pad out file to 1024 entries
+            #weight_stream = weight_tensor_unflipped.flatten()
+            #pad_amt = 1024 - weight_stream_len
+            #weight_stream = np.pad(
+            #    weight_stream, (0, pad_amt), mode="constant", constant_values="0"
+            #)
+            #weight_stream = weight_stream.copy()
+            #with open("{}/memblock_0.dat".format(code_gen_dir), "w+") as f:
+            #    for val in weight_stream:
+            #        f.write(val + "\n")
 
         else:
             raise Exception(
@@ -997,7 +1017,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
             mw = self.get_nodeattr("MW")
             mh = self.get_nodeattr("MH")
-            self.code_gen_dict["$WEIGHT_DEPTH$"] = [str(int(mw * mh))]
+            depth = int(mw * mh)
+            self.code_gen_dict["$WEIGHT_DEPTH$"] = [str(depth)]
+            self.code_gen_dict["$MEM_DEPTH$"] = [str(roundup_to_integer_multiple(depth, 1024))]
 
             template = self.decoupled_wrapper
 
@@ -1034,10 +1056,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 if file.endswith(".v"):
                     verilog_file = os.path.join(memstream_dir, file)
                     copy(verilog_file, verilog_folder)
-            # copy .dat file of weights
-            dat_file = "{}/memblock_0.dat".format(code_gen_dir)
-            copy(dat_file, verilog_folder)
-            # copy verilog wrapper
+            # copy .dat files of weights
+            for file in os.listdir(code_gen_dir):
+                if file.endswith(".dat"):
+                    dat_file = os.path.join(code_gen_dir, file)
+                    copy(dat_file, verilog_folder) 
+	    # copy verilog wrapper
             verilog_wrapper = "{}/{}_memstream.v".format(
                 code_gen_dir, self.onnx_node.name
             )