diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index a7ebff68749120868cae9ce5ac18d2856fe2cb8a..9c3bd3ac87b94f3e0ff11a2937bf5083aae614f6 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -87,7 +87,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "numInputVectors": ("ints", False, [1]),
             # memory mode for the FC weights
             # const -- embedded weights, default, long compile/synth times
-            # decoupled -- streaming weights
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
             "mem_mode": ("s", False, "const"),
             # FPGA resource type for memories in decoupled mode
             # auto -- let Vivado decide
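For context, a minimal caller-side sketch of switching a node to the new mode, assuming `model` is a FINN `ModelWrapper` whose graph contains a `StreamingFCLayer_Batch` node (the loop below is illustrative, not part of this patch):

```python
from finn.custom_op.registry import getCustomOp

for node in model.graph.node:
    if node.op_type == "StreamingFCLayer_Batch":
        inst = getCustomOp(node)
        # "const": weights embedded in the IP; "decoupled": weight streamer
        # packaged inside the IP; "external": weights arrive on an AXI
        # stream driven from outside the generated IP
        inst.set_nodeattr("mem_mode", "external")
```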
@@ -105,14 +106,14 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         node = self.onnx_node
         # set top name depending on mem_mode
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
+        if mem_mode == "const" or mem_mode == "external":
             prefixed_top_name = "%s_%s" % (node.name, node.name)
         elif mem_mode == "decoupled":
             prefixed_top_name = "%s_memstream" % (node.name)
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
         return prefixed_top_name
 
@@ -301,7 +302,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def get_weightstream_width(self):
         """Returns weight stream width. Used only in decoupled mode."""
-        if self.get_nodeattr("mem_mode") == "decoupled":
+        if self.get_nodeattr("mem_mode") in ["decoupled", "external"]:
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
             wp = self.get_weight_datatype().bitwidth()
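The width computed here is simply PE × SIMD × weight bitwidth, since each stream word carries one PE-by-SIMD tile of weights; a quick arithmetic check with illustrative folding values:

```python
# Illustrative values only: PE=4, SIMD=8, 4-bit weights
pe, simd, wp = 4, 8, 4
assert pe * simd * wp == 128  # -> a 128-bit weight stream word
```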
@@ -484,7 +488,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def generate_params(self, model, path):
         mem_mode = self.get_nodeattr("mem_mode")
-        # weights
+        code_gen_dir = path
+        # weights, needed even in external mode (for simulation)
         weights = model.get_initializer(self.onnx_node.input[1])
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
@@ -493,7 +498,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         # so use it as such for weight generation
         if self.get_weight_datatype() == DataType.BIPOLAR:
             export_wdt = DataType.BINARY
-        code_gen_dir = path
 
         if mem_mode == "const":
             """Saves weights into params.h"""
@@ -523,7 +527,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             f_weights.write(weight_hls_code)
             f_weights.close()
 
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             """Saves weights in corresponding file format for cppsim or rtlsim"""
             # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
             weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
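The transpose above swaps the PE and WMEM axes so that, per memory address, one full PE-row of SIMD-wide weights is emitted. A standalone shape check with toy dimensions (not taken from the patch):

```python
import numpy as np

pe, wmem, simd = 2, 3, 4
weight_tensor = np.arange(pe * wmem * simd).reshape(1, pe, wmem, simd)
weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
assert weight_tensor_unflipped.shape == (1, wmem, pe, simd)
```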
@@ -552,37 +556,37 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped
             )
 
-            """Saves weights into .dat file"""
-            # convert weight values into hexstring
-            weight_width = self.get_weightstream_width()
-            # pad to nearest 4 bits to get hex strings
-            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
-            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
-                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
-            )
-            weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
-            factor = math.ceil(weight_stream_len / 1024)
-            # add zeroes to pad out file to 1024 entries
-            weight_stream = weight_tensor_pe_flipped.flatten()
-            pad_amt = (factor * 1024) - weight_stream_len
-            weight_stream = np.pad(
-                weight_stream, (0, pad_amt), mode="constant", constant_values="0"
-            )
-            weight_stream = weight_stream.copy()
-            i = 0
-            j = 0
-            for val in weight_stream:
-                if i == 1024:
-                    i = 0
-                    j += 1
-                with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
-                    f.write(val + "\n")
-                i += 1
-
+            if mem_mode == "decoupled":
+                """Saves weights into .dat file"""
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
+                factor = math.ceil(weight_stream_len / 1024)
+                # add zeroes to pad out file to 1024 entries
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                pad_amt = (factor * 1024) - weight_stream_len
+                weight_stream = np.pad(
+                    weight_stream, (0, pad_amt), mode="constant", constant_values="0"
+                )
+                weight_stream = weight_stream.copy()
+                i = 0
+                j = 0
+                for val in weight_stream:
+                    if i == 1024:
+                        i = 0
+                        j += 1
+                    with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
+                        f.write(val + "\n")
+                    i += 1
         else:
             raise Exception(
-                """Please set mem_mode to "const"i or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
 
         # save thresholds in thresh.h
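The `.dat` emission that is now gated on `decoupled` pads the flattened hex stream up to a multiple of 1024 entries and writes 1024 lines per `memblock_<j>.dat` file. A self-contained sketch of that chunking, using placeholder hex strings and one write per file instead of the repeated appends above:

```python
import math

import numpy as np

weight_stream = np.array(["1f", "02", "a3"])  # stand-in hex entries
factor = math.ceil(len(weight_stream) / 1024)
pad_amt = factor * 1024 - len(weight_stream)
# zero-pad so every memblock file holds exactly 1024 lines
weight_stream = np.pad(
    weight_stream, (0, pad_amt), mode="constant", constant_values="0"
)
for j in range(factor):
    chunk = weight_stream[j * 1024 : (j + 1) * 1024]
    with open("memblock_{}.dat".format(j), "w") as f:
        f.write("\n".join(chunk) + "\n")
```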
@@ -630,6 +634,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
         # TODO ensure codegen dir exists
@@ -698,7 +703,24 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            output = self.rtlsim(sim, inp)
+            if mem_mode == "external":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType.BIPOLAR:
+                    export_wdt = DataType.BINARY
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
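For reference, the `io_dict` consumed by `rtlsim_multi_io` maps each stream name to a list of packed integer words; output lists start empty and are filled in place during simulation. A structural sketch with placeholder words standing in for the `npy_to_rtlsim_input` results:

```python
inp = [0x0, 0x1]    # placeholder packed activation words
wei = [0xFF, 0xAA]  # placeholder packed weight-stream words
io_dict = {
    "inputs": {"in0": inp, "weights": wei},
    "outputs": {"out": []},  # rtlsim_multi_io appends output words here
}
```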
@@ -729,12 +751,12 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         if mem_mode == "const":
             # self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
             pass
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
         if self.calc_tmem() != 0:
             # TODO find a better way of checking for no pregenerated thresholds
@@ -757,7 +779,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 numReps,
             )
         ]
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             self.code_gen_dict["$DEFINES$"].append(
                 "#define WP1 {}\n".format(wdt.bitwidth())
@@ -783,7 +805,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         )
 
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             elem_bits = wdt.bitwidth()
             packed_bits = self.get_weightstream_width()
@@ -807,7 +829,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
 
-        if mem_mode == "decoupled":
+        if mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$STREAMDECLARATIONS$"].append(
                 'hls::stream<ap_uint<{}>> weights ("weights");'.format(
                     self.get_weightstream_width()
@@ -835,7 +857,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     self.get_nodeattr("resType"),
                 )
             ]
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             wdt = self.get_weight_datatype()
             if wdt == DataType.BIPOLAR:
                 export_wdt = DataType.BINARY
@@ -856,8 +878,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         else:
             raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
 
     def dataoutstrm(self):
@@ -903,7 +925,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     self.get_outstream_width(),
                 )
             ]
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                 """void {}(
                     hls::stream<ap_uint<{}>> &in0,
@@ -952,7 +974,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                     "complete dim=1"
                 )
             )
-        elif mem_mode == "decoupled":
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$PRAGMAS$"].append(
                 "#pragma HLS INTERFACE axis port=weights"
             )
@@ -962,8 +984,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
         else:
             raise Exception(
-                """Please set mem_mode to "const", currently no other
-                    parameter value is supported!"""
+                """Please set mem_mode to "const", "decoupled", or external,
+                currently no other parameter value is supported!"""
             )
 
         # the threshold tensor is acc_type [PE][TMEM][N_THRES]
@@ -1092,3 +1114,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             self.set_nodeattr("ip_vlnv", vlnv)
             self.code_gen_dict.clear()
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "external":
+            intf_names["s_axis"] = ["in0_V_V", "weights_V_V"]
+        return intf_names
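A hypothetical caller-side sketch of the new override, e.g. from a stitching or driver-generation step; `node` is assumed to be a `StreamingFCLayer_Batch` ONNX node and the loop is illustrative:

```python
from finn.custom_op.registry import getCustomOp

intf_names = getCustomOp(node).get_verilog_top_module_intf_names()
for s_axis in intf_names["s_axis"]:
    # in external mode this yields ["in0_V_V", "weights_V_V"]; the weights
    # stream must be driven by a weight streamer outside the generated IP
    print("slave AXI stream to connect:", s_axis)
```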
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index fc5cdb7745945bee99564ba9ab19423a66d8e035..952d994076fc4da7e7f763d9f0fe303d8da0ff11 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -134,7 +134,7 @@ def prepare_inputs(input_tensor, idt, wdt):
 
 
-# mem_mode: const or decoupled
+# mem_mode: const, decoupled or external
-@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
@@ -221,7 +221,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 
 
-# mem_mode: const or decoupled
+# mem_mode: const, decoupled or external
-@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
 @pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
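The extra parametrize value multiplies the existing test grid by one more mem_mode case; a minimal standalone illustration of the mechanism:

```python
import pytest

@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
def test_mem_mode_is_known(mem_mode):
    # pytest generates one test case per listed mem_mode value
    assert mem_mode in {"const", "decoupled", "external"}
```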