diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 05e41a48a8f4cb34616bf06c01b652afb9ae4257..38940ccb94f11fe49af5f49ee020f150326a026c 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -63,7 +63,7 @@ class AddStreams_Batch(HLSCustomOp):
         ishape = tuple(vecs + [ich])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         assert ich % pe == 0, "PE must divide NumChannels"
@@ -362,5 +362,5 @@ class AddStreams_Batch(HLSCustomOp):
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["s_axis"] = ["in0_V_V", "in1_V_V"]
+        intf_names["s_axis"].append(("in1_V_V", self.get_instream_width_padded()))
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 10a8051730217b56873b5a53c0803e3b90dada90..73da77bd3f940cee5ffd10fcfc43571f1a612eb4 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -312,7 +312,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = [
             """DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format(
-                self.get_outstream_width(), self.get_number_output_values() // 2,
+                self.get_outstream_width(),
+                self.get_number_output_values() // 2,
             )
         ]
 
@@ -378,5 +379,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["m_axis"] = ["out0_V_V", "out1_V_V"]
+        intf_names["m_axis"] = [
+            ("out0_V_V", self.get_outstream_width_padded()),
+            ("out1_V_V", self.get_outstream_width_padded()),
+        ]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 02912b2d5f45b3bab0eaca13ee0a0bf19bf9cfca..39069e4c157f37ea65acf7c7b3da7a78e1ab2d0e 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -130,8 +130,8 @@ class HLSCustomOp(CustomOp):
         intf_names = {}
         intf_names["clk"] = ["ap_clk"]
         intf_names["rst"] = ["ap_rst_n"]
-        intf_names["s_axis"] = ["in0_V_V"]
-        intf_names["m_axis"] = ["out_V_V"]
+        intf_names["s_axis"] = [("in0_V_V", self.get_instream_width_padded())]
+        intf_names["m_axis"] = [("out_V_V", self.get_outstream_width_padded())]
         intf_names["aximm"] = []
         intf_names["axilite"] = []
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index a6cddcc4aeb45957c16249cd57f122fe5e58b85a..857496a2614894588ebf065db3e384cf2cecf106 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -355,11 +355,9 @@ class IODMA(HLSCustomOp):
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
         if self.get_nodeattr("direction") == "out":
-            intf_names["s_axis"] = ["in0_V_V"]
             intf_names["m_axis"] = []
         else:
             intf_names["s_axis"] = []
-            intf_names["m_axis"] = ["out_V_V"]
         intf_names["axilite"] = ["s_axi_control"]
-        intf_names["aximm"] = ["m_axi_gmem"]
+        intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("intfWidth"))]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 23c1779a27c123583c0c8af5f53d022d03e78126..4d84b74dce001fca769ed2850a8f718ac942f14c 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -395,8 +395,8 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 8868002c9e2cb8726eeb573e104140e3e1a61d27..3cc01ade73fc6b735509f2839e5c10785a8b9f54 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -444,12 +444,24 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         single_pe_w = simd * weight_bits
         return max([weightstream, max_of_io, single_pe_w])
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
         simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
         sf = mw // simd
+        nf = mh // pe
         vecs = list(self.get_nodeattr("numInputVectors"))
-        folded_input_shape = tuple(vecs + [sf, simd])
+
+        if ind == 0:
+            # calculate shape of input 0
+            folded_input_shape = tuple(vecs + [sf, simd])
+        elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # calculate shape of input 1 (weights)
+            folded_input_shape = tuple(vecs + [sf * nf, simd * pe])
+        else:
+            raise Exception("Undefined input shape for requested input")
+
         return folded_input_shape
 
     def get_folded_output_shape(self):
@@ -1253,8 +1265,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
@@ -1348,8 +1360,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 # TODO calculate and pass in segment size here
                 cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
-        elif mem_mode == "const":
-            # base class impl sufficient for const mode
+        elif mem_mode == "const" or mem_mode == "external":
+            # base class impl sufficient for const/external modes
             return super().code_generation_ipi()
         else:
             raise Exception("Unrecognized mem_mode for StreamingFCLayer")
@@ -1359,7 +1371,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         intf_names = super().get_verilog_top_module_intf_names()
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "external":
-            intf_names["s_axis"] = ["in0_V_V", "weights_V_V"]
+            intf_names["s_axis"].append(
+                ("weights_V_V", self.get_weightstream_width_padded())
+            )
         if mem_mode == "decoupled":
             # only expose axilite interface if attribute is set
             runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index fb41bceca09fe544bd729537b1af726c9c43d290..133a869b28cf9968a719e243a3266dfb25b637ba 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -343,8 +343,8 @@ class StreamingFIFO(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py
index 9ec03ea5dd726b49b157a92addef05f85f02b644..6700019a4a430d785967a684ad1ca8d186d32bae 100644
--- a/src/finn/qnn-data/templates/driver/driver_base.py
+++ b/src/finn/qnn-data/templates/driver/driver_base.py
@@ -36,6 +36,7 @@ from finn.util.data_packing import (
     finnpy_to_packed_bytearray,
     packed_bytearray_to_finnpy,
 )
+from warnings import warn
 
 from finn.util.basic import gen_finn_dt_tensor
 
@@ -86,11 +87,17 @@ class FINNExampleOverlay(Overlay):
         self.batch_size = batch_size
         self.fclk_mhz = fclk_mhz
         if self.platform == "alveo":
-            self.idma = self.idma0
+            if "input_dma_name" in io_shape_dict.keys():
+                self.idma = getattr(self, io_shape_dict["input_dma_name"])
+            else:
+                self.idma = self.idma0
             self.odma = self.odma0
             self.odma_handle = None
         elif self.platform == "zynq-iodma":
-            self.idma = self.idma0
+            if "input_dma_name" in io_shape_dict.keys():
+                self.idma = getattr(self, io_shape_dict["input_dma_name"])
+            else:
+                self.idma = self.idma0
             self.odma = self.odma0
             # set the clock frequency as specified by user during transformations
             if self.fclk_mhz > 0:
@@ -98,8 +105,65 @@ class FINNExampleOverlay(Overlay):
         else:
             raise ValueError("Supported platforms are zynq-iodma alveo")
         # load any runtime weights
+        self.external_weights = []
+        self.load_external_weights()
         self.load_runtime_weights()
 
+    def load_external_weights(self):
+        """Load any existing external weights from the specified dir into
+        device buffers, one per weight DMA engine of the accelerator. The
+        directory is specified as the class member ``runtime_weight_dir``.
+
+        Every ``.npy`` file found below that directory is considered; the
+        file name up to its first dot names the DMA and files whose name is
+        not in ``ip_dict`` are skipped. Each tensor is copied into a uint8
+        buffer, synced to the device and stored with its DMA in
+        ``external_weights``. Non-Alveo platforms only get a warning.
+
+        Raises an AssertionError if ``number_of_external_weights`` is given
+        in ``_io_shape_dict`` and differs from the number of loaded buffers.
+        """
+
+        w_filenames = []
+        if not os.path.isdir(self.runtime_weight_dir):
+            return
+        for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir):
+            # os.walk descends into subdirectories: keep each file's full path
+            w_filenames.extend(os.path.join(dirpath, f) for f in filenames)
+
+        tmp_weight_dict = {}
+        for w_filename in w_filenames:
+            if w_filename.endswith(".npy"):
+                weight_tensor = np.load(w_filename)
+            else:
+                continue
+
+            idma_name = os.path.basename(w_filename).split(".")[0]
+            tmp_weight_dict[idma_name] = weight_tensor
+
+        if self.platform != "alveo" and len(tmp_weight_dict) > 0:
+            # TODO: add Zynq support; the PYNQ API is different there
+            warn("external_weights are not yet supported for non-Alveo builds")
+            return
+
+        for idma_name in tmp_weight_dict.keys():
+            if idma_name in self.ip_dict.keys():
+                iwdma = getattr(self, idma_name)
+                weight_tensor = tmp_weight_dict[idma_name]
+                weight_buf = allocate(weight_tensor.shape, dtype=np.uint8)
+                weight_buf[:] = weight_tensor
+                weight_buf.sync_to_device()
+
+                self.external_weights += [(iwdma, weight_buf)]
+
+        if "number_of_external_weights" in self._io_shape_dict:
+            hw_ext_weights = self._io_shape_dict["number_of_external_weights"]
+            assert len(self.external_weights) == hw_ext_weights, (
+                "Number of hardware external weights and number of external "
+                + "weight tensors available do not match. \n"
+                + "Is runtime_weight_dir pointing to the correct folder?"
+            )
+
     def load_runtime_weights(self, flush_accel=True, verify=True):
         """Load any existing runtime weights from the specified dir into the
         appropriate layer of the accelerator. Note that this must be enabled
@@ -124,18 +188,25 @@ class FINNExampleOverlay(Overlay):
             if w_filename.endswith(".dat"):
                 with open(self.runtime_weight_dir + "/" + w_filename, "r") as f:
                     dat = f.read()
+            else:
+                continue
             layer_w = np.fromiter(
                 [int(x, 16) for x in dat.strip().split()], dtype=np.uint32
             )
-            layer_ind = int(w_filename.split("_")[0])
-            rt_weight_dict[layer_ind] = layer_w
-        for layer_ind in rt_weight_dict.keys():
-            cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind
+            sdp_ind = int(w_filename.split("_")[0])
+            layer_ind = int(w_filename.split("_")[1])
+            rt_weight_dict[(sdp_ind, layer_ind)] = layer_w
+        for sdp_ind, layer_ind in rt_weight_dict.keys():
+            cand_if_name = "StreamingDataflowPartition_%d/s_axilite_%d" % (
+                sdp_ind,
+                layer_ind,
+            )
             if cand_if_name in self.ip_dict.keys():
                 layer_mmio = getattr(
-                    self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind
+                    getattr(self, "StreamingDataflowPartition_%d" % sdp_ind),
+                    "s_axilite_%d" % layer_ind,
                 ).mmio
-                layer_w = rt_weight_dict[layer_ind]
+                layer_w = rt_weight_dict[(sdp_ind, layer_ind)]
                 layer_mmio.write_mm(0, layer_w.tobytes())
                 if verify:
                     new_w = np.copy(layer_mmio.array[: layer_w.shape[0]])
@@ -289,6 +360,8 @@ class FINNExampleOverlay(Overlay):
         elif self.platform == "alveo":
             assert self.odma_handle is None, "Output DMA is already running"
             self.idma.start(self.ibuf_packed_device, batch_size)
+            for iwdma, iwbuf in self.external_weights:
+                iwdma.start(iwbuf, batch_size)
             self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size)
         else:
             raise Exception("Unrecognized platform: %s" % self.platform)
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 19fa5c603bfafe16ed151e10fa8eb11a79106ede..aed5792a63ff95803b4d7ccc80cf2c94ac732ad7 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -29,6 +29,7 @@
 import os
 import warnings
 import subprocess
+import json
 
 from finn.transformation.base import Transformation
 from finn.util.basic import get_by_name, make_build_dir, is_finn_op
@@ -40,6 +41,31 @@ from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
 )
 
 
+def is_external_input(model, node, i):
+    # Tell whether input i of node has to become an external interface.
+    # That is the case when nothing in the graph produces the tensor and it
+    # either has no initializer or belongs to an FC layer in external mem_mode.
+    node_inst = getCustomOp(node)
+    tensor_name = node.input[i]
+    if model.find_producer(tensor_name) is not None:
+        return False
+    if model.get_initializer(tensor_name) is None:
+        return True
+    # initialized tensor: external only for FC layers in external mem_mode
+    if node.op_type != "StreamingFCLayer_Batch":
+        return False
+    return node_inst.get_nodeattr("mem_mode") == "external"
+
+
+def is_external_output(model, node, i):
+    # Indicate whether output i of node should be made external: True only
+    # if the output is unconnected. find_consumers is treated as reporting
+    # "no consumers" either as None or as an empty list, so test truthiness
+    # instead of identity with None.
+    consumers = model.find_consumers(node.output[i])
+    return not consumers
+
+
 class CreateStitchedIP(Transformation):
     """Create a Vivado IP Block Design project from all the generated IPs of a
     graph. All nodes in the graph must have the fpgadataflow backend attribute,
@@ -134,21 +160,24 @@ class CreateStitchedIP(Transformation):
         if len(aximm_intf_name) != 0:
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
-                % (inst_name, aximm_intf_name[0])
+                % (inst_name, aximm_intf_name[0][0])
             )
             self.connect_cmds.append(
                 "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]"
             )
-            self.intf_names["aximm"] = ["m_axi_gmem0"]
+            self.intf_names["aximm"] = [("m_axi_gmem0", aximm_intf_name[0][1])]
             assert self.has_aximm is False, "Currently limited to one AXI-MM interface"
             self.has_aximm = True
 
-    def connect_m_axis_external(self, node):
+    def connect_m_axis_external(self, node, idx=None):
         inst_name = node.name
         node_inst = getCustomOp(node)
         output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"]
         # make output axis external
-        for output_intf_name in output_intf_names:
+        for i in range(len(output_intf_names)):
+            if idx is not None and idx != i:
+                continue
+            output_intf_name = output_intf_names[i][0]
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
                 % (inst_name, output_intf_name)
@@ -158,15 +187,20 @@ class CreateStitchedIP(Transformation):
                 % (self.m_axis_idx, output_intf_name)
             )
             self.has_m_axis = True
-            self.intf_names["m_axis"].append("m_axis_%d" % self.m_axis_idx)
+            self.intf_names["m_axis"].append(
+                ("m_axis_%d" % self.m_axis_idx, output_intf_names[i][1])
+            )
             self.m_axis_idx += 1
 
-    def connect_s_axis_external(self, node):
+    def connect_s_axis_external(self, node, idx=None):
         inst_name = node.name
         node_inst = getCustomOp(node)
         input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"]
         # make input axis external
-        for input_intf_name in input_intf_names:
+        for i in range(len(input_intf_names)):
+            if idx is not None and idx != i:
+                continue
+            input_intf_name = input_intf_names[i][0]
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
                 % (inst_name, input_intf_name)
@@ -176,7 +210,9 @@ class CreateStitchedIP(Transformation):
                 % (self.s_axis_idx, input_intf_name)
             )
             self.has_s_axis = True
-            self.intf_names["s_axis"].append("s_axis_%d" % self.s_axis_idx)
+            self.intf_names["s_axis"].append(
+                ("s_axis_%d" % self.s_axis_idx, input_intf_names[i][1])
+            )
             self.s_axis_idx += 1
 
     def apply(self, model):
@@ -200,57 +236,30 @@ class CreateStitchedIP(Transformation):
             assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist."
             ip_dirs += [ip_dir_value]
             self.create_cmds += node_inst.code_generation_ipi()
-            my_producer = model.find_producer(node.input[0])
             self.connect_clk_rst(node)
             self.connect_axi(node)
-            if my_producer is None:
-                # first node in graph
-                self.connect_s_axis_external(node)
-                if node.op_type == "TLastMarker":
-                    assert (
-                        node_inst.get_nodeattr("Direction") == "in"
-                    ), """Output TLastMarker incorrect direction"""
-                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
-                    # don't apply this check for a 1-node partition
-                    assert (
-                        node_inst.get_nodeattr("direction") == "in"
-                    ), """Input DMA incorrect direction"""
-            else:
-                # intermediate node
-                # wire up input(s) to previous node output(s)
-                # foreach input
-                #     find producer
-                #     find index of producer output connected to our target input
-                #     get names of hdl interfaces for input and producer output
-                #     issue a TCL directive to connect input to output
-                #     if FC layer with mode "decoupled", add a streamer on input 1
-                for i in range(len(node.input)):
+            for i in range(len(node.input)):
+                if is_external_input(model, node, i):
+                    self.connect_s_axis_external(node, idx=i)
+                else:
                     producer = model.find_producer(node.input[i])
                     if producer is None:
                         continue
                     j = list(producer.output).index(node.input[i])
                     src_intf_name = getCustomOp(
                         producer
-                    ).get_verilog_top_module_intf_names()["m_axis"][j]
+                    ).get_verilog_top_module_intf_names()["m_axis"][j][0]
                     dst_intf_name = node_inst.get_verilog_top_module_intf_names()[
                         "s_axis"
-                    ][i]
+                    ][i][0]
                     self.connect_cmds.append(
                         "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                         "[get_bd_intf_pins %s/%s]"
                         % (producer.name, src_intf_name, node.name, dst_intf_name)
                     )
-            if model.find_consumers(node.output[0]) is None:
-                # last node in graph
-                self.connect_m_axis_external(node)
-                if node.op_type == "TLastMarker":
-                    assert (
-                        node_inst.get_nodeattr("Direction") == "out"
-                    ), """Output TLastMarker incorrect direction"""
-                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
-                    assert (
-                        node_inst.get_nodeattr("direction") == "out"
-                    ), """Output DMA incorrect direction"""
+            for i in range(len(node.output)):
+                if is_external_output(model, node, i):
+                    self.connect_m_axis_external(node, idx=i)
 
         # create a temporary folder for the project
         prjname = "finn_vivado_stitch_proj"
@@ -316,7 +325,7 @@ class CreateStitchedIP(Transformation):
         block_library = "finn"
         block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name)
         model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv)
-        model.set_metadata_prop("vivado_stitch_ifnames", str(self.intf_names))
+        model.set_metadata_prop("vivado_stitch_ifnames", json.dumps(self.intf_names))
         tcl.append(
             (
                 "ipx::package_project -root_dir %s/ip -vendor %s "
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 0f2b8ef6a4c0858cd98218538930c97c6df2ad9d..e7bf29da36e9978911c5bfc64665dba4d2edca4e 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -59,7 +59,20 @@ class InsertDWC(Transformation):
                         n0 = getCustomOp(n)
                         n1 = getCustomOp(consumer)
                         n0_out_shape = n0.get_folded_output_shape()
-                        n1_in_shape = n1.get_folded_input_shape()
+
+                        # If FC and external mem, it could be connected to input 1
+                        if (consumer.op_type == "StreamingFCLayer_Batch" and 
+                            n1.get_nodeattr("mem_mode") == "external"):
+                            # get input idx
+                            in_idx = None
+                            for idx, n_input in enumerate(consumer.input):
+                                if n_output == n_input:
+                                    in_idx = idx
+                            assert in_idx is not None,"Malformed model"
+                            n1_in_shape = n1.get_folded_input_shape(in_idx)
+                        else:
+                            n1_in_shape = n1.get_folded_input_shape()
+
                         if n0_out_shape[-1] != n1_in_shape[-1]:
                             graph_modified = True
                             # determine dwc inwidth
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 67143547557a9b24b311e69cff6f885f8745cd3c..ebd7cbe0276d3e9b4128275b0a65b1a9a40d1f80 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -48,6 +48,39 @@ class InsertIODMA(Transformation):
         ), "max_intfwidth must be a power of 2"
         self.max_intfwidth = max_intfwidth
 
+    def get_mem_init(self, weights, pe, simd):
+        """Return the weight matrix rearranged into one row per memory word,
+        ready for pack_innermost_dim_as_hex_string (finn.util.data_packing)
+        with reverse=False, i.e. little-endian packing of the init file.
+
+        weights is a 2D (inp_w, out_w) array; out_w must be divisible by pe
+        and inp_w by simd. The result has shape (-1, pe * simd), laid out
+        as elem(pe,simd):
+        addr = 0: [(pe-1,simd-1),(pe-1,simd-2),...(0,1),(0,0)]
+        addr = 1: [(pe-1,simd*2-1),.......(0,simd+1),(0,simd)]
+        """
+        w_shape = weights.shape
+        assert len(w_shape) == 2, "weights with incorrect number of dims"
+        inp_w, out_w = w_shape
+
+        assert out_w % pe == 0, "Malformed weight matrix"
+        assert inp_w % simd == 0, "Malformed weight matrix"
+        reshaped_w = np.zeros(inp_w * out_w).reshape(-1, pe * simd)
+
+        addr = 0
+        for fr in range(out_w // pe):
+            for fc in range(inp_w // simd):
+                tile = weights[
+                    (fc * simd) : ((fc + 1) * simd), (fr * pe) : ((fr + 1) * pe)
+                ]
+                # PE-major row layout: column p of the tile (the SIMD weights
+                # of PE p) fills slots [p*simd, (p+1)*simd), which is exactly
+                # the row-major flattening of the transposed tile
+                reshaped_w[addr] = tile.T.reshape(-1)
+                addr += 1
+        reshaped_w = np.flip(reshaped_w, axis=-1)
+        return reshaped_w
+
     def apply(self, model):
         # only makes sense for a pure fpgadataflow graph -- so we check!
         all_nodes = list(model.graph.node)
@@ -78,11 +111,6 @@ class InsertIODMA(Transformation):
             return (model, False)
         else:
             if final_node.op_type != "IODMA":
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(graph_out_name) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_out_name) == DataLayout.NC
-                ), "Data layout of output tensor must be NHWC or NC"
                 out_shape = model.get_tensor_shape(graph_out_name)
                 out_dtype = model.get_tensor_datatype(graph_out_name)
                 final_node_inst = getCustomOp(final_node)
@@ -123,11 +151,6 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.node.append(dma_node)
             if first_node.op_type != "IODMA":
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(graph_in_name) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
-                ), "Data layout of input tensor must be NHWC or NC"
                 in_shape = model.get_tensor_shape(graph_in_name)
                 in_dtype = model.get_tensor_datatype(graph_in_name)
                 first_node_inst = getCustomOp(first_node)
@@ -168,11 +191,6 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.node.insert(0, dma_node)
             for fc_node in fc_extw_nodes:
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(fc_node.input[1]) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
-                ), "Data layout of tensors must be NHWC or NC"
                 fc_w_name = fc_node.input[1]
                 w_shape = model.get_tensor_shape(fc_w_name)
                 w_dtype = model.get_tensor_datatype(fc_w_name)
@@ -185,21 +203,24 @@ class InsertIODMA(Transformation):
                 # calculate width of stream output from DMA
                 pe = get_by_name(fc_node.attribute, "PE").i
                 simd = get_by_name(fc_node.attribute, "SIMD").i
-                assert pe * simd == w_shape[0], "Malformed weight matrix"
                 streamWidth = simd * pe * w_dtype.bitwidth()
                 # make new buffer
+                W = model.get_initializer(fc_w_name)
+                iodma_mem = self.get_mem_init(W, pe, simd)
+                model.set_initializer(fc_w_name, iodma_mem)
+
                 fc_node_in = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
                 )
                 model.graph.value_info.append(fc_node_in)
                 model.set_tensor_datatype(fc_node_in.name, w_dtype)
-                model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name))
+                model.set_initializer(fc_node_in.name, W)
                 dma_node = oh.make_node(
                     "IODMA",
                     [fc_w_name],
                     [fc_node_in.name],
-                    numInputVectors=[w_shape[1]],
-                    NumChannels=w_shape[0],
+                    numInputVectors=[iodma_mem.shape[0]],
+                    NumChannels=pe * simd,
                     dataType=str(w_dtype.name),
                     intfWidth=intfwidth,
                     streamWidth=streamWidth,
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 42f18d9a812d2db2119351dabfbb38e68c33194e..f75ef766dc939f6b8660825203e30ff3904cf5ea 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -38,12 +38,38 @@ import warnings
 import pkg_resources as pk
 from . import template_driver
 from finn.core.modelwrapper import ModelWrapper
+import numpy as np
+
+from finn.util.data_packing import (
+    pack_innermost_dim_as_hex_string,
+    hexstring2npbytearray,
+)
+from finn.util.basic import roundup_to_integer_multiple
+
+
+def to_external_tensor(init, w_dtype):
+    """Return a flat numpy byte array holding the packed contents of the
+    external parameter tensor init, whose elements have datatype w_dtype."""
+    # bits per row; indexing shape[1] means init is assumed to be 2-D here
+    weight_width = init.shape[1] * w_dtype.bitwidth()
+    weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+    hex_init = pack_innermost_dim_as_hex_string(
+        init, w_dtype, weight_width_padded, prefix="0x"
+    )
+    ext_weight = np.array([], dtype=np.uint8)
+    for line in hex_init:  # bytes of each entry are appended in reversed order
+        array_line = [
+            x for x in reversed(hexstring2npbytearray(line, remove_prefix="0x"))
+        ]
+        ext_weight = np.append(ext_weight, array_line)
+    # NOTE(review): np.append copies ext_weight per row; consider np.concatenate
+    return ext_weight
 
 
 class MakePYNQDriver(Transformation):
     """Create PYNQ Python code to correctly interface the generated
     accelerator, including data packing/unpacking. Should be called
-    after conversion to HLS layers and folding, but prior to the creation of
+    after conversion to HLS layers, folding and the creation of
     dataflow partitions for correct operation.
 
     platform: one of ["zynq-iodma", "alveo"]
@@ -123,6 +149,40 @@ class MakePYNQDriver(Transformation):
         i_tensor_shape_packed = i_tensor_dummy_packed.shape
         o_tensor_shape_packed = o_tensor_dummy_packed.shape
 
+        # generate external weights npy files
+        weights_dir = pynq_driver_dir + "/runtime_weights"
+
+        os.makedirs(weights_dir)
+        idma_idx = 0
+        ext_weight_dma_cnt = 0
+
+        for node in model.graph.node:
+            assert (
+                node.op_type == "StreamingDataflowPartition"
+            ), "CreateDataflowPartition needs to be applied before driver generation"
+
+            producer = model.find_producer(node.input[0])
+            init_tensor = model.get_initializer(node.input[0])
+
+            if producer is None:  # input dma?
+                idma_name = "idma" + str(idma_idx)
+                if init_tensor is not None:  # input weights dma?
+                    ext_weight_dma_cnt += 1
+                    w_dtype = model.get_tensor_datatype(node.input[0])
+                    init_external_tensor = to_external_tensor(init_tensor, w_dtype)
+                    np.save(
+                        weights_dir + "/" + idma_name + ".npy", init_external_tensor
+                    )
+                    if self.platform != "alveo":
+                        # TODO: add support in driver_base.py
+                        warnings.warn(
+                            "external_weights not yet supported for Zynq builds"
+                        )
+                else:
+                    net_input_name = idma_name
+
+                idma_idx += 1
+
         # fill in the driver template
         driver_py = pynq_driver_dir + "/driver.py"
         driver = template_driver.pynq_driver_template
@@ -146,6 +206,8 @@ class MakePYNQDriver(Transformation):
         driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
         driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
+        driver = driver.replace("$INPUT_DMA_NAME$", "'%s'" % net_input_name)
+        driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt))
 
         with open(driver_py, "w") as f:
             f.write(driver)
@@ -172,25 +234,35 @@ class MakePYNQDriver(Transformation):
         shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core")
 
         # generate weight files for runtime-writable layers
-        weights_dir = pynq_driver_dir + "/runtime_weights"
-        rt_layer_ind = 0
-        os.makedirs(weights_dir)
-        for node in model.graph.node:
-            if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
-                node_inst = getCustomOp(node)
-                is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
-                if is_rt_weights == 1:
-                    fcl_w = model.get_initializer(node.input[1])
-                    w_filename = weights_dir + "/%d_%s.dat" % (rt_layer_ind, node.name)
-                    node_inst.make_weight_file(fcl_w, "decoupled_runtime", w_filename)
-                    rt_layer_ind += 1
-            elif node.op_type == "StreamingDataflowPartition":
-                warnings.warn(
-                    """Please call MakePYNQDriver prior to
-                CreateDataflowPartition. Can only extract runtime-writable
-                weights from HLSCustomOp instances and not StreamingDataflowPartition.
-                """
-                )
-            else:
-                continue
+
+        for sdp_ind, sdp_node in enumerate(model.graph.node):
+            assert sdp_node.op_type == "StreamingDataflowPartition"
+            # get dataflow model
+            sdp_node = getCustomOp(sdp_node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            dataflow_model = ModelWrapper(dataflow_model_filename)
+            rt_layer_ind = 0
+            for node in dataflow_model.graph.node:
+                if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
+                    node_inst = getCustomOp(node)
+                    is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
+                    if is_rt_weights == 1:
+                        fcl_w = dataflow_model.get_initializer(node.input[1])
+                        w_filename = weights_dir + "/%d_%d_%s.dat" % (
+                            sdp_ind,
+                            rt_layer_ind,
+                            node.name,
+                        )
+                        node_inst.make_weight_file(
+                            fcl_w, "decoupled_runtime", w_filename
+                        )
+                        rt_layer_ind += 1
+                elif node.op_type == "StreamingDataflowPartition":
+                    warnings.warn(
+                        """Nested StreamingDataflowPartition are not supported
+                    """
+                    )
+                else:
+                    continue
+
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 1ac7ee178531e745bf68405d1ae9df35c0c216fb..3dab426ccf9bab73ddac83299bdc47f89ea46bdc 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -286,7 +286,10 @@ class MakeZYNQProject(Transformation):
 
 
 class ZynqBuild(Transformation):
-    """Best-effort attempt at building the accelerator for Zynq."""
+    """Best-effort attempt at building the accelerator for Zynq.
+    Assumes the model contains only fpgadataflow nodes.
+
+    """
 
     def __init__(self, platform, period_ns, enable_debug=False):
         super().__init__()
@@ -300,7 +303,6 @@ class ZynqBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            MakePYNQDriver(platform="zynq-iodma"),
             InsertIODMA(64),
             InsertDWC(),
             Floorplan(),
@@ -335,6 +337,10 @@ class ZynqBuild(Transformation):
         model = model.transform(
             MakeZYNQProject(self.platform, enable_debug=self.enable_debug)
         )
+
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "zynq-iodma")
+
+        # create driver
+        model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py
index b595205714d8cb630816d2b42fe96640e49e506e..5265835dd2530a5c93ceefbef629a43d6f33de52 100644
--- a/src/finn/transformation/fpgadataflow/template_driver.py
+++ b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -77,7 +77,9 @@ io_shape_dict = {
     "ishape_folded" : $INPUT_SHAPE_FOLDED$,
     "oshape_folded" : $OUTPUT_SHAPE_FOLDED$,
     "ishape_packed" : $INPUT_SHAPE_PACKED$,
-    "oshape_packed" : $OUTPUT_SHAPE_PACKED$
+    "oshape_packed" : $OUTPUT_SHAPE_PACKED$,
+    "input_dma_name" : $INPUT_DMA_NAME$,
+    "number_of_external_weights": $EXT_WEIGHT_NUM$
 }
 
 if __name__ == "__main__":
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index e52fb14b158a7927311d1b7e90067fea4bde6e27..0fe4276096852c08d0798be8e1ee715cc5769286 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -28,6 +28,7 @@
 
 import os
 import subprocess
+import json
 
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.base import Transformation
@@ -38,14 +39,17 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 )
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
-from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.floorplan import Floorplan
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    RemoveUnusedTensors,
+)
 from finn.util.basic import make_build_dir
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from . import templates
@@ -89,63 +93,47 @@ class CreateVitisXO(Transformation):
         _check_vitis_envvars()
         vivado_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
         stitched_ip_dir = vivado_proj_dir + "/ip"
+        interfaces = json.loads(model.get_metadata_prop("vivado_stitch_ifnames"))
         args_string = []
-        m_axis_idx = 0
-        s_axis_idx = 0
+        arg_id = 0
         # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface
         # developed from instructions in UG1393 (v2019.2) and package_xo documentation
         # package_xo is responsible for generating the kernel xml
-        ifnames = eval(model.get_metadata_prop("vivado_stitch_ifnames"))
         assert (
-            len(ifnames["axilite"]) <= 1
+            len(interfaces["axilite"]) <= 1
         ), "CreateVitisXO supports max 1 AXI lite interface"
-        if len(ifnames["axilite"]) == 1:
-            axilite_intf_name = ifnames["axilite"][0]
-        else:
-            axilite_intf_name = None
-
-        for node in model.graph.node:
-            node_inst = getCustomOp(node)
-            arg_id = 0
-            if node.op_type == "TLastMarker":
-                stream_width = node_inst.get_nodeattr("StreamWidth")
-                # add a stream input or output port, based on direction
-                if node_inst.get_nodeattr("Direction") == "in":
-                    args_string.append(
-                        "{in:4:%s:s_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
-                        % (str(arg_id), s_axis_idx, str(stream_width))
-                    )
-                    s_axis_idx += 1
-                else:
-                    args_string.append(
-                        "{out:4:%s:m_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
-                        % (str(arg_id), m_axis_idx, str(stream_width))
+        axilite_intf_name = None
+        if len(interfaces["axilite"]) == 1:
+            axilite_intf_name = interfaces["axilite"][0]
+            if len(interfaces["aximm"]) > 0:
+                args_string.append(
+                    "{addr:1:%s:%s:0x8:0x10:ap_uint&lt;%s>*:0}"
+                    % (
+                        str(arg_id),
+                        interfaces["aximm"][0][0],
+                        str(interfaces["aximm"][0][1]),
                     )
-                    m_axis_idx += 1
+                )
                 arg_id += 1
-                # add a axilite port if dynamic
-                # add a count parameter if dynamic
-                if node_inst.get_nodeattr("DynIters") == 1:
-                    assert axilite_intf_name is not None
-                    args_string.append(
-                        "{numReps:0:%s:%s:0x4:0x10:uint:0}"
-                        % (str(arg_id), axilite_intf_name)
-                    )
-                    arg_id += 1
-            elif node.op_type == "IODMA":
-                port_width = node_inst.get_nodeattr("intfWidth")
-                # add an address parameter
-                # add a count parameter
                 args_string.append(
-                    "{addr:1:%s:m_axi_gmem0:0x8:0x10:ap_uint&lt;%s>*:0}"
-                    % (str(arg_id), str(port_width))
+                    "{numReps:0:%s:%s:0x4:0x1C:uint:0}" 
+                    % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
+            else:
                 args_string.append(
-                    "{numReps:0:%s:%s:0x4:0x1C:uint:0}"
+                    "{numReps:0:%s:%s:0x4:0x10:uint:0}"
                     % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
+        for intf in interfaces["s_axis"] + interfaces["m_axis"]:
+            stream_width = intf[1]
+            stream_name = intf[0]
+            args_string.append(
+                "{%s:4:%s:%s:0x0:0x0:ap_uint&lt;%s>:0}"
+                % (stream_name, str(arg_id), stream_name, str(stream_width))
+            )
+            arg_id += 1
 
         # save kernel xml then run package_xo
         xo_name = self.ip_name + ".xo"
@@ -342,6 +330,7 @@ class VitisLink(Transformation):
 
 class VitisBuild(Transformation):
     """Best-effort attempt at building the accelerator with Vitis.
+    Assumes the model contains only fpgadataflow nodes.
 
     fpga_part: string identifying the target FPGA
     period_ns: target clock period
@@ -377,7 +366,6 @@ class VitisBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            MakePYNQDriver(platform="alveo"),
             InsertIODMA(512),
             InsertDWC(),
         ]
@@ -399,9 +387,7 @@ class VitisBuild(Transformation):
             dataflow_model_filename = sdp_node.get_nodeattr("model")
             kernel_model = ModelWrapper(dataflow_model_filename)
             kernel_model = kernel_model.transform(InsertFIFO())
-            kernel_model = kernel_model.transform(
-                InsertTLastMarker(both=True, external=False, dynamic=False)
-            )
+            kernel_model = kernel_model.transform(RemoveUnusedTensors())
             kernel_model = kernel_model.transform(GiveUniqueNodeNames())
             kernel_model.save(dataflow_model_filename)
             kernel_model = kernel_model.transform(
@@ -430,4 +416,6 @@ class VitisBuild(Transformation):
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "alveo")
 
+        # create driver
+        model = model.transform(MakePYNQDriver(platform="alveo"))
         return (model, False)
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index ddea2dafce02c181a279d9c95759b97dee00a504..2823dec1fbce9f2e6a5f5f681cf403c205ee0a2d 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -591,6 +591,45 @@ class TestEnd2End:
         update_dashboard_data(topology, wbits, abits, "board", cfg["board"])
         model.save(get_checkpoint_name(topology, wbits, abits, "build_" + kind))
 
+    @pytest.mark.slow
+    @pytest.mark.vivado
+    @pytest.mark.vitis
+    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
+    def test_build_extweights(self, topology, wbits, abits, kind):
+        if "VITIS_PATH" not in os.environ:  # NOTE(review): also skips kind="zynq"
+            pytest.skip("VITIS_PATH not set")
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, "fifodepth_" + kind
+        )
+        model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        # for each eligible FC layer: clear its generated-IP attributes
+        # (code_gen_dir_ipgen, ipgen_path) and switch it to external weights
+        num_extw_layers = 0
+        for node in model.graph.node:
+            if node.op_type == "StreamingFCLayer_Batch":
+                node_inst = getCustomOp(node)
+                simd = node_inst.get_nodeattr("SIMD")
+                pe = node_inst.get_nodeattr("PE")
+                # skip layers which require very large IODMA DWCs
+                if (512 % simd) != 0 or ((pe * simd) % 32) != 0:
+                    continue
+                node_inst.set_nodeattr("code_gen_dir_ipgen", "")
+                node_inst.set_nodeattr("ipgen_path", "")
+                node_inst.set_nodeattr("mem_mode", "external")
+                num_extw_layers += 1
+        if num_extw_layers == 0:
+            pytest.skip("No layers suitable for external weights")
+        # build using the transformation that get_build_env returns for this kind
+        cfg = get_build_env(kind, target_clk_ns)
+        model = model.transform(cfg["build_fxn"])
+        # TODO: check list of interfaces; the resource reporting below is disabled
+        # model = model.transform(AnnotateResources("synth"))
+        # synth_dct = eval(model.get_metadata_prop("res_total_top_synth"))
+        # for (k, v) in synth_dct.items():
+        #     update_dashboard_data(topology, wbits, abits, k, v)
+        # update_dashboard_data(topology, wbits, abits, "board", cfg["board"])
+        model.save(get_checkpoint_name(topology, wbits, abits, "build_" + kind+"_extweights"))
+
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
     def test_deploy(self, topology, wbits, abits, kind):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + kind)