diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index b206e00a2eb6da1d76ccf57c078b16f61868a98c..bd938f17411ee42e94e95e02776ad8e973ea10fa 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -114,9 +114,9 @@ default_build_dataflow_steps = [
     "step_set_fifo_depths",
     "step_create_stitched_ip",
     "step_measure_rtlsim_performance",
-    "step_make_pynq_driver",
     "step_out_of_context_synthesis",
     "step_synthesize_bitfile",
+    "step_make_pynq_driver",
     "step_deployment_package",
 ]
 
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 05e41a48a8f4cb34616bf06c01b652afb9ae4257..38940ccb94f11fe49af5f49ee020f150326a026c 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -63,7 +63,7 @@ class AddStreams_Batch(HLSCustomOp):
         ishape = tuple(vecs + [ich])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         assert ich % pe == 0, "PE must divide NumChannels"
@@ -362,5 +362,5 @@ class AddStreams_Batch(HLSCustomOp):
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["s_axis"] = ["in0_V_V", "in1_V_V"]
+        intf_names["s_axis"].append(("in1_V_V", self.get_instream_width_padded()))
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 10a8051730217b56873b5a53c0803e3b90dada90..73da77bd3f940cee5ffd10fcfc43571f1a612eb4 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -312,7 +312,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = [
             """DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format(
-                self.get_outstream_width(), self.get_number_output_values() // 2,
+                self.get_outstream_width(),
+                self.get_number_output_values() // 2,
             )
         ]
 
@@ -378,5 +379,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["m_axis"] = ["out0_V_V", "out1_V_V"]
+        intf_names["m_axis"] = [
+            ("out0_V_V", self.get_outstream_width_padded()),
+            ("out1_V_V", self.get_outstream_width_padded()),
+        ]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 02912b2d5f45b3bab0eaca13ee0a0bf19bf9cfca..2ab070b2fdc059a554930345a81abc368c29bfa7 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -123,15 +123,16 @@ class HLSCustomOp(CustomOp):
         """Return a dict of names of input and output interfaces.
         The keys reflect the protocols each interface implements:
         'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
-        Values are lists of names:
-        's_axis' names correspond to the list of node inputs in order,
-        'm_axis' names correspond to the list of node outputs in order'
+        Values are lists of tuples (axis, aximm) or names (axilite):
+        'axis' tuples correspond to the list of node inputs in order,
+        each tuple is (interface_name, interface_width_bits).
+        axilite is always assumed to be 32 bits and is given as a name only (not a tuple).
         Each block must have at most one aximm and one axilite."""
         intf_names = {}
         intf_names["clk"] = ["ap_clk"]
         intf_names["rst"] = ["ap_rst_n"]
-        intf_names["s_axis"] = ["in0_V_V"]
-        intf_names["m_axis"] = ["out_V_V"]
+        intf_names["s_axis"] = [("in0_V_V", self.get_instream_width_padded())]
+        intf_names["m_axis"] = [("out_V_V", self.get_outstream_width_padded())]
         intf_names["aximm"] = []
         intf_names["axilite"] = []
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index a6cddcc4aeb45957c16249cd57f122fe5e58b85a..857496a2614894588ebf065db3e384cf2cecf106 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -355,11 +355,9 @@ class IODMA(HLSCustomOp):
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
         if self.get_nodeattr("direction") == "out":
-            intf_names["s_axis"] = ["in0_V_V"]
             intf_names["m_axis"] = []
         else:
             intf_names["s_axis"] = []
-            intf_names["m_axis"] = ["out_V_V"]
         intf_names["axilite"] = ["s_axi_control"]
-        intf_names["aximm"] = ["m_axi_gmem"]
+        intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("intfWidth"))]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 23c1779a27c123583c0c8af5f53d022d03e78126..4d84b74dce001fca769ed2850a8f718ac942f14c 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -395,8 +395,8 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 8868002c9e2cb8726eeb573e104140e3e1a61d27..3cc01ade73fc6b735509f2839e5c10785a8b9f54 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -444,12 +444,24 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         single_pe_w = simd * weight_bits
         return max([weightstream, max_of_io, single_pe_w])
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
         simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
         sf = mw // simd
+        nf = mh // pe
         vecs = list(self.get_nodeattr("numInputVectors"))
-        folded_input_shape = tuple(vecs + [sf, simd])
+
+        if ind == 0:
+            # calculate shape of input 0
+            folded_input_shape = tuple(vecs + [sf, simd])
+        elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # calculate shape of input 1 (weights)
+            folded_input_shape = tuple(vecs + [sf * nf, simd * pe])
+        else:
+            raise Exception("Undefined input shape for requested input")
+
         return folded_input_shape
 
     def get_folded_output_shape(self):
@@ -1253,8 +1265,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
@@ -1348,8 +1360,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 # TODO calculate and pass in segment size here
                 cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
-        elif mem_mode == "const":
-            # base class impl sufficient for const mode
+        elif mem_mode == "const" or mem_mode == "external":
+            # base class impl sufficient for const/external modes
             return super().code_generation_ipi()
         else:
             raise Exception("Unrecognized mem_mode for StreamingFCLayer")
@@ -1359,7 +1371,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         intf_names = super().get_verilog_top_module_intf_names()
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "external":
-            intf_names["s_axis"] = ["in0_V_V", "weights_V_V"]
+            intf_names["s_axis"].append(
+                ("weights_V_V", self.get_weightstream_width_padded())
+            )
         if mem_mode == "decoupled":
             # only expose axilite interface if attribute is set
             runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index fb41bceca09fe544bd729537b1af726c9c43d290..133a869b28cf9968a719e243a3266dfb25b637ba 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -343,8 +343,8 @@ class StreamingFIFO(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 30374a7d97f4d2189e142a9b7b6e44a5abbb46b0..0b248c15035a2b685ebfb024c8a944a6ea6c65bf 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -825,8 +825,8 @@ class Thresholding_Batch(HLSCustomOp):
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
             rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py
index 9ec03ea5dd726b49b157a92addef05f85f02b644..f430402538b873c3db7c93ceca79d324d878571d 100644
--- a/src/finn/qnn-data/templates/driver/driver_base.py
+++ b/src/finn/qnn-data/templates/driver/driver_base.py
@@ -86,25 +86,78 @@ class FINNExampleOverlay(Overlay):
         self.batch_size = batch_size
         self.fclk_mhz = fclk_mhz
         if self.platform == "alveo":
-            self.idma = self.idma0
+            if "input_dma_name" in io_shape_dict.keys():
+                self.idma = getattr(self, io_shape_dict["input_dma_name"])
+            else:
+                self.idma = self.idma0
             self.odma = self.odma0
             self.odma_handle = None
         elif self.platform == "zynq-iodma":
-            self.idma = self.idma0
+            if "input_dma_name" in io_shape_dict.keys():
+                self.idma = getattr(self, io_shape_dict["input_dma_name"])
+            else:
+                self.idma = self.idma0
             self.odma = self.odma0
             # set the clock frequency as specified by user during transformations
             if self.fclk_mhz > 0:
                 Clocks.fclk0_mhz = self.fclk_mhz
         else:
             raise ValueError("Supported platforms are zynq-iodma alveo")
-        # load any runtime weights
+        # load any external + runtime weights
+        self.load_external_weights()
         self.load_runtime_weights()
 
+    def load_external_weights(self):
+        """Load any existing external (DRAM) weights from the specified dir into the
+        appropriate layer of the accelerator. Note that this must be enabled
+        during the accelerator build process. The weights directory
+        is specified as the class member ``runtime_weight_dir``. External (DRAM)
+        weights are one .npy file per layer.
+        """
+
+        self.external_weights = []
+        w_filenames = []
+        if not os.path.isdir(self.runtime_weight_dir):
+            return
+        for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir):
+            w_filenames.extend(filenames)
+
+        tmp_weight_dict = {}
+
+        for w_filename in w_filenames:
+            if w_filename.endswith(".npy"):
+                weight_tensor = np.load(self.runtime_weight_dir + "/" + w_filename)
+            else:
+                continue
+
+            idma_name = w_filename.split(".")[0]
+            tmp_weight_dict[idma_name] = weight_tensor
+
+        for idma_name in tmp_weight_dict.keys():
+            if idma_name in self.ip_dict.keys():
+                iwdma = getattr(self, idma_name)
+                weight_tensor = tmp_weight_dict[idma_name]
+                weight_buf = allocate(weight_tensor.shape, dtype=np.uint8)
+                weight_buf[:] = weight_tensor
+                # weight_buf.sync_to_device()
+                weight_buf.flush()
+
+                self.external_weights += [(iwdma, weight_buf, idma_name)]
+
+        if "number_of_external_weights" in self._io_shape_dict:
+            hw_ext_weights = self._io_shape_dict["number_of_external_weights"]
+            assert len(self.external_weights) == hw_ext_weights, (
+                "Number of hardware external weights and number of external "
+                + "weight tensors available do not match. \n"
+                + "Is runtime_weight_dir pointing to the correct folder?"
+            )
+
     def load_runtime_weights(self, flush_accel=True, verify=True):
-        """Load any existing runtime weights from the specified dir into the
+        """Load any existing runtime-writable weights from the specified dir into the
         appropriate layer of the accelerator. Note that this must be enabled
         during the accelerator build process. The runtime weights directory
-        is specified as the class member ``runtime_weight_dir``.
+        is specified as the class member ``runtime_weight_dir``. Runtime-writable
+        weights are provided as one .dat file per layer.
 
         Parameters
         ----------
@@ -124,18 +177,25 @@ class FINNExampleOverlay(Overlay):
             if w_filename.endswith(".dat"):
                 with open(self.runtime_weight_dir + "/" + w_filename, "r") as f:
                     dat = f.read()
+            else:
+                continue
             layer_w = np.fromiter(
                 [int(x, 16) for x in dat.strip().split()], dtype=np.uint32
             )
-            layer_ind = int(w_filename.split("_")[0])
-            rt_weight_dict[layer_ind] = layer_w
-        for layer_ind in rt_weight_dict.keys():
-            cand_if_name = "StreamingDataflowPartition_1/s_axilite_%d" % layer_ind
+            sdp_ind = int(w_filename.split("_")[0])
+            layer_ind = int(w_filename.split("_")[1])
+            rt_weight_dict[(sdp_ind, layer_ind)] = layer_w
+        for sdp_ind, layer_ind in rt_weight_dict.keys():
+            cand_if_name = "StreamingDataflowPartition_%d/s_axilite_%d" % (
+                sdp_ind,
+                layer_ind,
+            )
             if cand_if_name in self.ip_dict.keys():
                 layer_mmio = getattr(
-                    self.StreamingDataflowPartition_1, "s_axilite_%d" % layer_ind
+                    getattr(self, "StreamingDataflowPartition_%d" % sdp_ind),
+                    "s_axilite_%d" % layer_ind,
                 ).mmio
-                layer_w = rt_weight_dict[layer_ind]
+                layer_w = rt_weight_dict[(sdp_ind, layer_ind)]
                 layer_mmio.write_mm(0, layer_w.tobytes())
                 if verify:
                     new_w = np.copy(layer_mmio.array[: layer_w.shape[0]])
@@ -280,6 +340,10 @@ class FINNExampleOverlay(Overlay):
         if self.platform == "zynq-iodma":
             assert self.odma.read(0x00) & 0x4 != 0, "Output DMA is not idle"
             # manually launch IODMAs since signatures are missing
+            for iwdma, iwbuf, iwdma_name in self.external_weights:
+                iwdma.write(0x10, iwbuf.device_address)
+                iwdma.write(0x1C, batch_size)
+                iwdma.write(0x00, 1)
             self.idma.write(0x10, self.ibuf_packed_device.device_address)
             self.idma.write(0x1C, batch_size)
             self.odma.write(0x10, self.obuf_packed_device.device_address)
@@ -289,6 +353,8 @@ class FINNExampleOverlay(Overlay):
         elif self.platform == "alveo":
             assert self.odma_handle is None, "Output DMA is already running"
             self.idma.start(self.ibuf_packed_device, batch_size)
+            for iwdma, iwbuf, iwdma_name in self.external_weights:
+                iwdma.start(iwbuf, batch_size)
             self.odma_handle = self.odma.start(self.obuf_packed_device, batch_size)
         else:
             raise Exception("Unrecognized platform: %s" % self.platform)
@@ -340,6 +406,10 @@ class FINNExampleOverlay(Overlay):
         res["DRAM_out_bandwidth[Mb/s]"] = (
             np.prod(self.oshape_packed) * 0.000001 / runtime
         )
+        for iwdma, iwbuf, iwdma_name in self.external_weights:
+            res["DRAM_extw_%s_bandwidth[Mb/s]" % iwdma_name] = (
+                self.batch_size * np.prod(iwbuf.shape) * 0.000001 / runtime
+            )
         if self.platform == "zynq-iodma":
             res["fclk[mhz]"] = Clocks.fclk0_mhz
         elif self.platform == "alveo":
diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json
new file mode 100644
index 0000000000000000000000000000000000000000..299a8be815aeaba70c0f41e4b1b3252b77c6f042
--- /dev/null
+++ b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json
@@ -0,0 +1,30 @@
+{
+    "Defaults": {},
+    "Thresholding_Batch_0": {
+      "PE": 49,
+      "ram_style": "distributed"
+    },
+    "StreamingFCLayer_Batch_0": {
+      "PE": 16,
+      "SIMD": 49,
+      "ram_style": "block"
+    },
+    "StreamingFCLayer_Batch_1": {
+      "PE": 8,
+      "SIMD": 8,
+      "mem_mode": "external"
+    },
+    "StreamingFCLayer_Batch_2": {
+      "PE": 8,
+      "SIMD": 8,
+      "mem_mode": "external"
+    },
+    "StreamingFCLayer_Batch_3": {
+      "PE": 10,
+      "SIMD": 8,
+      "ram_style": "distributed"
+    },
+    "LabelSelect_Batch_0": {
+      "PE": 1
+    }
+  }
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 19fa5c603bfafe16ed151e10fa8eb11a79106ede..738f2000a1929024d3808dd7bad0267338b51659 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -29,17 +29,43 @@
 import os
 import warnings
 import subprocess
+import json
 
 from finn.transformation.base import Transformation
-from finn.util.basic import get_by_name, make_build_dir, is_finn_op
+from finn.util.basic import make_build_dir, get_num_default_workers
+from finn.util.fpgadataflow import is_fpgadataflow_node
 from finn.custom_op.registry import getCustomOp
-from finn.util.basic import get_num_default_workers
 import multiprocessing as mp
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
 
 
+def is_external_input(model, node, i):
+    # indicate whether input i of node should be made external
+    # True only if input is unconnected and has no initializer
+    # Only exception is second input of FC layers when mem_mode is external
+    node_inst = getCustomOp(node)
+    producer = model.find_producer(node.input[i])
+    if producer is None:
+        if model.get_initializer(node.input[i]) is None:
+            return True
+        else:
+            if node.op_type == "StreamingFCLayer_Batch":
+                if node_inst.get_nodeattr("mem_mode") == "external":
+                    return True
+    return False
+
+
+def is_external_output(model, node, i):
+    # indicate whether output i of node should be made external
+    # True only if output is unconnected
+    consumers = model.find_consumers(node.output[i])
+    if consumers is None:
+        return True
+    return False
+
+
 class CreateStitchedIP(Transformation):
     """Create a Vivado IP Block Design project from all the generated IPs of a
     graph. All nodes in the graph must have the fpgadataflow backend attribute,
@@ -134,21 +160,24 @@ class CreateStitchedIP(Transformation):
         if len(aximm_intf_name) != 0:
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
-                % (inst_name, aximm_intf_name[0])
+                % (inst_name, aximm_intf_name[0][0])
             )
             self.connect_cmds.append(
                 "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]"
             )
-            self.intf_names["aximm"] = ["m_axi_gmem0"]
+            self.intf_names["aximm"] = [("m_axi_gmem0", aximm_intf_name[0][1])]
             assert self.has_aximm is False, "Currently limited to one AXI-MM interface"
             self.has_aximm = True
 
-    def connect_m_axis_external(self, node):
+    def connect_m_axis_external(self, node, idx=None):
         inst_name = node.name
         node_inst = getCustomOp(node)
         output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"]
         # make output axis external
-        for output_intf_name in output_intf_names:
+        for i in range(len(output_intf_names)):
+            if idx is not None and idx != i:
+                continue
+            output_intf_name = output_intf_names[i][0]
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
                 % (inst_name, output_intf_name)
@@ -158,15 +187,20 @@ class CreateStitchedIP(Transformation):
                 % (self.m_axis_idx, output_intf_name)
             )
             self.has_m_axis = True
-            self.intf_names["m_axis"].append("m_axis_%d" % self.m_axis_idx)
+            self.intf_names["m_axis"].append(
+                ("m_axis_%d" % self.m_axis_idx, output_intf_names[i][1])
+            )
             self.m_axis_idx += 1
 
-    def connect_s_axis_external(self, node):
+    def connect_s_axis_external(self, node, idx=None):
         inst_name = node.name
         node_inst = getCustomOp(node)
         input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"]
         # make input axis external
-        for input_intf_name in input_intf_names:
+        for i in range(len(input_intf_names)):
+            if idx is not None and idx != i:
+                continue
+            input_intf_name = input_intf_names[i][0]
             self.connect_cmds.append(
                 "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
                 % (inst_name, input_intf_name)
@@ -176,7 +210,9 @@ class CreateStitchedIP(Transformation):
                 % (self.s_axis_idx, input_intf_name)
             )
             self.has_s_axis = True
-            self.intf_names["s_axis"].append("s_axis_%d" % self.s_axis_idx)
+            self.intf_names["s_axis"].append(
+                ("s_axis_%d" % self.s_axis_idx, input_intf_names[i][1])
+            )
             self.s_axis_idx += 1
 
     def apply(self, model):
@@ -187,70 +223,38 @@ class CreateStitchedIP(Transformation):
         ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
-            assert is_finn_op(node.domain), "Found non-FINN node"
-            backend_attribute = get_by_name(node.attribute, "backend")
-            assert backend_attribute is not None, "Backend node attribute is not set."
-            backend_value = backend_attribute.s.decode("UTF-8")
-            assert (
-                backend_value == "fpgadataflow"
-            ), """Backend node attribute is not
-            set to "fpgadataflow"."""
+            assert is_fpgadataflow_node(
+                node
+            ), "All nodes must be FINN fpgadataflow nodes."
             node_inst = getCustomOp(node)
             ip_dir_value = node_inst.get_nodeattr("ip_path")
             assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist."
             ip_dirs += [ip_dir_value]
             self.create_cmds += node_inst.code_generation_ipi()
-            my_producer = model.find_producer(node.input[0])
             self.connect_clk_rst(node)
             self.connect_axi(node)
-            if my_producer is None:
-                # first node in graph
-                self.connect_s_axis_external(node)
-                if node.op_type == "TLastMarker":
-                    assert (
-                        node_inst.get_nodeattr("Direction") == "in"
-                    ), """Output TLastMarker incorrect direction"""
-                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
-                    # don't apply this check for a 1-node partition
-                    assert (
-                        node_inst.get_nodeattr("direction") == "in"
-                    ), """Input DMA incorrect direction"""
-            else:
-                # intermediate node
-                # wire up input(s) to previous node output(s)
-                # foreach input
-                #     find producer
-                #     find index of producer output connected to our target input
-                #     get names of hdl interfaces for input and producer output
-                #     issue a TCL directive to connect input to output
-                #     if FC layer with mode "decoupled", add a streamer on input 1
-                for i in range(len(node.input)):
+            for i in range(len(node.input)):
+                if is_external_input(model, node, i):
+                    self.connect_s_axis_external(node, idx=i)
+                else:
                     producer = model.find_producer(node.input[i])
                     if producer is None:
                         continue
                     j = list(producer.output).index(node.input[i])
                     src_intf_name = getCustomOp(
                         producer
-                    ).get_verilog_top_module_intf_names()["m_axis"][j]
+                    ).get_verilog_top_module_intf_names()["m_axis"][j][0]
                     dst_intf_name = node_inst.get_verilog_top_module_intf_names()[
                         "s_axis"
-                    ][i]
+                    ][i][0]
                     self.connect_cmds.append(
                         "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                         "[get_bd_intf_pins %s/%s]"
                         % (producer.name, src_intf_name, node.name, dst_intf_name)
                     )
-            if model.find_consumers(node.output[0]) is None:
-                # last node in graph
-                self.connect_m_axis_external(node)
-                if node.op_type == "TLastMarker":
-                    assert (
-                        node_inst.get_nodeattr("Direction") == "out"
-                    ), """Output TLastMarker incorrect direction"""
-                elif node.op_type == "IODMA" and len(model.graph.node) != 1:
-                    assert (
-                        node_inst.get_nodeattr("direction") == "out"
-                    ), """Output DMA incorrect direction"""
+            for i in range(len(node.output)):
+                if is_external_output(model, node, i):
+                    self.connect_m_axis_external(node, idx=i)
 
         # create a temporary folder for the project
         prjname = "finn_vivado_stitch_proj"
@@ -316,7 +320,7 @@ class CreateStitchedIP(Transformation):
         block_library = "finn"
         block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name)
         model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv)
-        model.set_metadata_prop("vivado_stitch_ifnames", str(self.intf_names))
+        model.set_metadata_prop("vivado_stitch_ifnames", json.dumps(self.intf_names))
         tcl.append(
             (
                 "ipx::package_project -root_dir %s/ip -vendor %s "
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 0f2b8ef6a4c0858cd98218538930c97c6df2ad9d..c8df80659d30e1855fc658bad83c3fe9bccb9bf9 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -44,8 +44,8 @@ class InsertDWC(Transformation):
         for n in graph.node:
             node_ind += 1
             if _suitable_node(n):
-                for n_output in n.output:
-                    consumers = model.find_consumers(n_output)
+                for output_name in n.output:
+                    consumers = model.find_consumers(output_name)
                     if consumers is None:
                         continue
                     if len(consumers) > 1:
@@ -59,7 +59,22 @@ class InsertDWC(Transformation):
                         n0 = getCustomOp(n)
                         n1 = getCustomOp(consumer)
                         n0_out_shape = n0.get_folded_output_shape()
-                        n1_in_shape = n1.get_folded_input_shape()
+
+                        # If FC and external mem, it could be connected to input 1
+                        if (
+                            consumer.op_type == "StreamingFCLayer_Batch"
+                            and n1.get_nodeattr("mem_mode") == "external"
+                        ):
+                            # get input idx
+                            in_idx = None
+                            for idx, n_input in enumerate(consumer.input):
+                                if output_name == n_input:
+                                    in_idx = idx
+                            assert in_idx is not None, "Malformed model"
+                            n1_in_shape = n1.get_folded_input_shape(in_idx)
+                        else:
+                            n1_in_shape = n1.get_folded_input_shape()
+
                         if n0_out_shape[-1] != n1_in_shape[-1]:
                             graph_modified = True
                             # determine dwc inwidth
@@ -82,7 +97,7 @@ class InsertDWC(Transformation):
 
                             dwc_node = oh.make_node(
                                 "StreamingDataWidthConverter_Batch",
-                                [n_output],
+                                [output_name],
                                 [dwc_output_tensor.name],
                                 domain="finn.custom_op.fpgadataflow",
                                 backend="fpgadataflow",
@@ -96,7 +111,7 @@ class InsertDWC(Transformation):
 
                             # set dwc output tensor as new input tensor of second node
                             for idx, inp in enumerate(consumer.input):
-                                if inp == n_output:
+                                if inp == output_name:
                                     consumer.input[idx] = dwc_output_tensor.name
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 67143547557a9b24b311e69cff6f885f8745cd3c..27055a4fd29dba3849c0e4a889f27802f8c36081 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -33,7 +33,6 @@ from finn.util.basic import get_by_name
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.transformation.general import SortGraph
-import finn.core.data_layout as DataLayout
 import math
 import numpy as np
 
@@ -48,6 +47,45 @@ class InsertIODMA(Transformation):
         ), "max_intfwidth must be a power of 2"
         self.max_intfwidth = max_intfwidth
 
+    def get_mem_init(self, weights, pe, simd):
+        """
+        Returns matrix ready for pack_innermost_dim_as_hex_string with
+        reverse=False (finn.util.data_packing) to return the memory init file
+        little endian packed.
+        That is, get_mem_init returns:
+        elem(pe,simd)
+        addr = 0: [(pe-1,simd-1),(pe-1,simd-2),...(0,1),(0,0)]
+        addr = 1: [(pe-1,simd*2-1),.......(0,simd+1),(0,simd)]
+        .
+        """
+
+        # TODO: refactor this into streamingfclayer_batch.py, could go into
+        # make_weight_file except it doesn't write a file but returns a npy
+        # array instead
+        w_shape = weights.shape
+        assert len(w_shape) == 2, "weights with incorrect number of dims"
+        inp_w, out_w = w_shape
+
+        assert out_w % pe == 0, "Malformed weight matrix"
+        assert inp_w % simd == 0, "Malformed weight matrix"
+        reshaped_w = np.zeros(inp_w * out_w).reshape(-1, pe * simd)
+
+        addr = 0
+        for fr in range(out_w // pe):
+            for fc in range(inp_w // simd):
+                w0_lower = fc * simd
+                w0_upper = (fc + 1) * simd
+                w1_lower = fr * pe
+                w1_upper = (fr + 1) * pe
+                tile = weights[w0_lower:w0_upper, w1_lower:w1_upper]
+                for p in range(pe):
+                    rw0_lower = p * simd
+                    rw0_upper = (p + 1) * simd
+                    reshaped_w[addr, rw0_lower:rw0_upper] = tile[:, p].transpose()
+                addr += 1
+        reshaped_w = np.flip(reshaped_w, axis=-1)
+        return reshaped_w
+
     def apply(self, model):
         # only makes sense for a pure fpgadataflow graph -- so we check!
         all_nodes = list(model.graph.node)
@@ -59,8 +97,7 @@ class InsertIODMA(Transformation):
         fc_extw_nodes = list(
             filter(
                 lambda x: x.op_type == "StreamingFCLayer_Batch"
-                and get_by_name(x.attribute, "mem_mode") is not None
-                and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") == "external"
+                and getCustomOp(x).get_nodeattr("mem_mode") == "external"
                 and model.find_producer(x.input[1]) is None,
                 all_nodes,
             )
@@ -78,11 +115,6 @@ class InsertIODMA(Transformation):
             return (model, False)
         else:
             if final_node.op_type != "IODMA":
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(graph_out_name) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_out_name) == DataLayout.NC
-                ), "Data layout of output tensor must be NHWC or NC"
                 out_shape = model.get_tensor_shape(graph_out_name)
                 out_dtype = model.get_tensor_datatype(graph_out_name)
                 final_node_inst = getCustomOp(final_node)
@@ -123,11 +155,6 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.node.append(dma_node)
             if first_node.op_type != "IODMA":
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(graph_in_name) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
-                ), "Data layout of input tensor must be NHWC or NC"
                 in_shape = model.get_tensor_shape(graph_in_name)
                 in_dtype = model.get_tensor_datatype(graph_in_name)
                 first_node_inst = getCustomOp(first_node)
@@ -168,11 +195,7 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.node.insert(0, dma_node)
             for fc_node in fc_extw_nodes:
-                # check if tensor is NHWC
-                assert (
-                    model.get_tensor_layout(fc_node.input[1]) == DataLayout.NHWC
-                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
-                ), "Data layout of tensors must be NHWC or NC"
+                fc_inst = getCustomOp(fc_node)
                 fc_w_name = fc_node.input[1]
                 w_shape = model.get_tensor_shape(fc_w_name)
                 w_dtype = model.get_tensor_datatype(fc_w_name)
@@ -185,21 +208,24 @@ class InsertIODMA(Transformation):
                 # calculate width of stream output from DMA
                 pe = get_by_name(fc_node.attribute, "PE").i
                 simd = get_by_name(fc_node.attribute, "SIMD").i
-                assert pe * simd == w_shape[0], "Malformed weight matrix"
-                streamWidth = simd * pe * w_dtype.bitwidth()
+                streamWidth = fc_inst.get_weightstream_width_padded()
                 # make new buffer
+                W = model.get_initializer(fc_w_name)
+                iodma_mem = self.get_mem_init(W, pe, simd)
+                model.set_initializer(fc_w_name, iodma_mem)
+
                 fc_node_in = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
                 )
                 model.graph.value_info.append(fc_node_in)
                 model.set_tensor_datatype(fc_node_in.name, w_dtype)
-                model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name))
+                model.set_initializer(fc_node_in.name, W)
                 dma_node = oh.make_node(
                     "IODMA",
                     [fc_w_name],
                     [fc_node_in.name],
-                    numInputVectors=[w_shape[1]],
-                    NumChannels=w_shape[0],
+                    numInputVectors=[iodma_mem.shape[0]],
+                    NumChannels=pe * simd,
                     dataType=str(w_dtype.name),
                     intfWidth=intfwidth,
                     streamWidth=streamWidth,
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index 42f18d9a812d2db2119351dabfbb38e68c33194e..6ab12548abbcbe00496101bd146b2c9b873204c8 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -38,12 +38,38 @@ import warnings
 import pkg_resources as pk
 from . import template_driver
 from finn.core.modelwrapper import ModelWrapper
+import numpy as np
+
+from finn.util.data_packing import (
+    pack_innermost_dim_as_hex_string,
+    hexstring2npbytearray,
+)
+from finn.util.basic import roundup_to_integer_multiple
+
+
+def to_external_tensor(init, w_dtype):
+    """Return an appropriately formatted and packed numpy byte array for given
+    external parameter tensor."""
+
+    weight_width = init.shape[1] * w_dtype.bitwidth()
+    weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+    hex_init = pack_innermost_dim_as_hex_string(
+        init, w_dtype, weight_width_padded, prefix="0x"
+    )
+    ext_weight = np.array([], dtype=np.uint8)
+    for line in hex_init:
+        array_line = [
+            x for x in reversed(hexstring2npbytearray(line, remove_prefix="0x"))
+        ]
+        ext_weight = np.append(ext_weight, array_line)
+
+    return ext_weight
 
 
 class MakePYNQDriver(Transformation):
     """Create PYNQ Python code to correctly interface the generated
     accelerator, including data packing/unpacking. Should be called
-    after conversion to HLS layers and folding, but prior to the creation of
+    after conversion to HLS layers, folding and the creation of
     dataflow partitions for correct operation.
 
     platform: one of ["zynq-iodma", "alveo"]
@@ -123,6 +149,35 @@ class MakePYNQDriver(Transformation):
         i_tensor_shape_packed = i_tensor_dummy_packed.shape
         o_tensor_shape_packed = o_tensor_dummy_packed.shape
 
+        # generate external weights npy files
+        weights_dir = pynq_driver_dir + "/runtime_weights"
+
+        os.makedirs(weights_dir)
+        idma_idx = 0
+        ext_weight_dma_cnt = 0
+
+        for node in model.graph.node:
+            assert (
+                node.op_type == "StreamingDataflowPartition"
+            ), "CreateDataflowPartition needs to be applied before driver generation"
+
+            producer = model.find_producer(node.input[0])
+            init_tensor = model.get_initializer(node.input[0])
+
+            if producer is None:  # input dma?
+                idma_name = "idma" + str(idma_idx)
+                if init_tensor is not None:  # input weights dma?
+                    ext_weight_dma_cnt += 1
+                    w_dtype = model.get_tensor_datatype(node.input[0])
+                    init_external_tensor = to_external_tensor(init_tensor, w_dtype)
+                    np.save(
+                        weights_dir + "/" + idma_name + ".npy", init_external_tensor
+                    )
+                else:
+                    net_input_name = idma_name
+
+                idma_idx += 1
+
         # fill in the driver template
         driver_py = pynq_driver_dir + "/driver.py"
         driver = template_driver.pynq_driver_template
@@ -146,6 +201,8 @@ class MakePYNQDriver(Transformation):
         driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
         driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
+        driver = driver.replace("$INPUT_DMA_NAME$", "'%s'" % net_input_name)
+        driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt))
 
         with open(driver_py, "w") as f:
             f.write(driver)
@@ -172,25 +229,35 @@ class MakePYNQDriver(Transformation):
         shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core")
 
         # generate weight files for runtime-writable layers
-        weights_dir = pynq_driver_dir + "/runtime_weights"
-        rt_layer_ind = 0
-        os.makedirs(weights_dir)
-        for node in model.graph.node:
-            if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
-                node_inst = getCustomOp(node)
-                is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
-                if is_rt_weights == 1:
-                    fcl_w = model.get_initializer(node.input[1])
-                    w_filename = weights_dir + "/%d_%s.dat" % (rt_layer_ind, node.name)
-                    node_inst.make_weight_file(fcl_w, "decoupled_runtime", w_filename)
-                    rt_layer_ind += 1
-            elif node.op_type == "StreamingDataflowPartition":
-                warnings.warn(
-                    """Please call MakePYNQDriver prior to
-                CreateDataflowPartition. Can only extract runtime-writable
-                weights from HLSCustomOp instances and not StreamingDataflowPartition.
-                """
-                )
-            else:
-                continue
+
+        for sdp_ind, sdp_node in enumerate(model.graph.node):
+            assert sdp_node.op_type == "StreamingDataflowPartition"
+            # get dataflow model
+            sdp_node = getCustomOp(sdp_node)
+            dataflow_model_filename = sdp_node.get_nodeattr("model")
+            dataflow_model = ModelWrapper(dataflow_model_filename)
+            rt_layer_ind = 0
+            for node in dataflow_model.graph.node:
+                if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
+                    node_inst = getCustomOp(node)
+                    is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
+                    if is_rt_weights == 1:
+                        fcl_w = dataflow_model.get_initializer(node.input[1])
+                        w_filename = weights_dir + "/%d_%d_%s.dat" % (
+                            sdp_ind,
+                            rt_layer_ind,
+                            node.name,
+                        )
+                        node_inst.make_weight_file(
+                            fcl_w, "decoupled_runtime", w_filename
+                        )
+                        rt_layer_ind += 1
+                elif node.op_type == "StreamingDataflowPartition":
+                    warnings.warn(
+                        """Nested StreamingDataflowPartitions are not supported
+                    """
+                    )
+                else:
+                    continue
+
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 1ac7ee178531e745bf68405d1ae9df35c0c216fb..3dab426ccf9bab73ddac83299bdc47f89ea46bdc 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -286,7 +286,10 @@ class MakeZYNQProject(Transformation):
 
 
 class ZynqBuild(Transformation):
-    """Best-effort attempt at building the accelerator for Zynq."""
+    """Best-effort attempt at building the accelerator for Zynq.
+
+    It assumes the model has only fpgadataflow nodes.
+    """
 
     def __init__(self, platform, period_ns, enable_debug=False):
         super().__init__()
@@ -300,7 +303,6 @@ class ZynqBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            MakePYNQDriver(platform="zynq-iodma"),
             InsertIODMA(64),
             InsertDWC(),
             Floorplan(),
@@ -335,6 +337,10 @@ class ZynqBuild(Transformation):
         model = model.transform(
             MakeZYNQProject(self.platform, enable_debug=self.enable_debug)
         )
+
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "zynq-iodma")
+
+        # create driver
+        model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py
index b595205714d8cb630816d2b42fe96640e49e506e..5265835dd2530a5c93ceefbef629a43d6f33de52 100644
--- a/src/finn/transformation/fpgadataflow/template_driver.py
+++ b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -77,7 +77,9 @@ io_shape_dict = {
     "ishape_folded" : $INPUT_SHAPE_FOLDED$,
     "oshape_folded" : $OUTPUT_SHAPE_FOLDED$,
     "ishape_packed" : $INPUT_SHAPE_PACKED$,
-    "oshape_packed" : $OUTPUT_SHAPE_PACKED$
+    "oshape_packed" : $OUTPUT_SHAPE_PACKED$,
+    "input_dma_name" : $INPUT_DMA_NAME$,
+    "number_of_external_weights": $EXT_WEIGHT_NUM$
 }
 
 if __name__ == "__main__":
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index e52fb14b158a7927311d1b7e90067fea4bde6e27..0fe4276096852c08d0798be8e1ee715cc5769286 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -28,6 +28,7 @@
 
 import os
 import subprocess
+import json
 
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.base import Transformation
@@ -38,14 +39,17 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 )
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
-from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.floorplan import Floorplan
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    RemoveUnusedTensors,
+)
 from finn.util.basic import make_build_dir
 from finn.transformation.infer_data_layouts import InferDataLayouts
 from . import templates
@@ -89,63 +93,47 @@ class CreateVitisXO(Transformation):
         _check_vitis_envvars()
         vivado_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
         stitched_ip_dir = vivado_proj_dir + "/ip"
+        interfaces = json.loads(model.get_metadata_prop("vivado_stitch_ifnames"))
         args_string = []
-        m_axis_idx = 0
-        s_axis_idx = 0
+        arg_id = 0
         # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface
         # developed from instructions in UG1393 (v2019.2) and package_xo documentation
         # package_xo is responsible for generating the kernel xml
-        ifnames = eval(model.get_metadata_prop("vivado_stitch_ifnames"))
         assert (
-            len(ifnames["axilite"]) <= 1
+            len(interfaces["axilite"]) <= 1
         ), "CreateVitisXO supports max 1 AXI lite interface"
-        if len(ifnames["axilite"]) == 1:
-            axilite_intf_name = ifnames["axilite"][0]
-        else:
-            axilite_intf_name = None
-
-        for node in model.graph.node:
-            node_inst = getCustomOp(node)
-            arg_id = 0
-            if node.op_type == "TLastMarker":
-                stream_width = node_inst.get_nodeattr("StreamWidth")
-                # add a stream input or output port, based on direction
-                if node_inst.get_nodeattr("Direction") == "in":
-                    args_string.append(
-                        "{in:4:%s:s_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
-                        % (str(arg_id), s_axis_idx, str(stream_width))
-                    )
-                    s_axis_idx += 1
-                else:
-                    args_string.append(
-                        "{out:4:%s:m_axis_%d:0x0:0x0:ap_uint&lt;%s>:0}"
-                        % (str(arg_id), m_axis_idx, str(stream_width))
+        axilite_intf_name = None
+        if len(interfaces["axilite"]) == 1:
+            axilite_intf_name = interfaces["axilite"][0]
+            if len(interfaces["aximm"]) > 0:
+                args_string.append(
+                    "{addr:1:%s:%s:0x8:0x10:ap_uint&lt;%s>*:0}"
+                    % (
+                        str(arg_id),
+                        interfaces["aximm"][0][0],
+                        str(interfaces["aximm"][0][1]),
                     )
-                    m_axis_idx += 1
+                )
                 arg_id += 1
-                # add a axilite port if dynamic
-                # add a count parameter if dynamic
-                if node_inst.get_nodeattr("DynIters") == 1:
-                    assert axilite_intf_name is not None
-                    args_string.append(
-                        "{numReps:0:%s:%s:0x4:0x10:uint:0}"
-                        % (str(arg_id), axilite_intf_name)
-                    )
-                    arg_id += 1
-            elif node.op_type == "IODMA":
-                port_width = node_inst.get_nodeattr("intfWidth")
-                # add an address parameter
-                # add a count parameter
                 args_string.append(
-                    "{addr:1:%s:m_axi_gmem0:0x8:0x10:ap_uint&lt;%s>*:0}"
-                    % (str(arg_id), str(port_width))
+                    "{numReps:0:%s:%s:0x4:0x1C:uint:0}"
+                    % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
+            else:
                 args_string.append(
-                    "{numReps:0:%s:%s:0x4:0x1C:uint:0}"
+                    "{numReps:0:%s:%s:0x4:0x10:uint:0}"
                     % (str(arg_id), axilite_intf_name)
                 )
                 arg_id += 1
+        for intf in interfaces["s_axis"] + interfaces["m_axis"]:
+            stream_width = intf[1]
+            stream_name = intf[0]
+            args_string.append(
+                "{%s:4:%s:%s:0x0:0x0:ap_uint&lt;%s>:0}"
+                % (stream_name, str(arg_id), stream_name, str(stream_width))
+            )
+            arg_id += 1
 
         # save kernel xml then run package_xo
         xo_name = self.ip_name + ".xo"
@@ -342,6 +330,7 @@ class VitisLink(Transformation):
 
 class VitisBuild(Transformation):
     """Best-effort attempt at building the accelerator with Vitis.
+    It assumes the model has only fpgadataflow nodes.
 
     fpga_part: string identifying the target FPGA
     period_ns: target clock period
@@ -377,7 +366,6 @@ class VitisBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            MakePYNQDriver(platform="alveo"),
             InsertIODMA(512),
             InsertDWC(),
         ]
@@ -399,9 +387,7 @@ class VitisBuild(Transformation):
             dataflow_model_filename = sdp_node.get_nodeattr("model")
             kernel_model = ModelWrapper(dataflow_model_filename)
             kernel_model = kernel_model.transform(InsertFIFO())
-            kernel_model = kernel_model.transform(
-                InsertTLastMarker(both=True, external=False, dynamic=False)
-            )
+            kernel_model = kernel_model.transform(RemoveUnusedTensors())
             kernel_model = kernel_model.transform(GiveUniqueNodeNames())
             kernel_model.save(dataflow_model_filename)
             kernel_model = kernel_model.transform(
@@ -430,4 +416,6 @@ class VitisBuild(Transformation):
         # set platform attribute for correct remote execution
         model.set_metadata_prop("platform", "alveo")
 
+        # create driver
+        model = model.transform(MakePYNQDriver(platform="alveo"))
         return (model, False)
diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py
index c9259afabfa6fcae0020e378c79ce391c218408f..eedbf97f389754440a116cf8755c25d597c433ee 100644
--- a/tests/end2end/test_end2end_cybsec_mlp.py
+++ b/tests/end2end/test_end2end_cybsec_mlp.py
@@ -1,3 +1,31 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import torch
 from brevitas.nn import QuantLinear, QuantReLU
 import torch.nn as nn
diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py
new file mode 100644
index 0000000000000000000000000000000000000000..0407395ed57dc07c6700efcebbb1fc8a767877bb
--- /dev/null
+++ b/tests/end2end/test_ext_weights.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+import os
+import shutil
+from finn.util.test import get_build_env, load_test_checkpoint_or_skip
+import pytest
+from finn.util.basic import make_build_dir
+import pkg_resources as pk
+import wget
+import subprocess
+
+target_clk_ns = 10
+build_kind = "zynq"
+build_dir = os.environ["FINN_BUILD_DIR"]
+onnx_zip_url = "https://github.com/Xilinx/finn-examples"
+onnx_zip_url += "/releases/download/v0.0.1a/onnx-models-bnn-pynq.zip"
+onnx_zip_local = build_dir + "/onnx-models-bnn-pynq.zip"
+onnx_dir_local = build_dir + "/onnx-models-bnn-pynq"
+
+
+def get_checkpoint_name(step):
+    if step == "build":
+        # checkpoint for build step is an entire dir
+        return build_dir + "/end2end_ext_weights_build"
+    elif step == "download":
+        return onnx_dir_local + "/tfc-w1a1.onnx"
+    else:
+        # other checkpoints are onnx files
+        return build_dir + "/end2end_ext_weights_%s.onnx" % (step)
+
+
+def test_end2end_ext_weights_download():
+    if not os.path.isfile(onnx_zip_local):
+        wget.download(onnx_zip_url, out=onnx_zip_local)
+    assert os.path.isfile(onnx_zip_local)
+    subprocess.check_output(["unzip", "-o", onnx_zip_local, "-d", onnx_dir_local])
+    assert os.path.isfile(get_checkpoint_name("download"))
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_ext_weights_build():
+    model_file = get_checkpoint_name("download")
+    load_test_checkpoint_or_skip(model_file)
+    build_env = get_build_env(build_kind, target_clk_ns)
+    folding_config_file = pk.resource_filename(
+        "finn.qnn-data", "test_ext_weights/tfc-w1a1-extw.json"
+    )
+    output_dir = make_build_dir("test_end2end_ext_weights_build")
+    cfg = build.DataflowBuildConfig(
+        output_dir=output_dir,
+        folding_config_file=folding_config_file,
+        synth_clk_period_ns=target_clk_ns,
+        board=build_env["board"],
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.BITFILE,
+            build_cfg.DataflowOutputType.PYNQ_DRIVER,
+            build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
+        ],
+    )
+    build.build_dataflow_cfg(model_file, cfg)
+    assert os.path.isfile(output_dir + "/deploy/bitfile/finn-accel.bit")
+    assert os.path.isfile(output_dir + "/deploy/bitfile/finn-accel.hwh")
+    assert os.path.isfile(output_dir + "/deploy/driver/driver.py")
+    assert os.path.isfile(output_dir + "/deploy/driver/runtime_weights/idma0.npy")
+    if os.path.isdir(get_checkpoint_name("build")):
+        shutil.rmtree(get_checkpoint_name("build"))
+    shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build"))
+
+
+def test_end2end_ext_weights_run_on_hw():
+    build_env = get_build_env(build_kind, target_clk_ns)
+    deploy_dir = get_checkpoint_name("build")
+    if not os.path.isdir(deploy_dir):
+        pytest.skip(deploy_dir + " not found from previous test step, skipping")
+    driver_dir = deploy_dir + "/driver"
+    assert os.path.isdir(driver_dir)
+    # create a shell script for running validation: 10 batches x 10 imgs
+    with open(driver_dir + "/validate.sh", "w") as f:
+        f.write(
+            """#!/bin/bash
+cd %s/driver
+echo %s | sudo -S python3.6 validate.py --dataset mnist --bitfile %s
+        """
+            % (
+                build_env["target_dir"] + "/end2end_ext_weights_build",
+                build_env["password"],
+                "../bitfile/finn-accel.bit",
+            )
+        )
+    # set up rsync command
+    remote_target = "%s@%s:%s" % (
+        build_env["username"],
+        build_env["ip"],
+        build_env["target_dir"],
+    )
+    rsync_res = subprocess.run(
+        [
+            "sshpass",
+            "-p",
+            build_env["password"],
+            "rsync",
+            "-avz",
+            deploy_dir,
+            remote_target,
+        ]
+    )
+    assert rsync_res.returncode == 0
+    remote_verif_cmd = [
+        "sshpass",
+        "-p",
+        build_env["password"],
+        "ssh",
+        "%s@%s" % (build_env["username"], build_env["ip"]),
+        "sh",
+        build_env["target_dir"] + "/end2end_ext_weights_build/driver/validate.sh",
+    ]
+    verif_res = subprocess.run(
+        remote_verif_cmd,
+        stdout=subprocess.PIPE,
+        universal_newlines=True,
+        input=build_env["password"],
+    )
+    assert verif_res.returncode == 0
+    log_output = verif_res.stdout.split("\n")
+    assert log_output[-3] == "batch 100 / 100 : total OK 9296 NOK 704"
+    assert log_output[-2] == "Final accuracy: 92.960000"