diff --git a/src/finn/custom_op/streamingdataflowpartition.py b/src/finn/custom_op/streamingdataflowpartition.py
index b63326d676f4ded5ec1dd62f5cc7f02d7acb82ad..bce4dde426b8838d6c86638a3641d51ab259a6db 100644
--- a/src/finn/custom_op/streamingdataflowpartition.py
+++ b/src/finn/custom_op/streamingdataflowpartition.py
@@ -83,7 +83,7 @@ class StreamingDataflowPartition(CustomOp):
             )
 
         # verify the number of inputs
-        if len(self.onnx_node.input) == 1:
+        if len(self.onnx_node.input) >= 1:
             info_messages.append("The number of inputs is correct")
         else:
             info_messages.append("StreamingDataflowPartition needs 1 data input")
diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
index e0f990600d9ca4be748b662b47ce8296d3d462ce..7197e68be2fbdf5fc39b7ed202e88672614514ec 100644
--- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
+++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py
@@ -45,58 +45,89 @@ class CreateDataflowPartition(Transformation):
         super().__init__()
 
     def apply(self, model):
-        # TODO we currently assume that all dataflow nodes are connected to
-        # each other, forming a single partition. check the assumption and/or
-        # improve this.
-        all_nodes = list(model.graph.node)
-        df_nodes = filter(
-            lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes
-        )
-        df_nodes = filter(
-            lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8")
-            == "fpgadataflow",
-            df_nodes,
-        )
-        df_nodes = list(df_nodes)
-        non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes)
-        non_df_nodes = list(non_df_nodes)
-
-        if len(df_nodes) == 0:
-            # no changes if no dataflow nodes are present
-            return (model, False)
-        else:
-            # partition the model into two models
-            df_model = copy.deepcopy(model)
-            non_df_model = model
-            # remove all non-dataflow nodes from the dataflow model
-            for node_to_remove in non_df_nodes:
-                df_model.graph.node.remove(node_to_remove)
-            # identify the entry and exit points for the dataflow part
-            df_in = df_model.graph.node[0].input[0]
-            df_out = df_model.graph.node[-1].output[0]
-            df_in_vi = df_model.get_tensor_valueinfo(df_in)
-            df_out_vi = df_model.get_tensor_valueinfo(df_out)
-            # set df graph in/out to be df_in/df_out
-            df_model.graph.input.remove(df_model.graph.input[0])
-            df_model.graph.input.insert(0, df_in_vi)
-            df_model.graph.output.remove(df_model.graph.output[0])
-            df_model.graph.output.insert(0, df_out_vi)
-            df_model_dir = make_build_dir("dataflow_partition_")
-            df_model_filename = df_model_dir + "/df_model.onnx"
-            df_model.save(df_model_filename)
-            # remove all dataflow nodes from the non-dataflow model
-            # keep track of where the dataflow part starts
-            df_start_ind = all_nodes.index(df_nodes[0])
-            for node_to_remove in df_nodes:
-                non_df_model.graph.node.remove(node_to_remove)
-            # create StreamingDataflow node with df_in/df_out io
-            df_node = helper.make_node(
-                "StreamingDataflowPartition",
-                [df_in],
-                [df_out],
-                # use the model attribute to mark the df model
-                model=df_model_filename,
+        target_partition_id = 0
+        # we currently assume that all dataflow nodes belonging to the same partition
+        # are connected to each other and there is a single input/output to/from each.
+        # NOTE: all dataflow nodes with no partition_id set are moved to partition 0
+        # TODO: check the assumption and/or improve this.
+        while True:
+            all_nodes = list(model.graph.node)
+            df_nodes = filter(
+                lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes
+            )
+            df_nodes = filter(
+                lambda x: get_by_name(x.attribute, "backend").s.decode("UTF-8")
+                == "fpgadataflow"
+                and (
+                    get_by_name(x.attribute, "partition_id") is None
+                    or get_by_name(x.attribute, "partition_id").i == target_partition_id
+                )
+                and x.op_type != "StreamingDataflowPartition",
+                df_nodes,
             )
-            non_df_model.graph.node.insert(df_start_ind, df_node)
+            df_nodes = list(df_nodes)
+            non_df_nodes = filter(lambda x: x not in df_nodes, all_nodes)
+            non_df_nodes = list(non_df_nodes)
+
+            if len(df_nodes) == 0:
+                # no changes if no dataflow nodes are present
+                break
+            else:
+                # partition the model into two models
+                df_model = copy.deepcopy(model)
+                non_df_model = model
+                # remove all non-dataflow nodes from the dataflow model
+                for node_to_remove in non_df_nodes:
+                    df_model.graph.node.remove(node_to_remove)
+                # identify the entry and exit points for the dataflow part
+                df_in = df_model.graph.node[0].input[0]
+                df_out = df_model.graph.node[-1].output[0]
+                df_in_vi = df_model.get_tensor_valueinfo(df_in)
+                df_out_vi = df_model.get_tensor_valueinfo(df_out)
+                # set df graph in/out to be df_in/df_out
+                df_model.graph.input.remove(df_model.graph.input[0])
+                df_model.graph.input.insert(0, df_in_vi)
+                df_model.graph.output.remove(df_model.graph.output[0])
+                df_model.graph.output.insert(0, df_out_vi)
+                # parse StreamingFCLayers looking for external weight memories
+                fc_extw_nodes = filter(
+                    lambda x: x.op_type == "StreamingFCLayer_Batch"
+                    and get_by_name(x.attribute, "mem_mode") is not None
+                    and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8")
+                    == "external",
+                    df_nodes,
+                )
+                fc_extw_nodes = list(fc_extw_nodes)
+                extra_df_inputs = []
+
+                # expose each external weight tensor as an extra graph input
+                for i, fc_node in enumerate(fc_extw_nodes, start=1):
+                    fc_w_name = fc_node.input[1]
+                    fc_weight_vi = df_model.get_tensor_valueinfo(fc_w_name)
+                    df_model.graph.input.insert(i, fc_weight_vi)
+                    extra_df_inputs.append(fc_w_name)
+
+                # save model
+                df_model_dir = make_build_dir(
+                    "dataflow_partition" + str(target_partition_id) + "_"
+                )
+                df_model_filename = df_model_dir + "/df_model.onnx"
+                df_model.save(df_model_filename)
+                # remove all dataflow nodes from the non-dataflow model
+                # keep track of where the dataflow part starts
+                df_start_ind = all_nodes.index(df_nodes[0])
+                for node_to_remove in df_nodes:
+                    non_df_model.graph.node.remove(node_to_remove)
+                # create partition node: df_in + external weight inputs -> df_out
+                df_node = helper.make_node(
+                    "StreamingDataflowPartition",
+                    [df_in] + extra_df_inputs,
+                    [df_out],
+                    # the model attribute stores the path of the saved df model
+                    model=df_model_filename,
+                )
+                non_df_model.graph.node.insert(df_start_ind, df_node)
+                model = non_df_model
+                target_partition_id += 1
 
-        return (non_df_model, False)
+        return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index b01f8cbe5c48db6c5288b2db1a8b009ea09ce6c0..85a2d47be0599a852b223f1a65d3ec04efe9bda7 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -118,8 +118,11 @@ class InsertFIFO(Transformation):
                         graph_modified = True
 
         if graph_modified is False:
-            # insert FIFO as first node
-            if graph.node[0].op_type != "StreamingFIFO":
+            # insert FIFO as first node, except when first node is DMA
+            if (
+                graph.node[0].op_type != "StreamingFIFO"
+                and graph.node[0].op_type != "IODMA"
+            ):
                 n = graph.node[0]
                 n_input = n.input[0]
                 n0 = getCustomOp(n)
@@ -153,8 +156,11 @@ class InsertFIFO(Transformation):
                 # set fifo output tensor as new input tensor of second node
                 n.input[0] = fifo_output_tensor.name
 
-            # insert FIFO as last node
-            if graph.node[-1].op_type != "StreamingFIFO":
+            # insert FIFO as last node, except when last node is DMA
+            if (
+                graph.node[-1].op_type != "StreamingFIFO"
+                and graph.node[-1].op_type != "IODMA"
+            ):
                 n = graph.node[-1]
                 assert (
                     n.op_type != "TLastMarker"
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index e4368edea717f7499481e9b1c6ac20f7d5bb5f58..0cd7c0d4d41accf8cdba8adfaf4dbb00fc0cab7a 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -171,6 +171,7 @@ class InsertIODMA(Transformation):
                 # calculate width of stream output from DMA
                 pe = get_by_name(fc_node.attribute, "PE").i
                 simd = get_by_name(fc_node.attribute, "SIMD").i
+                assert pe * simd == w_shape[0], "Malformed weight matrix"
                 streamWidth = simd * pe * w_dtype.bitwidth()
                 # make new buffer
                 fc_node_in = oh.make_tensor_value_info(
@@ -178,12 +179,13 @@ class InsertIODMA(Transformation):
                 )
                 model.graph.value_info.append(fc_node_in)
                 model.set_tensor_datatype(fc_node_in.name, w_dtype)
+                model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name))
                 dma_node = oh.make_node(
                     "IODMA",
                     [fc_w_name],
                     [fc_node_in.name],
-                    numInputVectors=w_shape[:-1],
-                    NumChannels=w_shape[-1],
+                    numInputVectors=[w_shape[1]],
+                    NumChannels=w_shape[0],
                     dataType=str(w_dtype.name),
                     intfWidth=intfwidth,
                     streamWidth=streamWidth,
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 04dd437af27b9fbe18b2255c20a8e4acda03b3d0..bbb0e43fda464e919a7d8c9dcd25e08a49b33cec 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -38,7 +38,8 @@ import numpy as np
 
 class InsertTLastMarker(Transformation):
     """Ensure that the graph is started/terminated with a TLastMarker node, inserting
-    one if necessary. Use constructor args to determine type of TLastMarker to be inserted.
+    one if necessary.
+    Use constructor args to determine type of TLastMarker to be inserted.
     More information available on the TLastMarker documentation.
     """
 
@@ -90,41 +91,78 @@ class InsertTLastMarker(Transformation):
             graph_modified = True
         # if both is True, also insert marker on input
         if self.both:
-            graph_in_name = model.graph.input[0].name
-            first_node = model.find_consumer(graph_in_name)
-            if first_node.op_type != "TLastMarker" and not (
-                first_node.op_type == "IODMA"
-                and get_by_name(first_node.attribute, "direction").s.decode("UTF-8")
-                == "in"
-            ):
+            # detect and parse graph inputs
+            insert_idx = 0
+            graph_in_names = [x.name for x in model.graph.input]
+            for graph_in_name in graph_in_names:
+                first_node = model.find_consumers(graph_in_name)
+                # skip if no consumers (this may be the case for unused initializers)
+                # TODO: fix this with a cleanup transform
+                if first_node is None:
+                    continue
+                assert len(first_node) == 1, "Input fans out to multiple nodes"
+                first_node = first_node[0]
+                # several scenarios exclude the node:
+                # 1. node is a FC layer with internal weights, in which case
+                #    the input is in the list of graph inputs because it has an
+                #    initializer (TODO: fix this with a clean-up transform)
+                # a node without a mem_mode attribute is not external either
+                mem_mode = get_by_name(first_node.attribute, "mem_mode")
+                if first_node.op_type == "StreamingFCLayer_Batch" and (
+                    mem_mode is None or mem_mode.s.decode("UTF-8") != "external"
+                ):
+                    continue
+                # 2. node is either a TLastMarker or an input IODMA
+                if first_node.op_type != "TLastMarker" and not (
+                    first_node.op_type == "IODMA"
+                    and get_by_name(first_node.attribute, "direction").s.decode("UTF-8")
+                    == "in"
+                ):
 
-                custom_op = getCustomOp(first_node)
-                num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
-                stream_width = int(custom_op.get_instream_width())
-                in_shape = model.get_tensor_shape(graph_in_name)
-                in_dtype = model.get_tensor_datatype(graph_in_name)
-                elem_width = in_dtype.bitwidth()
-                # make new buffer
-                first_node_in = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
-                )
-                model.graph.value_info.append(first_node_in)
-                model.set_tensor_datatype(first_node_in.name, in_dtype)
-                # reroute final node output to first_node_in_name
-                first_node.input[0] = first_node_in.name
-                tlast_node = oh.make_node(
-                    "TLastMarker",
-                    [graph_in_name],
-                    [first_node_in.name],
-                    NumIters=num_iters,
-                    StreamWidth=stream_width,
-                    ElemWidth=elem_width,
-                    DynIters=(1 if self.dyniters else 0),
-                    Direction="in",
-                    Protocol=("external" if self.external else "internal"),
-                    domain="finn",
-                    backend="fpgadataflow",
-                )
-                model.graph.node.insert(0, tlast_node)
-                graph_modified = True
+                    custom_op = getCustomOp(first_node)
+                    num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
+                    inp_idx = list(first_node.input).index(graph_in_name)
+                    if inp_idx > 0:
+                        if (
+                            first_node.op_type == "StreamingFCLayer_Batch"
+                            and inp_idx == 1
+                        ):
+                            stream_width = int(custom_op.get_weightstream_width())
+                        elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1:
+                            stream_width = int(custom_op.get_instream_width())
+                        else:
+                            raise Exception("No method to determine stream width")
+                    else:
+                        stream_width = int(custom_op.get_instream_width())
+                    in_shape = model.get_tensor_shape(graph_in_name)
+                    in_dtype = model.get_tensor_datatype(graph_in_name)
+                    elem_width = in_dtype.bitwidth()
+                    # make new buffer
+                    first_node_in = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
+                    )
+                    model.graph.value_info.append(first_node_in)
+                    model.set_tensor_datatype(first_node_in.name, in_dtype)
+                    ini = model.get_initializer(graph_in_name)
+                    # copy initializer if it exists
+                    if ini is not None:
+                        model.set_initializer(first_node_in.name, ini)
+                    # rewire the consumer's input to read from the new buffer
+                    first_node.input[inp_idx] = first_node_in.name
+                    tlast_node = oh.make_node(
+                        "TLastMarker",
+                        [graph_in_name],
+                        [first_node_in.name],
+                        NumIters=num_iters,
+                        StreamWidth=stream_width,
+                        ElemWidth=elem_width,
+                        DynIters=(1 if self.dyniters else 0),
+                        Direction="in",
+                        Protocol=("external" if self.external else "internal"),
+                        domain="finn",
+                        backend="fpgadataflow",
+                    )
+                    model.graph.node.insert(insert_idx, tlast_node)
+                    graph_modified = True
+                    insert_idx += 1
         return (model, graph_modified)