diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index 0944304c70cc0e3416a136b14563b53470546fdd..97d37cd99aad227aa3ec36fc2b06d83cb4171aca 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -1,9 +1,75 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import numpy as np
+import math
 from onnx import TensorProto, helper
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow import HLSCustomOp
 
 
+# the IODMA bridges a memory-mapped AXI interface and an AXI stream
+# direction "in": pulls data from AXI-MM to AXI stream
+# direction "out": pushes data from AXI stream to AXI-MM
+
+# DMA Addressing
+# - burst mode can be "wrap" or "increment"
+# - "increment" bursts will increment the address when moving to the next image
+# - "wrap" bursts will reinitialize the address to the start address,
+#   and are useful for e.g. streaming weights, where the same buffer is
+#   repeatedly read into the FPGA
+# - no additional alignment restrictions beyond anything specified in the AXI spec
+
+# Interfaces
+# - AXI-MM interface width (in bits) is specified by intfWidth
+# - AXI-Stream interface width (in bits) is specified by streamWidth
+# - If intfWidth and streamWidth are not equal, the DMA core performs
+#   width conversion by going up to the least common multiple of the two
+#   bitwidths, e.g. intfWidth=32b -> 96b -> streamWidth=24b
+# - transfers occur in multiples of the AXI-MM interface width, therefore
+#   the total number of bits in the tensor must be a multiple of intfWidth
+# - transfers occur in multiples of the AXI-Stream interface width, therefore
+#   the total number of bits in the tensor must be a multiple of streamWidth
+# - both interface widths must be a multiple of 8b (AXI protocol requirement)
+# - in most systems, intfWidth is also restricted to a power of 2 (e.g. Vitis)
+#   but this is not universal so we don't check here explicitly
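+#
+# A worked example of the width conversion (illustrative values, not
+# prescribed by this patch): with intfWidth=32 and streamWidth=24 the
+# core converts via their least common multiple,
+#
+#   import math
+#   intf_w, stream_w = 32, 24
+#   lcm_w = (intf_w * stream_w) // math.gcd(intf_w, stream_w)  # -> 96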
+
+# Input/output tensor shapes
+# - The data being moved is a tensor of shape numInputVectors+[NumChannels]
+# - The data type of the tensor elements is specified by dataType
+# - on the stream side
+#       - the normal shape is the same as the ONNX tensor attached to it
+#       - the folded shape is computed from the stream width and the normal shape
+# - on the AXI-MM side
+#       - the normal shape is the same as the one on the stream side
+#       - the folded shape is not defined
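+#
+# A short folded-shape example (hypothetical values for illustration):
+# a tensor of normal shape (1, 32) with dataType=INT4 and streamWidth=32
+# packs 32/4 = 8 elements per stream word, giving folded shape (1, 4, 8)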
+
+
 class IODMA(HLSCustomOp):
     """Class that corresponds to finn-hlslib DMA function(s)."""
 
@@ -15,6 +81,8 @@ class IODMA(HLSCustomOp):
             "NumChannels": ("i", True, 0),
             # FINN input datatype
             "dataType": ("s", True, ""),
+            # Stream parameters
+            "streamWidth": ("i", False, 32),
             # DMA-specific parameters
             "intfWidth": ("i", False, 32),
             "burstMode": ("s", False, "increment"),
@@ -35,17 +103,38 @@ class IODMA(HLSCustomOp):
         return self.get_normal_input_shape()
 
     def get_folded_input_shape(self):
-        shape = list(self.get_normal_input_shape())
-        itype_bits = self.get_input_datatype().bitwidth()
-        intfw = self.get_nodeattr("intfWidth")
-        elems_per_word = intfw / itype_bits
-        fold_depth = round(shape[-1] / elems_per_word)
-        shape[-1] = fold_depth
-        shape.append(elems_per_word)
-        return tuple(shape)
+        if self.get_nodeattr("direction") == "in":
+            raise ValueError("Folded input shape not defined for input IODMA")
+        else:
+            shape = list(self.get_normal_input_shape())
+            itype_bits = self.get_input_datatype().bitwidth()
+            intfw = self.get_nodeattr("streamWidth")
+            assert (
+                intfw % itype_bits == 0
+            ), "Input stream width must be a multiple of datatype bits"
+            elems_per_word = intfw // itype_bits
+            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
+            fold_depth = shape[-1] // elems_per_word
+            shape[-1] = fold_depth
+            shape.append(elems_per_word)
+            return tuple(shape)
 
     def get_folded_output_shape(self):
-        return self.get_folded_input_shape()
+        if self.get_nodeattr("direction") == "out":
+            raise ValueError("Folded output shape not defined for output IODMA")
+        else:
+            shape = list(self.get_normal_output_shape())
+            itype_bits = self.get_output_datatype().bitwidth()
+            intfw = self.get_nodeattr("streamWidth")
+            assert (
+                intfw % itype_bits == 0
+            ), "Input stream width must be a multiple of datatype bits"
+            elems_per_word = intfw // itype_bits
+            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
+            fold_depth = shape[-1] // elems_per_word
+            shape[-1] = fold_depth
+            shape.append(elems_per_word)
+            return tuple(shape)
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
@@ -86,10 +175,16 @@ class IODMA(HLSCustomOp):
         return self.get_input_datatype()
 
     def get_instream_width(self):
-        return self.get_nodeattr("intfWidth")
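+        # direction "in": the input is the AXI-MM side, so it has intfWidth;
+        # otherwise the input is the on-chip AXI stream with streamWidth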
+        if self.get_nodeattr("direction") == "in":
+            return self.get_nodeattr("intfWidth")
+        else:
+            return self.get_nodeattr("streamWidth")
 
     def get_outstream_width(self):
-        return self.get_instream_width()
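+        # direction "out": the output is the AXI-MM side, so it has intfWidth;
+        # otherwise the output is the on-chip AXI stream with streamWidth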
+        if self.get_nodeattr("direction") == "out":
+            return self.get_nodeattr("intfWidth")
+        else:
+            return self.get_nodeattr("streamWidth")
 
     def get_number_output_values(self):
         oshape = self.get_normal_output_shape()
@@ -103,6 +198,7 @@ class IODMA(HLSCustomOp):
 
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "dma.h"']
+        self.code_gen_dict["$GLOBALS$"].append('#include "streamtools.h"')
 
     def defines(self, var):
         itype_bits = self.get_input_datatype().bitwidth()
@@ -115,6 +211,13 @@ class IODMA(HLSCustomOp):
             )
         ]
 
+    def get_ap_int_max_w(self):
+        "Return the maximum width of any ap_int used in this module."
+        instream = self.get_instream_width()
+        outstream = self.get_outstream_width()
+        width_lcm = (instream * outstream) // math.gcd(instream, outstream)
+        return width_lcm
+
     def docompute(self):
         direction = self.get_nodeattr("direction")
         mode = self.get_nodeattr("burstMode")
@@ -123,25 +226,53 @@ class IODMA(HLSCustomOp):
                 func = "Mem2Stream_Batch_external_wmem"
             else:
                 func = "Mem2Stream_Batch"
+            dwc_func = "WidthAdjustedOutputStream"
         else:
             func = "Stream2Mem_Batch"
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<DataWidth1, NumBytes1>(in0, out, numReps);""".format(func,)
-        ]
+            dwc_func = "WidthAdjustedInputStream"
+        # define templates for instantiation
+        dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);"
+        dwc_inst_template = dwc_func + "<%d, %d, %d> %s(%s, numReps);"
+        # do stream infrastructure and instantiations
+        intfw = self.get_nodeattr("intfWidth")
+        strmw = self.get_nodeattr("streamWidth")
+        width_lcm = (strmw * intfw) // math.gcd(strmw, intfw)
+        # we always need two streams: one of width_lcm and one of intfw width,
+        # because the WidthAdjusted*Stream converters bridge the two
+        # interface widths via their least common multiple
+        dtype_bits = self.get_input_datatype().bitwidth()
+        total_bits = dtype_bits * np.prod(self.get_normal_input_shape())
+        if direction == "in":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                dwc_inst_template
+                % (width_lcm, strmw, total_bits // width_lcm, "dwc_lcm", "out"),
+                dwc_inst_template
+                % (intfw, width_lcm, total_bits // intfw, "dwc_intfw", "dwc_lcm"),
+                dma_inst_template % ("in0", "dwc_intfw"),
+            ]
+        else:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                dwc_inst_template
+                % (strmw, width_lcm, total_bits // strmw, "dwc_lcm", "in0"),
+                dwc_inst_template
+                % (width_lcm, intfw, total_bits // width_lcm, "dwc_intfw", "dwc_lcm"),
+                dma_inst_template % ("dwc_intfw", "out"),
+            ]
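+        # for illustration (hypothetical widths, not fixed by this patch):
+        # direction="in" with intfWidth=32 and streamWidth=24 emits roughly
+        #   WidthAdjustedOutputStream<96, 24, total_bits/96> dwc_lcm(out, numReps);
+        #   WidthAdjustedOutputStream<32, 96, total_bits/32> dwc_intfw(dwc_lcm, numReps);
+        #   Mem2Stream_Batch<DataWidth1, NumBytes1>(in0, dwc_intfw, numReps);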
 
     def blackboxfunction(self):
-        packed_bits = self.get_instream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
+        packed_ibits = self.get_instream_width()
+        packed_hls_type_in = "ap_uint<%d>" % packed_ibits
+        packed_obits = self.get_outstream_width()
+        packed_hls_type_out = "ap_uint<%d>" % packed_obits
         direction = self.get_nodeattr("direction")
         if direction == "in":
             self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                 "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)"
-                % (self.onnx_node.name, packed_hls_type, packed_hls_type)
+                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
             ]
         else:
             self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                 "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)"
-                % (self.onnx_node.name, packed_hls_type, packed_hls_type)
+                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
             ]
 
     def pragmas(self):
@@ -172,6 +303,7 @@ class IODMA(HLSCustomOp):
             self.code_gen_dict["$PRAGMAS$"].append(
                 "#pragma HLS INTERFACE s_axilite port=out bundle=control"
             )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW")
 
     def execute_node(self, context, graph):
         pass
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 22e8e2876a7249040355fc02df4a70a6b2afcb01..e3808114adf4669513f7c3cdb6cf587c37412e13 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -29,7 +29,10 @@
 from onnx import TensorProto
 from onnx import helper as oh
 
+from finn.util.basic import get_by_name
+from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
+from finn.transformation.general import SortGraph
 import math
 import numpy as np
 
@@ -45,12 +48,30 @@ class InsertIODMA(Transformation):
         self.max_intfwidth = max_intfwidth
 
     def apply(self, model):
-        # TODO only makes sense for a pure fpgadataflow graph -- check!
+        # only makes sense for a pure fpgadataflow graph -- so we check!
+        all_nodes = list(model.graph.node)
+        assert all(
+            get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
+            for x in all_nodes
+        ), "InsertIODMA only supports pure fpgadataflow graphs"
+        # parse StreamingFCLayer nodes looking for external weights with no
+        # attached IODMA
+        fc_extw_nodes = list(
+            filter(
+                lambda x: x.op_type == "StreamingFCLayer_Batch"
+                and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") == "external"
+                and model.find_producer(x.input[1]) is None,
+                all_nodes,
+            )
+        )
         graph_in_name = model.graph.input[0].name
         first_node = model.find_consumer(graph_in_name)
         graph_out_name = model.graph.output[0].name
         final_node = model.find_producer(graph_out_name)
-        if final_node.op_type == "IODMA" and first_node.op_type == "IODMA":
+        if (
+            final_node.op_type == "IODMA"
+            and first_node.op_type == "IODMA"
+            and len(fc_extw_nodes) == 0
+        ):
             # TODO maybe check the correctness of properties
             return (model, False)
         else:
@@ -63,6 +84,8 @@ class InsertIODMA(Transformation):
                 assert (
                     intfwidth % 8 == 0
                 ), "No feasible interface width for transfer size"
+                # get width of stream input to DMA
+                streamWidth = getCustomOp(final_node).get_outstream_width()
                 # make new buffer
                 final_node_out = oh.make_tensor_value_info(
                     model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
@@ -79,6 +102,7 @@ class InsertIODMA(Transformation):
                     NumChannels=out_shape[-1],
                     dataType=str(out_dtype.name),
                     intfWidth=intfwidth,
+                    streamWidth=streamWidth,
                     direction="out",
                     domain="finn",
                     backend="fpgadataflow",
@@ -93,6 +117,8 @@ class InsertIODMA(Transformation):
                 assert (
                     intfwidth % 8 == 0
                 ), "No feasible interface width for transfer size"
+                # get width of stream output from DMA
+                streamWidth = getCustomOp(first_node).get_instream_width()
                 # make new buffer
                 first_node_in = oh.make_tensor_value_info(
                     model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
@@ -109,10 +135,47 @@ class InsertIODMA(Transformation):
                     NumChannels=in_shape[-1],
                     dataType=str(in_dtype.name),
                     intfWidth=intfwidth,
+                    streamWidth=streamWidth,
                     direction="in",
                     domain="finn",
                     backend="fpgadataflow",
                 )
                 model.graph.node.insert(0, dma_node)
-
+            for fc_node in fc_extw_nodes:
+                fc_w_name = fc_node.input[1]
+                w_shape = model.get_tensor_shape(fc_w_name)
+                w_dtype = model.get_tensor_datatype(fc_w_name)
+                # determine the feasible interface width
+                transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
+                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
+                assert (
+                    intfwidth % 8 == 0
+                ), "No feasible interface width for transfer size"
+                # calculate width of stream output from DMA
+                pe = get_by_name(fc_node.attribute, "PE").i
+                simd = get_by_name(fc_node.attribute, "SIMD").i
+                streamWidth = simd * pe * w_dtype.bitwidth()
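+                # e.g. (illustrative values): PE=4, SIMD=8 and INT4 weights
+                # give a 4 * 8 * 4 = 128b wide weight stream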
+                # make new buffer
+                fc_node_in = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
+                )
+                model.graph.value_info.append(fc_node_in)
+                model.set_tensor_datatype(fc_node_in.name, w_dtype)
+                dma_node = oh.make_node(
+                    "IODMA",
+                    [fc_w_name],
+                    [fc_node_in.name],
+                    numInputVectors=w_shape[:-1],
+                    NumChannels=w_shape[-1],
+                    dataType=str(w_dtype.name),
+                    intfWidth=intfwidth,
+                    streamWidth=streamWidth,
+                    direction="in",
+                    burstMode="wrap",
+                    domain="finn",
+                    backend="fpgadataflow",
+                )
+                fc_node.input[1] = fc_node_in.name
+                model.graph.node.insert(0, dma_node)
+            model = model.transform(SortGraph())
             return (model, True)