diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index c0ac1319dd520794afd66f187b35e529739e5cd7..1ce936cd79c2257897e74430d00e5082c51c9320 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -29,11 +29,9 @@ def _suitable_node(node):
 
 
 def _suitable_folded_shapes(ishape, oshape):
-    i_dummy = np.random.rand(*ishape)
-    o_dummy = np.random.rand(*oshape)
-    ishape_canonical = np.squeeze(i_dummy).shape
-    oshape_canonical = np.squeeze(o_dummy).shape
-    return ishape_canonical == oshape_canonical
+    matching_stream_width = ishape[-1] == oshape[-1]
+    matching_size = np.prod(ishape) == np.prod(oshape)
+    return matching_stream_width and matching_size
 
 
 class InsertFIFO(Transformation):
diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py
index cb8deaeec4b79d3c47d7705ff8f9bf72a085dfc0..990b858ad62aec00be4be4e0dd30bef3eb9e3ce3 100644
--- a/src/finn/transformation/move_reshape.py
+++ b/src/finn/transformation/move_reshape.py
@@ -1,5 +1,7 @@
 from finn.transformation.base import Transformation
 from finn.util.basic import get_by_name, is_finn_op
+from finn.custom_op.registry import getCustomOp
+import warnings
 
 
 def _is_fpgadataflow_node(node):
@@ -18,33 +20,66 @@ def _is_fpgadataflow_node(node):
 
 
 class RemoveCNVtoFCFlatten(Transformation):
-    """Removes a node that implements a (1, -1) reshape if it is
-    between two fpgadataflow nodes"""
+    """Removes a flatten node if it is between two fpgadataflow nodes.
+    For an NHWC-Conv to FC transition, the preceding transpose is absorbed.
+    The flatten operation can also be implemented by a reshape node."""
 
     def apply(self, model):
-
         graph = model.graph
         graph_modified = False
         for n in graph.node:
-            if n.op_type == "Reshape":
-                shape = model.get_initializer(n.input[1])
-                if (shape == [1, -1]).all():
+            # also support implicit flatten via reshape, e.g. reshape(1,-1)
+            if n.op_type == "Flatten" or n.op_type == "Reshape":
+                ishape = model.get_tensor_shape(n.input[0])
+                oshape = model.get_tensor_shape(n.output[0])
+                if len(oshape) == 2 and ishape[0] == oshape[0]:
                     producer = model.find_producer(n.input[0])
                     if _is_fpgadataflow_node(producer) is True:
+                        # standalone flatten, remove
                         consumer = model.find_consumer(n.output[0])
                         if _is_fpgadataflow_node(consumer) is True:
                             graph_modified = True
                             consumer.input[0] = n.input[0]
                             graph.node.remove(n)
                     elif producer.op_type == "Transpose":
+                        # transpose + flatten, absorb into following node
                         transp_node = producer
-                        producer = model.find_producer(transp_node.input[0])
-                        if _is_fpgadataflow_node(producer) is True:
-                            consumer = model.find_consumer(n.output[0])
-                            if _is_fpgadataflow_node(consumer) is True:
-                                graph_modified = True
-                                consumer.input[0] = transp_node.input[0]
-                                graph.node.remove(n)
-                                graph.node.remove(transp_node)
+                        # check if transpose converts NHWC to NCHW
+                        perms = list(get_by_name(transp_node.attribute, "perm").ints)
+                        if perms == [0, 3, 1, 2]:
+                            producer = model.find_producer(transp_node.input[0])
+                            if _is_fpgadataflow_node(producer) is True:
+                                consumer = model.find_consumer(n.output[0])
+                                if consumer.op_type == "StreamingFCLayer_Batch":
+                                    fc_inst = getCustomOp(consumer)
+                                    mw = fc_inst.get_nodeattr("MW")
+                                    mh = fc_inst.get_nodeattr("MH")
+                                    (b, h, w, c) = model.get_tensor_shape(
+                                        transp_node.input[0]
+                                    )
+                                    # absorb transpose into weight matrix,
+                                    # allowing FC layer to operate on the NHWC input
+                                    W = model.get_initializer(consumer.input[1])
+                                    assert (
+                                        W is not None
+                                    ), "Initializer for matmul weights is not set."
+                                    W_new = W.reshape(c, h, w, mh)
+                                    W_new = W_new.transpose((1, 2, 0, 3))
+                                    W_new = W_new.reshape(mw, mh)
+                                    model.set_initializer(consumer.input[1], W_new)
+                                    # remove transpose & flatten nodes
+                                    consumer.input[0] = transp_node.input[0]
+                                    graph.node.remove(n)
+                                    graph.node.remove(transp_node)
+                                    graph_modified = True
+                                else:
+                                    warnings.warn(
+                                        "Could not absorb transpose->flatten \
+                                        into subsequent node"
+                                    )
+                        else:
+                            warnings.warn(
+                                "Unsupported transpose node before flatten layer"
+                            )
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index fa2d7a714ad894ebb19099c7ed73e42e12ffdf44..9b842162f7f751c60b18bbd288ff96ef28d3aa88 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -309,7 +309,8 @@ class Absorb1BitMulIntoConv(Transformation):
 
 class AbsorbTransposeIntoMultiThreshold(Transformation):
     """Change (NHWCTranpose -> MultiThreshold -> NCHWTranspose) to (MultiThreshold)
-    with NHWC mode."""
+    with NHWC mode. For (NHWCTranspose -> MultiThreshold -> Flatten), move Transpose
+    past MultiThreshold to prepare for the RemoveCNVtoFCFlatten() transformation."""
 
     def apply(self, model):
         graph = model.graph
@@ -338,23 +339,34 @@ class AbsorbTransposeIntoMultiThreshold(Transformation):
                                graph.node.remove(n)
                                graph.node.remove(final_t_cand)
                                graph_modified = True
-                        elif final_t_cand.op_type == "Reshape":
+                        # also support implicit flatten via reshape, e.g. reshape(1,-1)
+                        elif (
+                            final_t_cand.op_type == "Flatten"
+                            or final_t_cand.op_type == "Reshape"
+                        ):
+                            ishape = model.get_tensor_shape(final_t_cand.input[0])
                             oshape = model.get_tensor_shape(final_t_cand.output[0])
-                            if len(oshape) == 2:
+                            if len(oshape) == 2 and ishape[0] == oshape[0]:
                                 # transition to FC part, can still use NHWC
                                 mt = getCustomOp(mt_cand)
                                 mt.set_nodeattr("data_layout", "NHWC")
                                 # get rid of first tranpose node
                                 mt_cand.input[0] = n.input[0]
+                                graph.node.remove(n)
                                 # fix output shape for MultiThreshold
                                 mt_ishape = model.get_tensor_shape(mt_cand.input[0])
                                 (b, h, w, c) = mt_ishape
-                                assert (
-                                    h == 1 and w == 1
-                                ), """Untested spatial dim
-                                in conv->fc transition, proceed with caution!"""
                                 model.set_tensor_shape(mt_cand.output[0], mt_ishape)
-                                graph.node.remove(n)
+                                # re-insert Transpose behind MultiThreshold
+                                transpose_output = model.make_new_valueinfo_name()
+                                new_transpose = oh.make_node(
+                                    "Transpose",
+                                    [mt_cand.output[0]],
+                                    [transpose_output],
+                                    perm=[0, 3, 1, 2],
+                                )
+                                graph.node.insert(node_ind + 1, new_transpose)
+                                final_t_cand.input[0] = transpose_output
                                 graph_modified = True
         if graph_modified:
             model = model.transform(InferDataTypes())
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
new file mode 100755
index 0000000000000000000000000000000000000000..a67a22fed294a1715293d42b824dc895db752496
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
@@ -0,0 +1,248 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from onnx import TensorProto, helper
+import numpy as np
+import pytest
+
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import gen_finn_dt_tensor
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.custom_op.general.im2col import compute_conv_output_dim
+
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.general import RemoveUnusedTensors
+from finn.transformation.streamline import Streamline
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants
+
+import finn.core.data_layout as DataLayout
+
+
+def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
+    if seed is not None:
+        np.random.seed(seed)
+    steps = np.random.rand(channels, 1) * 30
+    bias = np.random.rand(channels, 1) * -10
+    thres = [np.arange(num_of_thres) for chn in range(channels)]
+    thres = ((thres + bias) * steps).astype(np.float32)
+    thres = np.round(thres)
+    return thres
+
+
+# conv_config: input_shape, kernel_shape, stride, pad
+@pytest.mark.parametrize(
+    "conv_config",
+    [
+        ((6, 6), (3, 3), (1, 1), (1, 1)),
+        # TODO: enable 1d conv test cases
+        # ((12, 1), (3, 1), (1, 1), (1, 0)),
+        # ((1, 15), (1, 5), (1, 1), (0, 2)),
+    ],
+)
+@pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("use_reshape", [False, True])
+def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
+    np.random.seed(0)
+    idt = DataType.UINT4
+    odt = DataType.UINT4
+    conv_weight_dt = DataType.INT4
+    fc_weight_dt = DataType.INT4
+
+    input_shape, kernel_shape, stride, pad = conv_config
+    kernel_size_h, kernel_size_w = kernel_shape
+    input_size_h, input_size_w = input_shape
+    stride_h, stride_w = stride
+    pad_h, pad_w = pad
+
+    in_chn = 4
+    fc_filters = 16
+
+    if depthwise is True:
+        group = out_chn = in_chn
+        conv_param_shape = [out_chn, 1, kernel_size_h, kernel_size_w]
+    else:
+        group = 1
+        out_chn = 8
+        conv_param_shape = [out_chn, in_chn, kernel_size_h, kernel_size_w]
+
+    output_size_h = compute_conv_output_dim(
+        input_size_h, kernel_size_h, stride_h, 2 * pad_h
+    )
+    output_size_w = compute_conv_output_dim(
+        input_size_w, kernel_size_w, stride_w, 2 * pad_w
+    )
+
+    input_shape = [1, in_chn, input_size_h, input_size_w]
+    fc_param_shape = [out_chn * output_size_h * output_size_w, fc_filters]
+    output_shape = [1, fc_filters]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = group
+    conv_config["kernel_shape"] = [kernel_size_h, kernel_size_w]
+    conv_config["pads"] = [pad_h, pad_w, pad_h, pad_w]
+    conv_config["strides"] = [stride_h, stride_w]
+
+    global_in = helper.make_tensor_value_info(
+        "global_in", TensorProto.FLOAT, input_shape
+    )
+    global_out = helper.make_tensor_value_info(
+        "global_out", TensorProto.FLOAT, output_shape
+    )
+    value_info = [
+        helper.make_tensor_value_info(
+            "conv_param", TensorProto.FLOAT, conv_param_shape
+        ),
+        helper.make_tensor_value_info("thres1_param", TensorProto.FLOAT, (out_chn, 15)),
+        helper.make_tensor_value_info(
+            "matmul_param", TensorProto.FLOAT, fc_param_shape
+        ),
+        helper.make_tensor_value_info(
+            "thres2_param", TensorProto.FLOAT, (fc_filters, 15)
+        ),
+        helper.make_tensor_value_info("reshape_shape", TensorProto.INT64, []),
+    ]
+
+    if use_reshape:
+        flatten_node = helper.make_node(
+            "Reshape", ["thres1_out", "reshape_shape"], ["flatten_out"]
+        )
+    else:
+        flatten_node = helper.make_node(
+            "Flatten", ["thres1_out"], ["flatten_out"], axis=1
+        )
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[global_in],
+            outputs=[global_out],
+            value_info=value_info,
+            nodes=[
+                helper.make_node(
+                    "Conv", ["global_in", "conv_param"], ["conv_out"], **conv_config
+                ),
+                helper.make_node(
+                    "MultiThreshold",
+                    ["conv_out", "thres1_param"],
+                    ["thres1_out"],
+                    domain="finn.custom_op.general",
+                    out_dtype="UINT4",
+                ),
+                flatten_node,
+                helper.make_node(
+                    "MatMul", ["flatten_out", "matmul_param"], ["matmul_out"]
+                ),
+                helper.make_node(
+                    "MultiThreshold",
+                    ["matmul_out", "thres2_param"],
+                    ["global_out"],
+                    domain="finn.custom_op.general",
+                    out_dtype="UINT4",
+                ),
+            ],
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("global_in", idt)
+    model.set_tensor_layout("global_in", DataLayout.NCHW)
+    model.set_tensor_datatype("global_out", odt)
+    model.set_tensor_datatype("conv_param", conv_weight_dt)
+    model.set_tensor_datatype("matmul_param", fc_weight_dt)
+    model.set_tensor_datatype("thres1_param", DataType.INT32)
+    model.set_tensor_datatype("thres2_param", DataType.INT32)
+
+    model.set_initializer(
+        "conv_param", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape)
+    )
+    model.set_initializer(
+        "thres1_param", get_multithreshold_rand_params(out_chn, 15, seed=0)
+    )
+    model.set_initializer(
+        "thres2_param", get_multithreshold_rand_params(fc_filters, 15, seed=0)
+    )
+    model.set_initializer(
+        "matmul_param", gen_finn_dt_tensor(fc_weight_dt, fc_param_shape)
+    )
+    model.set_initializer("reshape_shape", np.array([1, -1]))
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+
+    # streamlining
+    new_model = model.transform(MoveScalarLinearPastInvariants())
+    new_model = new_model.transform(Streamline())
+    new_model = new_model.transform(LowerConvsToMatMul())
+    new_model = new_model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    new_model = new_model.transform(Streamline())
+    new_model = new_model.transform(InferDataLayouts())
+    new_model = new_model.transform(RemoveUnusedTensors())
+
+    # convert_to_hls
+    if depthwise is True:
+        new_model = new_model.transform(to_hls.InferVVAU())
+    new_model = new_model.transform(to_hls.InferQuantizedStreamingFCLayer())
+    new_model = new_model.transform(to_hls.InferThresholdingLayer())
+    new_model = new_model.transform(to_hls.InferConvInpGen())
+    new_model = new_model.transform(to_hls.InferStreamingMaxPool())
+    new_model = new_model.transform(RemoveCNVtoFCFlatten())
+    new_model = new_model.transform(absorb.AbsorbConsecutiveTransposes())
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    new_model = new_model.transform(InferDataLayouts())
+
+    # prepare cppsim
+    new_model = new_model.transform(PrepareCppSim())
+    new_model = new_model.transform(CompileCppSim())
+    new_model = new_model.transform(SetExecMode("cppsim"))
+
+    # check for correct execution
+    x = gen_finn_dt_tensor(idt, input_shape)
+    inp_dict = {model.graph.input[0].name: x}
+    assert oxe.compare_execution(model, new_model, inp_dict)
+
+    num_transpose = len(new_model.get_nodes_by_op_type("Transpose"))
+    num_flatten = len(new_model.get_nodes_by_op_type("Flatten"))
+    num_reshape = len(new_model.get_nodes_by_op_type("Reshape"))
+
+    # check if transpose->flatten was removed
+    assert num_transpose == 1 and num_flatten == 0 and num_reshape == 0
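
Not part of the patch: a minimal numpy sketch (illustrative shapes and variable names only) of why the weight re-layout performed by RemoveCNVtoFCFlatten is equivalent to keeping the NHWC->NCHW Transpose in front of the Flatten. Flattening the NHWC activation directly and multiplying by the re-laid-out weights gives the same result as the original Transpose -> Flatten -> MatMul path, which is what allows the transpose and flatten nodes to be removed.

import numpy as np

# illustrative sizes only; any b, h, w, c, mh would do
b, h, w, c, mh = 1, 3, 3, 4, 16
mw = c * h * w

x_nhwc = np.random.rand(b, h, w, c).astype(np.float32)
x_nchw = x_nhwc.transpose(0, 3, 1, 2)          # what the removed Transpose produced
W = np.random.rand(mw, mh).astype(np.float32)  # weights laid out for NCHW flattening

# same re-layout as in RemoveCNVtoFCFlatten
W_new = W.reshape(c, h, w, mh).transpose(1, 2, 0, 3).reshape(mw, mh)

ref = x_nchw.reshape(b, -1) @ W      # original path: Transpose -> Flatten -> MatMul
new = x_nhwc.reshape(b, -1) @ W_new  # rewritten path: Flatten (NHWC) -> MatMul
assert np.allclose(ref, new)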