diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
index 960abd675bbb185ce2fadfab954ec2b4fd6ff94e..051fd506cab6ae2da0cbea8badff342361734562 100644
--- a/.github/workflows/quicktest-dev-pr.yml
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -11,7 +11,7 @@ jobs:
 
   test:
     name: Run quicktest on PR branch
-    runs-on: ubuntu-16.04
+    runs-on: ubuntu-18.04
 
     steps:
       - name: checkout
diff --git a/AUTHORS.rst b/AUTHORS.rst
index eb1e06e54b7eb6deedd3e7f8392bb3aa257e7dc6..533ed62e1dbda2799f74805f2100769f9c4fecfc 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -2,9 +2,14 @@
 Contributors
 ============
 
-* Yaman Umuroglu (@maltanar)
+* Yaman Umuroglu (@maltanar) (maintainer)
 * Jakoba Petri-Koenig (@auphelia)
 * Andrea Rigoni (@AndreaRigoni)
 * Hendrik Borras (@HenniOVP)
 * Lucian Petrica (@quetric)
 * Tobias Alonso (@Tobi-Alonso)
+* Felix Paul Jentzsch (@felixpj)
+* Mirza Mrahorovic (@mmrahorovic)
+* Suranga Mahesh (@surangamh)
+* Peter Lehnhardt (@pete-lennart)
+* Neil Kim Nielsen (@neilkimn)
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 6afd2859a866706c93d44e82c6f3091fcc7320e5..7d06b50997f8ff44eca743d222c430f4b032e0b0 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -17,13 +17,14 @@ recho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=2c08044c5e9011c19911e731a18ac20d775bbf46
+FINN_BASE_COMMIT=ac0b86a63eb937b869bfa453a996a8a8b8506546
 FINN_EXP_COMMIT=e9f97dcdb4db2f889b0f36af079a6a1792b7d4de
-BREVITAS_COMMIT=14abbe1e7ef82485d79415871fcf5766b0a40a00
+BREVITAS_COMMIT=d7ded80fa9557da2998ea310669edee7fb2d9526
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
 HLSLIB_COMMIT=4d74baefa79df48b5a0348d63f39a26df075de51
 PYVERILATOR_COMMIT=e2ff74030de3992dcac54bf1b6aad2915946e8cb
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
+AVNET_BDF_COMMIT=2d49cfc25766f07792c0b314489f21fe916b639b
 
 gecho "Setting up known-good commit versions for FINN dependencies"
 # finn-base
@@ -86,10 +87,11 @@ if [ ! -d "/workspace/finn/board_files" ]; then
     cd $OLD_PWD
 fi
 if [ ! -d "/workspace/finn/board_files/ultra96v2" ]; then
-    gecho "Downloading Avnet BDF files into board_files"
+    gecho "Downloading Avnet BDF files from known-good commit into board_files"
     OLD_PWD=$(pwd)
     cd /workspace/finn
     git clone https://github.com/Avnet/bdf.git
+    git -C /workspace/finn/bdf checkout $AVNET_BDF_COMMIT --quiet
     mv /workspace/finn/bdf/* /workspace/finn/board_files/
     rm -rf /workspace/finn/bdf
     cd $OLD_PWD
diff --git a/requirements.txt b/requirements.txt
index 6dd4b5724782d01fc2958cc56c04cbc8e70af31f..de007ace503de9f110027b4190d5db6188633575 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ future==0.18.2
 gspread==3.6.0
 numpy==1.18.0
 onnx==1.7.0
+onnxoptimizer
 onnxruntime==1.4.0
 pre-commit==2.6.0
 scipy==1.5.2
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index c0ac1319dd520794afd66f187b35e529739e5cd7..1ce936cd79c2257897e74430d00e5082c51c9320 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -29,11 +29,9 @@ def _suitable_node(node):
 
 
 def _suitable_folded_shapes(ishape, oshape):
-    i_dummy = np.random.rand(*ishape)
-    o_dummy = np.random.rand(*oshape)
-    ishape_canonical = np.squeeze(i_dummy).shape
-    oshape_canonical = np.squeeze(o_dummy).shape
-    return ishape_canonical == oshape_canonical
+    matching_stream_width = ishape[-1] == oshape[-1]
+    matching_size = np.prod(ishape) == np.prod(oshape)
+    return matching_stream_width and matching_size
 
 
 class InsertFIFO(Transformation):
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 3dab426ccf9bab73ddac83299bdc47f89ea46bdc..f2f172139ed9144eccdfbe37d0ae1a0695d049e4 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -265,7 +265,10 @@ class MakeZYNQProject(Transformation):
             vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit"
         )
         if not os.path.isfile(bitfile_name):
-            raise Exception("Synthesis failed, no bitfile found")
+            raise Exception(
+                "Synthesis failed, no bitfile found. Check logs under %s"
+                % vivado_pynq_proj_dir
+            )
         deploy_bitfile_name = vivado_pynq_proj_dir + "/resizer.bit"
         copy(bitfile_name, deploy_bitfile_name)
         # set bitfile attribute
@@ -275,7 +278,10 @@ class MakeZYNQProject(Transformation):
             + "/finn_zynq_link.srcs/sources_1/bd/top/hw_handoff/top.hwh"
         )
         if not os.path.isfile(hwh_name):
-            raise Exception("Synthesis failed, no hardware handoff file found")
+            raise Exception(
+                "Synthesis failed, no hardware handoff file found. Check logs under %s"
+                % vivado_pynq_proj_dir
+            )
         deploy_hwh_name = vivado_pynq_proj_dir + "/resizer.hwh"
         copy(hwh_name, deploy_hwh_name)
         model.set_metadata_prop("hw_handoff", deploy_hwh_name)
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 9c0169a98f515d0b32e10bdfc834eca5fb681ffd..bd5de66cf575cd2e4eee0bf064a611e736a4b76e 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -115,7 +115,7 @@ if {$BOARD == "ZCU104"} {
     set_property board_part xilinx.com:zcu102:part0:3.3 [current_project]
     set ZYNQ_TYPE "zynq_us+"
 } elseif {$BOARD == "Ultra96"} {
-    set_property board_part em.avnet.com:ultra96v1:part0:1.2 [current_project]
+    set_property board_part avnet.com:ultra96v1:part0:1.2 [current_project]
     set ZYNQ_TYPE "zynq_us+"
 } elseif {$BOARD == "Pynq-Z2"} {
     set ZYNQ_TYPE "zynq_7000"
diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py
index cb8deaeec4b79d3c47d7705ff8f9bf72a085dfc0..990b858ad62aec00be4be4e0dd30bef3eb9e3ce3 100644
--- a/src/finn/transformation/move_reshape.py
+++ b/src/finn/transformation/move_reshape.py
@@ -1,5 +1,7 @@
 from finn.transformation.base import Transformation
 from finn.util.basic import get_by_name, is_finn_op
+from finn.custom_op.registry import getCustomOp
+import warnings
 
 
 def _is_fpgadataflow_node(node):
@@ -18,33 +20,66 @@ def _is_fpgadataflow_node(node):
 
 
 class RemoveCNVtoFCFlatten(Transformation):
-    """Removes a node that implements a (1, -1) reshape if it is
-    between two fpgadataflow nodes"""
+    """Removes a flatten node if it is between two fpgadataflow nodes.
+    For an NHWC-Conv to FC transition, the preceding transpose is absorbed.
+    The flatten operation can also be implemented by a reshape node."""
 
     def apply(self, model):
-
         graph = model.graph
         graph_modified = False
         for n in graph.node:
-            if n.op_type == "Reshape":
-                shape = model.get_initializer(n.input[1])
-                if (shape == [1, -1]).all():
+            # also support implicit flatten via reshape, e.g. reshape(1,-1)
+            if n.op_type == "Flatten" or n.op_type == "Reshape":
+                ishape = model.get_tensor_shape(n.input[0])
+                oshape = model.get_tensor_shape(n.output[0])
+                if len(oshape) == 2 and ishape[0] == oshape[0]:
                     producer = model.find_producer(n.input[0])
                     if _is_fpgadataflow_node(producer) is True:
+                        # standalone flatten, remove
                         consumer = model.find_consumer(n.output[0])
                         if _is_fpgadataflow_node(consumer) is True:
                             graph_modified = True
                             consumer.input[0] = n.input[0]
                             graph.node.remove(n)
                     elif producer.op_type == "Transpose":
+                        # transpose + flatten, absorb into following node
                         transp_node = producer
-                        producer = model.find_producer(transp_node.input[0])
-                        if _is_fpgadataflow_node(producer) is True:
-                            consumer = model.find_consumer(n.output[0])
-                            if _is_fpgadataflow_node(consumer) is True:
-                                graph_modified = True
-                                consumer.input[0] = transp_node.input[0]
-                                graph.node.remove(n)
-                                graph.node.remove(transp_node)
+                        # check if transpose converts NHWC to NCHW
+                        perms = list(get_by_name(transp_node.attribute, "perm").ints)
+                        if perms == [0, 3, 1, 2]:
+                            producer = model.find_producer(transp_node.input[0])
+                            if _is_fpgadataflow_node(producer) is True:
+                                consumer = model.find_consumer(n.output[0])
+                                if consumer.op_type == "StreamingFCLayer_Batch":
+                                    fc_inst = getCustomOp(consumer)
+                                    mw = fc_inst.get_nodeattr("MW")
+                                    mh = fc_inst.get_nodeattr("MH")
+                                    (b, h, w, c) = model.get_tensor_shape(
+                                        transp_node.input[0]
+                                    )
+                                    # absorb transpose into weight matrix,
+                                    # allowing FC layer to operate on the NHWC input
+                                    W = model.get_initializer(consumer.input[1])
+                                    assert (
+                                        W is not None
+                                    ), "Initializer for matmul weights is not set."
+                                    W_new = W.reshape(c, h, w, mh)
+                                    W_new = W_new.transpose((1, 2, 0, 3))
+                                    W_new = W_new.reshape(mw, mh)
+                                    model.set_initializer(consumer.input[1], W_new)
+                                    # remove transpose & flatten nodes
+                                    consumer.input[0] = transp_node.input[0]
+                                    graph.node.remove(n)
+                                    graph.node.remove(transp_node)
+                                    graph_modified = True
+                                else:
+                                    warnings.warn(
+                                        "Could not absorb transpose->flatten "
+                                        "into subsequent node"
+                                    )
+                        else:
+                            warnings.warn(
+                                "Unsupported transpose node before flatten layer"
+                            )
 
         return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index fa2d7a714ad894ebb19099c7ed73e42e12ffdf44..9b842162f7f751c60b18bbd288ff96ef28d3aa88 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -309,7 +309,8 @@ class Absorb1BitMulIntoConv(Transformation):
 
 class AbsorbTransposeIntoMultiThreshold(Transformation):
     """Change (NHWCTranpose -> MultiThreshold -> NCHWTranspose) to (MultiThreshold)
-    with NHWC mode."""
+    with NHWC mode. For (NHWCTranspose -> MultiThreshold -> Flatten), move Transpose
+    past MultiThreshold to prepare for the RemoveCNVtoFCFlatten() transformation."""
 
     def apply(self, model):
         graph = model.graph
@@ -338,23 +339,34 @@ class AbsorbTransposeIntoMultiThreshold(Transformation):
                                 graph.node.remove(n)
                                 graph.node.remove(final_t_cand)
                                 graph_modified = True
-                        elif final_t_cand.op_type == "Reshape":
+                        # also support implicit flatten via reshape, e.g. reshape(1,-1)
+                        elif (
+                            final_t_cand.op_type == "Flatten"
+                            or final_t_cand.op_type == "Reshape"
+                        ):
+                            ishape = model.get_tensor_shape(final_t_cand.input[0])
                             oshape = model.get_tensor_shape(final_t_cand.output[0])
-                            if len(oshape) == 2:
+                            if len(oshape) == 2 and ishape[0] == oshape[0]:
                                 # transition to FC part, can still use NHWC
                                 mt = getCustomOp(mt_cand)
                                 mt.set_nodeattr("data_layout", "NHWC")
                                 # get rid of first tranpose node
                                 mt_cand.input[0] = n.input[0]
+                                graph.node.remove(n)
                                 # fix output shape for MultiThreshold
                                 mt_ishape = model.get_tensor_shape(mt_cand.input[0])
                                 (b, h, w, c) = mt_ishape
-                                assert (
-                                    h == 1 and w == 1
-                                ), """Untested spatial dim
-                                in conv->fc transition, proceed with caution!"""
                                 model.set_tensor_shape(mt_cand.output[0], mt_ishape)
-                                graph.node.remove(n)
+                                # re-insert Transpose behind MultiThreshold
+                                transpose_output = model.make_new_valueinfo_name()
+                                new_transpose = oh.make_node(
+                                    "Transpose",
+                                    [mt_cand.output[0]],
+                                    [transpose_output],
+                                    perm=[0, 3, 1, 2],
+                                )
+                                graph.node.insert(node_ind + 1, new_transpose)
+                                final_t_cand.input[0] = transpose_output
                                 graph_modified = True
         if graph_modified:
             model = model.transform(InferDataTypes())
diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py
index 0a36b8bbe5c05a8226ae647e0061c1551f3b1cbf..0abcf441f9636a52f9194df325874d293530f78a 100644
--- a/src/finn/transformation/streamline/remove.py
+++ b/src/finn/transformation/streamline/remove.py
@@ -50,7 +50,13 @@ def _remove_node_and_rewire(model, node):
 
 
 class RemoveIdentityOps(Transformation):
-    """Remove identity ops like Add/Sub with zero or Mul/Div with one"""
+    """Remove identity ops like Add/Sub with zero or Mul/Div with one. A tolerance
+    value (defaults to 1e-05) can be specified during init for the comparison
+    to zero/one."""
+
+    def __init__(self, atol=1e-05):
+        super().__init__()
+        self.atol = atol
 
     def apply(self, model):
         graph = model.graph
@@ -64,7 +70,10 @@ class RemoveIdentityOps(Transformation):
                 and not model.is_join_node(n)
             ):
                 A = model.get_initializer(n.input[1])
-                if A is not None and (A == np.zeros_like(A)).all():
+                if (
+                    A is not None
+                    and np.isclose(A, np.zeros_like(A), atol=self.atol).all()
+                ):
                     _remove_node_and_rewire(model, n)
 
             elif (
@@ -73,7 +82,10 @@ class RemoveIdentityOps(Transformation):
                 and not model.is_join_node(n)
             ):
                 A = model.get_initializer(n.input[1])
-                if A is not None and (A == np.ones_like(A)).all():
+                if (
+                    A is not None
+                    and np.isclose(A, np.ones_like(A), atol=self.atol).all()
+                ):
                     _remove_node_and_rewire(model, n)
         model = model.transform(InferShapes())
         return (model, graph_modified)
diff --git a/tests/brevitas/test_brevitas_QConv2d.py b/tests/brevitas/test_brevitas_QConv2d.py
index 21de8863d3f23316265b075ce529cf2249764a64..5f124690d7c1f266f074351abe690abdd3ae5a2c 100644
--- a/tests/brevitas/test_brevitas_QConv2d.py
+++ b/tests/brevitas/test_brevitas_QConv2d.py
@@ -50,8 +50,6 @@ export_onnx_path = "test_brevitas_conv.onnx"
 @pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("in_channels", [32])
 def test_brevitas_QConv2d(dw, bias, in_channels):
-    if bias:
-        pytest.xfail("bias export bug")
     ishape = (1, 32, 111, 111)
     if dw is True:
         groups = in_channels
diff --git a/tests/brevitas/test_brevitas_qlinear.py b/tests/brevitas/test_brevitas_qlinear.py
index e389bc8c1223510e5f89beed1e973e2d1c7dad35..62ed358dc9030c35e865921ca7cf9e80c34020fd 100644
--- a/tests/brevitas/test_brevitas_qlinear.py
+++ b/tests/brevitas/test_brevitas_qlinear.py
@@ -48,8 +48,6 @@ export_onnx_path = "test_brevitas_qlinear.onnx"
 @pytest.mark.parametrize("w_bits", [4])
 @pytest.mark.parametrize("i_dtype", [DataType.UINT4])
 def test_brevitas_qlinear(bias, out_features, in_features, w_bits, i_dtype):
-    if bias:
-        pytest.xfail("bias export bug")
     i_shape = (1, in_features)
     w_shape = (out_features, in_features)
     b_linear = QuantLinear(
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 6d0e028a1dc499154e6526be9384f92438a4b98a..a6e7ad642222936775293ec145e845ef111dd4d3 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -307,8 +307,8 @@ class TestEnd2End:
     def test_export(self, topology, wbits, abits):
         if wbits > abits:
             pytest.skip("No wbits > abits end2end network configs for now")
-        if topology == "lfc" and wbits > 1:
-            pytest.skip("Skipping non-existing lfc configs")
+        if topology == "lfc" and not (wbits == 1 and abits == 1):
+            pytest.skip("Skipping certain lfc configs")
         (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
         chkpt_name = get_checkpoint_name(topology, wbits, abits, "export")
         bo.export_finn_onnx(model, ishape, chkpt_name)
@@ -347,6 +347,8 @@ class TestEnd2End:
         assert os.path.isfile(chkpt_preproc_name)
         # join preprocessing and core model
         pre_model = ModelWrapper(chkpt_preproc_name)
+        pre_model = pre_model.transform(InferShapes())
+        pre_model = pre_model.transform(FoldConstants())
         model = model.transform(MergeONNXModels(pre_model))
         # add input quantization annotation: UINT8 for all BNN-PYNQ models
         global_inp_name = model.graph.input[0].name
diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py
index c23749829a9d75c9a9519663a872aa1281bd46d3..5bfe8e1ea1b48ed77c40a584d624cc0ecdedb668 100644
--- a/tests/end2end/test_end2end_mobilenet_v1.py
+++ b/tests/end2end/test_end2end_mobilenet_v1.py
@@ -74,18 +74,8 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
-    ReplaceVerilogRelPaths,
-)
-from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
-from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
 from finn.core.onnx_exec import execute_onnx
 from finn.util.basic import alveo_part_map, alveo_default_platform
-from finn.util.config import extract_model_config_to_json
-from finn.transformation.fpgadataflow.vitis_build import VitisBuild, VitisOptStrategy
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 
@@ -111,6 +101,7 @@ def test_end2end_mobilenet_export():
     # set input finn datatype to UINT8
     preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType.UINT8)
     preproc_model = preproc_model.transform(InferShapes())
+    preproc_model = preproc_model.transform(FoldConstants())
     preproc_model = preproc_model.transform(GiveUniqueNodeNames())
     preproc_model = preproc_model.transform(GiveUniqueParameterTensors())
     preproc_model = preproc_model.transform(GiveReadableTensorNames())
@@ -197,6 +188,10 @@ def test_end2end_mobilenet_streamline():
         model = model.transform(GiveReadableTensorNames())
         model = model.transform(InferDataTypes())
     model.save(build_dir + "/end2end_mobilenet_streamlined.onnx")
+    assert (
+        len(model.get_nodes_by_op_type("Add")) == 1
+    )  # only final quantized bias Add op remains
+    assert len(model.get_nodes_by_op_type("Mul")) == 0  # no Mul ops remain
 
 
 def test_end2end_mobilenet_lowering():
@@ -334,101 +329,3 @@ def test_end2end_mobilenet_cppsim():
 
     assert (golden == res_cppsim).all()
     assert np.isclose(golden_prob, res_cppsim_prob).all()
-
-
-@pytest.mark.slow
-@pytest.mark.vivado
-def test_end2end_mobilenet_gen_hls_ip():
-    model = load_test_checkpoint_or_skip(
-        build_dir + "/end2end_mobilenet_dataflow_model.onnx"
-    )
-    start = time.time()
-    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
-    model = model.transform(HLSSynthIP())
-    model = model.transform(ReplaceVerilogRelPaths())
-    end = time.time()
-    elapsed_time = end - start
-    f = open(build_dir + "/end2end_mobilenet_ipgen_time.txt", "w+")
-    f.write("Execution time in seconds: " + str(elapsed_time))
-    f.close()
-
-    model = model.transform(AnnotateResources("hls"))
-    model.save(build_dir + "/end2end_mobilenet_ipgen.onnx")
-
-
-@pytest.mark.slow
-@pytest.mark.vivado
-@pytest.mark.xfail
-def test_end2end_mobilenet_rtlsim():
-    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_ipgen.onnx")
-    x = np.load(build_dir + "/end2end_mobilenet_input.npy")
-    inp_name = model.graph.input[0].name
-    out_name = model.graph.output[0].name
-    inp_dict = {inp_name: x}
-    # node-by-node rtlsim
-    model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareRTLSim())
-    model.save(build_dir + "/end2end_mobilenet_ipgen_nodebynode_rtlsim.onnx")
-    ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
-    res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
-    np.save(
-        build_dir + "/end2end_mobilenet_result_rtlsim_nodebynode.npy",
-        res_rtlsim_nodebynode,
-    )
-    a0 = np.load(build_dir + "/end2end_mobilenet_topk_scale.npy")
-    res_rtlsim_nodebynode_prob = (
-        ret_rtlsim_nodebynode[model.graph.node[-2].output[0]] * a0
-    )
-    np.save(
-        build_dir + "/end2end_mobilenet_result_rtlsim_nodebynode_prob.npy",
-        res_rtlsim_nodebynode_prob,
-    )
-
-    # check result with golden values
-    golden = np.load(build_dir + "/end2end_mobilenet_golden_top5.npy")
-    golden_prob = np.load(build_dir + "/end2end_mobilenet_golden_top5_prob.npy")
-
-    assert (golden == res_rtlsim_nodebynode).all()
-    assert np.isclose(golden_prob, res_rtlsim_nodebynode_prob).all()
-
-
-@pytest.mark.slow
-@pytest.mark.vivado
-def test_end2end_mobilenet_set_fifo_depths():
-    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_ipgen.onnx")
-    start = time.time()
-    model = model.transform(
-        InsertAndSetFIFODepths(
-            test_fpga_part, target_clk_ns, vivado_ram_style=large_fifo_ram_style
-        )
-    )
-    end = time.time()
-    elapsed_time = end - start
-    f = open(build_dir + "/end2end_mobilenet_fifoset_time.txt", "w+")
-    f.write("Execution time in seconds: " + str(elapsed_time))
-    f.close()
-    extract_model_config_to_json(
-        model,
-        build_dir + "/end2end_mobilenet_folded_and_fifo_config.json",
-        ["PE", "SIMD", "impl_style", "ram_style", "depth"],
-    )
-    model.save(build_dir + "/end2end_mobilenet_fifodepth.onnx")
-
-
-@pytest.mark.slow
-@pytest.mark.vitis
-def test_end2end_mobilenet_build():
-    model = load_test_checkpoint_or_skip(
-        build_dir + "/end2end_mobilenet_fifodepth.onnx"
-    )
-    model = model.transform(
-        VitisBuild(
-            test_fpga_part,
-            target_clk_ns,
-            test_platform,
-            strategy=VitisOptStrategy.PERFORMANCE_BEST,
-        )
-    )
-    model.save(build_dir + "/end2end_mobilenet_build.onnx")
-    model = model.transform(AnnotateResources("synth"))
-    model.save(build_dir + "/end2end_mobilenet_final.onnx")
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
new file mode 100755
index 0000000000000000000000000000000000000000..8b7b0a4b6a93cde690a3d87eb3a2f1a0a55a85f8
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
@@ -0,0 +1,250 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from onnx import TensorProto, helper
+import numpy as np
+import pytest
+
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import gen_finn_dt_tensor
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.custom_op.general.im2col import compute_conv_output_dim
+
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.general import RemoveUnusedTensors
+from finn.transformation.streamline import Streamline
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants
+
+import finn.core.data_layout as DataLayout
+
+
+def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
+    """Generate random, integer-valued float32 thresholds for MultiThreshold."""
+    if seed is not None:
+        np.random.seed(seed)
+    scale = np.random.rand(channels, 1) * 30
+    offset = np.random.rand(channels, 1) * -10
+    ramp = [np.arange(num_of_thres) for _ in range(channels)]
+    scaled = ((ramp + offset) * scale).astype(np.float32)
+    return np.round(scaled)
+
+
+# conv_config: input_shape, kernel_shape, stride, pad
+@pytest.mark.parametrize(
+    "conv_config",
+    [
+        ((6, 6), (3, 3), (1, 1), (1, 1)),
+        # TODO: enable 1d conv test cases
+        # ((12, 1), (3, 1), (1, 1), (1, 0)),
+        # ((1, 15), (1, 5), (1, 1), (0, 2)),
+    ],
+)
+@pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("use_reshape", [False, True])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
+    np.random.seed(0)
+    idt = DataType.UINT4
+    odt = DataType.UINT4
+    conv_weight_dt = DataType.INT4
+    fc_weight_dt = DataType.INT4
+
+    input_shape, kernel_shape, stride, pad = conv_config
+    kernel_size_h, kernel_size_w = kernel_shape
+    input_size_h, input_size_w = input_shape
+    stride_h, stride_w = stride
+    pad_h, pad_w = pad
+
+    in_chn = 4
+    fc_filters = 16
+
+    if depthwise is True:
+        group = out_chn = in_chn
+        conv_param_shape = [out_chn, 1, kernel_size_h, kernel_size_w]
+    else:
+        group = 1
+        out_chn = 8
+        conv_param_shape = [out_chn, in_chn, kernel_size_h, kernel_size_w]
+
+    output_size_h = compute_conv_output_dim(
+        input_size_h, kernel_size_h, stride_h, 2 * pad_h
+    )
+    output_size_w = compute_conv_output_dim(
+        input_size_w, kernel_size_w, stride_w, 2 * pad_w
+    )
+
+    input_shape = [1, in_chn, input_size_h, input_size_w]
+    fc_param_shape = [out_chn * output_size_h * output_size_w, fc_filters]
+    output_shape = [1, fc_filters]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = group
+    conv_config["kernel_shape"] = [kernel_size_h, kernel_size_w]
+    conv_config["pads"] = [pad_h, pad_w, pad_h, pad_w]
+    conv_config["strides"] = [stride_h, stride_w]
+
+    global_in = helper.make_tensor_value_info(
+        "global_in", TensorProto.FLOAT, input_shape
+    )
+    global_out = helper.make_tensor_value_info(
+        "global_out", TensorProto.FLOAT, output_shape
+    )
+    value_info = [
+        helper.make_tensor_value_info(
+            "conv_param", TensorProto.FLOAT, conv_param_shape
+        ),
+        helper.make_tensor_value_info("thres1_param", TensorProto.FLOAT, (out_chn, 15)),
+        helper.make_tensor_value_info(
+            "matmul_param", TensorProto.FLOAT, fc_param_shape
+        ),
+        helper.make_tensor_value_info(
+            "thres2_param", TensorProto.FLOAT, (fc_filters, 15)
+        ),
+        helper.make_tensor_value_info("reshape_shape", TensorProto.INT64, []),
+    ]
+
+    if use_reshape:
+        flatten_node = helper.make_node(
+            "Reshape", ["thres1_out", "reshape_shape"], ["flatten_out"]
+        )
+    else:
+        flatten_node = helper.make_node(
+            "Flatten", ["thres1_out"], ["flatten_out"], axis=1
+        )
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[global_in],
+            outputs=[global_out],
+            value_info=value_info,
+            nodes=[
+                helper.make_node(
+                    "Conv", ["global_in", "conv_param"], ["conv_out"], **conv_config
+                ),
+                helper.make_node(
+                    "MultiThreshold",
+                    ["conv_out", "thres1_param"],
+                    ["thres1_out"],
+                    domain="finn.custom_op.general",
+                    out_dtype="UINT4",
+                ),
+                flatten_node,
+                helper.make_node(
+                    "MatMul", ["flatten_out", "matmul_param"], ["matmul_out"]
+                ),
+                helper.make_node(
+                    "MultiThreshold",
+                    ["matmul_out", "thres2_param"],
+                    ["global_out"],
+                    domain="finn.custom_op.general",
+                    out_dtype="UINT4",
+                ),
+            ],
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("global_in", idt)
+    model.set_tensor_layout("global_in", DataLayout.NCHW)
+    model.set_tensor_datatype("global_out", odt)
+    model.set_tensor_datatype("conv_param", conv_weight_dt)
+    model.set_tensor_datatype("matmul_param", fc_weight_dt)
+    model.set_tensor_datatype("thres1_param", DataType.INT32)
+    model.set_tensor_datatype("thres2_param", DataType.INT32)
+
+    model.set_initializer(
+        "conv_param", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape)
+    )
+    model.set_initializer(
+        "thres1_param", get_multithreshold_rand_params(out_chn, 15, seed=0)
+    )
+    model.set_initializer(
+        "thres2_param", get_multithreshold_rand_params(fc_filters, 15, seed=0)
+    )
+    model.set_initializer(
+        "matmul_param", gen_finn_dt_tensor(fc_weight_dt, fc_param_shape)
+    )
+    model.set_initializer("reshape_shape", np.array([1, -1]))
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+
+    # streamlining
+    new_model = model.transform(MoveScalarLinearPastInvariants())
+    new_model = new_model.transform(Streamline())
+    new_model = new_model.transform(LowerConvsToMatMul())
+    new_model = new_model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    new_model = new_model.transform(Streamline())
+    new_model = new_model.transform(InferDataLayouts())
+    new_model = new_model.transform(RemoveUnusedTensors())
+
+    # convert_to_hls
+    if depthwise is True:
+        new_model = new_model.transform(to_hls.InferVVAU())
+    new_model = new_model.transform(to_hls.InferQuantizedStreamingFCLayer())
+    new_model = new_model.transform(to_hls.InferThresholdingLayer())
+    new_model = new_model.transform(to_hls.InferConvInpGen())
+    new_model = new_model.transform(to_hls.InferStreamingMaxPool())
+    new_model = new_model.transform(RemoveCNVtoFCFlatten())
+    new_model = new_model.transform(absorb.AbsorbConsecutiveTransposes())
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    new_model = new_model.transform(InferDataLayouts())
+
+    # prepare cppsim
+    new_model = new_model.transform(PrepareCppSim())
+    new_model = new_model.transform(CompileCppSim())
+    new_model = new_model.transform(SetExecMode("cppsim"))
+
+    # check for correct execution
+    x = gen_finn_dt_tensor(idt, input_shape)
+    inp_dict = {model.graph.input[0].name: x}
+    assert oxe.compare_execution(model, new_model, inp_dict)
+
+    num_transpose = len(new_model.get_nodes_by_op_type("Transpose"))
+    num_flatten = len(new_model.get_nodes_by_op_type("Flatten"))
+    num_reshape = len(new_model.get_nodes_by_op_type("Reshape"))
+
+    # check that flatten/reshape is gone and exactly one transpose remains
+    assert num_transpose == 1 and num_flatten == 0 and num_reshape == 0
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
index 20751a5877a879eeabf1ed6b67a7573208cf9367..15bf160799826b0d50a0f043a56dd1fc2accdd12 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
@@ -115,8 +115,8 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation):
         thr_nodes = model.get_nodes_by_op_type("Thresholding_Batch")
         assert len(thr_nodes) == 8
     non_finn_nodes = model.get_non_finn_nodes()
-    assert len(non_finn_nodes) == 4
-    exp_non_finn_nodes = ["Transpose", "Reshape", "Mul", "Add"]
+    assert len(non_finn_nodes) == 5
+    exp_non_finn_nodes = ["Transpose", "Transpose", "Reshape", "Mul", "Add"]
     assert [x.op_type for x in non_finn_nodes] == exp_non_finn_nodes
     fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
     assert len(fc_nodes) == 9
diff --git a/tests/transformation/streamline/test_remove_identity_ops.py b/tests/transformation/streamline/test_remove_identity_ops.py
index 98430fad0e0f4c17d77ddbf44afeeccd44372047..d02e1d39755bf4783cd5dbdc2b88ca0931e02874 100644
--- a/tests/transformation/streamline/test_remove_identity_ops.py
+++ b/tests/transformation/streamline/test_remove_identity_ops.py
@@ -11,11 +11,17 @@ from finn.transformation.streamline.remove import RemoveIdentityOps
 from finn.util.basic import gen_finn_dt_tensor
 
 
-def insert_identity_op(model, op, as_first_node):
+def insert_identity_op(model, op, as_first_node, approx):
+    if approx:
+        zero_val = 0.000001
+        one_val = 0.999999
+    else:
+        zero_val = 0.0
+        one_val = 1.0
     if op in ["Add", "Sub"]:
-        val = np.asarray([0.0], dtype=np.float32)
+        val = np.asarray([zero_val], dtype=np.float32)
     elif op in ["Mul", "Div"]:
-        val = np.asarray([1.0], dtype=np.float32)
+        val = np.asarray([one_val], dtype=np.float32)
     else:
         return
 
@@ -35,8 +41,9 @@ def insert_identity_op(model, op, as_first_node):
 
 # identity operations to be inserted
 @pytest.mark.parametrize("op", ["Add", "Sub", "Mul", "Div"])
+@pytest.mark.parametrize("approx", [False, True])
 @pytest.mark.parametrize("as_first_node", [False, True])
-def test_remove_identity_ops(op, as_first_node):
+def test_remove_identity_ops(op, as_first_node, approx):
 
     # set up onnx model
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4, 1, 1])
@@ -70,7 +77,7 @@ def test_remove_identity_ops(op, as_first_node):
     model.set_initializer("shape", shape_values)
     model.set_initializer("div", div_values)
     model.set_initializer("matmul", matmul_values)
-    insert_identity_op(model, op, as_first_node)
+    insert_identity_op(model, op, as_first_node, approx)
     model = model.transform(InferShapes())
     model = model.transform(InferDataTypes())
     idict = {"inp": inp_values}
@@ -84,4 +91,4 @@ def test_remove_identity_ops(op, as_first_node):
 
     odict = oxe.execute_onnx(model, idict)
     out_after = odict["outp"]
-    assert (out_before == out_after).all()
+    assert np.isclose(out_before, out_after, atol=1e-3).all()