diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
index cd59a629405c748187cdf478c0bdb0694c58c79f..924fbd24a174df49af4b3e259ad57d0a7907d42b 100644
--- a/.github/workflows/quicktest-dev-pr.yml
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -18,4 +18,6 @@ jobs:
         uses: actions/checkout@v2
 
       - name: DockerRunQuicktest
+        env:
+          NUM_DEFAULT_WORKERS: 4
         run: sh run-docker.sh quicktest
diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 5772b16abc8b927def1e2dfbbb8193a2f964f87d..d06ff8521555ccd6d09383cab039850f1565fc61 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -61,6 +61,8 @@ RUN git clone --branch $FINN_CI_BRANCH https://github.com/Xilinx/finn /workspace
 RUN pip install -r /workspace/finn/requirements.txt
 RUN apt update; apt install nano
 RUN pip install pytest-dependency
+RUN pip install pytest-xdist
+RUN pip install pytest-parallel
 
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index 0e12b504a26ccdb8fd78e162f04cfdeab5a186f1..f8919d7498e0e8ef08a52d1da0782988b56d6df4 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -55,6 +55,8 @@ RUN pip install matplotlib
 RUN pip install pytest-dependency
 RUN pip install sphinx
 RUN pip install sphinx_rtd_theme
+RUN pip install pytest-xdist
+RUN pip install pytest-parallel
 
 # switch user
 RUN groupadd -g $GID $GNAME
diff --git a/docker/Jenkinsfile b/docker/Jenkinsfile
index 80be261fb3da057186259598f84d915176577a5d..2215bc79cc7b2c20036d882fdc654fbe8721cab6 100644
--- a/docker/Jenkinsfile
+++ b/docker/Jenkinsfile
@@ -9,7 +9,12 @@ pipeline {
         string(name: 'PYNQ_PASSWORD', defaultValue: 'xilinx', description: 'PYNQ board password')
         string(name: 'PYNQ_TARGET_DIR', defaultValue: '/home/xilinx/finn', description: 'PYNQ board target deployment directory')
         string(name: 'NUM_DEFAULT_WORKERS', defaultValue: '1', description: 'Number of cores for parallel transformations')
-        string(name: 'DOCKER_CMD', defaultValue: """python setup.py test""", description: 'Command to run')
+        // main test: everything except rtlsim and end2end tests, parallel run with xdist, no parallel transformations to save on memory
+        string(name: 'DOCKER_CMD_MAIN', defaultValue: """python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n auto" """, description: 'Main test command')
+        // rtlsim tests: parallel run with pytest-parallel, no parallel transformations to save on memory
+        string(name: 'DOCKER_CMD_RTLSIM', defaultValue: """python setup.py test --addopts "-k rtlsim --workers auto" """, description: 'rtlsim test command')
+        // end2end tests: no parallel testing, use NUM_DEFAULT_WORKERS for parallel transformations
+        string(name: 'DOCKER_CMD_END2END', defaultValue: """python setup.py test --addopts "-k end2end" """, description: 'end2end test command')
     }
     environment {
         DOCKER_TAG='finn_ci:$BUILD_ID'
@@ -32,10 +37,49 @@ pipeline {
                 """
             }
         }
-        stage('Test') {
+        stage('test-main') {
             steps {
+                catchError {
                 sh """
-                docker run --name $DOCKER_INST_NAME --init \
+                docker run --init \
+                --hostname $DOCKER_INST_NAME \
+                -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
+                -e NUM_DEFAULT_WORKERS=1 \
+                -e FINN_INST_NAME=$DOCKER_INST_NAME \
+                -e VIVADO_PATH=${params.VIVADO_PATH} \
+                -e PYNQ_BOARD=${params.PYNQ_BOARD} \
+                -e PYNQ_IP=${params.PYNQ_IP} \
+                -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
+                -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \
+                -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \
+                $DOCKER_TAG ${params.DOCKER_CMD_MAIN}
+                """}
+            }
+        }
+        stage('test-rtlsim') {
+            steps {
+                catchError {
+                sh """
+                docker run --init \
+                --hostname $DOCKER_INST_NAME \
+                -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
+                -e NUM_DEFAULT_WORKERS=1 \
+                -e FINN_INST_NAME=$DOCKER_INST_NAME \
+                -e VIVADO_PATH=${params.VIVADO_PATH} \
+                -e PYNQ_BOARD=${params.PYNQ_BOARD} \
+                -e PYNQ_IP=${params.PYNQ_IP} \
+                -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
+                -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \
+                -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \
+                $DOCKER_TAG ${params.DOCKER_CMD_RTLSIM}
+                """}
+            }
+        }
+        stage('test-end2end') {
+            steps {
+                catchError {
+                sh """
+                docker run --init \
                 --hostname $DOCKER_INST_NAME \
                 -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
                 -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \
@@ -46,8 +90,8 @@ pipeline {
                 -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
                 -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \
                 -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \
-                $DOCKER_TAG ${params.DOCKER_CMD}
-                """
+                $DOCKER_TAG ${params.DOCKER_CMD_END2END}
+                """ }
             }
         }
     }
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 132d5bdaa286ba3e50bbd06971e9139f5859ef11..b312737c317517ca0ab19c74cf22284b5977b661 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -15,7 +15,7 @@ gecho () {
 # the repos themselves are cloned in the Dockerfile
 BREVITAS_COMMIT=f9a27226d4acf1661dd38bc449f71f89e0983cce
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=8aed899c278c36c977a249558d71795086cf852c
+HLSLIB_COMMIT=8f9f2018762f654f196b666838aeaf6fc730ad9a
 PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
diff --git a/docker/quicktest.sh b/docker/quicktest.sh
index 4f6a2d3e230de9fcbb947d794722294880a7730d..49b7886836ac4e45dad856dfcd49223276bd831a 100755
--- a/docker/quicktest.sh
+++ b/docker/quicktest.sh
@@ -1,4 +1,22 @@
 #!/bin/bash
 
+: ${PYTEST_PARALLEL=auto}
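+# PYTEST_PARALLEL (default: auto) controls the number of parallel pytest workers used below;
+# override it in the environment if needed, e.g. PYTEST_PARALLEL=4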
+
 cd $FINN_ROOT
-python setup.py test --addopts "-m 'not (vivado or slow)'"
+
+# check if command line argument is empty or not present
+if [ -z "$1" ]; then
+  echo "Running quicktest: not (vivado or slow) with pytest-xdist"
+  python setup.py test --addopts "-m 'not (vivado or slow)' --dist=loadfile -n $PYTEST_PARALLEL"
+elif [ "$1" = "main" ]; then
+  echo "Running main test suite: not (rtlsim or end2end) with pytest-xdist"
+  python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL"
+elif [ "$1" = "rtlsim" ]; then
+  echo "Running rtlsim test suite with pytest-parallel"
+  python setup.py test --addopts "-k rtlsim --workers $PYTEST_PARALLEL"
+elif [ "$1" = "end2end" ]; then
+  echo "Running end2end test suite with no parallelism"
+  python setup.py test --addopts "-k end2end"
+else
+  echo "Unrecognized argument to quicktest.sh"
+fi
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index f4fa7a13dcbe4fe8ab9667a111df00c605747710..8b20cebcfc49d14d0afbb26edd678d65425476d3 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -73,8 +73,12 @@ from the FINN root directory* as follows:
 
   python setup.py test --addopts "-k test_end2end_tfc_w1a2"
 
-Please see the pytest documentation for more about picking tests by marks or
-by name.
+Finally, if you want to run tests in parallel (e.g. to take advantage of a multi-core CPU),
+you can use:
+ * pytest-parallel for any rtlsim tests, e.g. ``python setup.py test --addopts "-k rtlsim --workers auto"``
+ * pytest-xdist for everything else; make sure to add ``--dist=loadfile`` if tests in the same file depend on each other, e.g. ``python setup.py test --addopts "-k mytest -n auto --dist=loadfile"``
+
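+For example, to run everything except the Vivado-dependent and slow tests in parallel with
+pytest-xdist (this mirrors the marker expression used by the quicktest script and can be
+adjusted as needed)::
+
+  python setup.py test --addopts "-m 'not (vivado or slow)' --dist=loadfile -n auto"
+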
+Please see the pytest documentation for more about picking tests by marks or by name.
 
 Environment variables
 **********************
diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/tfc_end2end_example.ipynb
index d573061487de204084e0d3242da8ad1b791f44d8..c84efc964b1f57b7ed385521fc5214fdc2396590 100644
--- a/notebooks/end2end_example/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/tfc_end2end_example.ipynb
@@ -132,7 +132,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7f8890385828>"
+       "<IPython.lib.display.IFrame at 0x7f7cc4290940>"
       ]
      },
      "execution_count": 3,
@@ -293,7 +293,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fe1ad0639e8>"
+       "<IPython.lib.display.IFrame at 0x7f7c6c567f28>"
       ]
      },
      "execution_count": 6,
@@ -333,9 +333,10 @@
       "            ConvertDivToMul(),\n",
       "            BatchNormToAffine(),\n",
       "            ConvertSignToThres(),\n",
+      "            AbsorbSignBiasIntoMultiThreshold(),\n",
       "            MoveAddPastMul(),\n",
       "            MoveScalarAddPastMatMul(),\n",
-      "            MoveScalarAddPastConv(),\n",
+      "            MoveAddPastConv(),\n",
       "            MoveScalarMulPastMatMul(),\n",
       "            MoveScalarMulPastConv(),\n",
       "            MoveAddPastMul(),\n",
@@ -350,6 +351,7 @@
       "        ]\n",
       "        for trn in streamline_transformations:\n",
       "            model = model.transform(trn)\n",
+      "            model = model.transform(RemoveIdentityOps())\n",
       "            model = model.transform(GiveUniqueNodeNames())\n",
       "            model = model.transform(GiveReadableTensorNames())\n",
       "            model = model.transform(InferDataTypes())\n",
@@ -400,7 +402,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fe1346e4ef0>"
+       "<IPython.lib.display.IFrame at 0x7f7c6c0bf898>"
       ]
      },
      "execution_count": 8,
@@ -454,7 +456,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fe1346f7780>"
+       "<IPython.lib.display.IFrame at 0x7f7c6c0e5c18>"
       ]
      },
      "execution_count": 9,
diff --git a/src/finn/custom_op/__init__.py b/src/finn/custom_op/__init__.py
index ab6e03bee65b8bf5c4041dd8021b1a561e7673d2..4ae7b9ebffaab6ca6be04b8d73f647b2db22dc78 100644
--- a/src/finn/custom_op/__init__.py
+++ b/src/finn/custom_op/__init__.py
@@ -56,8 +56,15 @@ class CustomOp(ABC):
                     ret = ret.decode("utf-8")
                 return ret
             else:
-                # not set, return default value
-                return def_val
+                if req:
+                    raise Exception(
+                        """Required attribute %s unspecified in
+                    a %s node"""
+                        % (name, self.onnx_node.op_type)
+                    )
+                else:
+                    # not set, return default value
+                    return def_val
         except KeyError:
             raise AttributeError("Op has no such attribute: " + name)
 
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 14ba1b813c8d61145f7d221deee9c184aeb9bddc..71c731f96ca45519c443a5f932ead050770e17de 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -88,6 +88,8 @@ class HLSCustomOp(CustomOp):
             "res_hls": ("s", False, ""),
             "res_synth": ("s", False, ""),
             "rtlsim_so": ("s", False, ""),
+            # partitioning info
+            "partition_id": ("i", False, 0),
             # input and output FIFO depths
             "inFIFODepth": ("i", False, 2),
             "outFIFODepth": ("i", False, 2),
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad68a4bde29123b2498ac7789048bcd2e13bf3bc
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -0,0 +1,576 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from math import ceil
+import os
+
+import numpy as np
+
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    rtlsim_output_to_npy,
+)
+from . import templates
+
+# ONNX i/o tensor shape assumptions for channelwise ops:
+# input 0 is the input tensor, shape (..., NumChannels)
+# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel)
+# output 0 is the output tensor, shape (..., NumChannels) - same as input
+# the ... here can be any shape (representing groups of vectors)
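+# e.g. with NumChannels=64, a (1, 4, 4, 64) input yields a (1, 4, 4, 64) output,
+# with the channelwise op applied along the last (channel) axis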
+
+
+class ChannelwiseOp_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hls Thresholding_Batch function.
+    It can implement a variety of channel-wise parametrized operations,
+    including Add, Mul and multi-thresholding.
+    """
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+        self.decoupled_wrapper = templates.decoupled_wrapper
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # channelwise "map" function to apply:
+            # one of cmp_le, cmp_ge, add, mul
+            "Func": ("s", False, "cmp_le"),
+            "PE": ("i", True, 0),
+            "NumChannels": ("i", True, 0),
+            # string defining memory resource type for parameters
+            "ram_style": ("s", False, "distributed"),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "paramDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # input and output FIFO depths
+            "inFIFODepth": ("i", False, 0),
+            "outFIFODepth": ("i", False, 0),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def calc_tmem(self):
+        """Calculates and returns TMEM, the depth of the memory used
+        to store the channelwise op parameters."""
+        chn = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        return chn // pe
+
+    def make_shape_compatible_op(self, model):
+        oshape = self.get_normal_output_shape()
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # check input datatype against property
+        idt_name = self.get_input_datatype().name
+        exp_idt_name = self.get_nodeattr("inputDataType")
+        assert exp_idt_name == idt_name, "Bad input DataType for ChannelwiseOp layer"
+        # TODO: dynamically infer/update odt based on idt as done in ConvertToHLSLayers?
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        # TODO collect automatically from get_nodeattr_types
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("NumChannels")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType")
+            self.get_nodeattr("paramDataType")
+            self.get_nodeattr("outputDataType")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required Threshold_Batch attributes do not exist."""
+            )
+
+        return info_messages
+
+    def bram_estimation(self):
+        """Calculates BRAM cost if resource set to BRAM"""
+        style = self.get_nodeattr("ram_style")
+        P = self.get_nodeattr("PE")
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        tmem = self.calc_tmem()
+
+        if style == "block" and tmem > 1:
+            return int(ceil(A * P / 16)) * int(ceil(tmem / 1024))
+        else:
+            return 0
+
+    def lut_estimation(self):
+        """Calculates LUT cost, taking memory resource type into account """
+        # TODO add in/out FIFO contributions
+        style = self.get_nodeattr("ram_style")
+        P = self.get_nodeattr("PE")
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        tmem = self.calc_tmem()
+        # cost of comparators
+        comparator_cost = A * P
+        # cost of LUTRAM
+        if style == "distributed" and tmem > 1:
+            lutram_cost = P * A * int(ceil(tmem / 64))
+        else:
+            lutram_cost = 0
+        # total cost
+        return comparator_cost + lutram_cost
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self):
+        i_bits = self.get_input_datatype().bitwidth()
+        return i_bits * self.get_nodeattr("PE")
+
+    def get_outstream_width(self):
+        o_bits = self.get_output_datatype().bitwidth()
+        return o_bits * self.get_nodeattr("PE")
+
+    def get_folded_input_shape(self):
+        ich = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        fold = ich // pe
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        folded_input_shape = tuple(vecs + [fold, pe])
+        return folded_input_shape
+
+    def get_folded_output_shape(self):
+        # same shape as input
+        return self.get_folded_input_shape()
+
+    def get_normal_input_shape(self):
+        ich = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        normal_input_shape = tuple(vecs + [ich])
+        return normal_input_shape
+
+    def get_normal_output_shape(self):
+        # same shape as input
+        return self.get_normal_input_shape()
+
+    def get_number_output_values(self):
+        nf = np.prod(self.get_folded_output_shape()[:-1])
+        return nf
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        # fill in TSrcI
+        ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
+
+    def get_hls_compatible_parameter_tensor(self, orig_param_vector):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure chn % PE == 0
+        * interleave rows between PEs
+        * reshape into (PE, TMEM) and return
+        """
+        chn = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        tmem = chn // pe
+        assert chn % pe == 0, "Requirement NumChannels divisible by PE is violated."
+        assert (
+            orig_param_vector.ndim == 1
+        ), """Parameter vector dimension is {}.
+        Expected dimension: 1.""".format(
+            orig_param_vector.ndim
+        )
+
+        # if not self.get_input_datatype().signed():
+        #     # ensure all thresholds are nonnegative
+        #     assert (orig_param_vector >= 0).all()
+
+        # ensure all thresholds are integer
+        assert (orig_param_vector.astype(np.int32) == orig_param_vector).all()
+        ret = orig_param_vector
+
+        assert (
+            ret.shape[0] == chn
+        ), "Cardinality of parameter vector is not as expected (chn)"
+
+        # distribute rows between PEs
+        ret = ret.reshape(tmem, pe).transpose()
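+        # e.g. chn=4, pe=2 (tmem=2): [p0, p1, p2, p3] -> [[p0, p2], [p1, p3]]
+        # (PE index is the first dim, TMEM index the second)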
+        assert (
+            ret.shape[0] == pe
+        ), """First dimension after distribution of the
+        rows between PEs is not as expected (pe)"""
+        assert (
+            ret.shape[1] == tmem
+        ), """Second dimension after distribution of the
+        rows between PEs is not as expected (tmem)"""
+
+        return ret.reshape(1, pe, tmem)
+
+    def generate_params(self, model, path):
+        code_gen_dir = path
+        # save thresholds in params.h
+        parameters = model.get_initializer(self.onnx_node.input[1])
+        parameter_tensor = self.get_hls_compatible_parameter_tensor(parameters)
+        pdt = DataType[self.get_nodeattr("paramDataType")]
+
+        parameters_hls_code = numpy_to_hls_code(
+            parameter_tensor, pdt, "parameters", False, True
+        )
+        # get input data type
+        export_idt = self.get_input_datatype()
+        if self.get_input_datatype() == DataType.BIPOLAR:
+            export_idt = DataType.BINARY
+        idt_hls = export_idt.get_hls_datatype_str()
+
+        # write parameters into params.h
+        f_params = open("{}/params.h".format(code_gen_dir), "w")
+        pdt_hls = pdt.get_hls_datatype_str()
+        # use binary to export bipolar activations
+        export_odt = self.get_output_datatype()
+        if self.get_output_datatype() == DataType.BIPOLAR:
+            export_odt = DataType.BINARY
+        odt_hls = export_odt.get_hls_datatype_str()
+        # get desired function
+        func = self.get_nodeattr("Func")
+        if func == "cmp_le":
+            func_str = "std::less_equal"
+        elif func == "cmp_ge":
+            func_str = "std::greater_equal"
+        elif func == "add":
+            func_str = "std::plus"
+        elif func == "mul":
+            func_str = "std::multiplies"
+        else:
+            raise Exception(
+                """Invalid value for attribute Func! Is currently set to: {}
+            has to be set to one of the following values
+            ("cmp_le", "cmp_ge", "add", "mul")""".format(
+                    func
+                )
+            )
+        f_params.write(
+            "static ChannelWiseOperation<{},{},{},{},{},{}> threshs \
+            = ".format(
+                self.calc_tmem(),
+                self.get_nodeattr("PE"),
+                idt_hls,
+                pdt_hls,
+                odt_hls,
+                "%s<%s>" % (func_str, odt_hls),
+            )
+        )
+        f_params.write(parameters_hls_code)
+        f_params.close()
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create an npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input;
+            # the second input holds the channelwise parameter tensor, which is
+            # emitted into params.h by generate_params and not saved here
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for ChannelwiseOp_Batch")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType.BIPOLAR:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_folded_output_shape()
+            ), """Output shape is not as expected"""
+            # reshape output to have expected shape
+            oshape = self.get_normal_output_shape()
+            context[node.output[0]] = context[node.output[0]].reshape(*oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
+
+    # TODO check and add whatever missing
+    def defines(self, var):
+        numInputVectors = list(self.get_nodeattr("numInputVectors"))
+        numReps = numInputVectors[0]
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define NumChannels1 {}\n#define PE1 {}\n#define numReps {}""".format(
+                self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps,
+            )
+        ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        tmpl_args = self.get_template_param_values()
+        # TODO: why put some template parameters into defines and not others?
+        # should ImgDim be defined or just filled in here like we do now?
+        ishape = self.get_folded_input_shape()
+        if len(ishape) == 3:
+            imgdim = 1
+        elif len(ishape) == 5:
+            imgdim = ishape[1]
+        else:
+            raise Exception("""Unexpeted input shape""")
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}>
+            (in0, out, threshs, numReps);""".format(
+                imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"],
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0,
+                hls::stream<ap_uint<{}>> &out
+                )""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.get_outstream_width(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+        # the channelwise parameter tensor is acc_type [PE][TMEM][N_PARAMS_PER_CHANNEL]
+        # partition for parallel access along PE and N_PARAMS_PER_CHANNEL
+        # dimensions (dims 1 and 3)
+        self.code_gen_dict["$PRAGMAS$"].append(
+            (
+                "#pragma HLS ARRAY_PARTITION variable=threshs.parameters "
+                "complete dim=1"
+            )
+        )
+        # self.code_gen_dict["$PRAGMAS$"].append(
+        #     (
+        #         "#pragma HLS ARRAY_PARTITION variable=threshs.parameters "
+        #         "complete dim=3"
+        #     )
+        # )
+
+        # set resource type
+        ram_style = self.get_nodeattr("ram_style")
+        pe = self.get_nodeattr("PE")
+        ich = self.get_nodeattr("NumChannels")
+        # if PE is less than NumChannels, assign cores according to ram_style;
+        # otherwise (PE == NumChannels) Vivado HLS will unroll to FFs
+        if pe < ich:
+            if ram_style == "distributed":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.parameters "
+                        "core=ROM_2P_LUTRAM"
+                    )
+                )
+            elif ram_style == "block":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.parameters "
+                        "core=ROM_2P_BRAM"
+                    )
+                )
+            else:
+                raise Exception(
+                    """Invalid value for attribute ram_style! Is currently set to: {}
+                has to be set to one of ("block", "distributed")""".format(
+                        ram_style
+                    )
+                )
diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ce4379a2c41baa5bc009e9df7623d133ee89a09
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/downsampler.py
@@ -0,0 +1,297 @@
+import os
+import numpy as np
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class DownSampler(HLSCustomOp):
+    """Corresponds to finn-hlslib ConvolutionInputGenerator_kernel1 function.
+    Basically performs a down sampling of the image removing rows and columns."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # spatial size of input images
+            "ImgDim": ("i", True, 0),
+            # number of channels in input image
+            "NumChannels": ("i", True, 0),
+            # Number of input columns computed in parallel
+            "SIMD": ("i", False, 1),
+            "Stride": ("i", True, 2),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # Batch size
+            "numInputVectors": ("i", False, 1),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_downsampled_odim(self):
+        "Return the down sampled spatial size of the output."
+        idim = self.get_nodeattr("ImgDim")
+        stride = self.get_nodeattr("Stride")
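+        # e.g. ImgDim=28, Stride=2 -> floor((28 - 1) / 2) + 1 = 14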
+        return int(np.floor((idim - 1) / stride) + 1)
+
+    def get_normal_input_shape(self):
+        idim = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        batch = self.get_nodeattr("numInputVectors")
+        ishape = (batch, idim, idim, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self):
+        odim = self.get_downsampled_odim()
+        num_ch = self.get_nodeattr("NumChannels")
+        batch = self.get_nodeattr("numInputVectors")
+        oshape = (batch, odim, odim, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
+
+    def get_folded_output_shape(self):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for DownSampler."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        exp_idtype = self.get_input_datatype()
+        assert dtype == exp_idtype, "Unexpected datatype for DownSampler"
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        return ret
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
+
+    def get_outstream_width(self):
+        obits = self.get_output_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+        ifm_ch = self.get_nodeattr("NumChannels")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
+
+        ibits = self.get_input_datatype().bitwidth()
+        self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
+
+        idim = self.get_nodeattr("ImgDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
+
+        simd = self.get_nodeattr("SIMD")
+        self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)]
+
+        stride = self.get_nodeattr("Stride")
+        self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)]
+
+        batch_size = self.get_nodeattr("numInputVectors")
+        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """ConvolutionInputGenerator_kernel1<IFMChannels, Input_precision,
+            IFMDim, SIMD,Stride> (in0, out, numReps);"""
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_hls_type, packed_hls_type)
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+            (1, OutputDim, OutputDim, NumChannels)."""
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index 9e6c63dc510aab5f6baff9cb6326a2d0476f67a9..83152dea6cc494b8464c78605399b21b38d48b80 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -75,16 +75,19 @@ class GlobalAccPool_Batch(HLSCustomOp):
     def get_normal_output_shape(self):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
-        oshape = tuple([vecs[0]] + [ch])
+        if len(vecs) == 1:
+            oshape = tuple(vecs + [ch])
+        elif len(vecs) == 3:
+            oshape = tuple([vecs[0]] + [1, 1, ch])
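+            # e.g. numInputVectors=[1, 4, 4] gives oshape (1, 1, 1, NumChannels)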
         return oshape
 
     def get_folded_output_shape(self):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
-        vecs = list(self.get_nodeattr("numInputVectors"))
+        unfolded_shape = list(self.get_normal_output_shape())
         assert ch % pe == 0, "PE must divide NumChannels"
         folds = int(ch / pe)
-        oshape = tuple([vecs[0]] + [folds, pe])
+        oshape = tuple(unfolded_shape[:-1] + [folds, pe])
         return oshape
 
     def make_shape_compatible_op(self, model):
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b718ecbbc490610790b68871080de23a54f4891
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -0,0 +1,346 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import math
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+
+
+# the IODMA interfaces between a memory-mapped AXI interface and an AXI stream
+# direction "in": pulls data from AXI-MM to AXI stream
+# direction "out": pushes data from AXI stream to AXI-MM
+
+# DMA Addressing
+# - burst mode can be "wrap" or "increment"
+# - "increment" bursts will increment the address when moving to the next image
+# - "wrap" bursts will reinitialize the address to the start address,
+#   and are useful for e.g. streaming weights, where the same buffer is
+#   repeatedly read into the FPGA
+# - no additional alignment restrictions beyond anything specified in the AXI spec
+
+# Interfaces
+# - AXI-MM name specified by intfName unless this is set to "" (empty, the default),
+#   in which case the output AXI-MM interface is named "out" and the input one "in0"
+# - AXI-MM interface width (in bits) is specified by intfWidth
+# - AXI-Stream interface width (in bits) is specified by streamWidth
+# - If intfWidth and streamWidth are not equal, the DMA core performs
+#   width conversion by going up to the least common multiple of bitwidths,
+#   e.g. intfWidth=32b -> 96b -> streamWidth=24b
+# - transfers occur in multiples of the AXI-MM interface width, therefore
+#   the total number of bits in the tensor must be a multiple of intfWidth
+# - transfers occur in multiples of the AXI-Stream interface width, therefore
+#   the total number of bits in the tensor must be a multiple of streamWidth
+# - both interface widths must be a multiple of 8b (AXI protocol requirement)
+# - in most systems, intfWidth is also restricted to a power of 2 (e.g. Vitis)
+#   but this is not universal so we don't check here explicitly
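+# - e.g. a (1, 64) INT8 tensor is 512 bits in total, i.e. 16 transfers over a 32-bit intfWidth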
+
+# Input/output tensor shapes
+# - The data being moved is a tensor of shape numInputVectors+[NumChannels]
+# - The data type of the tensor elements is specified by dataType
+# - on the stream side
+#       - the normal shape is the same as the ONNX tensor attached to it
+#       - the folded shape is computed from the stream width and normal shape
+# - on the AXI-MM side
+#       - the normal shape is the same as the one on the stream side
+#       - the folded shape is not defined
+
+
+class IODMA(HLSCustomOp):
+    """Class that corresponds to finn-hlslib DMA function(s)."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "NumChannels": ("i", True, 0),
+            # FINN input datatype
+            "dataType": ("s", True, ""),
+            # Stream parameters
+            "streamWidth": ("i", False, 32),
+            # DMA-specific parameters
+            "intfWidth": ("i", False, 32),
+            "burstMode": ("s", False, "increment"),
+            "direction": ("s", False, "in"),
+            # shape describing input vecs per execution
+            "numInputVectors": ("ints", False, [1]),
+            # name of axi-mm interface
+            "intfName": ("s", False, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self):
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        num_ch = self.get_nodeattr("NumChannels")
+        ishape = tuple(vecs + [num_ch])
+        return ishape
+
+    def get_normal_output_shape(self):
+        return self.get_normal_input_shape()
+
+    def get_folded_input_shape(self):
+        if self.get_nodeattr("direction") == "in":
+            raise ValueError("Folded input shape not defined for input IODMA")
+        else:
+            shape = list(self.get_normal_input_shape())
+            itype_bits = self.get_input_datatype().bitwidth()
+            intfw = self.get_nodeattr("streamWidth")
+            assert (
+                intfw % itype_bits == 0
+            ), "Input stream width must be a multiple of datatype bits"
+            elems_per_word = intfw // itype_bits
+            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
+            fold_depth = shape[-1] // elems_per_word
+            shape[-1] = fold_depth
+            shape.append(elems_per_word)
+            return tuple(shape)
+
+    def get_folded_output_shape(self):
+        if self.get_nodeattr("direction") == "out":
+            raise ValueError("Folded output shape not defined for output IODMA")
+        else:
+            shape = list(self.get_normal_output_shape())
+            itype_bits = self.get_output_datatype().bitwidth()
+            intfw = self.get_nodeattr("streamWidth")
+            assert (
+                intfw % itype_bits == 0
+            ), "Input stream width must be a multiple of datatype bits"
+            elems_per_word = intfw // itype_bits
+            assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
+            fold_depth = shape[-1] // elems_per_word
+            shape[-1] = fold_depth
+            shape.append(elems_per_word)
+            return tuple(shape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        exp_idtype = self.get_input_datatype()
+        assert dtype == exp_idtype, "Unexpected datatype."
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("dataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self):
+        if self.get_nodeattr("direction") == "in":
+            return self.get_nodeattr("intfWidth")
+        elif self.get_nodeattr("direction") == "out":
+            return self.get_nodeattr("streamWidth")
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+
+    def get_outstream_width(self):
+        if self.get_nodeattr("direction") == "out":
+            return self.get_nodeattr("intfWidth")
+        elif self.get_nodeattr("direction") == "in":
+            return self.get_nodeattr("streamWidth")
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+
+    def get_number_output_values(self):
+        oshape = self.get_normal_output_shape()
+        itype_bits = self.get_input_datatype().bitwidth()
+        intfw = self.get_nodeattr("intfWidth")
+        nelems = np.prod(oshape)
+        nbits = nelems * itype_bits
+        assert nbits % intfw == 0, "DMA: transfer size must be a multiple of intfWidth"
+        ovalues = nbits // intfw
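+        # e.g. (illustrative): oshape=(1, 28, 28, 64), INT8 data, intfWidth=64
+        # -> nbits = 1*28*28*64*8 = 401408 and ovalues = 401408 // 64 = 6272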
+        return ovalues
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "dma.h"']
+        self.code_gen_dict["$GLOBALS$"].append('#include "streamtools.h"')
+
+    def defines(self, var):
+        itype_bits = self.get_input_datatype().bitwidth()
+        total_bits = itype_bits * np.prod(self.get_normal_input_shape())
+        assert total_bits % 8 == 0, "DMA input size must be a whole number of bytes"
+        total_bytes = total_bits // 8
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define NumBytes1 {}\n#define DataWidth1 {}\n""".format(
+                total_bytes, self.get_nodeattr("intfWidth")
+            )
+        ]
+
+    def get_ap_int_max_w(self):
+        "Return the maximum width of any ap_int used in this module."
+        instream = self.get_instream_width()
+        outstream = self.get_outstream_width()
+        width_lcm = (instream * outstream) // math.gcd(instream, outstream)
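+        # e.g. (illustrative): instream = 32, outstream = 48
+        # -> width_lcm = (32 * 48) // gcd(32, 48) = 1536 // 16 = 96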
+        return width_lcm
+
+    def docompute(self):
+        direction = self.get_nodeattr("direction")
+        mode = self.get_nodeattr("burstMode")
+        if direction == "in":
+            if mode == "wrap":
+                func = "Mem2Stream_Batch_external_wmem"
+            else:
+                func = "Mem2Stream_Batch"
+            dwc_func = "WidthAdjustedOutputStream"
+        elif direction == "out":
+            func = "Stream2Mem_Batch"
+            dwc_func = "WidthAdjustedInputStream"
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+        # define templates for instantiation
+        dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);"
+        dwc_inst_template = dwc_func + "<%d, %d, %d> %s(%s, numReps);"
+        # do stream infrastructure and instantiations
+        intfw = self.get_nodeattr("intfWidth")
+        strmw = self.get_nodeattr("streamWidth")
+        width_lcm = (strmw * intfw) // math.gcd(strmw, intfw)
+        # we always need two width-converter streams: one of width_lcm and one of
+        # intfw width, because WidthAdjustedInput/OutputStream can only convert
+        # between widths where one divides the other; going through the LCM
+        # guarantees this for both conversion steps
+        dtype_bits = self.get_input_datatype().bitwidth()
+        total_bits = dtype_bits * np.prod(self.get_normal_input_shape())
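+        # e.g. (illustrative): intfWidth=32, streamWidth=48 -> width_lcm=96; an "in"
+        # DMA then reads 32-bit words from memory, widens them to 96 bits and
+        # finally converts to the 48-bit output stream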
+        if direction == "in":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                dwc_inst_template
+                % (width_lcm, strmw, total_bits // width_lcm, "dwc_lcm", "out"),
+                dwc_inst_template
+                % (intfw, width_lcm, total_bits // intfw, "dwc_intfw", "dwc_lcm"),
+                dma_inst_template % ("in0", "dwc_intfw"),
+            ]
+        else:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                dwc_inst_template
+                % (strmw, width_lcm, total_bits // strmw, "dwc_lcm", "in0"),
+                dwc_inst_template
+                % (width_lcm, intfw, total_bits // width_lcm, "dwc_intfw", "dwc_lcm"),
+                dma_inst_template % ("dwc_intfw", "out"),
+            ]
+
+    def blackboxfunction(self):
+        packed_ibits = self.get_instream_width()
+        packed_hls_type_in = "ap_uint<%d>" % packed_ibits
+        packed_obits = self.get_outstream_width()
+        packed_hls_type_out = "ap_uint<%d>" % packed_obits
+        direction = self.get_nodeattr("direction")
+        if direction == "in":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)"
+                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
+            ]
+        elif direction == "out":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)"
+                % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
+            ]
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE s_axilite port=numReps bundle=control"
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE s_axilite port=return bundle=control"
+        )
+        direction = self.get_nodeattr("direction")
+        intfname = self.get_nodeattr("intfName")
+        if direction == "in":
+            if intfname == "":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS INTERFACE m_axi offset=slave port=in0"
+                )
+            else:
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname)
+                )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE s_axilite port=in0 bundle=control"
+            )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=out"
+            )
+        elif direction == "out":
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=in0"
+            )
+            if intfname == "":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS INTERFACE m_axi offset=slave port=out"
+                )
+            else:
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname)
+                )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE s_axilite port=out bundle=control"
+            )
+        else:
+            raise ValueError("Invalid IODMA direction, please set to in or out")
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW")
+
+    def execute_node(self, context, graph):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def strm_decl(self):
+        pass
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 0060e5d400f30055d532671c8cf1680f0668442a..e4317e02d46df90c8fd0c8854262ca6eb0ea4f48 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -31,6 +31,7 @@
 from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
     ConvolutionInputGenerator,
 )
+from finn.custom_op.fpgadataflow.downsampler import DownSampler
 from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
 from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
@@ -51,11 +52,14 @@ from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
 from finn.custom_op.quantavgpool2d import QuantAvgPool2d
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
+from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
+from finn.custom_op.fpgadataflow.iodma import IODMA
 
 # create a mapping of all known CustomOp names and classes
 custom_op = {}
 
 custom_op["MultiThreshold"] = MultiThreshold
+custom_op["DownSampler"] = DownSampler
 custom_op["XnorPopcountMatMul"] = XnorPopcountMatMul
 custom_op["Im2Col"] = Im2Col
 custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
@@ -74,6 +78,8 @@ custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
 custom_op["QuantAvgPool2d"] = QuantAvgPool2d
 custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
+custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
+custom_op["IODMA"] = IODMA
 
 
 def getCustomOp(node):
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index b70b126680d650547cf376dd601c048c73a1cfd4..34a697a43426aae0f984770689552063aa35b9e8 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from onnx import helper, TensorProto
+import numpy as np
 
 from finn.core.datatype import DataType
 from finn.transformation import Transformation
@@ -34,7 +35,10 @@ from finn.custom_op.registry import getCustomOp
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
 import finn.core.data_layout as DataLayout
+from finn.util.onnx import nchw_to_nhwc
+import warnings
 from finn.util.basic import get_by_name
 
 
 class InferConvInpGen(Transformation):
@@ -52,6 +56,9 @@ class InferConvInpGen(Transformation):
                 i2c_in_shape = model.get_tensor_shape(i2c_input)
                 i2c_out_shape = model.get_tensor_shape(i2c_output)
                 dt = model.get_tensor_datatype(i2c_input)
+                if not dt.is_integer():
+                    warnings.warn("Input is not int. Can't infer ConvInpGen")
+                    continue
                 i2c_inst = getCustomOp(n)
                 stride = i2c_inst.get_nodeattr("stride")
                 k = i2c_inst.get_nodeattr("kernel_size")
@@ -103,24 +110,40 @@ class InferConvInpGen(Transformation):
                     )
                     graph.node.insert(node_ind, padding_node)
 
-                # create equivalent ConvolutionInputGenerator node
-                ConvInpGen_node = helper.make_node(
-                    "ConvolutionInputGenerator",
-                    [ConvInpGen_input],
-                    [i2c_output],
-                    domain="finn",
-                    backend="fpgadataflow",
-                    ConvKernelDim=k,
-                    IFMChannels=ifm_ch,
-                    IFMDim=ConvInpGen_idim,
-                    OFMDim=ofm_dim,
-                    SIMD=ifm_ch,
-                    Stride=stride,
-                    inputDataType=dt.name,
-                    outputDataType=dt.name,
-                    depthwise=depthwise,
-                )
-                graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                if stride > 1 and k == 1:
+                    # create DownSampler node
+                    ConvInpGen_node = helper.make_node(
+                        "DownSampler",
+                        [ConvInpGen_input],
+                        [i2c_output],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        ImgDim=ConvInpGen_idim,
+                        NumChannels=ifm_ch,
+                        SIMD=ifm_ch,
+                        Stride=stride,
+                        inputDataType=dt.name,
+                    )
+                    graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                else:
+                    # create equivalent ConvolutionInputGenerator node
+                    ConvInpGen_node = helper.make_node(
+                        "ConvolutionInputGenerator",
+                        [ConvInpGen_input],
+                        [i2c_output],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        ConvKernelDim=k,
+                        IFMChannels=ifm_ch,
+                        IFMDim=ConvInpGen_idim,
+                        OFMDim=ofm_dim,
+                        SIMD=ifm_ch,
+                        Stride=stride,
+                        inputDataType=dt.name,
+                        outputDataType=dt.name,
+                        depthwise=depthwise,
+                    )
+                    graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
                 graph.node.remove(n)
                 graph_modified = True
@@ -627,3 +650,243 @@ class InferThresholdingLayer(Transformation):
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferChannelwiseLinearLayer(Transformation):
+    """Convert any channel-wise Add/Mul into a HLS layer."""
+
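+    # e.g. (illustrative): an Add node whose second input is a constant of shape
+    # (1, C, 1, 1) or (C,), applied to an integer tensor, is converted into a
+    # ChannelwiseOp_Batch node with Func="add"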
+    def get_smallest_possible(self, vals):
+        """Returns smallest (fewest bits) possible DataType that can represent
+        value. Prefers unsigned integers where possible."""
+        vals = np.array(vals)
+        for v in vals:
+            assert int(v) == v, "Error: expected integer values, got float"
+
+        for k in DataType.__members__:
+            dt = DataType[k]
+
+            if dt in [DataType.BIPOLAR, DataType.TERNARY, DataType.FLOAT32]:
+                # not currently supported
+                continue
+
+            if (dt.min() <= vals).all() and (vals <= dt.max()).all():
+                return dt
+
+        warnings.warn(
+            """InferChannelwiseLinearLayer: output values may not be
+            representable with the supported data types.
+            Falling back to the widest available data type.
+            This will lead to errors if there are no constraints on the input."""
+        )
+
+        if (0 <= vals).all():
+            return DataType.UINT32
+        else:
+            return DataType.INT32
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Add" or node.op_type == "Mul":
+                # assuming input[0] is dynamic
+                ll_input = node.input[0]
+                ll_output = node.output[0]
+                ll_in_shape = model.get_tensor_shape(ll_input)
+
+                # check if input 1 has an initializer
+                ll_const = node.input[1]
+                if ll_const is not None:
+                    ll_cinit = model.get_initializer(ll_const)
+                    if ll_cinit is None:
+                        # input 1 is also dynamic
+                        continue
+                else:
+                    continue
+
+                # get number of channels and channel index from input
+                ll_in_layout = model.get_tensor_layout(ll_input)
+                if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC:
+                    ch_index = -1
+                    ch = ll_in_shape[-1]
+                elif ll_in_layout == DataLayout.NCHW:
+                    ch_index = 1
+                    ch = ll_in_shape[1]
+                else:
+                    continue
+
+                # check if the shape of initializer is compatible
+                ll_cinit_shape = list(ll_cinit.shape)
+                if np.prod(ll_cinit_shape) == 1:
+                    warnings.warn(
+                        "Broadcasting " + str(node.op_type) + "(" + node.name + ")"
+                    )
+                    ll_cinit = np.full((ch), ll_cinit.flatten()[0])
+                elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch:
+                    # parameter shape not compatible with Channelwise_batch
+                    continue
+
+                # check that the initializer contains integer values (as floats)
+                if not (ll_cinit.astype(np.int32) == ll_cinit).all():
+                    continue
+                # all initializer conditions are met
+
+                # check inputs
+                idt = model.get_tensor_datatype(ll_input)
+                if not idt.is_integer():
+                    # skip conversion for layers with float input
+                    continue
+
+                # check the layout of inputs/outputs and convert to NHWC
+                # if necessary
+                if ll_in_layout == DataLayout.NCHW:
+                    ll_input = nchw_to_nhwc(ll_input, model, node_ind)
+                    node_ind += 1
+                    ll_in_shape = model.get_tensor_shape(ll_input)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+                ll_output_layout = model.get_tensor_layout(ll_output)
+                if ll_output_layout == DataLayout.NCHW:
+                    ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                # get parameter data type
+                param_min = min(ll_cinit.flatten())
+                param_max = max(ll_cinit.flatten())
+                pdt = self.get_smallest_possible([param_min, param_max])
+
+                # set function and determine output data type
+                if node.op_type == "Add":
+                    func = "add"
+                    out_min = idt.min() + param_min
+                    out_max = idt.max() + param_max
+                    odt = self.get_smallest_possible([out_min, out_max])
+                elif node.op_type == "Mul":
+                    func = "mul"
+                    possible_limits = []
+                    possible_limits += [idt.min() * param_min]
+                    possible_limits += [idt.min() * param_max]
+                    possible_limits += [idt.max() * param_min]
+                    possible_limits += [idt.max() * param_max]
+                    odt = self.get_smallest_possible(possible_limits)
+
+                model.set_initializer(ll_const, ll_cinit.reshape(ch))
+                model.set_tensor_datatype(ll_output, odt)
+
+                # create node with no parallelization first
+                pe = 1
+                assert ch % pe == 0, "Requirement IFC divisible by PE is violated."
+                # create and insert node
+                new_node = helper.make_node(
+                    "ChannelwiseOp_Batch",
+                    [ll_input, ll_const],
+                    [ll_output],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    Func=func,
+                    NumChannels=ch,
+                    PE=pe,
+                    inputDataType=idt.name,
+                    paramDataType=pdt.name,
+                    outputDataType=odt.name,
+                    numInputVectors=list(ll_in_shape[:-1]),
+                )
+                graph.node.insert(insert_point, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class InferGlobalAccPoolLayer(Transformation):
+    """Convert any GlobalAveragePool into a GlobalAccPool HLS layer and a scalar Mul."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "GlobalAveragePool":
+                in0 = node.input[0]
+                result = node.output[0]
+                in0_shape = model.get_tensor_shape(in0)
+
+                idt = model.get_tensor_datatype(in0)
+
+                # skip conversion for layers with float input
+                if not idt.is_integer():
+                    continue
+
+                # check layout and convert if necessary
+                in0_layout = model.get_tensor_layout(in0)
+                result_layout = model.get_tensor_layout(result)
+
+                if in0_layout == DataLayout.NCHW:
+                    in0 = nchw_to_nhwc(in0, model, node_ind)
+                    node_ind += 1
+                    in0_shape = model.get_tensor_shape(in0)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+
+                if result_layout == DataLayout.NCHW:
+                    result = nchw_to_nhwc(result, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                num_ch = int(in0_shape[-1])
+                vecs = in0_shape[:-1]
+                # create node with no parallelization first
+                pe = 1
+                assert (
+                    num_ch % pe == 0
+                ), "Requirement NumChannels divisible by PE is violated."
+
+                # create an additional tensor of the same shape and layout as result
+                out_shape = model.get_tensor_shape(result)
+                pool_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+                )
+                model.graph.value_info.append(pool_out)
+                pool_out = pool_out.name
+                model.set_tensor_layout(pool_out, model.get_tensor_layout(result))
+
+                new_pool = helper.make_node(
+                    "GlobalAccPool_Batch",
+                    [in0],
+                    [pool_out],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    NumChannels=num_ch,
+                    PE=pe,
+                    inputDataType=idt.name,
+                    numInputVectors=vecs,
+                )
+
+                mul_value = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, [1]
+                )
+                model.graph.value_info.append(mul_value)
+                model.set_initializer(mul_value.name, np.array(1 / (vecs[1] * vecs[2])))
+                new_mul = helper.make_node("Mul", [pool_out, mul_value.name], [result],)
+                graph.node.insert(insert_point, new_pool)
+                graph.node.insert(insert_point + 1, new_mul)
+                node_ind += 1
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d9a51875499d77f384c03f54009a9dd1001dea0
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/floorplan.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.util.basic import get_by_name
+
+
+class Floorplan(Transformation):
+    """Perform Floorplanning of the dataflow design. Separate DMAs into their own
+    partitions IDs, and TODO: split the design into sections of defined size"""
+
+    def __init__(self, limits=None):
+        super().__init__()
+        self.resource_limits = limits
+
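+    # Illustrative usage (assumed flow, not prescribed by this diff): a typical
+    # sequence would be
+    #   model = model.transform(InsertIODMA(64))
+    #   model = model.transform(Floorplan())
+    # after which each IODMA (and dynamic TLastMarker) node carries its own
+    # partition_id attribute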
+    def apply(self, model):
+        target_partition_id = 0
+        # we currently assume that all dataflow nodes belonging to the same partition
+        # are connected to each other and there is a single input/output to/from each.
+        all_nodes = list(model.graph.node)
+        df_nodes = list(
+            filter(lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes)
+        )
+        dma_nodes = list(filter(lambda x: x.op_type == "IODMA", df_nodes))
+
+        non_dma_nodes = list(filter(lambda x: x not in dma_nodes, df_nodes))
+        dyn_tlastmarker_nodes = list(
+            filter(
+                lambda x: x.op_type == "TLastMarker"
+                and getCustomOp(x).get_nodeattr("DynIters") == "true",
+                non_dma_nodes,
+            )
+        )
+
+        non_dma_nodes = list(
+            filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes)
+        )
+
+        for node in dma_nodes:
+            node_inst = getCustomOp(node)
+            node_inst.set_nodeattr("partition_id", target_partition_id)
+            target_partition_id += 1
+
+        for node in dyn_tlastmarker_nodes:
+            node_inst = getCustomOp(node)
+            node_inst.set_nodeattr("partition_id", target_partition_id)
+            target_partition_id += 1
+
+        for node in non_dma_nodes:
+            # TODO: implement proper floorplanning; for now just a single partition
+            node_inst = getCustomOp(node)
+            node_inst.set_nodeattr("partition_id", target_partition_id)
+
+        return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4368edea717f7499481e9b1c6ac20f7d5bb5f58
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from onnx import TensorProto
+from onnx import helper as oh
+
+from finn.util.basic import get_by_name
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.transformation.general import SortGraph
+import finn.core.data_layout as DataLayout
+import math
+import numpy as np
+
+
+class InsertIODMA(Transformation):
+    """Insert DMA nodes on all inputs and outputs."""
+
+    def __init__(self, max_intfwidth=32):
+        super().__init__()
+        assert (
+            2 ** math.log2(max_intfwidth) == max_intfwidth
+        ), "max_intfwidth must be a power of 2"
+        self.max_intfwidth = max_intfwidth
+
+    def apply(self, model):
+        # only makes sense for a pure fpgadataflow graph -- so we check!
+        all_nodes = list(model.graph.node)
+        assert all(
+            get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
+            for x in all_nodes
+        )
+        # parse streamingfclayers looking for external weights with no attached IODMA
+        fc_extw_nodes = list(
+            filter(
+                lambda x: x.op_type == "StreamingFCLayer_Batch"
+                and get_by_name(x.attribute, "mem_mode") is not None
+                and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") == "external"
+                and model.find_producer(x.input[1]) is None,
+                all_nodes,
+            )
+        )
+        graph_in_name = model.graph.input[0].name
+        first_node = model.find_consumer(graph_in_name)
+        graph_out_name = model.graph.output[0].name
+        final_node = model.find_producer(graph_out_name)
+        if (
+            final_node.op_type == "IODMA"
+            and first_node.op_type == "IODMA"
+            and len(fc_extw_nodes) == 0
+        ):
+            # TODO maybe check the correctness of properties
+            return (model, False)
+        else:
+            if final_node.op_type != "IODMA":
+                # check if tensor is NHWC
+                assert (
+                    model.get_tensor_layout(graph_out_name) == DataLayout.NHWC
+                    or model.get_tensor_layout(graph_out_name) == DataLayout.NC
+                ), "Data layout of tensors must be NHWC or NC"
+                out_shape = model.get_tensor_shape(graph_out_name)
+                out_dtype = model.get_tensor_datatype(graph_out_name)
+                # determine the feasible interface width
+                transfer_bits = np.prod(out_shape) * out_dtype.bitwidth()
+                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
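+                # e.g. (illustrative): out_shape=(1, 28, 28, 64), INT8 output,
+                # max_intfwidth=32 -> transfer_bits=401408 and
+                # intfwidth = gcd(401408, 32) = 32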
+                assert (
+                    intfwidth % 8 == 0
+                ), "No feasible interface width for transfer size"
+                # get width of stream input to DMA
+                streamWidth = getCustomOp(final_node).get_outstream_width()
+                # make new buffer
+                final_node_out = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+                )
+                model.graph.value_info.append(final_node_out)
+                model.set_tensor_datatype(final_node_out.name, out_dtype)
+                # reroute final node output to final_node_out_name
+                final_node.output[0] = final_node_out.name
+                dma_node = oh.make_node(
+                    "IODMA",
+                    [final_node_out.name],
+                    [graph_out_name],
+                    numInputVectors=out_shape[:-1],
+                    NumChannels=out_shape[-1],
+                    dataType=str(out_dtype.name),
+                    intfWidth=intfwidth,
+                    streamWidth=streamWidth,
+                    direction="out",
+                    domain="finn",
+                    backend="fpgadataflow",
+                )
+                model.graph.node.append(dma_node)
+            if first_node.op_type != "IODMA":
+                # check if tensor is NHWC
+                assert (
+                    model.get_tensor_layout(graph_in_name) == DataLayout.NHWC
+                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
+                ), "Data layout of tensors must be NHWC or NC"
+                in_shape = model.get_tensor_shape(graph_in_name)
+                in_dtype = model.get_tensor_datatype(graph_in_name)
+                # determine the feasible interface width
+                transfer_bits = np.prod(in_shape) * in_dtype.bitwidth()
+                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
+                assert (
+                    intfwidth % 8 == 0
+                ), "No feasible interface width for transfer size"
+                # get width of stream output from DMA
+                streamWidth = getCustomOp(first_node).get_instream_width()
+                # make new buffer
+                first_node_in = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
+                )
+                model.graph.value_info.append(first_node_in)
+                model.set_tensor_datatype(first_node_in.name, in_dtype)
+                # reroute first node input to first_node_in_name
+                first_node.input[0] = first_node_in.name
+                dma_node = oh.make_node(
+                    "IODMA",
+                    [graph_in_name],
+                    [first_node_in.name],
+                    numInputVectors=in_shape[:-1],
+                    NumChannels=in_shape[-1],
+                    dataType=str(in_dtype.name),
+                    intfWidth=intfwidth,
+                    streamWidth=streamWidth,
+                    direction="in",
+                    domain="finn",
+                    backend="fpgadataflow",
+                )
+                model.graph.node.insert(0, dma_node)
+            for fc_node in fc_extw_nodes:
+                # check if tensor is NHWC
+                assert (
+                    model.get_tensor_layout(fc_node.input[1]) == DataLayout.NHWC
+                    or model.get_tensor_layout(graph_in_name) == DataLayout.NC
+                ), "Data layout of tensors must be NHWC or NC"
+                fc_w_name = fc_node.input[1]
+                w_shape = model.get_tensor_shape(fc_w_name)
+                w_dtype = model.get_tensor_datatype(fc_w_name)
+                # determine the feasible interface width
+                transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
+                intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
+                assert (
+                    intfwidth % 8 == 0
+                ), "No feasible interface width for transfer size"
+                # calculate width of stream output from DMA
+                pe = get_by_name(fc_node.attribute, "PE").i
+                simd = get_by_name(fc_node.attribute, "SIMD").i
+                streamWidth = simd * pe * w_dtype.bitwidth()
+                # make new buffer
+                fc_node_in = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
+                )
+                model.graph.value_info.append(fc_node_in)
+                model.set_tensor_datatype(fc_node_in.name, w_dtype)
+                dma_node = oh.make_node(
+                    "IODMA",
+                    [fc_w_name],
+                    [fc_node_in.name],
+                    numInputVectors=w_shape[:-1],
+                    NumChannels=w_shape[-1],
+                    dataType=str(w_dtype.name),
+                    intfWidth=intfwidth,
+                    streamWidth=streamWidth,
+                    direction="in",
+                    burstMode="wrap",
+                    domain="finn",
+                    backend="fpgadataflow",
+                )
+                fc_node.input[1] = fc_node_in.name
+                model.graph.node.insert(0, dma_node)
+            model = model.transform(SortGraph())
+            return (model, True)
diff --git a/src/finn/transformation/remove_identity.py b/src/finn/transformation/remove_identity.py
deleted file mode 100644
index d7a58d59c1bb8ff643e691442e7eda3c0516aa5c..0000000000000000000000000000000000000000
--- a/src/finn/transformation/remove_identity.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-from finn.transformation import Transformation
-
-
-def _is_identity(node, model):
-    if node.op_type == "Mul":
-        scale = model.get_initializer(node.input[1])
-        if scale is not None:
-            return (scale == 1).all()
-    elif node.op_type == "Add":
-        bias = model.get_initializer(node.input[1])
-        if bias is not None:
-            return (bias == 0).all()
-    return False
-
-
-class RemoveIdentity(Transformation):
-    """Remove nodes that apply identity ops from the graph, including:
-    * Multiply by 1
-    * Add 0
-    ."""
-
-    def apply(self, model):
-        graph = model.graph
-        node_ind = 0
-        graph_modified = False
-        for node in graph.node:
-            node_ind += 1
-            if _is_identity(node, model):
-                node_src = node.input[0]
-                node_dst = node.output[0]
-                graph.node.remove(node)
-                model.rename_tensor(node_dst, node_src)
-                graph_modified = True
-        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/__init__.py b/src/finn/transformation/streamline/__init__.py
index d9c12a20975084705b801c0ff027d4b99aff9490..d7686eaadcbc800542ab96c5f45145857412b773 100644
--- a/src/finn/transformation/streamline/__init__.py
+++ b/src/finn/transformation/streamline/__init__.py
@@ -53,7 +53,7 @@ from finn.transformation.streamline.reorder import (
     MoveAddPastMul,
     MoveScalarMulPastMatMul,
     MoveScalarAddPastMatMul,
-    MoveScalarAddPastConv,
+    MoveAddPastConv,
     MoveScalarMulPastConv,
 )
 
@@ -75,7 +75,7 @@ class Streamline(Transformation):
             AbsorbSignBiasIntoMultiThreshold(),
             MoveAddPastMul(),
             MoveScalarAddPastMatMul(),
-            MoveScalarAddPastConv(),
+            MoveAddPastConv(),
             MoveScalarMulPastMatMul(),
             MoveScalarMulPastConv(),
             MoveAddPastMul(),
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index dc01eea411fc1f640e481c9be02a92acdd59533f..f089275c221f769daace3e9628a00bf87b4e5457 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -31,6 +31,7 @@ from onnx import helper as oh
 import warnings
 
 from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
 from finn.transformation import Transformation
 from finn.util.basic import get_by_name
 from finn.custom_op.registry import getCustomOp
@@ -357,7 +358,68 @@ class AbsorbTransposeIntoMultiThreshold(Transformation):
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
 
+class AbsorbTransposeIntoFlatten(Transformation):
+    """Absorb transpose node into succeeding flatten node, if H=W=1 and the first
+    dimension stays the same. Can also be applied if flatten is implemented implicitly
+    by a reshape node with shape [1, -1] and the first input dimension is 1"""
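+    # e.g. (illustrative): a Transpose(perm=[0, 3, 1, 2]) producing a (1, C, 1, 1)
+    # tensor followed by Reshape([1, -1]) is replaced by a single Flatten applied
+    # to the transpose's (1, 1, 1, C) NHWC input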
 
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "Reshape"
+                and (model.get_initializer(n.input[1]) == [1, -1]).all()
+            ) or n.op_type == "Flatten":
+                prod = model.find_producer(n.input[0])
+                if (
+                    prod is not None
+                    and prod.op_type == "Transpose"
+                    # we ensure that the first dimension is not changed from the
+                    # transpose operation
+                    and get_by_name(prod.attribute, "perm").ints[0] == 0
+                ):
+                    data_layout = model.get_tensor_layout(prod.input[0])
+                    # check for the data layout to interpret input shape correctly
+                    if data_layout is None:
+                        warnings.warn(
+                            """Data layout for input tensor of Transpose node is not set.
+                                To use AbsorbTransposeIntoFlatten transformation
+                                please set tensor data layout."""
+                        )
+                        continue
+                    elif data_layout == DataLayout.NCHW:
+                        (b, c, h, w) = model.get_tensor_shape(prod.input[0])
+                        # if h=w=1 the transposition can be absorbed, otherwise
+                        # the absorption would lead to an error in the behavior
+                        if h != 1 or w != 1:
+                            continue
+                        # the flatten node from onnx keeps by default the first
+                        # dim and flattens the rest, that is why this transformation
+                        # can only work with b != 1 if the model contains already a
+                        # flatten node and not a reshape node with shape = [1, -1].
+                        # If the first  dim of the input tensor is not 1, flatten and
+                        # reshape (with shape = [1, -1]) would lead to different results
+                        if n.op_type == "Reshape" and b != 1:
+                            continue
+                    elif data_layout == DataLayout.NHWC:
+                        (b, h, w, c) = model.get_tensor_shape(prod.input[0])
+                        if h != 1 or w != 1:
+                            continue
+                        if n.op_type == "Reshape" and b != 1:
+                            continue
+                    # create single flatten node and remove obsolete nodes
+                    node = oh.make_node("Flatten", [prod.input[0]], [n.output[0]])
+                    graph.node.remove(n)
+                    graph.node.remove(prod)
+                    graph.node.insert(node_ind, node)
+                    graph_modified = True
+        if graph_modified:
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
 class AbsorbScalarMulIntoTopK(Transformation):
     """Absorb a mul node into a suceeding topk node if the mul is scalar."""
 
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index a1bd16f6d0b70193122d5d067ccdee395260c7b1..2b03532ce3ba7d5159e5ae57e61c2af9c8c37fce 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -29,9 +29,13 @@
 import numpy as np
 import warnings
 from onnx import helper as oh
+from onnx import TensorProto
 
 from finn.transformation import Transformation
+import finn.core.data_layout as DataLayout
 from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.core.datatype import DataType
 from finn.core.onnx_exec import execute_node
 from finn.util.basic import get_by_name
@@ -68,8 +72,11 @@ class MoveAddPastMul(Transformation):
                     add_weight_name = n.input[1]
                     A = model.get_initializer(mul_weight_name)
                     B = model.get_initializer(add_weight_name)
-                    assert A is not None, "Initializer for mul weights is not set."
-                    assert B is not None, "Initializer for add weights is not set."
+                    if (A is None) or (B is None):
+                        warnings.warn(
+                            "Mul or add does not have constant params, skipping"
+                        )
+                        continue
                     start_name = n.input[0]
                     middle_name = n.output[0]
                     end_name = consumer.output[0]
@@ -124,8 +131,9 @@ class MoveScalarMulPastMatMul(Transformation):
                     matmul_weight_name = consumer.input[1]
                     A = model.get_initializer(mul_weight_name)
                     W = model.get_initializer(matmul_weight_name)
-                    assert A is not None, "Initializer for mul weights is not set."
-                    assert W is not None, "Initializer for matmul weights is not set."
+                    if (A is None) or (W is None):
+                        warnings.warn("MatMul or Mul params are not constant, skipping")
+                        continue
                     start_name = n.input[0]
                     middle_name = n.output[0]
                     end_name = consumer.output[0]
@@ -181,8 +189,9 @@ class MoveScalarAddPastMatMul(Transformation):
                     matmul_weight_name = consumer.input[1]
                     A = model.get_initializer(add_weight_name)
                     W = model.get_initializer(matmul_weight_name)
-                    assert A is not None, "Initializer for add weights is not set."
-                    assert W is not None, "Initializer for matmul weights is not set."
+                    if (A is None) or (W is None):
+                        warnings.warn("MatMul or Add params are not constant, skipping")
+                        continue
                     start_name = n.input[0]
                     middle_name = n.output[0]
                     end_name = consumer.output[0]
@@ -216,8 +225,8 @@ class MoveScalarAddPastMatMul(Transformation):
         return (model, graph_modified)
 
 
-class MoveScalarAddPastConv(Transformation):
-    """Move scalar add operations past conv operations. We want to have adds
+class MoveAddPastConv(Transformation):
+    """Move scalar and channelwise add operations past conv operations. We want to have adds
     next to each other such that they can be collapsed into a single add."""
 
     def apply(self, model):
@@ -242,8 +251,12 @@ class MoveScalarAddPastConv(Transformation):
                     add_weight_name = n.input[1]
                     conv_in_name = consumer.input[0]
                     conv_in_shape = model.get_tensor_shape(conv_in_name)
+                    # assume the data layout to be NCHW
+                    channels = conv_in_shape[1]
                     A = model.get_initializer(add_weight_name)
-                    assert A is not None, "Initializer for add weights is not set."
+                    if A is None:
+                        warnings.warn("Add param is not constant, skipping")
+                        continue
                     start_name = n.input[0]
                     end_name = consumer.output[0]
                     conv_out_shape = model.get_tensor_shape(end_name)
@@ -252,11 +265,17 @@ class MoveScalarAddPastConv(Transformation):
                     pads = list(get_by_name(consumer.attribute, "pads").ints)
                     if sum(pads) == 0:
                         using_padding = False
-                    if all(x == 1 for x in A.shape) and not using_padding:
+                    if (
+                        all(x == 1 for x in A.shape) or A.shape == (1, channels, 1, 1)
+                    ) and not using_padding:
                         # create a tensor filled with the add constant, in
                         # the shape expected by the convolution
                         conv_in_const = np.zeros(conv_in_shape, dtype=np.float32)
-                        conv_in_const.fill(A.item())
+                        if A.shape == (1, channels, 1, 1):
+                            for ch in range(channels):
+                                conv_in_const[0][ch].fill(A[0][ch].item())
+                        else:
+                            conv_in_const.fill(A.item())
                         # create an execution context and put in const input
                         exec_ctx = model.make_empty_exec_context()
                         exec_ctx[conv_in_name] = conv_in_const
@@ -311,7 +330,9 @@ class MoveScalarMulPastConv(Transformation):
                 ):
                     mul_weight_name = n.input[1]
                     A = model.get_initializer(mul_weight_name)
-                    assert A is not None, "Initializer for mul weights is not set."
+                    if A is None:
+                        warnings.warn("Mul param is not constant, skipping")
+                        continue
                     conv_node = consumer
                     mul_node = n
                     start_name = mul_node.input[0]
@@ -663,3 +684,215 @@ class MoveMaxPoolPastMultiThreshold(Transformation):
 
         model = model.transform(InferShapes())
         return (model, graph_modified)
+
+class MoveFlattenPastTopK(Transformation):
+    """Move flatten node past a succeeding topk node, if the "axis" attribute in topk
+    is set to -1 and the data layout before the flatten is NHWC with H=W=1"""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Flatten":
+                consumer = model.find_consumer(n.output[0])
+                if consumer is not None and consumer.op_type == "TopK":
+                    axis = get_by_name(consumer.attribute, "axis")
+                    if axis is None or axis.i != -1:
+                        continue
+                    start_name = n.input[0]
+                    data_layout = model.get_tensor_layout(start_name)
+                    if data_layout != DataLayout.NHWC:
+                        warnings.warn(
+                            """Transformation can't be applied. The input
+                            to flatten has to have DataLayout.NHWC"""
+                        )
+                        continue
+                    (b, h, w, c) = model.get_tensor_shape(start_name)
+                    if h != 1 or w != 1:
+                        continue
+                    # get parameter k from topk
+                    k = model.get_tensor_shape(consumer.output[1])[-1]
+
+                    # swap connections
+                    # new tensor because dims change
+                    middle_name = model.make_new_valueinfo_name()
+                    topk_indices = oh.make_tensor_value_info(
+                        middle_name, TensorProto.INT64, [b, h, w, k]
+                    )
+                    end_name = consumer.output[1]
+                    graph.value_info.append(topk_indices)
+
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+
+                    # set inputs and outputs correctly
+                    consumer.input[0] = start_name
+                    consumer.output[1] = middle_name
+                    model.set_tensor_shape(consumer.output[0], (b, h, w, k))
+
+                    n.input[0] = middle_name
+                    n.output[0] = end_name
+
+                    # insert them back in
+                    graph.node.insert(node_ind - 1, consumer)
+                    graph.node.insert(node_ind, n)
+
+                    graph_modified = True
+
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
+
+class MoveFlattenPastAffine(Transformation):
+    """Moves a node that implements a (1, -1) reshape past a MatMul, Mul or Add node."""
+
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "Flatten"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                consumer = model.find_consumer(n.output[0])
+                if (
+                    consumer is not None
+                    and (
+                        consumer.op_type == "MatMul"
+                        or consumer.op_type == "Mul"
+                        or consumer.op_type == "Add"
+                    )
+                    and not model.is_join_node(consumer)
+                ):
+                    # move flatten past operation and rewire tensors
+                    start_name = n.input[0]
+                    # check if data layout is set to NHWC and H=W=1
+                    datalayout = model.get_tensor_layout(start_name)
+                    if datalayout == DataLayout.NHWC:
+                        (b, h, w, c) = model.get_tensor_shape(start_name)
+                        if h != 1 or w != 1:
+                            warnings.warn(
+                                "The transformation can only be performed if H=W=1."
+                            )
+                            continue
+                    else:
+                        warnings.warn(
+                            """The Transformation can only be performed on
+                            operations that operate on data layout NHWC."""
+                        )
+                        continue
+                    middle_name = n.output[0]
+                    end_name = consumer.output[0]
+                    op_param_name = consumer.input[1]
+                    A = model.get_initializer(op_param_name)
+                    if A is None:
+                        warnings.warn("Param is not constant, skipping")
+                        continue
+                    op_in_dt = model.get_tensor_datatype(consumer.input[0])
+                    op_out_dt = model.get_tensor_datatype(consumer.output[0])
+                    start_shape = model.get_tensor_shape(start_name)
+                    dummy_in = np.random.uniform(low=0, high=1, size=(start_shape))
+
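+                    # push a dummy tensor through the op to determine the shape of
+                    # the new intermediate tensor between the op and the Flatten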
+                    if consumer.op_type == "MatMul":
+                        dummy_out = np.matmul(dummy_in, A)
+                    elif consumer.op_type == "Mul":
+                        dummy_out = dummy_in * A
+                    elif consumer.op_type == "Add":
+                        dummy_out = dummy_in + A
+
+                    new_op = oh.make_node(
+                        consumer.op_type,
+                        [start_name, op_param_name],
+                        [middle_name],
+                        name=consumer.name,
+                    )
+                    new_flatten = oh.make_node("Flatten", [middle_name], [end_name])
+                    graph.node.insert(node_ind, new_op)
+                    graph.node.insert(node_ind + 1, new_flatten)
+                    model.set_tensor_shape(middle_name, dummy_out.shape)
+                    # a Flatten node does not change the datatype, so it is enough
+                    # to propagate the datatypes of the op node
+                    model.set_tensor_datatype(start_name, op_in_dt)
+                    model.set_tensor_datatype(middle_name, op_out_dt)
+                    model.set_tensor_datatype(end_name, op_out_dt)
+                    # set datalayout
+                    model.set_tensor_layout(start_name, DataLayout.NHWC)
+                    model.set_tensor_layout(middle_name, DataLayout.NHWC)
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+                    graph_modified = True
+
+        model = model.transform(InferShapes())
+        model = model.transform(InferDataTypes())
+        model = model.transform(InferDataLayouts())
+        return (model, graph_modified)
+
+
+class MoveTransposePastScalarMul(Transformation):
+    """Moves a Transpose node past a scalar Mul node"""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "Transpose"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                consumer = model.find_consumer(n.output[0])
+                if (
+                    consumer is not None
+                    and consumer.op_type == "Mul"
+                    and not model.is_join_node(consumer)
+                ):
+                    mul_weight_name = consumer.input[1]
+                    A = model.get_initializer(mul_weight_name)
+                    if A is None:
+                        warnings.warn("Mul param is not constant, skipping")
+                        continue
+                    transp_node = n
+                    mul_node = consumer
+                    start_name = transp_node.input[0]
+                    middle_name = transp_node.output[0]
+                    end_name = mul_node.output[0]
+                    transp_in_shape = model.get_tensor_shape(start_name)
+                    transp_out_shape = model.get_tensor_shape(middle_name)
+                    transp_in_layout = model.get_tensor_layout(start_name)
+                    transp_out_layout = model.get_tensor_layout(middle_name)
+                    if transp_in_layout is None or transp_out_layout is None:
+                        warnings.warn(
+                            """Datalayout is not set for tensors.
+                            Transformation can't be applied."""
+                        )
+                        continue
+                    if all(x == 1 for x in A.shape):
+                        # if the mul is scalar, we can simply swap the order of ops
+                        # rewire transpose input to be mul input
+                        mul_node.input[0] = start_name
+                        model.set_tensor_shape(start_name, transp_in_shape)
+                        model.set_tensor_layout(start_name, transp_in_layout)
+                        mul_node.output[0] = middle_name
+                        model.set_tensor_shape(middle_name, transp_in_shape)
+                        model.set_tensor_layout(middle_name, transp_in_layout)
+                        transp_node.input[0] = middle_name
+                        transp_node.output[0] = end_name
+                        model.set_tensor_shape(end_name, transp_out_shape)
+                        model.set_tensor_layout(end_name, transp_out_layout)
+                        graph.node.remove(transp_node)
+                        graph.node.insert(node_ind, transp_node)
+                        graph_modified = True
+
+        if graph_modified is True:
+            model = model.transform(InferDataLayouts())
+            model = model.transform(InferShapes())
+        return (model, graph_modified)
+
diff --git a/src/finn/util/onnx.py b/src/finn/util/onnx.py
index b9932111d86d7206b23e1d0e49a6aa8451f8ba24..4d7cdd126ededac887639a932c2021ef5f081c02 100644
--- a/src/finn/util/onnx.py
+++ b/src/finn/util/onnx.py
@@ -28,6 +28,7 @@
 
 import numpy as np
 import onnx
+import finn.core.data_layout as DataLayout
 
 
 def valueinfo_to_tensor(vi):
@@ -37,3 +38,38 @@ def valueinfo_to_tensor(vi):
     return np.zeros(
         dims, dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[vi.type.tensor_type.elem_type]
     )
+
+
+def nchw_to_nhwc(t, model, idx, reverse=False):
+    """Converts between NCHW <-> NHWC layouts for tensor t by inserting a transpose. 
+    If reverse=False, t is assumed NCHW and we insert transpose to convert NCHW -> NHWC
+    If reverse=True, t is assumed NHWC and we insert transpose to convert NHWC -> NCHW.
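+    Returns the name of the new transposed tensor.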
+    """
+    graph = model.graph
+    # create new NHWC tensor
+    t_shape = model.get_tensor_shape(t)
+    bs = t_shape[0]
+    ch = t_shape[1]
+    height = t_shape[2]
+    width = t_shape[3]
+    t_trans = onnx.helper.make_tensor_value_info(
+        model.make_new_valueinfo_name(),
+        onnx.TensorProto.FLOAT,
+        (bs, height, width, ch),  # NHWC
+    )
+    graph.value_info.append(t_trans)
+    dt = model.get_tensor_datatype(t)
+    t_trans = t_trans.name
+    model.set_tensor_datatype(t_trans, dt)
+    model.set_tensor_layout(t_trans, DataLayout.NHWC)
+    # NCHW <-> NHWC transpose
+    if reverse:
+        t_trans_node = onnx.helper.make_node(
+            "Transpose", [t_trans], [t], perm=[0, 3, 1, 2]
+        )
+    else:
+        t_trans_node = onnx.helper.make_node(
+            "Transpose", [t], [t_trans], perm=[0, 2, 3, 1]
+        )
+    graph.node.insert(idx, t_trans_node)
+    return t_trans
diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py
index 24854a2153df9af78feb8352ca119e831a9ac9eb..e78812b21a03baa6963f1f0efaefdb4c73e4d0db 100644
--- a/tests/brevitas/test_brevitas_avg_pool_export.py
+++ b/tests/brevitas/test_brevitas_avg_pool_export.py
@@ -16,7 +16,7 @@ import finn.core.onnx_exec as oxe
 
 import pytest
 
-export_onnx_path = "test_avg_pool.onnx"
+export_onnx_path = "test_brevitas_avg_pool_export.onnx"
 
 
 @pytest.mark.parametrize("kernel_size", [2, 3])
diff --git a/tests/brevitas/test_brevitas_cnv.py b/tests/brevitas/test_brevitas_cnv.py
index c04e16ad1923609c81240235057cc7a190c90ffb..f91ca600d3f0ce3b1cda3c29216fe8e0e3f415e4 100644
--- a/tests/brevitas/test_brevitas_cnv.py
+++ b/tests/brevitas/test_brevitas_cnv.py
@@ -42,7 +42,7 @@ from finn.transformation.general import GiveUniqueNodeNames
 from finn.transformation.double_to_single_float import DoubleToSingleFloat
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_cnv.onnx"
+export_onnx_path = "test_brevitas_cnv.onnx"
 
 
 @pytest.mark.parametrize("abits", [1, 2])
diff --git a/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py b/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py
index b66348a9902802bc65b2a35e8bc3e311cc81e0bc..9c7296b7b3b6d36cfb43b6d9e96e7fba6bbce49a 100644
--- a/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py
+++ b/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py
@@ -12,7 +12,7 @@ import finn.core.onnx_exec as oxe
 from finn.transformation.infer_shapes import InferShapes
 from brevitas.core.quant import QuantType
 
-export_onnx_path = "test_act.onnx"
+export_onnx_path = "test_brevitas_non_scaled_QuantHardTanh_export.onnx"
 
 
 @pytest.mark.parametrize("abits", [1, 2, 4, 8])
diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py
index c5ddad12ca3e8d353682fbb20449d44358485f69..77974dacb51aa8746ce33f9a490becd35390db5a 100644
--- a/tests/brevitas/test_brevitas_relu_act_export.py
+++ b/tests/brevitas/test_brevitas_relu_act_export.py
@@ -12,7 +12,7 @@ from finn.core.modelwrapper import ModelWrapper
 import finn.core.onnx_exec as oxe
 from finn.transformation.infer_shapes import InferShapes
 
-export_onnx_path = "test_act.onnx"
+export_onnx_path = "test_brevitas_relu_act_export.onnx"
 
 
 @pytest.mark.parametrize("abits", [1, 2, 4, 8])
diff --git a/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py b/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py
index d499f1517341477eca9915245da9ad12c346c5a9..e0ec82ebed44e2e984be9f62e02bc1721a7f9c33 100644
--- a/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py
+++ b/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py
@@ -12,7 +12,7 @@ from finn.core.modelwrapper import ModelWrapper
 import finn.core.onnx_exec as oxe
 from finn.transformation.infer_shapes import InferShapes
 
-export_onnx_path = "test_act.onnx"
+export_onnx_path = "test_brevitas_scaled_QHardTanh_export.onnx"
 
 
 @pytest.mark.parametrize("abits", [2, 4, 8])
diff --git a/tests/core/test_modelwrapper.py b/tests/core/test_modelwrapper.py
index 5fa9b23bad5c5b67f65530c55f862f889c07b1ac..0fb7ae42f3bd556755f81a02be6c71fd73ffc519 100644
--- a/tests/core/test_modelwrapper.py
+++ b/tests/core/test_modelwrapper.py
@@ -36,7 +36,7 @@ import finn.core.data_layout as DataLayout
 from finn.core.modelwrapper import ModelWrapper
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_modelwrapper.onnx"
 
 
 def test_modelwrapper():
diff --git a/tests/custom_op/test_xnorpopcountmatmul.py b/tests/custom_op/test_xnorpopcountmatmul.py
index 37d9b7e5968bdb70023be9b70515410e941f51ce..745b782d418129d96e21c327a49de04d53aa7c48 100644
--- a/tests/custom_op/test_xnorpopcountmatmul.py
+++ b/tests/custom_op/test_xnorpopcountmatmul.py
@@ -47,7 +47,7 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline.sign_to_thres import ConvertSignToThres
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_xnorpopcountmatmul.onnx"
 
 
 def test_xnorpopcountmatmul():
diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09c64a1250f78604c1a0a362cf234712de2cf57
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
@@ -0,0 +1,115 @@
+import pytest
+
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.infer_shapes import InferShapes
+import numpy as np
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+def make_single_channelwise_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape):
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape)
+    p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape)
+
+    model = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[inp],
+            outputs=[outp],
+            value_info=[p0],
+            nodes=[helper.make_node(onnx_op_name, ["inp", "p0"], ["outp"])],
+        )
+    )
+
+    model = ModelWrapper(model)
+    model.set_initializer("p0", gen_finn_dt_tensor(pdt, pshape))
+    model.set_tensor_datatype("inp", idt)
+    model = model.transform(InferDataLayouts(), make_deepcopy=False)
+    model = model.transform(InferShapes(), make_deepcopy=False)
+    return model
+
+
+# parameter datatype
+@pytest.mark.parametrize("pdt", [DataType.BIPOLAR, DataType.UINT4, DataType.INT2])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT32, DataType.UINT4, DataType.INT4])
+# function
+@pytest.mark.parametrize("onnx_op_name", ["Add", "Mul"])
+# vector parameter or scalar parameter (broadcast)
+@pytest.mark.parametrize("scalar_param", [True, False])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_convert_to_hls_channelwise_layer(
+    pdt, idt, onnx_op_name, scalar_param, exec_mode
+):
+    ifm_ch = 16
+    ifm_dim = 5
+    ishape = (1, ifm_ch, ifm_dim, ifm_dim)
+    if scalar_param:
+        pshape = (1,)
+    else:
+        pshape = (1, ifm_ch, 1, 1)
+
+    np.random.seed(0)
+    model = make_single_channelwise_modelwrapper(
+        onnx_op_name, ishape, idt, pdt, pshape
+    )
+
+    # Since there are no datatypes with a bit width that is not a power of 2,
+    # there are cases where the input won't use its full range.
+    if idt == DataType.INT32:
+        x = gen_finn_dt_tensor(DataType.INT16, (1, ifm_ch, ifm_dim, ifm_dim))
+    elif idt == DataType.UINT32:
+        x = gen_finn_dt_tensor(DataType.UINT16, (1, ifm_ch, ifm_dim, ifm_dim))
+    else:
+        x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim))
+
+    input_dict = prepare_inputs(x)
+    y_expected = oxe.execute_onnx(model, input_dict)["outp"]
+
+    new_model = model.transform(to_hls.InferChannelwiseLinearLayer())
+    new_model = new_model.transform(GiveUniqueNodeNames())
+
+    if exec_mode == "cppsim":
+        new_model = new_model.transform(PrepareCppSim())
+        new_model = new_model.transform(CompileCppSim())
+        new_model = new_model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        new_model = new_model.transform(SetExecMode("rtlsim"))
+        new_model = new_model.transform(GiveUniqueNodeNames())
+        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
+        new_model = new_model.transform(HLSSynthIP())
+        new_model = new_model.transform(ReplaceVerilogRelPaths())
+        new_model = new_model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    ctx_produced = oxe.execute_onnx(
+        new_model, input_dict, return_full_exec_context=True
+    )
+    y_produced = ctx_produced["outp"]
+
+    assert (y_produced == y_expected).all()
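+    # the Add/Mul node should have been converted to a ChannelwiseOp_Batch HLS layer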
+    assert new_model.graph.node[1].op_type == "ChannelwiseOp_Batch"
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index ee65326ec57fb7fa7fa0490a8980dbabb8efc13c..22c356a5869b25fcc7ae3ef0164ed61b53ef232c 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -5,10 +5,15 @@ import pytest
 from finn.core.datatype import DataType
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
-from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
-from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames
 from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.util.basic import gen_finn_dt_tensor
@@ -17,47 +22,40 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.custom_op.im2col import compute_conv_output_dim
 
+# conv_config: (kernel_size, stride, pad)
 
-@pytest.mark.parametrize("padding", [True, False])
-@pytest.mark.parametrize("kernel_size", [3, 5])
+
+@pytest.mark.parametrize(
+    "conv_config", [(1, 2, 0), (1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)]
+)
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_conv_layer(padding, kernel_size):
-
-    assert (
-        kernel_size % 2 != 0
-    ), """test_convert_to_hls_conv_layer test only
-    supports odd kernel_size"""
-
+def test_convert_to_hls_conv_layer(conv_config, exec_mode):
+    kernel_size, stride, pad = conv_config
     np.random.seed(0)
-    padding = True
     idt = DataType.UINT4
 
     in_feature_dim = 7
-    in_chn = 3
+    in_chn = 16
+    out_chn = 20
 
-    stages = 1  # just one convolution
-
-    out_feature_dim = (
-        in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages
-    )
+    out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad)
 
     input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
-    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+    output_shape = [1, out_chn, out_feature_dim, out_feature_dim]
 
-    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+    conv_param_shape = [out_chn, in_chn, kernel_size, kernel_size]
+    conv_weight_dt = DataType.UINT4
 
     conv_config = {}
     conv_config["dilations"] = [1, 1]
     conv_config["group"] = 1
     conv_config["kernel_shape"] = [kernel_size, kernel_size]
-    if padding:
-        pad = kernel_size // 2
-        conv_config["pads"] = [pad, pad, pad, pad]
-    else:
-        conv_config["pads"] = [0, 0, 0, 0]
-    conv_config["strides"] = [1, 1]
+    conv_config["pads"] = [pad, pad, pad, pad]
+    conv_config["strides"] = [stride, stride]
 
     top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
     top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
@@ -80,27 +78,35 @@ def test_convert_to_hls_conv_layer(padding, kernel_size):
     model = ModelWrapper(modelproto)
     model.set_tensor_datatype("top_in", idt)
     model.set_tensor_datatype("top_out", idt)
-    model.set_tensor_datatype("p1", DataType.UINT4)
+    model.set_tensor_datatype("p1", conv_weight_dt)
+    model.set_initializer("p1", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape))
 
     model = model.transform(InferShapes())
-    model.set_initializer(
-        "p1", np.round(np.random.rand(*conv_param_shape).astype(np.float32) * 16)
-    )
-
-    model.set_tensor_datatype(model.graph.input[0].name, idt)
-    model = model.transform(InferShapes())
-    model = model.transform(InferDataLayouts())
-    model = model.transform(GiveUniqueNodeNames())
-    model = model.transform(GiveReadableTensorNames())
     model = model.transform(InferDataTypes())
 
     new_model = model.transform(LowerConvsToMatMul())
     new_model = new_model.transform(to_hls.InferConvInpGen())
 
-    new_model = new_model.transform(PrepareCppSim())
-    new_model = new_model.transform(CompileCppSim())
-    new_model = new_model.transform(SetExecMode("cppsim"))
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    new_model = new_model.transform(InferShapes())
+    new_model = new_model.transform(InferDataTypes())
+
+    if exec_mode == "cppsim":
+        new_model = new_model.transform(PrepareCppSim())
+        new_model = new_model.transform(CompileCppSim())
+        new_model = new_model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        new_model = new_model.transform(SetExecMode("rtlsim"))
+        new_model = new_model.transform(GiveUniqueNodeNames())
+        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
+        new_model = new_model.transform(HLSSynthIP())
+        new_model = new_model.transform(ReplaceVerilogRelPaths())
+        new_model = new_model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
 
     x = gen_finn_dt_tensor(idt, input_shape)
     inp_dict = {model.graph.input[0].name: x}
     assert oxe.compare_execution(model, new_model, inp_dict)
+    if kernel_size == 1 and stride > 1 and pad == 0:
+        assert new_model.graph.node[1].op_type == "DownSampler"
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
index 48803c9614f53a3a149c6eaac4289d10086513a5..20e3ee08d7ffdd013a89d26bb71d86ccc554a5b4 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py
@@ -51,7 +51,7 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.custom_op.registry import getCustomOp
 
-export_onnx_path_cnv = "test_output_cnv.onnx"
+export_onnx_path_cnv = "test_convert_to_hls_layers_cnv.onnx"
 
 
 @pytest.mark.vivado
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
index e261a3114853bf24bdb4c931c46ff92eea4150dd..d77065ad9396d0cc8dd57a39ed823fffcb30ee47 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py
@@ -52,8 +52,7 @@ from finn.transformation.streamline.round_thresholds import RoundAndClipThreshol
 from finn.util.test import get_test_model_trained
 
 
-export_onnx_path = "test_output_tfc.onnx"
-export_onnx_path_cnv = "test_output_cnv.onnx"
+export_onnx_path = "test_convert_to_hls_layers_fc.onnx"
 
 
 @pytest.mark.vivado
diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ed352e28981552b186bb778b94dcbc07471e14b
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+
+
+def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
+    NumChannels = C.shape[0]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, vecs + [NumChannels])
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, vecs + [NumChannels]
+    )
+
+    node_inp_list = ["inp", "const"]
+
+    node = helper.make_node(
+        "ChannelwiseOp_Batch",
+        node_inp_list,
+        ["outp"],
+        domain="finn",
+        backend="fpgadataflow",
+        NumChannels=NumChannels,
+        Func=func,
+        PE=pe,
+        inputDataType=idt.name,
+        outputDataType=odt.name,
+        paramDataType=pdt.name,
+        numInputVectors=vecs,
+    )
+    graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp])
+
+    model = helper.make_model(graph, producer_name="model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    model.set_tensor_datatype("const", idt)
+    model.set_initializer("const", C)
+    return model
+
+
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType.INT8])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT4])
+# param datatype
+@pytest.mark.parametrize("pdt", [DataType.INT4])
+# folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1, 2])
+# number of input features
+@pytest.mark.parametrize("ich", [16])
+# vecs
+@pytest.mark.parametrize("vecs", [[1], [1, 7, 7]])
+# function
+@pytest.mark.parametrize("func", ["add", "mul"])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_mode):
+    if nf == -1:
+        nf = ich
+    pe = ich // nf
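+    # pe channels are processed in parallel; nf is the resulting folding factor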
+    assert ich % pe == 0
+
+    # generate input and param data
+    x = gen_finn_dt_tensor(idt, tuple(vecs + [ich]))
+    C = gen_finn_dt_tensor(pdt, (ich,))
+
+    odt = act
+
+    model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs)
+
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(ReplaceVerilogRelPaths())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # package input data as dictionary
+    input_dict = {"inp": x}
+
+    oshape = model.get_tensor_shape("outp")
+
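+    # compute the golden reference output in numpy by broadcasting the
+    # per-channel parameter C over the input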
+    C_reshaped = np.broadcast_to(C.flatten(), x.shape)
+    if func == "add":
+        y = x + C_reshaped
+    elif func == "mul":
+        y = x * C_reshaped
+
+    y_expected = y.reshape(oshape)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+
+    y_produced = y_produced.reshape(y_expected.shape)
+
+    assert (y_produced == y_expected).all(), "cppsim failed"
+
+    if exec_mode == "rtlsim":
+        hls_synt_res_est = model.analysis(hls_synth_res_estimation)
+        assert "ChannelwiseOp_Batch_0" in hls_synt_res_est
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index b830693c32afe629dd6fc70868d0bddacac4c887..a9f5bf5ffa1f816b82ef701800e92249056b7c74 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -54,6 +54,10 @@ from finn.util.basic import gen_finn_dt_tensor, pynq_part_map
 from finn.util.fpgadataflow import pyverilate_stitched_ip
 from finn.util.test import load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
+from finn.transformation.fpgadataflow.floorplan import Floorplan
+
 
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
@@ -390,3 +394,19 @@ def test_fpgadataflow_ipstitch_remote_execution():
         assert np.isclose(outp["outp"], x).all()
     except KeyError:
         pytest.skip("PYNQ board IP address not specified")
+
+
+def test_fpgadataflow_ipstitch_iodma_floorplan():
+    model = create_one_fc_model()
+    if model.graph.node[0].op_type == "StreamingDataflowPartition":
+        sdp_node = getCustomOp(model.graph.node[0])
+        assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
+        assert os.path.isfile(sdp_node.get_nodeattr("model"))
+        model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
+    model = model.transform(InferDataLayouts())
+    model = model.transform(InsertIODMA())
+    model = model.transform(Floorplan())
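+    # Floorplan should assign the IODMA nodes and the dataflow layer
+    # to separate partitions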
+    assert getCustomOp(model.graph.node[0]).get_nodeattr("partition_id") == 0
+    assert getCustomOp(model.graph.node[1]).get_nodeattr("partition_id") == 2
+    assert getCustomOp(model.graph.node[2]).get_nodeattr("partition_id") == 1
+    model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_iodma_floorplan.onnx")
diff --git a/tests/transformation/test_absorb_transp_into_flatten.py b/tests/transformation/test_absorb_transp_into_flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbfa15277717c554da01e38608601997407803b2
--- /dev/null
+++ b/tests/transformation/test_absorb_transp_into_flatten.py
@@ -0,0 +1,99 @@
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+import finn.core.data_layout as DataLayout
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.absorb import AbsorbTransposeIntoFlatten
+import finn.core.onnx_exec as oxe
+
+# permutation of transpose node
+@pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]])
+# reshape or flatten
+@pytest.mark.parametrize("shape", [None, [1, -1], [-1, 1]])
+# input shape
+@pytest.mark.parametrize("ishape", [[1, 1, 1, 4], [2, 4, 1, 1], [1, 2, 2, 4]])
+# datalayout
+@pytest.mark.parametrize("data_layout", ["NCHW", "NHWC"])
+def test_absorb_transp_into_flatten(perm, shape, ishape, data_layout):
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    transp_node = helper.make_node("Transpose", ["inp"], ["transp_out"], perm=perm)
+    dummy_in = np.random.uniform(low=0, high=1, size=tuple(ishape)).astype(np.float32)
+    if shape is None:
+        shape_node = helper.make_node("Flatten", ["transp_out"], ["outp"])
+        dummy_in = dummy_in.transpose(tuple(perm))
+        oshape = dummy_in.reshape(dummy_in.shape[0], -1).shape
+        outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+        shape0 = None
+    else:
+        shape0 = helper.make_tensor_value_info("shape0", TensorProto.INT64, shape)
+        shape_node = helper.make_node("Reshape", ["transp_out", "shape0"], ["outp"])
+        oshape = dummy_in.transpose(tuple(perm)).reshape(tuple(shape)).shape
+        outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+
+    graph = helper.make_graph(
+        nodes=[transp_node, shape_node],
+        name="absorb-transpose-graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="absorb_transpose_model")
+    model = ModelWrapper(model)
+    if shape is not None:
+        model.graph.value_info.append(shape0)
+        model.set_initializer("shape0", np.asarray(shape))
+    if data_layout == "NCHW":
+        model.set_tensor_layout("inp", DataLayout.NCHW)
+    else:
+        model.set_tensor_layout("inp", DataLayout.NHWC)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model.save("test.onnx")
+    model_transformed = model.transform(AbsorbTransposeIntoFlatten())
+    model_transformed.save("test2.onnx")
+
+    # verify transformation
+    inp_values = np.random.uniform(low=-1, high=1, size=tuple(ishape)).astype(
+        np.float32
+    )
+    idict = {model.graph.input[0].name: inp_values}
+    assert oxe.compare_execution(model, model_transformed, idict)
+
+    # only some of the parameter combinations lead to a graph that will be changed when
+    # AbsorbTransposeIntoFlatten is applied
+
+    if shape == [-1, 1]:  # not a flatten operation, so the graph will not be changed
+        assert model.graph == model_transformed.graph
+
+    elif perm == [
+        3,
+        2,
+        0,
+        1,
+    ]:  # the first dimension is also part of the transpose operation
+        # so the graph will not be changed
+        assert model.graph == model_transformed.graph
+
+    # the following cases are the ones in which the model is transformed;
+    # since the parameters shape and perm were already checked above, only ishape
+    # and data_layout need to be considered (the transformed model should only
+    # contain a "Flatten" node)
+    elif ishape == [1, 1, 1, 4] and data_layout == "NHWC":
+        assert model_transformed.graph.node[0].op_type == "Flatten"
+
+    elif ishape == [2, 4, 1, 1] and data_layout == "NCHW" and shape is None:
+        # If the first dimension of the input tensor is not 1, flatten and
+        # reshape (with shape = [1, -1]) would lead to different results
+        assert model_transformed.graph.node[0].op_type == "Flatten"
+
+    # all other cases lead to an unchanged model
+    else:
+        assert model.graph == model_transformed.graph
diff --git a/tests/transformation/test_conv_lowering.py b/tests/transformation/test_conv_lowering.py
index 73891ded1b9691c7c48a2075ad6ca4668fcf6bfe..16c574b29b55e314b06661b28e4bb869bd6b7996 100644
--- a/tests/transformation/test_conv_lowering.py
+++ b/tests/transformation/test_conv_lowering.py
@@ -41,7 +41,7 @@ from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
 from finn.transformation.double_to_single_float import DoubleToSingleFloat
 import finn.core.onnx_exec as oxe
 
-export_onnx_path = "test_output_cnv.onnx"
+export_onnx_path = "test_conv_lowering.onnx"
 
 
 def test_conv_lowering_cnv_w1a1():
diff --git a/tests/transformation/test_fold_constants.py b/tests/transformation/test_fold_constants.py
index 685c14a98b9031096aaf5b244c4f484d4f308bca..a976ffd62bce744a474a6fac2a61a6478526777f 100644
--- a/tests/transformation/test_fold_constants.py
+++ b/tests/transformation/test_fold_constants.py
@@ -40,7 +40,7 @@ from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.infer_shapes import InferShapes
 from finn.util.test import get_test_model_untrained
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_fold_constants.onnx"
 
 
 def test_const_folding():
diff --git a/tests/transformation/test_infer_data_layouts.py b/tests/transformation/test_infer_data_layouts.py
index fccc7813da6f98c8af4ade7ae562c99b32247a8b..d6d9920043114c78e970842aee5955e3150cf526 100644
--- a/tests/transformation/test_infer_data_layouts.py
+++ b/tests/transformation/test_infer_data_layouts.py
@@ -44,7 +44,7 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.transformation.infer_data_layouts import InferDataLayouts
 import finn.core.data_layout as DataLayout
 
-export_onnx_path_cnv = "test_output_cnv.onnx"
+export_onnx_path_cnv = "test_infer_data_layouts.onnx"
 
 
 def test_infer_data_layouts():
diff --git a/tests/transformation/test_infer_datatypes.py b/tests/transformation/test_infer_datatypes.py
index e3db40289c4318894cf5ad41c2f67b3bff501db9..097ae03f6153843fbb7956a72b38431559d5d0f1 100644
--- a/tests/transformation/test_infer_datatypes.py
+++ b/tests/transformation/test_infer_datatypes.py
@@ -38,7 +38,7 @@ from finn.transformation.infer_datatypes import InferDataTypes
 from finn.transformation.infer_shapes import InferShapes
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_infer_datatypes.onnx"
 
 
 def test_infer_datatypes():
diff --git a/tests/transformation/test_linear_past_eltwise.py b/tests/transformation/test_linear_past_eltwise.py
index b77f59779a5e8559f80e017d13b66bcb67249830..4cff5e5e1d40986a006cc02186fce21a907c2ef1 100644
--- a/tests/transformation/test_linear_past_eltwise.py
+++ b/tests/transformation/test_linear_past_eltwise.py
@@ -41,7 +41,7 @@ from finn.transformation.double_to_single_float import DoubleToSingleFloat
 
 import pytest
 
-export_onnx_path = "test_scalar_past_eltwise.onnx"
+export_onnx_path = "test_linear_past_eltwise.onnx"
 
 # construct a synthetic graph to test:
 # topk insertion, topk conversion to hls, add conversion to hls
diff --git a/tests/transformation/test_move_chw_add_past_conv.py b/tests/transformation/test_move_chw_add_past_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..b626f7e5b8564739ec383aaddfc262d642bf47cc
--- /dev/null
+++ b/tests/transformation/test_move_chw_add_past_conv.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+from onnx import helper, TensorProto
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline.reorder import MoveAddPastConv
+from finn.custom_op.im2col import compute_conv_output_dim
+import finn.core.onnx_exec as oxe
+
+
+# input dimension
+@pytest.mark.parametrize("idim", [4, 7])
+# kernel size
+@pytest.mark.parametrize("k", [2, 3])
+# stride
+@pytest.mark.parametrize("s", [1, 2])
+# input channels
+@pytest.mark.parametrize("ich", [2, 4])
+# output channels
+@pytest.mark.parametrize("och", [2, 3])
+def test_move_chw_add_past_conv(idim, k, s, ich, och):
+    odim = compute_conv_output_dim(idim, k, s)
+
+    ishape = [1, ich, idim, idim]
+    oshape = [1, och, odim, odim]
+    add_param_shape = [1, ich, 1, 1]
+    conv_param_shape = [och, ich, k, k]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+    a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, add_param_shape)
+    a1 = helper.make_tensor_value_info("a1", TensorProto.FLOAT, conv_param_shape)
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [k, k]
+    conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [s, s]
+
+    add_node = helper.make_node("Add", ["inp", "a0"], ["add_out"])
+    conv_node = helper.make_node("Conv", ["add_out", "a1"], ["outp"], **conv_config)
+
+    model = helper.make_model(
+        helper.make_graph(
+            nodes=[add_node, conv_node],
+            name="move-add-graph",
+            inputs=[inp],
+            outputs=[outp],
+            value_info=[a0, a1],
+        )
+    )
+
+    model = ModelWrapper(model)
+    # initialize model
+    a0_values = np.random.uniform(low=0, high=1, size=tuple(add_param_shape)).astype(
+        np.float32
+    )
+    model.set_initializer("a0", a0_values)
+    a1_values = np.random.uniform(low=0, high=1, size=tuple(conv_param_shape)).astype(
+        np.float32
+    )
+    model.set_initializer("a1", a1_values)
+
+    model = model.transform(InferShapes())
+
+    # execution before transformation
+    inp_values = np.random.uniform(low=0, high=1, size=tuple(ishape)).astype(np.float32)
+    idict = {model.graph.input[0].name: inp_values}
+    odict = oxe.execute_onnx(model, idict)
+    y_before = odict[model.graph.output[0].name]
+
+    model = model.transform(MoveAddPastConv())
+    odict = oxe.execute_onnx(model, idict)
+    y_after = odict[model.graph.output[0].name]
+
+    assert np.isclose(y_before, y_after).all()
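+    # after the transformation the Conv node comes first, followed by the Add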
+    assert model.graph.node[0].op_type == "Conv"
+    assert model.graph.node[1].op_type == "Add"
diff --git a/tests/transformation/test_move_flatten_past_affine.py b/tests/transformation/test_move_flatten_past_affine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2d5e51613d41f3f2db3dabcef7b982ec2816b19
--- /dev/null
+++ b/tests/transformation/test_move_flatten_past_affine.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.reorder import MoveFlattenPastAffine
+import finn.core.onnx_exec as oxe
+
+# data layout
+@pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW])
+# batch size
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_move_flatten_past_affine(data_layout, batch_size):
+    if data_layout == DataLayout.NHWC:
+        ishape = [batch_size, 1, 1, 1024]
+        oshape = [batch_size, 1000]
+    else:
+        ishape = [batch_size, 1024, 1, 1]
+        oshape = [batch_size, 1000]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    a0 = helper.make_tensor_value_info("a1", TensorProto.FLOAT, [1024, 1000])
+    a1 = helper.make_tensor_value_info("a2", TensorProto.FLOAT, [])
+    a2 = helper.make_tensor_value_info("a3", TensorProto.FLOAT, [1000])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+
+    flatten_node = helper.make_node("Flatten", ["inp"], ["flatten_out"])
+    matmul_node = helper.make_node("MatMul", ["flatten_out", "a0"], ["matmul_out"])
+    mul_node = helper.make_node("Mul", ["matmul_out", "a1"], ["mul_out"])
+    add_node = helper.make_node("Add", ["mul_out", "a2"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[flatten_node, matmul_node, mul_node, add_node],
+        name="move-reshape-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[a0, a1, a2],
+    )
+
+    model = helper.make_model(graph, producer_name="move_reshape_model")
+    model = ModelWrapper(model)
+
+    # initialize values
+    a0_values = gen_finn_dt_tensor(DataType.TERNARY, [1024, 1000])
+    model.set_initializer("a0", a0_values)
+    a1_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32)
+    model.set_initializer("a1", a1_values)
+    a2_values = np.random.uniform(low=-1, high=1, size=(1000)).astype(np.float32)
+    model.set_initializer("a2", a2_values)
+
+    model.set_tensor_datatype("inp", DataType.INT2)
+    model.set_tensor_layout("inp", data_layout)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    # compare execution before and after transformation
+    inp_values = gen_finn_dt_tensor(DataType.INT2, ishape)
+    idict = {model.graph.input[0].name: inp_values}
+    model_transformed = model.transform(MoveFlattenPastAffine())
+    assert oxe.compare_execution(model, model_transformed, idict)
+
+    # depending on data layout check if graph is transformed or not
+    if data_layout == DataLayout.NHWC:
+        # check if nodes have new order in transformed graph
+        assert model.graph != model_transformed.graph
+        assert model_transformed.graph.node[-1].op_type == "Flatten"
+    else:
+        assert model.graph == model_transformed.graph
diff --git a/tests/transformation/test_move_flatten_past_topk.py b/tests/transformation/test_move_flatten_past_topk.py
new file mode 100644
index 0000000000000000000000000000000000000000..65da92c22dbe9f6b1c5a49172ffae59fa6e98607
--- /dev/null
+++ b/tests/transformation/test_move_flatten_past_topk.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.insert_topk import InsertTopK
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.reorder import MoveFlattenPastTopK
+import finn.core.onnx_exec as oxe
+
+# data layout
+@pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW])
+# batch size
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_move_flatten_past_topk(data_layout, batch_size):
+    if data_layout == DataLayout.NHWC:
+        ishape = [batch_size, 1, 1, 1024]
+        oshape = [batch_size, 1024]
+    else:
+        ishape = [batch_size, 1024, 1, 1]
+        oshape = [batch_size, 1024]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+
+    flatten_node = helper.make_node("Flatten", ["inp"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[flatten_node], name="move-flatten-graph", inputs=[inp], outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="move_flatten_model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", DataType.INT2)
+    model.set_tensor_layout("inp", data_layout)
+    model = model.transform(InsertTopK())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    # compare execution before and after transformation
+    inp_values = gen_finn_dt_tensor(DataType.INT2, ishape)
+    idict = {model.graph.input[0].name: inp_values}
+    model_transformed = model.transform(MoveFlattenPastTopK())
+    assert oxe.compare_execution(model, model_transformed, idict)
+
+    # depending on data layout check if graph is transformed or not
+    if data_layout == DataLayout.NHWC:
+        # check if nodes have new order in transformed graph
+        assert model.graph != model_transformed.graph
+        assert model_transformed.graph.node[-1].op_type == "Flatten"
+    else:
+        assert model.graph == model_transformed.graph
diff --git a/tests/transformation/test_move_scalar_past_conv.py b/tests/transformation/test_move_scalar_past_conv.py
index 0f50642d2b9d1583030630cb4927c2b86667e71a..94fee7907d1ed1cccbf95520e903c7d9b43d8f7d 100644
--- a/tests/transformation/test_move_scalar_past_conv.py
+++ b/tests/transformation/test_move_scalar_past_conv.py
@@ -7,14 +7,14 @@ import finn.core.onnx_exec as ox
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import (
-    MoveScalarAddPastConv,
+    MoveAddPastConv,
     MoveScalarMulPastConv,
 )
 
 
 @pytest.mark.parametrize("padding", [False, True])
 @pytest.mark.parametrize(
-    "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())],
+    "test_args", [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())],
 )
 def test_move_scalar_past_conv(test_args, padding):
     scalar_op = test_args[0]
@@ -83,8 +83,8 @@ def test_move_scalar_past_conv(test_args, padding):
             assert new_model.graph.node[2].op_type == "Conv"
         else:
             assert new_model.graph.node[0].op_type == "Conv"
-            assert new_model.graph.node[1].op_type == scalar_op
-            assert new_model.graph.node[2].op_type == "Conv"
+            assert new_model.graph.node[1].op_type == "Conv"
+            assert new_model.graph.node[2].op_type == scalar_op
     else:
         assert new_model.graph.node[0].op_type == "Conv"
         assert new_model.graph.node[1].op_type == "Conv"
@@ -92,7 +92,7 @@ def test_move_scalar_past_conv(test_args, padding):
 
 
 @pytest.mark.parametrize(
-    "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())],
+    "test_args", [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())],
 )
 def test_move_scalar_past_conv_only_if_linear(test_args):
     scalar_op = test_args[0]
diff --git a/tests/transformation/test_move_transpose_past_scalar_mul.py b/tests/transformation/test_move_transpose_past_scalar_mul.py
new file mode 100644
index 0000000000000000000000000000000000000000..e434fc7d4f683120176e18a2bfa9da99d9ee0b0e
--- /dev/null
+++ b/tests/transformation/test_move_transpose_past_scalar_mul.py
@@ -0,0 +1,82 @@
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+import finn.core.data_layout as DataLayout
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.reorder import MoveTransposePastScalarMul
+import finn.core.onnx_exec as oxe
+
+# permutation of transpose node
+@pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]])
+# whether the mul operand is a scalar (True) or a full tensor (False)
+@pytest.mark.parametrize("scalar", [True, False])
+# data layout
+@pytest.mark.parametrize("data_layout", [None, DataLayout.NHWC, DataLayout.NCHW])
+def test_move_transpose_past_scalar_mul(perm, scalar, data_layout):
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 2, 3, 4])
+    # to determine out_size, apply "perm" to the input shape for this test case
+    dummy_in = np.random.uniform(low=0, high=1, size=(1, 2, 3, 4)).astype(np.float32)
+    out_size = dummy_in.transpose(tuple(perm)).shape
+
+    if scalar is True:
+        a0_size = []  # empty shape -> scalar (0-d) mul operand
+    else:
+        a0_size = out_size  # operand has the same shape as the transpose output
+    a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, a0_size)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, out_size)
+    transp_node = helper.make_node("Transpose", ["inp"], ["transp_out"], perm=perm)
+    mul_node = helper.make_node("Mul", ["transp_out", "a0"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[transp_node, mul_node],
+        name="mv-transpose-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[a0],
+    )
+
+    model = helper.make_model(graph, producer_name="mv_transpose_model")
+    model = ModelWrapper(model)
+
+    # initialize values
+    a0_values = np.random.uniform(low=0, high=1, size=tuple(a0_size)).astype(np.float32)
+    model.set_initializer("a0", a0_values)
+    if data_layout is not None:
+        model.set_tensor_layout("inp", data_layout)
+        model = model.transform(InferDataLayouts())
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    # compare execution before and after transformation
+    inp_values = np.random.uniform(low=0, high=1, size=(1, 2, 3, 4)).astype(np.float32)
+    idict = {model.graph.input[0].name: inp_values}
+    model_transformed = model.transform(MoveTransposePastScalarMul())
+    assert oxe.compare_execution(model, model_transformed, idict)
+
+    # check whether the node order changed
+    if scalar is True and data_layout is not None:
+        assert model_transformed.graph.node[0] != model.graph.node[0]
+        assert model_transformed.graph.node[1] != model.graph.node[1]
+        assert model_transformed.graph.node[0].op_type == "Mul"
+        assert model_transformed.graph.node[1].op_type == "Transpose"
+        mul_input = model_transformed.graph.node[0].input[0]
+        mul_output = model_transformed.graph.node[0].output[0]
+        assert model_transformed.get_tensor_layout(mul_input) == data_layout
+        assert model_transformed.get_tensor_layout(mul_output) == data_layout
+    else:
+        assert model_transformed.graph.node[0] == model.graph.node[0]
+        assert model_transformed.graph.node[1] == model.graph.node[1]
+        if data_layout is not None:
+            mul_input = model_transformed.graph.node[1].input[0]
+            mul_output = model_transformed.graph.node[1].output[0]
+            assert model_transformed.get_tensor_layout(mul_input) != data_layout
+            assert model_transformed.get_tensor_layout(mul_output) != data_layout
diff --git a/tests/transformation/test_sign_to_thres.py b/tests/transformation/test_sign_to_thres.py
index b10840df37a695986e54c0bdaa68baa0538f90f2..a92f839e5f6ca8b45eadf939fa35973ac153e0b1 100644
--- a/tests/transformation/test_sign_to_thres.py
+++ b/tests/transformation/test_sign_to_thres.py
@@ -40,8 +40,7 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.streamline import ConvertSignToThres
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = "test_output_lfc.onnx"
-transformed_onnx_path = "test_output_lfc_transformed.onnx"
+export_onnx_path = "test_sign_to_thres.onnx"
 
 
 def test_sign_to_thres():
diff --git a/tests/transformation/test_topk_insert.py b/tests/transformation/test_topk_insert.py
index 1af0f255d8fb1af8a6e571518f18d831aa71298b..a18e63384150f140cb63ec7b438283eb4797266c 100644
--- a/tests/transformation/test_topk_insert.py
+++ b/tests/transformation/test_topk_insert.py
@@ -18,7 +18,7 @@ from pkgutil import get_data
 
 import pytest
 
-export_onnx_path = "test_output_lfc.onnx"
+export_onnx_path = "test_topk_insert.onnx"
 
 
 @pytest.mark.parametrize("k", [1, 5, 10])