diff --git a/.gitignore b/.gitignore
index f838c1695130d232ac6a2b888aed0cea31aafaa7..8b3166a44070a4575aac86c445c4504b594cda08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -78,3 +78,6 @@ MANIFEST
 
 # Jenkins cfg dir
 /docker/jenkins_home
+
+# SSH key dir mounted into Docker
+/ssh_keys/
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index 22e3eb623c7a5da19a5e3ae2284557577898ad23..0e12b504a26ccdb8fd78e162f04cfdeab5a186f1 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -42,7 +42,7 @@ WORKDIR /workspace
 RUN apt-get update
 RUN apt-get -y upgrade
 RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev
-RUN apt-get install -y verilator nano zsh
+RUN apt-get install -y verilator nano zsh rsync
 RUN apt-get -y install sshpass
 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index e34b6ce9cc4488c806da8bcf3cc5cc8e500ae806..132d5bdaa286ba3e50bbd06971e9139f5859ef11 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -13,9 +13,9 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-BREVITAS_COMMIT=989cdfdba4700fdd900ba0b25a820591d561c21a
+BREVITAS_COMMIT=f9a27226d4acf1661dd38bc449f71f89e0983cce
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716
+HLSLIB_COMMIT=8aed899c278c36c977a249558d71795086cf852c
 PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
diff --git a/docs/finn/example_networks.rst b/docs/finn/example_networks.rst
index 9f221871f09bf655db9d81988d6fa83e53473634..86bb2bd11fd805a23a3bdf6da8a8ed686259ecc1 100644
--- a/docs/finn/example_networks.rst
+++ b/docs/finn/example_networks.rst
@@ -20,17 +20,17 @@ version, this is indicated by an x mark in the table.
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
 | Export/Import         | x          | x        | x        | x        | x        |    x     |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| Streamlining          | x          | x        | x        | x        | x        |          |          |
+| Streamlining          | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| Convert to HLS layers | x          | x        | x        | x        | x        |          |          |
+| Convert to HLS layers | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| Stitched IP           | x          | x        | x        | x        | x        |          |          |
+| Stitched IP           | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| Hardware test         | x          | x        | x        |          | x        |          |          |
+| Hardware test         | x          | x        | x        |          | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| cppsim                | x          | x        | x        | x        | x        |          |          |
+| cppsim                | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| rtlsim node-by-node   | x          | x        | x        | x        | x        |          |          |
+| rtlsim node-by-node   | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
-| rtlsim stitched IP    | x          | x        | x        | x        | x        |          |          |
+| rtlsim stitched IP    | x          | x        | x        | x        | x        |          |     x    |
 +-----------------------+------------+----------+----------+----------+----------+----------+----------+
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 95594bb67a2be3a4c3fbba488c75a704f623c136..f4fa7a13dcbe4fe8ab9667a111df00c605747710 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -18,6 +18,7 @@ Requirements
 * A working Vivado 2019.1 installation
 * A `VIVADO_PATH` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located)
 * (optional) A PYNQ board with a network connection
+   * the ``bitstring`` package must be installed on the PYNQ: ``sudo pip3 install bitstring``
 
 Running FINN in Docker
 ======================
@@ -30,6 +31,7 @@ Getting an interactive shell for development or experimentation
  sh run-docker.sh
 
 Simply running sh run-docker.sh without any additional arguments will clone the dependency repos, create a Docker container and give you a terminal which you can use for development and experimentation.
+If you want a new terminal on an already-running container, you can open one with `docker exec -it finn_dev_<username> bash`.
 
 .. warning:: The Docker container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the /workspace/finn folder (which is mounted from the host computer) or otherwise backed up.
 
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index 7a4bc687eeb827320991f7d3f1ef8cc35e97f3da..dee62f09a9253380e05300dac8fa34915c20dab5 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -16,6 +16,10 @@ Custom Quantization Annotations
 
 ONNX does not support datatypes smaller than 8-bit integers, whereas in FINN we are interested in smaller integers down to ternary and bipolar. To make this work, FINN uses the quantization_annotation field in ONNX to annotate tensors with their FINN DataType (:py:mod:`finn.core.datatype.DataType`) information. However, all tensors are expected to use single-precision floating point (float32) storage in FINN. This means we store even a 1-bit value as floating point for the purposes of representation. The FINN compiler flow is responsible for eventually producing a packed representation for the target hardware, where the 1-bit is actually stored as 1-bit.
 
+Note that FINN uses floating point tensors as a carrier data type to represent integers. Floating point arithmetic can introduce rounding errors, e.g. (int_num * float_scale) / float_scale is not always equal to int_num.
+When using the custom ONNX execution flow, FINN will attempt to sanitize any rounding errors for integer tensors. See (:py:mod:`finn.util.basic.sanitize_quant_values`) for more information.
+This behavior can be disabled (not recommended!) by setting the environment variable SANITIZE_QUANT_TENSORS=0.
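+
+As a minimal illustration of the underlying issue (plain Python, not FINN-specific)::
+
+  int_num = 3
+  float_scale = 0.1
+  # False: rounding error makes the result 3.0000000000000004
+  (int_num * float_scale) / float_scale == int_num
+
+Sanitization rounds such near-integer values back to the exact integers implied by the quantization annotation.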
+
 Custom Operations/Nodes
 =======================
 
diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst
index 391c6f999312839daca0d4161336c7c0ae822f89..c52c0840aa40566d930164490b1fd249d7c07757 100644
--- a/docs/finn/verification.rst
+++ b/docs/finn/verification.rst
@@ -28,4 +28,15 @@ This simulation can be used for a model containing several HLS custom operations
 Emulation using PyVerilator
 ===========================
 
-The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole design. For that purpose PyVerilator gets the generated verilog files.
+Emulation using PyVerilator can be used once IP blocks have been generated, either node by node or for the whole (IP-stitched) design. For this purpose, PyVerilator is given the generated Verilog files.
+
+For debugging purposes, it's possible to generate .vcd trace files that show the values of external and internal signals while the emulation is running. To enable this:
+ - for node-by-node rtlsim, set the `rtlsim_trace` attribute of each node of interest to either a filename for the .vcd or `default` to use the node name as the filename.
+ - for IP-stitched rtlsim, set the `rtlsim_trace` metadata_prop for the graph in the same way.
+
+To control the tracing depth in the module hierarchy, use the `RTLSIM_TRACE_DEPTH` environment variable (default is 1):
+ - level 1 shows top-level input/output streams
+ - level 2 shows per-layer input/output streams
+ - level 3 shows full per-layer I/O, including FIFO count signals
+
+Note that deeper tracing will take longer to execute and may produce very large .vcd files.
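+
+For instance, a hypothetical sketch (assuming ``model`` is a FINN ModelWrapper whose first node is an HLS node)::
+
+  from finn.custom_op.registry import getCustomOp
+
+  # node-by-node rtlsim: trace a single node of interest
+  getCustomOp(model.graph.node[0]).set_nodeattr("rtlsim_trace", "node0.vcd")
+  # IP-stitched rtlsim: trace the whole design
+  model.set_metadata_prop("rtlsim_trace", "whole_design.vcd")
+
+and e.g. ``export RTLSIM_TRACE_DEPTH=2`` in the shell before running to see per-layer streams.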
diff --git a/run-docker.sh b/run-docker.sh
index e07556716db335421f57a390f1e6a17168ac058b..00ca8f86985a78d8f2af099c51dcd4b80cd2e974 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -65,6 +65,11 @@ DOCKER_INST_NAME="finn_dev_${DOCKER_UNAME}"
 # ensure Docker tag and inst. name are all lowercase
 DOCKER_TAG=$(echo "$DOCKER_TAG" | tr '[:upper:]' '[:lower:]')
 DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]')
+# Absolute path to this script, e.g. /home/user/bin/foo.sh
+SCRIPT=$(readlink -f "$0")
+# Absolute path this script is in, thus /home/user/bin
+SCRIPTPATH=$(dirname "$SCRIPT")
+
 # the settings below will be taken from environment variables if available,
 # otherwise the defaults below will be used
 : ${JUPYTER_PORT=8888}
@@ -74,11 +79,7 @@ DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]')
 : ${PYNQ_BOARD="Pynq-Z1"}
 : ${PYNQ_TARGET_DIR="/home/xilinx/$DOCKER_INST_NAME"}
 : ${NUM_DEFAULT_WORKERS=1}
-
-# Absolute path to this script, e.g. /home/user/bin/foo.sh
-SCRIPT=$(readlink -f "$0")
-# Absolute path this script is in, thus /home/user/bin
-SCRIPTPATH=$(dirname "$SCRIPT")
+: ${FINN_SSH_KEY_DIR="$SCRIPTPATH/ssh_keys"}
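+# the SSH key dir can be overridden per-run, e.g. (hypothetical path):
+#   FINN_SSH_KEY_DIR=/path/to/keys sh run-docker.sh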
 
 BUILD_LOCAL=/tmp/$DOCKER_INST_NAME
 VIVADO_HLS_LOCAL=$VIVADO_PATH
@@ -87,6 +88,7 @@ VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache
 # ensure build dir exists locally
 mkdir -p $BUILD_LOCAL
 mkdir -p $VIVADO_IP_CACHE
+mkdir -p $FINN_SSH_KEY_DIR
 
 gecho "Instance is named as $DOCKER_INST_NAME"
 gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
@@ -133,6 +135,7 @@ docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \
 -v $SCRIPTPATH:/workspace/finn \
 -v $BUILD_LOCAL:$BUILD_LOCAL \
 -v $VIVADO_PATH:$VIVADO_PATH \
+-v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh \
 -e VIVADO_PATH=$VIVADO_PATH \
 -e FINN_INST_NAME=$DOCKER_INST_NAME \
 -e FINN_ROOT="/workspace/finn" \
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index c2f68a35076418e0cf2edb578bdb8d548772fc78..7c3123cd5eb29a54dc5cbfb912225ad3fdb0f219 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -39,6 +39,7 @@ from finn.core.remote_exec import remote_exec
 from finn.core.rtlsim_exec import rtlsim_exec
 from finn.custom_op.registry import getCustomOp
 import finn.analysis.topology as ta
+from finn.util.basic import sanitize_quant_values, get_sanitize_quant_tensors
 
 
 def execute_node(node, context, graph):
@@ -102,15 +103,14 @@ def execute_node(node, context, graph):
                     raise Exception(
                         """Output shapes disagree after node execution:
                         found %s vs expected %s"""
-                        % (
-                            str(output_list[list_ind].shape),
-                            str(context[outp].shape),
-                        )
+                        % (str(output_list[list_ind].shape), str(context[outp].shape))
                     )
                 context[outp] = output_list[list_ind]
 
 
-def execute_onnx(model, input_dict, return_full_exec_context=False):
+def execute_onnx(
+    model, input_dict, return_full_exec_context=False, start_node=None, end_node=None
+):
     """Executes given ONNX ModelWrapper with given named inputs.
 
     If return_full_exec_context is False, a dict of named outputs is returned
@@ -118,7 +118,12 @@ def execute_onnx(model, input_dict, return_full_exec_context=False):
 
     If return return_full_exec_context is True, the full set of tensors used by
     the execution (including inputs, weights, activations and final outputs)
-    will be returned as a dict."""
+    will be returned as a dict.
+
+    When start_node and end_node are set to None, the whole graph is executed.
+    If they are set to particular ONNX nodes, only the subgraph between (and
+    including) those nodes is executed.
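+
+    For example (hypothetical sketch, assuming the graph has at least two
+    nodes), execute_onnx(model, input_dict, start_node=model.graph.node[0],
+    end_node=model.graph.node[1]) executes only the first two nodes.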
+    """
 
     if not model.check_all_tensor_shapes_specified():
         raise Exception("Found unspecified tensor shapes, try infer_shapes")
@@ -161,8 +166,28 @@ def execute_onnx(model, input_dict, return_full_exec_context=False):
         # execute the model node by node
         # we can simply walk down the list since the ONNX spec guarantees that it is
         # topologically sorted
-        for node in graph.node:
+        subgraph = []
+        if start_node is None:
+            start_node = model.graph.node[0]
+        if end_node is None:
+            end_node = model.graph.node[-1]
+        # select the nodes between specified start/end nodes
+        start_ind = model.get_node_index(start_node)
+        end_ind = model.get_node_index(end_node) + 1
+        assert end_ind >= start_ind, "Start/end nodes must define valid subgraph"
+        subgraph = graph.node[start_ind:end_ind]
+        for node in subgraph:
+            if get_sanitize_quant_tensors() != 0:
+                # round input values to match quantization annotation
+                execution_context = sanitize_quant_values(
+                    model, node.input, execution_context
+                )
             execute_node(node, execution_context, graph)
+            if get_sanitize_quant_tensors() != 0:
+                # round output values to quantization annotation
+                execution_context = sanitize_quant_values(
+                    model, node.output, execution_context
+                )
     elif model_exec_mode == "remote_pynq":
         # use remote exec metadata built into model to execute on a remote PYNQ
         remote_exec(model, execution_context)
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index c4800011fe2c944fa877b12d0247795beda4a5e6..71c731f96ca45519c443a5f932ead050770e17de 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -173,9 +173,15 @@ class HLSCustomOp(CustomOp):
         of the node as a dictionary."""
         ret = dict()
         ret["BRAM_18K"] = self.bram_estimation()
+        ret["BRAM_efficiency"] = self.bram_efficiency_estimation()
         ret["LUT"] = self.lut_estimation()
         return ret
 
+    def bram_efficiency_estimation(self):
+        """Function for BRAM efficiency estimation: actual parameter storage
+        needed divided by the allocated BRAM storage (from estimation)"""
+        return 1
+
     def bram_estimation(self):
         """Function for BRAM resource estimation, is member function of
         HLSCustomOp class but has to be filled by every node"""
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
similarity index 88%
rename from src/finn/custom_op/fpgadataflow/fmpadding.py
rename to src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index fa321dfa65d14b67fa218fb6a49f602ddab8d57e..d326ae7dfc7830a0081c3b13233d67ef08b12eff 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -21,6 +21,8 @@ class FMPadding_Batch(HLSCustomOp):
             "Padding": ("i", True, 2),
             # number of channels in input image
             "NumChannels": ("i", True, 0),
+            # SIMD Input parallelism
+            "SIMD": ("i", False, 1),
             # FINN input datatype
             "inputDataType": ("s", True, ""),
             # controls distribution of padded pixels
@@ -55,20 +57,22 @@ class FMPadding_Batch(HLSCustomOp):
         return oshape
 
     def get_folded_input_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
-        ret = list(self.get_normal_input_shape())
-        ret.insert(-1, 1)
-        return tuple(ret)
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
 
     def get_folded_output_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
-        ret = list(self.get_normal_output_shape())
-        ret.insert(-1, 1)
-        return tuple(ret)
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
@@ -114,15 +118,13 @@ class FMPadding_Batch(HLSCustomOp):
 
     def get_instream_width(self):
         ibits = self.get_input_datatype().bitwidth()
-        num_ch = self.get_nodeattr("NumChannels")
-
-        return ibits * num_ch
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
 
     def get_outstream_width(self):
         obits = self.get_output_datatype().bitwidth()
-        num_ch = self.get_nodeattr("NumChannels")
-
-        return obits * num_ch
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
@@ -135,13 +137,15 @@ class FMPadding_Batch(HLSCustomOp):
         self.code_gen_dict["$DEFINES$"] = [
             """#define ImgDim1 {}\n#define OutputDim1 {}\n
             #define Padding1 {}\n#define NumChannels1 {}\n
-            #define PaddingStyle1 {}\n#define numReps {}\n""".format(
+            #define PaddingStyle1 {}\n#define numReps {}
+            #define SIMD1 {}\n""".format(
                 self.get_nodeattr("ImgDim"),
                 self.get_padded_odim(),
                 self.get_nodeattr("Padding"),
                 self.get_nodeattr("NumChannels"),
                 self.get_nodeattr("PaddingStyle"),
                 self.get_nodeattr("numInputVectors"),
+                self.get_nodeattr("SIMD"),
             )
         ]
 
@@ -176,7 +180,7 @@ class FMPadding_Batch(HLSCustomOp):
         in_t = self.get_input_datatype().get_hls_datatype_str()
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,
+            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
             {}, PaddingStyle1> (in0, out, numReps);""".format(
                 node.op_type, in_t
             )
@@ -232,6 +236,7 @@ class FMPadding_Batch(HLSCustomOp):
         node = self.onnx_node
         exp_ishape = self.get_normal_input_shape()
         exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
 
         if mode == "cppsim":
@@ -254,10 +259,8 @@ class FMPadding_Batch(HLSCustomOp):
         match expected shape (1, ImgDim, ImgDim, NumChannels)."""
         export_idt = self.get_input_datatype()
 
-        # no reshaping for input since assuming no folding on input
-        # make copy before saving array
-        inp = inp.copy()
-        np.save(os.path.join(code_gen_dir, "input_0.npy"), inp)
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
 
         if mode == "cppsim":
             # execute the precompiled model
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7edc24d0e24eef1154293caca2519ab3aa68358
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -0,0 +1,395 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import numpy as np
+
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.core.datatype import DataType
+from onnx import TensorProto, helper
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class Pool_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib Pool_batch function.
+    Requires ConvolutionInputGenerator(depthwise == 1) to format its input.
+
+    TODO: explain input shape (to reuse im2col code)
+    Input shape (BatchSize,OutImgDim,OutImgDim,KernelSize^2*Channels)
+    Output shape (BatchSize,OutImgDim,OutImgDim,Channels)
+
+    Note: the actual data layout produced by the hlslib kernels is different
+    for depthwise ops:
+    * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE)
+
+    Channels can be folded using PE (SIMD from the input perspective).
+    TODO: doc
+    """
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "Channels": ("i", True, 0),
+            "PE": ("i", True, 1),
+            "KernelSize": ("i", True, 0),
+            # Function:
+            #  - MaxPool
+            #  - AvgPool (not yet supported here, but available in hlslib)
+            #  - AccPool (not yet supported here, but available in hlslib)
+            "Function": ("s", True, ""),
+            "OutImgDim": ("i", True, 0),
+            # FINN DataTypes for inputs/outputs
+            "dataType": ("s", True, ""),
+            "BatchSize": ("i", False, 1),
+        }
+
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("dataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        fxn = self.get_nodeattr("Function")
+        if fxn == "MaxPool":
+            # Same as input
+            return DataType[self.get_nodeattr("dataType")]
+        else:
+            raise Exception("Pool_Batch doesn't currently support " + fxn)
+
+    def get_normal_input_shape(self):
+        ifm_ch = self.get_nodeattr("Channels")
+        odim = self.get_nodeattr("OutImgDim")
+        batch_size = self.get_nodeattr("BatchSize")
+        k = self.get_nodeattr("KernelSize")
+        ishape = (batch_size, odim, odim, k * k * ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        assert ifm_ch % pe == 0, "PE must divide input channels"
+        fold = int(normal_ishape[-1] / pe)
+        folded_ishape = normal_ishape[:-1] + [fold, pe]
+        return tuple(folded_ishape)
+
+    def get_normal_output_shape(self):
+        ofm_ch = self.get_nodeattr("Channels")
+        odim = self.get_nodeattr("OutImgDim")
+        batch_size = self.get_nodeattr("BatchSize")
+        oshape = (batch_size, odim, odim, ofm_ch)
+        return oshape
+
+    def get_folded_output_shape(self):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        assert ifm_ch % pe == 0, "PE must divide input channels"
+        fold = int(ifm_ch / pe)
+        folded_oshape = normal_oshape[:-1] + [fold, pe]
+        return tuple(folded_oshape)
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[1:-1])
+
+    def get_instream_width(self):
+        dt_bits = self.get_input_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        in_width = int(dt_bits * pe)
+        return in_width
+
+    def get_outstream_width(self):
+        fxn = self.get_nodeattr("Function")
+        if fxn == "MaxPool":
+            return self.get_instream_width()
+        else:
+            raise Exception("Pool_Batch doesn't currently support " + fxn)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        info_messages = []
+
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify the number of inputs
+        if len(self.onnx_node.input) == 1:
+            info_messages.append("The number of inputs is correct")
+        else:
+            info_messages.append("""Pool_Batch needs 1 data input""")
+
+        # check supported function
+        fxn = self.get_nodeattr("Function")
+        if fxn == "MaxPool":
+            info_messages.append(
+                "Attribute Function contains a supported pool function"
+            )
+        else:
+            info_messages.append(
+                "Attribute Function contains an unsupported pool function"
+            )
+        return info_messages
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "pool.hpp"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+        ifm_ch = self.get_nodeattr("Channels")
+        self.code_gen_dict["$DEFINES$"] += ["#define Channels {}".format(ifm_ch)]
+
+        pe = self.get_nodeattr("PE")
+        self.code_gen_dict["$DEFINES$"] += ["#define PE {}".format(pe)]
+
+        k = self.get_nodeattr("KernelSize")
+        self.code_gen_dict["$DEFINES$"] += ["#define KernelSize {}".format(k)]
+
+        odim = self.get_nodeattr("OutImgDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)]
+
+        numReps = self.get_nodeattr("BatchSize")
+        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(numReps)]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        idt = self.get_input_datatype()
+        i_hls_dt = idt.get_hls_datatype_str()
+        odt = self.get_output_datatype()
+        o_hls_dt = odt.get_hls_datatype_str()
+
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+
+        fxn = self.get_nodeattr("Function")
+        if fxn == "MaxPool":
+            self.code_gen_dict["$DOCOMPUTE$"] += [
+                "MaxPoolFunction<{},KernelSize> pool_fxn;".format(i_hls_dt)
+            ]
+        else:
+            raise Exception("Pool_Batch doesn't currently support " + fxn)
+
+        self.code_gen_dict["$DOCOMPUTE$"] += [
+            """Pool_batch<Channels, PE, KernelSize,Slice<{} >, Slice< {} > >
+        (in0,out, pool_fxn, OFMDim*OFMDim*numReps);""".format(
+                i_hls_dt, o_hls_dt
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        packed_ibits = self.get_instream_width()
+        packed_in_hls_type = "ap_uint<%d>" % packed_ibits
+
+        packed_obits = self.get_outstream_width()
+        packed_out_hls_type = "ap_uint<%d>" % packed_obits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_in_hls_type, packed_out_hls_type)
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        folded_ishape = self.get_folded_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (batch_size,odim,odim,k*k*ifm_ch)."""
+
+        export_idt = self.get_input_datatype()
+        reshaped_input = inp.reshape(folded_ishape)
+
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output
+        shape doesn't match expected shape (batch_size, odim, odim, channels)."""
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 9b73ba1e100aa83fd19aa8799195c99891fca3fd..a7ebff68749120868cae9ce5ac18d2856fe2cb8a 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -240,11 +240,21 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         Q = self.get_nodeattr("SIMD")
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
-        D_in = self.get_instream_width()
-        D_out = self.get_outstream_width()
+        D_in = self.get_nodeattr("MW")
+        D_out = self.get_nodeattr("MH")
         omega = (D_in * D_out) / (Q * P)
         return P * (math.ceil(omega / 512)) * (math.ceil((Q * W) / 36))
 
+    def bram_efficiency_estimation(self):
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        D_in = self.get_nodeattr("MW")
+        D_out = self.get_nodeattr("MH")
+        bram16_est = self.bram_estimation()
+        # number of weight bits that actually need to be stored
+        wbits = W * D_in * D_out
+        # total capacity of the estimated BRAMs: 512 entries of 36 bits each
+        bram16_est_capacity = bram16_est * 36 * 512
+        return wbits / bram16_est_capacity
+
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -290,12 +300,15 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         return out_width
 
     def get_weightstream_width(self):
-        """Returns weight stream width. Used in decoupled mode."""
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        wp = self.get_weight_datatype().bitwidth()
-        w_width = pe * simd * wp
-        return w_width
+        """Returns weight stream width. Used only in decoupled mode."""
+        if self.get_nodeattr("mem_mode") == "decoupled":
+            pe = self.get_nodeattr("PE")
+            simd = self.get_nodeattr("SIMD")
+            wp = self.get_weight_datatype().bitwidth()
+            w_width = pe * simd * wp
+            return w_width
+        else:
+            return 0
 
     def get_weightstream_width_padded(self):
         """Returns weight stream width padded to a multiple of 8. This is required
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 1a8216f64bf71b7fb9f1f8becf4732970b5bf451..1da60a5124fa86b4336bae8fd1a587672f2f2e6f 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -99,6 +99,7 @@ set_top $config_toplevelfxn
 open_solution sol1
 set_part $config_proj_part
 
+# tolerate long HLS compile times and large unrolled loops (expected for large layers)
+config_compile -ignore_long_run_time -disable_unroll_code_size_check
 config_interface -m_axi_addr64
 config_rtl -auto_prefix
 $EXTRA_DIRECTIVES$
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 25ea05e3607a52731ae1b64de421837bf137ee2b..17ba44b959577faf573d77ae222f7b2a3be6669d 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -30,20 +30,30 @@ from finn.custom_op.fpgadataflow import HLSCustomOp
 
 
 class TLastMarker(HLSCustomOp):
-    """Class that corresponds to the TLastMarker node that needs to be
-    inserted at the end of the model for rtlsim with stitched IP.
-    It marks the end of the current image/input sample."""
+    """Node that adds/removes AXI stream TLAST signals where needed. Its behavior
+    is transparent in node-by-node execution, only visible in IP-stitched rtlsim or
+    actual hardware.
+    This node  may be needed at the end of the network to signal a DMA write (needed by the
+    FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
 
     def get_nodeattr_types(self):
         my_attrs = {
+            # number of (static) iterations until TLAST=1 is generated for Direction=out
             "NumIters": ("i", True, 0),
+            # whether a static or dynamic (from AXI lite) number of iterations is used
+            "DynIters": ("i", False, 1),
+            # direction: whether to insert or remove TLAST
+            "Direction": ("s", False, "out"),
             # width of input-output data streams, in bits
             "StreamWidth": ("i", True, 0),
             # width of individual element in stream, in bits
             "ElemWidth": ("i", True, 0),
+            # Protocol: external or internal
+            # Vitis docs recommend using qdma_axis for external, ap_axiu for internal
+            "Protocol": ("s", False, "external"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -76,12 +86,33 @@ class TLastMarker(HLSCustomOp):
 
     def defines(self, var):
         stream_width = self.get_nodeattr("StreamWidth")
+        direction = self.get_nodeattr("Direction")
+        protocol = self.get_nodeattr("Protocol")
         # output stream must have TLAST, so we use this stream data type:
         # qdma_axis<stream_data_width,0,0,0 >
-        out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width
+        if direction == "out":
+            if protocol == "external":
+                out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width
+            elif protocol == "internal":
+                out_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width
+            else:
+                raise Exception("Unrecognized Protocol in TLastMarker")
+            in_stream_dtype = "ap_uint<%d>" % stream_width
+        elif direction == "in":
+            out_stream_dtype = "ap_uint<%d>" % stream_width
+            if protocol == "external":
+                in_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width
+            elif protocol == "internal":
+                in_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width
+            else:
+                raise Exception("Unrecognized Protocol in TLastMarker")
+        else:
+            raise Exception("Unrecognized Direction in TLastMarker")
+
         self.code_gen_dict["$DEFINES$"] = [
             "#define StreamWidth %d" % stream_width,
             "#define OutDType %s" % out_stream_dtype,
+            "#define InDType %s" % in_stream_dtype,
             "#define NumItersPerImg %d" % self.get_nodeattr("NumIters"),
         ]
 
@@ -89,27 +120,60 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$READNPYDATA$"] = []
 
     def docompute(self):
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            "unsigned int n = 1;",
-            "OutDType t;",
-            "t.set_keep(-1);",
-            "io_section: { // start of cycle accurate region",
-            "#pragma HLS protocol fixed",
-            "// do a first read from stream before we decide on numIters",
-            "// giving software a chance to set up the numIters prior to startup",
-            "t.set_data(in0.read());",
-            "n = (numIters == 0 ? NumItersPerImg : numIters);",
-            "t.set_last(n==1);",
-            "out.write(t);",
-            "} // end of cycle accurate region",
-            "// do one less iteration than spec since we already did one",
-            "for(unsigned int i=1; i<n; i++) {",
-            "#pragma HLS PIPELINE II=1",
-            "t.set_data(in0.read());",
-            "t.set_last(i==(n-1));",
-            "out.write(t);",
-            "}",
-        ]
+        dyn_iters = self.get_nodeattr("DynIters")
+        direction = self.get_nodeattr("Direction")
+        use_qdma_axis = self.get_nodeattr("Protocol") == "external"
+        if direction == "in":
+            # read from input and just pass data along; ignore tlast
+            # no dyn iters on input, as it doesn't make sense there
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "for(unsigned int i=0; i<NumItersPerImg; i++) {",
+                "#pragma HLS PIPELINE II=1",
+                "out.write(in0.read().get_data());"
+                if use_qdma_axis
+                else "out.write(in0.read().data);",
+                "}",
+            ]
+
+        elif dyn_iters == 1:
+            # output, with dynamic iteration counts
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "unsigned int n = 1;",
+                "OutDType t;",
+                "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;",
+                "io_section: { // start of cycle accurate region",
+                "#pragma HLS protocol fixed",
+                "// do a first read from stream before we decide on numIters",
+                "// giving software a chance to set up the numIters prior to startup",
+                "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();",
+                "n = (numIters == 0 ? NumItersPerImg : numIters);",
+                "t.set_last(n==1);" if use_qdma_axis else "t.last = (n==1);",
+                "out.write(t);",
+                "} // end of cycle accurate region",
+                "// do one less iteration than spec since we already did one",
+                "for(unsigned int i=1; i<n; i++) {",
+                "#pragma HLS PIPELINE II=1",
+                "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();",
+                "t.set_last(i==(n-1));" if use_qdma_axis else "t.last = (i==(n-1));",
+                "out.write(t);",
+                "}",
+            ]
+
+        else:
+            # output, with static iteration counts
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "unsigned int n = 1;",
+                "OutDType t;",
+                "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;",
+                "for(unsigned int i=0; i<NumItersPerImg; i++) {",
+                "#pragma HLS PIPELINE II=1",
+                "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();",
+                "t.set_last(i==(NumItersPerImg-1));"
+                if use_qdma_axis
+                else "t.last = (i==(NumItersPerImg-1));",
+                "out.write(t);",
+                "}",
+            ]
 
     def dataoutstrm(self):
         self.code_gen_dict["$DATAOUTSTREAM$"] = []
@@ -118,18 +182,30 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void %s(hls::stream<ap_uint<StreamWidth> > &in0,
-                hls::stream<OutDType> &out, unsigned int numIters)"""
-            % self.onnx_node.name
-        ]
+        dyn_iters = self.get_nodeattr("DynIters")
+
+        if dyn_iters == 1:
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void %s(hls::stream<InDType> &in0,
+                    hls::stream<OutDType> &out, unsigned int numIters)"""
+                % self.onnx_node.name
+            ]
+        else:
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void %s(hls::stream<InDType> &in0, hls::stream<OutDType> &out)"""
+                % self.onnx_node.name
+            ]
 
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE s_axilite port=numIters bundle=control"
-        )
+
+        dyn_iters = self.get_nodeattr("DynIters")
+        if dyn_iters == 1:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE s_axilite port=numIters bundle=control"
+            )
+
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
@@ -158,7 +234,7 @@ class TLastMarker(HLSCustomOp):
     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+            'hls::stream<InDType> in0 ("in0");'
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<OutDType> out ("out");'
diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py
index 82a6b140f7af1be4e5c0f429d077b99c7865383e..8ed0041704d421dab587f08bcbcd9e739e8434e9 100644
--- a/src/finn/custom_op/im2col.py
+++ b/src/finn/custom_op/im2col.py
@@ -80,6 +80,8 @@ class Im2Col(CustomOp):
             "input_shape": ("s", True, ""),
             "pad_amount": ("i", False, 0),
             "pad_value": ("i", False, 0),
+            # depthwise: if != 0, infer ConvolutionInputGenerator with depthwise == 1
+            "depthwise": ("i", False, 0),
         }
 
     def make_shape_compatible_op(self, model):
diff --git a/src/finn/custom_op/quantavgpool2d.py b/src/finn/custom_op/quantavgpool2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb5c78bc0c8419ba519c5c3113d9b0c7ae2dd3b7
--- /dev/null
+++ b/src/finn/custom_op/quantavgpool2d.py
@@ -0,0 +1,128 @@
+import numpy as np
+from onnx import TensorProto, helper
+import onnxruntime as rt
+
+from finn.custom_op import CustomOp
+from finn.core.datatype import DataType
+from finn.custom_op.maxpoolnhwc import compute_pool_output_dim
+
+
+class QuantAvgPool2d(CustomOp):
+    """Class that corresponds to the quantized average pooling
+    layer from brevitas"""
+
+    def get_nodeattr_types(self):
+        return {
+            "stride": ("i", True, 1),
+            "kernel": ("i", True, 1),
+            "ibits": ("i", True, 1),
+            "obits": ("i", True, 1),
+            # determines if values are signed (set to "1") or unsigned ("0")
+            "signed": ("i", True, 0),
+            # data layout attribute can be set to "NCHW" or "NHWC"
+            "data_layout": ("s", False, "NCHW"),
+        }
+
+    def make_shape_compatible_op(self, model):
+        node = self.onnx_node
+        k = self.get_nodeattr("kernel")
+        s = self.get_nodeattr("stride")
+        data_layout = self.get_nodeattr("data_layout")
+        if data_layout == "NCHW":
+            return helper.make_node(
+                "AveragePool",
+                inputs=[node.input[0]],
+                outputs=[node.output[0]],
+                kernel_shape=[k, k],
+                strides=[s, s],
+            )
+        elif data_layout == "NHWC":
+            iname = node.input[0]
+            ishape = model.get_tensor_shape(iname)
+            (n, hi, wi, c) = ishape
+            ho = compute_pool_output_dim(hi, k, s)
+            wo = compute_pool_output_dim(wi, k, s)
+            oshape = (n, ho, wo, c)
+            # implement tensor with correct shape
+            values = np.random.randn(*oshape).astype(np.float32)
+            return helper.make_node(
+                "Constant",
+                inputs=[],
+                outputs=[node.output[0]],
+                value=helper.make_tensor(
+                    name="const_tensor",
+                    data_type=TensorProto.FLOAT,
+                    dims=values.shape,
+                    vals=values.flatten().astype(float),
+                ),
+            )
+
+        else:
+            raise Exception(
+                """Datalayout for QuantAvgPool2d is set to an invalid value.
+                    Has to be set to "NCHW" or "NHWC"."""
+            )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        bw = self.get_nodeattr("obits")
+        if bw in [2, 4, 8, 16, 32]:
+            if self.get_nodeattr("signed") == 0:
+                dtype = DataType["UINT%d" % bw]
+            else:
+                dtype = DataType["INT%d" % bw]
+        else:
+            raise Exception("Unsupported output datatype for QuantAvgPool2d")
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def execute_node(self, context, graph):
+        # create a standard average pooling node to help calculate the result
+        node = self.onnx_node
+        k = self.get_nodeattr("kernel")
+        s = self.get_nodeattr("stride")
+        inp_values = context[node.input[0]]
+        oshape = context[node.output[0]].shape
+        if self.get_nodeattr("data_layout") == "NHWC":
+            inp_values = inp_values.transpose(0, 3, 1, 2)
+            oshape = (context[node.output[0]]).transpose(0, 3, 1, 2).shape
+        ishape = inp_values.shape
+        inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape)
+        outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape)
+        node_avgpool = helper.make_node(
+            "AveragePool",
+            inputs=[node.input[0]],
+            outputs=[node.output[0]],
+            kernel_shape=[k, k],
+            strides=[s, s],
+        )
+        graph_avgpool = helper.make_graph(
+            nodes=[node_avgpool],
+            name="single-avgpool-exec",
+            inputs=[inp],
+            outputs=[outp],
+        )
+        model_avgpool = helper.make_model(graph_avgpool)
+        idict = {node.input[0]: inp_values}
+        sess = rt.InferenceSession(model_avgpool.SerializeToString())
+        result_temp = sess.run(None, idict)
+        # undo the division introduced by AveragePool to recover the window sum
+        result_temp = result_temp[0] * (k * k)
+        ibits = self.get_nodeattr("ibits")
+        # worst-case sum: the maximum ibits-bit value accumulated over a k*k window
+        max_value = 2 ** ibits - 1
+        max_value = max_value * k * k
+        max_bit_width = int(max_value).bit_length()
+        # right-shift to requantize the sum down to obits bits
+        shift_bits = max_bit_width - self.get_nodeattr("obits")
+        result = np.right_shift(result_temp.astype(int), shift_bits)
+        if self.get_nodeattr("data_layout") == "NHWC":
+            result = result.transpose(0, 2, 3, 1)
+        context[node.output[0]] = result.astype(np.float32)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+        return info_messages
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 614a3d7ffd70d0b102bad2b76177a2d3b32765c7..0060e5d400f30055d532671c8cf1680f0668442a 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -44,10 +44,12 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
-from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch
+from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
+from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
+from finn.custom_op.quantavgpool2d import QuantAvgPool2d
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
 
 # create a mapping of all known CustomOp names and classes
@@ -65,10 +67,12 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
+custom_op["Pool_Batch"] = Pool_Batch
 custom_op["FMPadding_Batch"] = FMPadding_Batch
 custom_op["Thresholding_Batch"] = Thresholding_Batch
 custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
+custom_op["QuantAvgPool2d"] = QuantAvgPool2d
 custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
 
 
diff --git a/src/finn/transformation/change_datalayout.py b/src/finn/transformation/change_datalayout.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5b393a25e57122b059a44f70904a6dbe5bbaa3f
--- /dev/null
+++ b/src/finn/transformation/change_datalayout.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from onnx import helper, TensorProto
+
+from finn.transformation import Transformation
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import get_by_name
+
+
+class ChangeDataLayoutQuantAvgPool2d(Transformation):
+    """Replace QuantAvgPool2d with datalayout (N,C,H,W) with Transpose nodes
+    and QuantAvgPool2dNHWC with datalayout (N,H,W,C)"""
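+    # usage sketch (assuming model is a finn ModelWrapper):
+    #   model = model.transform(ChangeDataLayoutQuantAvgPool2d())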
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "QuantAvgPool2d" and (
+                get_by_name(n.attribute, "data_layout") is None
+                or get_by_name(n.attribute, "data_layout").s.decode("UTF-8") == "NCHW"
+            ):
+                graph_modified = True
+                node_input = n.input[0]
+                node_output = n.output[0]
+                s = get_by_name(n.attribute, "stride").i
+                k = get_by_name(n.attribute, "kernel").i
+                ibits = get_by_name(n.attribute, "ibits").i
+                obits = get_by_name(n.attribute, "obits").i
+                signed = get_by_name(n.attribute, "signed").i
+                batchsize = model.get_tensor_shape(n.input[0])[0]  # assume NCHW
+                channels = model.get_tensor_shape(n.input[0])[1]  # assume NCHW
+                idim = model.get_tensor_shape(n.input[0])[-1]  # assume NCHW
+                odim = model.get_tensor_shape(n.output[0])[-1]  # assume NCHW
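+                # note: spatial dims are assumed square (idim/odim are taken
+                # from the last axis and reused for both H and W)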
+
+                # create new nodes
+                # NCHW -> NHWC
+                # create new intermediate values
+                inp_trans_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (batchsize, idim, idim, channels),  # NHWC
+                )
+                graph.value_info.append(inp_trans_out)
+                inp_trans_out = inp_trans_out.name
+                quantavg_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (batchsize, odim, odim, channels),
+                )
+                graph.value_info.append(quantavg_out)
+                quantavg_out = quantavg_out.name
+                inp_trans_node = helper.make_node(
+                    "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1]
+                )
+                quantavg_node = helper.make_node(
+                    "QuantAvgPool2d",
+                    [inp_trans_out],
+                    [quantavg_out],
+                    domain="finn",
+                    stride=s,
+                    kernel=k,
+                    ibits=ibits,
+                    obits=obits,
+                    signed=signed,
+                    data_layout="NHWC",
+                )
+                # NHWC -> NCHW
+                out_trans_node = helper.make_node(
+                    "Transpose", [quantavg_out], [node_output], perm=[0, 3, 1, 2]
+                )
+                # insert nodes
+                graph.node.insert(node_ind, inp_trans_node)
+                graph.node.insert(node_ind + 1, quantavg_node)
+                graph.node.insert(node_ind + 2, out_trans_node)
+                # remove old nodes
+                graph.node.remove(n)
+
+                # set shapes
+                model.set_tensor_shape(inp_trans_out, (batchsize, idim, idim, channels))
+                model.set_tensor_shape(quantavg_out, (batchsize, odim, odim, channels))
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py
index 207075b00de1871da19ea78472125d435449ed6e..62ee92df54eee2b63d84657515d7fbc3a8808b81 100644
--- a/src/finn/transformation/fpgadataflow/annotate_resources.py
+++ b/src/finn/transformation/fpgadataflow/annotate_resources.py
@@ -69,6 +69,9 @@ class AnnotateResources(Transformation):
                     total_dict[r_type] += r_amount
                 else:
                     total_dict[r_type] = r_amount
+        for k in total_dict.keys():
+            if "efficiency" in k:
+                total_dict[k] = total_dict[k] / len(graph.node)
         model.set_metadata_prop("res_total_" + self.mode, str(total_dict))
         for node in graph.node:
             if _is_fpgadataflow_node(node) and node.name in res_dict.keys():
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index d421a5f3ef8ca980b399087de1482b2ae913da1b..b70b126680d650547cf376dd601c048c73a1cfd4 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -34,6 +34,7 @@ from finn.custom_op.registry import getCustomOp
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
 import finn.core.data_layout as DataLayout
+from finn.util.basic import get_by_name
 
 
 class InferConvInpGen(Transformation):
@@ -56,6 +57,7 @@ class InferConvInpGen(Transformation):
                 k = i2c_inst.get_nodeattr("kernel_size")
                 pad = i2c_inst.get_nodeattr("pad_amount")
                 pad_val = i2c_inst.get_nodeattr("pad_value")
+                depthwise = i2c_inst.get_nodeattr("depthwise")
                 ifm_ch = i2c_in_shape[-1]
                 ifm_dim = i2c_in_shape[1]
                 ofm_dim = i2c_out_shape[1]
@@ -67,7 +69,11 @@ class InferConvInpGen(Transformation):
 
                 if pad > 0:
                     # if padding enabled, ensure pad_val supported by DataType
-                    assert dt.allowed(pad_val), "Im2Col DataType must support pad_val"
+                    assert (
+                        pad_val == 0
+                    ), "FMPadding_Batch doesn't currently support pad_val != 0"
 
                     odim_padding = ifm_dim + 2 * pad
 
@@ -112,6 +118,7 @@ class InferConvInpGen(Transformation):
                     Stride=stride,
                     inputDataType=dt.name,
                     outputDataType=dt.name,
+                    depthwise=depthwise,
                 )
                 graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
@@ -169,6 +176,137 @@ class InferStreamingMaxPool(Transformation):
         return (model, graph_modified)
 
 
+class InferPool_Batch(Transformation):
+    """If kernel_shape > strides, replace Pool layer with  with of Im2col
+    + pool(with kernel_shape == strides), plus Transpose layers to keep the original
+    data layout."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type in ["MaxPool"]:
+                # extract pool parameters
+                k = get_by_name(n.attribute, "kernel_shape").ints[-1]
+                stride = get_by_name(n.attribute, "strides").ints[-1]
+
+                if k <= stride:
+                    continue
+
+                try:
+                    pad = get_by_name(n.attribute, "pads").ints[-1]
+                except AttributeError:
+                    pad = 0
+
+                node_input = n.input[0]
+                node_output = n.output[0]
+                idt = model.get_tensor_datatype(node_input)
+                if not idt.is_integer():
+                    continue
+
+                # odt = model.get_tensor_datatype(node_output)
+
+                ifm_ch = model.get_tensor_shape(n.input[0])[1]  # assume NCHW
+                ofm_ch = ifm_ch
+                ifm_dim = model.get_tensor_shape(n.input[0])[-1]  # assume NCHW
+                ofm_dim = model.get_tensor_shape(n.output[0])[-1]  # assume NCHW
+                # create new intermediate values
+                inp_trans_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (1, ifm_dim, ifm_dim, ifm_ch),  # NHWC
+                )
+                graph.value_info.append(inp_trans_out)
+                inp_trans_out = inp_trans_out.name
+                model.set_tensor_datatype(inp_trans_out, idt)
+
+                im2col_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (1, ofm_dim, ofm_dim, ifm_ch * k * k),
+                )
+                graph.value_info.append(im2col_out)
+                im2col_out = im2col_out.name
+                model.set_tensor_datatype(im2col_out, idt)
+
+                pool_output = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (1, ofm_dim, ofm_dim, ofm_ch),
+                )
+                graph.value_info.append(pool_output)
+                pool_output = pool_output.name
+                # model.set_tensor_datatype(pool_output, odt)
+
+                # create new nodes
+                # NCHW -> NHWC
+                inp_trans_node = helper.make_node(
+                    "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1]
+                )
+
+                if n.op_type == "MaxPool":
+                    pool_fxn = "MaxPool"
+                    pad_value = idt.min()
+                else:
+                    raise Exception(
+                        "pad_value and pool_fxn not configured for {}".format(n.op_type)
+                    )
+
+                # format input tensor
+                im2col_node = helper.make_node(
+                    "Im2Col",
+                    [inp_trans_out],
+                    [im2col_out],
+                    domain="finn",
+                    stride=stride,
+                    kernel_size=k,
+                    pad_amount=pad,
+                    pad_value=pad_value,
+                    depthwise=1,
+                    input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
+                )
+
+                # Warning: PE has to be equal to ifm_ch until Im2Col is
+                # replaced by ConvolutionInputGenerator with depthwise=1.
+                # For other settings the output will be incorrect due to the
+                # incorrect input data layout.
+                pool_node = helper.make_node(
+                    "Pool_Batch",
+                    [im2col_out],
+                    [pool_output],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    dataType=idt.name,
+                    Channels=ifm_ch,
+                    PE=ifm_ch,
+                    KernelSize=k,
+                    Function=pool_fxn,
+                    OutImgDim=ofm_dim,
+                    BatchSize=1,
+                )
+
+                # NHWC -> NCHW
+                out_trans_node = helper.make_node(
+                    "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2]
+                )
+
+                # insert nodes where the conv is to preserve topological ordering
+                graph.node.insert(node_ind, inp_trans_node)
+                graph.node.insert(node_ind + 1, im2col_node)
+                graph.node.insert(node_ind + 2, pool_node)
+                graph.node.insert(node_ind + 3, out_trans_node)
+                # remove old node
+                graph.node.remove(n)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class InferBinaryStreamingFCLayer(Transformation):
     """Convert XnorPopcountMatMul layers to
     StreamingFCLayer_Batch layers. Any immediately following MultiThreshold
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 32f32ece585a93465ba32fede45d5eb606a2b0a3..04dd437af27b9fbe18b2255c20a8e4acda03b3d0 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -31,23 +31,34 @@ from onnx import helper as oh
 
 from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
+from finn.util.basic import get_by_name
+
+import numpy as np
 
 
 class InsertTLastMarker(Transformation):
-    """Ensure that the graph is terminated with a TLastMarker node, inserting
-    one if necessary."""
+    """Ensure that the graph is started/terminated with a TLastMarker node, inserting
+    one if necessary. Use constructor args to determine type of TLastMarker to be inserted.
+    More information available on the TLastMarker documentation.
+    """
 
-    def __init__(self):
+    def __init__(self, both=False, external=True, dynamic=True):
         super().__init__()
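+        # both: also insert a TLastMarker on the graph input
+        # external: inserted markers use Protocol="external" (else "internal")
+        # dynamic: inserted markers get DynIters=1 (iteration count set at runtime)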
+        self.dyniters = dynamic
+        self.external = external
+        self.both = both
 
     def apply(self, model):
         # TODO only makes sense for a pure fpgadataflow graph -- check!
         graph_out_name = model.graph.output[0].name
         final_node = model.find_producer(graph_out_name)
-        if final_node.op_type == "TLastMarker":
-            # TODO maybe check the correctness of properties
-            return (model, False)
-        else:
+        graph_modified = False
+        if final_node.op_type != "TLastMarker" and not (
+            final_node.op_type == "IODMA"
+            and get_by_name(final_node.attribute, "direction").s.decode("UTF-8")
+            == "out"
+        ):
+
             custom_op = getCustomOp(final_node)
             num_iters = int(custom_op.get_number_output_values())
             stream_width = int(custom_op.get_outstream_width())
@@ -69,8 +80,51 @@ class InsertTLastMarker(Transformation):
                 NumIters=num_iters,
                 StreamWidth=stream_width,
                 ElemWidth=elem_width,
+                DynIters=(1 if self.dyniters else 0),
+                Direction="out",
+                Protocol=("external" if self.external else "internal"),
                 domain="finn",
                 backend="fpgadataflow",
             )
             model.graph.node.append(tlast_node)
-            return (model, True)
+            graph_modified = True
+        # if both is True, also insert marker on input
+        if self.both:
+            graph_in_name = model.graph.input[0].name
+            first_node = model.find_consumer(graph_in_name)
+            if first_node.op_type != "TLastMarker" and not (
+                first_node.op_type == "IODMA"
+                and get_by_name(first_node.attribute, "direction").s.decode("UTF-8")
+                == "in"
+            ):
+
+                custom_op = getCustomOp(first_node)
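+                # number of stream transactions = product of the folded input
+                # shape, excluding the batch (first) and packed-element (last) dims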
+                num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
+                stream_width = int(custom_op.get_instream_width())
+                in_shape = model.get_tensor_shape(graph_in_name)
+                in_dtype = model.get_tensor_datatype(graph_in_name)
+                elem_width = in_dtype.bitwidth()
+                # make new buffer
+                first_node_in = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
+                )
+                model.graph.value_info.append(first_node_in)
+                model.set_tensor_datatype(first_node_in.name, in_dtype)
+                # reroute the first node's input to the new buffer
+                first_node.input[0] = first_node_in.name
+                tlast_node = oh.make_node(
+                    "TLastMarker",
+                    [graph_in_name],
+                    [first_node_in.name],
+                    NumIters=num_iters,
+                    StreamWidth=stream_width,
+                    ElemWidth=elem_width,
+                    DynIters=(1 if self.dyniters else 0),
+                    Direction="in",
+                    Protocol=("external" if self.external else "internal"),
+                    domain="finn",
+                    backend="fpgadataflow",
+                )
+                model.graph.node.insert(0, tlast_node)
+                graph_modified = True
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
index a1524322ec03a4e96ef41f999144e3eed349c5af..6eae560e1191642cfaf85d92c6d0fcf644630973 100644
--- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py
+++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
@@ -29,9 +29,12 @@
 import os
 
 import finn.custom_op.registry as registry
-from finn.transformation import Transformation
 from finn.util.basic import make_build_dir
 from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.transformation import Transformation
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
+import copy
 
 
 def _codegen_single_node(node, model):
@@ -66,8 +69,39 @@ class PrepareCppSim(Transformation):
     that contains generated C++ code that can be used to simulate node using cppsim.
     The subsequent transformation is CompileCppSim"""
 
+    def __init__(self, num_workers=None):
+        super().__init__()
+        if num_workers is None:
+            self._num_workers = get_num_default_workers()
+        else:
+            self._num_workers = num_workers
+        assert self._num_workers >= 0, "Number of workers must be nonnegative."
+        if self._num_workers == 0:
+            self._num_workers = mp.cpu_count()
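+        # num_workers: None -> get_num_default_workers(), 0 -> all CPU cores,
+        # N > 0 -> exactly N parallel workers
+        # usage sketch: model = model.transform(PrepareCppSim(num_workers=4))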
+
+    def prepareCppSim_node(self, node):
+        if is_fpgadataflow_node(node) is True:
+            _codegen_single_node(node, self.model)
+        return (node, False)
+
     def apply(self, model):
-        for node in model.graph.node:
-            if is_fpgadataflow_node(node) is True:
-                _codegen_single_node(node, model)
-        return (model, False)
+        # Remove old nodes from the current model
+        self.model = copy.deepcopy(model)
+        old_nodes = []
+        for i in range(len(model.graph.node)):
+            old_nodes.append(model.graph.node.pop())
+
+        # Execute transformation in parallel
+        with mp.Pool(self._num_workers) as p:
+            new_nodes_and_bool = p.map(self.prepareCppSim_node, old_nodes, chunksize=1)
+
+        # extract nodes and check if the transformation needs to run again
+        # Note: .pop() had initially reversed the node order
+        run_again = False
+        for node, run in reversed(new_nodes_and_bool):
+            # Reattach new nodes to old model
+            model.graph.node.append(node)
+            if run is True:
+                run_again = True
+
+        return (model, run_again)
diff --git a/src/finn/transformation/infer_data_layouts.py b/src/finn/transformation/infer_data_layouts.py
index 9ac75578ffb911cc44cfddc2b2119b55e6abf2dd..e7a6b88239a1735d5379e165333f8356ae6f88a1 100644
--- a/src/finn/transformation/infer_data_layouts.py
+++ b/src/finn/transformation/infer_data_layouts.py
@@ -38,7 +38,7 @@ def _dims_to_layout(model, node, ndims):
         return DataLayout.NC
     else:
         if node.domain == "finn":
-            if node.op_type == "MultiThreshold":
+            if node.op_type == "MultiThreshold" or node.op_type == "QuantAvgPool2d":
                 mt_inst = registry.getCustomOp(node)
                 layout = mt_inst.get_nodeattr("data_layout")
                 if layout == "NHWC" and ndims == 4:
diff --git a/src/finn/transformation/infer_datatypes.py b/src/finn/transformation/infer_datatypes.py
index 1acd4e3abe2d77248810cf15c15475e806a3bd32..39b7a787be8c725e7b6d474757dd96fc4848dfe0 100644
--- a/src/finn/transformation/infer_datatypes.py
+++ b/src/finn/transformation/infer_datatypes.py
@@ -71,7 +71,13 @@ def _infer_node_datatype(model, node):
         else:
             # unknown, assume node produces float32 outputs
             for o in node.output:
-                model.set_tensor_datatype(o, DataType.FLOAT32)
+                # check if output datatype is already set to a value != FLOAT32
+                odtype = model.get_tensor_datatype(o)
+                if odtype is not None and odtype != DataType.FLOAT32:
+                    # don't change data type
+                    model.set_tensor_datatype(o, odtype)
+                else:
+                    model.set_tensor_datatype(o, DataType.FLOAT32)
     # compare old and new output dtypes to see if anything changed
     new_odtypes = list(map(lambda x: model.get_tensor_datatype(x), node.output))
     graph_modified = new_odtypes != odtypes
diff --git a/src/finn/transformation/lower_convs_to_matmul.py b/src/finn/transformation/lower_convs_to_matmul.py
index 3da785d8dd21b2c6701bffc8ce3869fb14b237a9..aa231a43a3865a161a501b4997ff2f538800554f 100644
--- a/src/finn/transformation/lower_convs_to_matmul.py
+++ b/src/finn/transformation/lower_convs_to_matmul.py
@@ -80,14 +80,19 @@ class LowerConvsToMatMul(Transformation):
                 inp_trans_out = inp_trans_out.name
                 model.set_tensor_datatype(inp_trans_out, idt)
 
-                im2col_out = helper.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    (1, ofm_dim, ofm_dim, ifm_ch * k * k),
-                )
-                graph.value_info.append(im2col_out)
-                im2col_out = im2col_out.name
-                model.set_tensor_datatype(im2col_out, idt)
+                need_im2col = True
+                if k == 1 and pad == 0 and stride == 1:
+                    need_im2col = False
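+                    # a 1x1 kernel with no padding and unit stride reads each
+                    # pixel exactly once, so the conv is a plain per-pixel
+                    # MatMul and Im2Col can be skipped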
+
+                if need_im2col:
+                    im2col_out = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        (1, ofm_dim, ofm_dim, ifm_ch * k * k),
+                    )
+                    graph.value_info.append(im2col_out)
+                    im2col_out = im2col_out.name
+                    model.set_tensor_datatype(im2col_out, idt)
 
                 matmul_out = helper.make_tensor_value_info(
                     model.make_new_valueinfo_name(),
@@ -104,19 +109,23 @@ class LowerConvsToMatMul(Transformation):
                     "Transpose", [cnv_input], [inp_trans_out], perm=[0, 2, 3, 1]
                 )
                 # lower input tensor
-                im2col_node = helper.make_node(
-                    "Im2Col",
-                    [inp_trans_out],
-                    [im2col_out],
-                    domain="finn",
-                    stride=stride,
-                    kernel_size=k,
-                    pad_amount=pad,
-                    input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
-                )
+                matmul_input = inp_trans_out
+                if need_im2col:
+                    matmul_input = im2col_out
+                    im2col_node = helper.make_node(
+                        "Im2Col",
+                        [inp_trans_out],
+                        [im2col_out],
+                        domain="finn",
+                        stride=stride,
+                        kernel_size=k,
+                        pad_amount=pad,
+                        input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
+                    )
+
                 # do matmul
                 matmul_node = helper.make_node(
-                    "MatMul", [im2col_out, weight_name], [matmul_out]
+                    "MatMul", [matmul_input, weight_name], [matmul_out]
                 )
                 # NHWC -> NCHW
                 out_trans_node = helper.make_node(
@@ -124,9 +133,13 @@ class LowerConvsToMatMul(Transformation):
                 )
                 # insert nodes where the conv is to preserve topological ordering
                 graph.node.insert(node_ind, inp_trans_node)
-                graph.node.insert(node_ind + 1, im2col_node)
-                graph.node.insert(node_ind + 2, matmul_node)
-                graph.node.insert(node_ind + 3, out_trans_node)
+                if need_im2col:
+                    graph.node.insert(node_ind + 1, im2col_node)
+                    graph.node.insert(node_ind + 2, matmul_node)
+                    graph.node.insert(node_ind + 3, out_trans_node)
+                else:
+                    graph.node.insert(node_ind + 1, matmul_node)
+                    graph.node.insert(node_ind + 2, out_trans_node)
                 # remove old nodes
                 graph.node.remove(n)
         model = model.transform(InferShapes())
diff --git a/src/finn/transformation/remove_identity.py b/src/finn/transformation/remove_identity.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7a58d59c1bb8ff643e691442e7eda3c0516aa5c
--- /dev/null
+++ b/src/finn/transformation/remove_identity.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.transformation import Transformation
+
+
+def _is_identity(node, model):
+    if node.op_type == "Mul":
+        scale = model.get_initializer(node.input[1])
+        if scale is not None:
+            return (scale == 1).all()
+    elif node.op_type == "Add":
+        bias = model.get_initializer(node.input[1])
+        if bias is not None:
+            return (bias == 0).all()
+    return False
+
+
+class RemoveIdentity(Transformation):
+    """Remove nodes that apply identity ops from the graph, including:
+    * Multiply by 1
+    * Add 0
+    ."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if _is_identity(node, model):
+                node_src = node.input[0]
+                node_dst = node.output[0]
+                graph.node.remove(node)
+                model.rename_tensor(node_dst, node_src)
+                graph_modified = True
+        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/__init__.py b/src/finn/transformation/streamline/__init__.py
index c9c73fa4c8303ee28bc1cc6aee879d633740e01e..d9c12a20975084705b801c0ff027d4b99aff9490 100644
--- a/src/finn/transformation/streamline/__init__.py
+++ b/src/finn/transformation/streamline/__init__.py
@@ -41,6 +41,7 @@ from finn.transformation.streamline.absorb import (
     FactorOutMulSignMagnitude,
     Absorb1BitMulIntoMatMul,
     Absorb1BitMulIntoConv,
+    AbsorbSignBiasIntoMultiThreshold,
 )
 
 from finn.transformation.streamline.collapse_repeated import (
@@ -59,6 +60,7 @@ from finn.transformation.streamline.reorder import (
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
 from finn.transformation.streamline.sign_to_thres import ConvertSignToThres
 from finn.transformation.batchnorm_to_affine import BatchNormToAffine
+from finn.transformation.streamline.remove import RemoveIdentityOps
 
 
 class Streamline(Transformation):
@@ -70,6 +72,7 @@ class Streamline(Transformation):
             ConvertDivToMul(),
             BatchNormToAffine(),
             ConvertSignToThres(),
+            AbsorbSignBiasIntoMultiThreshold(),
             MoveAddPastMul(),
             MoveScalarAddPastMatMul(),
             MoveScalarAddPastConv(),
@@ -87,6 +90,7 @@ class Streamline(Transformation):
         ]
         for trn in streamline_transformations:
             model = model.transform(trn)
+            model = model.transform(RemoveIdentityOps())
             model = model.transform(GiveUniqueNodeNames())
             model = model.transform(GiveReadableTensorNames())
             model = model.transform(InferDataTypes())
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index dbcf97361017144174f9fbfca35a84361b5abd26..dc01eea411fc1f640e481c9be02a92acdd59533f 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -28,14 +28,80 @@
 
 import numpy as np
 from onnx import helper as oh
+import warnings
 
 from finn.core.datatype import DataType
 from finn.transformation import Transformation
 from finn.util.basic import get_by_name
 from finn.custom_op.registry import getCustomOp
+from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
 
 
+class AbsorbSignBiasIntoMultiThreshold(Transformation):
+    """Absorb scalar bias originating from signed int export back into
+    MultiThreshold and re-evaluate the output datatype."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            # search for (MultiThreshold, Add) pair
+            node_ind += 1
+            if (
+                n.op_type == "MultiThreshold"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                consumer = model.find_consumer(n.output[0])
+                if consumer is not None and consumer.op_type == "Add":
+                    mt_node = n
+                    add_node = consumer
+                    threshold_name = mt_node.input[1]
+                    add_weight_name = add_node.input[1]
+                    T = model.get_initializer(threshold_name)
+                    A = model.get_initializer(add_weight_name)
+                    if (A is None) or (T is None):
+                        warnings.warn("Threshold or add bias not constant, skipping")
+                        continue
+                    end_name = add_node.output[0]
+                    # we can only absorb scalar adds
+                    is_scalar = A.ndim == 0 or all(x == 1 for x in A.shape)
+                    if not is_scalar:
+                        continue
+                    bias = A.flatten()[0]
+                    # set MultiThreshold bias property
+                    mt_inst = getCustomOp(mt_node)
+                    bias += mt_inst.get_nodeattr("out_bias")
+                    mt_inst.set_nodeattr("out_bias", bias)
+                    graph_modified = True
+                    # compute new DataType for MultiThreshold output
+                    steps = T.shape[-1]
+                    new_min = bias
+                    new_max = steps + bias
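+                    # e.g. 255 thresholds with bias -128: outputs span
+                    # [-128, 127], which fits INT8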
+                    odt = DataType.get_smallest_possible(steps).name.replace(
+                        "UINT", "INT"
+                    )
+                    odt = DataType[odt]
+                    assert odt.allowed(new_max) and odt.allowed(
+                        new_min
+                    ), """Could
+                    not compute new MultiThreshold DataType (min = %d max = %d)""" % (
+                        new_min,
+                        new_max,
+                    )
+                    mt_inst.set_nodeattr("out_dtype", odt.name)
+                    # remove Add node, rewire MultiThreshold
+                    graph.node.remove(add_node)
+                    mt_node.output[0] = end_name
+                    # set datatype
+                    model.set_tensor_datatype(end_name, odt)
+        if graph_modified:
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class AbsorbAddIntoMultiThreshold(Transformation):
     """Absorb preceding Add ops into MultiThreshold by updating the threshold
     values. Only scalar/1D add vectors can be absorbed."""
@@ -290,3 +356,38 @@ class AbsorbTransposeIntoMultiThreshold(Transformation):
         if graph_modified:
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class AbsorbScalarMulIntoTopK(Transformation):
+    """Absorb a mul node into a suceeding topk node if the mul is scalar."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "TopK":
+                prod = model.find_producer(n.input[0])
+                if prod is not None and prod.op_type == "Mul":
+                    prod_input = prod.input[0]
+                    param_name = prod.input[1]
+                    A = model.get_initializer(param_name)
+                    if A is None:
+                        warnings.warn("Param is not constant, skipping")
+                        continue
+                    if all(x == 1 for x in A.shape) and A > 0:
+                        # if the mul is scalar and positive, we can just
+                        # delete the mul node and rewire the TopK node: TopK
+                        # only compares the relative order of its inputs, and
+                        # multiplying every value by the same positive scalar
+                        # preserves that order
+                        graph.node.remove(prod)
+                        n.input[0] = prod_input
+                        # set the datatype to FLOAT32 to avoid datatype inference errors
+                        model.set_tensor_datatype(n.input[0], DataType.FLOAT32)
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py
index 67824ad4f633983b93e3178d03118927a1ddd85b..769bed841ce07c1c9c62f762de4b2c0937a6d68f 100644
--- a/src/finn/transformation/streamline/collapse_repeated.py
+++ b/src/finn/transformation/streamline/collapse_repeated.py
@@ -30,6 +30,7 @@ from onnx import helper as oh
 
 from finn.transformation import Transformation
 from finn.transformation.infer_shapes import InferShapes
+from finn.core.datatype import DataType
 
 
 class CollapseRepeatedOp(Transformation):
@@ -83,6 +84,9 @@ class CollapseRepeatedOp(Transformation):
                     graph.node.insert(node_ind, new_node)
                     # replace parameter value
                     model.set_initializer(new_node_param_name, new_param)
+                    # be conservative with param/output DataTypes
+                    model.set_tensor_datatype(new_node_param_name, DataType.FLOAT32)
+                    model.set_tensor_datatype(end_name, DataType.FLOAT32)
                     # remove old nodes
                     graph.node.remove(n)
                     graph.node.remove(consumer)
diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddc4233ddafbc70c4d20d316ea72ea6bba1b82a8
--- /dev/null
+++ b/src/finn/transformation/streamline/remove.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+from finn.transformation import Transformation
+from finn.transformation.infer_shapes import InferShapes
+import numpy as np
+
+
+class RemoveIdentityOps(Transformation):
+    """Remove identity ops like Add/Sub with zero or Mul/Div with one"""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type in ["Add", "Sub"]
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                A = model.get_initializer(n.input[1])
+                if A is not None and (A == np.zeros_like(A)).all():
+                    producer = model.find_producer(n.input[0])
+                    # remove node and wire output tensor to
+                    # output of producer node
+                    producer.output[0] = n.output[0]
+                    graph.node.remove(n)
+                    graph_modified = True
+
+            elif (
+                n.op_type in ["Mul", "Div"]
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                A = model.get_initializer(n.input[1])
+                if A is not None and (A == np.ones_like(A)).all():
+                    producer = model.find_producer(n.input[0])
+                    # remove node and wire output tensor to
+                    # output of producer node
+                    producer.output[0] = n.output[0]
+                    graph.node.remove(n)
+                    graph_modified = True
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 0b6259a61d3eb67b7b38d4c6939019ce2893a875..a1bd16f6d0b70193122d5d067ccdee395260c7b1 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -27,12 +27,15 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
+import warnings
 from onnx import helper as oh
 
 from finn.transformation import Transformation
 from finn.transformation.infer_shapes import InferShapes
+from finn.core.datatype import DataType
 from finn.core.onnx_exec import execute_node
 from finn.util.basic import get_by_name
+from finn.custom_op.registry import getCustomOp
 
 
 class MoveAddPastMul(Transformation):
@@ -336,6 +339,71 @@ class MoveScalarMulPastConv(Transformation):
         return (model, graph_modified)
 
 
+class MoveMulPastDWConv(Transformation):
+    """Move channelwise mul operations past depthwise conv operations. We want to have muls
+    next to each other such that they can be collapsed into a single mul."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if (
+                n.op_type == "Mul"
+                and not model.is_fork_node(n)
+                and not model.is_join_node(n)
+            ):
+                consumer = model.find_consumer(n.output[0])
+                if (
+                    consumer is not None
+                    and consumer.op_type == "Conv"
+                    and not model.is_join_node(consumer)
+                ):
+                    mul_weight_name = n.input[1]
+                    A = model.get_initializer(mul_weight_name)
+                    if A is None:
+                        warnings.warn(
+                            """Mul weight tensor is not set. If it is a constant,
+                                please use set_initializer to set the tensor."""
+                        )
+                        continue
+                    conv_node = consumer
+                    mul_node = n
+                    start_name = mul_node.input[0]
+                    conv_in_name = conv_node.input[0]
+                    conv_in_shape = model.get_tensor_shape(conv_in_name)
+                    ifm_ch = conv_in_shape[1]
+                    group_attribute = get_by_name(consumer.attribute, "group")
+                    if group_attribute is None:
+                        continue
+                    group_attribute = group_attribute.i
+                    conv_out_name = conv_node.output[0]
+                    conv_out_shape = model.get_tensor_shape(conv_out_name)
+                    if A.shape == (1, ifm_ch, 1, 1) and ifm_ch == group_attribute:
+                        # if the mul is channelwise and conv is depthwise,
+                        # we can simply swap the order of ops
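+                        # (each output channel of a depthwise conv depends only
+                        # on its own input channel, so a per-channel scale
+                        # commutes with the conv by linearity)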
+                        # rewire mul input to be conv input
+                        conv_node.input[0] = start_name
+                        model.set_tensor_shape(start_name, conv_in_shape)
+                        model.set_tensor_datatype(start_name, DataType.FLOAT32)
+                        # use old conv input tensor as conv output
+                        conv_node.output[0] = conv_in_name
+                        model.set_tensor_shape(conv_in_name, conv_out_shape)
+                        model.set_tensor_datatype(conv_in_name, DataType.FLOAT32)
+                        # use new conv output as new mul node input
+                        mul_node.input[0] = conv_in_name
+                        # use old conv output as new mul node output
+                        mul_node.output[0] = conv_out_name
+                        model.set_tensor_datatype(conv_out_name, DataType.FLOAT32)
+                        # move mul node past conv node
+                        graph.node.remove(mul_node)
+                        graph.node.insert(node_ind, mul_node)
+                        graph_modified = True
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
+
+
 class MoveLinearPastEltwiseAdd(Transformation):
     """Move linear operations (mul, add) past elementwise add operations where possible.
        Specifically,matches and transforms the following patterns:
@@ -531,3 +599,67 @@ class MoveMulPastFork(MoveOpPastFork):
 class MoveLinearPastFork(MoveOpPastFork):
     def __init__(self):
         super().__init__(["Add", "Mul"])
+
+
+class MoveMaxPoolPastMultiThreshold(Transformation):
+    """Move MaxPool nodes past MultiThreshold nodes on linear segments of the graph."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        nodes = [n for n in graph.node]
+        for n in nodes:
+            node_ind += 1
+            if n.op_type == "MaxPool" and not model.is_fork_node(n):
+                consumer = model.find_consumer(n.output[0])
+                pads = get_by_name(n.attribute, "pads")
+                has_padding = False
+                if pads is not None:
+                    pads = list(pads.ints)
+                    has_padding = np.any(np.asarray(pads) != 0)
+                if consumer is not None and consumer.op_type == "MultiThreshold":
+                    mt_out = consumer.output[0]
+                    mt_odt = model.get_tensor_datatype(mt_out)
+                    if mt_odt.signed() and has_padding:
+                        warnings.warn(
+                            "Skipping padded MaxPool + signed-output MultiThreshold"
+                        )
+                        continue
+                    # check for non-decreasing thresholds and nonnegative
+                    # scale factor in MultiThreshold
+                    # otherwise we cannot do the reordering
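+                    # (thresholding with sorted thresholds and nonnegative
+                    # out_scale is a monotone function, and monotone functions
+                    # commute with max: f(max(a, b)) == max(f(a), f(b)))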
+                    T = model.get_initializer(consumer.input[1])
+                    T_sorted = np.sort(T, axis=1)
+                    assert (
+                        T == T_sorted
+                    ).all(), "MultiThreshold must have non-decreasing thresholds"
+                    mt_inst = getCustomOp(consumer)
+                    if mt_inst.get_nodeattr("out_scale") < 0:
+                        warnings.warn("Skipping MultiThreshold with negative out_scale")
+                        continue
+
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+
+                    # swap connections
+                    group_in = n.input[0]
+                    # new tensor because dims change
+                    group_middle = model.make_new_valueinfo_name()
+                    group_out = consumer.output[0]
+
+                    consumer.input[0] = group_in
+                    consumer.output[0] = group_middle
+
+                    n.input[0] = group_middle
+                    n.output[0] = group_out
+
+                    # insert them back in
+                    graph.node.insert(node_ind - 1, consumer)
+                    graph.node.insert(node_ind, n)
+
+                    graph_modified = True
+
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index eb3d46bcd66e3dc307a679e6b8dfbb9913398d36..4a8277e08d3fc21e0b20668edf2ecad947b36647 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -31,6 +31,7 @@ import random
 import string
 import subprocess
 import tempfile
+import warnings
 
 import numpy as np
 
@@ -105,6 +106,25 @@ def get_finn_root():
         )
 
 
+def get_execution_error_thresh():
+    "Return the max error that is allowed for rounding in FINN execution."
+    try:
+        return float(os.environ["ERROR_THRESH"])
+    except KeyError:
+        return 1e-2
+
+
+def get_sanitize_quant_tensors():
+    """Return whether tensors with quantization annotations should be sanitized.
+    Enabled by default, disabling will yield faster ONNX execution but may give
+    incorrect results. Use with caution."""
+    try:
+        return int(os.environ["SANITIZE_QUANT_TENSORS"])
+    except KeyError:
+        # enabled by default
+        return 1
+
+
 def make_build_dir(prefix=""):
     """Creates a temporary folder with given prefix to be used as a build dir.
     Use this function instead of tempfile.mkdtemp to ensure any generated files
@@ -264,6 +284,69 @@ def calculate_signed_dot_prod_range(dt_a, dt_b, len):
     return (min_prod, max_prod)
 
 
+def sanitize_quant_values(model, node_tensors, execution_context, check_values=False):
+    """ Sanitize given list of tensors in execution_context by rounding values
+    that are supposed to be integers (as indicated by their quantization
+    annotation). Will raise an exception if the amount of rounding is too large.
+    Returns the sanitized execution context.
+
+    If check_values is specified, an extra DataType.allowed() check will be
+    performed on any rounded tensors.
+
+    Background:
+    FINN uses floating point tensors as a carrier data type to represent
+    integers. Floating point arithmetic can introduce rounding errors, e.g.
+    (int_num * float_scale) / float_scale is not always equal to int_num.
+    We use this function to ensure that the values that are supposed to be
+    integers are indeed integers.
+    """
+
+    for tensor in node_tensors:
+        dtype = model.get_tensor_datatype(tensor)
+        # floats don't need sanitization, skip to the next tensor
+        # (this early exit avoids the costly per-element check below)
+        if dtype == DataType.FLOAT32:
+            continue
+        current_values = execution_context[tensor]
+        updated_values = current_values
+        has_to_be_rounded = False
+        # TODO: vectorize with numpy
+        for value in np.nditer(current_values):
+            if not dtype.allowed(value):
+                has_to_be_rounded = True
+                break
+        if has_to_be_rounded:
+            updated_values = np.round(current_values)
+            warnings.warn(
+                "The values of tensor {} can't be represented "
+                "with the set FINN datatype ({}), they will be rounded to match the "
+                "FINN datatype.".format(tensor, dtype)
+            )
+        # check if rounded values are not too far from original values
+        max_error = max(np.abs(current_values - updated_values).flatten())
+        if max_error <= get_execution_error_thresh():
+            if check_values is True:
+                # check again if values can now be represented with set finn datatype
+                # TODO: vectorize with numpy
+                for value in np.nditer(updated_values):
+                    if not dtype.allowed(value):
+                        raise Exception(
+                            """Values can't be represented with set
+                                finn datatype ({}) for input {}""".format(
+                                dtype, tensor
+                            )
+                        )
+            execution_context[tensor] = updated_values
+        else:
+            raise Exception(
+                """Rounding error is too high to match set FINN
+            datatype ({}) for input {}""".format(
+                    dtype, tensor
+                )
+            )
+    return execution_context
+
+
 class CppBuilder:
     """Builds the g++ compiler command to produces the executable of the c++ code
     in code_gen_dir which is passed to the function build() of this class."""
diff --git a/src/finn/util/vivado.py b/src/finn/util/vivado.py
index 0f82c52cb2c1fc5ee4ed5a1927f46e222e0ab9b5..6b6df3940cfeeed292345382471719c49f725de6 100644
--- a/src/finn/util/vivado.py
+++ b/src/finn/util/vivado.py
@@ -28,6 +28,7 @@
 
 import os
 import subprocess
+import stat
 from finn.util.basic import get_remote_vivado
 
 
@@ -91,6 +92,7 @@ def out_of_context_synth(
     vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name)
     res_counts_path = vivado_proj_folder + "/res.txt"
     if remote_server is not None:
+        print("Using remote Vivado OOC synth, remote server %s" % remote_server)
         run_synth = """
 #!/bin/bash
 which vivado;
@@ -105,14 +107,17 @@ cat %s
         )
         with open(vivado_proj_folder + "/run.sh", "w") as f:
             f.write(run_synth)
+        st = os.stat(vivado_proj_folder + "/run.sh")
+        os.chmod(vivado_proj_folder + "/run.sh", st.st_mode | stat.S_IEXEC)
         # note that this assumes the same temp folder can be created on the
         # remote server
-        remote_server_uri = remote_server + ":" + verilog_dir
-        copy_files = "rsync -avz %s %s" % (verilog_dir + "/", remote_server_uri + "/")
+        # note we set target path as / due to use of -R (relative)
+        remote_server_uri = remote_server + ":/"
+        copy_files = "rsync -avzR %s %s" % (verilog_dir + "/", remote_server_uri)
         copy_files = copy_files.split()
         proc = subprocess.Popen(copy_files, cwd=verilog_dir, env=os.environ)
         proc.communicate()
-        vivado_cmd = "bash %s/run.sh" % vivado_proj_folder
+        vivado_cmd = "bash -ic %s/run.sh" % vivado_proj_folder
         run_vivado = ["ssh", "-t", remote_server, vivado_cmd]
         proc = subprocess.Popen(run_vivado, cwd=verilog_dir, env=os.environ)
         proc.communicate()
diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..24854a2153df9af78feb8352ca119e831a9ac9eb
--- /dev/null
+++ b/tests/brevitas/test_brevitas_avg_pool_export.py
@@ -0,0 +1,103 @@
+import os
+
+import onnx  # noqa
+import torch
+import numpy as np
+import brevitas.onnx as bo
+from brevitas.nn import QuantAvgPool2d
+from brevitas.quant_tensor import pack_quant_tensor
+from brevitas.core.quant import QuantType
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.util.basic import gen_finn_dt_tensor
+import finn.core.onnx_exec as oxe
+
+import pytest
+
+export_onnx_path = "test_avg_pool.onnx"
+
+
+@pytest.mark.parametrize("kernel_size", [2, 3])
+@pytest.mark.parametrize("stride", [1, 2])
+@pytest.mark.parametrize("signed", [False, True])
+@pytest.mark.parametrize("bit_width", [2, 4])
+@pytest.mark.parametrize("input_bit_width", [4, 8, 32])
+@pytest.mark.parametrize("channels", [2, 4])
+@pytest.mark.parametrize("idim", [7, 8])
+def test_brevitas_avg_pool_export(
+    kernel_size, stride, signed, bit_width, input_bit_width, channels, idim
+):
+    ishape = (1, channels, idim, idim)
+    ibw_tensor = torch.Tensor([input_bit_width])
+
+    b_avgpool = QuantAvgPool2d(
+        kernel_size=kernel_size,
+        stride=stride,
+        signed=signed,
+        min_overall_bit_width=bit_width,
+        max_overall_bit_width=bit_width,
+        quant_type=QuantType.INT,
+    )
+    # call forward pass manually once to cache scale factor and bitwidth
+    input_tensor = torch.from_numpy(np.zeros(ishape)).float()
+    scale = np.ones((1, channels, 1, 1))
+    output_scale = torch.from_numpy(scale).float()
+    input_quant_tensor = pack_quant_tensor(
+        tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor
+    )
+    bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor)
+    model = ModelWrapper(export_onnx_path)
+
+    # determine input FINN datatype
+    if signed is True:
+        prefix = "INT"
+    else:
+        prefix = "UINT"
+    dt_name = prefix + str(input_bit_width // 2)
+    dtype = DataType[dt_name]
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    # execution with input tensor using integers and scale = 1
+    # calculate golden output
+    inp = gen_finn_dt_tensor(dtype, ishape)
+    input_tensor = torch.from_numpy(inp).float()
+    input_quant_tensor = pack_quant_tensor(
+        tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor
+    )
+    b_avgpool.eval()
+    expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy()
+
+    # finn execution
+    idict = {model.graph.input[0].name: inp}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+    assert (expected == produced).all()
+
+    # execution with input tensor using float and scale != 1
+    scale = np.random.uniform(low=0, high=1, size=(1, channels, 1, 1)).astype(
+        np.float32
+    )
+    inp_tensor = inp * scale
+    input_tensor = torch.from_numpy(inp_tensor).float()
+    input_scale = torch.from_numpy(scale).float()
+    input_quant_tensor = pack_quant_tensor(
+        tensor=input_tensor, scale=input_scale, bit_width=ibw_tensor
+    )
+    # export again to set the scale values correctly
+    bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    b_avgpool.eval()
+    expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy()
+    # finn execution
+    idict = {model.graph.input[0].name: inp_tensor}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+
+    assert np.isclose(expected, produced).all()
+
+    os.remove(export_onnx_path)
diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py
index c9d8f2d812bc7bea1a2fd2598a7711099ad421e6..c5ddad12ca3e8d353682fbb20449d44358485f69 100644
--- a/tests/brevitas/test_brevitas_relu_act_export.py
+++ b/tests/brevitas/test_brevitas_relu_act_export.py
@@ -23,6 +23,7 @@ export_onnx_path = "test_act.onnx"
 def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type):
     min_val = -1.0
     ishape = (1, 15)
+
     b_act = QuantReLU(
         bit_width=abits,
         max_val=max_val,
@@ -67,3 +68,60 @@ scaling_impl.learned_value": torch.tensor(
 
     assert np.isclose(produced, expected, atol=1e-3).all()
     os.remove(export_onnx_path)
+
+
+@pytest.mark.parametrize("abits", [1, 2, 4, 8])
+@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)])
+@pytest.mark.parametrize("scaling_per_channel", [True, False])
+def test_brevitas_act_export_relu_imagenet(abits, max_val, scaling_per_channel):
+    out_channels = 32
+    ishape = (1, out_channels, 1, 1)
+    min_val = -1.0
+    b_act = QuantReLU(
+        bit_width=abits,
+        quant_type=QuantType.INT,
+        scaling_impl_type=ScalingImplType.PARAMETER,
+        scaling_per_channel=scaling_per_channel,
+        restrict_scaling_type=RestrictValueType.LOG_FP,
+        scaling_min_val=2e-16,
+        max_val=6.0,
+        return_quant_tensor=True,
+        per_channel_broadcastable_shape=(1, out_channels, 1, 1),
+    )
+    if scaling_per_channel is True:
+        rand_tensor = 2.0 * torch.rand((1, out_channels, 1, 1))
+    else:
+        rand_tensor = torch.tensor(1.2398)
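+    # overwrite the learned scale with a random value so that the export is
+    # checked against a non-trivial scale factor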
+    checkpoint = {
+        "act_quant_proxy.fused_activation_quant_proxy.tensor_quant.\
+scaling_impl.learned_value": rand_tensor.type(
+            torch.FloatTensor
+        )
+    }
+    b_act.load_state_dict(checkpoint)
+    bo.export_finn_onnx(b_act, ishape, export_onnx_path)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(InferShapes())
+    inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(
+        np.float32
+    )
+    idict = {model.graph.input[0].name: inp_tensor}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+    inp_tensor = torch.from_numpy(inp_tensor).float()
+    b_act.eval()
+    expected = b_act.forward(inp_tensor).tensor.detach().numpy()
+    if not np.isclose(produced, expected, atol=1e-3).all():
+        print(abits, max_val)
+        print("scale: ", b_act.quant_act_scale().type(torch.FloatTensor).detach())
+        if abits < 5:
+            print(
+                "thres:",
+                ", ".join(["{:8.4f}".format(x) for x in b_act.export_thres[0]]),
+            )
+        print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]]))
+        print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]]))
+        print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]]))
+
+    assert np.isclose(produced, expected, atol=1e-3).all()
+    os.remove(export_onnx_path)
diff --git a/tests/core/test_basic_onnx_exec.py b/tests/core/test_basic_onnx_exec.py
index a7b6da9965aa5912870812a8c1f8d6da2ee0d181..ddb2cbfc40c7647970f0c51ecb95340e7d1dddae 100644
--- a/tests/core/test_basic_onnx_exec.py
+++ b/tests/core/test_basic_onnx_exec.py
@@ -35,6 +35,8 @@ import onnx.numpy_helper as np_helper
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
+from finn.core.datatype import DataType
+from finn.util.basic import gen_finn_dt_tensor
 
 
 def test_mnist_onnx_download_extract_run():
@@ -47,9 +49,50 @@ def test_mnist_onnx_download_extract_run():
     raw_o = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/output_0.pb")
     input_tensor = onnx.load_tensor_from_string(raw_i)
     output_tensor = onnx.load_tensor_from_string(raw_o)
-    # run using FINN-based execution
+    # run using FINN-based execution (full graph)
     input_dict = {"Input3": np_helper.to_array(input_tensor)}
-    output_dict = oxe.execute_onnx(model, input_dict)
+    output_dict = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
     assert np.isclose(
         np_helper.to_array(output_tensor), output_dict["Plus214_Output_0"], atol=1e-3
     ).all()
+    # test subgraph execution
+    start_node = model.graph.node[1]
+    end_node = model.graph.node[3]
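+    # feed in the intermediate tensor recorded during full-graph execution
+    # and run only the nodes between start_node and end_node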
+    subgraph_i_dict = {start_node.input[0]: output_dict[start_node.input[0]]}
+    subgraph_o_dict = oxe.execute_onnx(
+        model,
+        subgraph_i_dict,
+        return_full_exec_context=True,
+        start_node=start_node,
+        end_node=end_node,
+    )
+    assert np.isclose(
+        subgraph_o_dict[end_node.output[0]], output_dict[end_node.output[0]], atol=1e-3
+    ).all()
+
+
+def test_onnx_exec_internal_rounding():
+    inp0 = onnx.helper.make_tensor_value_info("inp0", onnx.TensorProto.FLOAT, [2, 2])
+    inp1 = onnx.helper.make_tensor_value_info("inp1", onnx.TensorProto.FLOAT, [1])
+    outp = onnx.helper.make_tensor_value_info("outp", onnx.TensorProto.FLOAT, [2, 2])
+    mul_node = onnx.helper.make_node("Mul", inputs=["inp0", "inp1"], outputs=["outp"])
+    graph = onnx.helper.make_graph(
+        nodes=[mul_node], name="mul_graph", inputs=[inp0, inp1], outputs=[outp]
+    )
+
+    model = onnx.helper.make_model(graph, producer_name="mul-model")
+    model = ModelWrapper(model)
+    idt = DataType.INT2
+    model.set_tensor_datatype("inp0", idt)
+    model.set_tensor_datatype("inp1", idt)
+    model = model.transform(InferShapes())
+
+    mul_value = np.asarray([-1], dtype=np.float32)
+    inp_int = gen_finn_dt_tensor(idt, [2, 2])
+    scale = np.random.uniform(low=0, high=1, size=(2, 2)).astype(np.float32)
+    inp_rounded = (inp_int * scale) / (scale + 1e-7)
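+    # inp_rounded differs from the exact integers only by float error, so
+    # execution is expected to round it back to the declared INT2 values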
+    input_dict = {"inp0": inp_rounded, "inp1": mul_value}
+    output_dict = oxe.execute_onnx(model, input_dict)
+    produced = output_dict["outp"]
+    expected = np.multiply(inp_int, mul_value)
+    assert (produced == expected).all()
diff --git a/tests/end2end/test_end2end_cnv_w2a2.py b/tests/end2end/test_end2end_cnv_w2a2.py
new file mode 100644
index 0000000000000000000000000000000000000000..31ccebd4c175ad2badef17499bf113d978b637f7
--- /dev/null
+++ b/tests/end2end/test_end2end_cnv_w2a2.py
@@ -0,0 +1,377 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+
+import numpy as np
+
+# as of Feb'20 there is a bug that segfaults ONNX shape inference if we
+# import pytorch before onnx, so we make sure to import onnx first
+import onnx  # NOQA
+
+import pytest
+import pkg_resources as pk
+from finn.custom_op.registry import getCustomOp
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.double_to_single_float import DoubleToSingleFloat
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.streamline import Streamline
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
+from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
+from finn.util.basic import pynq_part_map
+from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
+from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.core.throughput_test import throughput_test_rtlsim
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 10
+mem_mode = "decoupled"
+
+
+def test_end2end_cnv_w2a2_export():
+    import brevitas.onnx as bo
+
+    cnv = get_test_model_trained("CNV", 2, 2)
+    bo.export_finn_onnx(
+        cnv, (1, 3, 32, 32), build_dir + "/end2end_cnv_w2a2_export.onnx"
+    )
+
+
+def test_end2end_cnv_w2a2_import_and_tidy():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_export.onnx")
+    model = model.transform(DoubleToSingleFloat())
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model.save(build_dir + "/end2end_cnv_w2a2_tidy.onnx")
+
+
+def test_end2end_cnv_w2a2_streamline():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_tidy.onnx")
+    model = model.transform(Streamline())
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(MakeMaxPoolNHWC())
+    model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
+    model = model.transform(Streamline())
+    model.save(build_dir + "/end2end_cnv_w2a2_streamlined.onnx")
+
+
+def test_end2end_cnv_w2a2_convert_to_hls_layers():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_streamlined.onnx"
+    )
+    model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
+    model = model.transform(to_hls.InferConvInpGen())
+    model = model.transform(to_hls.InferStreamingMaxPool())
+    model = model.transform(RemoveCNVtoFCFlatten())
+    model.save(build_dir + "/end2end_cnv_w2a2_hls_layers.onnx")
+
+
+def test_end2end_cnv_w2a2_create_dataflow_partition():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_hls_layers.onnx"
+    )
+    parent_model = model.transform(CreateDataflowPartition())
+    parent_model.save(build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx")
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
+    dataflow_model.save(build_dir + "/end2end_cnv_w2a2_dataflow_model.onnx")
+
+
+def test_end2end_cnv_w2a2_fold_and_tlastmarker():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_dataflow_model.onnx"
+    )
+    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
+    # each tuple is (PE, SIMD, in_fifo_depth, ram_style) for a layer
+    folding = [
+        (8, 3, 256, "auto"),
+        (16, 16, 256, "auto"),
+        (8, 16, 256, "auto"),
+        (8, 16, 256, "block"),
+        (4, 8, 214, "auto"),
+        (1, 8, 2, "auto"),
+        (1, 2, 126, "distributed"),
+        (2, 2, 62, "block"),
+        (5, 1, 6, "distributed"),
+    ]
+    for fcl, (pe, simd, ififodepth, ramstyle) in zip(fc_layers, folding):
+        fcl_inst = getCustomOp(fcl)
+        fcl_inst.set_nodeattr("PE", pe)
+        fcl_inst.set_nodeattr("SIMD", simd)
+        fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
+        fcl_inst.set_nodeattr("ram_style", ramstyle)
+
+    swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
+    swg_idepth = [2, 51, 9, 106, 2, 2]
+    for i in range(len(swg_layers)):
+        swg_inst = getCustomOp(swg_layers[i])
+        simd = folding[i][1]
+        swg_inst.set_nodeattr("SIMD", simd)
+        swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i])
+
+    model = model.transform(InsertDWC())
+    model = model.transform(InsertFIFO())
+    model = model.transform(InsertTLastMarker())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(AnnotateResources("estimate"))
+    model.save(build_dir + "/end2end_cnv_w2a2_folded.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_gen_hls_ip():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_folded.onnx")
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(AnnotateResources("hls"))
+    model.save(build_dir + "/end2end_cnv_w2a2_ipgen.onnx")
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_ip_stitch():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipgen.onnx")
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    model.save(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx")
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_verify_dataflow_part():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx")
+    x = np.zeros((1, 32, 32, 3), dtype=np.float32)
+    inp_name = model.graph.input[0].name
+    out_name = model.graph.output[0].name
+    inp_dict = {inp_name: x}
+    # cppsim
+    model = model.transform(PrepareCppSim())
+    model = model.transform(CompileCppSim())
+    model = model.transform(SetExecMode("cppsim"))
+    model.save(build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx")
+    ret_cppsim = execute_onnx(model, inp_dict, True)
+    res_cppsim = ret_cppsim[out_name]
+    # node-by-node rtlsim
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareRTLSim())
+    model.save(build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx")
+    ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
+    res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
+    # whole-network (ip-stitched) rtlsim
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    model.save(build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx")
+    # particularly long-running test; set liveness threshold to unlimited
+    os.environ["LIVENESS_THRESHOLD"] = "-1"
+    ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
+    res_rtlsim_whole = ret_rtlsim_whole[out_name]
+    assert np.isclose(res_cppsim, res_rtlsim_nodebynode).all()
+    assert np.isclose(res_cppsim, res_rtlsim_whole).all()
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_throughput_test_rtlsim():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx"
+    )
+    model.set_metadata_prop("rtlsim_trace", "rtlsim_trace.vcd")
+    # os.environ["RTLSIM_TRACE_DEPTH"] = "4"
+    # run a batch of 10 inputs through the IP-stitched rtlsim and
+    # check the number of cycles it takes to execute
+    ret = throughput_test_rtlsim(model, 10)
+    # TODO check for expected performance
+    assert ret["cycles"] > 0
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_verify_all():
+    # use the streamlined model as the "golden" model for right answers
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_streamlined.onnx"
+    )
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    # load one of the test vectors
+    fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
+    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+    input_tensor = input_tensor / 255
+    assert input_tensor.shape == (1, 3, 32, 32)
+    x = input_tensor
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up parent+child graph to test
+    # we'll use models from the previous step as the child model
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx"
+    )
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    # produce results with cppsim
+    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    sdp_node = getCustomOp(sdp_node)
+    load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx")
+    sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx")
+    ret_cppsim = execute_onnx(parent_model, {iname: x}, True)
+    y_cppsim = ret_cppsim[oname]
+    # produce results with node-by-node rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx"
+    )
+    sdp_node.set_nodeattr(
+        "model", build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx"
+    )
+    ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True)
+    y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname]
+    # produce results with whole-network (stitched ip) rtlsim
+    load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx"
+    )
+    sdp_node.set_nodeattr(
+        "model", build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx"
+    )
+    # particularly long-running test; set liveness threshold to unlimited
+    os.environ["LIVENESS_THRESHOLD"] = "-1"
+    ret_whole_rtlsim = execute_onnx(parent_model, {iname: x}, True)
+    y_whole_rtlsim = ret_whole_rtlsim[oname]
+    assert np.isclose(y_golden, y_cppsim).all()
+    assert np.isclose(y_golden, y_nodebynode_rtlsim).all()
+    assert np.isclose(y_golden, y_whole_rtlsim).all()
+    assert np.argmax(y_golden) == 3
+
+
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_make_pynq_proj():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx")
+    model = model.transform(MakePYNQProject(test_pynq_board))
+    model.save(build_dir + "/end2end_cnv_w2a2_pynq_project.onnx")
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_end2end_cnv_w2a2_synth_pynq_project():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_pynq_project.onnx"
+    )
+    model = model.transform(SynthPYNQProject())
+    model = model.transform(AnnotateResources("synth"))
+    model.save(build_dir + "/end2end_cnv_w2a2_synth.onnx")
+
+
+def test_end2end_cnv_w2a2_make_driver():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_synth.onnx")
+    model = model.transform(MakePYNQDriver())
+    model.save(build_dir + "/end2end_cnv_w2a2_pynq_driver.onnx")
+
+
+def test_end2end_cnv_w2a2_deploy_on_pynq():
+    model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_pynq_driver.onnx"
+    )
+    try:
+        ip = os.environ["PYNQ_IP"]  # no fault for this one; skip if not defined
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        username = os.getenv("PYNQ_USERNAME", "xilinx")
+        password = os.getenv("PYNQ_PASSWORD", "xilinx")
+        port = os.getenv("PYNQ_PORT", 22)
+        target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn")
+        model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
+        # save the model to be able to link it to the parent
+        model.save(build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx")
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
+
+
+def test_end2end_cnv_w2a2_run_on_pynq():
+    # use the streamlined model as the "golden" model for right answers
+    golden = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_streamlined.onnx"
+    )
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    # load one of the test vectors
+    fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz")
+    input_tensor = np.load(fn)["arr_0"].astype(np.float32)
+    input_tensor = input_tensor / 255
+    assert input_tensor.shape == (1, 3, 32, 32)
+    x = input_tensor
+    # run using FINN-based execution
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up parent+child graph to test
+    # we'll use models from the previous step as the child model
+    parent_model = load_test_checkpoint_or_skip(
+        build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx"
+    )
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    try:
+        ip = os.environ["PYNQ_IP"]  # NOQA
+        if ip == "":
+            pytest.skip("PYNQ board IP address not specified")
+        # produce results with cppsim
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node = getCustomOp(sdp_node)
+        load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx")
+        sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx")
+        ret = execute_onnx(parent_model, {iname: x}, True)
+        y = ret[oname]
+        assert np.isclose(y, y_golden).all()
+        assert np.argmax(y) == 3
+
+    except KeyError:
+        pytest.skip("PYNQ board IP address not specified")
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f78dcea1a1ce364d0657ad64de7d440d41b822
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from onnx import TensorProto, helper
+import numpy as np
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.custom_op.registry import getCustomOp
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.infer_shapes import InferShapes
+
+
+def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt):
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim]
+    )
+
+    mp_node = helper.make_node(
+        "MaxPool",
+        ["inp"],
+        ["outp"],
+        kernel_shape=[k, k],
+        pads=[pad, pad, pad, pad],
+        strides=[stride, stride],
+    )
+    graph = helper.make_graph(
+        nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="mp-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+    model = model.transform(InferShapes())
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4])
+# pool configuration: (k, stride, pad, ifm_dim)
+@pytest.mark.parametrize(
+    "pool_config", [(3, 2, 0, 5), (3, 2, 1, 5), (2, 2, 0, 8), (5, 2, 2, 7)]
+)
+# input channels
+@pytest.mark.parametrize("ifm_ch", [1, 4, 20])
+# number of output channels computed in parallel
+@pytest.mark.parametrize("pe", [1, 4, 20])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+# pool type
+@pytest.mark.parametrize("op_type", ["MaxPool"])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_type):
+    k, stride, pad, ifm_dim = pool_config
+
+    if ifm_ch % pe != 0:
+        pytest.skip("ifm_ch%pe != 0. Skipping")
+
+    if pad != 0 and idt.signed():
+        pytest.skip("No support for pal_val != 0. Skipping")
+
+    np.random.seed(0)
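+    # standard pooling output size: floor((ifm_dim + 2 * pad - k) / stride) + 1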
+    ofm_dim = int(((ifm_dim + 2 * pad - k) / stride) + 1)
+
+    x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim))
+    # prepare input data
+    input_dict = prepare_inputs(x)
+    if op_type == "MaxPool":
+        model = make_single_maxpool_modelwrapper(
+            k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt
+        )
+    else:
+        assert False, "{} is not a supported op_type".format(op_type)
+
+    y_expected = oxe.execute_onnx(model, input_dict)["outp"]
+
+    new_model = model.transform(to_hls.InferPool_Batch())
+    new_model = new_model.transform(GiveUniqueNodeNames())
+
+    if ifm_ch != pe:
+        new_model = new_model.transform(to_hls.InferConvInpGen())
+        # Folding
+        for n in new_model.graph.node:
+            if n.op_type == "ConvolutionInputGenerator":
+                inst = getCustomOp(n)
+                inst.set_nodeattr("SIMD", pe)
+            elif n.op_type == "Pool_Batch":
+                inst = getCustomOp(n)
+                inst.set_nodeattr("PE", pe)
+
+    if exec_mode == "cppsim":
+        new_model = new_model.transform(SetExecMode("cppsim"))
+        new_model = new_model.transform(PrepareCppSim())
+        new_model = new_model.transform(CompileCppSim())
+    elif exec_mode == "rtlsim":
+        new_model = new_model.transform(SetExecMode("rtlsim"))
+        new_model = new_model.transform(GiveUniqueNodeNames())
+        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
+        new_model = new_model.transform(HLSSynthIP())
+        new_model = new_model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # execute new_model
+    y_produced = oxe.execute_onnx(new_model, input_dict)["outp"]
+    assert (y_produced == y_expected).all()
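+    # expected graph after conversion: Transpose -> [FMPadding_Batch] ->
+    # Im2Col/ConvolutionInputGenerator -> Pool_Batch -> Transpose, while
+    # pools with k == stride are left as the single original MaxPool node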
+    if stride != k:
+        if pad == 0 or ifm_ch == pe:
+            assert len(new_model.graph.node) == 4
+        else:
+            assert len(new_model.graph.node) == 5
+    else:
+        assert len(new_model.graph.node) == 1
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 9d6390b2673e5d2c0e72748183ac04ed222d078e..5ff3da87228a2a32a41226bb46e0b16b1a44df50 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -23,7 +23,7 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10
 
 
-def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
+def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style):
     assert pad_style == 2, "only pad_style == 2 supported in hlslib"
     assert padding > 0, "Output dim should be greater than input dim"
     odim = idim + padding
@@ -47,6 +47,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
         inputDataType=str(idt.name),
         PaddingStyle=pad_style,
         numInputVectors=1,
+        SIMD=simd,
     )
 
     graph = helper.make_graph(
@@ -63,11 +64,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
 
 
 # input image dimension
-@pytest.mark.parametrize("idim", [8, 16])
+@pytest.mark.parametrize("idim", [8])
 # number of rows and number of cols to add
 @pytest.mark.parametrize("pad", [2, 3])
 # number of channels
-@pytest.mark.parametrize("num_ch", [1, 2])
+@pytest.mark.parametrize("num_ch", [2, 4])
+# Input parallelism
+@pytest.mark.parametrize("simd", [1, 2])
 # PaddingStyle: selects behavior when (odim-idim)%2 != 0
 @pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
@@ -76,14 +79,15 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
-
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
+    if num_ch % simd != 0:
+        pytest.skip(" num_ch % simd != 0, skipping")
     # generate input data
     x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
     input_dict = {"inp": x}
     odim = idim + pad
 
-    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style)
+    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
similarity index 99%
rename from tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
rename to tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 61dd81b728aafcd8ccc812cf0cb4c27eff00f471..b830693c32afe629dd6fc70868d0bddacac4c887 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -300,7 +300,7 @@ def test_fpgadataflow_ipstitch_synth_ooc():
     assert ret["FF"] > 0
     assert ret["DSP"] == 0
     assert ret["BRAM"] == 0
-    assert ret["fmax_mz"] > 100
+    assert ret["fmax_mhz"] > 100
 
 
 @pytest.mark.vivado
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index 38f792ed3cdd52044b28b4c19ac0603da4e502e6..398a17132a2ef6c92e600102ff5c0b71a1f65aaa 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -92,7 +92,7 @@ def test_res_estimate():
     model = model.transform(GiveUniqueNodeNames())
     prod_resource_estimation = model.analysis(res_estimation)
     expect_resource_estimation = {
-        "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, "LUT": 304.4}
+        "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, 'BRAM_efficiency': 0.001736111111111111, "LUT": 304.4}
     }
 
     assert check_two_dict_for_equality(
diff --git a/tests/pynq/test_pynq_performance_end2end.py b/tests/pynq/test_pynq_performance_end2end.py
index 66a93a190061e0142637be19bb2ea841d192745a..3b6ea86741b8adefce4faaa65b791f1d213cf3ae 100644
--- a/tests/pynq/test_pynq_performance_end2end.py
+++ b/tests/pynq/test_pynq_performance_end2end.py
@@ -10,7 +10,7 @@ from finn.core.throughput_test import throughput_test
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 
 
-@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1"])
+@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1", "cnv_w2a2"])
 @pytest.mark.slow
 def test_pynq_performance_end2end(end2end_example):
     model = load_test_checkpoint_or_skip(
diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py
index 56dcd26076ec0a5fba6e9be6acac7f5e13572c3d..103967dfb6b86cc6e2ce2bc9ab78249d8945d47d 100644
--- a/tests/transformation/streamline/test_streamline_cnv.py
+++ b/tests/transformation/streamline/test_streamline_cnv.py
@@ -44,9 +44,9 @@ from finn.transformation.double_to_single_float import DoubleToSingleFloat
 export_onnx_path = make_build_dir("test_streamline_cnv_")
 
 # act bits
-@pytest.mark.parametrize("abits", [1])
+@pytest.mark.parametrize("abits", [1, 2])
 # weight bits
-@pytest.mark.parametrize("wbits", [1])
+@pytest.mark.parametrize("wbits", [1, 2])
 # network topology / size
 @pytest.mark.parametrize("size", ["CNV"])
 def test_streamline_cnv(size, wbits, abits):
@@ -74,6 +74,7 @@ def test_streamline_cnv(size, wbits, abits):
     # model.save("orig_cnv.onnx")
     model = model.transform(Streamline())
     # model.save("streamlined_cnv.onnx")
+    assert len(model.graph.node) == 23
     produced_ctx = oxe.execute_onnx(model, input_dict, True)
     produced = produced_ctx[model.graph.output[0].name]
     assert np.isclose(expected, produced, atol=1e-3).all()
diff --git a/tests/transformation/test_absorb_mul_into_topk.py b/tests/transformation/test_absorb_mul_into_topk.py
new file mode 100644
index 0000000000000000000000000000000000000000..1394220f7c336ccea8fe9c494734c4175bf2e847
--- /dev/null
+++ b/tests/transformation/test_absorb_mul_into_topk.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.insert_topk import InsertTopK
+from finn.transformation.streamline.absorb import AbsorbScalarMulIntoTopK
+import finn.core.onnx_exec as oxe
+
+# parameter indicating whether the Mul parameter is positive or negative
+@pytest.mark.parametrize("mul_positive", [True, False])
+# parameter indicating whether the Mul parameter is a scalar or a tensor
+@pytest.mark.parametrize("scalar", [True, False])
+def test_absorb_mul_into_topk(mul_positive, scalar):
+    if scalar is True:
+        shape = [1]
+    else:
+        shape = [1, 1, 1, 1000]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 1, 1, 1000])
+    a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, 1000])
+
+    mul_node = helper.make_node("Mul", ["inp", "a0"], ["outp"])
+    mul_graph = helper.make_graph(
+        nodes=[mul_node],
+        name="mul-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[a0],
+    )
+
+    model = helper.make_model(mul_graph, producer_name="mul_model")
+    model = ModelWrapper(model)
+    # initialize values
+    if mul_positive is True:
+        a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype(
+            np.float32
+        )
+    else:
+        a0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype(
+            np.float32
+        )
+    model.set_initializer("a0", a0_values)
+    model = model.transform(InsertTopK())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model_transformed = model.transform(AbsorbScalarMulIntoTopK())
+
+    # compare execution results
+    inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype(
+        np.float32
+    )
+    idict = {"global_in": inp_values}
+    odict = oxe.execute_onnx(model, idict, True)
+    y_indices = odict["global_out"]
+    y_values = odict["TopK_0_out0"]
+    odict = oxe.execute_onnx(model_transformed, idict, True)
+    y_tr_indices = odict["global_out"]
+    y_tr_values = odict["TopK_0_out0"]
+
+    # the indices stay the same whether or not the model is transformed
+    assert (y_indices == y_tr_indices).all()
+
+    if scalar is True and mul_positive is True:
+        # the values change if the model was transformed
+        assert (y_values != y_tr_values).all()
+
+        # check the new node order: the Mul is absorbed and TopK comes first
+        assert model.graph != model_transformed.graph
+        assert len(model.graph.node) - 1 == len(model_transformed.graph.node)
+        assert model_transformed.graph.node[0].op_type == "TopK"
+
+    else:
+        assert (y_values == y_tr_values).all()
+        assert model.graph == model_transformed.graph
diff --git a/tests/transformation/test_change_datalayout.py b/tests/transformation/test_change_datalayout.py
new file mode 100644
index 0000000000000000000000000000000000000000..66459d574957575e61ec1bec631fb7030a27cca1
--- /dev/null
+++ b/tests/transformation/test_change_datalayout.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+from onnx import helper, TensorProto
+
+from finn.custom_op.maxpoolnhwc import compute_pool_output_dim
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
+from finn.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.util.basic import gen_finn_dt_tensor
+from finn.util.basic import get_by_name
+import finn.core.onnx_exec as oxe
+
+# stride
+@pytest.mark.parametrize("s", [1, 2])
+# kernel
+@pytest.mark.parametrize("k", [3, 4])
+# ibits
+@pytest.mark.parametrize("ibits", [4, 8])
+# obits
+@pytest.mark.parametrize("obits", [2, 4])
+# signed
+@pytest.mark.parametrize("signed", [False, True])
+# channels
+@pytest.mark.parametrize("c", [2, 3])
+# input dimension
+@pytest.mark.parametrize("idim", [6, 7])
+def test_change_datalayout_quantavgpool(s, k, ibits, obits, signed, c, idim):
+    n = 1
+    odim = compute_pool_output_dim(idim, k, s)
+    # determine input FINN datatype
+    if signed is True:
+        prefix = "INT"
+    else:
+        prefix = "UINT"
+    dt_name = prefix + str(ibits)
+    dtype = DataType[dt_name]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [n, c, idim, idim])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [n, c, odim, odim])
+
+    node = helper.make_node(
+        "QuantAvgPool2d",
+        ["inp"],
+        ["outp"],
+        domain="finn",
+        stride=s,
+        kernel=k,
+        ibits=ibits,
+        obits=obits,
+        signed=signed,
+        data_layout="NCHW",
+    )
+    graph = helper.make_graph(
+        nodes=[node], name="single-quantavgpool", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph)
+    model = ModelWrapper(model)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model_transformed = model.transform(ChangeDataLayoutQuantAvgPool2d())
+    model_transformed = model_transformed.transform(InferShapes())
+    model_transformed = model_transformed.transform(InferDataTypes())
+    model_transformed = model_transformed.transform(InferDataLayouts())
+    model_transformed = model_transformed.transform(GiveUniqueNodeNames())
+    model_transformed = model_transformed.transform(GiveReadableTensorNames())
+    inp_values = gen_finn_dt_tensor(dtype, [n, c, idim, idim])
+    idict = {"inp": inp_values}
+    assert oxe.compare_execution(model, model_transformed, idict)
+    assert len(model.graph.node) + 2 == len(model_transformed.graph.node)
+    assert model_transformed.graph.node[-1].op_type == "Transpose"
+    assert model_transformed.graph.node[0].op_type == "Transpose"
+    # check that the QuantAvgPool2d node has data_layout set correctly
+    node = model_transformed.graph.node[1]
+    d_layout = get_by_name(node.attribute, "data_layout").s.decode("UTF-8")
+    assert d_layout == "NHWC"
+    assert model_transformed.get_tensor_layout(node.input[0]) == DataLayout.NHWC
+    assert model_transformed.get_tensor_layout(node.output[0]) == DataLayout.NHWC
diff --git a/tests/transformation/test_conv_lowering.py b/tests/transformation/test_conv_lowering.py
index 2cbc8e558940517168678b05c3bb46af8170abce..73891ded1b9691c7c48a2075ad6ca4668fcf6bfe 100644
--- a/tests/transformation/test_conv_lowering.py
+++ b/tests/transformation/test_conv_lowering.py
@@ -26,12 +26,13 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import onnx.helper as oh
+from onnx import TensorProto
 import os
 import pkg_resources as pk
 import brevitas.onnx as bo
 import numpy as np
 
-
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.infer_shapes import InferShapes
@@ -65,3 +66,51 @@ def test_conv_lowering_cnv_w1a1():
     assert np.isclose(produced, expected).all()
     assert np.argmax(produced) == 3
     os.remove(export_onnx_path)
+
+
+def test_conv_lowering_conv_1x1():
+    np.random.seed(0)
+
+    in_feature_dim = 7
+    in_chn = 3
+    kernel_size = 1
+    out_feature_dim = in_feature_dim
+
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+
+    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [1, 1]
+
+    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+
+    value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)]
+
+    modelproto = oh.make_model(
+        oh.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[oh.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+    model.set_initializer("p1", np.random.rand(*conv_param_shape).astype(np.float32))
+
+    new_model = model.transform(LowerConvsToMatMul())
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    assert oxe.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "Transpose"
+    assert new_model.graph.node[1].op_type == "MatMul"
+    assert new_model.graph.node[2].op_type == "Transpose"
+    assert len(new_model.graph.node) == 3
diff --git a/tests/transformation/test_move_maxpool_past_multithreshold.py b/tests/transformation/test_move_maxpool_past_multithreshold.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fc19debf8d6fc89d15e3d731f1e54daa491c321
--- /dev/null
+++ b/tests/transformation/test_move_maxpool_past_multithreshold.py
@@ -0,0 +1,100 @@
+from onnx import TensorProto, helper
+import numpy as np
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.streamline.reorder import MoveMaxPoolPastMultiThreshold
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+
+
+def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
+    if seed is not None:
+        np.random.seed(seed)
+    steps = np.random.rand(channels, 1) * 2
+    bias = np.random.rand(channels, 1) * 10
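+    # per-channel thresholds: evenly spaced values shifted by a random bias
+    # and scaled by a random step size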
+    thres = [np.arange(num_of_thres) for chn in range(channels)]
+    thres = ((thres - bias) * steps).astype(np.float32)
+    return thres
+
+
+def test_move_maxpool_past_multithreshold():
+    # generate test vectors of correct shape
+    ch = 64
+    ifmdim = 16
+    ofmdim = ifmdim // 4  # two stride-2 MaxPools: 16 -> 8 -> 4
+    input_shape = (1, ch, ifmdim, ifmdim)
+    output_shape = (1, ch, ofmdim, ofmdim)
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+
+    maxpool_config = {}
+    maxpool_config["pads"] = [1, 1, 1, 1]
+    maxpool_config["kernel_shape"] = [3, 3]
+    maxpool_config["strides"] = [2, 2]
+
+    value_info = []
+    thres1_shape = [1, 1]
+    value_info += [
+        helper.make_tensor_value_info("thres1", TensorProto.FLOAT, thres1_shape)
+    ]
+
+    thres2_shape = [ch, 14]
+    value_info += [
+        helper.make_tensor_value_info("thres2", TensorProto.FLOAT, thres2_shape)
+    ]
+
+    nodes = []
+    nodes += [helper.make_node("MaxPool", ["top_in"], ["t1"], **maxpool_config)]
+    nodes += [
+        helper.make_node(
+            "MultiThreshold",
+            ["t1", "thres1"],
+            ["t2"],
+            domain="finn",
+            out_dtype="BIPOLAR",
+            out_bias=-1.0,
+            out_scale=1.0,
+        )
+    ]
+    nodes += [helper.make_node("MaxPool", ["t2"], ["t3"], **maxpool_config)]
+    nodes += [
+        helper.make_node(
+            "MultiThreshold",
+            ["t3", "thres2"],
+            ["top_out"],
+            domain="finn",
+            out_dtype="UINT4",
+        )
+    ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=nodes,
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    model.set_initializer("thres1", np.array([[0]]))
+    model.set_initializer(
+        "thres2", get_multithreshold_rand_params(*thres2_shape, seed=0)
+    )
+
+    # Transform
+    new_model = model.transform(MoveMaxPoolPastMultiThreshold())
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    # Test
+    assert oxe.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "MaxPool"
+    assert new_model.graph.node[1].op_type == "MultiThreshold"
+    assert new_model.graph.node[2].op_type == "MultiThreshold"
+    assert new_model.graph.node[3].op_type == "MaxPool"
+    assert len(new_model.graph.node) == 4
diff --git a/tests/transformation/test_move_mul_past_dw_conv.py b/tests/transformation/test_move_mul_past_dw_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae8fbfe89986d58d3d71f5f8735a98469d9d1e3
--- /dev/null
+++ b/tests/transformation/test_move_mul_past_dw_conv.py
@@ -0,0 +1,93 @@
+import pytest
+
+from onnx import helper, TensorProto
+from finn.custom_op.im2col import compute_conv_output_dim
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.streamline.reorder import MoveMulPastDWConv
+
+
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [4, 7])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [2, 3])
+# kernel size
+@pytest.mark.parametrize("k", [2, 3])
+# stride
+@pytest.mark.parametrize("stride", [1, 2])
+# padding
+@pytest.mark.parametrize("pad_amt", [0, 1])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
+def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw):
+    if dw == 1:
+        ofm_ch = ifm_ch
+        groups = ifm_ch
+        W_shape = [ofm_ch, 1, k, k]
+    else:
+        ofm_ch = ifm_ch + 2
+        groups = 1
+        W_shape = [ofm_ch, ifm_ch, k, k]
+
+    ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad_amt)
+
+    # set up onnx model
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]
+    )
+    mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [1, ifm_ch, 1, 1])
+    W = helper.make_tensor_value_info("W", TensorProto.FLOAT, W_shape)
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim]
+    )
+
+    Mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"])
+
+    Conv_node = helper.make_node(
+        "Conv",
+        ["mul_out", "W"],
+        ["outp"],
+        group=groups,
+        kernel_shape=[k, k],
+        pads=[pad_amt, pad_amt, pad_amt, pad_amt],
+        strides=[stride, stride],
+    )
+
+    graph = helper.make_graph(
+        nodes=[Mul_node, Conv_node],
+        name="mulpastconv_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[mul, W],
+    )
+
+    model = helper.make_model(graph, producer_name="mulpastconv-model")
+    model = ModelWrapper(model)
+    inp_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, ifm_dim, ifm_dim])
+    mul_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, 1, 1])
+    W_values = gen_finn_dt_tensor(DataType.INT2, W_shape)
+    model.set_initializer("W", W_values)
+    model.set_initializer("mul", mul_values)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    idict = {"inp": inp_values}
+    odict = oxe.execute_onnx(model, idict, True)
+    out_before = odict["outp"]
+
+    # move channelwise multiplication past depthwise conv
+    model_transformed = model.transform(MoveMulPastDWConv())
+    odict = oxe.execute_onnx(model_transformed, idict, True)
+    out_after = odict["outp"]
+
+    assert (out_before == out_after).all()
+
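+    # the channelwise Mul only commutes with depthwise convs, where each output
+    # channel depends on a single input channel; standard convs mix channels,
+    # so there the node order must remain unchanged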
+    if dw == 0:
+        assert model.graph.node[0].op_type == model_transformed.graph.node[0].op_type
+        assert model.graph.node[1].op_type == model_transformed.graph.node[1].op_type
+    else:
+        assert model.graph.node[0].op_type == model_transformed.graph.node[1].op_type
+        assert model.graph.node[1].op_type == model_transformed.graph.node[0].op_type
diff --git a/tests/transformation/test_remove_identity_ops.py b/tests/transformation/test_remove_identity_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..536c1ab0b48fa44388da23f45b528da3c5f3b2f2
--- /dev/null
+++ b/tests/transformation/test_remove_identity_ops.py
@@ -0,0 +1,81 @@
+import pytest
+
+import numpy as np
+from onnx import helper, TensorProto
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline.remove import RemoveIdentityOps
+from finn.util.basic import gen_finn_dt_tensor
+
+
+def insert_identity_op(model, op):
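+    # Add/Sub with 0 and Mul/Div with 1 leave their input unchanged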
+    if op in ["Add", "Sub"]:
+        val = np.asarray([0.0], dtype=np.float32)
+    elif op in ["Mul", "Div"]:
+        val = np.asarray([1.0], dtype=np.float32)
+    else:
+        return
+
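+    # splice the identity op between the Div and the final MatMul and rewire
+    # the MatMul (last node) to consume the identity's output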
+    identity_node = helper.make_node(op, ["div_out", "value"], ["ident_out"])
+    graph = model.graph
+    graph.node.insert(3, identity_node)
+    graph.node[-1].input[0] = "ident_out"
+    model.set_initializer("value", val)
+
+    return model
+
+
+# identity operations to be inserted
+@pytest.mark.parametrize("op", ["Add", "Sub", "Mul", "Div"])
+def test_remove_identity_ops(op):
+
+    # set up onnx model
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4, 1, 1])
+    mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [])
+    shape = helper.make_tensor_value_info("shape", TensorProto.FLOAT, [2])
+    div = helper.make_tensor_value_info("div", TensorProto.FLOAT, [])
+    matmul = helper.make_tensor_value_info("matmul", TensorProto.FLOAT, [4, 2])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 2])
+
+    mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"])
+    reshape_node = helper.make_node("Reshape", ["mul_out", "shape"], ["reshape_out"])
+    div_node = helper.make_node("Div", ["reshape_out", "div"], ["div_out"])
+    matmul_node = helper.make_node("MatMul", ["div_out", "matmul"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[mul_node, reshape_node, div_node, matmul_node],
+        name="identity-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[mul, shape, div, matmul],
+    )
+
+    model = helper.make_model(graph, producer_name="mulpastconv-model")
+    model = ModelWrapper(model)
+    inp_values = gen_finn_dt_tensor(DataType.INT2, [1, 4, 1, 1])
+    mul_values = np.random.uniform(low=0.1, high=0.99, size=(1,)).astype(np.float32)
+    shape_values = np.asarray([1, -1], dtype=np.int64)
+    div_values = np.random.uniform(low=0.1, high=0.99, size=(1,)).astype(np.float32)
+    matmul_values = gen_finn_dt_tensor(DataType.INT2, [4, 2])
+    model.set_initializer("mul", mul_values)
+    model.set_initializer("shape", shape_values)
+    model.set_initializer("div", div_values)
+    model.set_initializer("matmul", matmul_values)
+    insert_identity_op(model, op)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    idict = {"inp": inp_values}
+    odict = oxe.execute_onnx(model, idict)
+    out_before = odict["outp"]
+    num_of_nodes_before = len(model.graph.node)
+
+    model = model.transform(RemoveIdentityOps())
+    num_of_nodes_after = len(model.graph.node)
+    assert num_of_nodes_before - 1 == num_of_nodes_after
+
+    odict = oxe.execute_onnx(model, idict)
+    out_after = odict["outp"]
+    assert (out_before == out_after).all()