diff --git a/.gitignore b/.gitignore
index f838c1695130d232ac6a2b888aed0cea31aafaa7..8b3166a44070a4575aac86c445c4504b594cda08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -78,3 +78,6 @@ MANIFEST
 
 # Jenkins cfg dir
 /docker/jenkins_home
+
+# SSH key dir mounted into Docker
+/ssh_keys/
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index 22e3eb623c7a5da19a5e3ae2284557577898ad23..0e12b504a26ccdb8fd78e162f04cfdeab5a186f1 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -42,7 +42,7 @@ WORKDIR /workspace
 RUN apt-get update
 RUN apt-get -y upgrade
 RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev
-RUN apt-get install -y verilator nano zsh
+RUN apt-get install -y verilator nano zsh rsync
 RUN apt-get -y install sshpass
 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 6ad4adc352ca5d71993ee8118568df20dba00810..2a6e41797307f350a8827050f9d027e72d547e57 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -13,9 +13,9 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-BREVITAS_COMMIT=989cdfdba4700fdd900ba0b25a820591d561c21a
+BREVITAS_COMMIT=026a509186b7e7b0b65d46a2f905043d41069306
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=afcfe75f3404249bddeeb3f15df65bd1fcb1072e
+HLSLIB_COMMIT=8f9f2018762f654f196b666838aeaf6fc730ad9a
 PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 95594bb67a2be3a4c3fbba488c75a704f623c136..feba21ea20d6445b88f96cf8ed2ca57598e614f4 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -18,6 +18,7 @@ Requirements
 * A working Vivado 2019.1 installation
 * A `VIVADO_PATH` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located)
 * (optional) A PYNQ board with a network connection
+   * the ``bitstring`` package must be installed on the PYNQ: ``sudo pip install bitstring``
 
 Running FINN in Docker
 ======================
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index 7a4bc687eeb827320991f7d3f1ef8cc35e97f3da..dee62f09a9253380e05300dac8fa34915c20dab5 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -16,6 +16,10 @@ Custom Quantization Annotations
 
 ONNX does not support datatypes smaller than 8-bit integers, whereas in FINN we are interested in smaller integers down to ternary and bipolar. To make this work, FINN uses the quantization_annotation field in ONNX to annotate tensors with their FINN DataType (:py:mod:`finn.core.datatype.DataType`) information. However, all tensors are expected to use single-precision floating point (float32) storage in FINN. This means we store even a 1-bit value as floating point for the purposes of representation. The FINN compiler flow is responsible for eventually producing a packed representation for the target hardware, where the 1-bit is actually stored as 1-bit.
 
+Note that FINN uses floating point tensors as a carrier data type to represent integers. Floating point arithmetic can introduce rounding errors, e.g. (int_num * float_scale) / float_scale is not always equal to int_num.
+When using the custom ONNX execution flow, FINN will attempt to sanitize any rounding errors for integer tensors. See :py:func:`finn.util.basic.sanitize_quant_values` for more information.
+This behavior can be disabled (not recommended!) by setting the environment variable SANITIZE_QUANT_TENSORS=0.
+
 Custom Operations/Nodes
 =======================
 
diff --git a/run-docker.sh b/run-docker.sh
index e07556716db335421f57a390f1e6a17168ac058b..00ca8f86985a78d8f2af099c51dcd4b80cd2e974 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -65,6 +65,11 @@ DOCKER_INST_NAME="finn_dev_${DOCKER_UNAME}"
 # ensure Docker tag and inst. name are all lowercase
 DOCKER_TAG=$(echo "$DOCKER_TAG" | tr '[:upper:]' '[:lower:]')
 DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]')
+# Absolute path to this script, e.g. /home/user/bin/foo.sh
+SCRIPT=$(readlink -f "$0")
+# Absolute path this script is in, thus /home/user/bin
+SCRIPTPATH=$(dirname "$SCRIPT")
+
 # the settings below will be taken from environment variables if available,
 # otherwise the defaults below will be used
 : ${JUPYTER_PORT=8888}
@@ -74,11 +79,7 @@ DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]')
 : ${PYNQ_BOARD="Pynq-Z1"}
 : ${PYNQ_TARGET_DIR="/home/xilinx/$DOCKER_INST_NAME"}
 : ${NUM_DEFAULT_WORKERS=1}
-
-# Absolute path to this script, e.g. /home/user/bin/foo.sh
-SCRIPT=$(readlink -f "$0")
-# Absolute path this script is in, thus /home/user/bin
-SCRIPTPATH=$(dirname "$SCRIPT")
+: ${FINN_SSH_KEY_DIR="$SCRIPTPATH/ssh_keys"}
 
 BUILD_LOCAL=/tmp/$DOCKER_INST_NAME
 VIVADO_HLS_LOCAL=$VIVADO_PATH
@@ -87,6 +88,7 @@ VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache
 # ensure build dir exists locally
 mkdir -p $BUILD_LOCAL
 mkdir -p $VIVADO_IP_CACHE
+mkdir -p $FINN_SSH_KEY_DIR
 
 gecho "Instance is named as $DOCKER_INST_NAME"
 gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
@@ -133,6 +135,7 @@ docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \
 -v $SCRIPTPATH:/workspace/finn \
 -v $BUILD_LOCAL:$BUILD_LOCAL \
 -v $VIVADO_PATH:$VIVADO_PATH \
+-v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh \
 -e VIVADO_PATH=$VIVADO_PATH \
 -e FINN_INST_NAME=$DOCKER_INST_NAME \
 -e FINN_ROOT="/workspace/finn" \
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index c2f68a35076418e0cf2edb578bdb8d548772fc78..efdfaa19d9f9e5dfa41911a2184e989337b3d9c2 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -39,6 +39,7 @@ from finn.core.remote_exec import remote_exec
 from finn.core.rtlsim_exec import rtlsim_exec
 from finn.custom_op.registry import getCustomOp
 import finn.analysis.topology as ta
+from finn.util.basic import sanitize_quant_values, get_sanitize_quant_tensors
 
 
 def execute_node(node, context, graph):
@@ -102,10 +103,7 @@ def execute_node(node, context, graph):
                     raise Exception(
                         """Output shapes disagree after node execution:
                         found %s vs expected %s"""
-                        % (
-                            str(output_list[list_ind].shape),
-                            str(context[outp].shape),
-                        )
+                        % (str(output_list[list_ind].shape), str(context[outp].shape))
                     )
                 context[outp] = output_list[list_ind]
 
@@ -162,7 +160,17 @@ def execute_onnx(model, input_dict, return_full_exec_context=False):
         # we can simply walk down the list since the ONNX spec guarantees that it is
         # topologically sorted
         for node in graph.node:
+            if get_sanitize_quant_tensors() != 0:
+                # round input values to match quantization annotation
+                execution_context = sanitize_quant_values(
+                    model, node.input, execution_context
+                )
             execute_node(node, execution_context, graph)
+            if get_sanitize_quant_tensors() != 0:
+                # round output values to quantization annotation
+                execution_context = sanitize_quant_values(
+                    model, node.output, execution_context
+                )
     elif model_exec_mode == "remote_pynq":
         # use remote exec metadata built into model to execute on a remote PYNQ
         remote_exec(model, execution_context)
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
similarity index 88%
rename from src/finn/custom_op/fpgadataflow/fmpadding.py
rename to src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index fa321dfa65d14b67fa218fb6a49f602ddab8d57e..d326ae7dfc7830a0081c3b13233d67ef08b12eff 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -21,6 +21,8 @@ class FMPadding_Batch(HLSCustomOp):
             "Padding": ("i", True, 2),
             # number of channels in input image
             "NumChannels": ("i", True, 0),
+            # SIMD Input parallelism
+            "SIMD": ("i", False, 1),
             # FINN input datatype
             "inputDataType": ("s", True, ""),
             # controls distribution of padded pixels
@@ -55,20 +57,22 @@ class FMPadding_Batch(HLSCustomOp):
         return oshape
 
     def get_folded_input_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
-        ret = list(self.get_normal_input_shape())
-        ret.insert(-1, 1)
-        return tuple(ret)
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
 
     def get_folded_output_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
-        ret = list(self.get_normal_output_shape())
-        ret.insert(-1, 1)
-        return tuple(ret)
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
@@ -114,15 +118,13 @@ class FMPadding_Batch(HLSCustomOp):
 
     def get_instream_width(self):
         ibits = self.get_input_datatype().bitwidth()
-        num_ch = self.get_nodeattr("NumChannels")
-
-        return ibits * num_ch
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
 
     def get_outstream_width(self):
         obits = self.get_output_datatype().bitwidth()
-        num_ch = self.get_nodeattr("NumChannels")
-
-        return obits * num_ch
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
@@ -135,13 +137,15 @@ class FMPadding_Batch(HLSCustomOp):
         self.code_gen_dict["$DEFINES$"] = [
             """#define ImgDim1 {}\n#define OutputDim1 {}\n
             #define Padding1 {}\n#define NumChannels1 {}\n
-            #define PaddingStyle1 {}\n#define numReps {}\n""".format(
+            #define PaddingStyle1 {}\n#define numReps {}
+            #define SIMD1 {}\n""".format(
                 self.get_nodeattr("ImgDim"),
                 self.get_padded_odim(),
                 self.get_nodeattr("Padding"),
                 self.get_nodeattr("NumChannels"),
                 self.get_nodeattr("PaddingStyle"),
                 self.get_nodeattr("numInputVectors"),
+                self.get_nodeattr("SIMD"),
             )
         ]
 
@@ -176,7 +180,7 @@ class FMPadding_Batch(HLSCustomOp):
         in_t = self.get_input_datatype().get_hls_datatype_str()
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,
+            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
             {}, PaddingStyle1> (in0, out, numReps);""".format(
                 node.op_type, in_t
             )
@@ -232,6 +236,7 @@ class FMPadding_Batch(HLSCustomOp):
         node = self.onnx_node
         exp_ishape = self.get_normal_input_shape()
         exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
 
         if mode == "cppsim":
@@ -254,10 +259,8 @@ class FMPadding_Batch(HLSCustomOp):
         match expected shape (1, ImgDim, ImgDim, NumChannels)."""
         export_idt = self.get_input_datatype()
 
-        # no reshaping for input since assuming no folding on input
-        # make copy before saving array
-        inp = inp.copy()
-        np.save(os.path.join(code_gen_dir, "input_0.npy"), inp)
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
 
         if mode == "cppsim":
             # execute the precompiled model
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 25ea05e3607a52731ae1b64de421837bf137ee2b..17ba44b959577faf573d77ae222f7b2a3be6669d 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -30,20 +30,30 @@ from finn.custom_op.fpgadataflow import HLSCustomOp
 
 
 class TLastMarker(HLSCustomOp):
-    """Class that corresponds to the TLastMarker node that needs to be
-    inserted at the end of the model for rtlsim with stitched IP.
-    It marks the end of the current image/input sample."""
+    """Node that adds/removes AXI stream TLAST signals where needed. Its behavior
+    is transparent in node-by-node execution, only visible in IP-stitched rtlsim or
+    actual hardware.
+    This node may be needed at the end of the network to signal a DMA write (needed by the
+    FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
 
     def get_nodeattr_types(self):
         my_attrs = {
+            # number of (static) iterations until TLAST=1 is generated for Direction=out
             "NumIters": ("i", True, 0),
+            # whether static or dynamic (from AXI lite) number of iterations are used
+            "DynIters": ("i", False, 1),
+            # direction: whether to insert or remove TLAST
+            "Direction": ("s", False, "out"),
             # width of input-output data streams, in bits
             "StreamWidth": ("i", True, 0),
             # width of individual element in stream, in bits
             "ElemWidth": ("i", True, 0),
+            # Protocol: external or internal
+            # Vitis docs recommend using qdma_axis for external, ap_axiu for internal
+            "Protocol": ("s", False, "external"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -76,12 +86,33 @@ class TLastMarker(HLSCustomOp):
 
     def defines(self, var):
         stream_width = self.get_nodeattr("StreamWidth")
+        direction = self.get_nodeattr("Direction")
+        protocol = self.get_nodeattr("Protocol")
         # output stream must have TLAST, so we use this stream data type:
         # qdma_axis<stream_data_width,0,0,0 >
-        out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width
+        if direction == "out":
+            if protocol == "external":
+                out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width
+            elif protocol == "internal":
+                out_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width
+            else:
+                raise Exception("Unrecognized Protocol in TLastMarker")
+            in_stream_dtype = "ap_uint<%d>" % stream_width
+        elif direction == "in":
+            out_stream_dtype = "ap_uint<%d>" % stream_width
+            if protocol == "external":
+                in_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width
+            elif protocol == "internal":
+                in_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width
+            else:
+                raise Exception("Unrecognized Protocol in TLastMarker")
+        else:
+            raise Exception("Unrecognized Direction in TLastMarker")
+
         self.code_gen_dict["$DEFINES$"] = [
             "#define StreamWidth %d" % stream_width,
             "#define OutDType %s" % out_stream_dtype,
+            "#define InDType %s" % in_stream_dtype,
             "#define NumItersPerImg %d" % self.get_nodeattr("NumIters"),
         ]
 
@@ -89,27 +120,60 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$READNPYDATA$"] = []
 
     def docompute(self):
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            "unsigned int n = 1;",
-            "OutDType t;",
-            "t.set_keep(-1);",
-            "io_section: { // start of cycle accurate region",
-            "#pragma HLS protocol fixed",
-            "// do a first read from stream before we decide on numIters",
-            "// giving software a chance to set up the numIters prior to startup",
-            "t.set_data(in0.read());",
-            "n = (numIters == 0 ? NumItersPerImg : numIters);",
-            "t.set_last(n==1);",
-            "out.write(t);",
-            "} // end of cycle accurate region",
-            "// do one less iteration than spec since we already did one",
-            "for(unsigned int i=1; i<n; i++) {",
-            "#pragma HLS PIPELINE II=1",
-            "t.set_data(in0.read());",
-            "t.set_last(i==(n-1));",
-            "out.write(t);",
-            "}",
-        ]
+        dyn_iters = self.get_nodeattr("DynIters")
+        direction = self.get_nodeattr("Direction")
+        use_qdma_axis = self.get_nodeattr("Protocol") == "external"
+        if direction == "in":
+            # read from input and just pass data along; ignore tlast
+            # no dyn iters on input, it doesn't make sense
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "for(unsigned int i=0; i<NumItersPerImg; i++) {",
+                "#pragma HLS PIPELINE II=1",
+                "out.write(in0.read().get_data());"
+                if use_qdma_axis
+                else "out.write(in0.read().data);",
+                "}",
+            ]
+
+        elif dyn_iters == 1:
+            # output, with dynamic iteration counts
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "unsigned int n = 1;",
+                "OutDType t;",
+                "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;",
+                "io_section: { // start of cycle accurate region",
+                "#pragma HLS protocol fixed",
+                "// do a first read from stream before we decide on numIters",
+                "// giving software a chance to set up the numIters prior to startup",
+                "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();",
+                "n = (numIters == 0 ? NumItersPerImg : numIters);",
+                "t.set_last(n==1);" if use_qdma_axis else "t.last = (n==1);",
+                "out.write(t);",
+                "} // end of cycle accurate region",
+                "// do one less iteration than spec since we already did one",
+                "for(unsigned int i=1; i<n; i++) {",
+                "#pragma HLS PIPELINE II=1",
+                "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();",
+                "t.set_last(i==(n-1));" if use_qdma_axis else "t.last = (i==(n-1));",
+                "out.write(t);",
+                "}",
+            ]
+
+        else:
+            # output, with static iteration counts
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "unsigned int n = 1;",
+                "OutDType t;",
+                "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;",
+                "for(unsigned int i=0; i<NumItersPerImg; i++) {",
+                "#pragma HLS PIPELINE II=1",
+                "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();",
+                "t.set_last(i==(NumItersPerImg-1));"
+                if use_qdma_axis
+                else "t.last = (i==(NumItersPerImg-1));",
+                "out.write(t);",
+                "}",
+            ]
 
     def dataoutstrm(self):
         self.code_gen_dict["$DATAOUTSTREAM$"] = []
@@ -118,18 +182,30 @@ class TLastMarker(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void %s(hls::stream<ap_uint<StreamWidth> > &in0,
-                hls::stream<OutDType> &out, unsigned int numIters)"""
-            % self.onnx_node.name
-        ]
+        dyn_iters = self.get_nodeattr("DynIters")
+
+        if dyn_iters == 1:
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void %s(hls::stream<InDType> &in0,
+                    hls::stream<OutDType> &out, unsigned int numIters)"""
+                % self.onnx_node.name
+            ]
+        else:
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void %s(hls::stream<InDType> &in0, hls::stream<OutDType> &out)"""
+                % self.onnx_node.name
+            ]
 
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE s_axilite port=numIters bundle=control"
-        )
+
+        dyn_iters = self.get_nodeattr("DynIters")
+        if dyn_iters == 1:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE s_axilite port=numIters bundle=control"
+            )
+
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
@@ -158,7 +234,7 @@ class TLastMarker(HLSCustomOp):
     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+            'hls::stream<InDType> in0 ("in0");'
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<OutDType> out ("out");'
diff --git a/src/finn/custom_op/quantavgpool2d.py b/src/finn/custom_op/quantavgpool2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bc328a9f4f6670041d33491d58af6c553bafac9
--- /dev/null
+++ b/src/finn/custom_op/quantavgpool2d.py
@@ -0,0 +1,105 @@
+import numpy as np
+from onnx import TensorProto, helper
+import onnxruntime as rt
+
+from finn.custom_op import CustomOp
+from finn.core.datatype import DataType
+
+
+class QuantAvgPool2d(CustomOp):
+    """Class that corresponds to the quantized average pooling
+    layer from brevitas.
+
+    The node is executed by computing the sum over each pooling window and
+    right-shifting that sum so that it fits into ``obits`` bits."""
+
+    def get_nodeattr_types(self):
+        """Declare the attributes carried by a QuantAvgPool2d node."""
+        return {
+            # stride of the pooling window, used for both spatial dims
+            "stride": ("i", True, 1),
+            # edge length of the square pooling window
+            "kernel": ("i", True, 1),
+            # bit width assumed for the input values
+            "ibits": ("i", True, 1),
+            # bit width of the output values
+            "obits": ("i", True, 1),
+            # 0 selects an unsigned output datatype, anything else signed
+            "signed": ("i", True, 0),
+        }
+
+    def make_shape_compatible_op(self, model):
+        """Return a standard AveragePool node with the same input, output,
+        kernel size and stride as a shape-compatible stand-in for this node."""
+        node = self.onnx_node
+        k = self.get_nodeattr("kernel")
+        s = self.get_nodeattr("stride")
+        return helper.make_node(
+            "AveragePool",
+            inputs=[node.input[0]],
+            outputs=[node.output[0]],
+            kernel_shape=[k, k],
+            strides=[s, s],
+        )
+
+    def infer_node_datatype(self, model):
+        """Annotate the output tensor with UINT<obits> or INT<obits>,
+        depending on the signed attribute.
+
+        Raises an Exception if obits is not one of 2, 4, 8, 16 or 32."""
+        node = self.onnx_node
+        bw = self.get_nodeattr("obits")
+        if bw in [2, 4, 8, 16, 32]:
+            if self.get_nodeattr("signed") == 0:
+                dtype = DataType["UINT%d" % bw]
+            else:
+                dtype = DataType["INT%d" % bw]
+        else:
+            raise Exception("Unsupported output datatype for QuantAvgPool2d")
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def execute_node(self, context, graph):
+        """Compute the node output from context and store it back into
+        context under the output tensor name, as float32."""
+        # create a standard average pooling node to help calculate the result
+        node = self.onnx_node
+        k = self.get_nodeattr("kernel")
+        s = self.get_nodeattr("stride")
+        ishape = context[node.input[0]].shape
+        oshape = context[node.output[0]].shape
+        inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape)
+        outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape)
+        node_avgpool = helper.make_node(
+            "AveragePool",
+            inputs=[node.input[0]],
+            outputs=[node.output[0]],
+            kernel_shape=[k, k],
+            strides=[s, s],
+        )
+        graph_avgpool = helper.make_graph(
+            nodes=[node_avgpool],
+            name="single-avgpool-exec",
+            inputs=[inp],
+            outputs=[outp],
+        )
+        model_avgpool = helper.make_model(graph_avgpool)
+        idict = {node.input[0]: context[node.input[0]]}
+        sess = rt.InferenceSession(model_avgpool.SerializeToString())
+        result_temp = sess.run(None, idict)
+        # remove scaling introduced by average; the window sum is expected to
+        # be an integer (FINN carries integers in float tensors), so round away
+        # float error (e.g. 6.9999995) before the truncating cast below
+        result_temp = np.round(result_temp[0] * (k * k))
+        ibits = self.get_nodeattr("ibits")
+        max_value = 2 ** ibits - 1
+        max_value = max_value * k * k
+        max_bit_width = int(max_value).bit_length()
+        # a negative shift count is undefined for np.right_shift, so do not
+        # shift at all when the largest possible sum already fits into obits
+        shift_bits = max(max_bit_width - self.get_nodeattr("obits"), 0)
+        result = np.right_shift(result_temp.astype(int), shift_bits)
+        context[node.output[0]] = result.astype(np.float32)
+
+    def verify_node(self):
+        """No verification is implemented for this node."""
+        pass
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 6105b1342595fb083a194b6d0fc4af3fedada7ba..46d27472a9802a4c2a9004bb28c8bd09be8fbfdb 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -44,10 +44,11 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
-from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch
+from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
+from finn.custom_op.quantavgpool2d import QuantAvgPool2d
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
 from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
 
@@ -70,6 +71,7 @@ custom_op["FMPadding_Batch"] = FMPadding_Batch
 custom_op["Thresholding_Batch"] = Thresholding_Batch
 custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
+custom_op["QuantAvgPool2d"] = QuantAvgPool2d
 custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
 custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
 
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 32f32ece585a93465ba32fede45d5eb606a2b0a3..04dd437af27b9fbe18b2255c20a8e4acda03b3d0 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -31,23 +31,34 @@ from onnx import helper as oh
 
 from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
+from finn.util.basic import get_by_name
+
+import numpy as np
 
 
 class InsertTLastMarker(Transformation):
-    """Ensure that the graph is terminated with a TLastMarker node, inserting
-    one if necessary."""
+    """Ensure that the graph is started/terminated with a TLastMarker node, inserting
+    one if necessary. Use constructor args to determine type of TLastMarker to be inserted.
+    More information is available in the TLastMarker documentation.
+    """
 
-    def __init__(self):
+    def __init__(self, both=False, external=True, dynamic=True):
         super().__init__()
+        self.dyniters = dynamic
+        self.external = external
+        self.both = both
 
     def apply(self, model):
         # TODO only makes sense for a pure fpgadataflow graph -- check!
         graph_out_name = model.graph.output[0].name
         final_node = model.find_producer(graph_out_name)
-        if final_node.op_type == "TLastMarker":
-            # TODO maybe check the correctness of properties
-            return (model, False)
-        else:
+        graph_modified = False
+        if final_node.op_type != "TLastMarker" and not (
+            final_node.op_type == "IODMA"
+            and get_by_name(final_node.attribute, "direction").s.decode("UTF-8")
+            == "out"
+        ):
+
             custom_op = getCustomOp(final_node)
             num_iters = int(custom_op.get_number_output_values())
             stream_width = int(custom_op.get_outstream_width())
@@ -69,8 +80,51 @@ class InsertTLastMarker(Transformation):
                 NumIters=num_iters,
                 StreamWidth=stream_width,
                 ElemWidth=elem_width,
+                DynIters=(1 if self.dyniters else 0),
+                Direction="out",
+                Protocol=("external" if self.external else "internal"),
                 domain="finn",
                 backend="fpgadataflow",
             )
             model.graph.node.append(tlast_node)
-            return (model, True)
+            graph_modified = True
+        # if both is True, also insert marker on input
+        if self.both:
+            graph_in_name = model.graph.input[0].name
+            first_node = model.find_consumer(graph_in_name)
+            if first_node.op_type != "TLastMarker" and not (
+                first_node.op_type == "IODMA"
+                and get_by_name(first_node.attribute, "direction").s.decode("UTF-8")
+                == "in"
+            ):
+
+                custom_op = getCustomOp(first_node)
+                num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
+                stream_width = int(custom_op.get_instream_width())
+                in_shape = model.get_tensor_shape(graph_in_name)
+                in_dtype = model.get_tensor_datatype(graph_in_name)
+                elem_width = in_dtype.bitwidth()
+                # make new buffer
+                first_node_in = oh.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
+                )
+                model.graph.value_info.append(first_node_in)
+                model.set_tensor_datatype(first_node_in.name, in_dtype)
+                # reroute first node input to first_node_in
+                first_node.input[0] = first_node_in.name
+                tlast_node = oh.make_node(
+                    "TLastMarker",
+                    [graph_in_name],
+                    [first_node_in.name],
+                    NumIters=num_iters,
+                    StreamWidth=stream_width,
+                    ElemWidth=elem_width,
+                    DynIters=(1 if self.dyniters else 0),
+                    Direction="in",
+                    Protocol=("external" if self.external else "internal"),
+                    domain="finn",
+                    backend="fpgadataflow",
+                )
+                model.graph.node.insert(0, tlast_node)
+                graph_modified = True
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
index a1524322ec03a4e96ef41f999144e3eed349c5af..4f050be8540ddf5ef48699d1658b571852ff4510 100644
--- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py
+++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
@@ -29,9 +29,12 @@
 import os
 
 import finn.custom_op.registry as registry
-from finn.transformation import Transformation
 from finn.util.basic import make_build_dir
 from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.transformation import Transformation
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
+import copy
 
 
 def _codegen_single_node(node, model):
@@ -66,8 +69,40 @@ class PrepareCppSim(Transformation):
     that contains generated C++ code that can be used to simulate node using cppsim.
     The subsequent transformation is CompileCppSim"""
 
+    def __init__(self, num_workers=None):
+        """Set the number of worker processes used by apply().
+
+        None selects get_num_default_workers(), 0 selects mp.cpu_count().
+        Negative values are rejected by an assertion."""
+        super().__init__()
+        workers = get_num_default_workers() if num_workers is None else num_workers
+        assert workers >= 0, "Number of workers must be nonnegative."
+        self._num_workers = mp.cpu_count() if workers == 0 else workers
+
+    def prepareCppSim_node(self, node):
+        """Run code generation on fpgadataflow nodes; returns (node, False)."""
+        if is_fpgadataflow_node(node) is True:
+            _codegen_single_node(node, self.model)
+        return (node, False)
+
     def apply(self, model):
-        for node in model.graph.node:
-            if is_fpgadataflow_node(node) is True:
-                _codegen_single_node(node, model)
-        return (model, False)
+        """Run prepareCppSim_node on every node using a pool of worker processes."""
+        # prepareCppSim_node hands this snapshot of the model to the code generator
+        self.model = copy.deepcopy(model)
+        # detach all nodes from the model; pop() yields them in reverse order
+        detached = []
+        while len(model.graph.node) > 0:
+            detached.append(model.graph.node.pop())
+
+        # one task per node (chunksize=1), distributed over the pool
+        with mp.Pool(self._num_workers) as pool:
+            results = pool.map(self.prepareCppSim_node, detached, chunksize=1)
+
+        # reattach the returned nodes, undoing the reversal caused by pop()
+        run_again = False
+        for returned_node, rerun in reversed(results):
+            model.graph.node.append(returned_node)
+            if rerun is True:
+                run_again = True
+
+        return (model, run_again)
diff --git a/src/finn/transformation/infer_datatypes.py b/src/finn/transformation/infer_datatypes.py
index 1acd4e3abe2d77248810cf15c15475e806a3bd32..39b7a787be8c725e7b6d474757dd96fc4848dfe0 100644
--- a/src/finn/transformation/infer_datatypes.py
+++ b/src/finn/transformation/infer_datatypes.py
@@ -71,7 +71,13 @@ def _infer_node_datatype(model, node):
         else:
             # unknown, assume node produces float32 outputs
             for o in node.output:
-                model.set_tensor_datatype(o, DataType.FLOAT32)
+                # check if output datatype is already set to a value != FLOAT32
+                odtype = model.get_tensor_datatype(o)
+                if odtype is not None and odtype != DataType.FLOAT32:
+                    # don't change data type
+                    model.set_tensor_datatype(o, odtype)
+                else:
+                    model.set_tensor_datatype(o, DataType.FLOAT32)
     # compare old and new output dtypes to see if anything changed
     new_odtypes = list(map(lambda x: model.get_tensor_datatype(x), node.output))
     graph_modified = new_odtypes != odtypes
diff --git a/src/finn/transformation/lower_convs_to_matmul.py b/src/finn/transformation/lower_convs_to_matmul.py
index 3da785d8dd21b2c6701bffc8ce3869fb14b237a9..aa231a43a3865a161a501b4997ff2f538800554f 100644
--- a/src/finn/transformation/lower_convs_to_matmul.py
+++ b/src/finn/transformation/lower_convs_to_matmul.py
@@ -80,14 +80,19 @@ class LowerConvsToMatMul(Transformation):
                 inp_trans_out = inp_trans_out.name
                 model.set_tensor_datatype(inp_trans_out, idt)
 
-                im2col_out = helper.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    (1, ofm_dim, ofm_dim, ifm_ch * k * k),
-                )
-                graph.value_info.append(im2col_out)
-                im2col_out = im2col_out.name
-                model.set_tensor_datatype(im2col_out, idt)
+                need_im2col = True
+                if k == 1 and pad == 0 and stride == 1:
+                    need_im2col = False
+
+                if need_im2col:
+                    im2col_out = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        (1, ofm_dim, ofm_dim, ifm_ch * k * k),
+                    )
+                    graph.value_info.append(im2col_out)
+                    im2col_out = im2col_out.name
+                    model.set_tensor_datatype(im2col_out, idt)
 
                 matmul_out = helper.make_tensor_value_info(
                     model.make_new_valueinfo_name(),
@@ -104,19 +109,23 @@ class LowerConvsToMatMul(Transformation):
                     "Transpose", [cnv_input], [inp_trans_out], perm=[0, 2, 3, 1]
                 )
                 # lower input tensor
-                im2col_node = helper.make_node(
-                    "Im2Col",
-                    [inp_trans_out],
-                    [im2col_out],
-                    domain="finn",
-                    stride=stride,
-                    kernel_size=k,
-                    pad_amount=pad,
-                    input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
-                )
+                matmul_input = inp_trans_out
+                if need_im2col:
+                    matmul_input = im2col_out
+                    im2col_node = helper.make_node(
+                        "Im2Col",
+                        [inp_trans_out],
+                        [im2col_out],
+                        domain="finn",
+                        stride=stride,
+                        kernel_size=k,
+                        pad_amount=pad,
+                        input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
+                    )
+
                 # do matmul
                 matmul_node = helper.make_node(
-                    "MatMul", [im2col_out, weight_name], [matmul_out]
+                    "MatMul", [matmul_input, weight_name], [matmul_out]
                 )
                 # NHWC -> NCHW
                 out_trans_node = helper.make_node(
@@ -124,9 +133,13 @@ class LowerConvsToMatMul(Transformation):
                 )
                 # insert nodes where the conv is to preserve topological ordering
                 graph.node.insert(node_ind, inp_trans_node)
-                graph.node.insert(node_ind + 1, im2col_node)
-                graph.node.insert(node_ind + 2, matmul_node)
-                graph.node.insert(node_ind + 3, out_trans_node)
+                if need_im2col:
+                    graph.node.insert(node_ind + 1, im2col_node)
+                    graph.node.insert(node_ind + 2, matmul_node)
+                    graph.node.insert(node_ind + 3, out_trans_node)
+                else:
+                    graph.node.insert(node_ind + 1, matmul_node)
+                    graph.node.insert(node_ind + 2, out_trans_node)
                 # remove old nodes
                 graph.node.remove(n)
         model = model.transform(InferShapes())
diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py
index 67824ad4f633983b93e3178d03118927a1ddd85b..769bed841ce07c1c9c62f762de4b2c0937a6d68f 100644
--- a/src/finn/transformation/streamline/collapse_repeated.py
+++ b/src/finn/transformation/streamline/collapse_repeated.py
@@ -30,6 +30,7 @@ from onnx import helper as oh
 
 from finn.transformation import Transformation
 from finn.transformation.infer_shapes import InferShapes
+from finn.core.datatype import DataType
 
 
 class CollapseRepeatedOp(Transformation):
@@ -83,6 +84,9 @@ class CollapseRepeatedOp(Transformation):
                     graph.node.insert(node_ind, new_node)
                     # replace parameter value
                     model.set_initializer(new_node_param_name, new_param)
+                    # be conservative with param/output DataTypes
+                    model.set_tensor_datatype(new_node_param_name, DataType.FLOAT32)
+                    model.set_tensor_datatype(end_name, DataType.FLOAT32)
                     # remove old nodes
                     graph.node.remove(n)
                     graph.node.remove(consumer)
diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddc4233ddafbc70c4d20d316ea72ea6bba1b82a8
--- /dev/null
+++ b/src/finn/transformation/streamline/remove.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+from finn.transformation import Transformation
+from finn.transformation.infer_shapes import InferShapes
+import numpy as np
+
+class RemoveIdentityOps(Transformation):
+    """Remove identity ops like Add/Sub with zero or Mul/Div with one.
+
+    Only nodes that are neither fork nor join nodes and whose second input is
+    an initializer consisting entirely of the identity value are removed. The
+    output of the producer of the first input is rewired to the removed node's
+    output tensor. Nodes without a producer (fed by a graph input) are kept."""
+
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        # iterate over a snapshot, as nodes are removed from graph.node below
+        for n in list(graph.node):
+            if n.op_type in ["Add", "Sub"]:
+                identity_value = 0
+            elif n.op_type in ["Mul", "Div"]:
+                identity_value = 1
+            else:
+                continue
+            if model.is_fork_node(n) or model.is_join_node(n):
+                continue
+            A = model.get_initializer(n.input[1])
+            if A is None or not (A == identity_value).all():
+                continue
+            producer = model.find_producer(n.input[0])
+            if producer is None:
+                # nothing to rewire the output to, leave the node in place
+                continue
+            # remove node and wire output tensor to
+            # output of producer node
+            producer.output[0] = n.output[0]
+            graph.node.remove(n)
+            # report the change so that the graph_modified flag is truthful
+            graph_modified = True
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 0b6259a61d3eb67b7b38d4c6939019ce2893a875..b46b82c77a3f1b70a3b05d87cd3c48fc1d94fd45 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -27,12 +27,14 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
+import warnings
 from onnx import helper as oh
 
 from finn.transformation import Transformation
 from finn.transformation.infer_shapes import InferShapes
 from finn.core.onnx_exec import execute_node
 from finn.util.basic import get_by_name
+from finn.custom_op.registry import getCustomOp
 
 
 class MoveAddPastMul(Transformation):
@@ -531,3 +533,67 @@ class MoveMulPastFork(MoveOpPastFork):
 class MoveLinearPastFork(MoveOpPastFork):
     def __init__(self):
         super().__init__(["Add", "Mul"])
+
+
+class MoveMaxPoolPastMultiThreshold(Transformation):
+    """Move MaxPool nodes past MultiThreshold nodes on linear segments of the graph.
+
+    Padded MaxPool nodes feeding a signed-output MultiThreshold and MultiThreshold
+    nodes with a negative out_scale are skipped with a warning."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        nodes = [n for n in graph.node]
+        for n in nodes:
+            node_ind += 1
+            if n.op_type == "MaxPool" and not model.is_fork_node(n):
+                consumer = model.find_consumer(n.output[0])
+                pads = get_by_name(n.attribute, "pads")
+                # the MaxPool is padded as soon as any single pad entry is nonzero
+                has_padding = pads is not None and any(pads.ints)
+                if consumer is not None and consumer.op_type == "MultiThreshold":
+                    mt_out = consumer.output[0]
+                    mt_odt = model.get_tensor_datatype(mt_out)
+                    if mt_odt.signed() and has_padding:
+                        warnings.warn(
+                            "Skipping padded MaxPool + signed-output MultiThreshold"
+                        )
+                        continue
+                    # check for non-decreasing thresholds and nonnegative
+                    # scale factor in MultiThreshold
+                    # otherwise we cannot do the reordering
+                    T = model.get_initializer(consumer.input[1])
+                    T_sorted = np.sort(T, axis=1)
+                    assert (
+                        T == T_sorted
+                    ).all(), "MultiThreshold must have non-decreasing thresholds"
+                    mt_inst = getCustomOp(consumer)
+                    if mt_inst.get_nodeattr("out_scale") < 0:
+                        warnings.warn("Skipping MultiThreshold with negative out_scale")
+                        continue
+
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+
+                    # swap connections
+                    group_in = n.input[0]
+                    # new tensor because dims change
+                    group_middle = model.make_new_valueinfo_name()
+                    group_out = consumer.output[0]
+
+                    consumer.input[0] = group_in
+                    consumer.output[0] = group_middle
+
+                    n.input[0] = group_middle
+                    n.output[0] = group_out
+
+                    # insert them back in
+                    graph.node.insert(node_ind - 1, consumer)
+                    graph.node.insert(node_ind, n)
+                    graph_modified = True
+
+        model = model.transform(InferShapes())
+        return (model, graph_modified)
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index eb3d46bcd66e3dc307a679e6b8dfbb9913398d36..4a8277e08d3fc21e0b20668edf2ecad947b36647 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -31,6 +31,7 @@ import random
 import string
 import subprocess
 import tempfile
+import warnings
 
 import numpy as np
 
@@ -105,6 +106,25 @@ def get_finn_root():
         )
 
 
+def get_execution_error_thresh():
+    """Return the max error that is allowed for rounding in FINN execution.
+
+    The value is read from the ERROR_THRESH environment variable and falls
+    back to 1e-2 if that variable is not set."""
+    return float(os.environ.get("ERROR_THRESH", 1e-2))
+
+
+def get_sanitize_quant_tensors():
+    """Return whether tensors with quantization annotations should be sanitized.
+
+    Controlled by the SANITIZE_QUANT_TENSORS environment variable, which is
+    parsed as an integer. Sanitization is enabled (1) when the variable is not
+    set. Disabling it will yield faster ONNX execution but may give incorrect
+    results, so use with caution."""
+    # "1" keeps sanitization enabled by default
+    return int(os.environ.get("SANITIZE_QUANT_TENSORS", "1"))
+
+
 def make_build_dir(prefix=""):
     """Creates a temporary folder with given prefix to be used as a build dir.
     Use this function instead of tempfile.mkdtemp to ensure any generated files
@@ -264,6 +284,69 @@ def calculate_signed_dot_prod_range(dt_a, dt_b, len):
     return (min_prod, max_prod)
 
 
+def sanitize_quant_values(model, node_tensors, execution_context, check_values=False):
+    """Sanitize the given tensors in execution_context by rounding values that
+    are supposed to be integers (as indicated by their quantization annotation).
+
+    Tensors annotated as FLOAT32 are skipped. Any other tensor containing a
+    value that its FINN datatype does not allow is rounded as a whole, and a
+    warning is issued. An Exception is raised if the rounding error exceeds
+    get_execution_error_thresh(). Returns the sanitized execution context.
+
+    If check_values is specified, an extra DataType.allowed() check will be
+    performed on the sanitized tensors, raising an Exception on failure.
+
+    Background:
+    FINN uses floating point tensors as a carrier data type to represent
+    integers. Floating point arithmetic can introduce rounding errors, e.g.
+    (int_num * float_scale) / float_scale is not always equal to int_num.
+    We use this function to ensure that the values that are supposed to be
+    integers are indeed integers.
+    """
+
+    for tensor in node_tensors:
+        dtype = model.get_tensor_datatype(tensor)
+        # float tensors need no sanitization, skipping them saves runtime
+        if dtype == DataType.FLOAT32:
+            continue
+        original = execution_context[tensor]
+        # TODO: vectorize with numpy
+        needs_rounding = any(not dtype.allowed(v) for v in np.nditer(original))
+        if needs_rounding:
+            sanitized = np.round(original)
+            warnings.warn(
+                "The values of tensor {} can't be represented "
+                "with the set FINN datatype ({}), they will be rounded to match the "
+                "FINN datatype.".format(tensor, dtype)
+            )
+        else:
+            # all values are already allowed, keep the tensor untouched
+            sanitized = original
+        # check if rounded values are not too far from original values
+        max_error = max(np.abs(original - sanitized).flatten())
+        if max_error <= get_execution_error_thresh():
+            if check_values is True:
+                # check again if values can now be represented with set finn datatype
+                # TODO: vectorize with numpy
+                still_invalid = any(not dtype.allowed(v) for v in np.nditer(sanitized))
+                if still_invalid:
+                    raise Exception(
+                        """Values can't be represented with set
+                                finn datatype ({}) for input {}""".format(
+                            dtype, tensor
+                        )
+                    )
+            execution_context[tensor] = sanitized
+        else:
+            raise Exception(
+                """Rounding error is too high to match set FINN
+            datatype ({}) for input {}""".format(
+                    dtype, tensor
+                )
+            )
+    return execution_context
+
+
 class CppBuilder:
     """Builds the g++ compiler command to produces the executable of the c++ code
     in code_gen_dir which is passed to the function build() of this class."""
diff --git a/src/finn/util/vivado.py b/src/finn/util/vivado.py
index 0f82c52cb2c1fc5ee4ed5a1927f46e222e0ab9b5..6b6df3940cfeeed292345382471719c49f725de6 100644
--- a/src/finn/util/vivado.py
+++ b/src/finn/util/vivado.py
@@ -28,6 +28,7 @@
 
 import os
 import subprocess
+import stat
 from finn.util.basic import get_remote_vivado
 
 
@@ -91,6 +92,7 @@ def out_of_context_synth(
     vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name)
     res_counts_path = vivado_proj_folder + "/res.txt"
     if remote_server is not None:
+        print("Using remote Vivado OOC synth, remote server %s" % remote_server)
         run_synth = """
 #!/bin/bash
 which vivado;
@@ -105,14 +107,17 @@ cat %s
         )
         with open(vivado_proj_folder + "/run.sh", "w") as f:
             f.write(run_synth)
+        st = os.stat(vivado_proj_folder + "/run.sh")
+        os.chmod(vivado_proj_folder + "/run.sh", st.st_mode | stat.S_IEXEC)
         # note that this assumes the same temp folder can be created on the
         # remote server
-        remote_server_uri = remote_server + ":" + verilog_dir
-        copy_files = "rsync -avz %s %s" % (verilog_dir + "/", remote_server_uri + "/")
+        # note we set target path as / due to use of -R (relative)
+        remote_server_uri = remote_server + ":/"
+        copy_files = "rsync -avzR %s %s" % (verilog_dir + "/", remote_server_uri)
         copy_files = copy_files.split()
         proc = subprocess.Popen(copy_files, cwd=verilog_dir, env=os.environ)
         proc.communicate()
-        vivado_cmd = "bash %s/run.sh" % vivado_proj_folder
+        vivado_cmd = "bash -ic %s/run.sh" % vivado_proj_folder
         run_vivado = ["ssh", "-t", remote_server, vivado_cmd]
         proc = subprocess.Popen(run_vivado, cwd=verilog_dir, env=os.environ)
         proc.communicate()
diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..24854a2153df9af78feb8352ca119e831a9ac9eb
--- /dev/null
+++ b/tests/brevitas/test_brevitas_avg_pool_export.py
@@ -0,0 +1,103 @@
+import os
+
+import onnx  # noqa
+import torch
+import numpy as np
+import brevitas.onnx as bo
+from brevitas.nn import QuantAvgPool2d
+from brevitas.quant_tensor import pack_quant_tensor
+from brevitas.core.quant import QuantType
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.util.basic import gen_finn_dt_tensor
+import finn.core.onnx_exec as oxe
+
+import pytest
+
+export_onnx_path = "test_avg_pool.onnx"
+
+
+@pytest.mark.parametrize("kernel_size", [2, 3])
+@pytest.mark.parametrize("stride", [1, 2])
+@pytest.mark.parametrize("signed", [False, True])
+@pytest.mark.parametrize("bit_width", [2, 4])
+@pytest.mark.parametrize("input_bit_width", [4, 8, 32])
+@pytest.mark.parametrize("channels", [2, 4])
+@pytest.mark.parametrize("idim", [7, 8])
+def test_brevitas_avg_pool_export(
+    kernel_size, stride, signed, bit_width, input_bit_width, channels, idim
+):
+    """Compare FINN execution of an exported QuantAvgPool2d with Brevitas."""
+    ishape = (1, channels, idim, idim)
+    ibw_tensor = torch.Tensor([input_bit_width])
+    b_avgpool = QuantAvgPool2d(
+        kernel_size=kernel_size,
+        stride=stride,
+        signed=signed,
+        min_overall_bit_width=bit_width,
+        max_overall_bit_width=bit_width,
+        quant_type=QuantType.INT,
+    )
+    # all-zero input with scale = 1, given to the export to fix scale and bit width
+    input_tensor = torch.from_numpy(np.zeros(ishape)).float()
+    scale = np.ones((1, channels, 1, 1))
+    output_scale = torch.from_numpy(scale).float()
+    input_quant_tensor = pack_quant_tensor(
+        tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor
+    )
+    bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor)
+    model = ModelWrapper(export_onnx_path)
+
+    # determine input FINN datatype (half of input_bit_width bits, INT or UINT)
+    if signed is True:
+        prefix = "INT"
+    else:
+        prefix = "UINT"
+    dt_name = prefix + str(input_bit_width // 2)
+    dtype = DataType[dt_name]
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    # execution with input tensor using integers and scale = 1
+    # calculate golden output with the Brevitas layer in eval mode
+    inp = gen_finn_dt_tensor(dtype, ishape)
+    input_tensor = torch.from_numpy(inp).float()
+    input_quant_tensor = pack_quant_tensor(
+        tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor
+    )
+    b_avgpool.eval()
+    expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy()
+
+    # finn execution, integer case must match exactly
+    idict = {model.graph.input[0].name: inp}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+    assert (expected == produced).all()
+
+    # execution with input tensor using float and scale != 1
+    scale = np.random.uniform(low=0, high=1, size=(1, channels, 1, 1)).astype(
+        np.float32
+    )
+    inp_tensor = inp * scale
+    input_tensor = torch.from_numpy(inp_tensor).float()
+    input_scale = torch.from_numpy(scale).float()
+    input_quant_tensor = pack_quant_tensor(
+        tensor=input_tensor, scale=input_scale, bit_width=ibw_tensor
+    )
+    # export again to set the scale values correctly
+    bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    b_avgpool.eval()
+    expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy()
+    # finn execution
+    idict = {model.graph.input[0].name: inp_tensor}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+    # float-valued case: compare with a tolerance instead of exact equality
+    assert np.isclose(expected, produced).all()
+
+    os.remove(export_onnx_path)
diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py
index c9d8f2d812bc7bea1a2fd2598a7711099ad421e6..c5ddad12ca3e8d353682fbb20449d44358485f69 100644
--- a/tests/brevitas/test_brevitas_relu_act_export.py
+++ b/tests/brevitas/test_brevitas_relu_act_export.py
@@ -23,6 +23,7 @@ export_onnx_path = "test_act.onnx"
 def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type):
     min_val = -1.0
     ishape = (1, 15)
+
     b_act = QuantReLU(
         bit_width=abits,
         max_val=max_val,
@@ -67,3 +68,60 @@ scaling_impl.learned_value": torch.tensor(
 
     assert np.isclose(produced, expected, atol=1e-3).all()
     os.remove(export_onnx_path)
+
+
+@pytest.mark.parametrize("abits", [1, 2, 4, 8])
+@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)])
+@pytest.mark.parametrize("scaling_per_channel", [True, False])
+def test_brevitas_act_export_relu_imagenet(abits, max_val, scaling_per_channel):
+    """Export a QuantReLU with a loaded scaling parameter and compare the FINN
+    execution result against the Brevitas forward pass."""
+    out_channels = 32
+    ishape = (1, out_channels, 1, 1)
+    min_val = -1.0
+    b_act = QuantReLU(
+        bit_width=abits,
+        quant_type=QuantType.INT,
+        scaling_impl_type=ScalingImplType.PARAMETER,
+        scaling_per_channel=scaling_per_channel,
+        restrict_scaling_type=RestrictValueType.LOG_FP,
+        scaling_min_val=2e-16,
+        max_val=6.0,
+        return_quant_tensor=True,
+        per_channel_broadcastable_shape=(1, out_channels, 1, 1),
+    )
+    if scaling_per_channel is True:
+        rand_tensor = (2) * torch.rand((1, out_channels, 1, 1))
+    else:
+        rand_tensor = torch.tensor(1.2398)
+    checkpoint = {
+        "act_quant_proxy.fused_activation_quant_proxy.tensor_quant.\
+scaling_impl.learned_value": rand_tensor.type(
+            torch.FloatTensor
+        )
+    }
+    b_act.load_state_dict(checkpoint)
+    bo.export_finn_onnx(b_act, ishape, export_onnx_path)
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(InferShapes())
+    # run FINN on random float inputs, then Brevitas on the very same data
+    np_input = np.random.uniform(low=min_val, high=max_val, size=ishape)
+    np_input = np_input.astype(np.float32)
+    odict = oxe.execute_onnx(model, {model.graph.input[0].name: np_input}, True)
+    produced = odict[model.graph.output[0].name]
+    inp_tensor = torch.from_numpy(np_input).float()
+    b_act.eval()
+    expected = b_act.forward(inp_tensor).tensor.detach().numpy()
+    outputs_match = np.isclose(produced, expected, atol=1e-3).all()
+    if not outputs_match:
+        print(abits, max_val)
+        print("scale: ", b_act.quant_act_scale().type(torch.FloatTensor).detach())
+        if abits < 5:
+            thres = ", ".join(["{:8.4f}".format(x) for x in b_act.export_thres[0]])
+            print("thres:", thres)
+        print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]]))
+        print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]]))
+        print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]]))
+
+    assert outputs_match
+    os.remove(export_onnx_path)
diff --git a/tests/core/test_basic_onnx_exec.py b/tests/core/test_basic_onnx_exec.py
index a7b6da9965aa5912870812a8c1f8d6da2ee0d181..7b0412432cc6360cb9c42d66417bd187ed142563 100644
--- a/tests/core/test_basic_onnx_exec.py
+++ b/tests/core/test_basic_onnx_exec.py
@@ -35,6 +35,8 @@ import onnx.numpy_helper as np_helper
 import finn.core.onnx_exec as oxe
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.infer_shapes import InferShapes
+from finn.core.datatype import DataType
+from finn.util.basic import gen_finn_dt_tensor
 
 
 def test_mnist_onnx_download_extract_run():
@@ -53,3 +55,30 @@ def test_mnist_onnx_download_extract_run():
     assert np.isclose(
         np_helper.to_array(output_tensor), output_dict["Plus214_Output_0"], atol=1e-3
     ).all()
+
+
+def test_onnx_exec_internal_rounding():
+    """Check that ONNX execution is exact for slightly perturbed INT2 inputs."""
+    inp0 = onnx.helper.make_tensor_value_info("inp0", onnx.TensorProto.FLOAT, [2, 2])
+    inp1 = onnx.helper.make_tensor_value_info("inp1", onnx.TensorProto.FLOAT, [1])
+    outp = onnx.helper.make_tensor_value_info("outp", onnx.TensorProto.FLOAT, [2, 2])
+    mul_node = onnx.helper.make_node("Mul", inputs=["inp0", "inp1"], outputs=["outp"])
+    graph = onnx.helper.make_graph(
+        nodes=[mul_node], name="mul_graph", inputs=[inp0, inp1], outputs=[outp]
+    )
+    model = onnx.helper.make_model(graph, producer_name="mul-model")
+    model = ModelWrapper(model)
+    idt = DataType.INT2
+    model.set_tensor_datatype("inp0", idt)
+    model.set_tensor_datatype("inp1", idt)
+    # transform() returns the transformed model, so keep the result
+    model = model.transform(InferShapes())
+    mul_value = np.asarray([-1], dtype=np.float32)
+    inp_int = gen_finn_dt_tensor(idt, [2, 2])
+    scale = np.random.uniform(low=0, high=1, size=(2, 2)).astype(np.float32)
+    # scaling and unscaling with slightly different factors perturbs the integers
+    inp_rounded = (inp_int * scale) / (scale + 1e-7)
+    input_dict = {"inp0": inp_rounded, "inp1": mul_value}
+    output_dict = oxe.execute_onnx(model, input_dict)
+    expected = np.multiply(inp_int, mul_value)
+    assert (output_dict["outp"] == expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 9d6390b2673e5d2c0e72748183ac04ed222d078e..5ff3da87228a2a32a41226bb46e0b16b1a44df50 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -23,7 +23,7 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10
 
 
-def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
+def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style):
     assert pad_style == 2, "only pad_style == 2 supported in hlslib"
     assert padding > 0, "Output dim should be greater than input dim"
     odim = idim + padding
@@ -47,6 +47,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
         inputDataType=str(idt.name),
         PaddingStyle=pad_style,
         numInputVectors=1,
+        SIMD=simd,
     )
 
     graph = helper.make_graph(
@@ -63,11 +64,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
 
 
 # input image dimension
-@pytest.mark.parametrize("idim", [8, 16])
+@pytest.mark.parametrize("idim", [8])
 # number of rows and number of cols to add
 @pytest.mark.parametrize("pad", [2, 3])
 # number of channels
-@pytest.mark.parametrize("num_ch", [1, 2])
+@pytest.mark.parametrize("num_ch", [2, 4])
+# Input parallelism
+@pytest.mark.parametrize("simd", [1, 2])
 # PaddingStyle: selects behavior when (odim-idim)%2 != 0
 @pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
@@ -76,14 +79,15 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
-
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
+    if num_ch % simd != 0:
+        pytest.skip(" num_ch % simd != 0, skipping")
     # generate input data
     x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
     input_dict = {"inp": x}
     odim = idim + pad
 
-    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style)
+    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
similarity index 99%
rename from tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
rename to tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 61dd81b728aafcd8ccc812cf0cb4c27eff00f471..b830693c32afe629dd6fc70868d0bddacac4c887 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -300,7 +300,7 @@ def test_fpgadataflow_ipstitch_synth_ooc():
     assert ret["FF"] > 0
     assert ret["DSP"] == 0
     assert ret["BRAM"] == 0
-    assert ret["fmax_mz"] > 100
+    assert ret["fmax_mhz"] > 100
 
 
 @pytest.mark.vivado
diff --git a/tests/transformation/test_conv_lowering.py b/tests/transformation/test_conv_lowering.py
index 2cbc8e558940517168678b05c3bb46af8170abce..73891ded1b9691c7c48a2075ad6ca4668fcf6bfe 100644
--- a/tests/transformation/test_conv_lowering.py
+++ b/tests/transformation/test_conv_lowering.py
@@ -26,12 +26,13 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import onnx.helper as oh
+from onnx import TensorProto
 import os
 import pkg_resources as pk
 import brevitas.onnx as bo
 import numpy as np
 
-
 from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fold_constants import FoldConstants
 from finn.transformation.infer_shapes import InferShapes
@@ -65,3 +66,51 @@ def test_conv_lowering_cnv_w1a1():
     assert np.isclose(produced, expected).all()
     assert np.argmax(produced) == 3
     os.remove(export_onnx_path)
+
+
+def test_conv_lowering_conv_1x1():
+    np.random.seed(0)
+
+    in_feature_dim = 7
+    in_chn = 3
+    kernel_size = 1
+    out_feature_dim = in_feature_dim
+
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+
+    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [1, 1]
+
+    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+
+    value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)]
+
+    modelproto = oh.make_model(
+        oh.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[oh.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+    model.set_initializer("p1", np.random.rand(*conv_param_shape).astype(np.float32))
+
+    new_model = model.transform(LowerConvsToMatMul())
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    assert oxe.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "Transpose"
+    assert new_model.graph.node[1].op_type == "MatMul"
+    assert new_model.graph.node[2].op_type == "Transpose"
+    assert len(new_model.graph.node) == 3
diff --git a/tests/transformation/test_move_maxpool_past_multithreshold.py b/tests/transformation/test_move_maxpool_past_multithreshold.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fc19debf8d6fc89d15e3d731f1e54daa491c321
--- /dev/null
+++ b/tests/transformation/test_move_maxpool_past_multithreshold.py
@@ -0,0 +1,100 @@
+from onnx import TensorProto, helper
+import numpy as np
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.streamline.reorder import MoveMaxPoolPastMultiThreshold
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+
+
+def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
+    if seed is not None:
+        np.random.seed(seed)
+    steps = np.random.rand(channels, 1) * 2
+    bias = np.random.rand(channels, 1) * 10
+    thres = [np.arange(num_of_thres) for chn in range(channels)]
+    thres = ((thres - bias) * steps).astype(np.float32)
+    return thres
+
+
+def test_move_maxpool_past_multithreshold():
+    # generate test vectors of correct shape
+    ch = 64
+    ifmdim = 16
+    ofmdim = 16 // 4
+    input_shape = (1, ch, ifmdim, ifmdim)
+    output_shape = (1, ch, ofmdim, ofmdim)
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+
+    maxpool_config = {}
+    maxpool_config["pads"] = [1, 1, 1, 1]
+    maxpool_config["kernel_shape"] = [3, 3]
+    maxpool_config["strides"] = [2, 2]
+
+    value_info = []
+    thres1_shape = [1, 1]
+    value_info += [
+        helper.make_tensor_value_info("thres1", TensorProto.FLOAT, thres1_shape)
+    ]
+
+    thres2_shape = [ch, 14]
+    value_info += [
+        helper.make_tensor_value_info("thres2", TensorProto.FLOAT, thres2_shape)
+    ]
+
+    nodes = []
+    nodes += [helper.make_node("MaxPool", ["top_in"], ["t1"], **maxpool_config)]
+    nodes += [
+        helper.make_node(
+            "MultiThreshold",
+            ["t1", "thres1"],
+            ["t2"],
+            domain="finn",
+            out_dtype="BIPOLAR",
+            out_bias=-1.0,
+            out_scale=1.0,
+        )
+    ]
+    nodes += [helper.make_node("MaxPool", ["t2"], ["t3"], **maxpool_config)]
+    nodes += [
+        helper.make_node(
+            "MultiThreshold",
+            ["t3", "thres2"],
+            ["top_out"],
+            domain="finn",
+            out_dtype="UINT4",
+        )
+    ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=nodes,
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    model.set_initializer("thres1", np.array([[0]]))
+    model.set_initializer(
+        "thres2", get_multithreshold_rand_params(*thres2_shape, seed=0)
+    )
+
+    # Transform
+    new_model = model.transform(MoveMaxPoolPastMultiThreshold())
+    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+
+    # Test
+    assert oxe.compare_execution(model, new_model, inp_dict)
+    assert new_model.graph.node[0].op_type == "MaxPool"
+    assert new_model.graph.node[1].op_type == "MultiThreshold"
+    assert new_model.graph.node[2].op_type == "MultiThreshold"
+    assert new_model.graph.node[3].op_type == "MaxPool"
+    assert len(new_model.graph.node) == 4
diff --git a/tests/transformation/test_remove_identity_ops.py b/tests/transformation/test_remove_identity_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..536c1ab0b48fa44388da23f45b528da3c5f3b2f2
--- /dev/null
+++ b/tests/transformation/test_remove_identity_ops.py
@@ -0,0 +1,81 @@
+import pytest
+
+import numpy as np
+from onnx import helper, TensorProto
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline.remove import RemoveIdentityOps
+from finn.util.basic import gen_finn_dt_tensor
+
+
+def insert_identity_op(model, op):
+    if op in ["Add", "Sub"]:
+        val = np.asarray([0.0], dtype=np.float32)
+    elif op in ["Mul", "Div"]:
+        val = np.asarray([1.0], dtype=np.float32)
+    else:
+        return
+
+    identity_node = helper.make_node(op, ["div_out", "value"], ["ident_out"])
+    graph = model.graph
+    graph.node.insert(3, identity_node)
+    graph.node[-1].input[0] = "ident_out"
+    model.set_initializer("value", val)
+
+    return model
+
+
+# identity operations to be inserted
+@pytest.mark.parametrize("op", ["Add", "Sub", "Mul", "Div"])
+def test_remove_identity_ops(op):
+
+    # set up onnx model
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4, 1, 1])
+    mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [])
+    shape = helper.make_tensor_value_info("shape", TensorProto.FLOAT, [2])
+    div = helper.make_tensor_value_info("div", TensorProto.FLOAT, [])
+    matmul = helper.make_tensor_value_info("matmul", TensorProto.FLOAT, [4, 2])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 2])
+
+    mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"])
+    reshape_node = helper.make_node("Reshape", ["mul_out", "shape"], ["reshape_out"])
+    div_node = helper.make_node("Div", ["reshape_out", "div"], ["div_out"])
+    matmul_node = helper.make_node("MatMul", ["div_out", "matmul"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[mul_node, reshape_node, div_node, matmul_node],
+        name="identity-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[mul, shape, div, matmul],
+    )
+
+    model = helper.make_model(graph, producer_name="mulpastconv-model")
+    model = ModelWrapper(model)
+    inp_values = gen_finn_dt_tensor(DataType.INT2, [1, 4, 1, 1])
+    mul_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32)
+    shape_values = np.asarray([1, -1], dtype=np.int64)
+    div_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32)
+    matmul_values = gen_finn_dt_tensor(DataType.INT2, [4, 2])
+    model.set_initializer("mul", mul_values)
+    model.set_initializer("shape", shape_values)
+    model.set_initializer("div", div_values)
+    model.set_initializer("matmul", matmul_values)
+    insert_identity_op(model, op)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    idict = {"inp": inp_values}
+    odict = oxe.execute_onnx(model, idict)
+    out_before = odict["outp"]
+    num_of_nodes_before = len(model.graph.node)
+
+    model = model.transform(RemoveIdentityOps())
+    num_of_nodes_after = len(model.graph.node)
+    assert num_of_nodes_before - 1 == num_of_nodes_after
+
+    odict = oxe.execute_onnx(model, idict)
+    out_after = odict["outp"]
+    assert (out_before == out_after).all()