diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 0074cce02f7de57dc778e0b671c484233df72a8a..80e5261c9b6568e0f340ae0add86d269c036ff16 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -15,7 +15,7 @@ gecho () {
 # the repos themselves are cloned in the Dockerfile
 BREVITAS_COMMIT=026a509186b7e7b0b65d46a2f905043d41069306
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716
+HLSLIB_COMMIT=8aed899c278c36c977a249558d71795086cf852c
 PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
similarity index 88%
rename from src/finn/custom_op/fpgadataflow/fmpadding.py
rename to src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index fa321dfa65d14b67fa218fb6a49f602ddab8d57e..d326ae7dfc7830a0081c3b13233d67ef08b12eff 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -21,6 +21,8 @@ class FMPadding_Batch(HLSCustomOp):
             "Padding": ("i", True, 2),
             # number of channels in input image
             "NumChannels": ("i", True, 0),
+            # SIMD Input parallelism
+            "SIMD": ("i", False, 1),
             # FINN input datatype
             "inputDataType": ("s", True, ""),
             # controls distribution of padded pixels
@@ -55,20 +57,22 @@ class FMPadding_Batch(HLSCustomOp):
         return oshape
 
     def get_folded_input_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
-        ret = list(self.get_normal_input_shape())
-        ret.insert(-1, 1)
-        return tuple(ret)
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
 
     def get_folded_output_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
-        ret = list(self.get_normal_output_shape())
-        ret.insert(-1, 1)
-        return tuple(ret)
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
@@ -114,15 +118,13 @@ class FMPadding_Batch(HLSCustomOp):
 
     def get_instream_width(self):
         ibits = self.get_input_datatype().bitwidth()
-        num_ch = self.get_nodeattr("NumChannels")
-
-        return ibits * num_ch
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
 
     def get_outstream_width(self):
         obits = self.get_output_datatype().bitwidth()
-        num_ch = self.get_nodeattr("NumChannels")
-
-        return obits * num_ch
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
@@ -135,13 +137,15 @@ class FMPadding_Batch(HLSCustomOp):
         self.code_gen_dict["$DEFINES$"] = [
             """#define ImgDim1 {}\n#define OutputDim1 {}\n
             #define Padding1 {}\n#define NumChannels1 {}\n
-            #define PaddingStyle1 {}\n#define numReps {}\n""".format(
+            #define PaddingStyle1 {}\n#define numReps {}
+            #define SIMD1 {}\n""".format(
                 self.get_nodeattr("ImgDim"),
                 self.get_padded_odim(),
                 self.get_nodeattr("Padding"),
                 self.get_nodeattr("NumChannels"),
                 self.get_nodeattr("PaddingStyle"),
                 self.get_nodeattr("numInputVectors"),
+                self.get_nodeattr("SIMD"),
             )
         ]
 
@@ -176,7 +180,7 @@ class FMPadding_Batch(HLSCustomOp):
         in_t = self.get_input_datatype().get_hls_datatype_str()
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,
+            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
             {}, PaddingStyle1> (in0, out, numReps);""".format(
                 node.op_type, in_t
             )
@@ -232,6 +236,7 @@ class FMPadding_Batch(HLSCustomOp):
         node = self.onnx_node
         exp_ishape = self.get_normal_input_shape()
         exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
 
         if mode == "cppsim":
@@ -254,10 +259,8 @@ class FMPadding_Batch(HLSCustomOp):
         match expected shape (1, ImgDim, ImgDim, NumChannels)."""
         export_idt = self.get_input_datatype()
 
-        # no reshaping for input since assuming no folding on input
-        # make copy before saving array
-        inp = inp.copy()
-        np.save(os.path.join(code_gen_dir, "input_0.npy"), inp)
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
 
         if mode == "cppsim":
             # execute the precompiled model
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aedf7056b1381a8904ac63b3a64f86b1243cb36
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -0,0 +1,395 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import numpy as np
+
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.core.datatype import DataType
+from onnx import TensorProto, helper
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class Pool_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib Pool_batch function.
+    Requires ConvolutionInputGenerator(depthwise == 1) to format its input
+
+    TODO: explain input shape (to reuse im2col code)
+    Input shape (BatchSize,OutImgDim,OutImgDim,KernelSize^2*Channels)
+    Output shape (BatchSize,OutImgDim,OutImgDim,Channels)
+
+    # note: the actual data layout produced by the hlslib kernels is different
+    # for depthwise ops.
+    # * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD)
+
+    Channels can be folded using PE (SIMD from the input perspective)
+    TODO: doc
+    """
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "Channels": ("i", True, 0),
+            "PE": ("i", True, 1),
+            "KernelSize": ("i", True, 0),
+            # Function:
+            #  - MaxPool
+            #  - AvgPool (not yet supported, but HLSLIB does)
+            #  - AccPool (not yet supported, but HLSLIB does)
+            "Function": ("s", True, ""),
+            "OutImgDim": ("i", True, 0),
+            # FINN DataTypes for inputs/outputs
+            "dataType": ("s", True, ""),
+            "BatchSize": ("i", False, 1),
+        }
+
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("dataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        fxn = self.get_nodeattr("Function")
+        if fxn == "MaxPool":
+            # Same as input
+            return DataType[self.get_nodeattr("dataType")]
+        else:
+            raise Exception("Pool_Batch doesn't currently support " + fxn)
+
+    def get_normal_input_shape(self):
+        ifm_ch = self.get_nodeattr("Channels")
+        odim = self.get_nodeattr("OutImgDim")
+        batch_size = self.get_nodeattr("BatchSize")
+        k = self.get_nodeattr("KernelSize")
+        ishape = (batch_size, odim, odim, k * k * ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        assert ifm_ch % pe == 0, "PE must divide input channels"
+        fold = int(normal_ishape[-1] / pe)
+        folded_ishape = normal_ishape[:-1] + [fold, pe]
+        return tuple(folded_ishape)
+
+    def get_normal_output_shape(self):
+        ofm_ch = self.get_nodeattr("Channels")
+        odim = self.get_nodeattr("OutImgDim")
+        batch_size = self.get_nodeattr("BatchSize")
+        oshape = (batch_size, odim, odim, ofm_ch)
+        return oshape
+
+    def get_folded_output_shape(self):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        assert ifm_ch % pe == 0, "PE must divide input channels"
+        fold = int(ifm_ch / pe)
+        folded_oshape = normal_oshape[:-1] + [fold, pe]
+        return tuple(folded_oshape)
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[1:-1])
+
+    def get_instream_width(self):
+        dt_bits = self.get_input_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        # ofm_ch = self.get_nodeattr("Channels")
+        # k = self.get_nodeattr("KernelSize")
+        # assert ifm_ch % pe == 0, "PE must divide input channels"
+        # simd = int(ifm_ch/pe)
+        in_width = int(dt_bits * pe)
+        return in_width
+
+    def get_outstream_width(self):
+        fxn = self.get_nodeattr("Function")
+        if fxn == "MaxPool":
+            return self.get_instream_width()
+        else:
+            raise Exception("Pool_Batch doesn't currently support " + fxn)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch."
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        info_messages = []
+
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify the number of inputs
+        if len(self.onnx_node.input) == 1:
+            info_messages.append("The number of inputs is correct")
+        else:
+            info_messages.append("""Pool_Batch needs 1 data input""")
+
+        # check supported function
+        fnx = self.get_nodeattr("Function")
+        if fnx == "MaxPool":
+            info_messages.append(
+                "Attribute Function contains a supported pool function"
+            )
+        else:
+            info_messages.append(
+                "Attribute Function contains an unsupported pool function"
+            )
+        return info_messages
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "pool.hpp"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+        ifm_ch = self.get_nodeattr("Channels")
+        self.code_gen_dict["$DEFINES$"] += ["#define Channels {}".format(ifm_ch)]
+
+        pe = self.get_nodeattr("PE")
+        self.code_gen_dict["$DEFINES$"] += ["#define PE {}".format(pe)]
+
+        k = self.get_nodeattr("KernelSize")
+        self.code_gen_dict["$DEFINES$"] += ["#define KernelSize {}".format(k)]
+
+        odim = self.get_nodeattr("OutImgDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)]
+
+        numReps = self.get_nodeattr("BatchSize")
+        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(numReps)]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        idt = self.get_input_datatype()
+        i_hls_dt = idt.get_hls_datatype_str()
+        odt = self.get_output_datatype()
+        o_hls_dt = odt.get_hls_datatype_str()
+
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+
+        fxn = self.get_nodeattr("Function")
+        if fxn == "MaxPool":
+            self.code_gen_dict["$DOCOMPUTE$"] += [
+                "MaxPoolFunction<{},KernelSize> pool_fxn;".format(i_hls_dt)
+            ]
+        else:
+            raise Exception("Pool_Batch doesn't currently support " + fxn)
+
+        self.code_gen_dict["$DOCOMPUTE$"] += [
+            """Pool_batch<Channels, PE, KernelSize,Slice<{} >, Slice< {} > >
+        (in0,out, pool_fxn, OFMDim*OFMDim*numReps);""".format(
+                i_hls_dt, o_hls_dt
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s",false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        packed_ibits = self.get_instream_width()
+        packed_in_hls_type = "ap_uint<%d>" % packed_ibits
+
+        packed_obits = self.get_outstream_width()
+        packed_out_hls_type = "ap_uint<%d>" % packed_obits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
+            % (self.onnx_node.name, packed_in_hls_type, packed_out_hls_type)
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        folded_ishape = self.get_folded_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_oshape = self.get_folded_output_shape()
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (batch_size,odim,odim,k*k*ifm_ch)."""
+
+        export_idt = self.get_input_datatype()
+        reshaped_input = inp.reshape(folded_ishape)
+
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output
+        shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch)."""
diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py
index 82a6b140f7af1be4e5c0f429d077b99c7865383e..8ed0041704d421dab587f08bcbcd9e739e8434e9 100644
--- a/src/finn/custom_op/im2col.py
+++ b/src/finn/custom_op/im2col.py
@@ -80,6 +80,8 @@ class Im2Col(CustomOp):
             "input_shape": ("s", True, ""),
             "pad_amount": ("i", False, 0),
             "pad_value": ("i", False, 0),
+            # depthwise: if != 0, infer ConvolutionInputGenerator with depthwise == 1
+            "depthwise": ("i", False, 0),
         }
 
     def make_shape_compatible_op(self, model):
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 2dae826cf9712bef17d0053a0878c41ef51fec36..0060e5d400f30055d532671c8cf1680f0668442a 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -44,7 +44,8 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
-from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch
+from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
+from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
@@ -66,6 +67,7 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
+custom_op["Pool_Batch"] = Pool_Batch
 custom_op["FMPadding_Batch"] = FMPadding_Batch
 custom_op["Thresholding_Batch"] = Thresholding_Batch
 custom_op["AddStreams_Batch"] = AddStreams_Batch
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index d421a5f3ef8ca980b399087de1482b2ae913da1b..b70b126680d650547cf376dd601c048c73a1cfd4 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -34,6 +34,7 @@ from finn.custom_op.registry import getCustomOp
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
 import finn.core.data_layout as DataLayout
+from finn.util.basic import get_by_name
 
 
 class InferConvInpGen(Transformation):
@@ -56,6 +57,7 @@ class InferConvInpGen(Transformation):
                 k = i2c_inst.get_nodeattr("kernel_size")
                 pad = i2c_inst.get_nodeattr("pad_amount")
                 pad_val = i2c_inst.get_nodeattr("pad_value")
+                depthwise = i2c_inst.get_nodeattr("depthwise")
                 ifm_ch = i2c_in_shape[-1]
                 ifm_dim = i2c_in_shape[1]
                 ofm_dim = i2c_out_shape[1]
@@ -67,7 +69,11 @@ class InferConvInpGen(Transformation):
 
                 if pad > 0:
                     # if padding enabled, ensure pad_val supported by DataType
-                    assert dt.allowed(pad_val), "Im2Col DataType must support pad_val"
+                    # assert dt.allowed(pad_val),"""FMPadding_Batch DataType
+                    # must support pad_val"""
+                    assert (
+                        pad_val == 0
+                    ), "FMPadding_Batch doesn't currently support pad_val!= 0"
 
                     odim_padding = ifm_dim + 2 * pad
 
@@ -112,6 +118,7 @@ class InferConvInpGen(Transformation):
                     Stride=stride,
                     inputDataType=dt.name,
                     outputDataType=dt.name,
+                    depthwise=depthwise,
                 )
                 graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
@@ -169,6 +176,137 @@ class InferStreamingMaxPool(Transformation):
         return (model, graph_modified)
 
 
+class InferPool_Batch(Transformation):
+    """If kernel_shape > strides, replace Pool layer with  with of Im2col
+    + pool(with kernel_shape == strides), plus Transpose layers to keep the original
+    data layout."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type in ["MaxPool"]:
+                # extract pool parameters
+                k = get_by_name(n.attribute, "kernel_shape").ints[-1]
+                stride = get_by_name(n.attribute, "strides").ints[-1]
+
+                if k <= stride:
+                    continue
+
+                try:
+                    pad = get_by_name(n.attribute, "pads").ints[-1]
+                except AttributeError:
+                    pad = 0
+
+                node_input = n.input[0]
+                node_output = n.output[0]
+                idt = model.get_tensor_datatype(node_input)
+                if not idt.is_integer():
+                    continue
+
+                # odt = model.get_tensor_datatype(node_output)
+
+                ifm_ch = model.get_tensor_shape(n.input[0])[1]  # assume NCHW
+                ofm_ch = ifm_ch
+                ifm_dim = model.get_tensor_shape(n.input[0])[-1]  # assume NCHW
+                ofm_dim = model.get_tensor_shape(n.output[0])[-1]  # assume NCHW
+                # create new intermediate values
+                inp_trans_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (1, ifm_dim, ifm_dim, ifm_ch),  # NHWC
+                )
+                graph.value_info.append(inp_trans_out)
+                inp_trans_out = inp_trans_out.name
+                model.set_tensor_datatype(inp_trans_out, idt)
+
+                im2col_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (1, ofm_dim, ofm_dim, ifm_ch * k * k),
+                )
+                graph.value_info.append(im2col_out)
+                im2col_out = im2col_out.name
+                model.set_tensor_datatype(im2col_out, idt)
+
+                pool_output = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(),
+                    TensorProto.FLOAT,
+                    (1, ofm_dim, ofm_dim, ofm_ch),
+                )
+                graph.value_info.append(pool_output)
+                pool_output = pool_output.name
+                # model.set_tensor_datatype(pool_output, odt)
+
+                # create new nodes
+                # NCHW -> NHWC
+                inp_trans_node = helper.make_node(
+                    "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1]
+                )
+
+                if n.op_type == "MaxPool":
+                    pool_fxn = "MaxPool"
+                    pad_value = idt.min()
+                else:
+                    raise Exception(
+                        "pad_value and pool_fxn not configured for {}".format(n.op_type)
+                    )
+
+                # format input tensor
+                im2col_node = helper.make_node(
+                    "Im2Col",
+                    [inp_trans_out],
+                    [im2col_out],
+                    domain="finn",
+                    stride=stride,
+                    kernel_size=k,
+                    pad_amount=pad,
+                    pad_value=pad_value,
+                    depthwise=1,
+                    input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
+                )
+
+                # Warning PE has to be equal to ifm_ch until Im2Col is replaced by
+                # ConvolutionInputGenerator with depthwise=1.
+                # For other settings the output will be incorrect due to incorrect input
+                # data layout
+                pool_node = helper.make_node(
+                    "Pool_Batch",
+                    [im2col_out],
+                    [pool_output],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    dataType=idt.name,
+                    Channels=ifm_ch,
+                    PE=ifm_ch,
+                    KernelSize=k,
+                    Function=pool_fxn,
+                    OutImgDim=ofm_dim,
+                    BatchSize=1,
+                )
+
+                # NHWC -> NCHW
+                out_trans_node = helper.make_node(
+                    "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2]
+                )
+
+                # insert nodes where the conv is to preserve topological ordering
+                graph.node.insert(node_ind, inp_trans_node)
+                graph.node.insert(node_ind + 1, im2col_node)
+                graph.node.insert(node_ind + 2, pool_node)
+                graph.node.insert(node_ind + 3, out_trans_node)
+                # remove old node
+                graph.node.remove(n)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class InferBinaryStreamingFCLayer(Transformation):
     """Convert XnorPopcountMatMul layers to
     StreamingFCLayer_Batch layers. Any immediately following MultiThreshold
diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
index 4f050be8540ddf5ef48699d1658b571852ff4510..6eae560e1191642cfaf85d92c6d0fcf644630973 100644
--- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py
+++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
@@ -80,7 +80,6 @@ class PrepareCppSim(Transformation):
             self._num_workers = mp.cpu_count()
 
     def prepareCppSim_node(self, node):
-        print(node.name)
         if is_fpgadataflow_node(node) is True:
             _codegen_single_node(node, self.model)
         return (node, False)
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f78dcea1a1ce364d0657ad64de7d440d41b822
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from onnx import TensorProto, helper
+import numpy as np
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.custom_op.registry import getCustomOp
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.infer_shapes import InferShapes
+
+
+def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt):
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim]
+    )
+
+    mp_node = helper.make_node(
+        "MaxPool",
+        ["inp"],
+        ["outp"],
+        kernel_shape=[k, k],
+        pads=[pad, pad, pad, pad],
+        strides=[stride, stride],
+    )
+    graph = helper.make_graph(
+        nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = helper.make_model(graph, producer_name="mp-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+    model = model.transform(InferShapes())
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4])
+# pool configuration:                   ( k,stride, pad, ifm_dim )
+@pytest.mark.parametrize(
+    "pool_config", [(3, 2, 0, 5), (3, 2, 1, 5), (2, 2, 0, 8), (5, 2, 2, 7)]
+)
+# input channels
+@pytest.mark.parametrize("ifm_ch", [1, 4, 20])
+# number of out channel computed in parallel
+@pytest.mark.parametrize("pe", [1, 4, 20])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+# pool type
+@pytest.mark.parametrize("op_type", ["MaxPool"])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_type):
+    k, stride, pad, ifm_dim = pool_config
+
+    if ifm_ch % pe != 0:
+        pytest.skip("ifm_ch%pe != 0. Skipping")
+
+    if pad != 0 and idt.signed():
+        pytest.skip("No support for pal_val != 0. Skipping")
+
+    np.random.seed(0)
+    ofm_dim = int(((ifm_dim + 2 * pad - k) / stride) + 1)
+
+    x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim))
+    # prepare input data
+    input_dict = prepare_inputs(x)
+    if op_type == "MaxPool":
+        model = make_single_maxpool_modelwrapper(
+            k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt
+        )
+    else:
+        assert False, "{} is not a supported op_type".format(op_type)
+
+    y_expected = oxe.execute_onnx(model, input_dict)["outp"]
+
+    new_model = model.transform(to_hls.InferPool_Batch())
+    new_model = new_model.transform(GiveUniqueNodeNames())
+
+    if ifm_ch != pe:
+        new_model = new_model.transform(to_hls.InferConvInpGen())
+        # Folding
+        for n in new_model.graph.node:
+            if n.op_type == "ConvolutionInputGenerator":
+                inst = getCustomOp(n)
+                inst.set_nodeattr("SIMD", pe)
+            elif n.op_type == "Pool_Batch":
+                inst = getCustomOp(n)
+                inst.set_nodeattr("PE", pe)
+
+    if exec_mode == "cppsim":
+        new_model = new_model.transform(SetExecMode("cppsim"))
+        new_model = new_model.transform(PrepareCppSim())
+        new_model = new_model.transform(CompileCppSim())
+    elif exec_mode == "rtlsim":
+        new_model = new_model.transform(SetExecMode("rtlsim"))
+        new_model = new_model.transform(GiveUniqueNodeNames())
+        new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5))
+        new_model = new_model.transform(HLSSynthIP())
+        new_model = new_model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+
+    # execute new_model
+    y_produced = oxe.execute_onnx(new_model, input_dict)["outp"]
+    assert (y_produced == y_expected).all()
+    if stride != k:
+        if pad == 0 or ifm_ch == pe:
+            assert len(new_model.graph.node) == 4
+        else:
+            assert len(new_model.graph.node) == 5
+    else:
+        assert len(new_model.graph.node) == 1
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 9d6390b2673e5d2c0e72748183ac04ed222d078e..5ff3da87228a2a32a41226bb46e0b16b1a44df50 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -23,7 +23,7 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10
 
 
-def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
+def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style):
     assert pad_style == 2, "only pad_style == 2 supported in hlslib"
     assert padding > 0, "Output dim should be greater than input dim"
     odim = idim + padding
@@ -47,6 +47,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
         inputDataType=str(idt.name),
         PaddingStyle=pad_style,
         numInputVectors=1,
+        SIMD=simd,
     )
 
     graph = helper.make_graph(
@@ -63,11 +64,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
 
 
 # input image dimension
-@pytest.mark.parametrize("idim", [8, 16])
+@pytest.mark.parametrize("idim", [8])
 # number of rows and number of cols to add
 @pytest.mark.parametrize("pad", [2, 3])
 # number of channels
-@pytest.mark.parametrize("num_ch", [1, 2])
+@pytest.mark.parametrize("num_ch", [2, 4])
+# Input parallelism
+@pytest.mark.parametrize("simd", [1, 2])
 # PaddingStyle: selects behavior when (odim-idim)%2 != 0
 @pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
@@ -76,14 +79,15 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
-
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
+    if num_ch % simd != 0:
+        pytest.skip(" num_ch % simd != 0, skipping")
     # generate input data
     x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
     input_dict = {"inp": x}
     odim = idim + pad
 
-    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style)
+    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())