diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
old mode 100644
new mode 100755
index bc771cc796796a9e38c7c266c81a4e65431e6524..daa8319cd3699c9482eed06f1042ae6694dbc5ca
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -32,7 +32,6 @@ import warnings
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.custom_op.general.im2col import compute_conv_output_dim
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -44,6 +43,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
             "PoolDim": ("ints", True, []),  # [H, W] = [Y, X]
             "NumChannels": ("i", True, 0),
+            "PE": ("i", True, 0),
             # FINN DataTypes for inputs/outputs
             "dataType": ("s", True, ""),
         }
@@ -82,24 +82,29 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         return ishape
 
     def get_folded_input_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
-        ret = list(self.get_normal_input_shape())
-        ret.insert(-1, 1)
-        return tuple(ret)
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
+        ifm_ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
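+        # fold the channel dim into nf chunks of pe; assumes NumChannels % PE == 0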
+        nf = int(ifm_ch / pe)
+        if self.is_1d():
+            folded_ishape = (1, ifm_dim_h, ifm_dim_w, nf, pe)
+        else:
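+            # no folding in the 2D hlslib op, so keep a unit fold axis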
+            folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch)
+        return folded_ishape
 
     def get_normal_output_shape(self):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         k_h, k_w = tuple(self.get_nodeattr("PoolDim"))
         ifm_ch = self.get_nodeattr("NumChannels")
-        stride_h = k_h
-        stride_w = k_w
-        pad = 0
-        assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0"
-        assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0"
-        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad)
-        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad)
+        if not self.is_1d():
+            assert (
+                ifm_dim_h % k_h == 0
+            ), "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0"
+            assert (
+                ifm_dim_w % k_w == 0
+            ), "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0"
+        ofm_dim_h = int(np.floor(ifm_dim_h / k_h))
+        ofm_dim_w = int(np.floor(ifm_dim_w / k_w))
         oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch)
         return oshape
 
@@ -107,8 +112,15 @@ class StreamingMaxPool_Batch(HLSCustomOp):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
+        # the 1D case folds the channel dim into (nf, pe); the 2D case is
+        # not folded in hlslib, so insert a unit time multiplexing axis to
+        # remain compatible with the rest of the dataflow pipeline
+        ifm_ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        nf = int(ifm_ch / pe)
         ret = list(self.get_normal_output_shape())
-        ret.insert(-1, 1)
+        if self.is_1d():
+            ret[-1] = nf
+            ret.append(pe)
+        else:
+            ret.insert(-1, 1)
         return tuple(ret)
 
     def get_number_output_values(self):
@@ -118,20 +130,27 @@ class StreamingMaxPool_Batch(HLSCustomOp):
     def get_exp_cycles(self):
         # derived from StreamingMaxPool_Batch loop nest
         ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
+        _, _, ofm_dim_w, nf, _ = self.get_folded_output_shape()
+
         if self.is_1d():
-            return int(ifm_dim[1] + k[1])
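+            # approx. (k[1] + 1) cycles per output pixel, for each channel fold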
+            exp_cycles = ofm_dim_w * nf * (k[1] + 1)
+            return int(exp_cycles)
         else:
             # TODO: adjust inaccurate formula
             return int(ifm_dim[1] * (ifm_dim[1] + (ifm_dim[1] / k[1])))
 
     def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
         ifm_ch = self.get_nodeattr("NumChannels")
-        in_width = int(dt_bits * ifm_ch)
+        if self.is_1d():
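+            # folded 1D variant streams PE channels per cycle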
+            in_width = int(dt_bits * pe)
+        else:
+            in_width = int(dt_bits * ifm_ch)
         return in_width
 
     def get_outstream_width(self):
-        """For streaming maxpool out stream with is the same as in stream width"""
+        """For streaming maxpool out stream width is the same as in stream width"""
         return self.get_instream_width()
 
     def make_shape_compatible_op(self, model):
@@ -179,15 +198,27 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         numReps = 1
         ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
 
-        self.code_gen_dict["$DEFINES$"] = [
-            """#define ImgDim {}\n #define PoolDim {}\n
-            #define NumChannels {}\n #define numReps {}""".format(
-                ifm_dim[1],
-                k[1],
-                self.get_nodeattr("NumChannels"),
-                numReps,
-            )
-        ]
+        if self.is_1d():
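+            # the 1D kernel template additionally needs a PE define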
+            self.code_gen_dict["$DEFINES$"] = [
+                """#define ImgDim {}\n #define PoolDim {}\n
+                #define NumChannels {}\n #define PE {}\n #define numReps {}""".format(
+                    ifm_dim[1],
+                    k[1],
+                    self.get_nodeattr("NumChannels"),
+                    self.get_nodeattr("PE"),
+                    numReps,
+                )
+            ]
+        else:
+            self.code_gen_dict["$DEFINES$"] = [
+                """#define ImgDim {}\n #define PoolDim {}\n
+                #define NumChannels {}\n #define numReps {}""".format(
+                    ifm_dim[1],
+                    k[1],
+                    self.get_nodeattr("NumChannels"),
+                    numReps,
+                )
+            ]
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -227,17 +258,21 @@ class StreamingMaxPool_Batch(HLSCustomOp):
                 "%s<ImgDim, PoolDim, NumChannels>(in0, out);" % (op)
             ]
         else:
+            dtype = self.get_input_datatype()
+            dtype_hls = dtype.get_hls_datatype_str()
+            minval_str = str(int(dtype.min()))
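+            # dtype's minimum serves as the init value for the max comparison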
             if self.is_1d():
                 op = "StreamingMaxPool_Precision_1d"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    "%s<ImgDim, PoolDim, NumChannels, PE, %s, %s>(in0, out);"
+                    % (op, dtype_hls, minval_str)
+                ]
             else:
                 op = "StreamingMaxPool_Precision"
-            dtype = self.get_input_datatype()
-            dtype_hls = dtype.get_hls_datatype_str()
-            minval_str = str(int(dtype.min()))
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                "%s<ImgDim, PoolDim, NumChannels, %s, %s>(in0, out);"
-                % (op, dtype_hls, minval_str)
-            ]
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    "%s<ImgDim, PoolDim, NumChannels, %s, %s>(in0, out);"
+                    % (op, dtype_hls, minval_str)
+                ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -293,6 +328,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         node = self.onnx_node
         exp_ishape = self.get_normal_input_shape()
         exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
 
         # TODO ensure codegen dir exists
@@ -320,9 +356,8 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             export_idt = DataType["BINARY"]
         else:
             export_idt = self.get_input_datatype()
-        # no reshaping for input since assuming no folding on input
-        # make copy before saving array
-        reshaped_input = inp.copy()
+
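+        # reshape into the folded layout expected by the generated kernel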
+        reshaped_input = inp.reshape(folded_ishape)
         np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
 
         if mode == "cppsim":
@@ -333,7 +368,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             assert (
                 context[node.output[0]].shape == folded_oshape
             ), "cppsim \
-            did not produce expected ofolded utput shape"
+            did not produce expected folded output shape"
             context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
         elif mode == "rtlsim":
             sim = self.get_rtlsim()
@@ -371,4 +406,4 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         assert (
             context[node.output[0]].shape == exp_oshape
         ), """Output
-        shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch)."""
+        shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch)."""
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 2f83f23cb51248c6a6e6a9a1233f41435c3bf966..eb9912b48265f771a1d96c2364c33889e7535b29 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -362,7 +362,10 @@ class InferStreamingMaxPool(Transformation):
                 ifm_ch = mp_in_shape[-1]
                 ifm_dim_h = mp_in_shape[1]
                 ifm_dim_w = mp_in_shape[2]
-                if ifm_dim_h % k_h == 0 and ifm_dim_w % k_w == 0:
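+                # conservative default; only the 1D variant exploits PE > 1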
+                pe = 1
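+                # treat the pool as 1D if either spatial dim (with its window) is 1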
+                is_1d = (ifm_dim_h == 1 and k_h == 1) or (ifm_dim_w == 1 and k_w == 1)
+                is_divisible = ifm_dim_h % k_h == 0 and ifm_dim_w % k_w == 0
+                if is_1d or is_divisible:
                     # create equivalent StreamingMaxPool_Batch node
                     new_node = helper.make_node(
                         "StreamingMaxPool_Batch",
@@ -374,6 +377,7 @@ class InferStreamingMaxPool(Transformation):
                         NumChannels=ifm_ch,
                         ImgDim=(ifm_dim_h, ifm_dim_w),
                         dataType=dt.name,
+                        PE=pe,
                         name="StreamingMaxPool_Batch_" + n.name,
                     )
                     graph.node.insert(node_ind, new_node)
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index f18fd8d1019337e7b87ae9e47ba3a5b53ec849f7..884e64e06cace43e92c7a62073f59aa00f0d9c0e 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -81,7 +81,7 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
     return model
 
 
-def make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
+def make_single_streamingmaxpool_modelwrapper(k, ifm_ch, pe, ifm_dim, ofm_dim, idt):
     k_h, k_w = k
     ifm_dim_h, ifm_dim_w = ifm_dim
     ofm_dim_h, ofm_dim_w = ofm_dim
@@ -101,6 +101,7 @@ def make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
         backend="fpgadataflow",
         PoolDim=[k_h, k_w],
         NumChannels=ifm_ch,
+        PE=pe,
         ImgDim=[ifm_dim_h, ifm_dim_w],
         dataType=idt.name,
     )
@@ -131,11 +132,13 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("ifm_dim", [4, 8])
 # input channels
 @pytest.mark.parametrize("ifm_ch", [1, 3])  # 1,3
+# pe
+@pytest.mark.parametrize("pe", [1, 3])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["rtlsim", "cppsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, exec_mode):
+def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, exec_mode):
     ifm_dim_h = ifm_dim
     k_h = k
     if dim_1d:
@@ -156,6 +159,8 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, exec_mod
         pytest.skip("Skipping binary StreamingMaxPool_1d (not implemented)")
     if ifm_dim_h % k_h != 0 or ifm_dim_w % k_w != 0:
         pytest.skip("Skipping StreamingMaxPool test w/ ImgDim % PoolDim != 0")
+    if pe > ifm_ch:
+        pytest.skip("SIMD cannot be larger than number of input channels")
 
     x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
     # prepare input data
@@ -164,7 +169,9 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, exec_mod
     golden = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt)
     y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
 
-    model = make_single_streamingmaxpool_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt)
+    model = make_single_streamingmaxpool_modelwrapper(
+        k, ifm_ch, pe, ifm_dim, ofm_dim, idt
+    )
 
     if exec_mode == "cppsim":
         model = model.transform(SetExecMode("cppsim"))
@@ -173,7 +180,7 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, exec_mod
     elif exec_mode == "rtlsim":
         model = model.transform(SetExecMode("rtlsim"))
         model = model.transform(GiveUniqueNodeNames())
-        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(PrepareIP("xczu3eg-sbva484-1-e", 5))
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
     else: