diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 765a86476c5204477888818cb8c2a85c5da2eb2d..daa8319cd3699c9482eed06f1042ae6694dbc5ca 100755
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -82,9 +82,6 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         return ishape
 
     def get_folded_input_shape(self):
-        # even though there is no folding in the current hlslib op,
-        # insert a time multiplexing axis to remain compatible with the
-        # shapes produced by the rest of the dataflow pipeline
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         ifm_ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
@@ -92,7 +89,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         if self.is_1d():
             folded_ishape = (1, ifm_dim_h, ifm_dim_w, nf, pe)
         else:
-            folded_ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch, 1)
+            folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch)
         return folded_ishape
 
     def get_normal_output_shape(self):
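
For illustration, a minimal sketch of what the reworked 2D folding produces (the
values are hypothetical, not taken from this patch). The time-multiplexing axis
now sits before the channel axis, which under FINN's usual folded-shape
convention means each stream word carries all ifm_ch channels of one pixel:

    ifm_dim_h, ifm_dim_w, ifm_ch = 4, 4, 2          # hypothetical image
    folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch)
    assert folded_ishape == (1, 4, 4, 1, 2)          # one fold of ifm_ch elements per pixel
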
@@ -106,14 +103,8 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             assert (
                 ifm_dim_w % k_w == 0
             ), "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0"
-        if ifm_dim_h % k_h == 0:
-            ofm_dim_h = int(ifm_dim_h / k_h)
-        else:
-            ofm_dim_h = int(np.ceil(ifm_dim_h / k_h))
-        if ifm_dim_w % k_w == 0:
-            ofm_dim_w = int(ifm_dim_w / k_w)
-        else:
-            ofm_dim_w = int(np.ceil(ifm_dim_w / k_w))
+        ofm_dim_h = int(np.floor(ifm_dim_h / k_h))
+        ofm_dim_w = int(np.floor(ifm_dim_w / k_w))
         oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch)
         return oshape
 
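
The replacement computes both output dimensions with a single floor division. A
quick worked check with hypothetical sizes (floor equals exact division whenever
the divisibility assertion above holds; the 1D path may truncate):

    import numpy as np

    ifm_dim_h, k_h = 6, 2
    ofm_dim_h = int(np.floor(ifm_dim_h / k_h))    # 3, same as exact division
    ifm_dim_w, k_w = 7, 2
    ofm_dim_w = int(np.floor(ifm_dim_w / k_w))    # 3, truncated
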
@@ -129,7 +120,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             ret[-1] = nf
             ret.append(pe)
         else:
-            ret.append(1)
+            ret.insert(-1, 1)
         return tuple(ret)
 
     def get_number_output_values(self):
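
For the 2D path, ret.insert(-1, 1) places a size-1 fold axis directly before the
channel axis, mirroring the folded input shape above. A minimal sketch with
hypothetical sizes, not part of this patch:

    oshape = (1, 2, 2, 4)        # hypothetical (N, H, W, C) output shape
    ret = list(oshape)
    ret.insert(-1, 1)            # fold axis goes before the channel axis
    assert tuple(ret) == (1, 2, 2, 1, 4)
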
@@ -262,24 +253,24 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             if self.is_1d():
                 raise Exception("Binary 1d MaxPool not implemented on HLS backend")
             else:
-                op = "StreamingMaxPool_Batch"
+                op = "StreamingMaxPool"
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                "%s<ImgDim, PoolDim, NumChannels>(in0, out, numReps);" % (op)
+                "%s<ImgDim, PoolDim, NumChannels>(in0, out);" % (op)
             ]
         else:
             dtype = self.get_input_datatype()
             dtype_hls = dtype.get_hls_datatype_str()
             minval_str = str(int(dtype.min()))
             if self.is_1d():
-                op = "StreamingMaxPool_Precision_Batch_1d"
+                op = "StreamingMaxPool_Precision_1d"
                 self.code_gen_dict["$DOCOMPUTE$"] = [
-                    "%s<ImgDim, PoolDim, NumChannels, PE, %s, %s>(in0, out, numReps);"
+                    "%s<ImgDim, PoolDim, NumChannels, PE, %s, %s>(in0, out);"
                     % (op, dtype_hls, minval_str)
                 ]
             else:
-                op = "StreamingMaxPool_Precision_Batch"
+                op = "StreamingMaxPool_Precision"
                 self.code_gen_dict["$DOCOMPUTE$"] = [
-                    "%s<ImgDim, PoolDim, NumChannels, %s, %s>(in0, out, numReps);"
+                    "%s<ImgDim, PoolDim, NumChannels, %s, %s>(in0, out);"
                     % (op, dtype_hls, minval_str)
                 ]
 
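
With the _Batch variants and the numReps argument dropped, the $DOCOMPUTE$
template renders a plain call into finn-hlslib. A sketch of the string the
precision 2D branch would emit, using hypothetical datatype and minimum-value
strings:

    op = "StreamingMaxPool_Precision"
    dtype_hls, minval_str = "ap_int<4>", "-8"     # hypothetical
    call = "%s<ImgDim, PoolDim, NumChannels, %s, %s>(in0, out);" % (
        op, dtype_hls, minval_str
    )
    # 'StreamingMaxPool_Precision<ImgDim, PoolDim, NumChannels, ap_int<4>, -8>(in0, out);'
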
@@ -365,10 +356,8 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             export_idt = DataType["BINARY"]
         else:
             export_idt = self.get_input_datatype()
-        # reshape input into folded form
-        inp = inp.reshape(folded_ishape)
-        # make copy before saving array
-        reshaped_input = inp.copy()
+
+        reshaped_input = inp.reshape(folded_ishape)
         np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
 
         if mode == "cppsim":
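
The execute path now reshapes the input into the folded form and saves it in one
step; since np.save writes the data out immediately, the explicit intermediate
copy appears redundant here. The same pattern with a hypothetical tensor:

    import numpy as np

    inp = np.zeros((1, 4, 4, 2), dtype=np.float32)   # hypothetical normal-shaped input
    folded_ishape = (1, 4, 4, 1, 2)
    reshaped_input = inp.reshape(folded_ishape)       # may be a view; np.save serializes it
    np.save("input_0.npy", reshaped_input)
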