diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
old mode 100644
new mode 100755
index 23e993800493829ab4a38eeb7f51f98e08d5d3a8..765a86476c5204477888818cb8c2a85c5da2eb2d
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -99,16 +99,21 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         k_h, k_w = tuple(self.get_nodeattr("PoolDim"))
         ifm_ch = self.get_nodeattr("NumChannels")
-        #assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0"
-        #assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0"
-        if (ifm_dim_h % k_h == 0):
+        if not self.is_1d():
+            assert (
+                ifm_dim_h % k_h == 0
+            ), "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0"
+            assert (
+                ifm_dim_w % k_w == 0
+            ), "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0"
+        if ifm_dim_h % k_h == 0:
             ofm_dim_h = int(ifm_dim_h / k_h)
         else:
-            ofm_dim_h = int(np.floor(ifm_dim_h / k_h) + 1)
-        if (ifm_dim_w % k_w == 0):
+            ofm_dim_h = int(np.ceil(ifm_dim_h / k_h))
+        if ifm_dim_w % k_w == 0:
             ofm_dim_w = int(ifm_dim_w / k_w)
         else:
-            ofm_dim_w = int(np.floor(ifm_dim_w / k_w) + 1)
+            ofm_dim_w = int(np.ceil(ifm_dim_w / k_w))
         oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch)
         return oshape
 
@@ -139,7 +144,6 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         if self.is_1d():
             exp_cycles = ofm_dim_w * nf * (k[1] + 1)
             return int(exp_cycles)
-            #return int(ifm_dim[1] + k[1])
         else:
             # TODO: adjust inaccurate formula
             return int(ifm_dim[1] * (ifm_dim[1] + (ifm_dim[1] / k[1])))
@@ -258,26 +262,26 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             if self.is_1d():
                 raise Exception("Binary 1d MaxPool not implemented on HLS backend")
             else:
-                op = "StreamingMaxPool"
+                op = "StreamingMaxPool_Batch"
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                "%s<ImgDim, PoolDim, NumChannels>(in0, out);" % (op)
+                "%s<ImgDim, PoolDim, NumChannels>(in0, out, numReps);" % (op)
             ]
         else:
             dtype = self.get_input_datatype()
             dtype_hls = dtype.get_hls_datatype_str()
             minval_str = str(int(dtype.min()))
             if self.is_1d():
-                op = "StreamingMaxPool_Precision_1d"
+                op = "StreamingMaxPool_Precision_Batch_1d"
                 self.code_gen_dict["$DOCOMPUTE$"] = [
-                "%s<ImgDim, PoolDim, NumChannels, PE, %s, %s>(in0, out);"
-                % (op, dtype_hls, minval_str)
-            ]
+                    "%s<ImgDim, PoolDim, NumChannels, PE, %s, %s>(in0, out, numReps);"
+                    % (op, dtype_hls, minval_str)
+                ]
             else:
-                op = "StreamingMaxPool_Precision"
+                op = "StreamingMaxPool_Precision_Batch"
                 self.code_gen_dict["$DOCOMPUTE$"] = [
-                "%s<ImgDim, PoolDim, NumChannels, %s, %s>(in0, out);"
-                % (op, dtype_hls, minval_str)
-            ]
+                    "%s<ImgDim, PoolDim, NumChannels, %s, %s>(in0, out, numReps);"
+                    % (op, dtype_hls, minval_str)
+                ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -333,6 +337,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         node = self.onnx_node
         exp_ishape = self.get_normal_input_shape()
         exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
 
         # TODO ensure codegen dir exists
@@ -360,7 +365,8 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             export_idt = DataType["BINARY"]
         else:
             export_idt = self.get_input_datatype()
-        # no reshaping for input since assuming no folding on input
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
         # make copy before saving array
         reshaped_input = inp.copy()
         np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
@@ -373,7 +379,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             assert (
                 context[node.output[0]].shape == folded_oshape
             ), "cppsim \
-            did not produce expected ofolded utput shape"
+            did not produce expected folded output shape"
             context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
         elif mode == "rtlsim":
             sim = self.get_rtlsim()
@@ -411,4 +417,4 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         assert (
             context[node.output[0]].shape == exp_oshape
         ), """Output
-        shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch)."""
+        shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch)."""