diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index daa8319cd3699c9482eed06f1042ae6694dbc5ca..d2e1406d2c4f77e28ee11573cd67831a62b97564 100755
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -32,6 +32,7 @@ import warnings
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.general.maxpoolnhwc import compute_pool_output_dim
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -44,6 +45,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             "PoolDim": ("ints", True, []),  # [H, W] = [Y, X]
             "NumChannels": ("i", True, 0),
             "PE": ("i", True, 0),
+            "CeilMode": ("i", False, 0),
             # FINN DataTypes for inputs/outputs
             "dataType": ("s", True, ""),
         }
@@ -96,6 +98,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         k_h, k_w = tuple(self.get_nodeattr("PoolDim"))
         ifm_ch = self.get_nodeattr("NumChannels")
+        ceil_mode = self.get_nodeattr("CeilMode")
         if not self.is_1d():
             assert (
                 ifm_dim_h % k_h == 0
@@ -103,8 +106,8 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             assert (
                 ifm_dim_w % k_w == 0
             ), "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0"
-        ofm_dim_h = int(np.floor(ifm_dim_h / k_w))
-        ofm_dim_w = int(np.floor(ifm_dim_w / k_w))
+        ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, k_h, 0, ceil_mode)
+        ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, k_w, 0, ceil_mode)
         oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch)
         return oshape
 
@@ -197,15 +200,19 @@ class StreamingMaxPool_Batch(HLSCustomOp):
     def defines(self, var):
         numReps = 1
         ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
+        ceil_mode = self.get_nodeattr("CeilMode")
+        output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode)
 
         if self.is_1d():
             self.code_gen_dict["$DEFINES$"] = [
                 """#define ImgDim {}\n #define PoolDim {}\n
-                #define NumChannels {}\n #define PE {}\n #define numReps {}""".format(
+                #define NumChannels {}\n #define PE {}\n #define OutputSize {}
+                \n #define numReps {}""".format(
                     ifm_dim[1],
                     k[1],
                     self.get_nodeattr("NumChannels"),
                     self.get_nodeattr("PE"),
+                    output_size,
                     numReps,
                 )
             ]
@@ -264,7 +271,8 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             if self.is_1d():
                 op = "StreamingMaxPool_Precision_1d"
                 self.code_gen_dict["$DOCOMPUTE$"] = [
-                    "%s<ImgDim, PoolDim, NumChannels, PE, %s, %s>(in0, out);"
+                    """%s<ImgDim, PoolDim, NumChannels, PE,
+                     OutputSize, %s, %s>(in0, out);"""
                     % (op, dtype_hls, minval_str)
                 ]
             else:
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index eb9912b48265f771a1d96c2364c33889e7535b29..652136c82303d8b9ca6772f228312fe96efd33ea 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -363,6 +363,7 @@ class InferStreamingMaxPool(Transformation):
                 ifm_dim_h = mp_in_shape[1]
                 ifm_dim_w = mp_in_shape[2]
                 pe = 1
+                ceil_mode = mp_inst.get_nodeattr("ceil_mode")
                 is_1d = (ifm_dim_h == 1 and k_h == 1) or (ifm_dim_w == 1 and k_w == 1)
                 is_divisable = ifm_dim_h % k_h == 0 or ifm_dim_w % k_w == 0
                 if is_1d or is_divisable:
@@ -378,6 +379,7 @@ class InferStreamingMaxPool(Transformation):
                         ImgDim=(ifm_dim_h, ifm_dim_w),
                         dataType=dt.name,
                         PE=pe,
+                        CeilMode=ceil_mode,
                         name="StreamingMaxPool_Batch_" + n.name,
                     )
                     graph.node.insert(node_ind, new_node)
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 0cdd6651d982426b1d81d7313346dcd899294bf7..9751f41d7eb1d82229a3409f74ea6ef7441b4729 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -670,6 +670,13 @@ class MakeMaxPoolNHWC(Transformation):
                 if consumer is not None and consumer.op_type == "Transpose":
                     perms = list(get_by_name(consumer.attribute, "perm").ints)
                     if perms == [0, 2, 3, 1]:
+                        ceil_mode = get_by_name(n.attribute, "ceil_mode")
+                        if ceil_mode is not None:
+                            ceil_mode = ceil_mode.i
+                        else:
+                            ceil_mode = (
+                                0  # attr absent: ONNX default ceil_mode=0 (floor rounding)
+                            )
                         n.op_type = "MaxPoolNHWC"
                         n.domain = "finn.custom_op.general"
                         start_name = n.input[0]
@@ -683,12 +690,20 @@ class MakeMaxPoolNHWC(Transformation):
                         n.output[0] = end_name
                         model.set_tensor_shape(mid_name, (b, hi, wi, c))
                         model.set_tensor_shape(end_name, (b, ho, wo, c))
+                        getCustomOp(n).set_nodeattr("ceil_mode", ceil_mode)
                         graph.node.remove(consumer)
                         graph.node.insert(node_ind - 1, consumer)
                         graph_modified = True
                 elif producer is not None and producer.op_type == "Transpose":
                     perms = list(get_by_name(producer.attribute, "perm").ints)
                     if perms == [0, 3, 1, 2]:
+                        ceil_mode = get_by_name(n.attribute, "ceil_mode")
+                        if ceil_mode is not None:
+                            ceil_mode = ceil_mode.i
+                        else:
+                            ceil_mode = (
+                                0  # attr absent: ONNX default ceil_mode=0 (floor rounding)
+                            )
                         n.op_type = "MaxPoolNHWC"
                         n.domain = "finn.custom_op.general"
                         start_name = producer.input[0]
@@ -702,6 +717,7 @@ class MakeMaxPoolNHWC(Transformation):
                         n.output[0] = mid_name
                         model.set_tensor_shape(mid_name, (b, ho, wo, c))
                         model.set_tensor_shape(end_name, (b, c, ho, wo))
+                        getCustomOp(n).set_nodeattr("ceil_mode", ceil_mode)
                         graph.node.remove(producer)
                         graph.node.insert(node_ind, producer)
                         graph_modified = True
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index 884e64e06cace43e92c7a62073f59aa00f0d9c0e..313748bff930a06ccc68b8f06c0f24be178990f0 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -35,19 +35,22 @@ import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.core.datatype import DataType
 from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.general.maxpoolnhwc import compute_pool_output_dim
 
 # from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.convert_to_hls_layers import InferStreamingMaxPool
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.general import GiveUniqueNodeNames
+from finn.transformation.infer_shapes import InferShapes
 from finn.util.basic import gen_finn_dt_tensor
 
 
-def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
+def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode):
     k_h, k_w = k
     ifm_dim_h, ifm_dim_w = ifm_dim
     ofm_dim_h, ofm_dim_w = ofm_dim
@@ -66,6 +69,7 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
         domain="finn.custom_op.general",
         kernel_shape=[k_h, k_w],
         strides=[k_h, k_w],
+        ceil_mode=ceil_mode,
         pads=[0, 0, 0, 0],
     )
     graph = helper.make_graph(
@@ -81,7 +85,9 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt):
     return model
 
 
-def make_single_streamingmaxpool_modelwrapper(k, ifm_ch, pe, ifm_dim, ofm_dim, idt):
+def make_single_streamingmaxpool_modelwrapper(
+    k, ifm_ch, pe, ifm_dim, ofm_dim, idt, ceil_mode
+):
     k_h, k_w = k
     ifm_dim_h, ifm_dim_w = ifm_dim
     ofm_dim_h, ofm_dim_w = ofm_dim
@@ -103,6 +109,7 @@ def make_single_streamingmaxpool_modelwrapper(k, ifm_ch, pe, ifm_dim, ofm_dim, i
         NumChannels=ifm_ch,
         PE=pe,
         ImgDim=[ifm_dim_h, ifm_dim_w],
+        CeilMode=ceil_mode,
         dataType=idt.name,
     )
     graph = helper.make_graph(
@@ -129,16 +136,20 @@ def prepare_inputs(input_tensor):
 # kernel size
 @pytest.mark.parametrize("k", [2, 4])
 # input dimension
-@pytest.mark.parametrize("ifm_dim", [4, 8])
+@pytest.mark.parametrize("ifm_dim", [4, 10])
 # input channels
 @pytest.mark.parametrize("ifm_ch", [1, 3])  # 1,3
 # pe
 @pytest.mark.parametrize("pe", [1, 3])
+# ceil mode
+@pytest.mark.parametrize("ceil_mode", [1])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["rtlsim", "cppsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, exec_mode):
+def test_fpgadataflow_streamingmaxpool(
+    idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil_mode, exec_mode
+):
     ifm_dim_h = ifm_dim
     k_h = k
     if dim_1d:
@@ -152,12 +163,12 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, exec
 
     stride_h = k_h
     stride_w = k_w
-    ofm_dim_h = int(((ifm_dim_h - k_h) / stride_h) + 1)
-    ofm_dim_w = int(((ifm_dim_w - k_w) / stride_w) + 1)
+    ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, stride_h, 0, ceil_mode)
+    ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, stride_w, 0, ceil_mode)
     ofm_dim = (ofm_dim_h, ofm_dim_w)
     if idt == DataType["BIPOLAR"] and dim_1d:
         pytest.skip("Skipping binary StreamingMaxPool_1d (not implemented)")
-    if ifm_dim_h % k_h != 0 or ifm_dim_w % k_w != 0:
+    if (ifm_dim_h % k_h != 0 or ifm_dim_w % k_w != 0) and (not dim_1d):
         pytest.skip("Skipping StreamingMaxPool test w/ ImgDim % PoolDim != 0")
     if pe > ifm_ch:
         pytest.skip("SIMD cannot be larger than number of input channels")
@@ -166,12 +177,13 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, exec
     # prepare input data
     input_dict = prepare_inputs(x)
 
-    golden = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt)
+    golden = make_single_maxpoolnhwc_modelwrapper(
+        k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode
+    )
     y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
 
-    model = make_single_streamingmaxpool_modelwrapper(
-        k, ifm_ch, pe, ifm_dim, ofm_dim, idt
-    )
+    model = golden.transform(InferStreamingMaxPool())
+    model = model.transform(InferShapes())
 
     if exec_mode == "cppsim":
         model = model.transform(SetExecMode("cppsim"))
diff --git a/tests/transformation/streamline/test_maxpool_nhwc.py b/tests/transformation/streamline/test_maxpool_nhwc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e600577981f0e7305b8183fcfabec8cc1e042712
--- /dev/null
+++ b/tests/transformation/streamline/test_maxpool_nhwc.py
@@ -0,0 +1,108 @@
+import pytest
+
+import onnx
+import onnx.helper as oh
+from onnx import TensorProto
+
+import finn.core.onnx_exec as oxe
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.custom_op.general.maxpoolnhwc import compute_pool_output_dim
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
+from finn.util.basic import gen_finn_dt_tensor
+
+
+def create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt):
+    ofm_dim_h = compute_pool_output_dim(
+        ifm_dim[0], kernel_shape[0], strides[0], pads[0], ceil_mode
+    )
+    ofm_dim_w = compute_pool_output_dim(
+        ifm_dim[1], kernel_shape[1], strides[1], pads[1], ceil_mode
+    )
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+    outp_mp = oh.make_tensor_value_info(
+        "outp_mp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
+    )
+
+    maxpool_node = oh.make_node(
+        "MaxPool",
+        inputs=["inp"],
+        outputs=["out_mp"],
+        ceil_mode=ceil_mode,
+        kernel_shape=kernel_shape,
+        pads=pads,
+        strides=strides,
+    )
+
+    transpose_node = onnx.helper.make_node(
+        "Transpose",
+        inputs=["out_mp"],
+        outputs=["outp"],
+        name="Transpose1",
+        perm=[0, 2, 3, 1],
+    )
+
+    graph = oh.make_graph(
+        nodes=[maxpool_node, transpose_node],
+        name="maxpool_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_mp],
+    )
+
+    model = oh.make_model(graph, producer_name="maxpool_model")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+
+    model = model.transform(InferShapes())
+
+    return model
+
+
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [[8, 8], [9, 9]])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [3])
+# kernel shape
+@pytest.mark.parametrize("kernel_shape", [[2, 2]])
+# padding
+@pytest.mark.parametrize("pads", [[0, 0, 0, 0], [1, 1, 1, 1]])
+# strides
+@pytest.mark.parametrize("strides", [[2, 2]])
+# ceil_mode
+@pytest.mark.parametrize("ceil_mode", [0, 1])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+def test_maxpool_nhwc(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt):
+    # create MaxPool node
+    maxpool_model = create_maxpool(
+        ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt
+    )
+
+    # generate input tensor for testing
+    input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]])
+    input_dict = {"inp": input_tensor}
+
+    # execute first model
+    output_dict = oxe.execute_onnx(maxpool_model, input_dict)
+    expected = output_dict["outp"]
+
+    # transform MaxPool into MaxPoolNHWC
+    maxpool_model = maxpool_model.transform(MakeMaxPoolNHWC())
+
+    # execute transformed model
+    output_node_name = maxpool_model.graph.output[0].name
+    output_dict = oxe.execute_onnx(
+        maxpool_model, input_dict, return_full_exec_context=False
+    )
+    output = output_dict[output_node_name]
+
+    # compare outputs
+    assert (expected == output).all()