diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index 27dfab54ec6d483d948dd383e54a44117d7c1a65..99f959bf59f06d6ab5b71dd7245d657f4964cca4 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -17,9 +17,17 @@ class FMPadding_Batch(HLSCustomOp):
     def get_nodeattr_types(self):
         my_attrs = {
             # spatial size of input images
-            "ImgDim": ("i", True, 0),
+            "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
             # total padding (per dimension) to apply
-            "Padding": ("i", True, 2),
+            # NOTE: the applied padding scheme distributes each dimension's
+            # total padding as evenly as possible before and after the image.
+            # For example, a padding spec of [1, x, 3, x] is equivalent
+            # to [2, x, 2, x].
+            "Padding": (
+                "ints",
+                True,
+                [1, 1, 1, 1],
+            ),  # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
             # number of channels in input image
             "NumChannels": ("i", True, 0),
             # SIMD Input parallelism
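The `Padding` attribute changes from a single total to a 4-element spec. A minimal sketch (hypothetical helper, not part of this diff) of how the spec collapses to per-dimension totals and how PaddingStyle 2 redistributes them, with extra zeros going in front for odd totals:

```python
def effective_padding(padding):
    """Hypothetical helper: resolve a [H_begin, W_begin, H_end, W_end] spec
    the way PaddingStyle 2 applies it (even split, extra zeros in front)."""
    pad_h = padding[0] + padding[2]  # total padding along H
    pad_w = padding[1] + padding[3]  # total padding along W
    h_begin, w_begin = pad_h // 2 + pad_h % 2, pad_w // 2 + pad_w % 2
    return [h_begin, w_begin, pad_h - h_begin, pad_w - w_begin]

# the [1, x, 3, x] == [2, x, 2, x] equivalence from the NOTE above:
assert effective_padding([1, 0, 3, 0]) == [2, 0, 2, 0]
```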
@@ -38,31 +46,33 @@ class FMPadding_Batch(HLSCustomOp):
 
     def get_padded_odim(self):
         "Return the padded spatial size of the output."
-
-        idim = self.get_nodeattr("ImgDim")
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
         pad = self.get_nodeattr("Padding")
-        return idim + pad
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        odim_h = idim_h + pad_h
+        odim_w = idim_w + pad_w
+        return [odim_h, odim_w]
 
     def get_exp_cycles(self):
-        odim = self.get_padded_odim()
+        odim_h, odim_w = self.get_padded_odim()
         channels = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
         batch_size = self.get_nodeattr("numInputVectors")
-        exp_cycles = (channels / simd) * batch_size * odim * odim
+        exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
         return int(exp_cycles)
 
     def get_normal_input_shape(self):
-        idim = self.get_nodeattr("ImgDim")
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
-
-        ishape = (1, idim, idim, num_ch)
+        ishape = (1, idim_h, idim_w, num_ch)
         return ishape
 
     def get_normal_output_shape(self):
-        odim = self.get_padded_odim()
+        odim_h, odim_w = self.get_padded_odim()
         num_ch = self.get_nodeattr("NumChannels")
 
-        oshape = (1, odim, odim, num_ch)
+        oshape = (1, odim_h, odim_w, num_ch)
         return oshape
 
     def get_folded_input_shape(self):
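A quick numeric check of the shape and cycle arithmetic above, using a hypothetical non-square configuration:

```python
# hypothetical attribute values, not taken from the diff
idim_h, idim_w = 10, 8        # ImgDim
padding = [1, 3, 2, 3]        # [H_begin, W_begin, H_end, W_end]
channels, simd, batch = 4, 2, 1

pad_h, pad_w = padding[0] + padding[2], padding[1] + padding[3]  # 3, 6
odim_h, odim_w = idim_h + pad_h, idim_w + pad_w                  # 13, 14

# one folded output element per cycle: channels/SIMD words per output pixel
exp_cycles = int((channels / simd) * batch * odim_h * odim_w)
assert (odim_h, odim_w, exp_cycles) == (13, 14, 364)
```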
@@ -148,20 +158,53 @@ class FMPadding_Batch(HLSCustomOp):
         self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
 
     def defines(self, var):
-        self.code_gen_dict["$DEFINES$"] = [
-            """#define ImgDim1 {}\n#define OutputDim1 {}\n
-            #define Padding1 {}\n#define NumChannels1 {}\n
-            #define PaddingStyle1 {}\n#define numReps {}
-            #define SIMD1 {}\n""".format(
-                self.get_nodeattr("ImgDim"),
-                self.get_padded_odim(),
-                self.get_nodeattr("Padding"),
-                self.get_nodeattr("NumChannels"),
-                self.get_nodeattr("PaddingStyle"),
-                self.get_nodeattr("numInputVectors"),
-                self.get_nodeattr("SIMD"),
-            )
-        ]
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        odim_h, odim_w = self.get_padded_odim()
+        pad = self.get_nodeattr("Padding")
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        is_square = idim_h == idim_w
+
+        if is_square:
+            assert (
+                pad_h == pad_w
+            ), "Only equal padding along the dimensions for square images is supported"
+            self.code_gen_dict["$DEFINES$"] = [
+                """#define ImgDim1 {}\n#define OutputDim1 {}\n
+                #define Padding1 {}\n#define NumChannels1 {}\n
+                #define SIMD1 {}\n#define PaddingStyle1 {}\n
+                #define numReps {}\n""".format(
+                    idim_h,
+                    odim_h,
+                    pad_h,
+                    self.get_nodeattr("NumChannels"),
+                    self.get_nodeattr("SIMD"),
+                    self.get_nodeattr("PaddingStyle"),
+                    self.get_nodeattr("numInputVectors"),
+                )
+            ]
+        else:
+            self.code_gen_dict["$DEFINES$"] = [
+                """
+                #define OutputDim1_x {}\n
+                #define OutputDim1_y {}\n
+                #define Padding1_x {}\n
+                #define Padding1_y {}\n
+                #define NumChannels1 {}\n
+                #define SIMD1 {}\n
+                #define PaddingStyle1 {}\n
+                #define numReps {}\n
+                """.format(
+                    odim_w,
+                    odim_h,
+                    pad_w,
+                    pad_h,
+                    self.get_nodeattr("NumChannels"),
+                    self.get_nodeattr("SIMD"),
+                    self.get_nodeattr("PaddingStyle"),
+                    self.get_nodeattr("numInputVectors"),
+                )
+            ]
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
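The square branch keeps the original single-dimension macro set (and therefore asserts symmetric totals, since the hlslib kernel takes one spatial size), while the non-square branch emits separate `_x`/`_y` macros. A condensed sketch of that selection, assuming only the attribute layout shown above:

```python
def select_defines(idim, padding):
    """Condensed, illustrative version of the defines() branch above."""
    idim_h, idim_w = idim
    pad_h, pad_w = padding[0] + padding[2], padding[1] + padding[3]
    if idim_h == idim_w:
        # square kernel: one spatial size, so totals must match per dimension
        assert pad_h == pad_w, "square images need equal total padding"
        return {"ImgDim1": idim_h, "OutputDim1": idim_h + pad_h, "Padding1": pad_h}
    # non-square kernel: x (width) and y (height) parameterized separately
    return {
        "OutputDim1_x": idim_w + pad_w,
        "OutputDim1_y": idim_h + pad_h,
        "Padding1_x": pad_w,
        "Padding1_y": pad_h,
    }
```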
@@ -193,12 +236,26 @@ class FMPadding_Batch(HLSCustomOp):
     def docompute(self):
         in_t = self.get_input_datatype().get_hls_datatype_str()
         node = self.onnx_node
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
-            {}, PaddingStyle1> (in0, out, numReps);""".format(
-                node.op_type, in_t
-            )
-        ]
+
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        is_square = idim_h == idim_w
+
+        if is_square:
+            hls_call = node.op_type
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
+                {}, PaddingStyle1> (in0, out, numReps);""".format(
+                    hls_call, in_t
+                )
+            ]
+        else:
+            hls_call = "FMPadding_nonsquare_Batch"
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<OutputDim1_x, OutputDim1_y, Padding1_x, Padding1_y, NumChannels1,
+                SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format(
+                    hls_call, in_t
+                )
+            ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
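The corresponding dispatch in `docompute()` reduces to a single condition; a sketch, illustrative only, since the real code also wires in the template parameters defined above:

```python
def select_hls_call(idim_h, idim_w, op_type="FMPadding_Batch"):
    # square inputs reuse the node's op_type as the hlslib function name;
    # non-square inputs call the nonsquare variant, which is parameterized
    # by OutputDim1_x/_y and Padding1_x/_y instead of a single dimension
    return op_type if idim_h == idim_w else "FMPadding_nonsquare_Batch"
```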
@@ -270,7 +327,7 @@ class FMPadding_Batch(HLSCustomOp):
         assert (
             inp.shape == exp_ishape
         ), """Input shape doesn't
-        match expected shape (1, ImgDim, ImgDim, NumChannels)."""
+        match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
         export_idt = self.get_input_datatype()
 
         reshaped_input = inp.reshape(folded_ishape)
@@ -316,4 +373,4 @@ class FMPadding_Batch(HLSCustomOp):
         assert (
             context[node.output[0]].shape == exp_oshape
         ), """Output shape doesn't match expected shape
-            (1, OutputDim, OutputDim, NumChannels)."""
+            (1, OutputDim_h, OutputDim_w, NumChannels)."""
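With the new list-valued attributes, a node might be constructed as below (a sketch: the `domain`/`backend` strings follow the usual FINN conventions and the concrete values are invented for illustration):

```python
from onnx import helper

fmpadding = helper.make_node(
    "FMPadding_Batch",
    ["inp"],
    ["outp"],
    domain="finn",
    backend="fpgadataflow",
    ImgDim=[10, 8],              # [H, W]
    Padding=[1, 3, 2, 3],        # [H_begin, W_begin, H_end, W_end]
    NumChannels=2,
    SIMD=2,
    inputDataType="INT4",        # assumed datatype for the example
    PaddingStyle=2,
    numInputVectors=1,
)
```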
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index b2835d578b03ee689330d53a9a7b233c9b9f4222..ab47b300136bd95622f064b30e3bbaae76a61597 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -54,15 +54,20 @@ target_clk_ns = 10
 
 
 def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style):
+    pad_h = padding[0] + padding[2]
+    pad_w = padding[1] + padding[3]
+    idim_h, idim_w = idim
+
     assert pad_style == 2, "only pad_style == 2 supported in hlslib"
-    assert padding > 0, "Output dim should be greater than input dim"
-    odim = idim + padding
+    assert pad_h > 0 or pad_w > 0, "Output dim should be greater than input dim"
+    odim_h = idim_h + pad_h
+    odim_w = idim_w + pad_w
 
     inp = helper.make_tensor_value_info(
-        "inp", TensorProto.FLOAT, [1, idim, idim, num_ch]
+        "inp", TensorProto.FLOAT, [1, idim_h, idim_w, num_ch]
     )
     outp = helper.make_tensor_value_info(
-        "outp", TensorProto.FLOAT, [1, odim, odim, num_ch]
+        "outp", TensorProto.FLOAT, [1, odim_h, odim_w, num_ch]
     )
 
     FMPadding = helper.make_node(
@@ -94,9 +99,9 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
 
 
-# input image dimension
+# input image dimension [H, W]
-@pytest.mark.parametrize("idim", [8])
+@pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
-# number of rows and number of cols to add
+# padding spec [H_begin, W_begin, H_end, W_end]
-@pytest.mark.parametrize("pad", [2, 3])
+@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3]])
 # number of channels
 @pytest.mark.parametrize("num_ch", [2, 4])
 # Input parallelism
@@ -112,10 +117,22 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
 def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
     if num_ch % simd != 0:
         pytest.skip(" num_ch % simd != 0, skipping")
+
+    idim_h, idim_w = idim
+    pad_h = pad[0] + pad[2]
+    pad_w = pad[1] + pad[3]
+
+    if idim_h == idim_w and pad_h != pad_w:
+        pytest.skip(
+            """Only equal padding along the dimensions for square images
+            is supported, skipping"""
+        )
+
     # generate input data
-    x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
+    x = gen_finn_dt_tensor(idt, [1, idim_h, idim_w, num_ch])
     input_dict = {"inp": x}
-    odim = idim + pad
+    odim_h = idim_h + pad_h
+    odim_w = idim_w + pad_w
 
     model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style)
     model = model.transform(InferShapes())
@@ -129,24 +146,26 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
-    expected_oshape = (1, odim, odim, num_ch)
+    expected_oshape = (1, odim_h, odim_w, num_ch)
     assert y_produced.shape == expected_oshape
 
     # calculate reference
     # calculate correct pad according to parameters
     if pad_style == 2:
-        if pad % 2 == 0:
-            pad_up = pad // 2
-            pad_left = pad // 2
+        if pad_h % 2 == 0:
+            pad_up = pad_h // 2
+        else:
+            pad_up = pad_h // 2 + 1
+        if pad_w % 2 == 0:
+            pad_left = pad_w // 2
         else:
-            pad_up = pad // 2 + 1
-            pad_left = pad // 2 + 1
+            pad_left = pad_w // 2 + 1
     else:
-        pad_up = pad // 2
-        pad_left = pad // 2
+        pad_up = pad_h // 2
+        pad_left = pad_w // 2
 
-    pad_down = pad - pad_up
-    pad_right = pad - pad_left
+    pad_down = pad_h - pad_up
+    pad_right = pad_w - pad_left
 
     y_expected = np.pad(
         x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant"
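For concreteness, the `pad_style == 2` split above, applied to `pad = [1, 3, 2, 3]`, resolves as follows (standalone numpy check, mirroring the test's reference computation):

```python
import numpy as np

pad = [1, 3, 2, 3]                               # [H_begin, W_begin, H_end, W_end]
pad_h, pad_w = pad[0] + pad[2], pad[1] + pad[3]  # 3, 6

pad_up = pad_h // 2 + pad_h % 2                  # 2 (odd total: extra row on top)
pad_down = pad_h - pad_up                        # 1
pad_left = pad_w // 2 + pad_w % 2                # 3 (even total: symmetric)
pad_right = pad_w - pad_left                     # 3

x = np.zeros((1, 10, 8, 2))
y = np.pad(x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant")
assert y.shape == (1, 13, 14, 2)
```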