diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 51b831f758dc8573b62d0765c97a50025c6d1e6c..ffea9f4c26f532b189c477d2e8afad9cb8352ba7 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -107,7 +107,6 @@ class ConvolutionInputGenerator(HLSCustomOp):
         pad = 0
         ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        assert k % stride == 0, "stride must divide kernel size k"
         wf = int((k * k * ifm_ch) // simd)
         folded_oshape = (1, ofm_dim, ofm_dim, wf, simd)
         return folded_oshape
@@ -314,18 +313,25 @@ class ConvolutionInputGenerator(HLSCustomOp):
             "ultra": "ap_resource_uram()",
         }
         hls_ram_style = map_to_hls_ram_style[ram_style]
+        hls_call = node.op_type
+        # check if non optimized ConvolutionInputGenerator is needed
+        k = self.get_nodeattr("ConvKernelDim")
+        stride = self.get_nodeattr("Stride")
+        if k % stride != 0:
+            hls_call += "_kernel_stride"
+
         if self.get_nodeattr("depthwise") == 1:
             self.code_gen_dict["$DOCOMPUTE$"] = [
                 """{}_dws<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
                     OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format(
-                    node.op_type, hls_ram_style
+                    hls_call, hls_ram_style
                 )
             ]
         else:
             self.code_gen_dict["$DOCOMPUTE$"] = [
                 """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
                     OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format(
-                    node.op_type, hls_ram_style
+                    hls_call, hls_ram_style
                 )
             ]
 
diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py
index 16446c15d46ee7996162f864708f7fde6cfedaf3..1ac2dad677f76b8f2aca1a04d96f4ae379940e9a 100644
--- a/src/finn/custom_op/im2col.py
+++ b/src/finn/custom_op/im2col.py
@@ -21,8 +21,6 @@ def get_im2col_indices_nchw(
     """Returns im2col indices."""
     # First figure out what the size of the output should be
     N, C, H, W = x_shape
-    assert (H + 2 * padding - field_height) % stride_y == 0
-    assert (W + 2 * padding - field_width) % stride_x == 0
     out_height = compute_conv_output_dim(H, field_height, stride_y, padding)
     out_width = compute_conv_output_dim(W, field_width, stride_x, padding)
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 2b7201f08f7af03711d71d1f9dd12ac8b365c0f6..86511554ea7411a556bf1af0718af5e30a99841d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -127,7 +127,7 @@ def prepare_inputs(input_tensor):
 # input datatype
 @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
 # kernel size
-@pytest.mark.parametrize("k", [2, 4])
+@pytest.mark.parametrize("k", [2, 3, 4])
 # input dimension
 @pytest.mark.parametrize("ifm_dim", [6, 8])
 # input channels