diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index 669419f909f517b0a9589743bb2dc1e4114d79d3..ee59d14c202e3e6b1d33a4bd8bb8b70a36748a9f 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
- * $Date:        5. July 2021
- * $Revision:    V.5.6.0
+ * $Date:        20. July 2021
+ * $Revision:    V.5.7.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -48,6 +48,13 @@ extern "C" {
 #define CLAMP(x, h, l) MAX(MIN((x), (h)), (l))
 #define REDUCE_MULTIPLIER(_mult) ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF)
 
+/** @brief Pack four 8-bit values into one 32-bit word: v0 is the least significant byte, v3 the most significant.
+ *  NOTE(review): negative inputs get left-shifted as int32_t, which ISO C leaves undefined - consider uint8_t casts.
+ */
+#define PACK_Q7x4_32x1(v0, v1, v2, v3)                                                                                 \
+    ((((int32_t)(v0) << 0) & (int32_t)0x000000FF) | (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) |                     \
+     (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | (((int32_t)(v3) << 24) & (int32_t)0xFF000000))
+
 /**
  * @brief Union for SIMD access of q31/q15/q7 types
  */
@@ -539,6 +546,18 @@ __STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
     return (val);
 }
 
+/**
+  @brief         Copy the four bytes of value to the q7 location *in points to, then advance *in by four elements.
+  @param[in,out] in       Double pointer to the write position; *in is incremented by 4 on return
+  @param[in]     value    32-bit word whose four bytes are copied
+  @return        none
+ */
+__STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value)
+{
+    memcpy(*in, &value, 4); /* byte-wise copy: makes no alignment assumption about *in */
+    *in += 4;
+}
+
 /**
  * @brief           memset optimized for MVE
  * @param[in, out]  dst         Destination pointer
diff --git a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
index d62117c789cef5d1b7275a6624b24fb9e6d66aab..1d4ea4e084d31f45289f7d146dc51e258dea44e5 100644
--- a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
+++ b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,7 +21,7 @@
  * Title:        arm_relu_q15.c
  * Description:  Q15 version of ReLU
  *
- * $Date:        09. October 2020
+ * $Date:        20. July 2021
  * $Revision:    V.1.0.2
  *
  * Target Processor:  Cortex-M cores
@@ -54,7 +54,7 @@
 void arm_relu_q15(q15_t *data, uint16_t size)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for M cores with DSP extension */
 
     uint16_t i = size >> 1;
@@ -66,7 +66,7 @@ void arm_relu_q15(q15_t *data, uint16_t size)
 
     while (i)
     {
-        in = read_q15x2_ia(&input);
+        in = arm_nn_read_q15x2_ia((const q15_t **)&input);
 
         /* extract the first bit */
         buf = __ROR(in & 0x80008000, 15);
diff --git a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
index 75be35d995c302dd23cc5e537b968ecfb9165bf5..a3163cddca015489406d5088878589b83c177a26 100644
--- a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
+++ b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_relu_q7.c
  * Description:  Q7 version of ReLU
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.3
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.3
  *
  * Target Processor:  Cortex-M cores
  *
@@ -54,7 +54,7 @@
 void arm_relu_q7(q7_t *data, uint16_t size)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for M cores with DSP extension */
 
     uint16_t i = size >> 2;
@@ -66,7 +66,7 @@ void arm_relu_q7(q7_t *data, uint16_t size)
 
     while (i)
     {
-        in = read_q7x4_ia(&input);
+        in = arm_nn_read_q7x4_ia((const q7_t **)&input);
 
         /* extract the first bit */
         buf = (int32_t)__ROR((uint32_t)in & 0x80808080, 7);
@@ -74,7 +74,7 @@ void arm_relu_q7(q7_t *data, uint16_t size)
         /* if MSB=1, mask will be 0xFF, 0x0 otherwise */
         mask = __QSUB8(0x00000000, buf);
 
-        write_q7x4_ia(&output, in & (~mask));
+        arm_nn_write_q7x4_ia(&output, in & (~mask));
 
         i--;
     }
diff --git a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
index 85740edb87b0352fbda0234e22c18c0f9490ee8e..6bade7ba6cb9143cbb5c76d5a07ac61d44b59f74 100644
--- a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
+++ b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
@@ -21,8 +21,8 @@
  * Title:        arm_elementwise_add_s8
  * Description:  Element wise add
  *
- * $Date:        01. March 2021
- * $Revision:    V.2.5.3
+ * $Date:        20. July 2021
+ * $Revision:    V.2.5.4
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -30,9 +30,6 @@
 
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
-#if defined(ARM_MATH_MVEI)
-#include "arm_helium_utils.h"
-#endif
 
 #if defined(ARM_MATH_MVEI)
 #define SAT_INPUT_VECT(__INPUT_V, __MULT, __SHIFT)                                                                     \
@@ -209,7 +206,7 @@ arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
         sum = MIN(sum, out_activation_max);
         r4 = (q7_t)sum;
 
-        write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
+        arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4));
 
         loop_count--;
     }
diff --git a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
index 7c560fe5c392fb4ac915e20b216d777c7f7b7372..3e3a63b902da1e45292e47c3963c518dd25a779e 100644
--- a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
+++ b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
@@ -21,8 +21,8 @@
  * Title:        arm_elementwise_mul_s8
  * Description:  Element wise multiplication
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.5
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.0.6
  *
  * Target Processor:  Cortex-M cores
  *
@@ -163,7 +163,7 @@ arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
         mul_res = MIN(mul_res, out_activation_max);
         r4 = (q7_t)mul_res;
 
-        write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
+        arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4));
 
         loop_count--;
     }
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
index 6418707f1ca5c4499fb6000eb4afdce7b97f980d..3db3ba4c6c515eda9d716174d52ce2512ffaa841 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_1x1_HWC_q7_fast_nonsquare.c
  * Description:  Fast Q7 version of 1x1 convolution (non-square shape)
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -100,7 +100,7 @@ arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                                   q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
     (void)dim_im_in_y;
     int16_t i_out_y, i_out_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
index e3502ebf4acde9420218c982e8e1320ac47218f0..0a6868a2181e7f25ef4a8abd80c9ab65d21bea91 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q15_basic.c
  * Description:  Q15 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -88,7 +88,7 @@ arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
                                       q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
index ac007e4a986553478e71857d0340b528b99edba0..6a32229454f979cf387fe4a880d594979a0d3f99 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q15_fast.c
  * Description:  Fast Q15 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -93,7 +93,7 @@ arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
                                      q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
 
     q15_t *pBuffer = bufferA;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
index 27947e8480b8c6637cfdbdd49a79bf4befbd9d7a..7babe51eaeb034d866b16eb0ed433a47989574cd 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q15_fast.c
  * Description:  Fast Q15 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -103,7 +103,7 @@ arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
                                                q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
 
     q15_t *pBuffer = bufferA;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
index 46e9a77882c7a39d3e744fbd108c259f58fb23a6..618f4923023ddf21e5ccb5d63b0ad8bb3e590ec6 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_RGB.c
  * Description:  Q7 version of convolution for RGB image
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -94,7 +94,7 @@ arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                                    q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
 
@@ -122,8 +122,7 @@ arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                     if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                     {
                         /* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */
-                        *__SIMD32(pBuffer) = 0x0;
-                        *(pBuffer + 2) = 0;
+                        arm_memset_q7((q7_t *)pBuffer, (q7_t)0, 3 * sizeof(q15_t));
                         pBuffer += 3;
                     }
                     else
@@ -155,7 +154,8 @@ arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                          *  version 2, no weight shuffling required
                          */
                         *pBuffer++ = top.half_words[0];
-                        *__SIMD32(pBuffer) = __PKHBT(bottom.word, top.word, 0);
+                        int32_t packed_word = __PKHBT(bottom.word, top.word, 0);
+                        arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
 #else
                         /*
                          *  big-endian,    | 1st  | 2nd  | 3rd  | omit |
@@ -169,7 +169,8 @@ arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                          *  version 2, no weight shuffling required
                          */
                         *pBuffer++ = bottom.half_words[0];
-                        *__SIMD32(pBuffer) = __PKHTB(top.word, bottom.word, 0);
+                        int32_t packed_word = __PKHTB(top.word, bottom.word, 0);
+                        arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
 #endif
                         pBuffer += 2;
                     }
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
index 942682e097dd544e493f7658f86f437492fec5f3..e274413addcc932b403bb04d4e5b542076391326 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_basic.c
  * Description:	 Q7 version of convolution
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -88,7 +88,7 @@ arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
                                      q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
@@ -182,7 +182,7 @@ arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
     }
 #else
     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
-
+    (void)bufferA;
     int i, j, k, l, m, n;
     int conv_out;
     int in_row, in_col;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
index cd9f78fd01cd6d32a625593e0440d88b7c31428f..b42a57dca89f812b541e2bcf81a9288534a5fabd 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_basic.c
  * Description:	 Q7 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -87,7 +87,7 @@ arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
                                                q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
index bd9959f2c25930eecac0d45a3430887443893616..51d98fd85f114ca257987aedc475001a49d2abed 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_fast.c
  * Description:  Fast Q7 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -105,7 +105,7 @@ arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
                                     q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
index 6ad061b10f7f88cb6a34869d2f153ac709841f60..25f17bb45b93de772f2dc73b37aa993ef886f6a2 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_fast_nonsquare.c
  * Description:  Fast Q7 version of convolution (non-sqaure shape)
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -93,7 +93,7 @@ arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                               q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
index de0ef8fec6b35cfb13336e7bddeb0387f662d439..729147fdc39e48212398ed76b7b9eb80e8a61074 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
@@ -21,8 +21,8 @@
  * Title:        arm_depthwise_separable_conv_HWC_q7.c
  * Description:  Q7 depthwise separable convolution function
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -96,7 +96,7 @@ arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
                                                q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
index 9cf89b30319a4713ad575cbb28f7e5224673d68b..829acf9006bcebb02698e1ced127e9e89f5406f4 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_depthwise_separable_conv_HWC_q7_nonsquare.c
  * Description:  Q7 depthwise separable convolution function (non-square shape)
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -95,7 +95,7 @@ arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
 
     (void)bufferB;
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     /*
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
index fa9f775b22af2d2c8de2ff4a08763ff6b99d95d6..9eb02ebe610838d50abe9820a6f4b16c9917d058 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_mat_q7_vec_q15.c
  * Description:  Mixed Q15-Q7 fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -76,7 +76,7 @@ arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV,
                                               q15_t *vec_buffer)
 {
     (void)vec_buffer;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q7_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
index 2826ac5f692e37b5c980834c5308e7f1507a02cb..a2da77298615aa2c9dda307a65372c6a86aaeee7 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_mat_q7_vec_q15_opt.c
  * Description:  Mixed Q15-Q7 opt fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -125,7 +125,7 @@ arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV,
 {
 
     (void)vec_buffer;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q7_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c
index 67d70ec12c1b7ed857952648131e35f92534f1e7..d8b6887b582b2bbce329650a1889e5f5eb105209 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_q15.c
  * Description:  Q15 basic fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -73,7 +73,7 @@ arm_status arm_fully_connected_q15(const q15_t *pV,
                                    q15_t *vec_buffer)
 {
     (void)vec_buffer;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q15_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c
index 9de8618251de94bd6248a2f2af0cd2767e7a9f52..f6c9b16999692142b21be4573f6d7e4252b08ea3 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_q15_opt.c
  * Description:  Q15 opt fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -104,7 +104,7 @@ arm_status arm_fully_connected_q15_opt(const q15_t *pV,
                                        q15_t *vec_buffer)
 {
     (void)vec_buffer;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q15_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
index 178102dac6df0fa0b5ebdc94fa183356a6d7b69e..d500efe9d7447cb6a13d20fc243c0bfb24476a29 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_q7.c
  * Description:  Q7 basic fully-connected layer function
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -75,7 +75,7 @@ arm_status arm_fully_connected_q7(const q7_t *pV,
                                   q15_t *vec_buffer)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q7_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
index 77c338636782ad57db333d2fea38dc6523b65a89..2f3d6539e62c6c27cfcea68411df18452bc6df3e 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_q7_opt.c
  * Description:  Q7 basic fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -136,7 +136,7 @@ arm_status arm_fully_connected_q7_opt(const q7_t *pV,
                                       q15_t *vec_buffer)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q7_t *pB = pM;
@@ -382,6 +382,7 @@ arm_status arm_fully_connected_q7_opt(const q7_t *pV,
 
 #else
     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+    (void)vec_buffer;
     uint16_t rowCnt = num_of_rows >> 2;
     const q7_t *pB = pM;
     const q7_t *pA;
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
index 82c295281d8131d75c74880a2e79634f097dd766..c3f666aa2c0808133e2ca858f9a3cdaca08596b2 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_accumulate_q7_to_q15.c
  * Description:  Accumulate q7 vector into q15 one.
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.2
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.2
  *
  * pSrc Processor:  Cortex-M CPUs
  *
@@ -44,11 +44,13 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
 {
     q15_t *pCnt = pDst;
     const q7_t *pV = pSrc;
+    int32_t count = length;
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     q31_t v1, v2, vo1, vo2;
-    int32_t cnt = length >> 2;
+    count = length >> 2;
     q31_t in;
 
-    while (cnt > 0l)
+    while (count > 0l)
     {
         q31_t value = arm_nn_read_q7x4_ia(&pV);
         v1 = __SXTB16(__ROR((uint32_t)value, 8));
@@ -67,13 +69,14 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
         in = arm_nn_read_q15x2(pCnt);
         arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));
 
-        cnt--;
+        count--;
     }
-    cnt = length & 0x3;
-    while (cnt > 0l)
+    count = length & 0x3;
+#endif
+    while (count > 0l)
     {
         *pCnt++ += *pV++;
-        cnt--;
+        count--;
     }
 }
 
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c
index 86cf5475c335ccb9e506245636b086e30f71c6c6..511e58633b4c4fb1cbae75aeb585acca252d3545 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c
@@ -21,8 +21,8 @@
  * Title:        arm_nn_add_q7.c
  * Description:  Non saturating addition of elements of a q7 vector.
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -44,7 +44,7 @@ void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size)
 {
     uint32_t block_count;
     q31_t result = 0;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Loop unrolling: Compute 4 outputs at a time */
     block_count = block_size >> 2U;
 
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
index 6c54618bb00cbd4acf6b8cc779d36763ad44edea..d6a45efe48cf6219d8e748da880e147b8af9212f 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_mult_q15.c
  * Description:  Q15 vector multiplication with variable output shifts
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.2
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -55,72 +55,7 @@
 
 void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize)
 {
-    uint32_t blkCnt; /* loop counters */
-
-#if defined(ARM_MATH_DSP)
-
-    /* Run the below code for Cortex-M4 and Cortex-M3 */
-    q31_t inA1, inA2, inB1, inB2; /* temporary input variables */
-    q15_t out1, out2, out3, out4; /* temporary output variables */
-    q31_t mul1, mul2, mul3, mul4; /* temporary variables */
-
-    /* loop Unrolling */
-    blkCnt = blockSize >> 2U;
-
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
-    while (blkCnt > 0U)
-    {
-        /* read two samples at a time from sourceA */
-        inA1 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcA);
-        /* read two samples at a time from sourceB */
-        inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcB);
-        /* read two samples at a time from sourceA */
-        inA2 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcA);
-        /* read two samples at a time from sourceB */
-        inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcB);
-
-        /* multiply mul = sourceA * sourceB */
-        mul1 = (q31_t)((q15_t)(inA1 >> 16) * (q15_t)(inB1 >> 16));
-        mul2 = (q31_t)((q15_t)inA1 * (q15_t)inB1);
-        mul3 = (q31_t)((q15_t)(inA2 >> 16) * (q15_t)(inB2 >> 16));
-        mul4 = (q31_t)((q15_t)inA2 * (q15_t)inB2);
-
-        /* saturate result to 16 bit */
-        out1 = (q15_t)__SSAT((q31_t)(mul1 + NN_ROUND(out_shift)) >> out_shift, 16);
-        out2 = (q15_t)__SSAT((q31_t)(mul2 + NN_ROUND(out_shift)) >> out_shift, 16);
-        out3 = (q15_t)__SSAT((q31_t)(mul3 + NN_ROUND(out_shift)) >> out_shift, 16);
-        out4 = (q15_t)__SSAT((q31_t)(mul4 + NN_ROUND(out_shift)) >> out_shift, 16);
-
-        /* store the result */
-#ifndef ARM_MATH_BIG_ENDIAN
-
-        *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
-        *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
-#else
-
-        *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
-        *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* Decrement the blockSize loop counter */
-        blkCnt--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    blkCnt = blockSize % 0x4U;
-
-#else
-
-    /* Run the below code for Cortex-M0 */
-
-    /* Initialize blkCnt with number of samples */
-    blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+    uint32_t blkCnt = blockSize; /* loop counter */
 
     while (blkCnt > 0U)
     {
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
index 40dd1cdad0e4e54fcf9434432107134e42f85e04..fdced4cf58388c17b41c463fd67eca1e70e2db88 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_mult_q7.c
  * Description:  Q7 vector multiplication with variable output shifts
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.2
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -55,46 +55,7 @@
 
 void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize)
 {
-    uint32_t blkCnt; /* loop counters */
-
-#if defined(ARM_MATH_DSP)
-
-    /* Run the below code for Cortex-M4 and Cortex-M3 */
-    q7_t out1, out2, out3, out4; /* Temporary variables to store the product */
-
-    /* loop Unrolling */
-    blkCnt = blockSize >> 2U;
-
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
-    while (blkCnt > 0U)
-    {
-        /* C = A * B */
-        /* Multiply the inputs and store the results in temporary variables */
-        out1 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
-        out2 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
-        out3 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
-        out4 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
-
-        /* Store the results of 4 inputs in the destination buffer in single cycle by packing */
-        *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
-
-        /* Decrement the blockSize loop counter */
-        blkCnt--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    blkCnt = blockSize % 0x4U;
-
-#else
-
-    /* Run the below code for Cortex-M0 */
-
-    /* Initialize blkCnt with number of samples */
-    blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+    uint32_t blkCnt = blockSize; /* loop counter */
 
     while (blkCnt > 0U)
     {
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
index 9017970df1a242c6a7c0629daa911d4099cf7abd..8abbc3a5afe1e9ec761773d76cd61c638655745e 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_q7_to_q15_reordered_no_shift.c
  * Description:  Converts the elements of the Q7 vector to reordered Q15 vector without left-shift
  *
- * $Date:        May 29, 2020
- * $Revision:    V.1.0.1
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -79,7 +79,7 @@ void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t bl
     const q7_t *pIn = pSrc; /* Src pointer */
     uint32_t blkCnt;        /* loop counter */
 
-#ifndef ARM_MATH_CM0_FAMILY
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     q31_t in;
     q31_t in1, in2;
 
@@ -103,11 +103,11 @@ void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t bl
         in2 = __SXTB16(in);
 
 #ifndef ARM_MATH_BIG_ENDIAN
-        *__SIMD32(pDst)++ = in2;
-        *__SIMD32(pDst)++ = in1;
+        arm_nn_write_q15x2_ia(&pDst, in2);
+        arm_nn_write_q15x2_ia(&pDst, in1);
 #else
-        *__SIMD32(pDst)++ = in1;
-        *__SIMD32(pDst)++ = in2;
+        arm_nn_write_q15x2_ia(&pDst, in1);
+        arm_nn_write_q15x2_ia(&pDst, in2);
 #endif
 
         /* Decrement the loop counter */
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
index 36163667ba83533132ddb364e30aed55a1ef98c4..78e6fca03fd28895446ef5990a8df8b70e45dd5f 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_max_pool_s8.c
  * Description:  Pooling function implementations
  *
- * $Date:        19. Februari 2021
- * $Revision:    V.2.0.2
+ * $Date:        20. July 2021
+ * $Revision:    V.2.0.3
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -75,7 +75,7 @@ static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int
             ref_max.bytes[3] = comp_max.bytes[3];
         }
 
-        write_q7x4_ia(&dst, ref_max.word);
+        arm_nn_write_q7x4_ia(&dst, ref_max.word);
 
         cnt--;
     }
@@ -127,7 +127,7 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co
         in.bytes[3] = MAX(in.bytes[3], act_min);
         in.bytes[3] = MIN(in.bytes[3], act_max);
 
-        write_q7x4_ia(&source, in.word);
+        arm_nn_write_q7x4_ia(&source, in.word);
         cnt--;
     }
 
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
index 7546049cecf74570bd8559ee481297d1dd7128f7..5a3b1afd36741f693c52e70654c3792568a1be2e 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_pool_q7_HWC.c
  * Description:  Pooling function implementations
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -31,7 +31,7 @@
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
 
 /**
  * @brief A few utility functions used by pooling functions
@@ -75,7 +75,7 @@ static void compare_and_replace_if_larger_q7(q7_t *base,           // base data
         if (com.bytes[3] > in.bytes[3])
             in.bytes[3] = com.bytes[3];
 
-        *__SIMD32(pIn)++ = in.word;
+        arm_nn_write_q7x4_ia(&pIn, in.word);
 
         cnt--;
     }
@@ -119,10 +119,10 @@ static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t lengt
 #endif
 
         in = arm_nn_read_q15x2(pCnt);
-        *__SIMD32(pCnt)++ = __QADD16(vo1, in);
+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in));
 
         in = arm_nn_read_q15x2(pCnt);
-        *__SIMD32(pCnt)++ = __QADD16(vo2, in);
+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));
 
         cnt--;
     }
@@ -178,7 +178,7 @@ void arm_maxpool_q7_HWC(q7_t *Im_in,
                         q7_t *Im_out)
 {
     (void)bufferA;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_x, i_y;
@@ -334,7 +334,7 @@ void arm_avepool_q7_HWC(q7_t *Im_in,
                         q7_t *Im_out)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     q15_t *buffer = (q15_t *)bufferA;
diff --git a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
index 2c426bd75935123a0e2462a4c649a56cca8d8760..ffd8f8fc8a50d72b93c9982a07a7f44f314d6ea6 100644
--- a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
@@ -26,6 +26,7 @@ add_compile_options(-O0
                     -Werror
                     -Wimplicit-function-declaration
                     -Wunused-variable
+                    -Wunused-function
                     -Wno-redundant-decls)
 
 option(BUILD_CMSIS_NN_UNIT "If building the unit tests from another project, i.e. \