From 68e9b60133bf9278fcd9763c41c82ccc88823ae2 Mon Sep 17 00:00:00 2001
From: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Date: Tue, 20 Jul 2021 06:49:51 +0200
Subject: [PATCH] CMSIS-NN: Cleanup of CMSIS-DSP dependencies

Leftover CMSIS-DSP dependencies are removed.

For functions that only have a DSP extension optimization,
the reference C implementation (if available) is picked
for processors with Helium Technology, as auto-vectorization
can potentially give better performance. This affects only
the non-TFLM functions.

Change-Id: I8ededad1d34eeb27c1ac2d65b3250b85562cc0d4
---
 CMSIS/NN/Include/arm_nnsupportfunctions.h     | 23 +++++-
 .../Source/ActivationFunctions/arm_relu_q15.c |  8 +-
 .../Source/ActivationFunctions/arm_relu_q7.c  | 12 +--
 .../arm_elementwise_add_s8.c                  |  9 +--
 .../arm_elementwise_mul_s8.c                  |  6 +-
 .../arm_convolve_1x1_HWC_q7_fast_nonsquare.c  |  6 +-
 .../arm_convolve_HWC_q15_basic.c              |  6 +-
 .../arm_convolve_HWC_q15_fast.c               |  6 +-
 .../arm_convolve_HWC_q15_fast_nonsquare.c     |  6 +-
 .../arm_convolve_HWC_q7_RGB.c                 | 15 ++--
 .../arm_convolve_HWC_q7_basic.c               |  8 +-
 .../arm_convolve_HWC_q7_basic_nonsquare.c     |  6 +-
 .../arm_convolve_HWC_q7_fast.c                |  6 +-
 .../arm_convolve_HWC_q7_fast_nonsquare.c      |  6 +-
 .../arm_depthwise_separable_conv_HWC_q7.c     |  6 +-
 ...epthwise_separable_conv_HWC_q7_nonsquare.c |  6 +-
 .../arm_fully_connected_mat_q7_vec_q15.c      |  6 +-
 .../arm_fully_connected_mat_q7_vec_q15_opt.c  |  6 +-
 .../arm_fully_connected_q15.c                 |  6 +-
 .../arm_fully_connected_q15_opt.c             |  8 +-
 .../arm_fully_connected_q7.c                  |  6 +-
 .../arm_fully_connected_q7_opt.c              |  9 ++-
 .../arm_nn_accumulate_q7_to_q15.c             | 21 +++---
 .../Source/NNSupportFunctions/arm_nn_add_q7.c |  6 +-
 .../NNSupportFunctions/arm_nn_mult_q15.c      | 73 +------------------
 .../NNSupportFunctions/arm_nn_mult_q7.c       | 47 +-----------
 .../arm_q7_to_q15_reordered_no_shift.c        | 16 ++--
 .../Source/PoolingFunctions/arm_max_pool_s8.c | 10 +--
 .../Source/PoolingFunctions/arm_pool_q7_HWC.c | 18 ++---
 CMSIS/NN/Tests/UnitTest/CMakeLists.txt        |  1 +
 30 files changed, 143 insertions(+), 225 deletions(-)

diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index 669419f90..ee59d14c2 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
- * $Date:        5. July 2021
- * $Revision:    V.5.6.0
+ * $Date:        20. July 2021
+ * $Revision:    V.5.7.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -48,6 +48,13 @@ extern "C" {
 #define CLAMP(x, h, l) MAX(MIN((x), (h)), (l))
 #define REDUCE_MULTIPLIER(_mult) ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF)
 
+/**
+ * @brief Pack four 8-bit values into one 32-bit word: v0 in bits 0-7, v1 in 8-15, v2 in 16-23, v3 in 24-31.
+ */
+#define PACK_Q7x4_32x1(v0, v1, v2, v3)                                                                                 \
+    ((((int32_t)(v0) << 0) & (int32_t)0x000000FF) | (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) |                     \
+     (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | (((int32_t)(v3) << 24) & (int32_t)0xFF000000))
+
 /**
  * @brief Union for SIMD access of q31/q15/q7 types
  */
@@ -539,6 +546,18 @@ __STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
     return (val);
 }
 
+/**
+  @brief         Copy four bytes to a q7 destination and advance the destination pointer afterwards.
+  @param[in,out] in       Double pointer to the destination; *in is incremented by 4 on return
+  @param[in]     value    32-bit word whose four bytes are copied to the destination
+  @return        none
+ */
+__STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value)
+{
+    memcpy(*in, &value, 4);
+    *in += 4;
+}
+
 /**
  * @brief           memset optimized for MVE
  * @param[in, out]  dst         Destination pointer
diff --git a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
index d62117c78..1d4ea4e08 100644
--- a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
+++ b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,7 +21,7 @@
  * Title:        arm_relu_q15.c
  * Description:  Q15 version of ReLU
  *
- * $Date:        09. October 2020
+ * $Date:        20. July 2021
  * $Revision:    V.1.0.2
  *
  * Target Processor:  Cortex-M cores
@@ -54,7 +54,7 @@
 void arm_relu_q15(q15_t *data, uint16_t size)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for M cores with DSP extension */
 
     uint16_t i = size >> 1;
@@ -66,7 +66,7 @@ void arm_relu_q15(q15_t *data, uint16_t size)
 
     while (i)
     {
-        in = read_q15x2_ia(&input);
+        in = arm_nn_read_q15x2_ia((const q15_t **)&input);
 
         /* extract the first bit */
         buf = __ROR(in & 0x80008000, 15);
diff --git a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
index 75be35d99..a3163cddc 100644
--- a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
+++ b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_relu_q7.c
  * Description:  Q7 version of ReLU
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.3
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.3
  *
  * Target Processor:  Cortex-M cores
  *
@@ -54,7 +54,7 @@
 void arm_relu_q7(q7_t *data, uint16_t size)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for M cores with DSP extension */
 
     uint16_t i = size >> 2;
@@ -66,7 +66,7 @@ void arm_relu_q7(q7_t *data, uint16_t size)
 
     while (i)
     {
-        in = read_q7x4_ia(&input);
+        in = arm_nn_read_q7x4_ia((const q7_t **)&input);
 
         /* extract the first bit */
         buf = (int32_t)__ROR((uint32_t)in & 0x80808080, 7);
@@ -74,7 +74,7 @@ void arm_relu_q7(q7_t *data, uint16_t size)
         /* if MSB=1, mask will be 0xFF, 0x0 otherwise */
         mask = __QSUB8(0x00000000, buf);
 
-        write_q7x4_ia(&output, in & (~mask));
+        arm_nn_write_q7x4_ia(&output, in & (~mask));
 
         i--;
     }
diff --git a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
index 85740edb8..6bade7ba6 100644
--- a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
+++ b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
@@ -21,8 +21,8 @@
  * Title:        arm_elementwise_add_s8
  * Description:  Element wise add
  *
- * $Date:        01. March 2021
- * $Revision:    V.2.5.3
+ * $Date:        20. July 2021
+ * $Revision:    V.2.5.4
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -30,9 +30,6 @@
 
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
-#if defined(ARM_MATH_MVEI)
-#include "arm_helium_utils.h"
-#endif
 
 #if defined(ARM_MATH_MVEI)
 #define SAT_INPUT_VECT(__INPUT_V, __MULT, __SHIFT)                                                                     \
@@ -209,7 +206,7 @@ arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
         sum = MIN(sum, out_activation_max);
         r4 = (q7_t)sum;
 
-        write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
+        arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4));
 
         loop_count--;
     }
diff --git a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
index 7c560fe5c..3e3a63b90 100644
--- a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
+++ b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
@@ -21,8 +21,8 @@
  * Title:        arm_elementwise_mul_s8
  * Description:  Element wise multiplication
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.5
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.0.6
  *
  * Target Processor:  Cortex-M cores
  *
@@ -163,7 +163,7 @@ arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
         mul_res = MIN(mul_res, out_activation_max);
         r4 = (q7_t)mul_res;
 
-        write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
+        arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4));
 
         loop_count--;
     }
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
index 6418707f1..3db3ba4c6 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_1x1_HWC_q7_fast_nonsquare.c
  * Description:  Fast Q7 version of 1x1 convolution (non-square shape)
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -100,7 +100,7 @@ arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                                   q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
     (void)dim_im_in_y;
     int16_t i_out_y, i_out_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
index e3502ebf4..0a6868a21 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q15_basic.c
  * Description:  Q15 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -88,7 +88,7 @@ arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
                                       q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
index ac007e4a9..6a3222945 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q15_fast.c
  * Description:  Fast Q15 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -93,7 +93,7 @@ arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
                                      q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
 
     q15_t *pBuffer = bufferA;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
index 27947e848..7babe51ea 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q15_fast.c
  * Description:  Fast Q15 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -103,7 +103,7 @@ arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
                                                q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
 
     q15_t *pBuffer = bufferA;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
index 46e9a7788..618f49230 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_RGB.c
  * Description:  Q7 version of convolution for RGB image
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -94,7 +94,7 @@ arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                                    q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
 
@@ -122,8 +122,7 @@ arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                     if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                     {
                         /* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */
-                        *__SIMD32(pBuffer) = 0x0;
-                        *(pBuffer + 2) = 0;
+                        arm_memset_q7((q7_t *)pBuffer, (q7_t)0, 3 * sizeof(q15_t));
                         pBuffer += 3;
                     }
                     else
@@ -155,7 +154,8 @@ arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                          *  version 2, no weight shuffling required
                          */
                         *pBuffer++ = top.half_words[0];
-                        *__SIMD32(pBuffer) = __PKHBT(bottom.word, top.word, 0);
+                        int32_t packed_word = __PKHBT(bottom.word, top.word, 0);
+                        arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
 #else
                         /*
                          *  big-endian,    | 1st  | 2nd  | 3rd  | omit |
@@ -169,7 +169,8 @@ arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                          *  version 2, no weight shuffling required
                          */
                         *pBuffer++ = bottom.half_words[0];
-                        *__SIMD32(pBuffer) = __PKHTB(top.word, bottom.word, 0);
+                        int32_t packed_word = __PKHTB(top.word, bottom.word, 0);
+                        arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
 #endif
                         pBuffer += 2;
                     }
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
index 942682e09..e274413ad 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_basic.c
  * Description:	 Q7 version of convolution
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -88,7 +88,7 @@ arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
                                      q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
@@ -182,7 +182,7 @@ arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
     }
 #else
     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
-
+    (void)bufferA;
     int i, j, k, l, m, n;
     int conv_out;
     int in_row, in_col;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
index cd9f78fd0..b42a57dca 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_basic.c
  * Description:	 Q7 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -87,7 +87,7 @@ arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
                                                q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
index bd9959f2c..51d98fd85 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_fast.c
  * Description:  Fast Q7 version of convolution
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -105,7 +105,7 @@ arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
                                     q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
index 6ad061b10..25f17bb45 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_convolve_HWC_q7_fast_nonsquare.c
  * Description:  Fast Q7 version of convolution (non-sqaure shape)
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -93,7 +93,7 @@ arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                               q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
index de0ef8fec..729147fdc 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c
@@ -21,8 +21,8 @@
  * Title:        arm_depthwise_separable_conv_HWC_q7.c
  * Description:  Q7 depthwise separable convolution function
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -96,7 +96,7 @@ arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
                                                q7_t *bufferB)
 {
     (void)bufferB;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_out_y, i_out_x;
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
index 9cf89b303..829acf900 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c
@@ -21,8 +21,8 @@
  * Title:        arm_depthwise_separable_conv_HWC_q7_nonsquare.c
  * Description:  Q7 depthwise separable convolution function (non-square shape)
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -95,7 +95,7 @@ arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
 
     (void)bufferB;
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     /*
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
index fa9f775b2..9eb02ebe6 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_mat_q7_vec_q15.c
  * Description:  Mixed Q15-Q7 fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -76,7 +76,7 @@ arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV,
                                               q15_t *vec_buffer)
 {
     (void)vec_buffer;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q7_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
index 2826ac5f6..a2da77298 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_mat_q7_vec_q15_opt.c
  * Description:  Mixed Q15-Q7 opt fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -125,7 +125,7 @@ arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV,
 {
 
     (void)vec_buffer;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q7_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c
index 67d70ec12..d8b6887b5 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_q15.c
  * Description:  Q15 basic fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -73,7 +73,7 @@ arm_status arm_fully_connected_q15(const q15_t *pV,
                                    q15_t *vec_buffer)
 {
     (void)vec_buffer;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q15_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c
index 9de861825..f6c9b1699 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_q15_opt.c
  * Description:  Q15 opt fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -104,7 +104,7 @@ arm_status arm_fully_connected_q15_opt(const q15_t *pV,
                                        q15_t *vec_buffer)
 {
     (void)vec_buffer;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q15_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
index 178102dac..d500efe9d 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_q7.c
  * Description:  Q7 basic fully-connected layer function
  *
- * $Date:        January 26, 2021
- * $Revision:    V.1.0.2
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -75,7 +75,7 @@ arm_status arm_fully_connected_q7(const q7_t *pV,
                                   q15_t *vec_buffer)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q7_t *pB = pM;
diff --git a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
index 77c338636..2f3d6539e 100644
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_q7_opt.c
  * Description:  Q7 basic fully-connected layer function
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -136,7 +136,7 @@ arm_status arm_fully_connected_q7_opt(const q7_t *pV,
                                       q15_t *vec_buffer)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     const q7_t *pB = pM;
@@ -382,6 +382,7 @@ arm_status arm_fully_connected_q7_opt(const q7_t *pV,
 
 #else
     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+    (void)vec_buffer;
     uint16_t rowCnt = num_of_rows >> 2;
     const q7_t *pB = pM;
     const q7_t *pA;
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
index 82c295281..c3f666aa2 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_accumulate_q7_to_q15.c
  * Description:  Accumulate q7 vector into q15 one.
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.2
+ * $Date:        20 July 2021
+ * $Revision:    V.1.1.2
  *
  * pSrc Processor:  Cortex-M CPUs
  *
@@ -44,11 +44,13 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
 {
     q15_t *pCnt = pDst;
     const q7_t *pV = pSrc;
+    int32_t count = length;
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     q31_t v1, v2, vo1, vo2;
-    int32_t cnt = length >> 2;
+    count = length >> 2;
     q31_t in;
 
-    while (cnt > 0l)
+    while (count > 0l)
     {
         q31_t value = arm_nn_read_q7x4_ia(&pV);
         v1 = __SXTB16(__ROR((uint32_t)value, 8));
@@ -67,13 +69,14 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
         in = arm_nn_read_q15x2(pCnt);
         arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));
 
-        cnt--;
+        count--;
     }
-    cnt = length & 0x3;
-    while (cnt > 0l)
+    count = length & 0x3;
+#endif
+    while (count > 0l)
     {
         *pCnt++ += *pV++;
-        cnt--;
+        count--;
     }
 }
 
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c
index 86cf5475c..511e58633 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c
@@ -21,8 +21,8 @@
  * Title:        arm_nn_add_q7.c
  * Description:  Non saturating addition of elements of a q7 vector.
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -44,7 +44,7 @@ void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size)
 {
     uint32_t block_count;
     q31_t result = 0;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Loop unrolling: Compute 4 outputs at a time */
     block_count = block_size >> 2U;
 
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
index 6c54618bb..d6a45efe4 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_mult_q15.c
  * Description:  Q15 vector multiplication with variable output shifts
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.2
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -55,72 +55,7 @@
 
 void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize)
 {
-    uint32_t blkCnt; /* loop counters */
-
-#if defined(ARM_MATH_DSP)
-
-    /* Run the below code for Cortex-M4 and Cortex-M3 */
-    q31_t inA1, inA2, inB1, inB2; /* temporary input variables */
-    q15_t out1, out2, out3, out4; /* temporary output variables */
-    q31_t mul1, mul2, mul3, mul4; /* temporary variables */
-
-    /* loop Unrolling */
-    blkCnt = blockSize >> 2U;
-
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
-    while (blkCnt > 0U)
-    {
-        /* read two samples at a time from sourceA */
-        inA1 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcA);
-        /* read two samples at a time from sourceB */
-        inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcB);
-        /* read two samples at a time from sourceA */
-        inA2 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcA);
-        /* read two samples at a time from sourceB */
-        inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pSrcB);
-
-        /* multiply mul = sourceA * sourceB */
-        mul1 = (q31_t)((q15_t)(inA1 >> 16) * (q15_t)(inB1 >> 16));
-        mul2 = (q31_t)((q15_t)inA1 * (q15_t)inB1);
-        mul3 = (q31_t)((q15_t)(inA2 >> 16) * (q15_t)(inB2 >> 16));
-        mul4 = (q31_t)((q15_t)inA2 * (q15_t)inB2);
-
-        /* saturate result to 16 bit */
-        out1 = (q15_t)__SSAT((q31_t)(mul1 + NN_ROUND(out_shift)) >> out_shift, 16);
-        out2 = (q15_t)__SSAT((q31_t)(mul2 + NN_ROUND(out_shift)) >> out_shift, 16);
-        out3 = (q15_t)__SSAT((q31_t)(mul3 + NN_ROUND(out_shift)) >> out_shift, 16);
-        out4 = (q15_t)__SSAT((q31_t)(mul4 + NN_ROUND(out_shift)) >> out_shift, 16);
-
-        /* store the result */
-#ifndef ARM_MATH_BIG_ENDIAN
-
-        *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
-        *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
-#else
-
-        *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
-        *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
-
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* Decrement the blockSize loop counter */
-        blkCnt--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    blkCnt = blockSize % 0x4U;
-
-#else
-
-    /* Run the below code for Cortex-M0 */
-
-    /* Initialize blkCnt with number of samples */
-    blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+    uint32_t blkCnt = blockSize; /* loop counter */
 
     while (blkCnt > 0U)
     {
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
index 40dd1cdad..fdced4cf5 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_mult_q7.c
  * Description:  Q7 vector multiplication with variable output shifts
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.2
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -55,46 +55,7 @@
 
 void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize)
 {
-    uint32_t blkCnt; /* loop counters */
-
-#if defined(ARM_MATH_DSP)
-
-    /* Run the below code for Cortex-M4 and Cortex-M3 */
-    q7_t out1, out2, out3, out4; /* Temporary variables to store the product */
-
-    /* loop Unrolling */
-    blkCnt = blockSize >> 2U;
-
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
-    while (blkCnt > 0U)
-    {
-        /* C = A * B */
-        /* Multiply the inputs and store the results in temporary variables */
-        out1 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
-        out2 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
-        out3 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
-        out4 = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
-
-        /* Store the results of 4 inputs in the destination buffer in single cycle by packing */
-        *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
-
-        /* Decrement the blockSize loop counter */
-        blkCnt--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    blkCnt = blockSize % 0x4U;
-
-#else
-
-    /* Run the below code for Cortex-M0 */
-
-    /* Initialize blkCnt with number of samples */
-    blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_DSP) */
+    uint32_t blkCnt = blockSize; /* loop counter */
 
     while (blkCnt > 0U)
     {
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
index 9017970df..8abbc3a5a 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_q7_to_q15_reordered_no_shift.c
  * Description:  Converts the elements of the Q7 vector to reordered Q15 vector without left-shift
  *
- * $Date:        May 29, 2020
- * $Revision:    V.1.0.1
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -79,7 +79,7 @@ void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t bl
     const q7_t *pIn = pSrc; /* Src pointer */
     uint32_t blkCnt;        /* loop counter */
 
-#ifndef ARM_MATH_CM0_FAMILY
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     q31_t in;
     q31_t in1, in2;
 
@@ -103,11 +103,11 @@ void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t bl
         in2 = __SXTB16(in);
 
 #ifndef ARM_MATH_BIG_ENDIAN
-        *__SIMD32(pDst)++ = in2;
-        *__SIMD32(pDst)++ = in1;
+        arm_nn_write_q7x4_ia((q7_t **)&pDst, in2);
+        arm_nn_write_q7x4_ia((q7_t **)&pDst, in1);
 #else
-        *__SIMD32(pDst)++ = in1;
-        *__SIMD32(pDst)++ = in2;
+        arm_nn_write_q7x4_ia((q7_t **)&pDst, in1);
+        arm_nn_write_q7x4_ia((q7_t **)&pDst, in2);
 #endif
 
         /* Decrement the loop counter */
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
index 36163667b..78e6fca03 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_max_pool_s8.c
  * Description:  Pooling function implementations
  *
- * $Date:        19. Februari 2021
- * $Revision:    V.2.0.2
+ * $Date:        20. July 2021
+ * $Revision:    V.2.0.3
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -75,7 +75,7 @@ static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int
             ref_max.bytes[3] = comp_max.bytes[3];
         }
 
-        write_q7x4_ia(&dst, ref_max.word);
+        arm_nn_write_q7x4_ia(&dst, ref_max.word);
 
         cnt--;
     }
@@ -127,7 +127,7 @@ static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, co
         in.bytes[3] = MAX(in.bytes[3], act_min);
         in.bytes[3] = MIN(in.bytes[3], act_max);
 
-        write_q7x4_ia(&source, in.word);
+        arm_nn_write_q7x4_ia(&source, in.word);
         cnt--;
     }
 
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
index 7546049ce..5a3b1afd3 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_pool_q7_HWC.c
  * Description:  Pooling function implementations
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.1
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
@@ -31,7 +31,7 @@
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
 
 /**
  * @brief A few utility functions used by pooling functions
@@ -75,7 +75,7 @@ static void compare_and_replace_if_larger_q7(q7_t *base,           // base data
         if (com.bytes[3] > in.bytes[3])
             in.bytes[3] = com.bytes[3];
 
-        *__SIMD32(pIn)++ = in.word;
+        arm_nn_write_q7x4_ia(&pIn, in.word);
 
         cnt--;
     }
@@ -119,10 +119,10 @@ static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t lengt
 #endif
 
         in = arm_nn_read_q15x2(pCnt);
-        *__SIMD32(pCnt)++ = __QADD16(vo1, in);
+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in));
 
         in = arm_nn_read_q15x2(pCnt);
-        *__SIMD32(pCnt)++ = __QADD16(vo2, in);
+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));
 
         cnt--;
     }
@@ -178,7 +178,7 @@ void arm_maxpool_q7_HWC(q7_t *Im_in,
                         q7_t *Im_out)
 {
     (void)bufferA;
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     int16_t i_x, i_y;
@@ -334,7 +334,7 @@ void arm_avepool_q7_HWC(q7_t *Im_in,
                         q7_t *Im_out)
 {
 
-#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */
 
     q15_t *buffer = (q15_t *)bufferA;
diff --git a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
index 2c426bd75..ffd8f8fc8 100644
--- a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
@@ -26,6 +26,7 @@ add_compile_options(-O0
                     -Werror
                     -Wimplicit-function-declaration
                     -Wunused-variable
+                    -Wunused-function
                     -Wno-redundant-decls)
 
 option(BUILD_CMSIS_NN_UNIT "If building the unit tests from another project, i.e. \
-- 
GitLab