From 4820adae80432feb7bf37f9e1e133aad8b1399ba Mon Sep 17 00:00:00 2001
From: Levon Gevorgyan <lgevorgyan03@gmail.com>
Date: Sun, 15 Jun 2025 11:34:10 -0500
Subject: [PATCH] fix enable-overlays=1 for svtav1-psy arm64 builds

---
 ...od_implementation_of_sad_loop_kernel.patch | 156 ++++++++++++++++++
 scripts/build.sh                              |   8 +-
 2 files changed, 163 insertions(+), 1 deletion(-)
 create mode 100644 patches/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch

diff --git a/patches/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch b/patches/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch
new file mode 100644
index 0000000..005a969
--- /dev/null
+++ b/patches/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch
@@ -0,0 +1,156 @@
+commit 5def505f7f193d890be61e869831378f212a07bd
+Author: Salome Thirot <salome.thirot@arm.com>
+Date:   Fri May 2 11:20:54 2025 +0100
+
+    Fix Neon Dotprod implementation of sad_loop_kernel
+    
+    search_area_width and search_area_height can sometimes be 0, so replace
+    all the do while loops with for loops.
+
+diff --git a/Source/Lib/ASM_NEON_DOTPROD/compute_sad_neon_dotprod.c b/Source/Lib/ASM_NEON_DOTPROD/compute_sad_neon_dotprod.c
+index c116037..15d3da6 100644
+--- a/Source/Lib/ASM_NEON_DOTPROD/compute_sad_neon_dotprod.c
++++ b/Source/Lib/ASM_NEON_DOTPROD/compute_sad_neon_dotprod.c
+@@ -237,10 +237,8 @@ static inline void svt_sad_loop_kernel16xh_neon_dotprod(uint8_t *src, uint32_t s
+         y_search_step  = 2;
+     }
+ 
+-    int y_search_index = y_search_start;
+-    do {
+-        int x_search_index = 0;
+-        do {
++    for (int y_search_index = y_search_start; y_search_index < search_area_height; y_search_index += y_search_step) {
++        for (int x_search_index = 0; x_search_index < search_area_width; x_search_index += 8) {
+             /* Get the SAD of 8 search spaces aligned along the width and store it in 'sad4'. */
+             uint32x4_t sad4_0 = sad16xhx4d_neon_dotprod(
+                 src, src_stride, ref + x_search_index, ref_stride, block_height);
+@@ -248,13 +246,10 @@ static inline void svt_sad_loop_kernel16xh_neon_dotprod(uint8_t *src, uint32_t s
+                 src, src_stride, ref + x_search_index + 4, ref_stride, block_height);
+             update_best_sad_u32(sad4_0, best_sad, x_search_center, y_search_center, x_search_index, y_search_index);
+             update_best_sad_u32(sad4_1, best_sad, x_search_center, y_search_center, x_search_index + 4, y_search_index);
+-
+-            x_search_index += 8;
+-        } while (x_search_index != search_area_width);
++        }
+ 
+         ref += src_stride_raw;
+-        y_search_index += y_search_step;
+-    } while (y_search_index < search_area_height);
++    }
+ }
+ 
+ static inline void svt_sad_loop_kernel16xh_small_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref,
+@@ -273,8 +268,7 @@ static inline void svt_sad_loop_kernel16xh_small_neon_dotprod(uint8_t *src, uint
+         y_search_step  = 2;
+     }
+ 
+-    int y_search_index = y_search_start;
+-    do {
++    for (int y_search_index = y_search_start; y_search_index < search_area_height; y_search_index += y_search_step) {
+         int x_search_index;
+         for (x_search_index = 0; x_search_index <= search_area_width - 4; x_search_index += 4) {
+             /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. */
+@@ -289,8 +283,7 @@ static inline void svt_sad_loop_kernel16xh_small_neon_dotprod(uint8_t *src, uint
+         }
+ 
+         ref += src_stride_raw;
+-        y_search_index += y_search_step;
+-    } while (y_search_index < search_area_height);
++    }
+ }
+ 
+ static inline void svt_sad_loop_kernel32xh_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref,
+@@ -298,10 +291,8 @@ static inline void svt_sad_loop_kernel32xh_neon_dotprod(uint8_t *src, uint32_t s
+                                                         int16_t *x_search_center, int16_t *y_search_center,
+                                                         uint32_t src_stride_raw, int16_t search_area_width,
+                                                         int16_t search_area_height) {
+-    int y_search_index = 0;
+-    do {
+-        int x_search_index = 0;
+-        do {
++    for (int y_search_index = 0; y_search_index < search_area_height; y_search_index++) {
++        for (int x_search_index = 0; x_search_index < search_area_width; x_search_index += 8) {
+             /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. */
+             uint32x4_t sad4_0 = sad32xhx4d_neon_dotprod(
+                 src, src_stride, ref + x_search_index, ref_stride, block_height);
+@@ -309,11 +300,10 @@ static inline void svt_sad_loop_kernel32xh_neon_dotprod(uint8_t *src, uint32_t s
+                 src, src_stride, ref + x_search_index + 4, ref_stride, block_height);
+             update_best_sad_u32(sad4_0, best_sad, x_search_center, y_search_center, x_search_index, y_search_index);
+             update_best_sad_u32(sad4_1, best_sad, x_search_center, y_search_center, x_search_index + 4, y_search_index);
+-            x_search_index += 8;
+-        } while (x_search_index != search_area_width);
++        }
+ 
+         ref += src_stride_raw;
+-    } while (++y_search_index != search_area_height);
++    }
+ }
+ 
+ static inline void svt_sad_loop_kernel32xh_small_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref,
+@@ -321,8 +311,7 @@ static inline void svt_sad_loop_kernel32xh_small_neon_dotprod(uint8_t *src, uint
+                                                               uint64_t *best_sad, int16_t *x_search_center,
+                                                               int16_t *y_search_center, uint32_t src_stride_raw,
+                                                               int16_t search_area_width, int16_t search_area_height) {
+-    int y_search_index = 0;
+-    do {
++    for (int y_search_index = 0; y_search_index < search_area_height; y_search_index++) {
+         int x_search_index;
+         for (x_search_index = 0; x_search_index <= search_area_width - 4; x_search_index += 4) {
+             /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. */
+@@ -337,7 +326,7 @@ static inline void svt_sad_loop_kernel32xh_small_neon_dotprod(uint8_t *src, uint
+         }
+ 
+         ref += src_stride_raw;
+-    } while (++y_search_index != search_area_height);
++    }
+ }
+ 
+ static inline void svt_sad_loop_kernel64xh_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref,
+@@ -345,10 +334,8 @@ static inline void svt_sad_loop_kernel64xh_neon_dotprod(uint8_t *src, uint32_t s
+                                                         int16_t *x_search_center, int16_t *y_search_center,
+                                                         uint32_t src_stride_raw, int16_t search_area_width,
+                                                         int16_t search_area_height) {
+-    int y_search_index = 0;
+-    do {
+-        int x_search_index = 0;
+-        do {
++    for (int y_search_index = 0; y_search_index < search_area_height; y_search_index++) {
++        for (int x_search_index = 0; x_search_index < search_area_width; x_search_index += 8) {
+             /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. */
+             uint32x4_t sad4_0 = sad64xhx4d_neon_dotprod(
+                 src, src_stride, ref + x_search_index, ref_stride, block_height);
+@@ -356,11 +343,10 @@ static inline void svt_sad_loop_kernel64xh_neon_dotprod(uint8_t *src, uint32_t s
+                 src, src_stride, ref + x_search_index + 4, ref_stride, block_height);
+             update_best_sad_u32(sad4_0, best_sad, x_search_center, y_search_center, x_search_index, y_search_index);
+             update_best_sad_u32(sad4_1, best_sad, x_search_center, y_search_center, x_search_index + 4, y_search_index);
++        }
+ 
+-            x_search_index += 8;
+-        } while (x_search_index != search_area_width);
+         ref += src_stride_raw;
+-    } while (++y_search_index != search_area_height);
++    }
+ }
+ 
+ static inline void svt_sad_loop_kernel64xh_small_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref,
+@@ -368,8 +354,7 @@ static inline void svt_sad_loop_kernel64xh_small_neon_dotprod(uint8_t *src, uint
+                                                               uint64_t *best_sad, int16_t *x_search_center,
+                                                               int16_t *y_search_center, uint32_t src_stride_raw,
+                                                               int16_t search_area_width, int16_t search_area_height) {
+-    int y_search_index = 0;
+-    do {
++    for (int y_search_index = 0; y_search_index < search_area_height; y_search_index++) {
+         int x_search_index;
+         for (x_search_index = 0; x_search_index <= search_area_width - 4; x_search_index += 4) {
+             /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. */
+@@ -382,8 +367,9 @@ static inline void svt_sad_loop_kernel64xh_small_neon_dotprod(uint8_t *src, uint
+             uint64_t temp_sad = sad64xh_neon_dotprod(src, src_stride, ref + x_search_index, ref_stride, block_height);
+             update_best_sad(temp_sad, best_sad, x_search_center, y_search_center, x_search_index, y_search_index);
+         }
++
+         ref += src_stride_raw;
+-    } while (++y_search_index != search_area_height);
++    }
+ }
+ 
+ void svt_sad_loop_kernel_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref, uint32_t ref_stride,
diff --git a/scripts/build.sh b/scripts/build.sh
index 9a00f4f..3793288 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -16,6 +16,7 @@ usage() {
 SCRIPT_PATH="$(readlink -f "${BASH_SOURCE[0]}")"
 SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
 BUILDER_DIR="$(dirname "$SCRIPT_DIR")"
+PATCHES_DIR="${BUILDER_DIR}/patches"
 cd "$BUILDER_DIR" || exit
 
 # build with psy and lto as default
@@ -195,9 +196,11 @@ COMP_FLAGS=""
 if [[ "$ARCH" == "x86_64" ]]
 then
   COMP_FLAGS+=" -march=native"
+  PATCH_SVT_PSY="true"
 elif [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]]
 then
   COMP_FLAGS+=" -mcpu=native"
+  PATCH_SVT_PSY="patch -p1 -i ${PATCHES_DIR}/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch"
 fi
 
 # compilation job count
@@ -237,7 +240,7 @@ if [[ "$(uname -r)" =~ "WSL" ]] ; then
 fi
 
 # clone ffmpeg
-git clone https://github.com/FFmpeg/FFmpeg "$FFMPEG_DIR"
+test -d "$FFMPEG_DIR" || git clone https://github.com/FFmpeg/FFmpeg "$FFMPEG_DIR"
 
 build_mpp() {
      # build mpp
@@ -346,11 +349,14 @@ build_svt_av1_psy() {
      fi
      cd "$SVT_PSY_DIR" || return 1
 
+     # always force a rebuild (never skip the build) until the patch is no longer needed
+     local FORCE_REBUILD=1
      check_for_rebuild && \
           cd "$CMAKE_BUILD_DIR" && \
           sudo make install && \
           set_commit_status && \
           return 0
+     ${PATCH_SVT_PSY} || return 1
 
      sudo rm -rf "$CMAKE_BUILD_DIR"
      mkdir "$CMAKE_BUILD_DIR"