From 4820adae80432feb7bf37f9e1e133aad8b1399ba Mon Sep 17 00:00:00 2001 From: Levon Gevorgyan Date: Sun, 15 Jun 2025 11:34:10 -0500 Subject: [PATCH] fix enable-overlays=1 for svtav1-psy arm64 builds --- ...od_implementation_of_sad_loop_kernel.patch | 156 ++++++++++++++++++ scripts/build.sh | 8 +- 2 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 patches/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch diff --git a/patches/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch b/patches/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch new file mode 100644 index 0000000..005a969 --- /dev/null +++ b/patches/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch @@ -0,0 +1,156 @@ +commit 5def505f7f193d890be61e869831378f212a07bd +Author: Salome Thirot +Date: Fri May 2 11:20:54 2025 +0100 + + Fix Neon Dotprod implementation of sad_loop_kernel + + search_area_width and search_area_height can sometimes be 0, so replace + all the do while loops with for loops. + +diff --git a/Source/Lib/ASM_NEON_DOTPROD/compute_sad_neon_dotprod.c b/Source/Lib/ASM_NEON_DOTPROD/compute_sad_neon_dotprod.c +index c116037..15d3da6 100644 +--- a/Source/Lib/ASM_NEON_DOTPROD/compute_sad_neon_dotprod.c ++++ b/Source/Lib/ASM_NEON_DOTPROD/compute_sad_neon_dotprod.c +@@ -237,10 +237,8 @@ static inline void svt_sad_loop_kernel16xh_neon_dotprod(uint8_t *src, uint32_t s + y_search_step = 2; + } + +- int y_search_index = y_search_start; +- do { +- int x_search_index = 0; +- do { ++ for (int y_search_index = y_search_start; y_search_index < search_area_height; y_search_index += y_search_step) { ++ for (int x_search_index = 0; x_search_index < search_area_width; x_search_index += 8) { + /* Get the SAD of 8 search spaces aligned along the width and store it in 'sad4'. 
*/ + uint32x4_t sad4_0 = sad16xhx4d_neon_dotprod( + src, src_stride, ref + x_search_index, ref_stride, block_height); +@@ -248,13 +246,10 @@ static inline void svt_sad_loop_kernel16xh_neon_dotprod(uint8_t *src, uint32_t s + src, src_stride, ref + x_search_index + 4, ref_stride, block_height); + update_best_sad_u32(sad4_0, best_sad, x_search_center, y_search_center, x_search_index, y_search_index); + update_best_sad_u32(sad4_1, best_sad, x_search_center, y_search_center, x_search_index + 4, y_search_index); +- +- x_search_index += 8; +- } while (x_search_index != search_area_width); ++ } + + ref += src_stride_raw; +- y_search_index += y_search_step; +- } while (y_search_index < search_area_height); ++ } + } + + static inline void svt_sad_loop_kernel16xh_small_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref, +@@ -273,8 +268,7 @@ static inline void svt_sad_loop_kernel16xh_small_neon_dotprod(uint8_t *src, uint + y_search_step = 2; + } + +- int y_search_index = y_search_start; +- do { ++ for (int y_search_index = y_search_start; y_search_index < search_area_height; y_search_index += y_search_step) { + int x_search_index; + for (x_search_index = 0; x_search_index <= search_area_width - 4; x_search_index += 4) { + /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. 
*/ +@@ -289,8 +283,7 @@ static inline void svt_sad_loop_kernel16xh_small_neon_dotprod(uint8_t *src, uint + } + + ref += src_stride_raw; +- y_search_index += y_search_step; +- } while (y_search_index < search_area_height); ++ } + } + + static inline void svt_sad_loop_kernel32xh_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref, +@@ -298,10 +291,8 @@ static inline void svt_sad_loop_kernel32xh_neon_dotprod(uint8_t *src, uint32_t s + int16_t *x_search_center, int16_t *y_search_center, + uint32_t src_stride_raw, int16_t search_area_width, + int16_t search_area_height) { +- int y_search_index = 0; +- do { +- int x_search_index = 0; +- do { ++ for (int y_search_index = 0; y_search_index < search_area_height; y_search_index++) { ++ for (int x_search_index = 0; x_search_index < search_area_width; x_search_index += 8) { + /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. */ + uint32x4_t sad4_0 = sad32xhx4d_neon_dotprod( + src, src_stride, ref + x_search_index, ref_stride, block_height); +@@ -309,11 +300,10 @@ static inline void svt_sad_loop_kernel32xh_neon_dotprod(uint8_t *src, uint32_t s + src, src_stride, ref + x_search_index + 4, ref_stride, block_height); + update_best_sad_u32(sad4_0, best_sad, x_search_center, y_search_center, x_search_index, y_search_index); + update_best_sad_u32(sad4_1, best_sad, x_search_center, y_search_center, x_search_index + 4, y_search_index); +- x_search_index += 8; +- } while (x_search_index != search_area_width); ++ } + + ref += src_stride_raw; +- } while (++y_search_index != search_area_height); ++ } + } + + static inline void svt_sad_loop_kernel32xh_small_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref, +@@ -321,8 +311,7 @@ static inline void svt_sad_loop_kernel32xh_small_neon_dotprod(uint8_t *src, uint + uint64_t *best_sad, int16_t *x_search_center, + int16_t *y_search_center, uint32_t src_stride_raw, + int16_t search_area_width, int16_t search_area_height) { +- int y_search_index = 
0; +- do { ++ for (int y_search_index = 0; y_search_index < search_area_height; y_search_index++) { + int x_search_index; + for (x_search_index = 0; x_search_index <= search_area_width - 4; x_search_index += 4) { + /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. */ +@@ -337,7 +326,7 @@ static inline void svt_sad_loop_kernel32xh_small_neon_dotprod(uint8_t *src, uint + } + + ref += src_stride_raw; +- } while (++y_search_index != search_area_height); ++ } + } + + static inline void svt_sad_loop_kernel64xh_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref, +@@ -345,10 +334,8 @@ static inline void svt_sad_loop_kernel64xh_neon_dotprod(uint8_t *src, uint32_t s + int16_t *x_search_center, int16_t *y_search_center, + uint32_t src_stride_raw, int16_t search_area_width, + int16_t search_area_height) { +- int y_search_index = 0; +- do { +- int x_search_index = 0; +- do { ++ for (int y_search_index = 0; y_search_index < search_area_height; y_search_index++) { ++ for (int x_search_index = 0; x_search_index < search_area_width; x_search_index += 8) { + /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. 
*/ + uint32x4_t sad4_0 = sad64xhx4d_neon_dotprod( + src, src_stride, ref + x_search_index, ref_stride, block_height); +@@ -356,11 +343,10 @@ static inline void svt_sad_loop_kernel64xh_neon_dotprod(uint8_t *src, uint32_t s + src, src_stride, ref + x_search_index + 4, ref_stride, block_height); + update_best_sad_u32(sad4_0, best_sad, x_search_center, y_search_center, x_search_index, y_search_index); + update_best_sad_u32(sad4_1, best_sad, x_search_center, y_search_center, x_search_index + 4, y_search_index); ++ } + +- x_search_index += 8; +- } while (x_search_index != search_area_width); + ref += src_stride_raw; +- } while (++y_search_index != search_area_height); ++ } + } + + static inline void svt_sad_loop_kernel64xh_small_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref, +@@ -368,8 +354,7 @@ static inline void svt_sad_loop_kernel64xh_small_neon_dotprod(uint8_t *src, uint + uint64_t *best_sad, int16_t *x_search_center, + int16_t *y_search_center, uint32_t src_stride_raw, + int16_t search_area_width, int16_t search_area_height) { +- int y_search_index = 0; +- do { ++ for (int y_search_index = 0; y_search_index < search_area_height; y_search_index++) { + int x_search_index; + for (x_search_index = 0; x_search_index <= search_area_width - 4; x_search_index += 4) { + /* Get the SAD of 4 search spaces aligned along the width and store it in 'sad4'. 
*/ +@@ -382,8 +367,9 @@ static inline void svt_sad_loop_kernel64xh_small_neon_dotprod(uint8_t *src, uint + uint64_t temp_sad = sad64xh_neon_dotprod(src, src_stride, ref + x_search_index, ref_stride, block_height); + update_best_sad(temp_sad, best_sad, x_search_center, y_search_center, x_search_index, y_search_index); + } ++ + ref += src_stride_raw; +- } while (++y_search_index != search_area_height); ++ } + } + + void svt_sad_loop_kernel_neon_dotprod(uint8_t *src, uint32_t src_stride, uint8_t *ref, uint32_t ref_stride, diff --git a/scripts/build.sh b/scripts/build.sh index 9a00f4f..3793288 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -16,6 +16,7 @@ usage() { SCRIPT_PATH="$(readlink -f "${BASH_SOURCE[0]}")" SCRIPT_DIR="$(dirname "$SCRIPT_PATH")" BUILDER_DIR="$(dirname "$SCRIPT_DIR")" +PATCHES_DIR="${BUILDER_DIR}/patches" cd "$BUILDER_DIR" || exit # build with psy and lto as default @@ -195,9 +196,11 @@ COMP_FLAGS="" if [[ "$ARCH" == "x86_64" ]] then COMP_FLAGS+=" -march=native" + PATCH_SVT_PSY="true" elif [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]] then COMP_FLAGS+=" -mcpu=native" + PATCH_SVT_PSY="patch -p1 -i ${PATCHES_DIR}/fix_neon_dotprod_implementation_of_sad_loop_kernel.patch" fi # compilation job count @@ -237,7 +240,7 @@ if [[ "$(uname -r)" =~ "WSL" ]] ; then fi # clone ffmpeg -git clone https://github.com/FFmpeg/FFmpeg "$FFMPEG_DIR" +test -d "$FFMPEG_DIR" || git clone https://github.com/FFmpeg/FFmpeg "$FFMPEG_DIR" build_mpp() { # build mpp @@ -346,11 +349,14 @@ build_svt_av1_psy() { fi cd "$SVT_PSY_DIR" || return 1 + # force a rebuild every time (never skip the build) until the patch is no longer needed + local FORCE_REBUILD=1 check_for_rebuild && \ cd "$CMAKE_BUILD_DIR" && \ sudo make install && \ set_commit_status && \ return 0 + ${PATCH_SVT_PSY} || return 1 sudo rm -rf "$CMAKE_BUILD_DIR" mkdir "$CMAKE_BUILD_DIR"