diff --git a/media/libaom/config/generic/config/aom_config.asm b/media/libaom/config/generic/config/aom_config.asm index 72307c318492..a28ad482eafb 100644 --- a/media/libaom/config/generic/config/aom_config.asm +++ b/media/libaom/config/generic/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 0 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 0 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 0 HAVE_SSE2 equ 0 HAVE_SSE3 equ 0 diff --git a/media/libaom/config/generic/config/aom_config.h b/media/libaom/config/generic/config/aom_config.h index ef68ec51f589..61b49dbc66a6 100644 --- a/media/libaom/config/generic/config/aom_config.h +++ b/media/libaom/config/generic/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 0 #define HAVE_SSE2 0 #define HAVE_SSE3 0 diff --git a/media/libaom/config/linux/arm/config/aom_config.asm b/media/libaom/config/linux/arm/config/aom_config.asm index c3cd1fe3e48e..9fd3159dba6d 100644 --- a/media/libaom/config/linux/arm/config/aom_config.asm +++ b/media/libaom/config/linux/arm/config/aom_config.asm @@ -12,6 +12,7 @@ .equ AOM_ARCH_AARCH64, 0 .equ AOM_ARCH_ARM, 1 .equ AOM_ARCH_PPC, 0 +.equ AOM_ARCH_RISCV, 0 .equ AOM_ARCH_X86, 0 .equ AOM_ARCH_X86_64, 0 .equ CONFIG_ACCOUNTING, 0 @@ -82,6 +83,7 @@ .equ HAVE_NEON, 1 .equ HAVE_NEON_DOTPROD, 0 .equ HAVE_NEON_I8MM, 0 +.equ HAVE_RVV, 0 .equ HAVE_SSE, 0 .equ HAVE_SSE2, 0 .equ HAVE_SSE3, 0 diff --git a/media/libaom/config/linux/arm/config/aom_config.h b/media/libaom/config/linux/arm/config/aom_config.h index 29385cec2267..15350b976cfb 100644 --- a/media/libaom/config/linux/arm/config/aom_config.h +++ b/media/libaom/config/linux/arm/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 1 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 1 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 0 #define HAVE_SSE2 0 #define HAVE_SSE3 0 diff --git a/media/libaom/config/linux/ia32/config/aom_config.asm b/media/libaom/config/linux/ia32/config/aom_config.asm index eb2ddfa7599e..0f2be2761ba4 100644 --- a/media/libaom/config/linux/ia32/config/aom_config.asm +++ b/media/libaom/config/linux/ia32/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 1 AOM_ARCH_X86_64 equ 0 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/linux/ia32/config/aom_config.h b/media/libaom/config/linux/ia32/config/aom_config.h index 8f14a39b5aa3..89ff5324d067 100644 --- a/media/libaom/config/linux/ia32/config/aom_config.h +++ b/media/libaom/config/linux/ia32/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 1 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define 
HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/config/linux/x64/config/aom_config.asm b/media/libaom/config/linux/x64/config/aom_config.asm index d9cfcc099029..3091f2ae3233 100644 --- a/media/libaom/config/linux/x64/config/aom_config.asm +++ b/media/libaom/config/linux/x64/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 1 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/linux/x64/config/aom_config.h b/media/libaom/config/linux/x64/config/aom_config.h index 5bc1b2c7f31e..a86fe4a4eac4 100644 --- a/media/libaom/config/linux/x64/config/aom_config.h +++ b/media/libaom/config/linux/x64/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 1 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/config/mac/arm64/config/aom_config.asm b/media/libaom/config/mac/arm64/config/aom_config.asm index 29a3de8f8be1..c8f503e63601 100644 --- a/media/libaom/config/mac/arm64/config/aom_config.asm +++ b/media/libaom/config/mac/arm64/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 1 AOM_ARCH_ARM equ 1 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 0 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 0 HAVE_NEON equ 1 HAVE_NEON_DOTPROD equ 1 HAVE_NEON_I8MM equ 1 +HAVE_RVV equ 0 HAVE_SSE equ 0 HAVE_SSE2 equ 0 HAVE_SSE3 equ 0 diff --git a/media/libaom/config/mac/arm64/config/aom_config.h b/media/libaom/config/mac/arm64/config/aom_config.h index 2dd1aa07a858..c501b7ef2586 100644 --- a/media/libaom/config/mac/arm64/config/aom_config.h +++ b/media/libaom/config/mac/arm64/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 1 #define AOM_ARCH_ARM 1 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 1 #define HAVE_NEON_DOTPROD 1 #define HAVE_NEON_I8MM 1 +#define HAVE_RVV 0 #define HAVE_SSE 0 #define HAVE_SSE2 0 #define HAVE_SSE3 0 diff --git a/media/libaom/config/mac/x64/config/aom_config.asm b/media/libaom/config/mac/x64/config/aom_config.asm index d9cfcc099029..3091f2ae3233 100644 --- a/media/libaom/config/mac/x64/config/aom_config.asm +++ b/media/libaom/config/mac/x64/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 1 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/mac/x64/config/aom_config.h b/media/libaom/config/mac/x64/config/aom_config.h index 5bc1b2c7f31e..a86fe4a4eac4 100644 --- a/media/libaom/config/mac/x64/config/aom_config.h +++ b/media/libaom/config/mac/x64/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define 
AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 1 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/config/win/ia32/config/aom_config.asm b/media/libaom/config/win/ia32/config/aom_config.asm index 75fd77c753db..c30f034e5502 100644 --- a/media/libaom/config/win/ia32/config/aom_config.asm +++ b/media/libaom/config/win/ia32/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 1 AOM_ARCH_X86_64 equ 0 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/win/ia32/config/aom_config.h b/media/libaom/config/win/ia32/config/aom_config.h index f7a251fe936e..bb73fdd79fc6 100644 --- a/media/libaom/config/win/ia32/config/aom_config.h +++ b/media/libaom/config/win/ia32/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 1 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/config/win/x64/config/aom_config.asm b/media/libaom/config/win/x64/config/aom_config.asm index d9cfcc099029..3091f2ae3233 100644 --- a/media/libaom/config/win/x64/config/aom_config.asm +++ b/media/libaom/config/win/x64/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 1 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/win/x64/config/aom_config.h b/media/libaom/config/win/x64/config/aom_config.h index 5bc1b2c7f31e..a86fe4a4eac4 100644 --- a/media/libaom/config/win/x64/config/aom_config.h +++ b/media/libaom/config/win/x64/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 1 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/moz.yaml b/media/libaom/moz.yaml index 3da03697b989..893d9c43c0c6 100644 --- a/media/libaom/moz.yaml +++ b/media/libaom/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed (Sun Jan 05 09:13:09 2025 -0800). + release: 3990233fc06a35944d6d33797e63931802122a95 (Thu Jan 30 11:32:16 2025 -0800). 
# Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed + revision: 3990233fc06a35944d6d33797e63931802122a95 # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt index a62100431447..1c37223cbce5 100644 --- a/third_party/aom/CMakeLists.txt +++ b/third_party/aom/CMakeLists.txt @@ -333,6 +333,12 @@ if(CONFIG_AV1_ENCODER) # libaom static library. if(BUILD_SHARED_LIBS) target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom_static) + # TODO: https://aomedia.issues.chromium.org/391715078 - This condition can + # be removed after aom_av1_rc restricts its symbol visibility. + if(CYGWIN OR MINGW) + target_link_options(aom_av1_rc ${AOM_LIB_LINK_TYPE} + LINKER:--allow-multiple-definition) + endif() else() target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom) endif() @@ -858,8 +864,8 @@ if(BUILD_SHARED_LIBS) # errors (don't use it with AddressSanitizer)." See # https://clang.llvm.org/docs/AddressSanitizer.html#usage. Similarly, see # https://clang.llvm.org/docs/MemorySanitizer.html#usage. - if(NOT WIN32 - AND NOT APPLE + if(NOT + (APPLE OR CYGWIN OR WIN32) AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE)) # The -z defs linker option reports unresolved symbol references from object # files when building a shared library. diff --git a/third_party/aom/README.md b/third_party/aom/README.md index db7ad37b898f..2e0902a0ef82 100644 --- a/third_party/aom/README.md +++ b/third_party/aom/README.md @@ -60,7 +60,9 @@ README.md {#LREADME} present, yasm will be used by default. Pass -DENABLE_NASM=ON to cmake to select nasm.) If you download yasm with the intention to work with Visual Studio, please download win32.exe or win64.exe and rename it into yasm.exe. - DO NOT download or use vsyasm.exe. + DO NOT download or use vsyasm.exe. The MSYS2 version of the yasm binary can + also be used and avoids an issue caused by a missing Visual C++ + Redistributable install (Visual Studio 2010, MSVCR100.dll). 6. Building the documentation requires [doxygen version 1.8.10 or newer](http://doxygen.org). 7. 
Emscripten builds require the portable diff --git a/third_party/aom/aom/exports_com b/third_party/aom/aom/exports_com index 1166104aaee1..f3dbeaea33e9 100644 --- a/third_party/aom/aom/exports_com +++ b/third_party/aom/aom/exports_com @@ -10,7 +10,6 @@ text aom_codec_set_option text aom_codec_version text aom_codec_version_extra_str text aom_codec_version_str -text aom_free text aom_img_add_metadata text aom_img_alloc text aom_img_alloc_with_border @@ -25,7 +24,6 @@ text aom_img_plane_width text aom_img_remove_metadata text aom_img_set_rect text aom_img_wrap -text aom_malloc text aom_rb_bytes_read text aom_rb_read_bit text aom_rb_read_literal diff --git a/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c index b2fcc512513f..b3c373e210f7 100644 --- a/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c +++ b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c @@ -15,6 +15,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/arm/transpose_neon.h" +#include "mem_neon.h" static inline int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low, const int16x4_t high) { @@ -226,13 +227,8 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst_p1 = (uint16_t *)(s - 2 * pitch); - uint16_t *const dst_p0 = (uint16_t *)(s - pitch); - uint16_t *const dst_q0 = (uint16_t *)(s); - uint16_t *const dst_q1 = (uint16_t *)(s + pitch); - - const uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1) }; + uint16x4_t src[4]; + load_u16_4x4(s - 2 * pitch, pitch, &src[0], &src[1], &src[2], &src[3]); // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); @@ -247,12 +243,10 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, &needs_filter4_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter4_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); @@ -272,10 +266,9 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output)); } void aom_highbd_lpf_horizontal_4_dual_neon( @@ -290,14 +283,8 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { // Offset by 2 uint16_t values to load from first p1 position. 
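// A minimal standalone sketch (an assumption of mine, not the actual
// aom_dsp/arm/mem_neon.h definitions) of what strided helpers such as the
// load_u16_4x4()/store_u16_4x4() calls introduced above typically look
// like: they load or store four 4-lane rows spaced `stride` elements
// apart, replacing the hand-written per-row pointer arithmetic.
#include <arm_neon.h>
#include <stddef.h>

static inline void load_u16_4x4_sketch(const uint16_t *s, ptrdiff_t stride,
                                       uint16x4_t *r0, uint16x4_t *r1,
                                       uint16x4_t *r2, uint16x4_t *r3) {
  *r0 = vld1_u16(s + 0 * stride);
  *r1 = vld1_u16(s + 1 * stride);
  *r2 = vld1_u16(s + 2 * stride);
  *r3 = vld1_u16(s + 3 * stride);
}

static inline void store_u16_4x4_sketch(uint16_t *s, ptrdiff_t stride,
                                        uint16x4_t r0, uint16x4_t r1,
                                        uint16x4_t r2, uint16x4_t r3) {
  vst1_u16(s + 0 * stride, r0);
  vst1_u16(s + 1 * stride, r1);
  vst1_u16(s + 2 * stride, r2);
  vst1_u16(s + 3 * stride, r3);
}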
- uint16_t *dst = s - 2; - uint16_t *dst_p1 = dst; - uint16_t *dst_p0 = dst + pitch; - uint16_t *dst_q0 = dst + pitch * 2; - uint16_t *dst_q1 = dst + pitch * 3; - - uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1) }; + uint16x4_t src[4]; + load_u16_4x4(s - 2, pitch, &src[0], &src[1], &src[2], &src[3]); transpose_array_inplace_u16_4x4(src); // Adjust thresholds to bitdepth. @@ -313,12 +300,10 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, &needs_filter4_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter4_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); @@ -346,10 +331,7 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, }; transpose_array_inplace_u16_4x4(output); - vst1_u16(dst_p1, output[0]); - vst1_u16(dst_p0, output[1]); - vst1_u16(dst_q0, output[2]); - vst1_u16(dst_q1, output[3]); + store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]); } void aom_highbd_lpf_vertical_4_dual_neon( @@ -379,16 +361,14 @@ static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, // ^^^^^^ sum = vaddq_u16(sum, p0q0); - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^ - sum = vshlq_n_u16(sum, 1); - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 // ^^^^^^ ^^^^^^ // Should dual issue with the left shift. const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4); const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); - sum = vaddq_u16(sum, outer_sum); + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^^^^^^ ^^^^ + sum = vmlaq_n_u16(outer_sum, sum, 2); *p1q1_output = vrshrq_n_u16(sum, 3); @@ -396,11 +376,8 @@ static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, // p0 = p1 - (2 * p2) + q0 + q1 // q0 = q1 - (2 * q2) + p0 + p1 // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1); - // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - sum = vsubq_u16(sum, p2q2_double); + // ^^^^^^^^^^^^^^^^^ + sum = vmlsq_n_u16(sum, p2q2, 2); const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4); sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); @@ -411,16 +388,9 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst_p2 = s - 3 * pitch; - uint16_t *const dst_p1 = s - 2 * pitch; - uint16_t *const dst_p0 = s - pitch; - uint16_t *const dst_q0 = s; - uint16_t *const dst_q1 = s + pitch; - uint16_t *const dst_q2 = s + 2 * pitch; - - const uint16x4_t src[6] = { vld1_u16(dst_p2), vld1_u16(dst_p1), - vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2) }; + uint16x4_t src[6]; + load_u16_4x6(s - 3 * pitch, pitch, &src[0], &src[1], &src[2], &src[3], + &src[4], &src[5]); // Adjust thresholds to bitdepth. 
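// Standalone sketch (my illustration, not part of the patch) of the
// early-exit idiom adopted throughout this file: every lane of a NEON
// comparison mask is either 0x0000 or 0xFFFF, so reinterpreting the
// 64-bit vector and testing it against zero is equivalent to the
// AArch64-only vaddv_u16() == 0 check it replaces, while also compiling
// for 32-bit Arm.
#include <arm_neon.h>
#include <stdbool.h>

static inline bool mask_u16x4_is_all_zero(uint16x4_t mask) {
  return vget_lane_u64(vreinterpret_u64_u16(mask), 0) == 0;
}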
const int outer_thresh = *blimit << (bd - 8); @@ -437,50 +407,56 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch, filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat3_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or - // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6 - // output is not used. uint16x8_t f6_p1q1, f6_p0q0; - const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // filter6() does not apply, but filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). + if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 && + vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p1q1_output = f6_p1q1; + p0q0_output = f6_p0q0; + } else { + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = + vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or + // filter6. Therefore if it is false when |needs_filter_mask| is true, + // filter6 output is not used. + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // filter6() does not apply, but filter4() applies to one or more values. 
+ p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output)); } void aom_highbd_lpf_horizontal_6_dual_neon( @@ -494,17 +470,12 @@ void aom_highbd_lpf_horizontal_6_dual_neon( void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - // Left side of the filter window. - uint16_t *const dst = s - 3; - uint16_t *const dst_0 = dst; - uint16_t *const dst_1 = dst + pitch; - uint16_t *const dst_2 = dst + 2 * pitch; - uint16_t *const dst_3 = dst + 3 * pitch; - // Overread by 2 values. These overreads become the high halves of src_raw[2] // and src_raw[3] after transpose. - uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), - vld1q_u16(dst_2), vld1q_u16(dst_3) }; + uint16x8_t src_raw[4]; + load_u16_8x4(s - 3, pitch, &src_raw[0], &src_raw[1], &src_raw[2], + &src_raw[3]); + transpose_array_inplace_u16_4x8(src_raw); // p2, p1, p0, q0, q1, q2 const uint16x4_t src[6] = { @@ -528,25 +499,10 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat3_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output; // Because we did not return after testing |needs_filter_mask| we know it is @@ -555,17 +511,39 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, // output is not used. uint16x8_t f6_p1q1, f6_p0q0; const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // filter6() does not apply, but filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). 
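// Companion sketch (again my own, under the same mask-layout assumption):
// when every 16-bit lane of the mask is 0xFFFF, the 64-bit reinterpretation
// is all ones, i.e. -1 as a signed value. The new fast paths test this to
// detect "the wide filter applies to every lane" and skip computing
// filter4() and the bit-select blends entirely.
#include <arm_neon.h>
#include <stdbool.h>

static inline bool mask_u16x4_is_all_set(uint16x4_t mask) {
  return vget_lane_s64(vreinterpret_s64_u16(mask), 0) == -1;
}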
+ if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 && + vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p1q1_output = f6_p1q1; + p0q0_output = f6_p0q0; + } else { + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = + vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + if (vget_lane_u64(need_filter6, 0) == 0) { + // filter6() does not apply, but filter4() applies to one or more values. + p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } uint16x4_t output[4] = { @@ -576,11 +554,7 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, }; transpose_array_inplace_u16_4x4(output); - // dst_n starts at p2, so adjust to p1. - vst1_u16(dst_0 + 1, output[0]); - vst1_u16(dst_1 + 1, output[1]); - vst1_u16(dst_2 + 1, output[2]); - vst1_u16(dst_3 + 1, output[3]); + store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]); } void aom_highbd_lpf_vertical_6_dual_neon( @@ -607,18 +581,14 @@ static inline void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, // ^^^^^^^^^^^ const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^ - uint16x8_t sum = vshlq_n_u16(p23q23, 1); - // Add two other terms to make dual issue with shift more likely. 
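// Small sketch (an assumption of mine, not taken from the patch) of the
// strength reduction used in filter6()/filter8()/filter14() above: a left
// shift by one followed by an add or subtract folds into a single
// multiply-accumulate by the constant 2.
#include <arm_neon.h>

static inline uint16x8_t acc_plus_2x(uint16x8_t acc, uint16x8_t x) {
  // Equivalent to vaddq_u16(acc, vshlq_n_u16(x, 1)).
  return vmlaq_n_u16(acc, x, 2);
}

static inline uint16x8_t acc_minus_2x(uint16x8_t acc, uint16x8_t x) {
  // Equivalent to vsubq_u16(acc, vshlq_n_u16(x, 1)).
  return vmlsq_n_u16(acc, x, 2);
}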
// p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 // ^^^^^^^^^^^ const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^^^ - sum = vaddq_u16(sum, p01q01); + // ^^^^^ ^^^^^^^^^^^^^ + uint16x8_t sum = vmlaq_n_u16(p01q01, p23q23, 2); // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 // ^^^^^^ @@ -654,19 +624,9 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst_p3 = s - 4 * pitch; - uint16_t *const dst_p2 = s - 3 * pitch; - uint16_t *const dst_p1 = s - 2 * pitch; - uint16_t *const dst_p0 = s - pitch; - uint16_t *const dst_q0 = s; - uint16_t *const dst_q1 = s + pitch; - uint16_t *const dst_q2 = s + 2 * pitch; - uint16_t *const dst_q3 = s + 3 * pitch; - - const uint16x4_t src[8] = { vld1_u16(dst_p3), vld1_u16(dst_p2), - vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1), - vld1_u16(dst_q2), vld1_u16(dst_q3) }; + uint16x4_t src[8]; + load_u16_4x8(s - 4 * pitch, pitch, &src[0], &src[1], &src[2], &src[3], + &src[4], &src[5], &src[6], &src[7]); // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); @@ -684,54 +644,59 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or - // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 - // output is not used. uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // filter8() does not apply, but filter4() applies to one or more values. - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). 
+ if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && + vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; + } else { + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, + // filter8 output is not used. + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() does not apply, but filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); + store_u16_4x6(s - 3 * pitch, pitch, vget_low_u16(p2q2_output), + vget_low_u16(p1q1_output), vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), vget_high_u16(p1q1_output), + vget_high_u16(p2q2_output)); } void aom_highbd_lpf_horizontal_8_dual_neon( @@ -749,16 +714,10 @@ static inline uint16x8_t reverse_low_half(const uint16x8_t a) { void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst = s - 4; - uint16_t *const dst_0 = dst; - uint16_t *const dst_1 = dst + pitch; - uint16_t *const dst_2 = dst + 2 * pitch; - uint16_t *const dst_3 = dst + 3 * pitch; - // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. // To get desired pairs after transpose, one half should be reversed. 
- uint16x8_t src[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3) }; + uint16x8_t src[4]; + load_u16_8x4(s - 4, pitch, &src[0], &src[1], &src[2], &src[3]); // src[0] = p0q0 // src[1] = p1q1 @@ -783,45 +742,54 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or - // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 - // output is not used. - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // filter8() does not apply, but filter4() applies to one or more values. - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). + if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && + vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; + } else { + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = + vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, + // filter8 output is not used. 
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() does not apply, but filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 }; @@ -831,10 +799,9 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, // Reverse p values to produce original order: // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, reverse_low_half(output[0])); - vst1q_u16(dst_1, reverse_low_half(output[1])); - vst1q_u16(dst_2, reverse_low_half(output[2])); - vst1q_u16(dst_3, reverse_low_half(output[3])); + store_u16_8x4(s - 4, pitch, reverse_low_half(output[0]), + reverse_low_half(output[1]), reverse_low_half(output[2]), + reverse_low_half(output[3])); } void aom_highbd_lpf_vertical_8_dual_neon( @@ -864,8 +831,8 @@ static inline void filter14( // ^^^^^^^^^^^^^^^^^^^ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) // ^^^^^^^^^^^^^^^^^^^ - uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1); - sum = vaddq_u16(sum, p6q6_x7); + const uint16x8_t p45q45 = vaddq_u16(p5q5, p4q4); + uint16x8_t sum = vmlaq_n_u16(p6q6_x7, p45q45, 2); // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 // ^^^^^^^ @@ -938,27 +905,10 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst_p6 = s - 7 * pitch; - uint16_t *const dst_p5 = s - 6 * pitch; - uint16_t *const dst_p4 = s - 5 * pitch; - uint16_t *const dst_p3 = s - 4 * pitch; - uint16_t *const dst_p2 = s - 3 * pitch; - uint16_t *const dst_p1 = s - 2 * pitch; - uint16_t *const dst_p0 = s - pitch; - uint16_t *const dst_q0 = s; - uint16_t *const dst_q1 = s + pitch; - uint16_t *const dst_q2 = s + 2 * pitch; - uint16_t *const dst_q3 = s + 3 * pitch; - uint16_t *const dst_q4 = s + 4 * pitch; - uint16_t *const dst_q5 = s + 5 * pitch; - uint16_t *const dst_q6 = s + 6 * pitch; - - const uint16x4_t src[14] = { - vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), - vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), - vld1_u16(dst_q5), vld1_u16(dst_q6) - }; + uint16x4_t src[14]; + load_u16_4x14(s - 7 * pitch, pitch, &src[0], &src[1], &src[2], &src[3], + &src[4], &src[5], &src[6], &src[7], &src[8], &src[9], &src[10], + &src[11], &src[12], &src[13]); // Adjust thresholds to bitdepth. 
const int outer_thresh = *blimit << (bd - 8); @@ -976,12 +926,10 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); @@ -991,85 +939,102 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, const uint16x4_t is_flat4_outer_mask = vand_u16( is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), vabdq_u16(p0q0, p6q6), bd)); - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or - // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 - // output is not used. uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // filter8() and filter14() do not apply, but filter4() applies to one or - // more values. + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) { + // filter14() applies to all values. + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = f14_p5q5; + p4q4_output = f14_p4q4; + p3q3_output = f14_p3q3; + p2q2_output = f14_p2q2; + p1q1_output = f14_p1q1; + p0q0_output = f14_p0q0; + } else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && + vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) { + // filter8() applies to all values. + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // filter14() does not apply, but filter8() and filter4() apply to one or + // Copy the masks to the high bits for packed comparisons later. 
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, + // filter8 output is not used. + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() and filter14() do not apply, but filter4() applies to one or // more values. p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { - // All filters may contribute values to final outputs. - const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = + vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // filter14() does not apply, but filter8() and filter4() apply to one + // or more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. 
+ const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } } - vst1_u16(dst_p5, vget_low_u16(p5q5_output)); - vst1_u16(dst_p4, vget_low_u16(p4q4_output)); - vst1_u16(dst_p3, vget_low_u16(p3q3_output)); - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); - vst1_u16(dst_q3, vget_high_u16(p3q3_output)); - vst1_u16(dst_q4, vget_high_u16(p4q4_output)); - vst1_u16(dst_q5, vget_high_u16(p5q5_output)); + store_u16_4x12(s - 6 * pitch, pitch, vget_low_u16(p5q5_output), + vget_low_u16(p4q4_output), vget_low_u16(p3q3_output), + vget_low_u16(p2q2_output), vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), vget_high_u16(p2q2_output), + vget_high_u16(p3q3_output), vget_high_u16(p4q4_output), + vget_high_u16(p5q5_output)); } void aom_highbd_lpf_horizontal_14_dual_neon( @@ -1107,23 +1072,17 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst = s - 8; - uint16_t *const dst_0 = dst; - uint16_t *const dst_1 = dst + pitch; - uint16_t *const dst_2 = dst + 2 * pitch; - uint16_t *const dst_3 = dst + 3 * pitch; - // Low halves: p7 p6 p5 p4 // High halves: p3 p2 p1 p0 - uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3) }; + uint16x8_t src_p[4]; + load_u16_8x4(s - 8, pitch, &src_p[0], &src_p[1], &src_p[2], &src_p[3]); // p7 will be the low half of src_p[0]. Not used until the end. transpose_array_inplace_u16_4x8(src_p); // Low halves: q0 q1 q2 q3 // High halves: q4 q5 q6 q7 - uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8), - vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) }; + uint16x8_t src_q[4]; + load_u16_8x4(s, pitch, &src_q[0], &src_q[1], &src_q[2], &src_q[3]); // q7 will be the high half of src_q[3]. Not used until the end. transpose_array_inplace_u16_4x8(src_q); @@ -1144,12 +1103,11 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. 
return; } -#endif // AOM_ARCH_AARCH64 + const uint16x8_t p4q4 = vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); const uint16x8_t p5q5 = @@ -1164,71 +1122,96 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, const uint16x4_t is_flat4_outer_mask = vand_u16( is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), vabdq_u16(p0q0, p6q6), bd)); - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or - // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 - // output is not used. uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // filter8() and filter14() do not apply, but filter4() applies to one or - // more values. + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) { + // filter14() applies to all values. + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = f14_p5q5; + p4q4_output = f14_p4q4; + p3q3_output = f14_p3q3; + p2q2_output = f14_p2q2; + p1q1_output = f14_p1q1; + p0q0_output = f14_p0q0; + } else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && + vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) { + // filter8() applies to all values. + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // filter14() does not apply, but filter8() and filter4() apply to one or + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = + vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, + // filter8 output is not used. 
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() and filter14() do not apply, but filter4() applies to one or // more values. p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { - // All filters may contribute values to final outputs. - const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = + vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // filter14() does not apply, but filter8() and filter4() apply to one + // or more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. 
+ const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } } + // To get the correctly ordered rows from the transpose, we need: // p7p3 p6p2 p5p1 p4p0 // q0q4 q1q5 q2q6 q3q7 @@ -1236,23 +1219,20 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output); const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output); const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output); + uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0], p5p1_q1q5.val[0], p4p0_q0q4.val[0] }; - transpose_array_inplace_u16_4x8(output_p); uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1], p6p2_q2q6.val[1], p7p3_q3q7.val[1] }; + + transpose_array_inplace_u16_4x8(output_p); transpose_array_inplace_u16_4x8(output_q); // Reverse p values to produce original order: // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, output_p[0]); - vst1q_u16(dst_0 + 8, output_q[0]); - vst1q_u16(dst_1, output_p[1]); - vst1q_u16(dst_1 + 8, output_q[1]); - vst1q_u16(dst_2, output_p[2]); - vst1q_u16(dst_2 + 8, output_q[2]); - vst1q_u16(dst_3, output_p[3]); - vst1q_u16(dst_3 + 8, output_q[3]); + store_u16_8x4(s - 8, pitch, output_p[0], output_p[1], output_p[2], + output_p[3]); + store_u16_8x4(s, pitch, output_q[0], output_q[1], output_q[2], output_q[3]); } void aom_highbd_lpf_vertical_14_dual_neon( diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c index 6beb73ca0d6b..1c0b24ad1af4 100644 --- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c +++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c @@ -146,473 +146,393 @@ static inline uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1, return mask_8x8; } -static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4, - uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, - uint8x8_t *p0q0, const uint8_t blimit, - const uint8_t limit, const uint8_t thresh) { - uint16x8_t out; - uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, - out_f14_pq5; - uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; - uint8x8_t out_f4_pq0, out_f4_pq1; - uint8x8_t mask_8x8, flat_8x8, flat2_8x8; - uint8x8_t q0p0, q1p1, q2p2; - - // Calculate filter masks - mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); - flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); - flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); - { - // filter 4 - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t temp0_8x8, temp1_8x8; - int8x8_t 
ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - hev_8x8 = vmvn_s8(hev_8x8); - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); - } - // reverse p and q - q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); - q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); - q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); - { - // filter 8 - uint16x8_t out_pq0, out_pq1, out_pq2; - out = vaddl_u8(*p3q3, *p2q2); - out = vaddw_u8(out, *p1q1); - out = vaddw_u8(out, *p0q0); - - out = vaddw_u8(out, q0p0); - out_pq1 = vaddw_u8(out, *p3q3); - out_pq2 = vaddw_u8(out_pq1, *p3q3); - out_pq2 = vaddw_u8(out_pq2, *p2q2); - out_pq1 = vaddw_u8(out_pq1, *p1q1); - out_pq1 = vaddw_u8(out_pq1, q1p1); - - out_pq0 = vaddw_u8(out, *p0q0); - out_pq0 = vaddw_u8(out_pq0, q1p1); - out_pq0 = vaddw_u8(out_pq0, q2p2); - - out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); - out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); - out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); - } - { - // filter 14 - uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5; - uint16x8_t p6q6_2, p6q6_temp, qp_sum; - uint8x8_t qp_rev; - - out = vaddw_u8(out, *p4q4); - out = vaddw_u8(out, *p5q5); - out = vaddw_u8(out, *p6q6); - - out_pq5 = vaddw_u8(out, *p4q4); - out_pq4 = vaddw_u8(out_pq5, *p3q3); - out_pq3 = vaddw_u8(out_pq4, *p2q2); - - out_pq5 = vaddw_u8(out_pq5, *p5q5); - out_pq4 = vaddw_u8(out_pq4, *p5q5); - - out_pq0 = vaddw_u8(out, *p1q1); - out_pq1 = vaddw_u8(out_pq0, *p2q2); - out_pq2 = vaddw_u8(out_pq1, *p3q3); - - out_pq0 = vaddw_u8(out_pq0, *p0q0); - out_pq1 = vaddw_u8(out_pq1, *p0q0); - - out_pq1 = vaddw_u8(out_pq1, *p6q6); - p6q6_2 = vaddl_u8(*p6q6, *p6q6); - out_pq2 = vaddq_u16(out_pq2, 
p6q6_2); - p6q6_temp = vaddw_u8(p6q6_2, *p6q6); - out_pq3 = vaddq_u16(out_pq3, p6q6_temp); - p6q6_temp = vaddw_u8(p6q6_temp, *p6q6); - out_pq4 = vaddq_u16(out_pq4, p6q6_temp); - p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2); - out_pq5 = vaddq_u16(out_pq5, p6q6_temp); - - out_pq4 = vaddw_u8(out_pq4, q1p1); - - qp_sum = vaddl_u8(q2p2, q1p1); - out_pq3 = vaddq_u16(out_pq3, qp_sum); - - qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3))); - qp_sum = vaddw_u8(qp_sum, qp_rev); - out_pq2 = vaddq_u16(out_pq2, qp_sum); - - qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4))); - qp_sum = vaddw_u8(qp_sum, qp_rev); - out_pq1 = vaddq_u16(out_pq1, qp_sum); - - qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5))); - qp_sum = vaddw_u8(qp_sum, qp_rev); - out_pq0 = vaddq_u16(out_pq0, qp_sum); - - out_pq0 = vaddw_u8(out_pq0, q0p0); - - out_f14_pq0 = vrshrn_n_u16(out_pq0, 4); - out_f14_pq1 = vrshrn_n_u16(out_pq1, 4); - out_f14_pq2 = vrshrn_n_u16(out_pq2, 4); - out_f14_pq3 = vrshrn_n_u16(out_pq3, 4); - out_f14_pq4 = vrshrn_n_u16(out_pq4, 4); - out_f14_pq5 = vrshrn_n_u16(out_pq5, 4); - } - { - uint8x8_t filter4_cond, filter8_cond, filter14_cond; - filter8_cond = vand_u8(flat_8x8, mask_8x8); - filter4_cond = vmvn_u8(filter8_cond); - filter14_cond = vand_u8(filter8_cond, flat2_8x8); - - // filter4 outputs - *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); - *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); - - // filter8 outputs - *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); - *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); - *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); - - // filter14 outputs - *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); - *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); - *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); - *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); - *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); - *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); - } -} - -static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, - uint8x8_t *p0q0, const uint8_t blimit, - const uint8_t limit, const uint8_t thresh) { - uint16x8_t out; - uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; - uint8x8_t out_f4_pq0, out_f4_pq1; - uint8x8_t mask_8x8, flat_8x8; - - // Calculate filter masks - mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); - flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); - { - // filter 4 - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = 
vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - hev_8x8 = vmvn_s8(hev_8x8); - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); - } - { - // filter 8 - uint16x8_t out_pq0, out_pq1, out_pq2; - uint8x8_t q0p0, q1p1, q2p2; - - out = vaddl_u8(*p3q3, *p2q2); - out = vaddw_u8(out, *p1q1); - out = vaddw_u8(out, *p0q0); - - // reverse p and q - q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); - q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); - q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); - - out = vaddw_u8(out, q0p0); - out_pq1 = vaddw_u8(out, *p3q3); - out_pq2 = vaddw_u8(out_pq1, *p3q3); - out_pq2 = vaddw_u8(out_pq2, *p2q2); - out_pq1 = vaddw_u8(out_pq1, *p1q1); - out_pq1 = vaddw_u8(out_pq1, q1p1); - - out_pq0 = vaddw_u8(out, *p0q0); - out_pq0 = vaddw_u8(out_pq0, q1p1); - out_pq0 = vaddw_u8(out_pq0, q2p2); - - out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); - out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); - out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); - } - { - uint8x8_t filter4_cond, filter8_cond; - filter8_cond = vand_u8(flat_8x8, mask_8x8); - filter4_cond = vmvn_u8(filter8_cond); - - // filter4 outputs - *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); - *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); - - // filter8 outputs - *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); - *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); - *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); - } -} - -static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, - const uint8_t blimit, const uint8_t limit, - const uint8_t thresh) { - uint16x8_t out; - uint8x8_t out_f6_pq0, out_f6_pq1; - uint8x8_t out_f4_pq0, out_f4_pq1; - uint8x8_t mask_8x8, flat_8x8; - - // Calculate filter masks - mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); - flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); - { - // filter 4 - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = 
vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vbic_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); - } - { - // filter 6 - uint16x8_t out_pq0, out_pq1; - uint8x8_t pq_rev; - - out = vaddl_u8(*p0q0, *p1q1); - out = vaddq_u16(out, out); - out = vaddw_u8(out, *p2q2); - - pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); - out = vaddw_u8(out, pq_rev); - - out_pq0 = vaddw_u8(out, pq_rev); - pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); - out_pq0 = vaddw_u8(out_pq0, pq_rev); - - out_pq1 = vaddw_u8(out, *p2q2); - out_pq1 = vaddw_u8(out_pq1, *p2q2); - - out_f6_pq0 = vrshrn_n_u16(out_pq0, 3); - out_f6_pq1 = vrshrn_n_u16(out_pq1, 3); - } - { - uint8x8_t filter4_cond, filter6_cond; - filter6_cond = vand_u8(flat_8x8, mask_8x8); - filter4_cond = vmvn_u8(filter6_cond); - - // filter4 outputs - *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); - *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); - - // filter6 outputs - *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); - *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); - } -} - -static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, - const uint8_t limit, const uint8_t thresh) { - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; +static inline void filter4(const uint8x8_t p0q0, const uint8x8_t p1q1, + uint8x8_t *p0q0_output, uint8x8_t *p1q1_output, + uint8x8_t mask_8x8, const uint8_t thresh) { const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t mask_8x8, temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; const int8x8_t sign_mask = vdup_n_s8(0x80); const int8x8_t val_4 = vdup_n_s8(4); const int8x8_t val_3 = vdup_n_s8(3); - // Calculate filter mask - mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); + int8x8_t pq_s0 = veor_s8(vreinterpret_s8_u8(p0q0), sign_mask); + int8x8_t pq_s1 = veor_s8(vreinterpret_s8_u8(p1q1), sign_mask); - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), 
vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + int32x2x2_t ps0_qs0 = + vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + int32x2x2_t ps1_qs1 = + vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + int8x8_t ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + int8x8_t qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + int8x8_t ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + int8x8_t qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + uint8x8_t temp0_8x8 = vcgt_u8(vabd_u8(p0q0, p1q1), thresh_f4); + uint8x8_t temp1_8x8 = + vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + int8x8_t hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + int8x8_t filter_s8 = vqsub_s8(ps1_s8, qs1_s8); filter_s8 = vand_s8(filter_s8, hev_8x8); // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); + int8x8_t temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + int16x8_t filter_s16 = vmovl_s8(filter_s8); filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); filter_s8 = vqmovn_s16(filter_s16); filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); + int8x8_t filter1_s8 = vqadd_s8(filter_s8, val_4); + int8x8_t filter2_s8 = vqadd_s8(filter_s8, val_3); filter1_s8 = vshr_n_s8(filter1_s8, 3); filter2_s8 = vshr_n_s8(filter2_s8, 3); - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + int8x8_t oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + int8x8_t op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); filter_s8 = vrshr_n_s8(filter1_s8, 1); filter_s8 = vbic_s8(filter_s8, hev_8x8); - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + int8x8_t oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + int8x8_t op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + *p0q0_output = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + *p1q1_output = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); +} + +static inline void filter8(const uint8x8_t p0q0, const uint8x8_t p1q1, + const uint8x8_t p2q2, const uint8x8_t p3q3, + uint8x8_t *p0q0_output, uint8x8_t *p1q1_output, + uint8x8_t *p2q2_output) { + // Reverse p and q. 
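For reference, a scalar model of the filter4() helper the patch factors out above; it assumes the pixels have already been biased into the signed domain by the XOR with 0x80, and the names are illustrative rather than libaom API:

#include <stdint.h>

static inline int8_t clamp_s8(int v) {
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

/* mask and hev are per-pixel booleans here; the NEON code carries them as
 * 0x00/0xFF lane masks and applies them with vand_s8/vbic_s8 instead of
 * branching. */
static void filter4_scalar(int8_t *p1, int8_t *p0, int8_t *q0, int8_t *q1,
                           int mask, int hev) {
  int f = hev ? clamp_s8(*p1 - *q1) : 0;                 /* outer-tap seed */
  f = mask ? clamp_s8(f + 3 * clamp_s8(*q0 - *p0)) : 0;  /* inner taps */
  const int f1 = clamp_s8(f + 4) >> 3;
  const int f2 = clamp_s8(f + 3) >> 3;
  *q0 = clamp_s8(*q0 - f1);
  *p0 = clamp_s8(*p0 + f2);
  const int f3 = hev ? 0 : (f1 + 1) >> 1;  /* rounded, skipped on high edge
                                              variance */
  *q1 = clamp_s8(*q1 - f3);
  *p1 = clamp_s8(*p1 + f3);
}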
+ uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4); + uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4); + uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4); + + uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1); + uint16x8_t p2q2_p3q3 = vaddl_u8(p3q3, p2q2); + uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3); + + uint16x8_t q0p0_p3q3 = vaddl_u8(q0p0, p3q3); + uint16x8_t out_q0p0_p3q3 = vaddq_u16(out, q0p0_p3q3); + + uint16x8_t out_pq2 = vaddq_u16(out_q0p0_p3q3, p2q2_p3q3); + + uint16x8_t p1q1_q1p1 = vaddl_u8(p1q1, q1p1); + uint16x8_t out_pq1 = vaddq_u16(out_q0p0_p3q3, p1q1_q1p1); + + uint16x8_t q0p0_p0q0 = vaddl_u8(q0p0, p0q0); + uint16x8_t q1p1_q2p2 = vaddl_u8(q1p1, q2p2); + uint16x8_t out_pq0 = vaddq_u16(q0p0_p0q0, q1p1_q2p2); + out_pq0 = vaddq_u16(out_pq0, out); + + *p0q0_output = vrshrn_n_u16(out_pq0, 3); + *p1q1_output = vrshrn_n_u16(out_pq1, 3); + *p2q2_output = vrshrn_n_u16(out_pq2, 3); +} + +static inline void filter14(const uint8x8_t p0q0, const uint8x8_t p1q1, + const uint8x8_t p2q2, const uint8x8_t p3q3, + const uint8x8_t p4q4, const uint8x8_t p5q5, + const uint8x8_t p6q6, uint8x8_t *p0q0_output, + uint8x8_t *p1q1_output, uint8x8_t *p2q2_output, + uint8x8_t *p3q3_output, uint8x8_t *p4q4_output, + uint8x8_t *p5q5_output) { + // Reverse p and q. + uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4); + uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4); + uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4); + uint8x8_t q3p3 = vext_u8(p3q3, p3q3, 4); + uint8x8_t q4p4 = vext_u8(p4q4, p4q4, 4); + uint8x8_t q5p5 = vext_u8(p5q5, p5q5, 4); + + uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1); + uint16x8_t p2q2_p3q3 = vaddl_u8(p2q2, p3q3); + uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3); + + uint16x8_t q0p0_p4q4 = vaddl_u8(q0p0, p4q4); + uint16x8_t p5q5_p6q6 = vaddl_u8(p5q5, p6q6); + uint16x8_t tmp = vaddq_u16(q0p0_p4q4, p5q5_p6q6); + // This offset removes the need for a rounding shift at the end. 
+ uint16x8_t tmp_offset = vaddq_u16(tmp, vdupq_n_u16(1 << 3)); + out = vaddq_u16(out, tmp_offset); + + uint16x8_t out_pq5 = vaddw_u8(out, p4q4); + uint16x8_t out_pq4 = vaddw_u8(out_pq5, p3q3); + uint16x8_t out_pq3 = vaddw_u8(out_pq4, p2q2); + + out_pq5 = vaddw_u8(out_pq5, p5q5); + + uint16x8_t out_pq0 = vaddw_u8(out, p1q1); + uint16x8_t out_pq1 = vaddw_u8(out_pq0, p2q2); + uint16x8_t out_pq2 = vaddw_u8(out_pq1, p3q3); + + uint16x8_t p0q0_q0p0 = vaddl_u8(p0q0, q0p0); + out_pq0 = vaddq_u16(out_pq0, p0q0_q0p0); + + uint16x8_t p0q0_p6q6 = vaddl_u8(p0q0, p6q6); + out_pq1 = vaddq_u16(out_pq1, p0q0_p6q6); + uint16x8_t p5q5_q1p1 = vaddl_u8(p5q5, q1p1); + out_pq4 = vaddq_u16(out_pq4, p5q5_q1p1); + + uint16x8_t p6q6_p6q6 = vaddl_u8(p6q6, p6q6); + out_pq2 = vaddq_u16(out_pq2, p6q6_p6q6); + uint16x8_t p6q6_temp = vaddw_u8(p6q6_p6q6, p6q6); + out_pq3 = vaddq_u16(out_pq3, p6q6_temp); + p6q6_temp = vaddw_u8(p6q6_temp, p6q6); + out_pq4 = vaddq_u16(out_pq4, p6q6_temp); + p6q6_temp = vaddq_u16(p6q6_temp, p6q6_p6q6); + out_pq5 = vaddq_u16(out_pq5, p6q6_temp); + + uint16x8_t qp_sum = vaddl_u8(q2p2, q1p1); + out_pq3 = vaddq_u16(out_pq3, qp_sum); + + qp_sum = vaddw_u8(qp_sum, q3p3); + out_pq2 = vaddq_u16(out_pq2, qp_sum); + + qp_sum = vaddw_u8(qp_sum, q4p4); + out_pq1 = vaddq_u16(out_pq1, qp_sum); + + qp_sum = vaddw_u8(qp_sum, q5p5); + out_pq0 = vaddq_u16(out_pq0, qp_sum); + + *p0q0_output = vshrn_n_u16(out_pq0, 4); + *p1q1_output = vshrn_n_u16(out_pq1, 4); + *p2q2_output = vshrn_n_u16(out_pq2, 4); + *p3q3_output = vshrn_n_u16(out_pq3, 4); + *p4q4_output = vshrn_n_u16(out_pq4, 4); + *p5q5_output = vshrn_n_u16(out_pq5, 4); +} + +static inline void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, + uint8x8_t *p4q4, uint8x8_t *p3q3, + uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, + out_f14_pq5; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + + // Calculate filter masks. + uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + uint8x8_t flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); + + // No filtering. + if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { + return; + } + + uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8); + uint8x8_t filter4_cond = vmvn_u8(filter8_cond); + uint8x8_t filter14_cond = vand_u8(filter8_cond, flat2_8x8); + + if (vget_lane_s64(vreinterpret_s64_u8(filter14_cond), 0) == -1) { + // Only filter14() applies. + filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0, + &out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4, + &out_f14_pq5); + + *p0q0 = out_f14_pq0; + *p1q1 = out_f14_pq1; + *p2q2 = out_f14_pq2; + *p3q3 = out_f14_pq3; + *p4q4 = out_f14_pq4; + *p5q5 = out_f14_pq5; + } else if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 && + vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) { + // Only filter8() applies. + filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2); + + *p0q0 = out_f7_pq0; + *p1q1 = out_f7_pq1; + *p2q2 = out_f7_pq2; + } else { + filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); + + if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 && + vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) { + // filter8() and filter14() do not apply, but filter4() applies to one or + // more values. 
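The (1 << 3) bias added near the top of filter14() is the usual trick of folding the rounding constant into the accumulator: for the final shift by 4, round(x / 16) = (x + 8) >> 4, so pre-adding 8 once lets all six narrowings use the truncating vshrn_n_u16 instead of the rounding vrshrn_n_u16. As an identity (illustrative, and valid here because the pixel sums stay far below 16-bit overflow):

/* For the accumulator ranges used in filter14():
 *   vrshrn_n_u16(x, 4) == vshrn_n_u16(vaddq_u16(x, vdupq_n_u16(1 << 3)), 4)
 * filter8() and filter6() keep vrshrn_n_u16(..., 3) because they do not
 * pre-add their rounding bias of 4. */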
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + } else { + filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, + &out_f7_pq2); + + if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0) { + // filter14() does not apply, but filter8() and filter4() apply to one + // or more values. filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + } else { + // All filters may contribute values to final outputs. + filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0, + &out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4, + &out_f14_pq5); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + + // filter14 outputs + *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); + *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); + *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); + *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); + *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); + *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); + } + } + } +} + +static inline void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + + // Calculate filter masks. + uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + + // No filtering. + if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { + return; + } + + uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8); + uint8x8_t filter4_cond = vmvn_u8(filter8_cond); + + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). + if (vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) { + filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2); + + *p0q0 = out_f7_pq0; + *p1q1 = out_f7_pq1; + *p2q2 = out_f7_pq2; + } else { + filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); + + if (vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) { + // filter8() does not apply, but filter4() applies to one or more values. 
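Spelled out per pixel, the mask algebra shared by the rewritten lpf_* functions (the filterN_cond masks built from the needs-filter and flatness masks) amounts to the ladder below; the scalar vget_lane tests merely detect the cases where one rung holds for every lane, so the vbsl_u8 blends can be skipped entirely. This is a descriptive sketch, not patch code:

/* Per pixel:
 *   if (!mask)        leave the pixels unchanged;
 *   else if (!flat)   take the filter4() outputs (p1, p0, q0, q1 only);
 *   else if (!flat2)  take the filter8() outputs (p2 .. q2);   [lpf_14 only]
 *   else              take the filter14() outputs (p5 .. q5).  [lpf_14 only]
 * lpf_8_neon and lpf_6_neon use the same first two rungs (with filter8() or
 * filter6() when flat is set), and lpf_4_neon only ever uses filter4(). */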
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + } else { + filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, + &out_f7_pq2); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + } + } +} + +static inline void filter6(const uint8x8_t p0q0, const uint8x8_t p1q1, + const uint8x8_t p2q2, uint8x8_t *p0q0_output, + uint8x8_t *p1q1_output) { + uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4); + + uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1); + uint16x8_t out = vaddq_u16(p0q0_p1q1, p0q0_p1q1); + + uint16x8_t q0p0_p2q2 = vaddl_u8(q0p0, p2q2); + out = vaddq_u16(out, q0p0_p2q2); + + uint16x8_t q0p0_q1p1 = vextq_u16(p0q0_p1q1, p0q0_p1q1, 4); + uint16x8_t out_pq0 = vaddq_u16(out, q0p0_q1p1); + + uint16x8_t p2q2_p2q2 = vaddl_u8(p2q2, p2q2); + uint16x8_t out_pq1 = vaddq_u16(out, p2q2_p2q2); + + *p0q0_output = vrshrn_n_u16(out_pq0, 3); + *p1q1_output = vrshrn_n_u16(out_pq1, 3); +} + +static inline void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, + const uint8_t blimit, const uint8_t limit, + const uint8_t thresh) { + uint8x8_t out_f6_pq0, out_f6_pq1; + uint8x8_t out_f4_pq0, out_f4_pq1; + + // Calculate filter masks. + uint8x8_t mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); + uint8x8_t flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); + + // No filtering. + if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { + return; + } + + uint8x8_t filter6_cond = vand_u8(flat_8x8, mask_8x8); + uint8x8_t filter4_cond = vmvn_u8(filter6_cond); + + // Not needing filter4 at all is a very common case, so isolate it to avoid + // needlessly computing filter4. + if (vget_lane_s64(vreinterpret_s64_u8(filter6_cond), 0) == -1) { + filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1); + + *p0q0 = out_f6_pq0; + *p1q1 = out_f6_pq1; + } else { + filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); + + if (vget_lane_u64(vreinterpret_u64_u8(filter6_cond), 0) == 0) { + // filter6 does not apply, but filter4 applies to one or more values. + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + } else { + // All filters may contribute to the final output. + filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter6 outputs + *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); + *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); + } + } +} + +static inline void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, + const uint8_t blimit, const uint8_t limit, + const uint8_t thresh) { + uint8x8_t out_f4_pq0, out_f4_pq1; + + // Calculate filter mask + uint8x8_t mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); + + // No filtering. 
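The "reverse p and q" steps in the rewritten helpers no longer go through vrev64_u32 reinterpret casts: because each vector keeps the four p samples in one half and the four q samples in the other, rotating the vector by half its length with vext swaps the halves in one instruction. A small sketch, with illustrative function names:

#include <arm_neon.h>

/* p0q0 packs the four p0 samples in the low half and the four q0 samples in
 * the high half, so a rotate by 4 lanes yields q0p0. */
static inline uint8x8_t swap_pq_u8(uint8x8_t p0q0) {
  return vext_u8(p0q0, p0q0, 4);
}

/* The same idea on the widened 16-bit accumulators, as used in filter6(). */
static inline uint16x8_t swap_pq_u16(uint16x8_t pq) {
  return vextq_u16(pq, pq, 4);
}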
+ if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { + return; + } + + filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); + + *p0q0 = out_f4_pq0; + *p1q1 = out_f4_pq1; } void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, diff --git a/third_party/aom/aom_dsp/arm/mem_neon.h b/third_party/aom/aom_dsp/arm/mem_neon.h index 494dde14a3ee..9cdafecedcd5 100644 --- a/third_party/aom/aom_dsp/arm/mem_neon.h +++ b/third_party/aom/aom_dsp/arm/mem_neon.h @@ -55,12 +55,52 @@ static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { return res; } +static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) { + int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } }; + return res; +} + +static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) { + int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8), + vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } }; + return res; +} + +static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) { + vst1_u8(ptr + 0 * 8, a.val[0]); + vst1_u8(ptr + 1 * 8, a.val[1]); +} + +static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) { + vst1_u8(ptr + 0 * 8, a.val[0]); + vst1_u8(ptr + 1 * 8, a.val[1]); + vst1_u8(ptr + 2 * 8, a.val[2]); + vst1_u8(ptr + 3 * 8, a.val[3]); +} + +static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) { + vst1q_u16(ptr + 0 * 8, a.val[0]); + vst1q_u16(ptr + 1 * 8, a.val[1]); +} + +static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) { + vst1q_u16(ptr + 0 * 8, a.val[0]); + vst1q_u16(ptr + 1 * 8, a.val[1]); + vst1q_u16(ptr + 2 * 8, a.val[2]); + vst1q_u16(ptr + 3 * 8, a.val[3]); +} + #elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit. #if __GNUC__ < 8 static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) { uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; return res; } + +static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) { + int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } }; + return res; +} #endif // __GNUC__ < 8 #if __GNUC__ < 9 @@ -71,13 +111,30 @@ static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) { } #endif // __GNUC__ < 9 -// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards. 
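The vld1q_s16_x{2,4}, vst1_u8_x{2,4} and vst1q_u16_x{2,4} helpers added above exist to cover toolchains whose arm_neon.h predates those intrinsics; they are just consecutive single-register loads and stores, which also makes them the non-interleaving counterparts of the vld2/vst2/vld4/vst4 structure ops that cfl_neon.c switches away from later in this patch. Illustrative contrast (values are made up):

/* With v.val[0] = {a0 .. a7} and v.val[1] = {b0 .. b7}:
 *   vst2_u8(dst, v);     writes a0 b0 a1 b1 ... a7 b7   (interleaved)
 *   vst1_u8_x2(dst, v);  writes a0 ... a7 b0 ... b7     (two contiguous runs)
 * CfL prediction wants the contiguous layout, which is why the removed
 * comments in cfl_neon.c describe the interleaving forms as not useful. */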
#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805 static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8), vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } }; return res; } + +static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) { + int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8), + vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } }; + return res; +} + +static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) { + vst1_u8(ptr + 0 * 8, a.val[0]); + vst1_u8(ptr + 1 * 8, a.val[1]); +} + +static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) { + vst1_u8(ptr + 0 * 8, a.val[0]); + vst1_u8(ptr + 1 * 8, a.val[1]); + vst1_u8(ptr + 2 * 8, a.val[2]); + vst1_u8(ptr + 3 * 8, a.val[3]); +} #endif // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805 #endif // defined(__GNUC__) && !defined(__clang__) @@ -215,6 +272,23 @@ static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p, s += p; } +static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3, + uint16x4_t *const s4, uint16x4_t *const s5) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; + *s4 = vld1_u16(s); + s += p; + *s5 = vld1_u16(s); +} + static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3, @@ -235,6 +309,65 @@ static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p, *s6 = vld1_u16(s); } +static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3, + uint16x4_t *const s4, uint16x4_t *const s5, + uint16x4_t *const s6, uint16x4_t *const s7) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; + *s4 = vld1_u16(s); + s += p; + *s5 = vld1_u16(s); + s += p; + *s6 = vld1_u16(s); + s += p; + *s7 = vld1_u16(s); +} + +static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3, + uint16x4_t *const s4, uint16x4_t *const s5, + uint16x4_t *const s6, uint16x4_t *const s7, + uint16x4_t *const s8, uint16x4_t *const s9, + uint16x4_t *const s10, uint16x4_t *const s11, + uint16x4_t *const s12, uint16x4_t *const s13) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; + *s4 = vld1_u16(s); + s += p; + *s5 = vld1_u16(s); + s += p; + *s6 = vld1_u16(s); + s += p; + *s7 = vld1_u16(s); + s += p; + *s8 = vld1_u16(s); + s += p; + *s9 = vld1_u16(s); + s += p; + *s10 = vld1_u16(s); + s += p; + *s11 = vld1_u16(s); + s += p; + *s12 = vld1_u16(s); + s += p; + *s13 = vld1_u16(s); +} + static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1) { *s0 = vld1q_s16(s); @@ -597,6 +730,56 @@ static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride, vst1_u16(s, s3); } +static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3, + const uint16x4_t s4, const uint16x4_t s5) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); + s += dst_stride; + vst1_u16(s, s3); + s += 
dst_stride; + vst1_u16(s, s4); + s += dst_stride; + vst1_u16(s, s5); +} + +static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3, + const uint16x4_t s4, const uint16x4_t s5, + const uint16x4_t s6, const uint16x4_t s7, + const uint16x4_t s8, const uint16x4_t s9, + const uint16x4_t s10, const uint16x4_t s11) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); + s += dst_stride; + vst1_u16(s, s3); + s += dst_stride; + vst1_u16(s, s4); + s += dst_stride; + vst1_u16(s, s5); + s += dst_stride; + vst1_u16(s, s6); + s += dst_stride; + vst1_u16(s, s7); + s += dst_stride; + vst1_u16(s, s8); + s += dst_stride; + vst1_u16(s, s9); + s += dst_stride; + vst1_u16(s, s10); + s += dst_stride; + vst1_u16(s, s11); + s += dst_stride; +} + static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride, const uint16x8_t s0, const uint16x8_t s1) { vst1q_u16(s, s0); diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index bbaa0a0c4818..0f829821a996 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -46,16 +46,6 @@ static inline __m128i xx_loadu_128(const void *a) { return _mm_loadu_si128((const __m128i *)a); } -// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function -// manually on older compilers. -#if !defined(__clang__) && __GNUC_MAJOR__ < 9 -static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) { - __m64 hi_, lo_; - memcpy(&hi_, hi, sizeof(hi_)); - memcpy(&lo_, lo, sizeof(lo_)); - return _mm_set_epi64(hi_, lo_); -} -#else // Load 64 bits from each of hi and low, and pack into an SSE register // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate // the strict aliasing rule, this takes a different approach @@ -63,7 +53,6 @@ static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) { return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo), _mm_loadl_epi64((const __m128i *)hi)); } -#endif static inline void xx_storel_32(void *const a, const __m128i v) { const int val = _mm_cvtsi128_si32(v); diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h index 5b8a79f8c445..20e6a4b23a04 100644 --- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -76,26 +76,11 @@ static inline __m256i yy_loadu_4x64(const void *e3, const void *e2, return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01)); } -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) - -// _mm256_loadu2_m128i has been introduced in GCC 10.1 -#if !defined(__clang__) && GCC_VERSION < 101000 -static inline __m256i yy_loadu2_128(const void *hi, const void *lo) { - __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); - __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); - return _mm256_set_m128i(mhi, mlo); -} -#else static inline __m256i yy_loadu2_128(const void *hi, const void *lo) { __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); return yy_set_m128i(mhi, mlo); } -#endif - -#undef GCC_VERSION static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) { _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); diff --git a/third_party/aom/aom_ports/aom_ports.cmake b/third_party/aom/aom_ports/aom_ports.cmake index 1746efa3cc29..33382e9541ab 
100644 --- a/third_party/aom/aom_ports/aom_ports.cmake +++ b/third_party/aom/aom_ports/aom_ports.cmake @@ -38,6 +38,9 @@ endif() list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h" "${AOM_ROOT}/aom_ports/ppc_cpudetect.c") +list(APPEND AOM_PORTS_SOURCES_RISCV "${AOM_ROOT}/aom_ports/riscv.h" + "${AOM_ROOT}/aom_ports/riscv_cpudetect.c") + # For arm and x86 targets: # # * Creates the aom_ports build target, adds the includes in aom_ports to the @@ -68,9 +71,12 @@ function(setup_aom_ports_targets) elseif("${AOM_TARGET_CPU}" MATCHES "ppc") add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC}) set(aom_ports_has_symbols 1) + elseif("${AOM_TARGET_CPU}" MATCHES "riscv") + add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_RISCV}) + set(aom_ports_has_symbols 1) endif() - if("${AOM_TARGET_CPU}" MATCHES "arm|ppc") + if("${AOM_TARGET_CPU}" MATCHES "arm|ppc|riscv") target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) diff --git a/third_party/aom/aom_ports/riscv.h b/third_party/aom/aom_ports/riscv.h new file mode 100644 index 000000000000..91cee48e6f0e --- /dev/null +++ b/third_party/aom/aom_ports/riscv.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_RISCV_H_ +#define AOM_AOM_PORTS_RISCV_H_ +#include + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_RVV 0x01 + +int riscv_simd_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_RISCV_H_ diff --git a/third_party/aom/aom_ports/riscv_cpudetect.c b/third_party/aom/aom_ports/riscv_cpudetect.c new file mode 100644 index 000000000000..af3663c1a8f6 --- /dev/null +++ b/third_party/aom/aom_ports/riscv_cpudetect.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/riscv.h" + +#if CONFIG_RUNTIME_CPU_DETECT + +#include + +#define HWCAP_RVV (1 << ('v' - 'a')) + +int riscv_simd_caps(void) { + int flags = 0; +#if HAVE_RVV + unsigned long hwcap = getauxval(AT_HWCAP); + if (hwcap & HWCAP_RVV) flags |= HAS_RVV; +#endif + return flags; +} +#else +// If there is no RTCD the function pointers are not used and can not be +// changed. 
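The new riscv_cpudetect.c mirrors the existing cpu detectors in aom_ports: it queries the Linux auxiliary vector and reports HAS_RVV when the 'V' extension bit is present ('v' - 'a' == 21, so HWCAP_RVV is bit 21 of AT_HWCAP). A hedged sketch of how a runtime-dispatch caller would consume it; the function below is illustrative, not the generated RTCD code:

#include "aom_ports/riscv.h"

void maybe_enable_rvv_paths(void) {
  const int caps = riscv_simd_caps();
  if (caps & HAS_RVV) {
    /* Point the relevant function pointers at their *_rvv specializations,
     * e.g. the cdef kernels registered in av1_rtcd_defs.pl below. */
  }
}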
+int riscv_simd_caps(void) { return 0; } +#endif // CONFIG_RUNTIME_CPU_DETECT diff --git a/third_party/aom/apps/aomenc.c b/third_party/aom/apps/aomenc.c index ff60ef0112c0..3d58a8946ca2 100644 --- a/third_party/aom/apps/aomenc.c +++ b/third_party/aom/apps/aomenc.c @@ -2318,8 +2318,9 @@ int main(int argc, const char **argv_) { "match input format.\n", stream->config.cfg.g_profile); } - if ((global.show_psnr == 2) && (stream->config.cfg.g_input_bit_depth == - stream->config.cfg.g_bit_depth)) { + if (global.show_psnr == 2 && + stream->config.cfg.g_input_bit_depth == + (unsigned int)stream->config.cfg.g_bit_depth) { fprintf(stderr, "Warning: --psnr==2 and --psnr==1 will provide same " "results when input bit-depth == stream bit-depth, " diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake index 486596256841..b5e656c4c3e0 100644 --- a/third_party/aom/av1/av1.cmake +++ b/third_party/aom/av1/av1.cmake @@ -445,6 +445,9 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c") +list(APPEND AOM_AV1_COMMON_INTRIN_RVV + "${AOM_ROOT}/av1/common/riscv/cdef_block_rvv.c") + if(CONFIG_THREE_PASS) list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/thirdpass.c" "${AOM_ROOT}/av1/encoder/thirdpass.h") @@ -822,6 +825,13 @@ function(setup_av1_targets) endif() endif() + if(HAVE_RVV) + if(AOM_AV1_COMMON_INTRIN_RVV) + add_intrinsics_object_library("-march=rv64gcv" "rvv" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_RVV") + endif() + endif() + # Pass the new lib targets up to the parent scope instance of # $AOM_LIB_TARGETS. set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c index 813a9889104a..971dd7bc9db3 100644 --- a/third_party/aom/av1/av1_cx_iface.c +++ b/third_party/aom/av1/av1_cx_iface.c @@ -1084,7 +1084,6 @@ static void set_encoder_config(AV1EncoderConfig *oxcf, AlgoCfg *const algo_cfg = &oxcf->algo_cfg; ToolCfg *const tool_cfg = &oxcf->tool_cfg; - const int is_vbr = cfg->rc_end_usage == AOM_VBR; oxcf->profile = cfg->g_profile; oxcf->max_threads = (int)cfg->g_threads; @@ -1167,9 +1166,9 @@ static void set_encoder_config(AV1EncoderConfig *oxcf, rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level); rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct; rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct; - rc_cfg->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; - rc_cfg->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; - rc_cfg->optimal_buffer_level_ms = is_vbr ? 
60000 : cfg->rc_buf_optimal_sz; + rc_cfg->maximum_buffer_size_ms = cfg->rc_buf_sz; + rc_cfg->starting_buffer_level_ms = cfg->rc_buf_initial_sz; + rc_cfg->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz; // Convert target bandwidth from Kbit/s to Bit/s rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate; rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh; diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c index c1763ff8b76e..e7f0ff094eb9 100644 --- a/third_party/aom/av1/common/arm/cfl_neon.c +++ b/third_party/aom/av1/common/arm/cfl_neon.c @@ -13,6 +13,7 @@ #include "config/aom_config.h" #include "config/av1_rtcd.h" +#include "aom_dsp/arm/mem_neon.h" #include "av1/common/cfl.h" static inline void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset, @@ -428,10 +429,7 @@ static inline int16x8_t predict_w8(const int16_t *pred_buf_q3, static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3, int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { - // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2 - // does not interleave, but is not currently available in the compilier used - // by the AOM build system. - const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3); + const int16x8x2_t ac_q3 = vld1q_s16_x2(pred_buf_q3); const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); const int16x8_t scaled_luma_0 = @@ -447,10 +445,7 @@ static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3, static inline int16x8x4_t predict_w32(const int16_t *pred_buf_q3, int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { - // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4 - // does not interleave, but is not currently available in the compilier used - // by the AOM build system. 
- const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3); + const int16x8x4_t ac_q3 = vld1q_s16_x4(pred_buf_q3); const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]); @@ -497,7 +492,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]) } }; - vst2_u8(dst, predun); + vst1_u8_x2(dst, predun); } else { const int16x8x4_t pred = predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); @@ -505,7 +500,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]), vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) } }; - vst4_u8(dst, predun); + vst1_u8_x4(dst, predun); } dst += dst_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); @@ -574,11 +569,11 @@ static inline void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, } else if (width == 16) { const int16x8x2_t pred = predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); - vst2q_u16(dst, clamp2q_s16(pred, max_16x8)); + vst1q_u16_x2(dst, clamp2q_s16(pred, max_16x8)); } else { const int16x8x4_t pred = predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); - vst4q_u16(dst, clamp4q_s16(pred, max_16x8)); + vst1q_u16_x4(dst, clamp4q_s16(pred, max_16x8)); } dst += dst_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h index 766abffff0ae..c5c8421f6a05 100644 --- a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h +++ b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h @@ -53,8 +53,7 @@ static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, static AOM_FORCE_INLINE int16x8_t load_filters_1(int ofs) { const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS]; return vld1q_s16(base + ofs0 * 8); } @@ -65,8 +64,7 @@ static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int ofs, const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS]; out[0] = vld1q_s16(base + ofs0 * 8); out[1] = vld1q_s16(base + ofs1 * 8); out[2] = vld1q_s16(base + ofs2 * 8); @@ -84,8 +82,7 @@ static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int ofs, const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS); const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS); - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS]; out[0] = vld1q_s16(base + ofs0 * 8); out[1] = vld1q_s16(base + ofs1 * 8); out[2] = vld1q_s16(base + ofs2 * 8); diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c index 3656beb3995a..497273bc65c9 100644 --- a/third_party/aom/av1/common/arm/warp_plane_neon.c +++ b/third_party/aom/av1/common/arm/warp_plane_neon.c @@ -101,8 +101,7 @@ 
horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_4x1_f1_beta0(in, f_s16); } @@ -140,8 +139,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_8x1_f1_beta0(in, f_s16); } @@ -156,8 +154,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int16x4_t s6 = vget_low_s16(src[6]); int16x4_t s7 = vget_low_s16(src[7]); - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); @@ -210,8 +207,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.h b/third_party/aom/av1/common/arm/warp_plane_neon.h index 777ac4b9568b..2909df7b7f74 100644 --- a/third_party/aom/av1/common/arm/warp_plane_neon.h +++ b/third_party/aom/av1/common/arm/warp_plane_neon.h @@ -61,34 +61,34 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset, int stride) { - out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >> - WARPEDDIFF_PREC_BITS))); + out[0] = vld1q_s16( + av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]); + out[1] = vld1q_s16( + av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]); + out[2] = vld1q_s16( + av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]); + out[3] = vld1q_s16( + av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]); } static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int offset, int stride) { - out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[4] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 4 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[5] = 
vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 5 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[6] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 6 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[7] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 7 * stride) >> - WARPEDDIFF_PREC_BITS))); + out[0] = vld1q_s16( + av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]); + out[1] = vld1q_s16( + av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]); + out[2] = vld1q_s16( + av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]); + out[3] = vld1q_s16( + av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]); + out[4] = vld1q_s16( + av1_warped_filter[(offset + 4 * stride) >> WARPEDDIFF_PREC_BITS]); + out[5] = vld1q_s16( + av1_warped_filter[(offset + 5 * stride) >> WARPEDDIFF_PREC_BITS]); + out[6] = vld1q_s16( + av1_warped_filter[(offset + 6 * stride) >> WARPEDDIFF_PREC_BITS]); + out[7] = vld1q_s16( + av1_warped_filter[(offset + 7 * stride) >> WARPEDDIFF_PREC_BITS]); } static AOM_FORCE_INLINE int clamp_iy(int iy, int height) { @@ -175,8 +175,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal( if (p_width == 4) { if (beta == 0) { if (alpha == 0) { - int16x8_t f_s16 = vld1q_s16( - (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = + vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16); } else { APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha); @@ -193,8 +193,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal( } else { if (beta == 0) { if (alpha == 0) { - int16x8_t f_s16 = vld1q_s16( - (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = + vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16); } else { APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha); diff --git a/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c index 7ef762692831..15ac0043cac7 100644 --- a/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c +++ b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c @@ -109,8 +109,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_4x1_f1_beta0(in, f_s16); } @@ -145,8 +144,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_8x1_f1_beta0(in, f_s16); } @@ -161,8 +159,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int16x4_t s6 = vget_low_s16(src[6]); int16x4_t s7 = vget_low_s16(src[7]); - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); @@ -215,8 +212,7 @@ static AOM_FORCE_INLINE 
void vertical_filter_8x1_f1(const int16x8_t *src, int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); diff --git a/third_party/aom/av1/common/arm/warp_plane_sve.c b/third_party/aom/av1/common/arm/warp_plane_sve.c index 51b1bb75e45b..10aee35b1a6f 100644 --- a/third_party/aom/av1/common/arm/warp_plane_sve.c +++ b/third_party/aom/av1/common/arm/warp_plane_sve.c @@ -112,8 +112,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_4x1_f1_beta0(in, f_s16); } @@ -148,8 +147,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_8x1_f1_beta0(in, f_s16); } @@ -164,8 +162,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int16x4_t s6 = vget_low_s16(src[6]); int16x4_t s7 = vget_low_s16(src[7]); - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); @@ -215,8 +212,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index 13e1d311c1b5..cfde810bd6e7 100644 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -495,22 +495,22 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { # structs as arguments, which makes the v256 type of the intrinsics # hard to support, so optimizations for this target are disabled. 
if ($opts{config} !~ /libs-x86-win32-vs.*/) { - specialize qw/cdef_find_dir sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_find_dir sse4_1 avx2 neon rvv/, "$ssse3_x86"; specialize qw/cdef_find_dir_dual sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_0 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_1 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_2 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_3 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_8_0 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_8_1 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_8_2 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_8_3 sse4_1 avx2 neon rvv/, "$ssse3_x86"; - specialize qw/cdef_filter_16_0 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_16_1 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_16_2 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_16_3 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_16_0 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_16_1 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_16_2 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_16_3 sse4_1 avx2 neon rvv/, "$ssse3_x86"; - specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86"; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { - specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86"; } } diff --git a/third_party/aom/av1/common/riscv/cdef_block_rvv.c b/third_party/aom/av1/common/riscv/cdef_block_rvv.c new file mode 100644 index 000000000000..8ccfc2a654b7 --- /dev/null +++ b/third_party/aom/av1/common/riscv/cdef_block_rvv.c @@ -0,0 +1,1354 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <riscv_vector.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "av1/common/cdef_block.h" + +// partial A is a 16-bit vector of the form: +// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: +// [0 y1 y2 y3 y4 y5 y6 y7]. +// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... +// (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 +// and const2. +static inline vuint32m1_t fold_mul_and_sum_rvv(vint16m1_t partiala, + vint16m1_t partialb, + vuint32m1_t const1, + vuint32m1_t const2) { + // Square and add the corresponding x and y values. + vint32m2_t cost = __riscv_vwmul_vv_i32m2(partiala, partiala, 8); + cost = __riscv_vwmacc_vv_i32m2(cost, partialb, partialb, 8); + + // Multiply by constant.
+ vuint32m2_t tmp1_u32m2 = __riscv_vreinterpret_v_i32m2_u32m2(cost); + vuint32m1_t cost_u32m1 = __riscv_vmul_vv_u32m1( + __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const1, 4); + tmp1_u32m2 = __riscv_vslidedown_vx_u32m2(tmp1_u32m2, 4, 8); + vuint32m1_t ret = __riscv_vmacc_vv_u32m1( + cost_u32m1, __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const2, 4); + return ret; +} + +// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal +// down-right, 6 is vertical). +// +// For each direction the lines are shifted so that we can perform a +// basic sum on each vector element. For example, direction 5 is "south by +// southeast", so we need to add the pixels along each line i below: +// +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// For this to fit nicely in vectors, the lines need to be shifted like so: +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// In this configuration we can now perform SIMD additions to get the cost +// along direction 5. Since this won't fit into a single 128-bit vector, we use +// two of them to compute each half of the new configuration, and pad the empty +// spaces with zeros. Similar shifting is done for other directions, except +// direction 6 which is straightforward as it's the vertical direction. +static vuint32m1_t compute_vert_directions_rvv( + vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2, + vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5, + vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) { + size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16); + vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl); + + // Partial sums for lines 0 and 1. + vint16m1_t partial4a = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 1), vl); + vint16m1_t tmp1_i16m1 = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 2), vl); + partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); + vint16m1_t partial4b = __riscv_vslide1down_vx_i16m1(lines_0, 0, vl); + tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_1, 2, VL_SLIDE_DOWN); + partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_0, lines_1, VL_SLIDE_DOWN); + vint16m1_t partial5a = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl); + vint16m1_t partial5b = + __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN); + vint16m1_t partial7a = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl); + vint16m1_t partial7b = + __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN); + vint16m1_t partial6 = __riscv_vmv_v_v_i16m1(tmp1_i16m1, vl); + + // Partial sums for lines 2 and 3. 
+ tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 3), vl); + partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 4), vl); + partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_2, 3, VL_SLIDE_DOWN); + partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_3, 4, VL_SLIDE_DOWN); + partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_2, lines_3, VL_SLIDE_DOWN); + partial5a = __riscv_vadd_vv_i16m1( + partial5a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl); + partial5b = __riscv_vadd_vv_i16m1( + partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl); + partial7a = __riscv_vadd_vv_i16m1( + partial7a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl); + partial7b = __riscv_vadd_vv_i16m1( + partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl); + partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); + + // Partial sums for lines 4 and 5. + partial4a = __riscv_vadd_vv_i16m1( + partial4a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 5), vl), vl); + partial4a = __riscv_vadd_vv_i16m1( + partial4a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl); + partial4b = __riscv_vadd_vv_i16m1( + partial4b, __riscv_vslidedown_vx_i16m1(lines_4, 5, VL_SLIDE_DOWN), vl); + partial4b = __riscv_vadd_vv_i16m1( + partial4b, __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN), vl); + tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_4, lines_5, VL_SLIDE_DOWN); + partial5a = __riscv_vadd_vv_i16m1( + partial5a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl); + partial5b = __riscv_vadd_vv_i16m1( + partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl); + partial7a = __riscv_vadd_vv_i16m1( + partial7a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl); + partial7b = __riscv_vadd_vv_i16m1( + partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl); + partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); + + // Partial sums for lines 6 and 7. 
+ partial4a = __riscv_vadd_vv_i16m1( + partial4a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 7), vl), vl); + partial4a = __riscv_vadd_vv_i16m1(partial4a, lines_7, vl); + partial4b = __riscv_vadd_vv_i16m1( + partial4b, __riscv_vslidedown_vx_i16m1(lines_6, 7, VL_SLIDE_DOWN), vl); + tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_6, lines_7, VL_SLIDE_DOWN); + partial5a = __riscv_vadd_vv_i16m1( + partial5a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl), vl); + partial5b = __riscv_vadd_vv_i16m1( + partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN), vl); + partial7a = __riscv_vadd_vv_i16m1( + partial7a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl), vl); + partial7b = __riscv_vadd_vv_i16m1( + partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN), vl); + partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); + + // const0 = { 840, 420, 280, 210, } + vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4); + + // const1 = { 168, 140, 120, 105, } + vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4); + + // const2 = { 0, 0, 420, 210, } + vuint32m1_t const2 = __riscv_vmv_v_x_u32m1(0, 4); + const2 = __riscv_vslide1down_vx_u32m1(const2, 420, 4); + const2 = __riscv_vslide1down_vx_u32m1(const2, 210, 4); + + // const3 = { 140, 105, 105, 105, }; + vuint32m1_t const3 = __riscv_vmv_v_x_u32m1(105, 4); + const3 = __riscv_vslide1up_vx_u32m1(const3, 140, 4); + + // Compute costs in terms of partial sums. + vint32m2_t tmp1_i32m2 = __riscv_vwmul_vv_i32m2(partial6, partial6, vl); + vint32m2_t partial6_s32 = __riscv_vslidedown_vx_i32m2(tmp1_i32m2, 4, vl); + partial6_s32 = __riscv_vadd_vv_i32m2(partial6_s32, tmp1_i32m2, 4); + + // Reverse partial B. + // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }. 
+ vuint32m1_t costs_0, costs_1, costs_2, costs_3; + static const uint16_t tab_u16[8] = { + 6, 5, 4, 3, 2, 1, 0, 7, + }; + vuint16m1_t index_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8); + vint16m1_t partial4b_rv = + __riscv_vrgather_vv_i16m1(partial4b, index_u16m1, 8); + costs_0 = fold_mul_and_sum_rvv(partial4a, partial4b_rv, const0, const1); + vuint32m1_t partial6_u32 = __riscv_vreinterpret_v_i32m1_u32m1( + __riscv_vlmul_trunc_v_i32m2_i32m1(partial6_s32)); + costs_2 = __riscv_vmul_vx_u32m1(partial6_u32, 105, 4); + vint16m1_t partial5b_rv = + __riscv_vrgather_vv_i16m1(partial5b, index_u16m1, 8); + costs_1 = fold_mul_and_sum_rvv(partial5a, partial5b_rv, const2, const3); + vint16m1_t partial7b_rv = + __riscv_vrgather_vv_i16m1(partial7b, index_u16m1, 8); + costs_3 = fold_mul_and_sum_rvv(partial7a, partial7b_rv, const2, const3); + + // combine values + vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); + vuint32m1_t cost0_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4); + vuint32m1_t cost1_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4); + vuint32m1_t cost2_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4); + vuint32m1_t cost3_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4); + + vuint32m1_t cost47 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4); + cost47 = __riscv_vslideup_vx_u32m1(cost47, cost2_sum, 2, 4); + cost47 = __riscv_vslideup_vx_u32m1(cost47, cost3_sum, 3, 4); + __riscv_vse32_v_u32m1(&cost[0], cost47, 4); + return cost47; +} + +static inline vuint32m1_t fold_mul_and_sum_pairwise_rvv(vint16m1_t partiala, + vint16m1_t partialb, + vint16m1_t partialc, + vuint32m1_t const0) { + vuint16m1_t vid_u16m1 = __riscv_vid_v_u16m1(4); + vuint16m1_t index_u16m1 = __riscv_vsll_vx_u16m1(vid_u16m1, 1, 4); + vint16m1_t tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partiala, 0, 8); + vint32m2_t partiala_i32m2 = __riscv_vwadd_vv_i32m2(partiala, tmp_i16m1, 8); + tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialb, 0, 8); + vint32m2_t partialb_i32m2 = __riscv_vwadd_vv_i32m2(partialb, tmp_i16m1, 8); + + tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialc, 0, 8); + vint32m2_t partialc_i32m2 = __riscv_vwadd_vv_i32m2(partialc, tmp_i16m1, 8); + partiala_i32m2 = __riscv_vmul_vv_i32m2(partiala_i32m2, partiala_i32m2, 8); + partialb_i32m2 = __riscv_vmul_vv_i32m2(partialb_i32m2, partialb_i32m2, 8); + vint32m1_t partialb_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1( + __riscv_vrgatherei16_vv_i32m2(partialb_i32m2, index_u16m1, 4)); + partialc_i32m2 = __riscv_vmul_vv_i32m2(partialc_i32m2, partialc_i32m2, 8); + partiala_i32m2 = __riscv_vadd_vv_i32m2(partiala_i32m2, partialc_i32m2, 8); + vint32m1_t partiala_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1( + __riscv_vrgatherei16_vv_i32m2(partiala_i32m2, index_u16m1, 4)); + + vuint32m1_t cost = __riscv_vmul_vx_u32m1( + __riscv_vreinterpret_v_i32m1_u32m1(partialb_i32m1), 105, 4); + cost = __riscv_vmacc_vv_u32m1( + cost, __riscv_vreinterpret_v_i32m1_u32m1(partiala_i32m1), const0, 4); + return cost; +} + +static inline vint32m1_t horizontal_add_4d_s16x8(vint16m1_t lines_0, + vint16m1_t lines_1, + vint16m1_t lines_2, + vint16m1_t lines_3) { + vint32m1_t vec_scalar_i32m1 = __riscv_vmv_s_x_i32m1(0, 1); + vint32m1_t lines0_sum = + __riscv_vwredsum_vs_i16m1_i32m1(lines_0, vec_scalar_i32m1, 8); + vint32m1_t lines1_sum = + __riscv_vwredsum_vs_i16m1_i32m1(lines_1, vec_scalar_i32m1, 8); + vint32m1_t lines2_sum = + __riscv_vwredsum_vs_i16m1_i32m1(lines_2, vec_scalar_i32m1, 8); + vint32m1_t lines3_sum = + 
__riscv_vwredsum_vs_i16m1_i32m1(lines_3, vec_scalar_i32m1, 8); + + vint32m1_t ret = __riscv_vslideup_vx_i32m1(lines0_sum, lines1_sum, 1, 4); + ret = __riscv_vslideup_vx_i32m1(ret, lines2_sum, 2, 4); + ret = __riscv_vslideup_vx_i32m1(ret, lines3_sum, 3, 4); + return ret; +} + +// This function computes the cost along directions 0, 1, 2, 3. (0 means +// 45-degree up-right, 2 is horizontal). +// +// For direction 1 and 3 ("east northeast" and "east southeast") the shifted +// lines need three vectors instead of two. For direction 1 for example, we need +// to compute the sums along the line i below: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Which means we need the following configuration: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Three vectors are needed to compute this, as well as some extra pairwise +// additions. +static vuint32m1_t compute_horiz_directions_rvv( + vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2, + vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5, + vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) { + // Compute diagonal directions (1, 2, 3). + // Partial sums for lines 0 and 1. + size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16); + vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl); + vint16m1_t partial0a = __riscv_vmv_v_v_i16m1(lines_0, vl); + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 7), vl), vl); + vint16m1_t partial0b = __riscv_vslidedown_vx_i16m1(lines_1, 7, VL_SLIDE_DOWN); + vint16m1_t partial1a = __riscv_vadd_vv_i16m1( + lines_0, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 6), vl), + vl); + vint16m1_t partial1b = __riscv_vslidedown_vx_i16m1(lines_1, 6, VL_SLIDE_DOWN); + vint16m1_t partial3a = __riscv_vslidedown_vx_i16m1(lines_0, 2, VL_SLIDE_DOWN); + partial3a = __riscv_vadd_vv_i16m1( + partial3a, __riscv_vslidedown_vx_i16m1(lines_1, 4, VL_SLIDE_DOWN), vl); + vint16m1_t partial3b = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 2), vl); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, 4, vl), vl); + + // Partial sums for lines 2 and 3. 
+ partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl); + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 5), vl), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_3, 5, VL_SLIDE_DOWN), vl); + partial1a = __riscv_vadd_vv_i16m1( + partial1a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 4), vl), vl); + partial1a = __riscv_vadd_vv_i16m1( + partial1a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 2), vl), vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, __riscv_vslidedown_vx_i16m1(lines_2, 4, VL_SLIDE_DOWN), vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, __riscv_vslidedown_vx_i16m1(lines_3, 2, VL_SLIDE_DOWN), vl); + partial3a = __riscv_vadd_vv_i16m1( + partial3a, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl); + partial3b = __riscv_vadd_vv_i16m1(partial3b, lines_3, vl); + + // Partial sums for lines 4 and 5. + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 4), vl), vl); + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 3), vl), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_4, 4, VL_SLIDE_DOWN), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_5, 3, VL_SLIDE_DOWN), vl); + partial1b = __riscv_vadd_vv_i16m1(partial1b, lines_4, vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl); + vint16m1_t partial1c = __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, __riscv_vslidedown_vx_i16m1(lines_4, 2, VL_SLIDE_DOWN), vl); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, __riscv_vslidedown_vx_i16m1(lines_5, 4, VL_SLIDE_DOWN), vl); + vint16m1_t partial3c = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 2), vl); + partial3c = __riscv_vadd_vv_i16m1( + partial3c, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 4), vl), vl); + + // Partial sums for lines 6 and 7. 
+ partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 2), vl), vl); + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 1), vl), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_6, 2, VL_SLIDE_DOWN), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslide1down_vx_i16m1(lines_7, 0, vl), vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 4), vl), vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 2), vl), vl); + partial1c = __riscv_vadd_vv_i16m1( + partial1c, __riscv_vslidedown_vx_i16m1(lines_6, 4, VL_SLIDE_DOWN), vl); + partial1c = __riscv_vadd_vv_i16m1( + partial1c, __riscv_vslidedown_vx_i16m1(lines_7, 2, VL_SLIDE_DOWN), vl); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, __riscv_vslidedown_vx_i16m1(lines_6, 6, VL_SLIDE_DOWN), vl); + partial3c = __riscv_vadd_vv_i16m1( + partial3c, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 6), vl), vl); + partial3c = __riscv_vadd_vv_i16m1(partial3c, lines_7, vl); + + // Special case for direction 2 as it's just a sum along each line. + vint32m1_t partial2a = + horizontal_add_4d_s16x8(lines_0, lines_1, lines_2, lines_3); + vint32m1_t partial2b = + horizontal_add_4d_s16x8(lines_4, lines_5, lines_6, lines_7); + vuint32m1_t partial2a_u32 = __riscv_vreinterpret_v_i32m1_u32m1( + __riscv_vmul_vv_i32m1(partial2a, partial2a, 4)); + vuint32m1_t partial2b_u32 = __riscv_vreinterpret_v_i32m1_u32m1( + __riscv_vmul_vv_i32m1(partial2b, partial2b, 4)); + + // const0 = { 840, 420, 280, 210, } + vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4); + + // const1 = { 168, 140, 120, 105, } + vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4); + + // const2 = { 420, 210, 140, 105, }; + vuint32m1_t const2 = __riscv_vmv_s_x_u32m1(105, 4); + const2 = __riscv_vslide1up_vx_u32m1(const2, 140, 4); + const2 = __riscv_vslide1up_vx_u32m1(const2, 210, 4); + const2 = __riscv_vslide1up_vx_u32m1(const2, 420, 4); + + static const uint16_t tab_u16[8] = { + 0, 6, 5, 4, 3, 2, 1, 0, + }; + vuint32m1_t costs_0, costs_1, costs_2, costs_3; + vuint16m1_t template_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8); + + // Reverse partial c. + // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, } + vuint16m1_t index_u16m1 = __riscv_vslide1down_vx_u16m1(template_u16m1, 7, 8); + vint16m1_t partial0b_rv = + __riscv_vrgather_vv_i16m1(partial0b, index_u16m1, 8); + costs_0 = fold_mul_and_sum_rvv(partial0a, partial0b_rv, const0, const1); + + // Reverse partial c. 
+ // pattern = { 5, 4, 3, 2, 1, 0, 6, 7, } + vuint16m1_t index_pair_u16m1 = + __riscv_vslide1down_vx_u16m1(template_u16m1, 6, 8); + index_pair_u16m1 = __riscv_vslide1down_vx_u16m1(index_pair_u16m1, 7, 8); + vint16m1_t partialc_rv = + __riscv_vrgather_vv_i16m1(partial1c, index_pair_u16m1, 8); + costs_1 = + fold_mul_and_sum_pairwise_rvv(partial1a, partial1b, partialc_rv, const2); + + costs_2 = __riscv_vadd_vv_u32m1(partial2a_u32, partial2b_u32, 4); + costs_2 = __riscv_vmul_vx_u32m1(costs_2, 105, 4); + + vint16m1_t partial3a_rv = + __riscv_vrgather_vv_i16m1(partial3a, index_pair_u16m1, 8); + costs_3 = + fold_mul_and_sum_pairwise_rvv(partial3c, partial3b, partial3a_rv, const2); + + // combine values + vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); + vuint32m1_t cost0_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4); + vuint32m1_t cost1_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4); + vuint32m1_t cost2_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4); + vuint32m1_t cost3_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4); + + costs_0 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4); + costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost2_sum, 2, 4); + costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost3_sum, 3, 4); + __riscv_vse32_v_u32m1(&cost[0], costs_0, 4); + return costs_0; +} + +int cdef_find_dir_rvv(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + size_t vl = 8; + size_t vlmax = __riscv_vsetvlmax_e16m1(); + vuint16m1_t s; + vint16m1_t lines_0, lines_1, lines_2, lines_3; + vint16m1_t lines_4, lines_5, lines_6, lines_7; + vuint16m1_t vec_zero_u16m1 = + __riscv_vmv_v_x_u16m1(0, __riscv_vsetvl_e16m1(16)); + + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_0 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_0 = __riscv_vsub_vx_i16m1(lines_0, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_1 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_1 = __riscv_vsub_vx_i16m1(lines_1, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_2 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_2 = __riscv_vsub_vx_i16m1(lines_2, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_3 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_3 = __riscv_vsub_vx_i16m1(lines_3, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_4 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_4 = __riscv_vsub_vx_i16m1(lines_4, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_5 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_5 = __riscv_vsub_vx_i16m1(lines_5, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = 
__riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_6 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_6 = __riscv_vsub_vx_i16m1(lines_6, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_7 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_7 = __riscv_vsub_vx_i16m1(lines_7, 128, vl); + + // Compute "mostly vertical" directions. + uint32_t cost[8]; + vuint32m1_t cost47 = + compute_vert_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4, + lines_5, lines_6, lines_7, cost + 4, vl); + + // Compute "mostly horizontal" directions. + vuint32m1_t cost03 = + compute_horiz_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4, + lines_5, lines_6, lines_7, cost, vl); + + // Find max cost as well as its index to get best_dir. + // The max cost needs to be propagated in the whole vector to find its + // position in the original cost vectors cost03 and cost47. + vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); + vuint32m1_t cost07 = __riscv_vmaxu_vv_u32m1(cost03, cost47, 4); + uint32_t best_cost = __riscv_vmv_x_s_u32m1_u32( + __riscv_vredmaxu_vs_u32m1_u32m1(cost07, vec_scalar_u32m1, 4)); + vbool32_t mask_cost = __riscv_vmseq_vx_u32m1_b32(cost03, best_cost, 4); + long best_dir = __riscv_vfirst_m_b32(mask_cost, 4); + if (best_dir == -1) { + mask_cost = __riscv_vmseq_vx_u32m1_b32(cost47, best_cost, 4); + best_dir = __riscv_vfirst_m_b32(mask_cost, 4); + best_dir += 4; + } + + // Difference between the optimal variance and the variance along the + // orthogonal direction. Again, the sum(x^2) terms cancel out. + *var = best_cost - cost[(best_dir + 4) & 7]; + + // We'd normally divide by 840, but dividing by 1024 is close enough + // for what we're going to do with this. 
+ *var >>= 10; + return (int)best_dir; +} + +void cdef_copy_rect8_8bit_to_16bit_rvv(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + do { + int w = 0; + size_t num_cols = width; + while (num_cols > 0) { + size_t vl = __riscv_vsetvl_e8mf2(num_cols); + vuint8mf2_t u8_src = __riscv_vle8_v_u8mf2(src + w, vl); + vuint16m1_t u16_src = __riscv_vwcvtu_x_x_v_u16m1(u8_src, vl); + __riscv_vse16_v_u16m1(dst + w, u16_src, vl); + + w += vl; + num_cols -= vl; + } + src += sstride; + dst += dstride; + } while (--height != 0); +} + +void cdef_copy_rect8_16bit_to_16bit_rvv(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + do { + int w = 0; + size_t num_cols = width; + while (num_cols > 0) { + size_t vl = __riscv_vsetvl_e16m1(num_cols); + vuint16m1_t u16_src = __riscv_vle16_v_u16m1(src + w, vl); + __riscv_vse16_v_u16m1(dst + w, u16_src, vl); + + w += vl; + num_cols -= vl; + } + src += sstride; + dst += dstride; + } while (--height != 0); +} + +static inline vint16m1_t constrain16(vint16m1_t a, vint16m1_t b, + int16_t threshold, int16_t adjdamp, + size_t vl) { + if (!threshold) return __riscv_vmv_v_x_i16m1(0, vl); + const vbool16_t mask = __riscv_vmslt_vv_i16m1_b16(a, b, vl); + const vint16m1_t diff = __riscv_vsub_vv_i16m1(a, b, vl); + const vint16m1_t abs_diff = __riscv_vneg_v_i16m1_tumu(mask, diff, diff, vl); + const vint16m1_t shift = __riscv_vsra_vx_i16m1(abs_diff, adjdamp, vl); + const vint16m1_t thr = __riscv_vmv_v_x_i16m1(threshold, vl); + const vint16m1_t sub = __riscv_vsub_vv_i16m1(thr, shift, vl); + const vint16m1_t max = __riscv_vmax_vx_i16m1(sub, 0, vl); + const vint16m1_t min = __riscv_vmin_vv_i16m1(abs_diff, max, vl); + return __riscv_vneg_v_i16m1_tumu(mask, min, min, vl); +} + +static inline vint16m1_t vmax_mask(vint16m1_t a, vint16m1_t b, size_t vl) { + const vbool16_t mask = + __riscv_vmseq_vx_i16m1_b16(a, (int16_t)CDEF_VERY_LARGE, vl); + const vint16m1_t val = __riscv_vmerge_vvm_i16m1(a, b, mask, vl); + return __riscv_vmax_vv_i16m1(val, b, vl); +} + +static inline vint16m1_t load_strided_i16_4x2(int16_t *addr, + const ptrdiff_t stride, + size_t vl) { + const vint16m1_t px_l1 = __riscv_vle16_v_i16m1(addr + stride, vl); + const vint16m1_t px_l0 = __riscv_vle16_v_i16m1(addr, vl); + return __riscv_vslideup_vx_i16m1(px_l0, px_l1, 4, vl); +} + +static inline void store_strided_u8_4x2(uint8_t *addr, vuint8mf2_t vdst, + const ptrdiff_t stride, size_t vl) { + __riscv_vse8_v_u8mf2(addr, vdst, vl >> 1); + vdst = __riscv_vslidedown_vx_u8mf2(vdst, 4, vl); + __riscv_vse8_v_u8mf2(addr + stride, vdst, vl >> 1); +} + +static inline void store_strided_u16_4x2(uint16_t *addr, vuint16m1_t vdst, + const ptrdiff_t stride, size_t vl) { + __riscv_vse16_v_u16m1(addr, vdst, vl >> 1); + vdst = __riscv_vslidedown_vx_u16m1(vdst, 4, vl); + __riscv_vse16_v_u16m1(addr + stride, vdst, vl >> 1); +} + +#define LOAD_PIX(addr) \ + const vint16m1_t px = __riscv_vle16_v_i16m1((int16_t *)addr, vl); \ + vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl) + +#define LOAD_PIX4(addr) \ + const vint16m1_t px = \ + load_strided_i16_4x2((int16_t *)addr, CDEF_BSTRIDE, vl); \ + vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl) + +#define LOAD_DIR(p, addr, o0, o1) \ + const vint16m1_t p##0 = __riscv_vle16_v_i16m1((int16_t *)addr + o0, vl); \ + const vint16m1_t p##1 = __riscv_vle16_v_i16m1((int16_t *)addr - o0, vl); \ + const vint16m1_t p##2 = __riscv_vle16_v_i16m1((int16_t *)addr + o1, vl); \ + const vint16m1_t p##3 = __riscv_vle16_v_i16m1((int16_t *)addr - o1, vl) 
+ +#define LOAD_DIR4(p, addr, o0, o1) \ + const vint16m1_t p##0 = \ + load_strided_i16_4x2((int16_t *)addr + o0, CDEF_BSTRIDE, vl); \ + const vint16m1_t p##1 = \ + load_strided_i16_4x2((int16_t *)addr - o0, CDEF_BSTRIDE, vl); \ + const vint16m1_t p##2 = \ + load_strided_i16_4x2((int16_t *)addr + o1, CDEF_BSTRIDE, vl); \ + const vint16m1_t p##3 = \ + load_strided_i16_4x2((int16_t *)addr - o1, CDEF_BSTRIDE, vl) + +#define MAKE_TAPS \ + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; \ + const int16_t tap0 = (int16_t)(pri_taps[0]); \ + const int16_t tap1 = (int16_t)(pri_taps[1]) + +#define CONSTRAIN(p, strength, shift) \ + vint16m1_t p##_c0 = \ + constrain16(p##0, px, (int16_t)strength, (int16_t)shift, vl); \ + vint16m1_t p##_c1 = \ + constrain16(p##1, px, (int16_t)strength, (int16_t)shift, vl); \ + vint16m1_t p##_c2 = \ + constrain16(p##2, px, (int16_t)strength, (int16_t)shift, vl); \ + vint16m1_t p##_c3 = \ + constrain16(p##3, px, (int16_t)strength, (int16_t)shift, vl) + +#define SETUP_MINMAX \ + vint16m1_t max = px; \ + vint16m1_t min = px + +#define MIN_MAX(p) \ + do { \ + max = vmax_mask(p##0, max, vl); \ + min = __riscv_vmin_vv_i16m1(p##0, min, vl); \ + max = vmax_mask(p##1, max, vl); \ + min = __riscv_vmin_vv_i16m1(p##1, min, vl); \ + max = vmax_mask(p##2, max, vl); \ + min = __riscv_vmin_vv_i16m1(p##2, min, vl); \ + max = vmax_mask(p##3, max, vl); \ + min = __riscv_vmin_vv_i16m1(p##3, min, vl); \ + } while (0) + +#define PRI_0_UPDATE_SUM(p) \ + const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ + const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ + sum = __riscv_vmacc_vx_i16m1(sum, tap0, p##sum0, vl); \ + sum = __riscv_vmacc_vx_i16m1(sum, tap1, p##sum1, vl) + +#define UPDATE_SUM(p) \ + const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ + const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ + sum = __riscv_vadd_vv_i16m1(sum, p##sum0, vl); \ + sum = __riscv_vadd_vv_i16m1(sum, p##sum1, vl) + +#define SEC_0_UPDATE_SUM(p) \ + const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ + const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ + const vint16m1_t p##sum2 = __riscv_vadd_vv_i16m1(p##sum0, p##sum1, vl); \ + sum = __riscv_vadd_vv_i16m1(sum, __riscv_vsll_vx_i16m1(p##sum2, 1, vl), vl) + +#define BIAS \ + const vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(sum, 0, vl); \ + const vint16m1_t v_8 = __riscv_vmv_v_x_i16m1(8, vl); \ + const vint16m1_t bias = __riscv_vsub_vx_i16m1_tumu(mask, v_8, v_8, 1, vl); \ + const vint16m1_t unclamped = __riscv_vadd_vv_i16m1( \ + px, __riscv_vsra_vx_i16m1(__riscv_vadd_vv_i16m1(bias, sum, vl), 4, vl), \ + vl) + +#define STORE4 \ + do { \ + store_strided_u8_4x2(dst8, vdst, dstride, vl); \ + \ + in += (CDEF_BSTRIDE << 1); \ + dst8 += (dstride << 1); \ + } while (0) + +#define STORE4_CLAMPED \ + do { \ + BIAS; \ + vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ + __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ + vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ + __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl); \ + STORE4; \ + } while (0) + +#define STORE4_UNCLAMPED \ + do { \ + BIAS; \ + vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ + __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \ + STORE4; \ + } while (0) + +#define STORE8 \ + do { \ + __riscv_vse8_v_u8mf2(dst8, vdst, vl); \ + \ + in += CDEF_BSTRIDE; \ + dst8 += dstride; \ + } while (0) + +#define STORE8_CLAMPED \ + do { \ + BIAS; \ + vint16m1_t clamped = 
__riscv_vmin_vv_i16m1( \ + __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ + vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ + __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl); \ + STORE8; \ + } while (0) + +#define STORE8_UNCLAMPED \ + do { \ + BIAS; \ + vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ + __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \ + STORE8; \ + } while (0) + +#define STORE16_4 \ + do { \ + store_strided_u16_4x2(dst16, vdst, dstride, vl); \ + \ + in += (CDEF_BSTRIDE << 1); \ + dst16 += (dstride << 1); \ + } while (0) + +#define STORE16_4_CLAMPED \ + do { \ + BIAS; \ + vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ + __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \ + STORE16_4; \ + } while (0) + +#define STORE16_4_UNCLAMPED \ + do { \ + BIAS; \ + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \ + STORE16_4; \ + } while (0) + +#define STORE16 \ + do { \ + __riscv_vse16_v_u16m1(dst16, vdst, vl); \ + \ + in += CDEF_BSTRIDE; \ + dst16 += dstride; \ + } while (0) + +#define STORE16_CLAMPED \ + do { \ + BIAS; \ + vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ + __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \ + STORE16; \ + } while (0) + +#define STORE16_UNCLAMPED \ + do { \ + BIAS; \ + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \ + STORE16; \ + } while (0) + +void cdef_filter_8_0_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + MAKE_TAPS; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + SETUP_MINMAX; + + // Primary pass + LOAD_DIR(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + MIN_MAX(p); + PRI_0_UPDATE_SUM(p); + + // Secondary pass 1 + LOAD_DIR(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + MIN_MAX(s); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + MIN_MAX(s2); + UPDATE_SUM(s2); + + // Store + STORE8_CLAMPED; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + SETUP_MINMAX; + + // Primary pass + LOAD_DIR4(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + MIN_MAX(p); + PRI_0_UPDATE_SUM(p); + + // Secondary pass 1 + LOAD_DIR4(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + MIN_MAX(s); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR4(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + MIN_MAX(s2); + UPDATE_SUM(s2); + + // Store + STORE4_CLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_1_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int 
sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + MAKE_TAPS; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + + // Primary pass + LOAD_DIR(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + PRI_0_UPDATE_SUM(p); + + // Store + STORE8_UNCLAMPED; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + + // Primary pass + LOAD_DIR4(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + PRI_0_UPDATE_SUM(p); + + // Store + STORE4_UNCLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_2_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + + // Secondary pass 1 + LOAD_DIR(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + UPDATE_SUM(s2); + + // Store + STORE8_UNCLAMPED; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + + // Secondary pass 1 + LOAD_DIR4(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR4(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + UPDATE_SUM(s2); + + // Store + STORE4_UNCLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_3_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl); + const vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(px, vl); + __riscv_vse8_v_u8mf2(dst8, vdst, vl); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + const vint16m1_t px = + load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); + vuint8mf2_t vdst = + __riscv_vncvt_x_x_w_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(px), vl); + store_strided_u8_4x2(dst8, vdst, dstride, vl); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_0_rvv(void *dest, int 
dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + MAKE_TAPS; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + SETUP_MINMAX; + + // Primary pass + LOAD_DIR(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + MIN_MAX(p); + PRI_0_UPDATE_SUM(p); + + // Secondary pass 1 + LOAD_DIR(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + MIN_MAX(s); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + MIN_MAX(s2); + UPDATE_SUM(s2); + + // Store + STORE16_CLAMPED; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + SETUP_MINMAX; + + // Primary pass + LOAD_DIR4(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + MIN_MAX(p); + PRI_0_UPDATE_SUM(p); + + // Secondary pass 1 + LOAD_DIR4(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + MIN_MAX(s); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR4(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + MIN_MAX(s2); + UPDATE_SUM(s2); + + // Store + STORE16_4_CLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_1_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + MAKE_TAPS; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + + // Primary pass + LOAD_DIR(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + PRI_0_UPDATE_SUM(p); + + // Store + STORE16_UNCLAMPED; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + + // Primary pass + LOAD_DIR4(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + PRI_0_UPDATE_SUM(p); + + // Store + STORE16_4_UNCLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_2_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if 
(block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + + // Secondary pass 1 + LOAD_DIR(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + UPDATE_SUM(s2); + + // Store + STORE16_UNCLAMPED; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + + // Secondary pass 1 + LOAD_DIR4(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR4(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + UPDATE_SUM(s2); + + // Store + STORE16_4_UNCLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_3_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl); + __riscv_vse16_v_u16m1(dst16, px, vl); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + const vint16m1_t px = + load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(px); + store_strided_u16_4x2(dst16, vdst, dstride, vl); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c index 4e41cc44489e..ecfc86551181 100644 --- a/third_party/aom/av1/common/warped_motion.c +++ b/third_party/aom/av1/common/warped_motion.c @@ -27,7 +27,8 @@ // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS. // We need an extra 2 taps to fit this in, for a total of 8 taps. 
/* clang-format off */ -const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { +const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1] + [8] = { // [-1, 0) { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 }, { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 }, @@ -344,7 +345,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = av1_warped_filter[offs]; + const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { @@ -365,7 +366,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = av1_warped_filter[offs]; + const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { @@ -575,7 +576,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = av1_warped_filter[offs]; + const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { @@ -599,7 +600,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = av1_warped_filter[offs]; + const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h index 00ede2afa77c..fbd24d1adc6d 100644 --- a/third_party/aom/av1/common/warped_motion.h +++ b/third_party/aom/av1/common/warped_motion.h @@ -33,7 +33,14 @@ #define WARP_ERROR_BLOCK_LOG 5 #define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG) -extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; +#if AOM_ARCH_ARM || AOM_ARCH_AARCH64 || AOM_ARCH_X86 || AOM_ARCH_X86_64 +typedef int16_t WarpedFilterCoeff; +#else +typedef int8_t WarpedFilterCoeff; +#endif + +extern const WarpedFilterCoeff + av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; DECLARE_ALIGNED(8, extern const int8_t, av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]); diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c index ae92471b634e..5f3b206e980f 100644 --- a/third_party/aom/av1/encoder/ratectrl.c +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -3822,6 +3822,10 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type, resize_reset_rc(cpi, resize_pending_params->width, resize_pending_params->height, cm->width, cm->height); } + if (svc->temporal_layer_id == 0) { + rc->num_col_blscroll_last_tl0 = 0; + rc->num_row_blscroll_last_tl0 = 0; + } // Set the GF interval and update flag. 
if (!rc->rtc_external_ratectrl) set_gf_interval_update_onepass_rt(cpi, *frame_type); diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h index 0e7ede9b8444..88c048fb7e2b 100644 --- a/third_party/aom/av1/encoder/ratectrl.h +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -200,6 +200,8 @@ typedef struct { int last_target_size_keyframe; int frames_since_scene_change; int perc_spatial_flat_blocks; + int num_col_blscroll_last_tl0; + int num_row_blscroll_last_tl0; int avg_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation used for any frame diff --git a/third_party/aom/av1/encoder/var_based_part.c b/third_party/aom/av1/encoder/var_based_part.c index bc00cf05bfbe..37b295dba632 100644 --- a/third_party/aom/av1/encoder/var_based_part.c +++ b/third_party/aom/av1/encoder/var_based_part.c @@ -1325,6 +1325,53 @@ static inline void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x, } } +static void do_int_pro_motion_estimation(AV1_COMP *cpi, MACROBLOCK *x, + unsigned int *y_sad, int mi_row, + int mi_col, int source_sad_nonrd) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mi = xd->mi[0]; + const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; + const int increase_col_sw = source_sad_nonrd > kMedSad && + !cpi->rc.high_motion_content_screen_rtc && + (cpi->svc.temporal_layer_id == 0 || + cpi->rc.num_col_blscroll_last_tl0 > 2); + int me_search_size_col = is_screen + ? increase_col_sw ? 512 : 96 + : block_size_wide[cm->seq_params->sb_size] >> 1; + // For screen use larger search size row motion to capture + // vertical scroll, which can be larger motion. + int me_search_size_row = is_screen + ? source_sad_nonrd > kMedSad ? 512 : 192 + : block_size_high[cm->seq_params->sb_size] >> 1; + unsigned int y_sad_zero; + *y_sad = av1_int_pro_motion_estimation( + cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv, &y_sad_zero, + me_search_size_col, me_search_size_row); + // The logic below selects whether the motion estimated in the + // int_pro_motion() will be used in nonrd_pickmode. Only do this + // for screen for now. + if (is_screen) { + unsigned int thresh_sad = + (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000; + if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) { + x->sb_me_partition = 1; + x->sb_me_mv.as_int = mi->mv[0].as_int; + if (cpi->svc.temporal_layer_id == 0) { + if (abs(mi->mv[0].as_mv.col) > 16 && abs(mi->mv[0].as_mv.row) == 0) + cpi->rc.num_col_blscroll_last_tl0++; + else if (abs(mi->mv[0].as_mv.row) > 16 && abs(mi->mv[0].as_mv.col) == 0) + cpi->rc.num_row_blscroll_last_tl0++; + } + } else { + x->sb_me_partition = 0; + // Fall back to using zero motion. + *y_sad = y_sad_zero; + mi->mv[0].as_int = 0; + } + } +} + static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, unsigned int *y_sad_g, unsigned int *y_sad_alt, unsigned int *y_sad_last, @@ -1418,42 +1465,11 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, // so for now force it to 2 based on superblock sad. if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2; - if (est_motion == 1 || est_motion == 2) { - if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) { - // For screen only do int_pro_motion for spatial variance above - // threshold and motion level above LowSad. 
-        if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
-          int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
-          int me_search_size_col =
-              is_screen ? source_sad_nonrd > kMedSad ? 160 : 96
-                        : block_size_wide[cm->seq_params->sb_size] >> 1;
-          // For screen use larger search size row motion to capture
-          // vertical scroll, which can be larger motion.
-          int me_search_size_row =
-              is_screen ? source_sad_nonrd > kMedSad ? 512 : 192
-                        : block_size_high[cm->seq_params->sb_size] >> 1;
-          unsigned int y_sad_zero;
-          *y_sad = av1_int_pro_motion_estimation(
-              cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
-              &y_sad_zero, me_search_size_col, me_search_size_row);
-          // The logic below selects whether the motion estimated in the
-          // int_pro_motion() will be used in nonrd_pickmode. Only do this
-          // for screen for now.
-          if (is_screen) {
-            unsigned int thresh_sad =
-                (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
-            if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
-              x->sb_me_partition = 1;
-              x->sb_me_mv.as_int = mi->mv[0].as_int;
-            } else {
-              x->sb_me_partition = 0;
-              // Fall back to using zero motion.
-              *y_sad = y_sad_zero;
-              mi->mv[0].as_int = 0;
-            }
-          }
-        }
-      }
-    }
+    if ((est_motion == 1 || est_motion == 2) && xd->mb_to_right_edge >= 0 &&
+        xd->mb_to_bottom_edge >= 0 && x->source_variance > 100 &&
+        source_sad_nonrd > kLowSad) {
+      do_int_pro_motion_estimation(cpi, x, y_sad, mi_row, mi_col,
+                                   source_sad_nonrd);
     }
 
     if (*y_sad == UINT_MAX) {
diff --git a/third_party/aom/build/cmake/aom_config_defaults.cmake b/third_party/aom/build/cmake/aom_config_defaults.cmake
index 43ada3626461..b78c9ec98fba 100644
--- a/third_party/aom/build/cmake/aom_config_defaults.cmake
+++ b/third_party/aom/build/cmake/aom_config_defaults.cmake
@@ -26,6 +26,7 @@ set_aom_detect_var(AOM_ARCH_ARM 0 "Enables ARM architecture.")
 set_aom_detect_var(AOM_ARCH_PPC 0 "Enables PPC architecture.")
 set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.")
 set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.")
+set_aom_detect_var(AOM_ARCH_RISCV 0 "Enables RISC-V architecture.")
 
 # Arm/AArch64 feature flags.
 set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.")
@@ -51,6 +52,9 @@ set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.")
 set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.")
 set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.")
 
+# RISC-V64 feature flags.
+set_aom_detect_var(HAVE_RVV 0 "Enables RVV optimizations.")
+
 # Flags describing the build environment.
 set_aom_detect_var(HAVE_FEXCEPT 0
                    "Internal flag, GNU fenv.h present for target.")
@@ -241,3 +245,6 @@ set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets."
                    ON)
 set_aom_option_var(ENABLE_AVX2
                    "Enables AVX2 optimizations on x86/x86_64 targets." ON)
+
+# RVV intrinsics flags.
+set_aom_option_var(ENABLE_RVV "Enables RVV optimizations on RISC-V targets." ON)
diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake
index 7350652f0040..572429545923 100644
--- a/third_party/aom/build/cmake/aom_configure.cmake
+++ b/third_party/aom/build/cmake/aom_configure.cmake
@@ -75,6 +75,8 @@ if(NOT AOM_TARGET_CPU)
     set(AOM_TARGET_CPU "arm64")
   elseif(cpu_lowercase MATCHES "^ppc")
     set(AOM_TARGET_CPU "ppc")
+  elseif(cpu_lowercase MATCHES "^riscv")
+    set(AOM_TARGET_CPU "riscv")
   else()
     message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
                     "supported, falling back to the generic target")
diff --git a/third_party/aom/build/cmake/cpu.cmake b/third_party/aom/build/cmake/cpu.cmake
index 84a3ae9b49bb..59826617ad24 100644
--- a/third_party/aom/build/cmake/cpu.cmake
+++ b/third_party/aom/build/cmake/cpu.cmake
@@ -132,4 +132,15 @@ elseif("${AOM_TARGET_CPU}" MATCHES "^x86")
       set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
     endif()
   endforeach()
+elseif("${AOM_TARGET_CPU}" MATCHES "riscv")
+  set(AOM_ARCH_RISCV64 1)
+  set(RTCD_ARCH_RISCV64 "yes")
+
+  if(ENABLE_RVV)
+    set(HAVE_RVV 1)
+    set(RTCD_HAVE_RVV "yes")
+  else()
+    set(HAVE_RVV 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-rvv)
+  endif()
 endif()
diff --git a/third_party/aom/build/cmake/rtcd.pl b/third_party/aom/build/cmake/rtcd.pl
index 464d1986c78c..6cd6a5ca3674 100755
--- a/third_party/aom/build/cmake/rtcd.pl
+++ b/third_party/aom/build/cmake/rtcd.pl
@@ -370,6 +370,36 @@ EOF
   common_bottom;
 }
 
+sub riscv() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <