Bug 1945604 - Update aom to 3990233fc06a35944d6d33797e63931802122a95 r=padenot
Differential Revision: https://phabricator.services.mozilla.com/D236581
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 0
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 0
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 0
 HAVE_SSE2 equ 0
 HAVE_SSE3 equ 0
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
 #define HAVE_SSE3 0
@@ -12,6 +12,7 @@
 .equ AOM_ARCH_AARCH64, 0
 .equ AOM_ARCH_ARM, 1
 .equ AOM_ARCH_PPC, 0
+.equ AOM_ARCH_RISCV, 0
 .equ AOM_ARCH_X86, 0
 .equ AOM_ARCH_X86_64, 0
 .equ CONFIG_ACCOUNTING, 0
@@ -82,6 +83,7 @@
 .equ HAVE_NEON, 1
 .equ HAVE_NEON_DOTPROD, 0
 .equ HAVE_NEON_I8MM, 0
+.equ HAVE_RVV, 0
 .equ HAVE_SSE, 0
 .equ HAVE_SSE2, 0
 .equ HAVE_SSE3, 0
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 1
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 1
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
 #define HAVE_SSE3 0
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 1
 AOM_ARCH_X86_64 equ 0
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 1
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 1
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 1
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 1
 AOM_ARCH_ARM equ 1
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 0
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 0
 HAVE_NEON equ 1
 HAVE_NEON_DOTPROD equ 1
 HAVE_NEON_I8MM equ 1
+HAVE_RVV equ 0
 HAVE_SSE equ 0
 HAVE_SSE2 equ 0
 HAVE_SSE3 equ 0
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 1
 #define AOM_ARCH_ARM 1
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 1
 #define HAVE_NEON_DOTPROD 1
 #define HAVE_NEON_I8MM 1
+#define HAVE_RVV 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
 #define HAVE_SSE3 0
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 1
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 1
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 1
 AOM_ARCH_X86_64 equ 0
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 1
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 1
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 1
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
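The sixteen hunks above are the regenerated per-platform build configuration (assembly equ/.equ variants and the matching C headers); each one simply gains the two new flags AOM_ARCH_RISCV and HAVE_RVV, set to 0 on every target shown here. As a minimal, hypothetical sketch of how such generated 0/1 flags are consumed (the names below are placeholders, not symbols from this patch):

/* Hypothetical illustration, not code from the patch: the generated config
 * flags are plain 0/1 macros, so a new one such as HAVE_RVV slots into
 * ordinary preprocessor dispatch. */
#define HAVE_RVV 0 /* stand-in for the per-platform aom_config.h value */

static int sum_bytes_c(const unsigned char *p, int n) {
  int total = 0;
  for (int i = 0; i < n; ++i) total += p[i];
  return total;
}

#if HAVE_RVV
/* A RISC-V Vector implementation would be declared and selected here. */
#endif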
@@ -20,11 +20,11 @@ origin:

 # Human-readable identifier for this version/release
 # Generally "version NNN", "tag SSS", "bookmark SSS"
-release: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed (Sun Jan 05 09:13:09 2025 -0800).
+release: 3990233fc06a35944d6d33797e63931802122a95 (Thu Jan 30 11:32:16 2025 -0800).

 # Revision to pull in
 # Must be a long or short commit SHA (long preferred)
-revision: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed
+revision: 3990233fc06a35944d6d33797e63931802122a95

 # The package's license, where possible using the mnemonic from
 # https://spdx.org/licenses/
third_party/aom/CMakeLists.txt (vendored, 10 changed lines)
@@ -333,6 +333,12 @@ if(CONFIG_AV1_ENCODER)
 # libaom static library.
 if(BUILD_SHARED_LIBS)
 target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom_static)
+# TODO: https://aomedia.issues.chromium.org/391715078 - This condition can
+# be removed after aom_av1_rc restricts its symbol visibility.
+if(CYGWIN OR MINGW)
+target_link_options(aom_av1_rc ${AOM_LIB_LINK_TYPE}
+LINKER:--allow-multiple-definition)
+endif()
 else()
 target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom)
 endif()
@@ -858,8 +864,8 @@ if(BUILD_SHARED_LIBS)
 # errors (don't use it with AddressSanitizer)." See
 # https://clang.llvm.org/docs/AddressSanitizer.html#usage. Similarly, see
 # https://clang.llvm.org/docs/MemorySanitizer.html#usage.
-if(NOT WIN32
-AND NOT APPLE
+if(NOT
+(APPLE OR CYGWIN OR WIN32)
 AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE))
 # The -z defs linker option reports unresolved symbol references from object
 # files when building a shared library.
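The second CMakeLists.txt hunk reworks the platform test around the -z defs check; the surrounding comment explains that -z defs turns unresolved symbol references into link-time errors when building a shared library. A tiny illustration of the failure mode it exists to catch (my own example, not from the repository): without -z defs this object links into a shared library and the missing symbol only surfaces at load time.

/* example.c: references a symbol it never defines. Linking this into a
 * shared library with -Wl,-z,defs fails immediately; without it, the
 * unresolved reference is only discovered when the library is loaded. */
extern int helper_that_nobody_defines(void);

int call_it(void) { return helper_that_nobody_defines(); }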
third_party/aom/README.md (vendored, 4 changed lines)
@@ -60,7 +60,9 @@ README.md {#LREADME}
 present, yasm will be used by default. Pass -DENABLE_NASM=ON to cmake to
 select nasm.) If you download yasm with the intention to work with Visual
 Studio, please download win32.exe or win64.exe and rename it into yasm.exe.
-DO NOT download or use vsyasm.exe.
+DO NOT download or use vsyasm.exe. The MSYS2 version of the yasm binary can
+also be used and avoids an issue caused by a missing Visual C++
+Redistributable install (Visual Studio 2010, MSVCR100.dll).
 6. Building the documentation requires
 [doxygen version 1.8.10 or newer](http://doxygen.org).
 7. Emscripten builds require the portable
third_party/aom/aom/exports_com (vendored, 2 changed lines)
@@ -10,7 +10,6 @@ text aom_codec_set_option
 text aom_codec_version
 text aom_codec_version_extra_str
 text aom_codec_version_str
-text aom_free
 text aom_img_add_metadata
 text aom_img_alloc
 text aom_img_alloc_with_border
@@ -25,7 +24,6 @@ text aom_img_plane_width
 text aom_img_remove_metadata
 text aom_img_set_rect
 text aom_img_wrap
-text aom_malloc
 text aom_rb_bytes_read
 text aom_rb_read_bit
 text aom_rb_read_literal
third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c (vendored, 426 changed lines)
@@ -15,6 +15,7 @@

 #include "aom/aom_integer.h"
 #include "aom_dsp/arm/transpose_neon.h"
+#include "mem_neon.h"

 static inline int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low,
 const int16x4_t high) {
@@ -226,13 +227,8 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
 const uint8_t *blimit,
 const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst_p1 = (uint16_t *)(s - 2 * pitch);
-uint16_t *const dst_p0 = (uint16_t *)(s - pitch);
-uint16_t *const dst_q0 = (uint16_t *)(s);
-uint16_t *const dst_q1 = (uint16_t *)(s + pitch);
-
-const uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0),
-vld1_u16(dst_q0), vld1_u16(dst_q1) };
+uint16x4_t src[4];
+load_u16_4x4(s - 2 * pitch, pitch, &src[0], &src[1], &src[2], &src[3]);

 // Adjust thresholds to bitdepth.
 const int outer_thresh = *blimit << (bd - 8);
@@ -247,12 +243,10 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
 filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
 &needs_filter4_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter4_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
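A note on the early-return change in the hunk above: vaddv_u16 is an AArch64-only horizontal add, which is why the old check had to sit inside #if AOM_ARCH_AARCH64. The replacement reinterprets the four-lane mask as a single 64-bit lane and compares it against zero, which plain Armv7 NEON also supports, so the guard can be dropped. A self-contained sketch of the two forms (my illustration, not code from the patch):

#include <arm_neon.h>
#include <stdbool.h>

/* Mask lanes are either 0x0000 or 0xffff, as produced by the filter*_masks()
 * helpers, so "no lane set" is equivalent to "the 64-bit view is zero". */
static inline bool mask_is_all_zero(uint16x4_t mask) {
#if defined(__aarch64__)
  return vaddv_u16(mask) == 0; /* AArch64 only: horizontal add */
#else
  return vget_lane_u64(vreinterpret_u64_u16(mask), 0) == 0;
#endif
}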
@@ -272,10 +266,9 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
 const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
 const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);

-vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output),
+vget_low_u16(p0q0_output), vget_high_u16(p0q0_output),
+vget_high_u16(p1q1_output));
 }

 void aom_highbd_lpf_horizontal_4_dual_neon(
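The loads and stores in the two hunks above are folded into the strided helpers pulled in by the new #include "mem_neon.h". As a rough sketch of what the 4x4 pair does (an assumption: this mirrors the helpers in aom's mem_neon.h, whose bodies are not part of this diff), each call covers four rows spaced pitch elements apart:

#include <arm_neon.h>

static inline void load_u16_4x4_sketch(const uint16_t *s, int pitch,
                                        uint16x4_t *s0, uint16x4_t *s1,
                                        uint16x4_t *s2, uint16x4_t *s3) {
  *s0 = vld1_u16(s);
  *s1 = vld1_u16(s + pitch);
  *s2 = vld1_u16(s + 2 * pitch);
  *s3 = vld1_u16(s + 3 * pitch);
}

static inline void store_u16_4x4_sketch(uint16_t *s, int pitch, uint16x4_t s0,
                                         uint16x4_t s1, uint16x4_t s2,
                                         uint16x4_t s3) {
  vst1_u16(s, s0);
  vst1_u16(s + pitch, s1);
  vst1_u16(s + 2 * pitch, s2);
  vst1_u16(s + 3 * pitch, s3);
}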
@@ -290,14 +283,8 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
 const uint8_t *blimit, const uint8_t *limit,
 const uint8_t *thresh, int bd) {
 // Offset by 2 uint16_t values to load from first p1 position.
-uint16_t *dst = s - 2;
-uint16_t *dst_p1 = dst;
-uint16_t *dst_p0 = dst + pitch;
-uint16_t *dst_q0 = dst + pitch * 2;
-uint16_t *dst_q1 = dst + pitch * 3;
-
-uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
-vld1_u16(dst_q1) };
+uint16x4_t src[4];
+load_u16_4x4(s - 2, pitch, &src[0], &src[1], &src[2], &src[3]);
 transpose_array_inplace_u16_4x4(src);

 // Adjust thresholds to bitdepth.
@@ -313,12 +300,10 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
 filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
 &needs_filter4_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter4_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
@@ -346,10 +331,7 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
 };
 transpose_array_inplace_u16_4x4(output);

-vst1_u16(dst_p1, output[0]);
-vst1_u16(dst_p0, output[1]);
-vst1_u16(dst_q0, output[2]);
-vst1_u16(dst_q1, output[3]);
+store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]);
 }

 void aom_highbd_lpf_vertical_4_dual_neon(
@@ -379,16 +361,14 @@ static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
 // ^^^^^^
 sum = vaddq_u16(sum, p0q0);

-// p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
-// ^^^^^
-sum = vshlq_n_u16(sum, 1);
-
 // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
 // ^^^^^^ ^^^^^^
 // Should dual issue with the left shift.
 const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
 const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
-sum = vaddq_u16(sum, outer_sum);
+// p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+// ^^^^^^^^^^^ ^^^^
+sum = vmlaq_n_u16(outer_sum, sum, 2);

 *p1q1_output = vrshrq_n_u16(sum, 3);

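Several hunks in this file (here in filter6, later in filter8 and filter14) make the same micro-optimization: a left shift by one followed by an add becomes a single multiply-accumulate, since vmlaq_n_u16(a, b, 2) computes a + 2 * b per lane. A worked illustration of the equivalence (my example, not part of the patch):

#include <arm_neon.h>

/* Old shape: sum = (sum << 1) + outer_sum, two data-processing ops. */
static inline uint16x8_t shift_then_add(uint16x8_t sum, uint16x8_t outer_sum) {
  return vaddq_u16(vshlq_n_u16(sum, 1), outer_sum);
}

/* New shape: one fused multiply-accumulate producing the same lanes. */
static inline uint16x8_t multiply_accumulate(uint16x8_t sum,
                                             uint16x8_t outer_sum) {
  return vmlaq_n_u16(outer_sum, sum, 2); /* outer_sum + sum * 2 */
}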
@@ -396,11 +376,8 @@ static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
 // p0 = p1 - (2 * p2) + q0 + q1
 // q0 = q1 - (2 * q2) + p0 + p1
 // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
-// ^^^^^^^^
-const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
-// p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
-// ^^^^^^^^
-sum = vsubq_u16(sum, p2q2_double);
+// ^^^^^^^^^^^^^^^^^
+sum = vmlsq_n_u16(sum, p2q2, 2);
 const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
 sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));

@@ -411,16 +388,9 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
 const uint8_t *blimit,
 const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst_p2 = s - 3 * pitch;
-uint16_t *const dst_p1 = s - 2 * pitch;
-uint16_t *const dst_p0 = s - pitch;
-uint16_t *const dst_q0 = s;
-uint16_t *const dst_q1 = s + pitch;
-uint16_t *const dst_q2 = s + 2 * pitch;
-
-const uint16x4_t src[6] = { vld1_u16(dst_p2), vld1_u16(dst_p1),
-vld1_u16(dst_p0), vld1_u16(dst_q0),
-vld1_u16(dst_q1), vld1_u16(dst_q2) };
+uint16x4_t src[6];
+load_u16_4x6(s - 3 * pitch, pitch, &src[0], &src[1], &src[2], &src[3],
+&src[4], &src[5]);

 // Adjust thresholds to bitdepth.
 const int outer_thresh = *blimit << (bd - 8);
@@ -437,32 +407,38 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
 filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
 &needs_filter_mask, &is_flat3_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

+uint16x8_t p0q0_output, p1q1_output;
+uint16x8_t f6_p1q1, f6_p0q0;
+// Not needing filter4() at all is a very common case, so isolate it to avoid
+// needlessly computing filter4().
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 &&
+vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
+filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+p1q1_output = f6_p1q1;
+p0q0_output = f6_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+const uint16x8_t is_flat3_mask_8 =
+vcombine_u16(is_flat3_mask, is_flat3_mask);
 const uint16x8_t needs_filter_mask_8 =
 vcombine_u16(needs_filter_mask, needs_filter_mask);

 uint16x8_t f4_p1q1;
 uint16x8_t f4_p0q0;
-// ZIP1 p0q0, p1q1 may perform better here.
 const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);

-uint16x8_t p0q0_output, p1q1_output;
 // Because we did not return after testing |needs_filter_mask| we know it is
 // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
-// filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
-// output is not used.
-uint16x8_t f6_p1q1, f6_p0q0;
+// filter6. Therefore if it is false when |needs_filter_mask| is true,
+// filter6 output is not used.
 const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
 if (vget_lane_u64(need_filter6, 0) == 0) {
 // filter6() does not apply, but filter4() applies to one or more values.
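The fast path added in the hunk above keys off a second lane trick: viewing the four-lane mask as a signed 64-bit lane and comparing it with -1 is true exactly when every 16-bit lane is 0xffff, i.e. when filter6 applies to all values and filter4 never needs to run. A small self-contained illustration (mine, not from the patch):

#include <arm_neon.h>
#include <stdbool.h>

/* True only when all four lanes are 0xffff: the 64-bit two's-complement view
 * of such a mask is exactly -1. */
static inline bool mask_is_all_set(uint16x4_t mask) {
  return vget_lane_s64(vreinterpret_s64_u16(mask), 0) == -1;
}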
@@ -476,11 +452,11 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
+}

-vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output),
+vget_low_u16(p0q0_output), vget_high_u16(p0q0_output),
+vget_high_u16(p1q1_output));
 }

 void aom_highbd_lpf_horizontal_6_dual_neon(
@@ -494,17 +470,12 @@ void aom_highbd_lpf_horizontal_6_dual_neon(
 void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 const uint8_t *blimit, const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-// Left side of the filter window.
-uint16_t *const dst = s - 3;
-uint16_t *const dst_0 = dst;
-uint16_t *const dst_1 = dst + pitch;
-uint16_t *const dst_2 = dst + 2 * pitch;
-uint16_t *const dst_3 = dst + 3 * pitch;
-
 // Overread by 2 values. These overreads become the high halves of src_raw[2]
 // and src_raw[3] after transpose.
-uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
-vld1q_u16(dst_2), vld1q_u16(dst_3) };
+uint16x8_t src_raw[4];
+load_u16_8x4(s - 3, pitch, &src_raw[0], &src_raw[1], &src_raw[2],
+&src_raw[3]);
+
 transpose_array_inplace_u16_4x8(src_raw);
 // p2, p1, p0, q0, q1, q2
 const uint16x4_t src[6] = {
@@ -528,16 +499,30 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
 &needs_filter_mask, &is_flat3_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

+uint16x8_t p0q0_output, p1q1_output;
+// Because we did not return after testing |needs_filter_mask| we know it is
+// nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
+// filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
+// output is not used.
+uint16x8_t f6_p1q1, f6_p0q0;
+const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+// Not needing filter4() at all is a very common case, so isolate it to avoid
+// needlessly computing filter4().
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 &&
+vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
+filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+p1q1_output = f6_p1q1;
+p0q0_output = f6_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+const uint16x8_t is_flat3_mask_8 =
+vcombine_u16(is_flat3_mask, is_flat3_mask);
 const uint16x8_t needs_filter_mask_8 =
 vcombine_u16(needs_filter_mask, needs_filter_mask);

@@ -547,14 +532,6 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-uint16x8_t p0q0_output, p1q1_output;
-// Because we did not return after testing |needs_filter_mask| we know it is
-// nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
-// filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
-// output is not used.
-uint16x8_t f6_p1q1, f6_p0q0;
-const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
 if (vget_lane_u64(need_filter6, 0) == 0) {
 // filter6() does not apply, but filter4() applies to one or more values.
 p0q0_output = p0q0;
@@ -567,6 +544,7 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
+}

 uint16x4_t output[4] = {
 vget_low_u16(p1q1_output),
@@ -576,11 +554,7 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 };
 transpose_array_inplace_u16_4x4(output);

-// dst_n starts at p2, so adjust to p1.
-vst1_u16(dst_0 + 1, output[0]);
-vst1_u16(dst_1 + 1, output[1]);
-vst1_u16(dst_2 + 1, output[2]);
-vst1_u16(dst_3 + 1, output[3]);
+store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]);
 }

 void aom_highbd_lpf_vertical_6_dual_neon(
@@ -607,18 +581,14 @@ static inline void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
 // ^^^^^^^^^^^
 const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);

-// p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-// ^^^^^
-uint16x8_t sum = vshlq_n_u16(p23q23, 1);
-
 // Add two other terms to make dual issue with shift more likely.
 // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
 // ^^^^^^^^^^^
 const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);

 // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-// ^^^^^^^^^^^^^
-sum = vaddq_u16(sum, p01q01);
+// ^^^^^ ^^^^^^^^^^^^^
+uint16x8_t sum = vmlaq_n_u16(p01q01, p23q23, 2);

 // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
 // ^^^^^^
@@ -654,19 +624,9 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
 const uint8_t *blimit,
 const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst_p3 = s - 4 * pitch;
-uint16_t *const dst_p2 = s - 3 * pitch;
-uint16_t *const dst_p1 = s - 2 * pitch;
-uint16_t *const dst_p0 = s - pitch;
-uint16_t *const dst_q0 = s;
-uint16_t *const dst_q1 = s + pitch;
-uint16_t *const dst_q2 = s + 2 * pitch;
-uint16_t *const dst_q3 = s + 3 * pitch;
-
-const uint16x4_t src[8] = { vld1_u16(dst_p3), vld1_u16(dst_p2),
-vld1_u16(dst_p1), vld1_u16(dst_p0),
-vld1_u16(dst_q0), vld1_u16(dst_q1),
-vld1_u16(dst_q2), vld1_u16(dst_q3) };
+uint16x4_t src[8];
+load_u16_4x8(s - 4 * pitch, pitch, &src[0], &src[1], &src[2], &src[3],
+&src[4], &src[5], &src[6], &src[7]);

 // Adjust thresholds to bitdepth.
 const int outer_thresh = *blimit << (bd - 8);
@@ -684,13 +644,22 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
 filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
 bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

+uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+// Not needing filter4() at all is a very common case, so isolate it to avoid
+// needlessly computing filter4().
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
+vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
+filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+p2q2_output = f8_p2q2;
+p1q1_output = f8_p1q1;
+p0q0_output = f8_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
 const uint16x8_t needs_filter_mask_8 =
@@ -698,17 +667,14 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,

 uint16x8_t f4_p1q1;
 uint16x8_t f4_p0q0;
-// ZIP1 p0q0, p1q1 may perform better here.
 const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);

-uint16x8_t p0q0_output, p1q1_output, p2q2_output;
 // Because we did not return after testing |needs_filter_mask| we know it is
 // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
-// filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
-// output is not used.
-uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+// filter8. Therefore if it is false when |needs_filter_mask| is true,
+// filter8 output is not used.
 const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
 if (vget_lane_u64(need_filter8, 0) == 0) {
 // filter8() does not apply, but filter4() applies to one or more values.
@@ -725,13 +691,12 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
+}

-vst1_u16(dst_p2, vget_low_u16(p2q2_output));
-vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+store_u16_4x6(s - 3 * pitch, pitch, vget_low_u16(p2q2_output),
+vget_low_u16(p1q1_output), vget_low_u16(p0q0_output),
+vget_high_u16(p0q0_output), vget_high_u16(p1q1_output),
+vget_high_u16(p2q2_output));
 }

 void aom_highbd_lpf_horizontal_8_dual_neon(
@@ -749,16 +714,10 @@ static inline uint16x8_t reverse_low_half(const uint16x8_t a) {
 void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
 const uint8_t *blimit, const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst = s - 4;
-uint16_t *const dst_0 = dst;
-uint16_t *const dst_1 = dst + pitch;
-uint16_t *const dst_2 = dst + 2 * pitch;
-uint16_t *const dst_3 = dst + 3 * pitch;
-
 // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
 // To get desired pairs after transpose, one half should be reversed.
-uint16x8_t src[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
-vld1q_u16(dst_3) };
+uint16x8_t src[4];
+load_u16_8x4(s - 4, pitch, &src[0], &src[1], &src[2], &src[3]);

 // src[0] = p0q0
 // src[1] = p1q1
@@ -783,13 +742,22 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
 filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
 bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

+uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+// Not needing filter4() at all is a very common case, so isolate it to avoid
+// needlessly computing filter4().
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
+vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
+filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+p2q2_output = f8_p2q2;
+p1q1_output = f8_p1q1;
+p0q0_output = f8_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
 const uint16x8_t needs_filter_mask_8 =
@@ -797,15 +765,15 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,

 uint16x8_t f4_p1q1;
 uint16x8_t f4_p0q0;
-const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+const uint16x8_t p0q1 =
+vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);

-uint16x8_t p0q0_output, p1q1_output, p2q2_output;
 // Because we did not return after testing |needs_filter_mask| we know it is
 // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
-// filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
-// output is not used.
+// filter8. Therefore if it is false when |needs_filter_mask| is true,
+// filter8 output is not used.
 const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
 if (vget_lane_u64(need_filter8, 0) == 0) {
 // filter8() does not apply, but filter4() applies to one or more values.
@@ -815,7 +783,6 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
 } else {
 const uint16x8_t is_flat4_mask_8 =
 vcombine_u16(is_flat4_mask, is_flat4_mask);
-uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
 filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
 p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
 p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
@@ -823,6 +790,7 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
+}

 uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
 // After transpose, |output| will contain rows of the form:
@@ -831,10 +799,9 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,

 // Reverse p values to produce original order:
 // p3 p2 p1 p0 q0 q1 q2 q3
-vst1q_u16(dst_0, reverse_low_half(output[0]));
-vst1q_u16(dst_1, reverse_low_half(output[1]));
-vst1q_u16(dst_2, reverse_low_half(output[2]));
-vst1q_u16(dst_3, reverse_low_half(output[3]));
+store_u16_8x4(s - 4, pitch, reverse_low_half(output[0]),
+reverse_low_half(output[1]), reverse_low_half(output[2]),
+reverse_low_half(output[3]));
 }

 void aom_highbd_lpf_vertical_8_dual_neon(
@@ -864,8 +831,8 @@ static inline void filter14(
 // ^^^^^^^^^^^^^^^^^^^
 // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
 // ^^^^^^^^^^^^^^^^^^^
-uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
-sum = vaddq_u16(sum, p6q6_x7);
+const uint16x8_t p45q45 = vaddq_u16(p5q5, p4q4);
+uint16x8_t sum = vmlaq_n_u16(p6q6_x7, p45q45, 2);

 // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
 // ^^^^^^^
@@ -938,27 +905,10 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 const uint8_t *blimit,
 const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst_p6 = s - 7 * pitch;
-uint16_t *const dst_p5 = s - 6 * pitch;
-uint16_t *const dst_p4 = s - 5 * pitch;
-uint16_t *const dst_p3 = s - 4 * pitch;
-uint16_t *const dst_p2 = s - 3 * pitch;
-uint16_t *const dst_p1 = s - 2 * pitch;
-uint16_t *const dst_p0 = s - pitch;
-uint16_t *const dst_q0 = s;
-uint16_t *const dst_q1 = s + pitch;
-uint16_t *const dst_q2 = s + 2 * pitch;
-uint16_t *const dst_q3 = s + 3 * pitch;
-uint16_t *const dst_q4 = s + 4 * pitch;
-uint16_t *const dst_q5 = s + 5 * pitch;
-uint16_t *const dst_q6 = s + 6 * pitch;
-
-const uint16x4_t src[14] = {
-vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
-vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
-vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
-vld1_u16(dst_q5), vld1_u16(dst_q6)
-};
+uint16x4_t src[14];
+load_u16_4x14(s - 7 * pitch, pitch, &src[0], &src[1], &src[2], &src[3],
+&src[4], &src[5], &src[6], &src[7], &src[8], &src[9], &src[10],
+&src[11], &src[12], &src[13]);

 // Adjust thresholds to bitdepth.
 const int outer_thresh = *blimit << (bd - 8);
@@ -976,12 +926,10 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
 bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64
 const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
 const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
 const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
@@ -991,6 +939,32 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 const uint16x4_t is_flat4_outer_mask = vand_u16(
 is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
 vabdq_u16(p0q0, p6q6), bd));
+
+uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+p5q5_output;
+uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) {
+// filter14() applies to all values.
+filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+p5q5_output = f14_p5q5;
+p4q4_output = f14_p4q4;
+p3q3_output = f14_p3q3;
+p2q2_output = f14_p2q2;
+p1q1_output = f14_p1q1;
+p0q0_output = f14_p0q0;
+} else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
+vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) {
+// filter8() applies to all values.
+filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+p5q5_output = p5q5;
+p4q4_output = p4q4;
+p3q3_output = p3q3;
+p2q2_output = f8_p2q2;
+p1q1_output = f8_p1q1;
+p0q0_output = f8_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
 const uint16x8_t needs_filter_mask_8 =
@@ -998,18 +972,13 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,

 uint16x8_t f4_p1q1;
 uint16x8_t f4_p0q0;
-// ZIP1 p0q0, p1q1 may perform better here.
 const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
-p5q5_output;
 // Because we did not return after testing |needs_filter_mask| we know it is
 // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
-// filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
-// output is not used.
-uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+// filter8. Therefore if it is false when |needs_filter_mask| is true,
+// filter8 output is not used.
 const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
 if (vget_lane_u64(need_filter8, 0) == 0) {
 // filter8() and filter14() do not apply, but filter4() applies to one or
@@ -1024,10 +993,11 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 const uint16x8_t use_filter8_mask =
 vcombine_u16(is_flat4_mask, is_flat4_mask);
 filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
-const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+const uint64x1_t need_filter14 =
+vreinterpret_u64_u16(is_flat4_outer_mask);
 if (vget_lane_u64(need_filter14, 0) == 0) {
-// filter14() does not apply, but filter8() and filter4() apply to one or
-// more values.
+// filter14() does not apply, but filter8() and filter4() apply to one
+// or more values.
 p5q5_output = p5q5;
 p4q4_output = p4q4;
 p3q3_output = p3q3;
@@ -1040,7 +1010,6 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 // All filters may contribute values to final outputs.
 const uint16x8_t use_filter14_mask =
 vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
-uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
 filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
 &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
 p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
@@ -1057,19 +1026,15 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
 }
+}

-vst1_u16(dst_p5, vget_low_u16(p5q5_output));
-vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+store_u16_4x12(s - 6 * pitch, pitch, vget_low_u16(p5q5_output),
+vget_low_u16(p4q4_output), vget_low_u16(p3q3_output),
|
||||||
vst1_u16(dst_p3, vget_low_u16(p3q3_output));
|
vget_low_u16(p2q2_output), vget_low_u16(p1q1_output),
|
||||||
vst1_u16(dst_p2, vget_low_u16(p2q2_output));
|
vget_low_u16(p0q0_output), vget_high_u16(p0q0_output),
|
||||||
vst1_u16(dst_p1, vget_low_u16(p1q1_output));
|
vget_high_u16(p1q1_output), vget_high_u16(p2q2_output),
|
||||||
vst1_u16(dst_p0, vget_low_u16(p0q0_output));
|
vget_high_u16(p3q3_output), vget_high_u16(p4q4_output),
|
||||||
vst1_u16(dst_q0, vget_high_u16(p0q0_output));
|
vget_high_u16(p5q5_output));
|
||||||
vst1_u16(dst_q1, vget_high_u16(p1q1_output));
|
|
||||||
vst1_u16(dst_q2, vget_high_u16(p2q2_output));
|
|
||||||
vst1_u16(dst_q3, vget_high_u16(p3q3_output));
|
|
||||||
vst1_u16(dst_q4, vget_high_u16(p4q4_output));
|
|
||||||
vst1_u16(dst_q5, vget_high_u16(p5q5_output));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_highbd_lpf_horizontal_14_dual_neon(
|
void aom_highbd_lpf_horizontal_14_dual_neon(
|
||||||
@@ -1107,23 +1072,17 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
const uint8_t *blimit,
|
const uint8_t *blimit,
|
||||||
const uint8_t *limit,
|
const uint8_t *limit,
|
||||||
const uint8_t *thresh, int bd) {
|
const uint8_t *thresh, int bd) {
|
||||||
uint16_t *const dst = s - 8;
|
|
||||||
uint16_t *const dst_0 = dst;
|
|
||||||
uint16_t *const dst_1 = dst + pitch;
|
|
||||||
uint16_t *const dst_2 = dst + 2 * pitch;
|
|
||||||
uint16_t *const dst_3 = dst + 3 * pitch;
|
|
||||||
|
|
||||||
// Low halves: p7 p6 p5 p4
|
// Low halves: p7 p6 p5 p4
|
||||||
// High halves: p3 p2 p1 p0
|
// High halves: p3 p2 p1 p0
|
||||||
uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
|
uint16x8_t src_p[4];
|
||||||
vld1q_u16(dst_3) };
|
load_u16_8x4(s - 8, pitch, &src_p[0], &src_p[1], &src_p[2], &src_p[3]);
|
||||||
// p7 will be the low half of src_p[0]. Not used until the end.
|
// p7 will be the low half of src_p[0]. Not used until the end.
|
||||||
transpose_array_inplace_u16_4x8(src_p);
|
transpose_array_inplace_u16_4x8(src_p);
|
||||||
|
|
||||||
// Low halves: q0 q1 q2 q3
|
// Low halves: q0 q1 q2 q3
|
||||||
// High halves: q4 q5 q6 q7
|
// High halves: q4 q5 q6 q7
|
||||||
uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
|
uint16x8_t src_q[4];
|
||||||
vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
|
load_u16_8x4(s, pitch, &src_q[0], &src_q[1], &src_q[2], &src_q[3]);
|
||||||
// q7 will be the high half of src_q[3]. Not used until the end.
|
// q7 will be the high half of src_q[3]. Not used until the end.
|
||||||
transpose_array_inplace_u16_4x8(src_q);
|
transpose_array_inplace_u16_4x8(src_q);
|
||||||
|
|
||||||
@@ -1144,12 +1103,11 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
   filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
                 bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
 
-#if AOM_ARCH_AARCH64
-  if (vaddv_u16(needs_filter_mask) == 0) {
+  if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
     // None of the values will be filtered.
     return;
   }
-#endif  // AOM_ARCH_AARCH64
   const uint16x8_t p4q4 =
       vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
   const uint16x8_t p5q5 =
@@ -1164,6 +1122,32 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
const uint16x4_t is_flat4_outer_mask = vand_u16(
|
const uint16x4_t is_flat4_outer_mask = vand_u16(
|
||||||
is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
|
is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
|
||||||
vabdq_u16(p0q0, p6q6), bd));
|
vabdq_u16(p0q0, p6q6), bd));
|
||||||
|
|
||||||
|
uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
|
||||||
|
p5q5_output;
|
||||||
|
uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
|
||||||
|
uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
|
||||||
|
if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) {
|
||||||
|
// filter14() applies to all values.
|
||||||
|
filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
|
||||||
|
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
|
||||||
|
p5q5_output = f14_p5q5;
|
||||||
|
p4q4_output = f14_p4q4;
|
||||||
|
p3q3_output = f14_p3q3;
|
||||||
|
p2q2_output = f14_p2q2;
|
||||||
|
p1q1_output = f14_p1q1;
|
||||||
|
p0q0_output = f14_p0q0;
|
||||||
|
} else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
|
||||||
|
vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) {
|
||||||
|
// filter8() applies to all values.
|
||||||
|
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
|
||||||
|
p5q5_output = p5q5;
|
||||||
|
p4q4_output = p4q4;
|
||||||
|
p3q3_output = p3q3;
|
||||||
|
p2q2_output = f8_p2q2;
|
||||||
|
p1q1_output = f8_p1q1;
|
||||||
|
p0q0_output = f8_p0q0;
|
||||||
|
} else {
|
||||||
// Copy the masks to the high bits for packed comparisons later.
|
// Copy the masks to the high bits for packed comparisons later.
|
||||||
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
|
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
|
||||||
const uint16x8_t needs_filter_mask_8 =
|
const uint16x8_t needs_filter_mask_8 =
|
||||||
@@ -1171,17 +1155,14 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
|
|
||||||
uint16x8_t f4_p1q1;
|
uint16x8_t f4_p1q1;
|
||||||
uint16x8_t f4_p0q0;
|
uint16x8_t f4_p0q0;
|
||||||
const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
|
const uint16x8_t p0q1 =
|
||||||
|
vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
|
||||||
filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
|
filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
|
||||||
f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
|
f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
|
||||||
|
|
||||||
uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
|
|
||||||
p5q5_output;
|
|
||||||
// Because we did not return after testing |needs_filter_mask| we know it is
|
// Because we did not return after testing |needs_filter_mask| we know it is
|
||||||
// nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
|
// nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
|
||||||
// filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
|
// filter8. Therefore if it is false when |needs_filter_mask| is true,
|
||||||
// output is not used.
|
// filter8 output is not used.
|
||||||
uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
|
|
||||||
const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
|
const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
|
||||||
if (vget_lane_u64(need_filter8, 0) == 0) {
|
if (vget_lane_u64(need_filter8, 0) == 0) {
|
||||||
// filter8() and filter14() do not apply, but filter4() applies to one or
|
// filter8() and filter14() do not apply, but filter4() applies to one or
|
||||||
@@ -1196,10 +1177,11 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
const uint16x8_t use_filter8_mask =
|
const uint16x8_t use_filter8_mask =
|
||||||
vcombine_u16(is_flat4_mask, is_flat4_mask);
|
vcombine_u16(is_flat4_mask, is_flat4_mask);
|
||||||
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
|
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
|
||||||
const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
|
const uint64x1_t need_filter14 =
|
||||||
|
vreinterpret_u64_u16(is_flat4_outer_mask);
|
||||||
if (vget_lane_u64(need_filter14, 0) == 0) {
|
if (vget_lane_u64(need_filter14, 0) == 0) {
|
||||||
// filter14() does not apply, but filter8() and filter4() apply to one or
|
// filter14() does not apply, but filter8() and filter4() apply to one
|
||||||
// more values.
|
// or more values.
|
||||||
p5q5_output = p5q5;
|
p5q5_output = p5q5;
|
||||||
p4q4_output = p4q4;
|
p4q4_output = p4q4;
|
||||||
p3q3_output = p3q3;
|
p3q3_output = p3q3;
|
||||||
@@ -1212,7 +1194,6 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
// All filters may contribute values to final outputs.
|
// All filters may contribute values to final outputs.
|
||||||
const uint16x8_t use_filter14_mask =
|
const uint16x8_t use_filter14_mask =
|
||||||
vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
|
vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
|
||||||
uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
|
|
||||||
filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
|
filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
|
||||||
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
|
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
|
||||||
p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
|
p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
|
||||||
@@ -1229,6 +1210,8 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
|
p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// To get the correctly ordered rows from the transpose, we need:
|
// To get the correctly ordered rows from the transpose, we need:
|
||||||
// p7p3 p6p2 p5p1 p4p0
|
// p7p3 p6p2 p5p1 p4p0
|
||||||
// q0q4 q1q5 q2q6 q3q7
|
// q0q4 q1q5 q2q6 q3q7
|
||||||
@@ -1236,23 +1219,20 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output);
|
const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output);
|
||||||
const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output);
|
const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output);
|
||||||
const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
|
const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
|
||||||
|
|
||||||
uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
|
uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
|
||||||
p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
|
p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
|
||||||
transpose_array_inplace_u16_4x8(output_p);
|
|
||||||
uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
|
uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
|
||||||
p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
|
p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
|
||||||
|
|
||||||
|
transpose_array_inplace_u16_4x8(output_p);
|
||||||
transpose_array_inplace_u16_4x8(output_q);
|
transpose_array_inplace_u16_4x8(output_q);
|
||||||
|
|
||||||
// Reverse p values to produce original order:
|
// Reverse p values to produce original order:
|
||||||
// p3 p2 p1 p0 q0 q1 q2 q3
|
// p3 p2 p1 p0 q0 q1 q2 q3
|
||||||
vst1q_u16(dst_0, output_p[0]);
|
store_u16_8x4(s - 8, pitch, output_p[0], output_p[1], output_p[2],
|
||||||
vst1q_u16(dst_0 + 8, output_q[0]);
|
output_p[3]);
|
||||||
vst1q_u16(dst_1, output_p[1]);
|
store_u16_8x4(s, pitch, output_q[0], output_q[1], output_q[2], output_q[3]);
|
||||||
vst1q_u16(dst_1 + 8, output_q[1]);
|
|
||||||
vst1q_u16(dst_2, output_p[2]);
|
|
||||||
vst1q_u16(dst_2 + 8, output_q[2]);
|
|
||||||
vst1q_u16(dst_3, output_p[3]);
|
|
||||||
vst1q_u16(dst_3 + 8, output_q[3]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_highbd_lpf_vertical_14_dual_neon(
|
void aom_highbd_lpf_vertical_14_dual_neon(
|
||||||
|
|||||||
third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -146,172 +146,237 @@ static inline uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
|
|||||||
return mask_8x8;
|
return mask_8x8;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
|
static inline void filter4(const uint8x8_t p0q0, const uint8x8_t p1q1,
|
||||||
uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
|
uint8x8_t *p0q0_output, uint8x8_t *p1q1_output,
|
||||||
uint8x8_t *p0q0, const uint8_t blimit,
|
uint8x8_t mask_8x8, const uint8_t thresh) {
|
||||||
const uint8_t limit, const uint8_t thresh) {
|
|
||||||
uint16x8_t out;
|
|
||||||
uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
|
|
||||||
out_f14_pq5;
|
|
||||||
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
|
|
||||||
uint8x8_t out_f4_pq0, out_f4_pq1;
|
|
||||||
uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
|
|
||||||
uint8x8_t q0p0, q1p1, q2p2;
|
|
||||||
|
|
||||||
// Calculate filter masks
|
|
||||||
mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
|
|
||||||
flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
|
|
||||||
flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
|
|
||||||
{
|
|
||||||
// filter 4
|
|
||||||
int32x2x2_t ps0_qs0, ps1_qs1;
|
|
||||||
int16x8_t filter_s16;
|
|
||||||
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
||||||
uint8x8_t temp0_8x8, temp1_8x8;
|
|
||||||
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
|
|
||||||
int8x8_t op0, oq0, op1, oq1;
|
|
||||||
int8x8_t pq_s0, pq_s1;
|
|
||||||
int8x8_t filter_s8, filter1_s8, filter2_s8;
|
|
||||||
int8x8_t hev_8x8;
|
|
||||||
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
||||||
const int8x8_t val_4 = vdup_n_s8(4);
|
const int8x8_t val_4 = vdup_n_s8(4);
|
||||||
const int8x8_t val_3 = vdup_n_s8(3);
|
const int8x8_t val_3 = vdup_n_s8(3);
|
||||||
|
|
||||||
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
|
int8x8_t pq_s0 = veor_s8(vreinterpret_s8_u8(p0q0), sign_mask);
|
||||||
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
|
int8x8_t pq_s1 = veor_s8(vreinterpret_s8_u8(p1q1), sign_mask);
|
||||||
|
|
||||||
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
int32x2x2_t ps0_qs0 =
|
||||||
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
||||||
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
int32x2x2_t ps1_qs1 =
|
||||||
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
||||||
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
int8x8_t ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
||||||
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
int8x8_t qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
||||||
|
int8x8_t ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
||||||
|
int8x8_t qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
||||||
|
|
||||||
// hev_mask
|
// hev_mask
|
||||||
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
|
uint8x8_t temp0_8x8 = vcgt_u8(vabd_u8(p0q0, p1q1), thresh_f4);
|
||||||
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
uint8x8_t temp1_8x8 =
|
||||||
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
||||||
|
int8x8_t hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
||||||
|
|
||||||
// add outer taps if we have high edge variance
|
// add outer taps if we have high edge variance
|
||||||
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
int8x8_t filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
||||||
|
|
||||||
// inner taps
|
// inner taps
|
||||||
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
int8x8_t temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
||||||
filter_s16 = vmovl_s8(filter_s8);
|
int16x8_t filter_s16 = vmovl_s8(filter_s8);
|
||||||
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
||||||
filter_s8 = vqmovn_s16(filter_s16);
|
filter_s8 = vqmovn_s16(filter_s16);
|
||||||
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
||||||
|
|
||||||
filter1_s8 = vqadd_s8(filter_s8, val_4);
|
int8x8_t filter1_s8 = vqadd_s8(filter_s8, val_4);
|
||||||
filter2_s8 = vqadd_s8(filter_s8, val_3);
|
int8x8_t filter2_s8 = vqadd_s8(filter_s8, val_3);
|
||||||
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
||||||
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
||||||
|
|
||||||
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
int8x8_t oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
||||||
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
int8x8_t op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
||||||
|
|
||||||
hev_8x8 = vmvn_s8(hev_8x8);
|
|
||||||
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
filter_s8 = vbic_s8(filter_s8, hev_8x8);
|
||||||
|
|
||||||
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
int8x8_t oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
||||||
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
int8x8_t op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
||||||
|
|
||||||
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
*p0q0_output = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
||||||
out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
*p1q1_output = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
||||||
}
|
}
|
||||||
// reverse p and q
|
|
||||||
q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
|
|
||||||
q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
|
|
||||||
q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
|
|
||||||
{
|
|
||||||
// filter 8
|
|
||||||
uint16x8_t out_pq0, out_pq1, out_pq2;
|
|
||||||
out = vaddl_u8(*p3q3, *p2q2);
|
|
||||||
out = vaddw_u8(out, *p1q1);
|
|
||||||
out = vaddw_u8(out, *p0q0);
|
|
||||||
|
|
||||||
out = vaddw_u8(out, q0p0);
|
static inline void filter8(const uint8x8_t p0q0, const uint8x8_t p1q1,
|
||||||
out_pq1 = vaddw_u8(out, *p3q3);
|
const uint8x8_t p2q2, const uint8x8_t p3q3,
|
||||||
out_pq2 = vaddw_u8(out_pq1, *p3q3);
|
uint8x8_t *p0q0_output, uint8x8_t *p1q1_output,
|
||||||
out_pq2 = vaddw_u8(out_pq2, *p2q2);
|
uint8x8_t *p2q2_output) {
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p1q1);
|
// Reverse p and q.
|
||||||
out_pq1 = vaddw_u8(out_pq1, q1p1);
|
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4);
|
||||||
|
uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4);
|
||||||
|
uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4);
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out, *p0q0);
|
uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1);
|
||||||
out_pq0 = vaddw_u8(out_pq0, q1p1);
|
uint16x8_t p2q2_p3q3 = vaddl_u8(p3q3, p2q2);
|
||||||
out_pq0 = vaddw_u8(out_pq0, q2p2);
|
uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3);
|
||||||
|
|
||||||
out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
|
uint16x8_t q0p0_p3q3 = vaddl_u8(q0p0, p3q3);
|
||||||
out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
|
uint16x8_t out_q0p0_p3q3 = vaddq_u16(out, q0p0_p3q3);
|
||||||
out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
// filter 14
|
|
||||||
uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
|
|
||||||
uint16x8_t p6q6_2, p6q6_temp, qp_sum;
|
|
||||||
uint8x8_t qp_rev;
|
|
||||||
|
|
||||||
out = vaddw_u8(out, *p4q4);
|
uint16x8_t out_pq2 = vaddq_u16(out_q0p0_p3q3, p2q2_p3q3);
|
||||||
out = vaddw_u8(out, *p5q5);
|
|
||||||
out = vaddw_u8(out, *p6q6);
|
|
||||||
|
|
||||||
out_pq5 = vaddw_u8(out, *p4q4);
|
uint16x8_t p1q1_q1p1 = vaddl_u8(p1q1, q1p1);
|
||||||
out_pq4 = vaddw_u8(out_pq5, *p3q3);
|
uint16x8_t out_pq1 = vaddq_u16(out_q0p0_p3q3, p1q1_q1p1);
|
||||||
out_pq3 = vaddw_u8(out_pq4, *p2q2);
|
|
||||||
|
|
||||||
out_pq5 = vaddw_u8(out_pq5, *p5q5);
|
uint16x8_t q0p0_p0q0 = vaddl_u8(q0p0, p0q0);
|
||||||
out_pq4 = vaddw_u8(out_pq4, *p5q5);
|
uint16x8_t q1p1_q2p2 = vaddl_u8(q1p1, q2p2);
|
||||||
|
uint16x8_t out_pq0 = vaddq_u16(q0p0_p0q0, q1p1_q2p2);
|
||||||
|
out_pq0 = vaddq_u16(out_pq0, out);
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out, *p1q1);
|
*p0q0_output = vrshrn_n_u16(out_pq0, 3);
|
||||||
out_pq1 = vaddw_u8(out_pq0, *p2q2);
|
*p1q1_output = vrshrn_n_u16(out_pq1, 3);
|
||||||
out_pq2 = vaddw_u8(out_pq1, *p3q3);
|
*p2q2_output = vrshrn_n_u16(out_pq2, 3);
|
||||||
|
}
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out_pq0, *p0q0);
|
static inline void filter14(const uint8x8_t p0q0, const uint8x8_t p1q1,
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p0q0);
|
const uint8x8_t p2q2, const uint8x8_t p3q3,
|
||||||
|
const uint8x8_t p4q4, const uint8x8_t p5q5,
|
||||||
|
const uint8x8_t p6q6, uint8x8_t *p0q0_output,
|
||||||
|
uint8x8_t *p1q1_output, uint8x8_t *p2q2_output,
|
||||||
|
uint8x8_t *p3q3_output, uint8x8_t *p4q4_output,
|
||||||
|
uint8x8_t *p5q5_output) {
|
||||||
|
// Reverse p and q.
|
||||||
|
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4);
|
||||||
|
uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4);
|
||||||
|
uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4);
|
||||||
|
uint8x8_t q3p3 = vext_u8(p3q3, p3q3, 4);
|
||||||
|
uint8x8_t q4p4 = vext_u8(p4q4, p4q4, 4);
|
||||||
|
uint8x8_t q5p5 = vext_u8(p5q5, p5q5, 4);
|
||||||
|
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p6q6);
|
uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1);
|
||||||
p6q6_2 = vaddl_u8(*p6q6, *p6q6);
|
uint16x8_t p2q2_p3q3 = vaddl_u8(p2q2, p3q3);
|
||||||
out_pq2 = vaddq_u16(out_pq2, p6q6_2);
|
uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3);
|
||||||
p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
|
|
||||||
|
uint16x8_t q0p0_p4q4 = vaddl_u8(q0p0, p4q4);
|
||||||
|
uint16x8_t p5q5_p6q6 = vaddl_u8(p5q5, p6q6);
|
||||||
|
uint16x8_t tmp = vaddq_u16(q0p0_p4q4, p5q5_p6q6);
|
||||||
|
// This offset removes the need for a rounding shift at the end.
|
||||||
|
uint16x8_t tmp_offset = vaddq_u16(tmp, vdupq_n_u16(1 << 3));
|
||||||
|
out = vaddq_u16(out, tmp_offset);
|
||||||
|
|
||||||
|
uint16x8_t out_pq5 = vaddw_u8(out, p4q4);
|
||||||
|
uint16x8_t out_pq4 = vaddw_u8(out_pq5, p3q3);
|
||||||
|
uint16x8_t out_pq3 = vaddw_u8(out_pq4, p2q2);
|
||||||
|
|
||||||
|
out_pq5 = vaddw_u8(out_pq5, p5q5);
|
||||||
|
|
||||||
|
uint16x8_t out_pq0 = vaddw_u8(out, p1q1);
|
||||||
|
uint16x8_t out_pq1 = vaddw_u8(out_pq0, p2q2);
|
||||||
|
uint16x8_t out_pq2 = vaddw_u8(out_pq1, p3q3);
|
||||||
|
|
||||||
|
uint16x8_t p0q0_q0p0 = vaddl_u8(p0q0, q0p0);
|
||||||
|
out_pq0 = vaddq_u16(out_pq0, p0q0_q0p0);
|
||||||
|
|
||||||
|
uint16x8_t p0q0_p6q6 = vaddl_u8(p0q0, p6q6);
|
||||||
|
out_pq1 = vaddq_u16(out_pq1, p0q0_p6q6);
|
||||||
|
uint16x8_t p5q5_q1p1 = vaddl_u8(p5q5, q1p1);
|
||||||
|
out_pq4 = vaddq_u16(out_pq4, p5q5_q1p1);
|
||||||
|
|
||||||
|
uint16x8_t p6q6_p6q6 = vaddl_u8(p6q6, p6q6);
|
||||||
|
out_pq2 = vaddq_u16(out_pq2, p6q6_p6q6);
|
||||||
|
uint16x8_t p6q6_temp = vaddw_u8(p6q6_p6q6, p6q6);
|
||||||
out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
|
out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
|
||||||
p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
|
p6q6_temp = vaddw_u8(p6q6_temp, p6q6);
|
||||||
out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
|
out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
|
||||||
p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
|
p6q6_temp = vaddq_u16(p6q6_temp, p6q6_p6q6);
|
||||||
out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
|
out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
|
||||||
|
|
||||||
out_pq4 = vaddw_u8(out_pq4, q1p1);
|
uint16x8_t qp_sum = vaddl_u8(q2p2, q1p1);
|
||||||
|
|
||||||
qp_sum = vaddl_u8(q2p2, q1p1);
|
|
||||||
out_pq3 = vaddq_u16(out_pq3, qp_sum);
|
out_pq3 = vaddq_u16(out_pq3, qp_sum);
|
||||||
|
|
||||||
qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
|
qp_sum = vaddw_u8(qp_sum, q3p3);
|
||||||
qp_sum = vaddw_u8(qp_sum, qp_rev);
|
|
||||||
out_pq2 = vaddq_u16(out_pq2, qp_sum);
|
out_pq2 = vaddq_u16(out_pq2, qp_sum);
|
||||||
|
|
||||||
qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
|
qp_sum = vaddw_u8(qp_sum, q4p4);
|
||||||
qp_sum = vaddw_u8(qp_sum, qp_rev);
|
|
||||||
out_pq1 = vaddq_u16(out_pq1, qp_sum);
|
out_pq1 = vaddq_u16(out_pq1, qp_sum);
|
||||||
|
|
||||||
qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
|
qp_sum = vaddw_u8(qp_sum, q5p5);
|
||||||
qp_sum = vaddw_u8(qp_sum, qp_rev);
|
|
||||||
out_pq0 = vaddq_u16(out_pq0, qp_sum);
|
out_pq0 = vaddq_u16(out_pq0, qp_sum);
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out_pq0, q0p0);
|
*p0q0_output = vshrn_n_u16(out_pq0, 4);
|
||||||
|
*p1q1_output = vshrn_n_u16(out_pq1, 4);
|
||||||
|
*p2q2_output = vshrn_n_u16(out_pq2, 4);
|
||||||
|
*p3q3_output = vshrn_n_u16(out_pq3, 4);
|
||||||
|
*p4q4_output = vshrn_n_u16(out_pq4, 4);
|
||||||
|
*p5q5_output = vshrn_n_u16(out_pq5, 4);
|
||||||
|
}
|
||||||
|
|
||||||
out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
|
static inline void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5,
|
||||||
out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
|
uint8x8_t *p4q4, uint8x8_t *p3q3,
|
||||||
out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
|
uint8x8_t *p2q2, uint8x8_t *p1q1,
|
||||||
out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
|
uint8x8_t *p0q0, const uint8_t blimit,
|
||||||
out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
|
const uint8_t limit, const uint8_t thresh) {
|
||||||
out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
|
uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
|
||||||
|
out_f14_pq5;
|
||||||
|
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
|
||||||
|
uint8x8_t out_f4_pq0, out_f4_pq1;
|
||||||
|
|
||||||
|
// Calculate filter masks.
|
||||||
|
uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
|
||||||
|
uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
|
||||||
|
uint8x8_t flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
|
||||||
|
|
||||||
|
// No filtering.
|
||||||
|
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
{
|
|
||||||
uint8x8_t filter4_cond, filter8_cond, filter14_cond;
|
uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8);
|
||||||
filter8_cond = vand_u8(flat_8x8, mask_8x8);
|
uint8x8_t filter4_cond = vmvn_u8(filter8_cond);
|
||||||
filter4_cond = vmvn_u8(filter8_cond);
|
uint8x8_t filter14_cond = vand_u8(filter8_cond, flat2_8x8);
|
||||||
filter14_cond = vand_u8(filter8_cond, flat2_8x8);
|
|
||||||
|
if (vget_lane_s64(vreinterpret_s64_u8(filter14_cond), 0) == -1) {
|
||||||
|
// Only filter14() applies.
|
||||||
|
filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0,
|
||||||
|
&out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4,
|
||||||
|
&out_f14_pq5);
|
||||||
|
|
||||||
|
*p0q0 = out_f14_pq0;
|
||||||
|
*p1q1 = out_f14_pq1;
|
||||||
|
*p2q2 = out_f14_pq2;
|
||||||
|
*p3q3 = out_f14_pq3;
|
||||||
|
*p4q4 = out_f14_pq4;
|
||||||
|
*p5q5 = out_f14_pq5;
|
||||||
|
} else if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 &&
|
||||||
|
vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) {
|
||||||
|
// Only filter8() applies.
|
||||||
|
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2);
|
||||||
|
|
||||||
|
*p0q0 = out_f7_pq0;
|
||||||
|
*p1q1 = out_f7_pq1;
|
||||||
|
*p2q2 = out_f7_pq2;
|
||||||
|
} else {
|
||||||
|
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
|
||||||
|
|
||||||
|
if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 &&
|
||||||
|
vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) {
|
||||||
|
// filter8() and filter14() do not apply, but filter4() applies to one or
|
||||||
|
// more values.
|
||||||
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
|
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
|
||||||
|
} else {
|
||||||
|
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1,
|
||||||
|
&out_f7_pq2);
|
||||||
|
|
||||||
|
if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0) {
|
||||||
|
// filter14() does not apply, but filter8() and filter4() apply to one
|
||||||
|
// or more values. filter4 outputs
|
||||||
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
|
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
|
||||||
|
|
||||||
|
// filter8 outputs
|
||||||
|
*p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
|
||||||
|
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
|
||||||
|
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
|
||||||
|
} else {
|
||||||
|
// All filters may contribute values to final outputs.
|
||||||
|
filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0,
|
||||||
|
&out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4,
|
||||||
|
&out_f14_pq5);
|
||||||
|
|
||||||
// filter4 outputs
|
// filter4 outputs
|
||||||
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
@@ -330,111 +395,46 @@ static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
|
|||||||
*p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
|
*p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
|
||||||
*p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
|
*p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
|
static inline void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
|
||||||
uint8x8_t *p0q0, const uint8_t blimit,
|
uint8x8_t *p0q0, const uint8_t blimit,
|
||||||
const uint8_t limit, const uint8_t thresh) {
|
const uint8_t limit, const uint8_t thresh) {
|
||||||
uint16x8_t out;
|
|
||||||
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
|
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
|
||||||
uint8x8_t out_f4_pq0, out_f4_pq1;
|
uint8x8_t out_f4_pq0, out_f4_pq1;
|
||||||
uint8x8_t mask_8x8, flat_8x8;
|
|
||||||
|
|
||||||
// Calculate filter masks
|
// Calculate filter masks.
|
||||||
mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
|
uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
|
||||||
flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
|
uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
|
||||||
{
|
|
||||||
// filter 4
|
|
||||||
int32x2x2_t ps0_qs0, ps1_qs1;
|
|
||||||
int16x8_t filter_s16;
|
|
||||||
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
|
||||||
uint8x8_t temp0_8x8, temp1_8x8;
|
|
||||||
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
|
|
||||||
int8x8_t op0, oq0, op1, oq1;
|
|
||||||
int8x8_t pq_s0, pq_s1;
|
|
||||||
int8x8_t filter_s8, filter1_s8, filter2_s8;
|
|
||||||
int8x8_t hev_8x8;
|
|
||||||
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
|
||||||
const int8x8_t val_4 = vdup_n_s8(4);
|
|
||||||
const int8x8_t val_3 = vdup_n_s8(3);
|
|
||||||
|
|
||||||
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
|
// No filtering.
|
||||||
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
|
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
|
||||||
|
return;
|
||||||
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
|
||||||
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
|
||||||
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
|
||||||
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
|
||||||
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
|
||||||
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
|
||||||
|
|
||||||
// hev_mask
|
|
||||||
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
|
|
||||||
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
|
||||||
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
|
||||||
|
|
||||||
// add outer taps if we have high edge variance
|
|
||||||
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
// inner taps
|
|
||||||
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
|
||||||
filter_s16 = vmovl_s8(filter_s8);
|
|
||||||
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
|
||||||
filter_s8 = vqmovn_s16(filter_s16);
|
|
||||||
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
|
||||||
|
|
||||||
filter1_s8 = vqadd_s8(filter_s8, val_4);
|
|
||||||
filter2_s8 = vqadd_s8(filter_s8, val_3);
|
|
||||||
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
|
||||||
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
|
||||||
|
|
||||||
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
|
||||||
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
|
||||||
|
|
||||||
hev_8x8 = vmvn_s8(hev_8x8);
|
|
||||||
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
|
||||||
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
|
||||||
|
|
||||||
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
|
||||||
out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
|
||||||
}
|
}
|
||||||
{
|
|
||||||
// filter 8
|
|
||||||
uint16x8_t out_pq0, out_pq1, out_pq2;
|
|
||||||
uint8x8_t q0p0, q1p1, q2p2;
|
|
||||||
|
|
||||||
out = vaddl_u8(*p3q3, *p2q2);
|
uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8);
|
||||||
out = vaddw_u8(out, *p1q1);
|
uint8x8_t filter4_cond = vmvn_u8(filter8_cond);
|
||||||
out = vaddw_u8(out, *p0q0);
|
|
||||||
|
|
||||||
// reverse p and q
|
// Not needing filter4() at all is a very common case, so isolate it to avoid
|
||||||
q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
|
// needlessly computing filter4().
|
||||||
q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
|
if (vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) {
|
||||||
q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
|
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2);
|
||||||
|
|
||||||
out = vaddw_u8(out, q0p0);
|
*p0q0 = out_f7_pq0;
|
||||||
out_pq1 = vaddw_u8(out, *p3q3);
|
*p1q1 = out_f7_pq1;
|
||||||
out_pq2 = vaddw_u8(out_pq1, *p3q3);
|
*p2q2 = out_f7_pq2;
|
||||||
out_pq2 = vaddw_u8(out_pq2, *p2q2);
|
} else {
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p1q1);
|
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
|
||||||
out_pq1 = vaddw_u8(out_pq1, q1p1);
|
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out, *p0q0);
|
if (vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) {
|
||||||
out_pq0 = vaddw_u8(out_pq0, q1p1);
|
// filter8() does not apply, but filter4() applies to one or more values.
|
||||||
out_pq0 = vaddw_u8(out_pq0, q2p2);
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
|
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
|
||||||
out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
|
} else {
|
||||||
out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
|
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1,
|
||||||
out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
|
&out_f7_pq2);
|
||||||
}
|
|
||||||
{
|
|
||||||
uint8x8_t filter4_cond, filter8_cond;
|
|
||||||
filter8_cond = vand_u8(flat_8x8, mask_8x8);
|
|
||||||
filter4_cond = vmvn_u8(filter8_cond);
|
|
||||||
|
|
||||||
// filter4 outputs
|
// filter4 outputs
|
||||||
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
@@ -445,103 +445,65 @@ static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
|
|||||||
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
|
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
|
||||||
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
|
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
|
static inline void filter6(const uint8x8_t p0q0, const uint8x8_t p1q1,
|
||||||
|
const uint8x8_t p2q2, uint8x8_t *p0q0_output,
|
||||||
|
uint8x8_t *p1q1_output) {
|
||||||
|
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4);
|
||||||
|
|
||||||
|
uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1);
|
||||||
|
uint16x8_t out = vaddq_u16(p0q0_p1q1, p0q0_p1q1);
|
||||||
|
|
||||||
|
uint16x8_t q0p0_p2q2 = vaddl_u8(q0p0, p2q2);
|
||||||
|
out = vaddq_u16(out, q0p0_p2q2);
|
||||||
|
|
||||||
|
uint16x8_t q0p0_q1p1 = vextq_u16(p0q0_p1q1, p0q0_p1q1, 4);
|
||||||
|
uint16x8_t out_pq0 = vaddq_u16(out, q0p0_q1p1);
|
||||||
|
|
||||||
|
uint16x8_t p2q2_p2q2 = vaddl_u8(p2q2, p2q2);
|
||||||
|
uint16x8_t out_pq1 = vaddq_u16(out, p2q2_p2q2);
|
||||||
|
|
||||||
|
*p0q0_output = vrshrn_n_u16(out_pq0, 3);
|
||||||
|
*p1q1_output = vrshrn_n_u16(out_pq1, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
|
||||||
const uint8_t blimit, const uint8_t limit,
|
const uint8_t blimit, const uint8_t limit,
|
||||||
const uint8_t thresh) {
|
const uint8_t thresh) {
|
||||||
uint16x8_t out;
|
|
||||||
uint8x8_t out_f6_pq0, out_f6_pq1;
|
uint8x8_t out_f6_pq0, out_f6_pq1;
|
||||||
uint8x8_t out_f4_pq0, out_f4_pq1;
|
uint8x8_t out_f4_pq0, out_f4_pq1;
|
||||||
uint8x8_t mask_8x8, flat_8x8;
|
|
||||||
|
|
||||||
// Calculate filter masks
|
// Calculate filter masks.
|
||||||
mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
|
uint8x8_t mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
|
||||||
flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
|
uint8x8_t flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
|
||||||
{
|
|
||||||
// filter 4
|
|
||||||
int32x2x2_t ps0_qs0, ps1_qs1;
|
|
||||||
int16x8_t filter_s16;
|
|
||||||
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
|
||||||
uint8x8_t temp0_8x8, temp1_8x8;
|
|
||||||
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
|
|
||||||
int8x8_t op0, oq0, op1, oq1;
|
|
||||||
int8x8_t pq_s0, pq_s1;
|
|
||||||
int8x8_t filter_s8, filter1_s8, filter2_s8;
|
|
||||||
int8x8_t hev_8x8;
|
|
||||||
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
|
||||||
const int8x8_t val_4 = vdup_n_s8(4);
|
|
||||||
const int8x8_t val_3 = vdup_n_s8(3);
|
|
||||||
|
|
||||||
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
|
// No filtering.
|
||||||
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
|
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
|
||||||
|
return;
|
||||||
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
|
||||||
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
|
||||||
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
|
||||||
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
|
||||||
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
|
||||||
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
|
||||||
|
|
||||||
// hev_mask
|
|
||||||
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
|
|
||||||
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
|
||||||
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
|
||||||
|
|
||||||
// add outer taps if we have high edge variance
|
|
||||||
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
// inner taps
|
|
||||||
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
|
||||||
filter_s16 = vmovl_s8(filter_s8);
|
|
||||||
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
|
||||||
filter_s8 = vqmovn_s16(filter_s16);
|
|
||||||
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
|
||||||
|
|
||||||
filter1_s8 = vqadd_s8(filter_s8, val_4);
|
|
||||||
filter2_s8 = vqadd_s8(filter_s8, val_3);
|
|
||||||
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
|
||||||
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
|
||||||
|
|
||||||
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
|
||||||
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
|
||||||
|
|
||||||
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
|
||||||
filter_s8 = vbic_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
|
||||||
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
|
||||||
|
|
||||||
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
|
||||||
out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
|
||||||
}
|
}
|
||||||
{
|
|
||||||
// filter 6
|
|
||||||
uint16x8_t out_pq0, out_pq1;
|
|
||||||
uint8x8_t pq_rev;
|
|
||||||
|
|
||||||
out = vaddl_u8(*p0q0, *p1q1);
|
uint8x8_t filter6_cond = vand_u8(flat_8x8, mask_8x8);
|
||||||
out = vaddq_u16(out, out);
|
uint8x8_t filter4_cond = vmvn_u8(filter6_cond);
|
||||||
out = vaddw_u8(out, *p2q2);
|
|
||||||
|
|
||||||
pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
|
// Not needing filter4 at all is a very common case, so isolate it to avoid
|
||||||
out = vaddw_u8(out, pq_rev);
|
// needlessly computing filter4.
|
||||||
|
if (vget_lane_s64(vreinterpret_s64_u8(filter6_cond), 0) == -1) {
|
||||||
|
filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1);
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out, pq_rev);
|
*p0q0 = out_f6_pq0;
|
||||||
pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
|
*p1q1 = out_f6_pq1;
|
||||||
out_pq0 = vaddw_u8(out_pq0, pq_rev);
|
} else {
|
||||||
|
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
|
||||||
|
|
||||||
out_pq1 = vaddw_u8(out, *p2q2);
|
if (vget_lane_u64(vreinterpret_u64_u8(filter6_cond), 0) == 0) {
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p2q2);
|
// filter6 does not apply, but filter4 applies to one or more values.
|
||||||
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
|
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
|
||||||
out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
|
} else {
|
||||||
}
|
// All filters may contribute to the final output.
|
||||||
{
|
filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1);
|
||||||
uint8x8_t filter4_cond, filter6_cond;
|
|
||||||
filter6_cond = vand_u8(flat_8x8, mask_8x8);
|
|
||||||
filter4_cond = vmvn_u8(filter6_cond);
|
|
||||||
|
|
||||||
// filter4 outputs
|
// filter4 outputs
|
||||||
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
@@ -551,68 +513,26 @@ static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
|
|||||||
*p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
|
*p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
|
||||||
*p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
|
*p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
|
static inline void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0,
|
||||||
const uint8_t limit, const uint8_t thresh) {
|
const uint8_t blimit, const uint8_t limit,
|
||||||
int32x2x2_t ps0_qs0, ps1_qs1;
|
const uint8_t thresh) {
|
||||||
int16x8_t filter_s16;
|
uint8x8_t out_f4_pq0, out_f4_pq1;
|
||||||
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
|
||||||
uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
|
|
||||||
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
|
|
||||||
int8x8_t op0, oq0, op1, oq1;
|
|
||||||
int8x8_t pq_s0, pq_s1;
|
|
||||||
int8x8_t filter_s8, filter1_s8, filter2_s8;
|
|
||||||
int8x8_t hev_8x8;
|
|
||||||
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
|
||||||
const int8x8_t val_4 = vdup_n_s8(4);
|
|
||||||
const int8x8_t val_3 = vdup_n_s8(3);
|
|
||||||
|
|
||||||
// Calculate filter mask
|
// Calculate filter mask
|
||||||
mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
|
uint8x8_t mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
|
||||||
|
|
||||||
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
|
// No filtering.
|
||||||
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
|
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
|
||||||
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
|
||||||
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
|
||||||
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
|
||||||
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
|
||||||
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
|
||||||
|
|
||||||
// hev_mask
|
*p0q0 = out_f4_pq0;
|
||||||
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
|
*p1q1 = out_f4_pq1;
|
||||||
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
|
||||||
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
|
||||||
|
|
||||||
// add outer taps if we have high edge variance
|
|
||||||
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
// inner taps
|
|
||||||
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
|
||||||
filter_s16 = vmovl_s8(filter_s8);
|
|
||||||
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
|
||||||
filter_s8 = vqmovn_s16(filter_s16);
|
|
||||||
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
|
||||||
|
|
||||||
filter1_s8 = vqadd_s8(filter_s8, val_4);
|
|
||||||
filter2_s8 = vqadd_s8(filter_s8, val_3);
|
|
||||||
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
|
||||||
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
|
||||||
|
|
||||||
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
|
||||||
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
|
||||||
|
|
||||||
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
|
||||||
filter_s8 = vbic_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
|
||||||
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
|
||||||
|
|
||||||
*p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
|
||||||
*p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
|
void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
|
||||||
|
|||||||
third_party/aom/aom_dsp/arm/mem_neon.h
@@ -55,12 +55,52 @@ static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   return res;
 }
 
+static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
+  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
+  return res;
+}
+
+static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
+  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
+                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
+  return res;
+}
+
+static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
+  vst1_u8(ptr + 0 * 8, a.val[0]);
+  vst1_u8(ptr + 1 * 8, a.val[1]);
+}
+
+static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
+  vst1_u8(ptr + 0 * 8, a.val[0]);
+  vst1_u8(ptr + 1 * 8, a.val[1]);
+  vst1_u8(ptr + 2 * 8, a.val[2]);
+  vst1_u8(ptr + 3 * 8, a.val[3]);
+}
+
+static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) {
+  vst1q_u16(ptr + 0 * 8, a.val[0]);
+  vst1q_u16(ptr + 1 * 8, a.val[1]);
+}
+
+static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) {
+  vst1q_u16(ptr + 0 * 8, a.val[0]);
+  vst1q_u16(ptr + 1 * 8, a.val[1]);
+  vst1q_u16(ptr + 2 * 8, a.val[2]);
+  vst1q_u16(ptr + 3 * 8, a.val[3]);
+}
+
 #elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
 #if __GNUC__ < 8
 static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
   uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
   return res;
 }
 
+static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
+  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
+  return res;
+}
 #endif  // __GNUC__ < 8
 
 #if __GNUC__ < 9
@@ -71,13 +111,30 @@ static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
 }
 #endif  // __GNUC__ < 9
 
-// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards.
 #if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
 static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                          vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
   return res;
 }
 
+static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
+  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
+                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
+  return res;
+}
+
+static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
+  vst1_u8(ptr + 0 * 8, a.val[0]);
+  vst1_u8(ptr + 1 * 8, a.val[1]);
+}
+
+static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
+  vst1_u8(ptr + 0 * 8, a.val[0]);
+  vst1_u8(ptr + 1 * 8, a.val[1]);
+  vst1_u8(ptr + 2 * 8, a.val[2]);
+  vst1_u8(ptr + 3 * 8, a.val[3]);
+}
 #endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
 #endif  // defined(__GNUC__) && !defined(__clang__)
 
@@ -215,6 +272,23 @@ static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
   s += p;
 }
 
+static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3,
+                                uint16x4_t *const s4, uint16x4_t *const s5) {
+  *s0 = vld1_u16(s);
+  s += p;
+  *s1 = vld1_u16(s);
+  s += p;
+  *s2 = vld1_u16(s);
+  s += p;
+  *s3 = vld1_u16(s);
+  s += p;
+  *s4 = vld1_u16(s);
+  s += p;
+  *s5 = vld1_u16(s);
+}
+
 static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
@@ -235,6 +309,65 @@ static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
   *s6 = vld1_u16(s);
 }
 
+static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3,
+                                uint16x4_t *const s4, uint16x4_t *const s5,
+                                uint16x4_t *const s6, uint16x4_t *const s7) {
+  *s0 = vld1_u16(s);
+  s += p;
+  *s1 = vld1_u16(s);
+  s += p;
+  *s2 = vld1_u16(s);
+  s += p;
+  *s3 = vld1_u16(s);
+  s += p;
+  *s4 = vld1_u16(s);
+  s += p;
+  *s5 = vld1_u16(s);
+  s += p;
+  *s6 = vld1_u16(s);
+  s += p;
+  *s7 = vld1_u16(s);
+}
+
+static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p,
+                                 uint16x4_t *const s0, uint16x4_t *const s1,
+                                 uint16x4_t *const s2, uint16x4_t *const s3,
+                                 uint16x4_t *const s4, uint16x4_t *const s5,
+                                 uint16x4_t *const s6, uint16x4_t *const s7,
+                                 uint16x4_t *const s8, uint16x4_t *const s9,
+                                 uint16x4_t *const s10, uint16x4_t *const s11,
+                                 uint16x4_t *const s12, uint16x4_t *const s13) {
+  *s0 = vld1_u16(s);
+  s += p;
+  *s1 = vld1_u16(s);
+  s += p;
+  *s2 = vld1_u16(s);
+  s += p;
+  *s3 = vld1_u16(s);
+  s += p;
+  *s4 = vld1_u16(s);
+  s += p;
+  *s5 = vld1_u16(s);
+  s += p;
+  *s6 = vld1_u16(s);
+  s += p;
+  *s7 = vld1_u16(s);
+  s += p;
+  *s8 = vld1_u16(s);
+  s += p;
+  *s9 = vld1_u16(s);
+  s += p;
+  *s10 = vld1_u16(s);
+  s += p;
+  *s11 = vld1_u16(s);
+  s += p;
+  *s12 = vld1_u16(s);
+  s += p;
+  *s13 = vld1_u16(s);
+}
+
 static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1) {
   *s0 = vld1q_s16(s);
@@ -597,6 +730,56 @@ static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
|
|||||||
vst1_u16(s, s3);
|
vst1_u16(s, s3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride,
|
||||||
|
const uint16x4_t s0, const uint16x4_t s1,
|
||||||
|
const uint16x4_t s2, const uint16x4_t s3,
|
||||||
|
const uint16x4_t s4, const uint16x4_t s5) {
|
||||||
|
vst1_u16(s, s0);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s1);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s2);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s3);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s4);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s5);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride,
|
||||||
|
const uint16x4_t s0, const uint16x4_t s1,
|
||||||
|
const uint16x4_t s2, const uint16x4_t s3,
|
||||||
|
const uint16x4_t s4, const uint16x4_t s5,
|
||||||
|
const uint16x4_t s6, const uint16x4_t s7,
|
||||||
|
const uint16x4_t s8, const uint16x4_t s9,
|
||||||
|
const uint16x4_t s10, const uint16x4_t s11) {
|
||||||
|
vst1_u16(s, s0);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s1);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s2);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s3);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s4);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s5);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s6);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s7);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s8);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s9);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s10);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s11);
|
||||||
|
s += dst_stride;
|
||||||
|
}
|
||||||
|
|
||||||
static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
|
static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
|
||||||
const uint16x8_t s0, const uint16x8_t s1) {
|
const uint16x8_t s0, const uint16x8_t s1) {
|
||||||
vst1q_u16(s, s0);
|
vst1q_u16(s, s0);
|
||||||
|
|||||||

third_party/aom/aom_dsp/x86/synonyms.h | 11 (vendored)
@@ -46,16 +46,6 @@ static inline __m128i xx_loadu_128(const void *a) {
   return _mm_loadu_si128((const __m128i *)a);
 }
 
-// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
-// manually on older compilers.
-#if !defined(__clang__) && __GNUC_MAJOR__ < 9
-static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
-  __m64 hi_, lo_;
-  memcpy(&hi_, hi, sizeof(hi_));
-  memcpy(&lo_, lo, sizeof(lo_));
-  return _mm_set_epi64(hi_, lo_);
-}
-#else
 // Load 64 bits from each of hi and low, and pack into an SSE register
 // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
 // the strict aliasing rule, this takes a different approach
@@ -63,7 +53,6 @@ static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
   return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
                             _mm_loadl_epi64((const __m128i *)hi));
 }
-#endif
 
 static inline void xx_storel_32(void *const a, const __m128i v) {
   const int val = _mm_cvtsi128_si32(v);

third_party/aom/aom_dsp/x86/synonyms_avx2.h | 15 (vendored)
@@ -76,26 +76,11 @@ static inline __m256i yy_loadu_4x64(const void *e3, const void *e2,
   return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
 }
 
-#define GCC_VERSION (__GNUC__ * 10000 \
-                     + __GNUC_MINOR__ * 100 \
-                     + __GNUC_PATCHLEVEL__)
-
-// _mm256_loadu2_m128i has been introduced in GCC 10.1
-#if !defined(__clang__) && GCC_VERSION < 101000
-static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
-  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
-  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
-  return _mm256_set_m128i(mhi, mlo);
-}
-#else
 static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
   __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
   __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
   return yy_set_m128i(mhi, mlo);
 }
-#endif
-
-#undef GCC_VERSION
 
 static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
   _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));

third_party/aom/aom_ports/aom_ports.cmake | 8 (vendored)
@@ -38,6 +38,9 @@ endif()
 list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h"
             "${AOM_ROOT}/aom_ports/ppc_cpudetect.c")
 
+list(APPEND AOM_PORTS_SOURCES_RISCV "${AOM_ROOT}/aom_ports/riscv.h"
+            "${AOM_ROOT}/aom_ports/riscv_cpudetect.c")
+
 # For arm and x86 targets:
 #
 # * Creates the aom_ports build target, adds the includes in aom_ports to the
@@ -68,9 +71,12 @@ function(setup_aom_ports_targets)
   elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
     add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
     set(aom_ports_has_symbols 1)
+  elseif("${AOM_TARGET_CPU}" MATCHES "riscv")
+    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_RISCV})
+    set(aom_ports_has_symbols 1)
   endif()
 
-  if("${AOM_TARGET_CPU}" MATCHES "arm|ppc")
+  if("${AOM_TARGET_CPU}" MATCHES "arm|ppc|riscv")
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
     if(BUILD_SHARED_LIBS)
       target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_ports>)

third_party/aom/aom_ports/riscv.h | 30 (new file, vendored)
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_RISCV_H_
+#define AOM_AOM_PORTS_RISCV_H_
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_RVV 0x01
+
+int riscv_simd_caps(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_PORTS_RISCV_H_

third_party/aom/aom_ports/riscv_cpudetect.c | 38 (new file, vendored)
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/riscv.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+
+#include <sys/auxv.h>
+
+#define HWCAP_RVV (1 << ('v' - 'a'))
+
+int riscv_simd_caps(void) {
+  int flags = 0;
+#if HAVE_RVV
+  unsigned long hwcap = getauxval(AT_HWCAP);
+  if (hwcap & HWCAP_RVV) flags |= HAS_RVV;
+#endif
+  return flags;
+}
+#else
+// If there is no RTCD the function pointers are not used and can not be
+// changed.
+int riscv_simd_caps(void) { return 0; }
+#endif  // CONFIG_RUNTIME_CPU_DETECT

third_party/aom/apps/aomenc.c | 5 (vendored)
@@ -2318,8 +2318,9 @@ int main(int argc, const char **argv_) {
               "match input format.\n",
               stream->config.cfg.g_profile);
     }
-    if ((global.show_psnr == 2) && (stream->config.cfg.g_input_bit_depth ==
-                                    stream->config.cfg.g_bit_depth)) {
+    if (global.show_psnr == 2 &&
+        stream->config.cfg.g_input_bit_depth ==
+            (unsigned int)stream->config.cfg.g_bit_depth) {
       fprintf(stderr,
               "Warning: --psnr==2 and --psnr==1 will provide same "
               "results when input bit-depth == stream bit-depth, "

third_party/aom/av1/av1.cmake | 10 (vendored)
@@ -445,6 +445,9 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
 
 list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c")
 
+list(APPEND AOM_AV1_COMMON_INTRIN_RVV
+            "${AOM_ROOT}/av1/common/riscv/cdef_block_rvv.c")
+
 if(CONFIG_THREE_PASS)
   list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/thirdpass.c"
               "${AOM_ROOT}/av1/encoder/thirdpass.h")
@@ -822,6 +825,13 @@ function(setup_av1_targets)
     endif()
   endif()
 
+  if(HAVE_RVV)
+    if(AOM_AV1_COMMON_INTRIN_RVV)
+      add_intrinsics_object_library("-march=rv64gcv" "rvv" "aom_av1_common"
+                                    "AOM_AV1_COMMON_INTRIN_RVV")
+    endif()
+  endif()
+
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)

third_party/aom/av1/av1_cx_iface.c | 7 (vendored)
@@ -1084,7 +1084,6 @@ static void set_encoder_config(AV1EncoderConfig *oxcf,
   AlgoCfg *const algo_cfg = &oxcf->algo_cfg;
   ToolCfg *const tool_cfg = &oxcf->tool_cfg;
 
-  const int is_vbr = cfg->rc_end_usage == AOM_VBR;
   oxcf->profile = cfg->g_profile;
   oxcf->max_threads = (int)cfg->g_threads;
 
@@ -1167,9 +1166,9 @@ static void set_encoder_config(AV1EncoderConfig *oxcf,
   rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
   rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct;
   rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct;
-  rc_cfg->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
-  rc_cfg->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
-  rc_cfg->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+  rc_cfg->maximum_buffer_size_ms = cfg->rc_buf_sz;
+  rc_cfg->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
+  rc_cfg->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz;
   // Convert target bandwidth from Kbit/s to Bit/s
   rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate;
   rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh;

third_party/aom/av1/common/arm/cfl_neon.c | 19 (vendored)
@@ -13,6 +13,7 @@
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/arm/mem_neon.h"
 #include "av1/common/cfl.h"
 
 static inline void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
@@ -428,10 +429,7 @@ static inline int16x8_t predict_w8(const int16_t *pred_buf_q3,
 static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
                                       int16x8_t alpha_sign, int abs_alpha_q12,
                                       int16x8_t dc) {
-  // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2
-  // does not interleave, but is not currently available in the compilier used
-  // by the AOM build system.
-  const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3);
+  const int16x8x2_t ac_q3 = vld1q_s16_x2(pred_buf_q3);
   const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
   const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
   const int16x8_t scaled_luma_0 =
@@ -447,10 +445,7 @@ static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
 static inline int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
                                       int16x8_t alpha_sign, int abs_alpha_q12,
                                       int16x8_t dc) {
-  // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4
-  // does not interleave, but is not currently available in the compilier used
-  // by the AOM build system.
-  const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3);
+  const int16x8x4_t ac_q3 = vld1q_s16_x4(pred_buf_q3);
   const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
   const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
   const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]);
@@ -497,7 +492,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
           predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
       const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]),
                                      vqmovun_s16(pred.val[1]) } };
-      vst2_u8(dst, predun);
+      vst1_u8_x2(dst, predun);
     } else {
       const int16x8x4_t pred =
          predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
@@ -505,7 +500,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
        { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]),
          vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) }
       };
-      vst4_u8(dst, predun);
+      vst1_u8_x4(dst, predun);
     }
     dst += dst_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
@@ -574,11 +569,11 @@ static inline void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
     } else if (width == 16) {
       const int16x8x2_t pred =
          predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
-      vst2q_u16(dst, clamp2q_s16(pred, max_16x8));
+      vst1q_u16_x2(dst, clamp2q_s16(pred, max_16x8));
     } else {
      const int16x8x4_t pred =
          predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
-      vst4q_u16(dst, clamp4q_s16(pred, max_16x8));
+      vst1q_u16_x4(dst, clamp4q_s16(pred, max_16x8));
     }
     dst += dst_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);

@@ -53,8 +53,7 @@ static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp,
 static AOM_FORCE_INLINE int16x8_t load_filters_1(int ofs) {
   const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS);
 
-  const int16_t *base =
-      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS];
   return vld1q_s16(base + ofs0 * 8);
 }
 
@@ -65,8 +64,7 @@ static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int ofs,
   const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS);
   const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS);
 
-  const int16_t *base =
-      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS];
   out[0] = vld1q_s16(base + ofs0 * 8);
   out[1] = vld1q_s16(base + ofs1 * 8);
   out[2] = vld1q_s16(base + ofs2 * 8);
@@ -84,8 +82,7 @@ static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int ofs,
   const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS);
   const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS);
 
-  const int16_t *base =
-      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS];
   out[0] = vld1q_s16(base + ofs0 * 8);
   out[1] = vld1q_s16(base + ofs1 * 8);
   out[2] = vld1q_s16(base + ofs2 * 8);

third_party/aom/av1/common/arm/warp_plane_neon.c | 12 (vendored)
@@ -101,8 +101,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_4x1_f1_beta0(in, f_s16);
 }
 
@@ -140,8 +139,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }
 
@@ -156,8 +154,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
   int16x4_t s6 = vget_low_s16(src[6]);
   int16x4_t s7 = vget_low_s16(src[7]);
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
@@ -210,8 +207,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
   int16x8_t s6 = src[6];
   int16x8_t s7 = src[7];
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);

third_party/aom/av1/common/arm/warp_plane_neon.h | 56 (vendored)
@@ -61,34 +61,34 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
 
 static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset,
                                             int stride) {
-  out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
+  out[0] = vld1q_s16(
+      av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[1] = vld1q_s16(
+      av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[2] = vld1q_s16(
+      av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[3] = vld1q_s16(
+      av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]);
 }
 
 static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int offset,
                                             int stride) {
-  out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[4] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 4 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[5] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 5 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[6] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 6 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[7] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 7 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
+  out[0] = vld1q_s16(
+      av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[1] = vld1q_s16(
+      av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[2] = vld1q_s16(
+      av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[3] = vld1q_s16(
+      av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[4] = vld1q_s16(
+      av1_warped_filter[(offset + 4 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[5] = vld1q_s16(
+      av1_warped_filter[(offset + 5 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[6] = vld1q_s16(
+      av1_warped_filter[(offset + 6 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[7] = vld1q_s16(
+      av1_warped_filter[(offset + 7 * stride) >> WARPEDDIFF_PREC_BITS]);
 }
 
 static AOM_FORCE_INLINE int clamp_iy(int iy, int height) {
@@ -175,8 +175,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(
   if (p_width == 4) {
     if (beta == 0) {
       if (alpha == 0) {
-        int16x8_t f_s16 = vld1q_s16(
-            (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS)));
+        int16x8_t f_s16 =
+            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16);
       } else {
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha);
@@ -193,8 +193,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(
   } else {
     if (beta == 0) {
      if (alpha == 0) {
-        int16x8_t f_s16 = vld1q_s16(
-            (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS)));
+        int16x8_t f_s16 =
+            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16);
       } else {
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha);

@@ -109,8 +109,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_4x1_f1_beta0(in, f_s16);
 }
 
@@ -145,8 +144,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }
 
@@ -161,8 +159,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
   int16x4_t s6 = vget_low_s16(src[6]);
   int16x4_t s7 = vget_low_s16(src[7]);
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
@@ -215,8 +212,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
   int16x8_t s6 = src[6];
   int16x8_t s7 = src[7];
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);

third_party/aom/av1/common/arm/warp_plane_sve.c | 12 (vendored)
@@ -112,8 +112,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_4x1_f1_beta0(in, f_s16);
 }
 
@@ -148,8 +147,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }
 
@@ -164,8 +162,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
   int16x4_t s6 = vget_low_s16(src[6]);
   int16x4_t s7 = vget_low_s16(src[7]);
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
@@ -215,8 +212,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
   int16x8_t s6 = src[6];
   int16x8_t s7 = src[7];
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);

third_party/aom/av1/common/av1_rtcd_defs.pl | 22 (vendored)
@@ -495,22 +495,22 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   # structs as arguments, which makes the v256 type of the intrinsics
   # hard to support, so optimizations for this target are disabled.
   if ($opts{config} !~ /libs-x86-win32-vs.*/) {
-    specialize qw/cdef_find_dir sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_find_dir sse4_1 avx2 neon rvv/, "$ssse3_x86";
     specialize qw/cdef_find_dir_dual sse4_1 avx2 neon/, "$ssse3_x86";
 
-    specialize qw/cdef_filter_8_0 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_8_1 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_8_2 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_8_3 sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_0 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_1 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_2 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_3 sse4_1 avx2 neon rvv/, "$ssse3_x86";
 
-    specialize qw/cdef_filter_16_0 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_1 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_2 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_3 sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_0 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_1 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_2 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_3 sse4_1 avx2 neon rvv/, "$ssse3_x86";
 
-    specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86";
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86";
+      specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86";
     }
   }
 

third_party/aom/av1/common/riscv/cdef_block_rvv.c | 1354 (new file, vendored)
(File diff suppressed because it is too large)

third_party/aom/av1/common/warped_motion.c | 11 (vendored)
@@ -27,7 +27,8 @@
 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
 // We need an extra 2 taps to fit this in, for a total of 8 taps.
 /* clang-format off */
-const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
+const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1]
+                                         [8] = {
   // [-1, 0)
   { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
   { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
@@ -344,7 +345,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
         const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                          WARPEDPIXEL_PREC_SHIFTS;
         assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-        const int16_t *coeffs = av1_warped_filter[offs];
+        const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
 
         int32_t sum = 1 << offset_bits_horiz;
         for (int m = 0; m < 8; ++m) {
@@ -365,7 +366,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
         const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                          WARPEDPIXEL_PREC_SHIFTS;
         assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-        const int16_t *coeffs = av1_warped_filter[offs];
+        const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
 
         int32_t sum = 1 << offset_bits_vert;
         for (int m = 0; m < 8; ++m) {
@@ -575,7 +576,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                          WARPEDPIXEL_PREC_SHIFTS;
         assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-        const int16_t *coeffs = av1_warped_filter[offs];
+        const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
 
         int32_t sum = 1 << offset_bits_horiz;
         for (int m = 0; m < 8; ++m) {
@@ -599,7 +600,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                          WARPEDPIXEL_PREC_SHIFTS;
         assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-        const int16_t *coeffs = av1_warped_filter[offs];
+        const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
 
         int32_t sum = 1 << offset_bits_vert;
         for (int m = 0; m < 8; ++m) {

third_party/aom/av1/common/warped_motion.h | 9 (vendored)
@@ -33,7 +33,14 @@
 #define WARP_ERROR_BLOCK_LOG 5
 #define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)
 
-extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+#if AOM_ARCH_ARM || AOM_ARCH_AARCH64 || AOM_ARCH_X86 || AOM_ARCH_X86_64
+typedef int16_t WarpedFilterCoeff;
+#else
+typedef int8_t WarpedFilterCoeff;
+#endif
+
+extern const WarpedFilterCoeff
+    av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
 
 DECLARE_ALIGNED(8, extern const int8_t,
                 av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);

third_party/aom/av1/encoder/ratectrl.c | 4 (vendored)
@@ -3822,6 +3822,10 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type,
     resize_reset_rc(cpi, resize_pending_params->width,
                     resize_pending_params->height, cm->width, cm->height);
   }
+  if (svc->temporal_layer_id == 0) {
+    rc->num_col_blscroll_last_tl0 = 0;
+    rc->num_row_blscroll_last_tl0 = 0;
+  }
   // Set the GF interval and update flag.
   if (!rc->rtc_external_ratectrl)
     set_gf_interval_update_onepass_rt(cpi, *frame_type);

third_party/aom/av1/encoder/ratectrl.h | 2 (vendored)
@@ -200,6 +200,8 @@ typedef struct {
   int last_target_size_keyframe;
   int frames_since_scene_change;
   int perc_spatial_flat_blocks;
+  int num_col_blscroll_last_tl0;
+  int num_row_blscroll_last_tl0;
 
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame

third_party/aom/av1/encoder/var_based_part.c | 88 (vendored)
@@ -1325,6 +1325,53 @@ static inline void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x,
   }
 }
 
+static void do_int_pro_motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
+                                         unsigned int *y_sad, int mi_row,
+                                         int mi_col, int source_sad_nonrd) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mi = xd->mi[0];
+  const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+  const int increase_col_sw = source_sad_nonrd > kMedSad &&
+                              !cpi->rc.high_motion_content_screen_rtc &&
+                              (cpi->svc.temporal_layer_id == 0 ||
+                               cpi->rc.num_col_blscroll_last_tl0 > 2);
+  int me_search_size_col = is_screen
+                               ? increase_col_sw ? 512 : 96
+                               : block_size_wide[cm->seq_params->sb_size] >> 1;
+  // For screen use larger search size row motion to capture
+  // vertical scroll, which can be larger motion.
+  int me_search_size_row = is_screen
+                               ? source_sad_nonrd > kMedSad ? 512 : 192
+                               : block_size_high[cm->seq_params->sb_size] >> 1;
+  unsigned int y_sad_zero;
+  *y_sad = av1_int_pro_motion_estimation(
+      cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv, &y_sad_zero,
+      me_search_size_col, me_search_size_row);
+  // The logic below selects whether the motion estimated in the
+  // int_pro_motion() will be used in nonrd_pickmode. Only do this
+  // for screen for now.
+  if (is_screen) {
+    unsigned int thresh_sad =
+        (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
+    if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
+      x->sb_me_partition = 1;
+      x->sb_me_mv.as_int = mi->mv[0].as_int;
+      if (cpi->svc.temporal_layer_id == 0) {
+        if (abs(mi->mv[0].as_mv.col) > 16 && abs(mi->mv[0].as_mv.row) == 0)
+          cpi->rc.num_col_blscroll_last_tl0++;
+        else if (abs(mi->mv[0].as_mv.row) > 16 && abs(mi->mv[0].as_mv.col) == 0)
+          cpi->rc.num_row_blscroll_last_tl0++;
+      }
+    } else {
+      x->sb_me_partition = 0;
+      // Fall back to using zero motion.
+      *y_sad = y_sad_zero;
+      mi->mv[0].as_int = 0;
+    }
+  }
+}
+
 static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
                          unsigned int *y_sad_g, unsigned int *y_sad_alt,
                          unsigned int *y_sad_last,
@@ -1418,42 +1465,11 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
     // so for now force it to 2 based on superblock sad.
     if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2;
 
-    if (est_motion == 1 || est_motion == 2) {
-      if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
-        // For screen only do int_pro_motion for spatial variance above
-        // threshold and motion level above LowSad.
-        if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
-          int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
-          int me_search_size_col =
-              is_screen ? source_sad_nonrd > kMedSad ? 160 : 96
-                        : block_size_wide[cm->seq_params->sb_size] >> 1;
-          // For screen use larger search size row motion to capture
-          // vertical scroll, which can be larger motion.
-          int me_search_size_row =
-              is_screen ? source_sad_nonrd > kMedSad ? 512 : 192
-                        : block_size_high[cm->seq_params->sb_size] >> 1;
-          unsigned int y_sad_zero;
-          *y_sad = av1_int_pro_motion_estimation(
-              cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
-              &y_sad_zero, me_search_size_col, me_search_size_row);
-          // The logic below selects whether the motion estimated in the
-          // int_pro_motion() will be used in nonrd_pickmode. Only do this
-          // for screen for now.
-          if (is_screen) {
-            unsigned int thresh_sad =
-                (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
-            if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
-              x->sb_me_partition = 1;
-              x->sb_me_mv.as_int = mi->mv[0].as_int;
-            } else {
-              x->sb_me_partition = 0;
-              // Fall back to using zero motion.
-              *y_sad = y_sad_zero;
-              mi->mv[0].as_int = 0;
-            }
-          }
-        }
-      }
+    if ((est_motion == 1 || est_motion == 2) && xd->mb_to_right_edge >= 0 &&
+        xd->mb_to_bottom_edge >= 0 && x->source_variance > 100 &&
+        source_sad_nonrd > kLowSad) {
+      do_int_pro_motion_estimation(cpi, x, y_sad, mi_row, mi_col,
+                                   source_sad_nonrd);
     }
 
   if (*y_sad == UINT_MAX) {

@@ -26,6 +26,7 @@ set_aom_detect_var(AOM_ARCH_ARM 0 "Enables ARM architecture.")
 set_aom_detect_var(AOM_ARCH_PPC 0 "Enables PPC architecture.")
 set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.")
 set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.")
+set_aom_detect_var(AOM_ARCH_RISCV 0 "Enables RISC-V architecture.")
 
 # Arm/AArch64 feature flags.
 set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.")
@@ -51,6 +52,9 @@ set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.")
 set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.")
 set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.")
 
+# RISC-V64 feature flags.
+set_aom_detect_var(HAVE_RVV 0 "Enables RVV optimizations.")
+
 # Flags describing the build environment.
 set_aom_detect_var(HAVE_FEXCEPT 0
                    "Internal flag, GNU fenv.h present for target.")
@@ -241,3 +245,6 @@ set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets."
                    ON)
 set_aom_option_var(ENABLE_AVX2
                    "Enables AVX2 optimizations on x86/x86_64 targets." ON)
+
+# RVV intrinsics flags.
+set_aom_option_var(ENABLE_RVV "Enables RVV optimizations on RISC-V targets." ON)

@@ -75,6 +75,8 @@ if(NOT AOM_TARGET_CPU)
     set(AOM_TARGET_CPU "arm64")
   elseif(cpu_lowercase MATCHES "^ppc")
     set(AOM_TARGET_CPU "ppc")
+  elseif(cpu_lowercase MATCHES "^riscv")
+    set(AOM_TARGET_CPU "riscv")
   else()
     message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
                     "supported, falling back to the generic target")

third_party/aom/build/cmake/cpu.cmake | 11 (vendored)
@@ -132,4 +132,15 @@ elseif("${AOM_TARGET_CPU}" MATCHES "^x86")
       set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
     endif()
   endforeach()
+elseif("${AOM_TARGET_CPU}" MATCHES "riscv")
+  set(AOM_ARCH_RISCV64 1)
+  set(RTCD_ARCH_RISCV64 "yes")
+
+  if(ENABLE_RVV)
+    set(HAVE_RVV 1)
+    set(RTCD_HAVE_RVV "yes")
+  else()
+    set(HAVE_RVV 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-rvv)
+  endif()
 endif()

third_party/aom/build/cmake/rtcd.pl | 33 (vendored)
@@ -370,6 +370,36 @@ EOF
   common_bottom;
 }
 
+sub riscv() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#ifdef RTCD_C
+#include "aom_ports/riscv.h"
+static void setup_rtcd_internal(void)
+{
+  int flags = riscv_simd_caps();
+
+  (void)flags;
+
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
 sub unoptimized() {
   determine_indirection "c";
   common_top;
@@ -415,6 +445,9 @@ if ($opts{arch} eq 'x86') {
 } elsif ($opts{arch} eq 'ppc') {
   @ALL_ARCHS = filter(qw/vsx/);
   ppc;
+} elsif ($opts{arch} eq 'riscv') {
+  @ALL_ARCHS = filter(qw/rvv/);
+  riscv;
 } else {
   unoptimized;
 }
63
third_party/aom/test/cdef_test.cc
vendored
63
third_party/aom/test/cdef_test.cc
vendored
@@ -618,7 +618,8 @@ TEST_P(CDEFCopyRect16to16Test, TestSIMDNoMismatch) {
 
 using std::make_tuple;
 
-#if ((AOM_ARCH_X86 && HAVE_SSSE3) || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON)
+#if ((AOM_ARCH_X86 && HAVE_SSSE3) || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON || \
+     HAVE_RVV)
 static const CdefFilterBlockFunctions kCdefFilterFuncC[] = {
   { &cdef_filter_8_0_c, &cdef_filter_8_1_c, &cdef_filter_8_2_c,
     &cdef_filter_8_3_c }
@@ -811,6 +812,46 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif
 
+#if HAVE_RVV
+static const CdefFilterBlockFunctions kCdefFilterFuncRvv[] = {
+  { &cdef_filter_8_0_rvv, &cdef_filter_8_1_rvv, &cdef_filter_8_2_rvv,
+    &cdef_filter_8_3_rvv }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncRvv[] = {
+  { &cdef_filter_16_0_rvv, &cdef_filter_16_1_rvv, &cdef_filter_16_2_rvv,
+    &cdef_filter_16_3_rvv }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
+INSTANTIATE_TEST_SUITE_P(RVV, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_rvv,
+                                                      &cdef_find_dir_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+                                 &cdef_copy_rect8_8bit_to_16bit_rvv)));
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+                                 &cdef_copy_rect8_16bit_to_16bit_rvv)));
+#endif
+
 // Test speed for all supported architectures
 #if AOM_ARCH_X86 && HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(
@@ -905,4 +946,24 @@ INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirDualSpeedTest,
                                                       &cdef_find_dir_dual_c)));
 #endif
 
+#if HAVE_RVV
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFSpeedTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFSpeedHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(10)));
+INSTANTIATE_TEST_SUITE_P(RVV, CDEFFindDirSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_rvv,
+                                                      &cdef_find_dir_c)));
+#endif
+
 }  // namespace

third_party/aom/test/svc_datarate_test.cc (vendored): 41 changed lines
@@ -1078,6 +1078,39 @@ class DatarateTestSVC
 #endif
   }
 
+  virtual void BasicRateTargetingSVC2TL1SLScreenDropFrame1920x1080Test() {
+    cfg_.rc_buf_initial_sz = 50;
+    cfg_.rc_buf_optimal_sz = 50;
+    cfg_.rc_buf_sz = 100;
+    cfg_.rc_dropframe_thresh = 30;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 52;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::Y4mVideoSource video("screendata.1920_1080.y4m", 0, 60);
+
+    const int bitrate_array[2] = { 60, 100 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    screen_mode_ = 1;
+    number_temporal_layers_ = 2;
+    number_spatial_layers_ = 1;
+    target_layer_bitrate_[0] = 60 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_AV1_DECODER
+    // Top temporal layers are non_reference, so exclude them from
+    // mismatch count, since loopfilter/cdef is not applied for these on
+    // encoder side, but is always applied on decoder.
+    // This means 150 = #frames(300) - #TL2_frames(150).
+    // We use LE for screen since loopfilter level can become very small
+    // or zero and then the frame is not a mismatch.
+    EXPECT_LE(GetMismatchFrames(), 150u);
+#endif
+  }
+
   virtual void BasicRateTargetingSVC1TL3SLScreenTest() {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 500;
@@ -2651,6 +2684,14 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLScreenDropFrame) {
   BasicRateTargetingSVC2TL1SLScreenDropFrameTest();
 }
 
+// Check basic rate targeting for CBR, for 2 temporal layers, 1 spatial
+// for screen mode, with frame dropper on at low bitrates. Use small
+// values of rc_buf_initial/optimal/sz to trigger postencode frame drop.
+// Use 1920x1080 clip.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLScreenDropFrame1920x1080) {
+  BasicRateTargetingSVC2TL1SLScreenDropFrame1920x1080Test();
+}
+
 // Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal
 // for screen mode.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLScreen) {
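The new test sets target_layer_bitrate_[0] to 60% of rc_target_bitrate and target_layer_bitrate_[1] to the full target, and it is parameterized over the two bitrates in bitrate_array (60 and 100). A small standalone check of that arithmetic follows; treating the values as kbps matches the rest of the SVC datarate tests, but the program itself is illustrative only and not part of the commit.

    #include <stdio.h>

    int main(void) {
      /* Same values as bitrate_array in the new test. */
      const int bitrate_array[2] = { 60, 100 };
      for (int i = 0; i < 2; ++i) {
        const int target = bitrate_array[i];
        const int tl0 = 60 * target / 100; /* target_layer_bitrate_[0] */
        const int tl1 = target;            /* target_layer_bitrate_[1] */
        printf("rc_target_bitrate=%d: layer 0 = %d, layer 1 = %d\n",
               target, tl0, tl1);
      }
      return 0;
    }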

third_party/aom/test/test-data.sha1 (vendored): 1 changed line
@@ -573,3 +573,4 @@ c7f336958e7af6162c20ddc84d67c7dfa9826910 *av1-1-b8-16-intra_only-intrabc-extreme
 4d1ad6d3070268ccb000d7fc3ae0f5a9447bfe82 *test_input_w1h1.yuv
 ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m
 9c2aa2d0f63f706f775bf661dfa81e8bb3089d8b *wikipedia_420_360p_60f.y4m
+9e4d2ba84ba62f7ea4b617a13af5db9c39e7f0f9 *screendata.1920_1080.y4m

third_party/aom/test/test_data_util.cmake (vendored): 1 changed line
@@ -35,6 +35,7 @@ list(APPEND AOM_TEST_DATA_FILE_NAMES
      "niklas_1280_720_30.y4m"
      "rush_hour_444.y4m"
      "screendata.y4m"
+     "screendata.1920_1080.y4m"
      "niklas_640_480_30.yuv"
      "vase10x10.yuv"
      "vase10x10_tiles.txt"
|
|||||||
Reference in New Issue
Block a user