diff --git a/media/libaom/config/generic/config/aom_config.asm b/media/libaom/config/generic/config/aom_config.asm index 72307c318492..a28ad482eafb 100644 --- a/media/libaom/config/generic/config/aom_config.asm +++ b/media/libaom/config/generic/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 0 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 0 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 0 HAVE_SSE2 equ 0 HAVE_SSE3 equ 0 diff --git a/media/libaom/config/generic/config/aom_config.h b/media/libaom/config/generic/config/aom_config.h index ef68ec51f589..61b49dbc66a6 100644 --- a/media/libaom/config/generic/config/aom_config.h +++ b/media/libaom/config/generic/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 0 #define HAVE_SSE2 0 #define HAVE_SSE3 0 diff --git a/media/libaom/config/linux/arm/config/aom_config.asm b/media/libaom/config/linux/arm/config/aom_config.asm index c3cd1fe3e48e..9fd3159dba6d 100644 --- a/media/libaom/config/linux/arm/config/aom_config.asm +++ b/media/libaom/config/linux/arm/config/aom_config.asm @@ -12,6 +12,7 @@ .equ AOM_ARCH_AARCH64, 0 .equ AOM_ARCH_ARM, 1 .equ AOM_ARCH_PPC, 0 +.equ AOM_ARCH_RISCV, 0 .equ AOM_ARCH_X86, 0 .equ AOM_ARCH_X86_64, 0 .equ CONFIG_ACCOUNTING, 0 @@ -82,6 +83,7 @@ .equ HAVE_NEON, 1 .equ HAVE_NEON_DOTPROD, 0 .equ HAVE_NEON_I8MM, 0 +.equ HAVE_RVV, 0 .equ HAVE_SSE, 0 .equ HAVE_SSE2, 0 .equ HAVE_SSE3, 0 diff --git a/media/libaom/config/linux/arm/config/aom_config.h b/media/libaom/config/linux/arm/config/aom_config.h index 29385cec2267..15350b976cfb 100644 --- a/media/libaom/config/linux/arm/config/aom_config.h +++ b/media/libaom/config/linux/arm/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 1 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 1 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 0 #define HAVE_SSE2 0 #define HAVE_SSE3 0 diff --git a/media/libaom/config/linux/ia32/config/aom_config.asm b/media/libaom/config/linux/ia32/config/aom_config.asm index eb2ddfa7599e..0f2be2761ba4 100644 --- a/media/libaom/config/linux/ia32/config/aom_config.asm +++ b/media/libaom/config/linux/ia32/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 1 AOM_ARCH_X86_64 equ 0 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/linux/ia32/config/aom_config.h b/media/libaom/config/linux/ia32/config/aom_config.h index 8f14a39b5aa3..89ff5324d067 100644 --- a/media/libaom/config/linux/ia32/config/aom_config.h +++ b/media/libaom/config/linux/ia32/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 1 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define 
HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/config/linux/x64/config/aom_config.asm b/media/libaom/config/linux/x64/config/aom_config.asm index d9cfcc099029..3091f2ae3233 100644 --- a/media/libaom/config/linux/x64/config/aom_config.asm +++ b/media/libaom/config/linux/x64/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 1 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/linux/x64/config/aom_config.h b/media/libaom/config/linux/x64/config/aom_config.h index 5bc1b2c7f31e..a86fe4a4eac4 100644 --- a/media/libaom/config/linux/x64/config/aom_config.h +++ b/media/libaom/config/linux/x64/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 1 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/config/mac/arm64/config/aom_config.asm b/media/libaom/config/mac/arm64/config/aom_config.asm index 29a3de8f8be1..c8f503e63601 100644 --- a/media/libaom/config/mac/arm64/config/aom_config.asm +++ b/media/libaom/config/mac/arm64/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 1 AOM_ARCH_ARM equ 1 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 0 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 0 HAVE_NEON equ 1 HAVE_NEON_DOTPROD equ 1 HAVE_NEON_I8MM equ 1 +HAVE_RVV equ 0 HAVE_SSE equ 0 HAVE_SSE2 equ 0 HAVE_SSE3 equ 0 diff --git a/media/libaom/config/mac/arm64/config/aom_config.h b/media/libaom/config/mac/arm64/config/aom_config.h index 2dd1aa07a858..c501b7ef2586 100644 --- a/media/libaom/config/mac/arm64/config/aom_config.h +++ b/media/libaom/config/mac/arm64/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 1 #define AOM_ARCH_ARM 1 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 1 #define HAVE_NEON_DOTPROD 1 #define HAVE_NEON_I8MM 1 +#define HAVE_RVV 0 #define HAVE_SSE 0 #define HAVE_SSE2 0 #define HAVE_SSE3 0 diff --git a/media/libaom/config/mac/x64/config/aom_config.asm b/media/libaom/config/mac/x64/config/aom_config.asm index d9cfcc099029..3091f2ae3233 100644 --- a/media/libaom/config/mac/x64/config/aom_config.asm +++ b/media/libaom/config/mac/x64/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 1 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/mac/x64/config/aom_config.h b/media/libaom/config/mac/x64/config/aom_config.h index 5bc1b2c7f31e..a86fe4a4eac4 100644 --- a/media/libaom/config/mac/x64/config/aom_config.h +++ b/media/libaom/config/mac/x64/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define 
AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 1 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/config/win/ia32/config/aom_config.asm b/media/libaom/config/win/ia32/config/aom_config.asm index 75fd77c753db..c30f034e5502 100644 --- a/media/libaom/config/win/ia32/config/aom_config.asm +++ b/media/libaom/config/win/ia32/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 1 AOM_ARCH_X86_64 equ 0 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/win/ia32/config/aom_config.h b/media/libaom/config/win/ia32/config/aom_config.h index f7a251fe936e..bb73fdd79fc6 100644 --- a/media/libaom/config/win/ia32/config/aom_config.h +++ b/media/libaom/config/win/ia32/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 1 #define AOM_ARCH_X86_64 0 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/config/win/x64/config/aom_config.asm b/media/libaom/config/win/x64/config/aom_config.asm index d9cfcc099029..3091f2ae3233 100644 --- a/media/libaom/config/win/x64/config/aom_config.asm +++ b/media/libaom/config/win/x64/config/aom_config.asm @@ -12,6 +12,7 @@ AOM_ARCH_AARCH64 equ 0 AOM_ARCH_ARM equ 0 AOM_ARCH_PPC equ 0 +AOM_ARCH_RISCV equ 0 AOM_ARCH_X86 equ 0 AOM_ARCH_X86_64 equ 1 CONFIG_ACCOUNTING equ 0 @@ -82,6 +83,7 @@ HAVE_MMX equ 1 HAVE_NEON equ 0 HAVE_NEON_DOTPROD equ 0 HAVE_NEON_I8MM equ 0 +HAVE_RVV equ 0 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 diff --git a/media/libaom/config/win/x64/config/aom_config.h b/media/libaom/config/win/x64/config/aom_config.h index 5bc1b2c7f31e..a86fe4a4eac4 100644 --- a/media/libaom/config/win/x64/config/aom_config.h +++ b/media/libaom/config/win/x64/config/aom_config.h @@ -14,6 +14,7 @@ #define AOM_ARCH_AARCH64 0 #define AOM_ARCH_ARM 0 #define AOM_ARCH_PPC 0 +#define AOM_ARCH_RISCV 0 #define AOM_ARCH_X86 0 #define AOM_ARCH_X86_64 1 #define CONFIG_ACCOUNTING 0 @@ -84,6 +85,7 @@ #define HAVE_NEON 0 #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 +#define HAVE_RVV 0 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 diff --git a/media/libaom/moz.yaml b/media/libaom/moz.yaml index 3da03697b989..893d9c43c0c6 100644 --- a/media/libaom/moz.yaml +++ b/media/libaom/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed (Sun Jan 05 09:13:09 2025 -0800). + release: 3990233fc06a35944d6d33797e63931802122a95 (Thu Jan 30 11:32:16 2025 -0800). 
# Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed + revision: 3990233fc06a35944d6d33797e63931802122a95 # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt index a62100431447..1c37223cbce5 100644 --- a/third_party/aom/CMakeLists.txt +++ b/third_party/aom/CMakeLists.txt @@ -333,6 +333,12 @@ if(CONFIG_AV1_ENCODER) # libaom static library. if(BUILD_SHARED_LIBS) target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom_static) + # TODO: https://aomedia.issues.chromium.org/391715078 - This condition can + # be removed after aom_av1_rc restricts its symbol visibility. + if(CYGWIN OR MINGW) + target_link_options(aom_av1_rc ${AOM_LIB_LINK_TYPE} + LINKER:--allow-multiple-definition) + endif() else() target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom) endif() @@ -858,8 +864,8 @@ if(BUILD_SHARED_LIBS) # errors (don't use it with AddressSanitizer)." See # https://clang.llvm.org/docs/AddressSanitizer.html#usage. Similarly, see # https://clang.llvm.org/docs/MemorySanitizer.html#usage. - if(NOT WIN32 - AND NOT APPLE + if(NOT + (APPLE OR CYGWIN OR WIN32) AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE)) # The -z defs linker option reports unresolved symbol references from object # files when building a shared library. diff --git a/third_party/aom/README.md b/third_party/aom/README.md index db7ad37b898f..2e0902a0ef82 100644 --- a/third_party/aom/README.md +++ b/third_party/aom/README.md @@ -60,7 +60,9 @@ README.md {#LREADME} present, yasm will be used by default. Pass -DENABLE_NASM=ON to cmake to select nasm.) If you download yasm with the intention to work with Visual Studio, please download win32.exe or win64.exe and rename it into yasm.exe. - DO NOT download or use vsyasm.exe. + DO NOT download or use vsyasm.exe. The MSYS2 version of the yasm binary can + also be used and avoids an issue caused by a missing Visual C++ + Redistributable install (Visual Studio 2010, MSVCR100.dll). 6. Building the documentation requires [doxygen version 1.8.10 or newer](http://doxygen.org). 7. 
Emscripten builds require the portable diff --git a/third_party/aom/aom/exports_com b/third_party/aom/aom/exports_com index 1166104aaee1..f3dbeaea33e9 100644 --- a/third_party/aom/aom/exports_com +++ b/third_party/aom/aom/exports_com @@ -10,7 +10,6 @@ text aom_codec_set_option text aom_codec_version text aom_codec_version_extra_str text aom_codec_version_str -text aom_free text aom_img_add_metadata text aom_img_alloc text aom_img_alloc_with_border @@ -25,7 +24,6 @@ text aom_img_plane_width text aom_img_remove_metadata text aom_img_set_rect text aom_img_wrap -text aom_malloc text aom_rb_bytes_read text aom_rb_read_bit text aom_rb_read_literal diff --git a/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c index b2fcc512513f..b3c373e210f7 100644 --- a/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c +++ b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c @@ -15,6 +15,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/arm/transpose_neon.h" +#include "mem_neon.h" static inline int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low, const int16x4_t high) { @@ -226,13 +227,8 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst_p1 = (uint16_t *)(s - 2 * pitch); - uint16_t *const dst_p0 = (uint16_t *)(s - pitch); - uint16_t *const dst_q0 = (uint16_t *)(s); - uint16_t *const dst_q1 = (uint16_t *)(s + pitch); - - const uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1) }; + uint16x4_t src[4]; + load_u16_4x4(s - 2 * pitch, pitch, &src[0], &src[1], &src[2], &src[3]); // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); @@ -247,12 +243,10 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, &needs_filter4_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter4_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); @@ -272,10 +266,9 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output)); } void aom_highbd_lpf_horizontal_4_dual_neon( @@ -290,14 +283,8 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { // Offset by 2 uint16_t values to load from first p1 position. 
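// A minimal standalone sketch (an assumption of mine, not the actual
// aom_dsp/arm/mem_neon.h definitions) of what strided helpers such as the
// load_u16_4x4()/store_u16_4x4() calls introduced above typically look
// like: they load or store four 4-lane rows spaced `stride` elements
// apart, replacing the hand-written per-row pointer arithmetic.
#include <arm_neon.h>
#include <stddef.h>

static inline void load_u16_4x4_sketch(const uint16_t *s, ptrdiff_t stride,
                                       uint16x4_t *r0, uint16x4_t *r1,
                                       uint16x4_t *r2, uint16x4_t *r3) {
  *r0 = vld1_u16(s + 0 * stride);
  *r1 = vld1_u16(s + 1 * stride);
  *r2 = vld1_u16(s + 2 * stride);
  *r3 = vld1_u16(s + 3 * stride);
}

static inline void store_u16_4x4_sketch(uint16_t *s, ptrdiff_t stride,
                                        uint16x4_t r0, uint16x4_t r1,
                                        uint16x4_t r2, uint16x4_t r3) {
  vst1_u16(s + 0 * stride, r0);
  vst1_u16(s + 1 * stride, r1);
  vst1_u16(s + 2 * stride, r2);
  vst1_u16(s + 3 * stride, r3);
}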
- uint16_t *dst = s - 2; - uint16_t *dst_p1 = dst; - uint16_t *dst_p0 = dst + pitch; - uint16_t *dst_q0 = dst + pitch * 2; - uint16_t *dst_q1 = dst + pitch * 3; - - uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1) }; + uint16x4_t src[4]; + load_u16_4x4(s - 2, pitch, &src[0], &src[1], &src[2], &src[3]); transpose_array_inplace_u16_4x4(src); // Adjust thresholds to bitdepth. @@ -313,12 +300,10 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, &needs_filter4_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter4_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); @@ -346,10 +331,7 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, }; transpose_array_inplace_u16_4x4(output); - vst1_u16(dst_p1, output[0]); - vst1_u16(dst_p0, output[1]); - vst1_u16(dst_q0, output[2]); - vst1_u16(dst_q1, output[3]); + store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]); } void aom_highbd_lpf_vertical_4_dual_neon( @@ -379,16 +361,14 @@ static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, // ^^^^^^ sum = vaddq_u16(sum, p0q0); - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^ - sum = vshlq_n_u16(sum, 1); - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 // ^^^^^^ ^^^^^^ // Should dual issue with the left shift. const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4); const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); - sum = vaddq_u16(sum, outer_sum); + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^^^^^^ ^^^^ + sum = vmlaq_n_u16(outer_sum, sum, 2); *p1q1_output = vrshrq_n_u16(sum, 3); @@ -396,11 +376,8 @@ static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, // p0 = p1 - (2 * p2) + q0 + q1 // q0 = q1 - (2 * q2) + p0 + p1 // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1); - // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - sum = vsubq_u16(sum, p2q2_double); + // ^^^^^^^^^^^^^^^^^ + sum = vmlsq_n_u16(sum, p2q2, 2); const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4); sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); @@ -411,16 +388,9 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst_p2 = s - 3 * pitch; - uint16_t *const dst_p1 = s - 2 * pitch; - uint16_t *const dst_p0 = s - pitch; - uint16_t *const dst_q0 = s; - uint16_t *const dst_q1 = s + pitch; - uint16_t *const dst_q2 = s + 2 * pitch; - - const uint16x4_t src[6] = { vld1_u16(dst_p2), vld1_u16(dst_p1), - vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2) }; + uint16x4_t src[6]; + load_u16_4x6(s - 3 * pitch, pitch, &src[0], &src[1], &src[2], &src[3], + &src[4], &src[5]); // Adjust thresholds to bitdepth. 
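// Standalone sketch (my illustration, not part of the patch) of the
// early-exit idiom adopted throughout this file: every lane of a NEON
// comparison mask is either 0x0000 or 0xFFFF, so reinterpreting the
// 64-bit vector and testing it against zero is equivalent to the
// AArch64-only vaddv_u16() == 0 check it replaces, while also compiling
// for 32-bit Arm.
#include <arm_neon.h>
#include <stdbool.h>

static inline bool mask_u16x4_is_all_zero(uint16x4_t mask) {
  return vget_lane_u64(vreinterpret_u64_u16(mask), 0) == 0;
}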
const int outer_thresh = *blimit << (bd - 8); @@ -437,50 +407,56 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch, filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat3_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or - // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6 - // output is not used. uint16x8_t f6_p1q1, f6_p0q0; - const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // filter6() does not apply, but filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). + if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 && + vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p1q1_output = f6_p1q1; + p0q0_output = f6_p0q0; + } else { + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = + vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or + // filter6. Therefore if it is false when |needs_filter_mask| is true, + // filter6 output is not used. + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // filter6() does not apply, but filter4() applies to one or more values. 
+ p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output)); } void aom_highbd_lpf_horizontal_6_dual_neon( @@ -494,17 +470,12 @@ void aom_highbd_lpf_horizontal_6_dual_neon( void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - // Left side of the filter window. - uint16_t *const dst = s - 3; - uint16_t *const dst_0 = dst; - uint16_t *const dst_1 = dst + pitch; - uint16_t *const dst_2 = dst + 2 * pitch; - uint16_t *const dst_3 = dst + 3 * pitch; - // Overread by 2 values. These overreads become the high halves of src_raw[2] // and src_raw[3] after transpose. - uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), - vld1q_u16(dst_2), vld1q_u16(dst_3) }; + uint16x8_t src_raw[4]; + load_u16_8x4(s - 3, pitch, &src_raw[0], &src_raw[1], &src_raw[2], + &src_raw[3]); + transpose_array_inplace_u16_4x8(src_raw); // p2, p1, p0, q0, q1, q2 const uint16x4_t src[6] = { @@ -528,25 +499,10 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat3_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output; // Because we did not return after testing |needs_filter_mask| we know it is @@ -555,17 +511,39 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, // output is not used. uint16x8_t f6_p1q1, f6_p0q0; const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // filter6() does not apply, but filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). 
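// Companion sketch (again my own, under the same mask-layout assumption):
// when every 16-bit lane of the mask is 0xFFFF, the 64-bit reinterpretation
// is all ones, i.e. -1 as a signed value. The new fast paths test this to
// detect "the wide filter applies to every lane" and skip computing
// filter4() and the bit-select blends entirely.
#include <arm_neon.h>
#include <stdbool.h>

static inline bool mask_u16x4_is_all_set(uint16x4_t mask) {
  return vget_lane_s64(vreinterpret_s64_u16(mask), 0) == -1;
}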
+ if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 && + vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p1q1_output = f6_p1q1; + p0q0_output = f6_p0q0; + } else { + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = + vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + if (vget_lane_u64(need_filter6, 0) == 0) { + // filter6() does not apply, but filter4() applies to one or more values. + p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } uint16x4_t output[4] = { @@ -576,11 +554,7 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, }; transpose_array_inplace_u16_4x4(output); - // dst_n starts at p2, so adjust to p1. - vst1_u16(dst_0 + 1, output[0]); - vst1_u16(dst_1 + 1, output[1]); - vst1_u16(dst_2 + 1, output[2]); - vst1_u16(dst_3 + 1, output[3]); + store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]); } void aom_highbd_lpf_vertical_6_dual_neon( @@ -607,18 +581,14 @@ static inline void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, // ^^^^^^^^^^^ const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^ - uint16x8_t sum = vshlq_n_u16(p23q23, 1); - // Add two other terms to make dual issue with shift more likely. 
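// Small sketch (an assumption of mine, not taken from the patch) of the
// strength reduction used in filter6()/filter8()/filter14() above: a left
// shift by one followed by an add or subtract folds into a single
// multiply-accumulate by the constant 2.
#include <arm_neon.h>

static inline uint16x8_t acc_plus_2x(uint16x8_t acc, uint16x8_t x) {
  // Equivalent to vaddq_u16(acc, vshlq_n_u16(x, 1)).
  return vmlaq_n_u16(acc, x, 2);
}

static inline uint16x8_t acc_minus_2x(uint16x8_t acc, uint16x8_t x) {
  // Equivalent to vsubq_u16(acc, vshlq_n_u16(x, 1)).
  return vmlsq_n_u16(acc, x, 2);
}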
// p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 // ^^^^^^^^^^^ const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^^^ - sum = vaddq_u16(sum, p01q01); + // ^^^^^ ^^^^^^^^^^^^^ + uint16x8_t sum = vmlaq_n_u16(p01q01, p23q23, 2); // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 // ^^^^^^ @@ -654,19 +624,9 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst_p3 = s - 4 * pitch; - uint16_t *const dst_p2 = s - 3 * pitch; - uint16_t *const dst_p1 = s - 2 * pitch; - uint16_t *const dst_p0 = s - pitch; - uint16_t *const dst_q0 = s; - uint16_t *const dst_q1 = s + pitch; - uint16_t *const dst_q2 = s + 2 * pitch; - uint16_t *const dst_q3 = s + 3 * pitch; - - const uint16x4_t src[8] = { vld1_u16(dst_p3), vld1_u16(dst_p2), - vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1), - vld1_u16(dst_q2), vld1_u16(dst_q3) }; + uint16x4_t src[8]; + load_u16_4x8(s - 4 * pitch, pitch, &src[0], &src[1], &src[2], &src[3], + &src[4], &src[5], &src[6], &src[7]); // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); @@ -684,54 +644,59 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or - // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 - // output is not used. uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // filter8() does not apply, but filter4() applies to one or more values. - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). 
+ if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && + vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; + } else { + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, + // filter8 output is not used. + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() does not apply, but filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); + store_u16_4x6(s - 3 * pitch, pitch, vget_low_u16(p2q2_output), + vget_low_u16(p1q1_output), vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), vget_high_u16(p1q1_output), + vget_high_u16(p2q2_output)); } void aom_highbd_lpf_horizontal_8_dual_neon( @@ -749,16 +714,10 @@ static inline uint16x8_t reverse_low_half(const uint16x8_t a) { void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst = s - 4; - uint16_t *const dst_0 = dst; - uint16_t *const dst_1 = dst + pitch; - uint16_t *const dst_2 = dst + 2 * pitch; - uint16_t *const dst_3 = dst + 3 * pitch; - // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. // To get desired pairs after transpose, one half should be reversed. 
- uint16x8_t src[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3) }; + uint16x8_t src[4]; + load_u16_8x4(s - 4, pitch, &src[0], &src[1], &src[2], &src[3]); // src[0] = p0q0 // src[1] = p1q1 @@ -783,45 +742,54 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or - // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 - // output is not used. - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // filter8() does not apply, but filter4() applies to one or more values. - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). + if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && + vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; + } else { + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = + vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, + // filter8 output is not used. 
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() does not apply, but filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 }; @@ -831,10 +799,9 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, // Reverse p values to produce original order: // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, reverse_low_half(output[0])); - vst1q_u16(dst_1, reverse_low_half(output[1])); - vst1q_u16(dst_2, reverse_low_half(output[2])); - vst1q_u16(dst_3, reverse_low_half(output[3])); + store_u16_8x4(s - 4, pitch, reverse_low_half(output[0]), + reverse_low_half(output[1]), reverse_low_half(output[2]), + reverse_low_half(output[3])); } void aom_highbd_lpf_vertical_8_dual_neon( @@ -864,8 +831,8 @@ static inline void filter14( // ^^^^^^^^^^^^^^^^^^^ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) // ^^^^^^^^^^^^^^^^^^^ - uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1); - sum = vaddq_u16(sum, p6q6_x7); + const uint16x8_t p45q45 = vaddq_u16(p5q5, p4q4); + uint16x8_t sum = vmlaq_n_u16(p6q6_x7, p45q45, 2); // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 // ^^^^^^^ @@ -938,27 +905,10 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst_p6 = s - 7 * pitch; - uint16_t *const dst_p5 = s - 6 * pitch; - uint16_t *const dst_p4 = s - 5 * pitch; - uint16_t *const dst_p3 = s - 4 * pitch; - uint16_t *const dst_p2 = s - 3 * pitch; - uint16_t *const dst_p1 = s - 2 * pitch; - uint16_t *const dst_p0 = s - pitch; - uint16_t *const dst_q0 = s; - uint16_t *const dst_q1 = s + pitch; - uint16_t *const dst_q2 = s + 2 * pitch; - uint16_t *const dst_q3 = s + 3 * pitch; - uint16_t *const dst_q4 = s + 4 * pitch; - uint16_t *const dst_q5 = s + 5 * pitch; - uint16_t *const dst_q6 = s + 6 * pitch; - - const uint16x4_t src[14] = { - vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), - vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), - vld1_u16(dst_q5), vld1_u16(dst_q6) - }; + uint16x4_t src[14]; + load_u16_4x14(s - 7 * pitch, pitch, &src[0], &src[1], &src[2], &src[3], + &src[4], &src[5], &src[6], &src[7], &src[8], &src[9], &src[10], + &src[11], &src[12], &src[13]); // Adjust thresholds to bitdepth. 
const int outer_thresh = *blimit << (bd - 8); @@ -976,12 +926,10 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } -#endif // AOM_ARCH_AARCH64 const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); @@ -991,85 +939,102 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, const uint16x4_t is_flat4_outer_mask = vand_u16( is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), vabdq_u16(p0q0, p6q6), bd)); - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or - // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 - // output is not used. uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // filter8() and filter14() do not apply, but filter4() applies to one or - // more values. + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) { + // filter14() applies to all values. + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = f14_p5q5; + p4q4_output = f14_p4q4; + p3q3_output = f14_p3q3; + p2q2_output = f14_p2q2; + p1q1_output = f14_p1q1; + p0q0_output = f14_p0q0; + } else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && + vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) { + // filter8() applies to all values. + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // filter14() does not apply, but filter8() and filter4() apply to one or + // Copy the masks to the high bits for packed comparisons later. 
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, + // filter8 output is not used. + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() and filter14() do not apply, but filter4() applies to one or // more values. p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { - // All filters may contribute values to final outputs. - const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = + vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // filter14() does not apply, but filter8() and filter4() apply to one + // or more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. 
+ const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } } - vst1_u16(dst_p5, vget_low_u16(p5q5_output)); - vst1_u16(dst_p4, vget_low_u16(p4q4_output)); - vst1_u16(dst_p3, vget_low_u16(p3q3_output)); - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); - vst1_u16(dst_q3, vget_high_u16(p3q3_output)); - vst1_u16(dst_q4, vget_high_u16(p4q4_output)); - vst1_u16(dst_q5, vget_high_u16(p5q5_output)); + store_u16_4x12(s - 6 * pitch, pitch, vget_low_u16(p5q5_output), + vget_low_u16(p4q4_output), vget_low_u16(p3q3_output), + vget_low_u16(p2q2_output), vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), vget_high_u16(p2q2_output), + vget_high_u16(p3q3_output), vget_high_u16(p4q4_output), + vget_high_u16(p5q5_output)); } void aom_highbd_lpf_horizontal_14_dual_neon( @@ -1107,23 +1072,17 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - uint16_t *const dst = s - 8; - uint16_t *const dst_0 = dst; - uint16_t *const dst_1 = dst + pitch; - uint16_t *const dst_2 = dst + 2 * pitch; - uint16_t *const dst_3 = dst + 3 * pitch; - // Low halves: p7 p6 p5 p4 // High halves: p3 p2 p1 p0 - uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3) }; + uint16x8_t src_p[4]; + load_u16_8x4(s - 8, pitch, &src_p[0], &src_p[1], &src_p[2], &src_p[3]); // p7 will be the low half of src_p[0]. Not used until the end. transpose_array_inplace_u16_4x8(src_p); // Low halves: q0 q1 q2 q3 // High halves: q4 q5 q6 q7 - uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8), - vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) }; + uint16x8_t src_q[4]; + load_u16_8x4(s, pitch, &src_q[0], &src_q[1], &src_q[2], &src_q[3]); // q7 will be the high half of src_q[3]. Not used until the end. transpose_array_inplace_u16_4x8(src_q); @@ -1144,12 +1103,11 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); -#if AOM_ARCH_AARCH64 - if (vaddv_u16(needs_filter_mask) == 0) { + if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. 
return; } -#endif // AOM_ARCH_AARCH64 + const uint16x8_t p4q4 = vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); const uint16x8_t p5q5 = @@ -1164,71 +1122,96 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, const uint16x4_t is_flat4_outer_mask = vand_u16( is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), vabdq_u16(p0q0, p6q6), bd)); - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or - // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 - // output is not used. uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // filter8() and filter14() do not apply, but filter4() applies to one or - // more values. + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) { + // filter14() applies to all values. + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = f14_p5q5; + p4q4_output = f14_p4q4; + p3q3_output = f14_p3q3; + p2q2_output = f14_p2q2; + p1q1_output = f14_p1q1; + p0q0_output = f14_p0q0; + } else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && + vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) { + // filter8() applies to all values. + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // filter14() does not apply, but filter8() and filter4() apply to one or + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = + vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, + // filter8 output is not used. 
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() and filter14() do not apply, but filter4() applies to one or // more values. p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { - // All filters may contribute values to final outputs. - const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = + vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // filter14() does not apply, but filter8() and filter4() apply to one + // or more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. 
+ const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } } } + // To get the correctly ordered rows from the transpose, we need: // p7p3 p6p2 p5p1 p4p0 // q0q4 q1q5 q2q6 q3q7 @@ -1236,23 +1219,20 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output); const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output); const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output); + uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0], p5p1_q1q5.val[0], p4p0_q0q4.val[0] }; - transpose_array_inplace_u16_4x8(output_p); uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1], p6p2_q2q6.val[1], p7p3_q3q7.val[1] }; + + transpose_array_inplace_u16_4x8(output_p); transpose_array_inplace_u16_4x8(output_q); // Reverse p values to produce original order: // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, output_p[0]); - vst1q_u16(dst_0 + 8, output_q[0]); - vst1q_u16(dst_1, output_p[1]); - vst1q_u16(dst_1 + 8, output_q[1]); - vst1q_u16(dst_2, output_p[2]); - vst1q_u16(dst_2 + 8, output_q[2]); - vst1q_u16(dst_3, output_p[3]); - vst1q_u16(dst_3 + 8, output_q[3]); + store_u16_8x4(s - 8, pitch, output_p[0], output_p[1], output_p[2], + output_p[3]); + store_u16_8x4(s, pitch, output_q[0], output_q[1], output_q[2], output_q[3]); } void aom_highbd_lpf_vertical_14_dual_neon( diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c index 6beb73ca0d6b..1c0b24ad1af4 100644 --- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c +++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c @@ -146,473 +146,393 @@ static inline uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1, return mask_8x8; } -static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4, - uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, - uint8x8_t *p0q0, const uint8_t blimit, - const uint8_t limit, const uint8_t thresh) { - uint16x8_t out; - uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, - out_f14_pq5; - uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; - uint8x8_t out_f4_pq0, out_f4_pq1; - uint8x8_t mask_8x8, flat_8x8, flat2_8x8; - uint8x8_t q0p0, q1p1, q2p2; - - // Calculate filter masks - mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); - flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); - flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); - { - // filter 4 - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t temp0_8x8, temp1_8x8; - int8x8_t 
ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - hev_8x8 = vmvn_s8(hev_8x8); - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); - } - // reverse p and q - q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); - q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); - q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); - { - // filter 8 - uint16x8_t out_pq0, out_pq1, out_pq2; - out = vaddl_u8(*p3q3, *p2q2); - out = vaddw_u8(out, *p1q1); - out = vaddw_u8(out, *p0q0); - - out = vaddw_u8(out, q0p0); - out_pq1 = vaddw_u8(out, *p3q3); - out_pq2 = vaddw_u8(out_pq1, *p3q3); - out_pq2 = vaddw_u8(out_pq2, *p2q2); - out_pq1 = vaddw_u8(out_pq1, *p1q1); - out_pq1 = vaddw_u8(out_pq1, q1p1); - - out_pq0 = vaddw_u8(out, *p0q0); - out_pq0 = vaddw_u8(out_pq0, q1p1); - out_pq0 = vaddw_u8(out_pq0, q2p2); - - out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); - out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); - out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); - } - { - // filter 14 - uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5; - uint16x8_t p6q6_2, p6q6_temp, qp_sum; - uint8x8_t qp_rev; - - out = vaddw_u8(out, *p4q4); - out = vaddw_u8(out, *p5q5); - out = vaddw_u8(out, *p6q6); - - out_pq5 = vaddw_u8(out, *p4q4); - out_pq4 = vaddw_u8(out_pq5, *p3q3); - out_pq3 = vaddw_u8(out_pq4, *p2q2); - - out_pq5 = vaddw_u8(out_pq5, *p5q5); - out_pq4 = vaddw_u8(out_pq4, *p5q5); - - out_pq0 = vaddw_u8(out, *p1q1); - out_pq1 = vaddw_u8(out_pq0, *p2q2); - out_pq2 = vaddw_u8(out_pq1, *p3q3); - - out_pq0 = vaddw_u8(out_pq0, *p0q0); - out_pq1 = vaddw_u8(out_pq1, *p0q0); - - out_pq1 = vaddw_u8(out_pq1, *p6q6); - p6q6_2 = vaddl_u8(*p6q6, *p6q6); - out_pq2 = vaddq_u16(out_pq2, 
p6q6_2); - p6q6_temp = vaddw_u8(p6q6_2, *p6q6); - out_pq3 = vaddq_u16(out_pq3, p6q6_temp); - p6q6_temp = vaddw_u8(p6q6_temp, *p6q6); - out_pq4 = vaddq_u16(out_pq4, p6q6_temp); - p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2); - out_pq5 = vaddq_u16(out_pq5, p6q6_temp); - - out_pq4 = vaddw_u8(out_pq4, q1p1); - - qp_sum = vaddl_u8(q2p2, q1p1); - out_pq3 = vaddq_u16(out_pq3, qp_sum); - - qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3))); - qp_sum = vaddw_u8(qp_sum, qp_rev); - out_pq2 = vaddq_u16(out_pq2, qp_sum); - - qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4))); - qp_sum = vaddw_u8(qp_sum, qp_rev); - out_pq1 = vaddq_u16(out_pq1, qp_sum); - - qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5))); - qp_sum = vaddw_u8(qp_sum, qp_rev); - out_pq0 = vaddq_u16(out_pq0, qp_sum); - - out_pq0 = vaddw_u8(out_pq0, q0p0); - - out_f14_pq0 = vrshrn_n_u16(out_pq0, 4); - out_f14_pq1 = vrshrn_n_u16(out_pq1, 4); - out_f14_pq2 = vrshrn_n_u16(out_pq2, 4); - out_f14_pq3 = vrshrn_n_u16(out_pq3, 4); - out_f14_pq4 = vrshrn_n_u16(out_pq4, 4); - out_f14_pq5 = vrshrn_n_u16(out_pq5, 4); - } - { - uint8x8_t filter4_cond, filter8_cond, filter14_cond; - filter8_cond = vand_u8(flat_8x8, mask_8x8); - filter4_cond = vmvn_u8(filter8_cond); - filter14_cond = vand_u8(filter8_cond, flat2_8x8); - - // filter4 outputs - *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); - *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); - - // filter8 outputs - *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); - *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); - *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); - - // filter14 outputs - *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); - *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); - *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); - *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); - *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); - *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); - } -} - -static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, - uint8x8_t *p0q0, const uint8_t blimit, - const uint8_t limit, const uint8_t thresh) { - uint16x8_t out; - uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; - uint8x8_t out_f4_pq0, out_f4_pq1; - uint8x8_t mask_8x8, flat_8x8; - - // Calculate filter masks - mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); - flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); - { - // filter 4 - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = 
vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - hev_8x8 = vmvn_s8(hev_8x8); - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); - } - { - // filter 8 - uint16x8_t out_pq0, out_pq1, out_pq2; - uint8x8_t q0p0, q1p1, q2p2; - - out = vaddl_u8(*p3q3, *p2q2); - out = vaddw_u8(out, *p1q1); - out = vaddw_u8(out, *p0q0); - - // reverse p and q - q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); - q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); - q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); - - out = vaddw_u8(out, q0p0); - out_pq1 = vaddw_u8(out, *p3q3); - out_pq2 = vaddw_u8(out_pq1, *p3q3); - out_pq2 = vaddw_u8(out_pq2, *p2q2); - out_pq1 = vaddw_u8(out_pq1, *p1q1); - out_pq1 = vaddw_u8(out_pq1, q1p1); - - out_pq0 = vaddw_u8(out, *p0q0); - out_pq0 = vaddw_u8(out_pq0, q1p1); - out_pq0 = vaddw_u8(out_pq0, q2p2); - - out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); - out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); - out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); - } - { - uint8x8_t filter4_cond, filter8_cond; - filter8_cond = vand_u8(flat_8x8, mask_8x8); - filter4_cond = vmvn_u8(filter8_cond); - - // filter4 outputs - *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); - *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); - - // filter8 outputs - *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); - *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); - *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); - } -} - -static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, - const uint8_t blimit, const uint8_t limit, - const uint8_t thresh) { - uint16x8_t out; - uint8x8_t out_f6_pq0, out_f6_pq1; - uint8x8_t out_f4_pq0, out_f4_pq1; - uint8x8_t mask_8x8, flat_8x8; - - // Calculate filter masks - mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); - flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); - { - // filter 4 - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = 
vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vbic_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); - } - { - // filter 6 - uint16x8_t out_pq0, out_pq1; - uint8x8_t pq_rev; - - out = vaddl_u8(*p0q0, *p1q1); - out = vaddq_u16(out, out); - out = vaddw_u8(out, *p2q2); - - pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); - out = vaddw_u8(out, pq_rev); - - out_pq0 = vaddw_u8(out, pq_rev); - pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); - out_pq0 = vaddw_u8(out_pq0, pq_rev); - - out_pq1 = vaddw_u8(out, *p2q2); - out_pq1 = vaddw_u8(out_pq1, *p2q2); - - out_f6_pq0 = vrshrn_n_u16(out_pq0, 3); - out_f6_pq1 = vrshrn_n_u16(out_pq1, 3); - } - { - uint8x8_t filter4_cond, filter6_cond; - filter6_cond = vand_u8(flat_8x8, mask_8x8); - filter4_cond = vmvn_u8(filter6_cond); - - // filter4 outputs - *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); - *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); - - // filter6 outputs - *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); - *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); - } -} - -static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, - const uint8_t limit, const uint8_t thresh) { - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; +static inline void filter4(const uint8x8_t p0q0, const uint8x8_t p1q1, + uint8x8_t *p0q0_output, uint8x8_t *p1q1_output, + uint8x8_t mask_8x8, const uint8_t thresh) { const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t mask_8x8, temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; const int8x8_t sign_mask = vdup_n_s8(0x80); const int8x8_t val_4 = vdup_n_s8(4); const int8x8_t val_3 = vdup_n_s8(3); - // Calculate filter mask - mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); + int8x8_t pq_s0 = veor_s8(vreinterpret_s8_u8(p0q0), sign_mask); + int8x8_t pq_s1 = veor_s8(vreinterpret_s8_u8(p1q1), sign_mask); - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), 
vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + int32x2x2_t ps0_qs0 = + vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + int32x2x2_t ps1_qs1 = + vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + int8x8_t ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + int8x8_t qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + int8x8_t ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + int8x8_t qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + uint8x8_t temp0_8x8 = vcgt_u8(vabd_u8(p0q0, p1q1), thresh_f4); + uint8x8_t temp1_8x8 = + vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + int8x8_t hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + int8x8_t filter_s8 = vqsub_s8(ps1_s8, qs1_s8); filter_s8 = vand_s8(filter_s8, hev_8x8); // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); + int8x8_t temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + int16x8_t filter_s16 = vmovl_s8(filter_s8); filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); filter_s8 = vqmovn_s16(filter_s16); filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); + int8x8_t filter1_s8 = vqadd_s8(filter_s8, val_4); + int8x8_t filter2_s8 = vqadd_s8(filter_s8, val_3); filter1_s8 = vshr_n_s8(filter1_s8, 3); filter2_s8 = vshr_n_s8(filter2_s8, 3); - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + int8x8_t oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + int8x8_t op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); filter_s8 = vrshr_n_s8(filter1_s8, 1); filter_s8 = vbic_s8(filter_s8, hev_8x8); - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + int8x8_t oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + int8x8_t op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + *p0q0_output = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + *p1q1_output = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); +} + +static inline void filter8(const uint8x8_t p0q0, const uint8x8_t p1q1, + const uint8x8_t p2q2, const uint8x8_t p3q3, + uint8x8_t *p0q0_output, uint8x8_t *p1q1_output, + uint8x8_t *p2q2_output) { + // Reverse p and q. 
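For reference, a scalar model of the filter4() helper the patch factors out above; it assumes the pixels have already been biased into the signed domain by the XOR with 0x80, and the names are illustrative rather than libaom API:

#include <stdint.h>

static inline int8_t clamp_s8(int v) {
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

/* mask and hev are per-pixel booleans here; the NEON code carries them as
 * 0x00/0xFF lane masks and applies them with vand_s8/vbic_s8 instead of
 * branching. */
static void filter4_scalar(int8_t *p1, int8_t *p0, int8_t *q0, int8_t *q1,
                           int mask, int hev) {
  int f = hev ? clamp_s8(*p1 - *q1) : 0;                 /* outer-tap seed */
  f = mask ? clamp_s8(f + 3 * clamp_s8(*q0 - *p0)) : 0;  /* inner taps */
  const int f1 = clamp_s8(f + 4) >> 3;
  const int f2 = clamp_s8(f + 3) >> 3;
  *q0 = clamp_s8(*q0 - f1);
  *p0 = clamp_s8(*p0 + f2);
  const int f3 = hev ? 0 : (f1 + 1) >> 1;  /* rounded, skipped on high edge
                                              variance */
  *q1 = clamp_s8(*q1 - f3);
  *p1 = clamp_s8(*p1 + f3);
}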
+ uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4); + uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4); + uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4); + + uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1); + uint16x8_t p2q2_p3q3 = vaddl_u8(p3q3, p2q2); + uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3); + + uint16x8_t q0p0_p3q3 = vaddl_u8(q0p0, p3q3); + uint16x8_t out_q0p0_p3q3 = vaddq_u16(out, q0p0_p3q3); + + uint16x8_t out_pq2 = vaddq_u16(out_q0p0_p3q3, p2q2_p3q3); + + uint16x8_t p1q1_q1p1 = vaddl_u8(p1q1, q1p1); + uint16x8_t out_pq1 = vaddq_u16(out_q0p0_p3q3, p1q1_q1p1); + + uint16x8_t q0p0_p0q0 = vaddl_u8(q0p0, p0q0); + uint16x8_t q1p1_q2p2 = vaddl_u8(q1p1, q2p2); + uint16x8_t out_pq0 = vaddq_u16(q0p0_p0q0, q1p1_q2p2); + out_pq0 = vaddq_u16(out_pq0, out); + + *p0q0_output = vrshrn_n_u16(out_pq0, 3); + *p1q1_output = vrshrn_n_u16(out_pq1, 3); + *p2q2_output = vrshrn_n_u16(out_pq2, 3); +} + +static inline void filter14(const uint8x8_t p0q0, const uint8x8_t p1q1, + const uint8x8_t p2q2, const uint8x8_t p3q3, + const uint8x8_t p4q4, const uint8x8_t p5q5, + const uint8x8_t p6q6, uint8x8_t *p0q0_output, + uint8x8_t *p1q1_output, uint8x8_t *p2q2_output, + uint8x8_t *p3q3_output, uint8x8_t *p4q4_output, + uint8x8_t *p5q5_output) { + // Reverse p and q. + uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4); + uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4); + uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4); + uint8x8_t q3p3 = vext_u8(p3q3, p3q3, 4); + uint8x8_t q4p4 = vext_u8(p4q4, p4q4, 4); + uint8x8_t q5p5 = vext_u8(p5q5, p5q5, 4); + + uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1); + uint16x8_t p2q2_p3q3 = vaddl_u8(p2q2, p3q3); + uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3); + + uint16x8_t q0p0_p4q4 = vaddl_u8(q0p0, p4q4); + uint16x8_t p5q5_p6q6 = vaddl_u8(p5q5, p6q6); + uint16x8_t tmp = vaddq_u16(q0p0_p4q4, p5q5_p6q6); + // This offset removes the need for a rounding shift at the end. 
+ uint16x8_t tmp_offset = vaddq_u16(tmp, vdupq_n_u16(1 << 3)); + out = vaddq_u16(out, tmp_offset); + + uint16x8_t out_pq5 = vaddw_u8(out, p4q4); + uint16x8_t out_pq4 = vaddw_u8(out_pq5, p3q3); + uint16x8_t out_pq3 = vaddw_u8(out_pq4, p2q2); + + out_pq5 = vaddw_u8(out_pq5, p5q5); + + uint16x8_t out_pq0 = vaddw_u8(out, p1q1); + uint16x8_t out_pq1 = vaddw_u8(out_pq0, p2q2); + uint16x8_t out_pq2 = vaddw_u8(out_pq1, p3q3); + + uint16x8_t p0q0_q0p0 = vaddl_u8(p0q0, q0p0); + out_pq0 = vaddq_u16(out_pq0, p0q0_q0p0); + + uint16x8_t p0q0_p6q6 = vaddl_u8(p0q0, p6q6); + out_pq1 = vaddq_u16(out_pq1, p0q0_p6q6); + uint16x8_t p5q5_q1p1 = vaddl_u8(p5q5, q1p1); + out_pq4 = vaddq_u16(out_pq4, p5q5_q1p1); + + uint16x8_t p6q6_p6q6 = vaddl_u8(p6q6, p6q6); + out_pq2 = vaddq_u16(out_pq2, p6q6_p6q6); + uint16x8_t p6q6_temp = vaddw_u8(p6q6_p6q6, p6q6); + out_pq3 = vaddq_u16(out_pq3, p6q6_temp); + p6q6_temp = vaddw_u8(p6q6_temp, p6q6); + out_pq4 = vaddq_u16(out_pq4, p6q6_temp); + p6q6_temp = vaddq_u16(p6q6_temp, p6q6_p6q6); + out_pq5 = vaddq_u16(out_pq5, p6q6_temp); + + uint16x8_t qp_sum = vaddl_u8(q2p2, q1p1); + out_pq3 = vaddq_u16(out_pq3, qp_sum); + + qp_sum = vaddw_u8(qp_sum, q3p3); + out_pq2 = vaddq_u16(out_pq2, qp_sum); + + qp_sum = vaddw_u8(qp_sum, q4p4); + out_pq1 = vaddq_u16(out_pq1, qp_sum); + + qp_sum = vaddw_u8(qp_sum, q5p5); + out_pq0 = vaddq_u16(out_pq0, qp_sum); + + *p0q0_output = vshrn_n_u16(out_pq0, 4); + *p1q1_output = vshrn_n_u16(out_pq1, 4); + *p2q2_output = vshrn_n_u16(out_pq2, 4); + *p3q3_output = vshrn_n_u16(out_pq3, 4); + *p4q4_output = vshrn_n_u16(out_pq4, 4); + *p5q5_output = vshrn_n_u16(out_pq5, 4); +} + +static inline void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, + uint8x8_t *p4q4, uint8x8_t *p3q3, + uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, + out_f14_pq5; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + + // Calculate filter masks. + uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + uint8x8_t flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); + + // No filtering. + if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { + return; + } + + uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8); + uint8x8_t filter4_cond = vmvn_u8(filter8_cond); + uint8x8_t filter14_cond = vand_u8(filter8_cond, flat2_8x8); + + if (vget_lane_s64(vreinterpret_s64_u8(filter14_cond), 0) == -1) { + // Only filter14() applies. + filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0, + &out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4, + &out_f14_pq5); + + *p0q0 = out_f14_pq0; + *p1q1 = out_f14_pq1; + *p2q2 = out_f14_pq2; + *p3q3 = out_f14_pq3; + *p4q4 = out_f14_pq4; + *p5q5 = out_f14_pq5; + } else if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 && + vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) { + // Only filter8() applies. + filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2); + + *p0q0 = out_f7_pq0; + *p1q1 = out_f7_pq1; + *p2q2 = out_f7_pq2; + } else { + filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); + + if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 && + vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) { + // filter8() and filter14() do not apply, but filter4() applies to one or + // more values. 
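The (1 << 3) bias added near the top of filter14() is the usual trick of folding the rounding constant into the accumulator: for the final shift by 4, round(x / 16) = (x + 8) >> 4, so pre-adding 8 once lets all six narrowings use the truncating vshrn_n_u16 instead of the rounding vrshrn_n_u16. As an identity (illustrative, and valid here because the pixel sums stay far below 16-bit overflow):

/* For the accumulator ranges used in filter14():
 *   vrshrn_n_u16(x, 4) == vshrn_n_u16(vaddq_u16(x, vdupq_n_u16(1 << 3)), 4)
 * filter8() and filter6() keep vrshrn_n_u16(..., 3) because they do not
 * pre-add their rounding bias of 4. */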
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + } else { + filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, + &out_f7_pq2); + + if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0) { + // filter14() does not apply, but filter8() and filter4() apply to one + // or more values. filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + } else { + // All filters may contribute values to final outputs. + filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0, + &out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4, + &out_f14_pq5); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + + // filter14 outputs + *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); + *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); + *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); + *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); + *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); + *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); + } + } + } +} + +static inline void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + + // Calculate filter masks. + uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + + // No filtering. + if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { + return; + } + + uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8); + uint8x8_t filter4_cond = vmvn_u8(filter8_cond); + + // Not needing filter4() at all is a very common case, so isolate it to avoid + // needlessly computing filter4(). + if (vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) { + filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2); + + *p0q0 = out_f7_pq0; + *p1q1 = out_f7_pq1; + *p2q2 = out_f7_pq2; + } else { + filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); + + if (vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) { + // filter8() does not apply, but filter4() applies to one or more values. 
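Spelled out per pixel, the mask algebra shared by the rewritten lpf_* functions (the filterN_cond masks built from the needs-filter and flatness masks) amounts to the ladder below; the scalar vget_lane tests merely detect the cases where one rung holds for every lane, so the vbsl_u8 blends can be skipped entirely. This is a descriptive sketch, not patch code:

/* Per pixel:
 *   if (!mask)        leave the pixels unchanged;
 *   else if (!flat)   take the filter4() outputs (p1, p0, q0, q1 only);
 *   else if (!flat2)  take the filter8() outputs (p2 .. q2);   [lpf_14 only]
 *   else              take the filter14() outputs (p5 .. q5).  [lpf_14 only]
 * lpf_8_neon and lpf_6_neon use the same first two rungs (with filter8() or
 * filter6() when flat is set), and lpf_4_neon only ever uses filter4(). */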
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + } else { + filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, + &out_f7_pq2); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + } + } +} + +static inline void filter6(const uint8x8_t p0q0, const uint8x8_t p1q1, + const uint8x8_t p2q2, uint8x8_t *p0q0_output, + uint8x8_t *p1q1_output) { + uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4); + + uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1); + uint16x8_t out = vaddq_u16(p0q0_p1q1, p0q0_p1q1); + + uint16x8_t q0p0_p2q2 = vaddl_u8(q0p0, p2q2); + out = vaddq_u16(out, q0p0_p2q2); + + uint16x8_t q0p0_q1p1 = vextq_u16(p0q0_p1q1, p0q0_p1q1, 4); + uint16x8_t out_pq0 = vaddq_u16(out, q0p0_q1p1); + + uint16x8_t p2q2_p2q2 = vaddl_u8(p2q2, p2q2); + uint16x8_t out_pq1 = vaddq_u16(out, p2q2_p2q2); + + *p0q0_output = vrshrn_n_u16(out_pq0, 3); + *p1q1_output = vrshrn_n_u16(out_pq1, 3); +} + +static inline void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, + const uint8_t blimit, const uint8_t limit, + const uint8_t thresh) { + uint8x8_t out_f6_pq0, out_f6_pq1; + uint8x8_t out_f4_pq0, out_f4_pq1; + + // Calculate filter masks. + uint8x8_t mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); + uint8x8_t flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); + + // No filtering. + if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { + return; + } + + uint8x8_t filter6_cond = vand_u8(flat_8x8, mask_8x8); + uint8x8_t filter4_cond = vmvn_u8(filter6_cond); + + // Not needing filter4 at all is a very common case, so isolate it to avoid + // needlessly computing filter4. + if (vget_lane_s64(vreinterpret_s64_u8(filter6_cond), 0) == -1) { + filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1); + + *p0q0 = out_f6_pq0; + *p1q1 = out_f6_pq1; + } else { + filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); + + if (vget_lane_u64(vreinterpret_u64_u8(filter6_cond), 0) == 0) { + // filter6 does not apply, but filter4 applies to one or more values. + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + } else { + // All filters may contribute to the final output. + filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter6 outputs + *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); + *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); + } + } +} + +static inline void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, + const uint8_t blimit, const uint8_t limit, + const uint8_t thresh) { + uint8x8_t out_f4_pq0, out_f4_pq1; + + // Calculate filter mask + uint8x8_t mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); + + // No filtering. 
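The "reverse p and q" steps in the rewritten helpers no longer go through vrev64_u32 reinterpret casts: because each vector keeps the four p samples in one half and the four q samples in the other, rotating the vector by half its length with vext swaps the halves in one instruction. A small sketch, with illustrative function names:

#include <arm_neon.h>

/* p0q0 packs the four p0 samples in the low half and the four q0 samples in
 * the high half, so a rotate by 4 lanes yields q0p0. */
static inline uint8x8_t swap_pq_u8(uint8x8_t p0q0) {
  return vext_u8(p0q0, p0q0, 4);
}

/* The same idea on the widened 16-bit accumulators, as used in filter6(). */
static inline uint16x8_t swap_pq_u16(uint16x8_t pq) {
  return vextq_u16(pq, pq, 4);
}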
+ if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { + return; + } + + filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); + + *p0q0 = out_f4_pq0; + *p1q1 = out_f4_pq1; } void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, diff --git a/third_party/aom/aom_dsp/arm/mem_neon.h b/third_party/aom/aom_dsp/arm/mem_neon.h index 494dde14a3ee..9cdafecedcd5 100644 --- a/third_party/aom/aom_dsp/arm/mem_neon.h +++ b/third_party/aom/aom_dsp/arm/mem_neon.h @@ -55,12 +55,52 @@ static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { return res; } +static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) { + int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } }; + return res; +} + +static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) { + int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8), + vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } }; + return res; +} + +static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) { + vst1_u8(ptr + 0 * 8, a.val[0]); + vst1_u8(ptr + 1 * 8, a.val[1]); +} + +static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) { + vst1_u8(ptr + 0 * 8, a.val[0]); + vst1_u8(ptr + 1 * 8, a.val[1]); + vst1_u8(ptr + 2 * 8, a.val[2]); + vst1_u8(ptr + 3 * 8, a.val[3]); +} + +static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) { + vst1q_u16(ptr + 0 * 8, a.val[0]); + vst1q_u16(ptr + 1 * 8, a.val[1]); +} + +static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) { + vst1q_u16(ptr + 0 * 8, a.val[0]); + vst1q_u16(ptr + 1 * 8, a.val[1]); + vst1q_u16(ptr + 2 * 8, a.val[2]); + vst1q_u16(ptr + 3 * 8, a.val[3]); +} + #elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit. #if __GNUC__ < 8 static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) { uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; return res; } + +static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) { + int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } }; + return res; +} #endif // __GNUC__ < 8 #if __GNUC__ < 9 @@ -71,13 +111,30 @@ static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) { } #endif // __GNUC__ < 9 -// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards. 
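The vld1q_s16_x{2,4}, vst1_u8_x{2,4} and vst1q_u16_x{2,4} helpers added above exist to cover toolchains whose arm_neon.h predates those intrinsics; they are just consecutive single-register loads and stores, which also makes them the non-interleaving counterparts of the vld2/vst2/vld4/vst4 structure ops that cfl_neon.c switches away from later in this patch. Illustrative contrast (values are made up):

/* With v.val[0] = {a0 .. a7} and v.val[1] = {b0 .. b7}:
 *   vst2_u8(dst, v);     writes a0 b0 a1 b1 ... a7 b7   (interleaved)
 *   vst1_u8_x2(dst, v);  writes a0 ... a7 b0 ... b7     (two contiguous runs)
 * CfL prediction wants the contiguous layout, which is why the removed
 * comments in cfl_neon.c describe the interleaving forms as not useful. */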
#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805 static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8), vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } }; return res; } + +static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) { + int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8), + vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } }; + return res; +} + +static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) { + vst1_u8(ptr + 0 * 8, a.val[0]); + vst1_u8(ptr + 1 * 8, a.val[1]); +} + +static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) { + vst1_u8(ptr + 0 * 8, a.val[0]); + vst1_u8(ptr + 1 * 8, a.val[1]); + vst1_u8(ptr + 2 * 8, a.val[2]); + vst1_u8(ptr + 3 * 8, a.val[3]); +} #endif // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805 #endif // defined(__GNUC__) && !defined(__clang__) @@ -215,6 +272,23 @@ static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p, s += p; } +static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3, + uint16x4_t *const s4, uint16x4_t *const s5) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; + *s4 = vld1_u16(s); + s += p; + *s5 = vld1_u16(s); +} + static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3, @@ -235,6 +309,65 @@ static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p, *s6 = vld1_u16(s); } +static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3, + uint16x4_t *const s4, uint16x4_t *const s5, + uint16x4_t *const s6, uint16x4_t *const s7) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; + *s4 = vld1_u16(s); + s += p; + *s5 = vld1_u16(s); + s += p; + *s6 = vld1_u16(s); + s += p; + *s7 = vld1_u16(s); +} + +static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3, + uint16x4_t *const s4, uint16x4_t *const s5, + uint16x4_t *const s6, uint16x4_t *const s7, + uint16x4_t *const s8, uint16x4_t *const s9, + uint16x4_t *const s10, uint16x4_t *const s11, + uint16x4_t *const s12, uint16x4_t *const s13) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; + *s4 = vld1_u16(s); + s += p; + *s5 = vld1_u16(s); + s += p; + *s6 = vld1_u16(s); + s += p; + *s7 = vld1_u16(s); + s += p; + *s8 = vld1_u16(s); + s += p; + *s9 = vld1_u16(s); + s += p; + *s10 = vld1_u16(s); + s += p; + *s11 = vld1_u16(s); + s += p; + *s12 = vld1_u16(s); + s += p; + *s13 = vld1_u16(s); +} + static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1) { *s0 = vld1q_s16(s); @@ -597,6 +730,56 @@ static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride, vst1_u16(s, s3); } +static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3, + const uint16x4_t s4, const uint16x4_t s5) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); + s += dst_stride; + vst1_u16(s, s3); + s += 
dst_stride; + vst1_u16(s, s4); + s += dst_stride; + vst1_u16(s, s5); +} + +static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3, + const uint16x4_t s4, const uint16x4_t s5, + const uint16x4_t s6, const uint16x4_t s7, + const uint16x4_t s8, const uint16x4_t s9, + const uint16x4_t s10, const uint16x4_t s11) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); + s += dst_stride; + vst1_u16(s, s3); + s += dst_stride; + vst1_u16(s, s4); + s += dst_stride; + vst1_u16(s, s5); + s += dst_stride; + vst1_u16(s, s6); + s += dst_stride; + vst1_u16(s, s7); + s += dst_stride; + vst1_u16(s, s8); + s += dst_stride; + vst1_u16(s, s9); + s += dst_stride; + vst1_u16(s, s10); + s += dst_stride; + vst1_u16(s, s11); + s += dst_stride; +} + static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride, const uint16x8_t s0, const uint16x8_t s1) { vst1q_u16(s, s0); diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index bbaa0a0c4818..0f829821a996 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -46,16 +46,6 @@ static inline __m128i xx_loadu_128(const void *a) { return _mm_loadu_si128((const __m128i *)a); } -// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function -// manually on older compilers. -#if !defined(__clang__) && __GNUC_MAJOR__ < 9 -static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) { - __m64 hi_, lo_; - memcpy(&hi_, hi, sizeof(hi_)); - memcpy(&lo_, lo, sizeof(lo_)); - return _mm_set_epi64(hi_, lo_); -} -#else // Load 64 bits from each of hi and low, and pack into an SSE register // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate // the strict aliasing rule, this takes a different approach @@ -63,7 +53,6 @@ static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) { return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo), _mm_loadl_epi64((const __m128i *)hi)); } -#endif static inline void xx_storel_32(void *const a, const __m128i v) { const int val = _mm_cvtsi128_si32(v); diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h index 5b8a79f8c445..20e6a4b23a04 100644 --- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -76,26 +76,11 @@ static inline __m256i yy_loadu_4x64(const void *e3, const void *e2, return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01)); } -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) - -// _mm256_loadu2_m128i has been introduced in GCC 10.1 -#if !defined(__clang__) && GCC_VERSION < 101000 -static inline __m256i yy_loadu2_128(const void *hi, const void *lo) { - __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); - __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); - return _mm256_set_m128i(mhi, mlo); -} -#else static inline __m256i yy_loadu2_128(const void *hi, const void *lo) { __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); return yy_set_m128i(mhi, mlo); } -#endif - -#undef GCC_VERSION static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) { _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); diff --git a/third_party/aom/aom_ports/aom_ports.cmake b/third_party/aom/aom_ports/aom_ports.cmake index 1746efa3cc29..33382e9541ab 
100644 --- a/third_party/aom/aom_ports/aom_ports.cmake +++ b/third_party/aom/aom_ports/aom_ports.cmake @@ -38,6 +38,9 @@ endif() list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h" "${AOM_ROOT}/aom_ports/ppc_cpudetect.c") +list(APPEND AOM_PORTS_SOURCES_RISCV "${AOM_ROOT}/aom_ports/riscv.h" + "${AOM_ROOT}/aom_ports/riscv_cpudetect.c") + # For arm and x86 targets: # # * Creates the aom_ports build target, adds the includes in aom_ports to the @@ -68,9 +71,12 @@ function(setup_aom_ports_targets) elseif("${AOM_TARGET_CPU}" MATCHES "ppc") add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC}) set(aom_ports_has_symbols 1) + elseif("${AOM_TARGET_CPU}" MATCHES "riscv") + add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_RISCV}) + set(aom_ports_has_symbols 1) endif() - if("${AOM_TARGET_CPU}" MATCHES "arm|ppc") + if("${AOM_TARGET_CPU}" MATCHES "arm|ppc|riscv") target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) diff --git a/third_party/aom/aom_ports/riscv.h b/third_party/aom/aom_ports/riscv.h new file mode 100644 index 000000000000..91cee48e6f0e --- /dev/null +++ b/third_party/aom/aom_ports/riscv.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_RISCV_H_ +#define AOM_AOM_PORTS_RISCV_H_ +#include + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_RVV 0x01 + +int riscv_simd_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_RISCV_H_ diff --git a/third_party/aom/aom_ports/riscv_cpudetect.c b/third_party/aom/aom_ports/riscv_cpudetect.c new file mode 100644 index 000000000000..af3663c1a8f6 --- /dev/null +++ b/third_party/aom/aom_ports/riscv_cpudetect.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/riscv.h" + +#if CONFIG_RUNTIME_CPU_DETECT + +#include + +#define HWCAP_RVV (1 << ('v' - 'a')) + +int riscv_simd_caps(void) { + int flags = 0; +#if HAVE_RVV + unsigned long hwcap = getauxval(AT_HWCAP); + if (hwcap & HWCAP_RVV) flags |= HAS_RVV; +#endif + return flags; +} +#else +// If there is no RTCD the function pointers are not used and can not be +// changed. 
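The new riscv_cpudetect.c mirrors the existing cpu detectors in aom_ports: it queries the Linux auxiliary vector and reports HAS_RVV when the 'V' extension bit is present ('v' - 'a' == 21, so HWCAP_RVV is bit 21 of AT_HWCAP). A hedged sketch of how a runtime-dispatch caller would consume it; the function below is illustrative, not the generated RTCD code:

#include "aom_ports/riscv.h"

void maybe_enable_rvv_paths(void) {
  const int caps = riscv_simd_caps();
  if (caps & HAS_RVV) {
    /* Point the relevant function pointers at their *_rvv specializations,
     * e.g. the cdef kernels registered in av1_rtcd_defs.pl below. */
  }
}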
+int riscv_simd_caps(void) { return 0; } +#endif // CONFIG_RUNTIME_CPU_DETECT diff --git a/third_party/aom/apps/aomenc.c b/third_party/aom/apps/aomenc.c index ff60ef0112c0..3d58a8946ca2 100644 --- a/third_party/aom/apps/aomenc.c +++ b/third_party/aom/apps/aomenc.c @@ -2318,8 +2318,9 @@ int main(int argc, const char **argv_) { "match input format.\n", stream->config.cfg.g_profile); } - if ((global.show_psnr == 2) && (stream->config.cfg.g_input_bit_depth == - stream->config.cfg.g_bit_depth)) { + if (global.show_psnr == 2 && + stream->config.cfg.g_input_bit_depth == + (unsigned int)stream->config.cfg.g_bit_depth) { fprintf(stderr, "Warning: --psnr==2 and --psnr==1 will provide same " "results when input bit-depth == stream bit-depth, " diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake index 486596256841..b5e656c4c3e0 100644 --- a/third_party/aom/av1/av1.cmake +++ b/third_party/aom/av1/av1.cmake @@ -445,6 +445,9 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c") +list(APPEND AOM_AV1_COMMON_INTRIN_RVV + "${AOM_ROOT}/av1/common/riscv/cdef_block_rvv.c") + if(CONFIG_THREE_PASS) list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/thirdpass.c" "${AOM_ROOT}/av1/encoder/thirdpass.h") @@ -822,6 +825,13 @@ function(setup_av1_targets) endif() endif() + if(HAVE_RVV) + if(AOM_AV1_COMMON_INTRIN_RVV) + add_intrinsics_object_library("-march=rv64gcv" "rvv" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_RVV") + endif() + endif() + # Pass the new lib targets up to the parent scope instance of # $AOM_LIB_TARGETS. set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c index 813a9889104a..971dd7bc9db3 100644 --- a/third_party/aom/av1/av1_cx_iface.c +++ b/third_party/aom/av1/av1_cx_iface.c @@ -1084,7 +1084,6 @@ static void set_encoder_config(AV1EncoderConfig *oxcf, AlgoCfg *const algo_cfg = &oxcf->algo_cfg; ToolCfg *const tool_cfg = &oxcf->tool_cfg; - const int is_vbr = cfg->rc_end_usage == AOM_VBR; oxcf->profile = cfg->g_profile; oxcf->max_threads = (int)cfg->g_threads; @@ -1167,9 +1166,9 @@ static void set_encoder_config(AV1EncoderConfig *oxcf, rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level); rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct; rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct; - rc_cfg->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; - rc_cfg->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; - rc_cfg->optimal_buffer_level_ms = is_vbr ? 
60000 : cfg->rc_buf_optimal_sz; + rc_cfg->maximum_buffer_size_ms = cfg->rc_buf_sz; + rc_cfg->starting_buffer_level_ms = cfg->rc_buf_initial_sz; + rc_cfg->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz; // Convert target bandwidth from Kbit/s to Bit/s rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate; rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh; diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c index c1763ff8b76e..e7f0ff094eb9 100644 --- a/third_party/aom/av1/common/arm/cfl_neon.c +++ b/third_party/aom/av1/common/arm/cfl_neon.c @@ -13,6 +13,7 @@ #include "config/aom_config.h" #include "config/av1_rtcd.h" +#include "aom_dsp/arm/mem_neon.h" #include "av1/common/cfl.h" static inline void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset, @@ -428,10 +429,7 @@ static inline int16x8_t predict_w8(const int16_t *pred_buf_q3, static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3, int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { - // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2 - // does not interleave, but is not currently available in the compilier used - // by the AOM build system. - const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3); + const int16x8x2_t ac_q3 = vld1q_s16_x2(pred_buf_q3); const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); const int16x8_t scaled_luma_0 = @@ -447,10 +445,7 @@ static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3, static inline int16x8x4_t predict_w32(const int16_t *pred_buf_q3, int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { - // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4 - // does not interleave, but is not currently available in the compilier used - // by the AOM build system. 
- const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3); + const int16x8x4_t ac_q3 = vld1q_s16_x4(pred_buf_q3); const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]); @@ -497,7 +492,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]) } }; - vst2_u8(dst, predun); + vst1_u8_x2(dst, predun); } else { const int16x8x4_t pred = predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); @@ -505,7 +500,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]), vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) } }; - vst4_u8(dst, predun); + vst1_u8_x4(dst, predun); } dst += dst_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); @@ -574,11 +569,11 @@ static inline void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, } else if (width == 16) { const int16x8x2_t pred = predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); - vst2q_u16(dst, clamp2q_s16(pred, max_16x8)); + vst1q_u16_x2(dst, clamp2q_s16(pred, max_16x8)); } else { const int16x8x4_t pred = predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); - vst4q_u16(dst, clamp4q_s16(pred, max_16x8)); + vst1q_u16_x4(dst, clamp4q_s16(pred, max_16x8)); } dst += dst_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h index 766abffff0ae..c5c8421f6a05 100644 --- a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h +++ b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h @@ -53,8 +53,7 @@ static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, static AOM_FORCE_INLINE int16x8_t load_filters_1(int ofs) { const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS]; return vld1q_s16(base + ofs0 * 8); } @@ -65,8 +64,7 @@ static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int ofs, const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS]; out[0] = vld1q_s16(base + ofs0 * 8); out[1] = vld1q_s16(base + ofs1 * 8); out[2] = vld1q_s16(base + ofs2 * 8); @@ -84,8 +82,7 @@ static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int ofs, const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS); const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS); - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS]; out[0] = vld1q_s16(base + ofs0 * 8); out[1] = vld1q_s16(base + ofs1 * 8); out[2] = vld1q_s16(base + ofs2 * 8); diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c index 3656beb3995a..497273bc65c9 100644 --- a/third_party/aom/av1/common/arm/warp_plane_neon.c +++ b/third_party/aom/av1/common/arm/warp_plane_neon.c @@ -101,8 +101,7 @@ 
horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_4x1_f1_beta0(in, f_s16); } @@ -140,8 +139,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_8x1_f1_beta0(in, f_s16); } @@ -156,8 +154,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int16x4_t s6 = vget_low_s16(src[6]); int16x4_t s7 = vget_low_s16(src[7]); - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); @@ -210,8 +207,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.h b/third_party/aom/av1/common/arm/warp_plane_neon.h index 777ac4b9568b..2909df7b7f74 100644 --- a/third_party/aom/av1/common/arm/warp_plane_neon.h +++ b/third_party/aom/av1/common/arm/warp_plane_neon.h @@ -61,34 +61,34 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset, int stride) { - out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >> - WARPEDDIFF_PREC_BITS))); + out[0] = vld1q_s16( + av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]); + out[1] = vld1q_s16( + av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]); + out[2] = vld1q_s16( + av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]); + out[3] = vld1q_s16( + av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]); } static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int offset, int stride) { - out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[4] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 4 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[5] = 
vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 5 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[6] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 6 * stride) >> - WARPEDDIFF_PREC_BITS))); - out[7] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 7 * stride) >> - WARPEDDIFF_PREC_BITS))); + out[0] = vld1q_s16( + av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]); + out[1] = vld1q_s16( + av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]); + out[2] = vld1q_s16( + av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]); + out[3] = vld1q_s16( + av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]); + out[4] = vld1q_s16( + av1_warped_filter[(offset + 4 * stride) >> WARPEDDIFF_PREC_BITS]); + out[5] = vld1q_s16( + av1_warped_filter[(offset + 5 * stride) >> WARPEDDIFF_PREC_BITS]); + out[6] = vld1q_s16( + av1_warped_filter[(offset + 6 * stride) >> WARPEDDIFF_PREC_BITS]); + out[7] = vld1q_s16( + av1_warped_filter[(offset + 7 * stride) >> WARPEDDIFF_PREC_BITS]); } static AOM_FORCE_INLINE int clamp_iy(int iy, int height) { @@ -175,8 +175,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal( if (p_width == 4) { if (beta == 0) { if (alpha == 0) { - int16x8_t f_s16 = vld1q_s16( - (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = + vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16); } else { APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha); @@ -193,8 +193,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal( } else { if (beta == 0) { if (alpha == 0) { - int16x8_t f_s16 = vld1q_s16( - (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = + vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16); } else { APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha); diff --git a/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c index 7ef762692831..15ac0043cac7 100644 --- a/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c +++ b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c @@ -109,8 +109,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_4x1_f1_beta0(in, f_s16); } @@ -145,8 +144,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_8x1_f1_beta0(in, f_s16); } @@ -161,8 +159,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int16x4_t s6 = vget_low_s16(src[6]); int16x4_t s7 = vget_low_s16(src[7]); - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); @@ -215,8 +212,7 @@ static AOM_FORCE_INLINE 
void vertical_filter_8x1_f1(const int16x8_t *src, int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); diff --git a/third_party/aom/av1/common/arm/warp_plane_sve.c b/third_party/aom/av1/common/arm/warp_plane_sve.c index 51b1bb75e45b..10aee35b1a6f 100644 --- a/third_party/aom/av1/common/arm/warp_plane_sve.c +++ b/third_party/aom/av1/common/arm/warp_plane_sve.c @@ -112,8 +112,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_4x1_f1_beta0(in, f_s16); } @@ -148,8 +147,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { - int16x8_t f_s16 = - vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_8x1_f1_beta0(in, f_s16); } @@ -164,8 +162,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int16x4_t s6 = vget_low_s16(src[6]); int16x4_t s7 = vget_low_s16(src[7]); - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); @@ -215,8 +212,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; - int16x8_t f = - vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index 13e1d311c1b5..cfde810bd6e7 100644 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -495,22 +495,22 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { # structs as arguments, which makes the v256 type of the intrinsics # hard to support, so optimizations for this target are disabled. 
if ($opts{config} !~ /libs-x86-win32-vs.*/) { - specialize qw/cdef_find_dir sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_find_dir sse4_1 avx2 neon rvv/, "$ssse3_x86"; specialize qw/cdef_find_dir_dual sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_0 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_1 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_2 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_3 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_8_0 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_8_1 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_8_2 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_8_3 sse4_1 avx2 neon rvv/, "$ssse3_x86"; - specialize qw/cdef_filter_16_0 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_16_1 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_16_2 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_16_3 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_16_0 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_16_1 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_16_2 sse4_1 avx2 neon rvv/, "$ssse3_x86"; + specialize qw/cdef_filter_16_3 sse4_1 avx2 neon rvv/, "$ssse3_x86"; - specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86"; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { - specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86"; } } diff --git a/third_party/aom/av1/common/riscv/cdef_block_rvv.c b/third_party/aom/av1/common/riscv/cdef_block_rvv.c new file mode 100644 index 000000000000..8ccfc2a654b7 --- /dev/null +++ b/third_party/aom/av1/common/riscv/cdef_block_rvv.c @@ -0,0 +1,1354 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <riscv_vector.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "av1/common/cdef_block.h" + +// partial A is a 16-bit vector of the form: +// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: +// [0 y1 y2 y3 y4 y5 y6 y7]. +// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... +// (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 +// and const2. +static inline vuint32m1_t fold_mul_and_sum_rvv(vint16m1_t partiala, + vint16m1_t partialb, + vuint32m1_t const1, + vuint32m1_t const2) { + // Square and add the corresponding x and y values. + vint32m2_t cost = __riscv_vwmul_vv_i32m2(partiala, partiala, 8); + cost = __riscv_vwmacc_vv_i32m2(cost, partialb, partialb, 8); + + // Multiply by constant.
+ vuint32m2_t tmp1_u32m2 = __riscv_vreinterpret_v_i32m2_u32m2(cost); + vuint32m1_t cost_u32m1 = __riscv_vmul_vv_u32m1( + __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const1, 4); + tmp1_u32m2 = __riscv_vslidedown_vx_u32m2(tmp1_u32m2, 4, 8); + vuint32m1_t ret = __riscv_vmacc_vv_u32m1( + cost_u32m1, __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const2, 4); + return ret; +} + +// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal +// down-right, 6 is vertical). +// +// For each direction the lines are shifted so that we can perform a +// basic sum on each vector element. For example, direction 5 is "south by +// southeast", so we need to add the pixels along each line i below: +// +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// For this to fit nicely in vectors, the lines need to be shifted like so: +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// In this configuration we can now perform SIMD additions to get the cost +// along direction 5. Since this won't fit into a single 128-bit vector, we use +// two of them to compute each half of the new configuration, and pad the empty +// spaces with zeros. Similar shifting is done for other directions, except +// direction 6 which is straightforward as it's the vertical direction. +static vuint32m1_t compute_vert_directions_rvv( + vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2, + vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5, + vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) { + size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16); + vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl); + + // Partial sums for lines 0 and 1. + vint16m1_t partial4a = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 1), vl); + vint16m1_t tmp1_i16m1 = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 2), vl); + partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); + vint16m1_t partial4b = __riscv_vslide1down_vx_i16m1(lines_0, 0, vl); + tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_1, 2, VL_SLIDE_DOWN); + partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_0, lines_1, VL_SLIDE_DOWN); + vint16m1_t partial5a = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl); + vint16m1_t partial5b = + __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN); + vint16m1_t partial7a = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl); + vint16m1_t partial7b = + __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN); + vint16m1_t partial6 = __riscv_vmv_v_v_i16m1(tmp1_i16m1, vl); + + // Partial sums for lines 2 and 3. 
+ tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 3), vl); + partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 4), vl); + partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_2, 3, VL_SLIDE_DOWN); + partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_3, 4, VL_SLIDE_DOWN); + partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); + tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_2, lines_3, VL_SLIDE_DOWN); + partial5a = __riscv_vadd_vv_i16m1( + partial5a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl); + partial5b = __riscv_vadd_vv_i16m1( + partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl); + partial7a = __riscv_vadd_vv_i16m1( + partial7a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl); + partial7b = __riscv_vadd_vv_i16m1( + partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl); + partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); + + // Partial sums for lines 4 and 5. + partial4a = __riscv_vadd_vv_i16m1( + partial4a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 5), vl), vl); + partial4a = __riscv_vadd_vv_i16m1( + partial4a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl); + partial4b = __riscv_vadd_vv_i16m1( + partial4b, __riscv_vslidedown_vx_i16m1(lines_4, 5, VL_SLIDE_DOWN), vl); + partial4b = __riscv_vadd_vv_i16m1( + partial4b, __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN), vl); + tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_4, lines_5, VL_SLIDE_DOWN); + partial5a = __riscv_vadd_vv_i16m1( + partial5a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl); + partial5b = __riscv_vadd_vv_i16m1( + partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl); + partial7a = __riscv_vadd_vv_i16m1( + partial7a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl); + partial7b = __riscv_vadd_vv_i16m1( + partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl); + partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); + + // Partial sums for lines 6 and 7. 
+ partial4a = __riscv_vadd_vv_i16m1( + partial4a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 7), vl), vl); + partial4a = __riscv_vadd_vv_i16m1(partial4a, lines_7, vl); + partial4b = __riscv_vadd_vv_i16m1( + partial4b, __riscv_vslidedown_vx_i16m1(lines_6, 7, VL_SLIDE_DOWN), vl); + tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_6, lines_7, VL_SLIDE_DOWN); + partial5a = __riscv_vadd_vv_i16m1( + partial5a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl), vl); + partial5b = __riscv_vadd_vv_i16m1( + partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN), vl); + partial7a = __riscv_vadd_vv_i16m1( + partial7a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl), vl); + partial7b = __riscv_vadd_vv_i16m1( + partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN), vl); + partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); + + // const0 = { 840, 420, 280, 210, } + vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4); + + // const1 = { 168, 140, 120, 105, } + vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4); + + // const2 = { 0, 0, 420, 210, } + vuint32m1_t const2 = __riscv_vmv_v_x_u32m1(0, 4); + const2 = __riscv_vslide1down_vx_u32m1(const2, 420, 4); + const2 = __riscv_vslide1down_vx_u32m1(const2, 210, 4); + + // const3 = { 140, 105, 105, 105, }; + vuint32m1_t const3 = __riscv_vmv_v_x_u32m1(105, 4); + const3 = __riscv_vslide1up_vx_u32m1(const3, 140, 4); + + // Compute costs in terms of partial sums. + vint32m2_t tmp1_i32m2 = __riscv_vwmul_vv_i32m2(partial6, partial6, vl); + vint32m2_t partial6_s32 = __riscv_vslidedown_vx_i32m2(tmp1_i32m2, 4, vl); + partial6_s32 = __riscv_vadd_vv_i32m2(partial6_s32, tmp1_i32m2, 4); + + // Reverse partial B. + // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }. 
+ vuint32m1_t costs_0, costs_1, costs_2, costs_3; + static const uint16_t tab_u16[8] = { + 6, 5, 4, 3, 2, 1, 0, 7, + }; + vuint16m1_t index_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8); + vint16m1_t partial4b_rv = + __riscv_vrgather_vv_i16m1(partial4b, index_u16m1, 8); + costs_0 = fold_mul_and_sum_rvv(partial4a, partial4b_rv, const0, const1); + vuint32m1_t partial6_u32 = __riscv_vreinterpret_v_i32m1_u32m1( + __riscv_vlmul_trunc_v_i32m2_i32m1(partial6_s32)); + costs_2 = __riscv_vmul_vx_u32m1(partial6_u32, 105, 4); + vint16m1_t partial5b_rv = + __riscv_vrgather_vv_i16m1(partial5b, index_u16m1, 8); + costs_1 = fold_mul_and_sum_rvv(partial5a, partial5b_rv, const2, const3); + vint16m1_t partial7b_rv = + __riscv_vrgather_vv_i16m1(partial7b, index_u16m1, 8); + costs_3 = fold_mul_and_sum_rvv(partial7a, partial7b_rv, const2, const3); + + // combine values + vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); + vuint32m1_t cost0_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4); + vuint32m1_t cost1_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4); + vuint32m1_t cost2_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4); + vuint32m1_t cost3_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4); + + vuint32m1_t cost47 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4); + cost47 = __riscv_vslideup_vx_u32m1(cost47, cost2_sum, 2, 4); + cost47 = __riscv_vslideup_vx_u32m1(cost47, cost3_sum, 3, 4); + __riscv_vse32_v_u32m1(&cost[0], cost47, 4); + return cost47; +} + +static inline vuint32m1_t fold_mul_and_sum_pairwise_rvv(vint16m1_t partiala, + vint16m1_t partialb, + vint16m1_t partialc, + vuint32m1_t const0) { + vuint16m1_t vid_u16m1 = __riscv_vid_v_u16m1(4); + vuint16m1_t index_u16m1 = __riscv_vsll_vx_u16m1(vid_u16m1, 1, 4); + vint16m1_t tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partiala, 0, 8); + vint32m2_t partiala_i32m2 = __riscv_vwadd_vv_i32m2(partiala, tmp_i16m1, 8); + tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialb, 0, 8); + vint32m2_t partialb_i32m2 = __riscv_vwadd_vv_i32m2(partialb, tmp_i16m1, 8); + + tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialc, 0, 8); + vint32m2_t partialc_i32m2 = __riscv_vwadd_vv_i32m2(partialc, tmp_i16m1, 8); + partiala_i32m2 = __riscv_vmul_vv_i32m2(partiala_i32m2, partiala_i32m2, 8); + partialb_i32m2 = __riscv_vmul_vv_i32m2(partialb_i32m2, partialb_i32m2, 8); + vint32m1_t partialb_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1( + __riscv_vrgatherei16_vv_i32m2(partialb_i32m2, index_u16m1, 4)); + partialc_i32m2 = __riscv_vmul_vv_i32m2(partialc_i32m2, partialc_i32m2, 8); + partiala_i32m2 = __riscv_vadd_vv_i32m2(partiala_i32m2, partialc_i32m2, 8); + vint32m1_t partiala_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1( + __riscv_vrgatherei16_vv_i32m2(partiala_i32m2, index_u16m1, 4)); + + vuint32m1_t cost = __riscv_vmul_vx_u32m1( + __riscv_vreinterpret_v_i32m1_u32m1(partialb_i32m1), 105, 4); + cost = __riscv_vmacc_vv_u32m1( + cost, __riscv_vreinterpret_v_i32m1_u32m1(partiala_i32m1), const0, 4); + return cost; +} + +static inline vint32m1_t horizontal_add_4d_s16x8(vint16m1_t lines_0, + vint16m1_t lines_1, + vint16m1_t lines_2, + vint16m1_t lines_3) { + vint32m1_t vec_scalar_i32m1 = __riscv_vmv_s_x_i32m1(0, 1); + vint32m1_t lines0_sum = + __riscv_vwredsum_vs_i16m1_i32m1(lines_0, vec_scalar_i32m1, 8); + vint32m1_t lines1_sum = + __riscv_vwredsum_vs_i16m1_i32m1(lines_1, vec_scalar_i32m1, 8); + vint32m1_t lines2_sum = + __riscv_vwredsum_vs_i16m1_i32m1(lines_2, vec_scalar_i32m1, 8); + vint32m1_t lines3_sum = + 
__riscv_vwredsum_vs_i16m1_i32m1(lines_3, vec_scalar_i32m1, 8); + + vint32m1_t ret = __riscv_vslideup_vx_i32m1(lines0_sum, lines1_sum, 1, 4); + ret = __riscv_vslideup_vx_i32m1(ret, lines2_sum, 2, 4); + ret = __riscv_vslideup_vx_i32m1(ret, lines3_sum, 3, 4); + return ret; +} + +// This function computes the cost along directions 0, 1, 2, 3. (0 means +// 45-degree up-right, 2 is horizontal). +// +// For direction 1 and 3 ("east northeast" and "east southeast") the shifted +// lines need three vectors instead of two. For direction 1 for example, we need +// to compute the sums along the line i below: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Which means we need the following configuration: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Three vectors are needed to compute this, as well as some extra pairwise +// additions. +static vuint32m1_t compute_horiz_directions_rvv( + vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2, + vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5, + vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) { + // Compute diagonal directions (1, 2, 3). + // Partial sums for lines 0 and 1. + size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16); + vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl); + vint16m1_t partial0a = __riscv_vmv_v_v_i16m1(lines_0, vl); + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 7), vl), vl); + vint16m1_t partial0b = __riscv_vslidedown_vx_i16m1(lines_1, 7, VL_SLIDE_DOWN); + vint16m1_t partial1a = __riscv_vadd_vv_i16m1( + lines_0, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 6), vl), + vl); + vint16m1_t partial1b = __riscv_vslidedown_vx_i16m1(lines_1, 6, VL_SLIDE_DOWN); + vint16m1_t partial3a = __riscv_vslidedown_vx_i16m1(lines_0, 2, VL_SLIDE_DOWN); + partial3a = __riscv_vadd_vv_i16m1( + partial3a, __riscv_vslidedown_vx_i16m1(lines_1, 4, VL_SLIDE_DOWN), vl); + vint16m1_t partial3b = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 2), vl); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, 4, vl), vl); + + // Partial sums for lines 2 and 3. 
+ partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl); + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 5), vl), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_3, 5, VL_SLIDE_DOWN), vl); + partial1a = __riscv_vadd_vv_i16m1( + partial1a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 4), vl), vl); + partial1a = __riscv_vadd_vv_i16m1( + partial1a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 2), vl), vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, __riscv_vslidedown_vx_i16m1(lines_2, 4, VL_SLIDE_DOWN), vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, __riscv_vslidedown_vx_i16m1(lines_3, 2, VL_SLIDE_DOWN), vl); + partial3a = __riscv_vadd_vv_i16m1( + partial3a, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl); + partial3b = __riscv_vadd_vv_i16m1(partial3b, lines_3, vl); + + // Partial sums for lines 4 and 5. + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 4), vl), vl); + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 3), vl), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_4, 4, VL_SLIDE_DOWN), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_5, 3, VL_SLIDE_DOWN), vl); + partial1b = __riscv_vadd_vv_i16m1(partial1b, lines_4, vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl); + vint16m1_t partial1c = __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, __riscv_vslidedown_vx_i16m1(lines_4, 2, VL_SLIDE_DOWN), vl); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, __riscv_vslidedown_vx_i16m1(lines_5, 4, VL_SLIDE_DOWN), vl); + vint16m1_t partial3c = + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 2), vl); + partial3c = __riscv_vadd_vv_i16m1( + partial3c, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 4), vl), vl); + + // Partial sums for lines 6 and 7. 
+ partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 2), vl), vl); + partial0a = __riscv_vadd_vv_i16m1( + partial0a, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 1), vl), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslidedown_vx_i16m1(lines_6, 2, VL_SLIDE_DOWN), vl); + partial0b = __riscv_vadd_vv_i16m1( + partial0b, __riscv_vslide1down_vx_i16m1(lines_7, 0, vl), vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 4), vl), vl); + partial1b = __riscv_vadd_vv_i16m1( + partial1b, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 2), vl), vl); + partial1c = __riscv_vadd_vv_i16m1( + partial1c, __riscv_vslidedown_vx_i16m1(lines_6, 4, VL_SLIDE_DOWN), vl); + partial1c = __riscv_vadd_vv_i16m1( + partial1c, __riscv_vslidedown_vx_i16m1(lines_7, 2, VL_SLIDE_DOWN), vl); + partial3b = __riscv_vadd_vv_i16m1( + partial3b, __riscv_vslidedown_vx_i16m1(lines_6, 6, VL_SLIDE_DOWN), vl); + partial3c = __riscv_vadd_vv_i16m1( + partial3c, + __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 6), vl), vl); + partial3c = __riscv_vadd_vv_i16m1(partial3c, lines_7, vl); + + // Special case for direction 2 as it's just a sum along each line. + vint32m1_t partial2a = + horizontal_add_4d_s16x8(lines_0, lines_1, lines_2, lines_3); + vint32m1_t partial2b = + horizontal_add_4d_s16x8(lines_4, lines_5, lines_6, lines_7); + vuint32m1_t partial2a_u32 = __riscv_vreinterpret_v_i32m1_u32m1( + __riscv_vmul_vv_i32m1(partial2a, partial2a, 4)); + vuint32m1_t partial2b_u32 = __riscv_vreinterpret_v_i32m1_u32m1( + __riscv_vmul_vv_i32m1(partial2b, partial2b, 4)); + + // const0 = { 840, 420, 280, 210, } + vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4); + const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4); + + // const1 = { 168, 140, 120, 105, } + vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4); + const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4); + + // const2 = { 420, 210, 140, 105, }; + vuint32m1_t const2 = __riscv_vmv_s_x_u32m1(105, 4); + const2 = __riscv_vslide1up_vx_u32m1(const2, 140, 4); + const2 = __riscv_vslide1up_vx_u32m1(const2, 210, 4); + const2 = __riscv_vslide1up_vx_u32m1(const2, 420, 4); + + static const uint16_t tab_u16[8] = { + 0, 6, 5, 4, 3, 2, 1, 0, + }; + vuint32m1_t costs_0, costs_1, costs_2, costs_3; + vuint16m1_t template_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8); + + // Reverse partial c. + // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, } + vuint16m1_t index_u16m1 = __riscv_vslide1down_vx_u16m1(template_u16m1, 7, 8); + vint16m1_t partial0b_rv = + __riscv_vrgather_vv_i16m1(partial0b, index_u16m1, 8); + costs_0 = fold_mul_and_sum_rvv(partial0a, partial0b_rv, const0, const1); + + // Reverse partial c. 
+ // pattern = { 5, 4, 3, 2, 1, 0, 6, 7, } + vuint16m1_t index_pair_u16m1 = + __riscv_vslide1down_vx_u16m1(template_u16m1, 6, 8); + index_pair_u16m1 = __riscv_vslide1down_vx_u16m1(index_pair_u16m1, 7, 8); + vint16m1_t partialc_rv = + __riscv_vrgather_vv_i16m1(partial1c, index_pair_u16m1, 8); + costs_1 = + fold_mul_and_sum_pairwise_rvv(partial1a, partial1b, partialc_rv, const2); + + costs_2 = __riscv_vadd_vv_u32m1(partial2a_u32, partial2b_u32, 4); + costs_2 = __riscv_vmul_vx_u32m1(costs_2, 105, 4); + + vint16m1_t partial3a_rv = + __riscv_vrgather_vv_i16m1(partial3a, index_pair_u16m1, 8); + costs_3 = + fold_mul_and_sum_pairwise_rvv(partial3c, partial3b, partial3a_rv, const2); + + // combine values + vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); + vuint32m1_t cost0_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4); + vuint32m1_t cost1_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4); + vuint32m1_t cost2_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4); + vuint32m1_t cost3_sum = + __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4); + + costs_0 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4); + costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost2_sum, 2, 4); + costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost3_sum, 3, 4); + __riscv_vse32_v_u32m1(&cost[0], costs_0, 4); + return costs_0; +} + +int cdef_find_dir_rvv(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + size_t vl = 8; + size_t vlmax = __riscv_vsetvlmax_e16m1(); + vuint16m1_t s; + vint16m1_t lines_0, lines_1, lines_2, lines_3; + vint16m1_t lines_4, lines_5, lines_6, lines_7; + vuint16m1_t vec_zero_u16m1 = + __riscv_vmv_v_x_u16m1(0, __riscv_vsetvl_e16m1(16)); + + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_0 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_0 = __riscv_vsub_vx_i16m1(lines_0, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_1 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_1 = __riscv_vsub_vx_i16m1(lines_1, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_2 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_2 = __riscv_vsub_vx_i16m1(lines_2, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_3 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_3 = __riscv_vsub_vx_i16m1(lines_3, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_4 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_4 = __riscv_vsub_vx_i16m1(lines_4, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_5 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_5 = __riscv_vsub_vx_i16m1(lines_5, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = 
__riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_6 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_6 = __riscv_vsub_vx_i16m1(lines_6, 128, vl); + + img += stride; + if (vlmax == 8) + s = __riscv_vle16_v_u16m1(img, vl); + else + s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); + lines_7 = __riscv_vreinterpret_v_u16m1_i16m1( + __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); + lines_7 = __riscv_vsub_vx_i16m1(lines_7, 128, vl); + + // Compute "mostly vertical" directions. + uint32_t cost[8]; + vuint32m1_t cost47 = + compute_vert_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4, + lines_5, lines_6, lines_7, cost + 4, vl); + + // Compute "mostly horizontal" directions. + vuint32m1_t cost03 = + compute_horiz_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4, + lines_5, lines_6, lines_7, cost, vl); + + // Find max cost as well as its index to get best_dir. + // The max cost needs to be propagated in the whole vector to find its + // position in the original cost vectors cost03 and cost47. + vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); + vuint32m1_t cost07 = __riscv_vmaxu_vv_u32m1(cost03, cost47, 4); + uint32_t best_cost = __riscv_vmv_x_s_u32m1_u32( + __riscv_vredmaxu_vs_u32m1_u32m1(cost07, vec_scalar_u32m1, 4)); + vbool32_t mask_cost = __riscv_vmseq_vx_u32m1_b32(cost03, best_cost, 4); + long best_dir = __riscv_vfirst_m_b32(mask_cost, 4); + if (best_dir == -1) { + mask_cost = __riscv_vmseq_vx_u32m1_b32(cost47, best_cost, 4); + best_dir = __riscv_vfirst_m_b32(mask_cost, 4); + best_dir += 4; + } + + // Difference between the optimal variance and the variance along the + // orthogonal direction. Again, the sum(x^2) terms cancel out. + *var = best_cost - cost[(best_dir + 4) & 7]; + + // We'd normally divide by 840, but dividing by 1024 is close enough + // for what we're going to do with this. 
+ *var >>= 10; + return (int)best_dir; +} + +void cdef_copy_rect8_8bit_to_16bit_rvv(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + do { + int w = 0; + size_t num_cols = width; + while (num_cols > 0) { + size_t vl = __riscv_vsetvl_e8mf2(num_cols); + vuint8mf2_t u8_src = __riscv_vle8_v_u8mf2(src + w, vl); + vuint16m1_t u16_src = __riscv_vwcvtu_x_x_v_u16m1(u8_src, vl); + __riscv_vse16_v_u16m1(dst + w, u16_src, vl); + + w += vl; + num_cols -= vl; + } + src += sstride; + dst += dstride; + } while (--height != 0); +} + +void cdef_copy_rect8_16bit_to_16bit_rvv(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + do { + int w = 0; + size_t num_cols = width; + while (num_cols > 0) { + size_t vl = __riscv_vsetvl_e16m1(num_cols); + vuint16m1_t u16_src = __riscv_vle16_v_u16m1(src + w, vl); + __riscv_vse16_v_u16m1(dst + w, u16_src, vl); + + w += vl; + num_cols -= vl; + } + src += sstride; + dst += dstride; + } while (--height != 0); +} + +static inline vint16m1_t constrain16(vint16m1_t a, vint16m1_t b, + int16_t threshold, int16_t adjdamp, + size_t vl) { + if (!threshold) return __riscv_vmv_v_x_i16m1(0, vl); + const vbool16_t mask = __riscv_vmslt_vv_i16m1_b16(a, b, vl); + const vint16m1_t diff = __riscv_vsub_vv_i16m1(a, b, vl); + const vint16m1_t abs_diff = __riscv_vneg_v_i16m1_tumu(mask, diff, diff, vl); + const vint16m1_t shift = __riscv_vsra_vx_i16m1(abs_diff, adjdamp, vl); + const vint16m1_t thr = __riscv_vmv_v_x_i16m1(threshold, vl); + const vint16m1_t sub = __riscv_vsub_vv_i16m1(thr, shift, vl); + const vint16m1_t max = __riscv_vmax_vx_i16m1(sub, 0, vl); + const vint16m1_t min = __riscv_vmin_vv_i16m1(abs_diff, max, vl); + return __riscv_vneg_v_i16m1_tumu(mask, min, min, vl); +} + +static inline vint16m1_t vmax_mask(vint16m1_t a, vint16m1_t b, size_t vl) { + const vbool16_t mask = + __riscv_vmseq_vx_i16m1_b16(a, (int16_t)CDEF_VERY_LARGE, vl); + const vint16m1_t val = __riscv_vmerge_vvm_i16m1(a, b, mask, vl); + return __riscv_vmax_vv_i16m1(val, b, vl); +} + +static inline vint16m1_t load_strided_i16_4x2(int16_t *addr, + const ptrdiff_t stride, + size_t vl) { + const vint16m1_t px_l1 = __riscv_vle16_v_i16m1(addr + stride, vl); + const vint16m1_t px_l0 = __riscv_vle16_v_i16m1(addr, vl); + return __riscv_vslideup_vx_i16m1(px_l0, px_l1, 4, vl); +} + +static inline void store_strided_u8_4x2(uint8_t *addr, vuint8mf2_t vdst, + const ptrdiff_t stride, size_t vl) { + __riscv_vse8_v_u8mf2(addr, vdst, vl >> 1); + vdst = __riscv_vslidedown_vx_u8mf2(vdst, 4, vl); + __riscv_vse8_v_u8mf2(addr + stride, vdst, vl >> 1); +} + +static inline void store_strided_u16_4x2(uint16_t *addr, vuint16m1_t vdst, + const ptrdiff_t stride, size_t vl) { + __riscv_vse16_v_u16m1(addr, vdst, vl >> 1); + vdst = __riscv_vslidedown_vx_u16m1(vdst, 4, vl); + __riscv_vse16_v_u16m1(addr + stride, vdst, vl >> 1); +} + +#define LOAD_PIX(addr) \ + const vint16m1_t px = __riscv_vle16_v_i16m1((int16_t *)addr, vl); \ + vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl) + +#define LOAD_PIX4(addr) \ + const vint16m1_t px = \ + load_strided_i16_4x2((int16_t *)addr, CDEF_BSTRIDE, vl); \ + vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl) + +#define LOAD_DIR(p, addr, o0, o1) \ + const vint16m1_t p##0 = __riscv_vle16_v_i16m1((int16_t *)addr + o0, vl); \ + const vint16m1_t p##1 = __riscv_vle16_v_i16m1((int16_t *)addr - o0, vl); \ + const vint16m1_t p##2 = __riscv_vle16_v_i16m1((int16_t *)addr + o1, vl); \ + const vint16m1_t p##3 = __riscv_vle16_v_i16m1((int16_t *)addr - o1, vl) 
+ +#define LOAD_DIR4(p, addr, o0, o1) \ + const vint16m1_t p##0 = \ + load_strided_i16_4x2((int16_t *)addr + o0, CDEF_BSTRIDE, vl); \ + const vint16m1_t p##1 = \ + load_strided_i16_4x2((int16_t *)addr - o0, CDEF_BSTRIDE, vl); \ + const vint16m1_t p##2 = \ + load_strided_i16_4x2((int16_t *)addr + o1, CDEF_BSTRIDE, vl); \ + const vint16m1_t p##3 = \ + load_strided_i16_4x2((int16_t *)addr - o1, CDEF_BSTRIDE, vl) + +#define MAKE_TAPS \ + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; \ + const int16_t tap0 = (int16_t)(pri_taps[0]); \ + const int16_t tap1 = (int16_t)(pri_taps[1]) + +#define CONSTRAIN(p, strength, shift) \ + vint16m1_t p##_c0 = \ + constrain16(p##0, px, (int16_t)strength, (int16_t)shift, vl); \ + vint16m1_t p##_c1 = \ + constrain16(p##1, px, (int16_t)strength, (int16_t)shift, vl); \ + vint16m1_t p##_c2 = \ + constrain16(p##2, px, (int16_t)strength, (int16_t)shift, vl); \ + vint16m1_t p##_c3 = \ + constrain16(p##3, px, (int16_t)strength, (int16_t)shift, vl) + +#define SETUP_MINMAX \ + vint16m1_t max = px; \ + vint16m1_t min = px + +#define MIN_MAX(p) \ + do { \ + max = vmax_mask(p##0, max, vl); \ + min = __riscv_vmin_vv_i16m1(p##0, min, vl); \ + max = vmax_mask(p##1, max, vl); \ + min = __riscv_vmin_vv_i16m1(p##1, min, vl); \ + max = vmax_mask(p##2, max, vl); \ + min = __riscv_vmin_vv_i16m1(p##2, min, vl); \ + max = vmax_mask(p##3, max, vl); \ + min = __riscv_vmin_vv_i16m1(p##3, min, vl); \ + } while (0) + +#define PRI_0_UPDATE_SUM(p) \ + const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ + const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ + sum = __riscv_vmacc_vx_i16m1(sum, tap0, p##sum0, vl); \ + sum = __riscv_vmacc_vx_i16m1(sum, tap1, p##sum1, vl) + +#define UPDATE_SUM(p) \ + const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ + const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ + sum = __riscv_vadd_vv_i16m1(sum, p##sum0, vl); \ + sum = __riscv_vadd_vv_i16m1(sum, p##sum1, vl) + +#define SEC_0_UPDATE_SUM(p) \ + const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ + const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ + const vint16m1_t p##sum2 = __riscv_vadd_vv_i16m1(p##sum0, p##sum1, vl); \ + sum = __riscv_vadd_vv_i16m1(sum, __riscv_vsll_vx_i16m1(p##sum2, 1, vl), vl) + +#define BIAS \ + const vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(sum, 0, vl); \ + const vint16m1_t v_8 = __riscv_vmv_v_x_i16m1(8, vl); \ + const vint16m1_t bias = __riscv_vsub_vx_i16m1_tumu(mask, v_8, v_8, 1, vl); \ + const vint16m1_t unclamped = __riscv_vadd_vv_i16m1( \ + px, __riscv_vsra_vx_i16m1(__riscv_vadd_vv_i16m1(bias, sum, vl), 4, vl), \ + vl) + +#define STORE4 \ + do { \ + store_strided_u8_4x2(dst8, vdst, dstride, vl); \ + \ + in += (CDEF_BSTRIDE << 1); \ + dst8 += (dstride << 1); \ + } while (0) + +#define STORE4_CLAMPED \ + do { \ + BIAS; \ + vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ + __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ + vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ + __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl); \ + STORE4; \ + } while (0) + +#define STORE4_UNCLAMPED \ + do { \ + BIAS; \ + vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ + __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \ + STORE4; \ + } while (0) + +#define STORE8 \ + do { \ + __riscv_vse8_v_u8mf2(dst8, vdst, vl); \ + \ + in += CDEF_BSTRIDE; \ + dst8 += dstride; \ + } while (0) + +#define STORE8_CLAMPED \ + do { \ + BIAS; \ + vint16m1_t clamped = 
__riscv_vmin_vv_i16m1( \ + __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ + vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ + __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl); \ + STORE8; \ + } while (0) + +#define STORE8_UNCLAMPED \ + do { \ + BIAS; \ + vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ + __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \ + STORE8; \ + } while (0) + +#define STORE16_4 \ + do { \ + store_strided_u16_4x2(dst16, vdst, dstride, vl); \ + \ + in += (CDEF_BSTRIDE << 1); \ + dst16 += (dstride << 1); \ + } while (0) + +#define STORE16_4_CLAMPED \ + do { \ + BIAS; \ + vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ + __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \ + STORE16_4; \ + } while (0) + +#define STORE16_4_UNCLAMPED \ + do { \ + BIAS; \ + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \ + STORE16_4; \ + } while (0) + +#define STORE16 \ + do { \ + __riscv_vse16_v_u16m1(dst16, vdst, vl); \ + \ + in += CDEF_BSTRIDE; \ + dst16 += dstride; \ + } while (0) + +#define STORE16_CLAMPED \ + do { \ + BIAS; \ + vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ + __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \ + STORE16; \ + } while (0) + +#define STORE16_UNCLAMPED \ + do { \ + BIAS; \ + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \ + STORE16; \ + } while (0) + +void cdef_filter_8_0_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + MAKE_TAPS; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + SETUP_MINMAX; + + // Primary pass + LOAD_DIR(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + MIN_MAX(p); + PRI_0_UPDATE_SUM(p); + + // Secondary pass 1 + LOAD_DIR(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + MIN_MAX(s); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + MIN_MAX(s2); + UPDATE_SUM(s2); + + // Store + STORE8_CLAMPED; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + SETUP_MINMAX; + + // Primary pass + LOAD_DIR4(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + MIN_MAX(p); + PRI_0_UPDATE_SUM(p); + + // Secondary pass 1 + LOAD_DIR4(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + MIN_MAX(s); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR4(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + MIN_MAX(s2); + UPDATE_SUM(s2); + + // Store + STORE4_CLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_1_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int 
sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + MAKE_TAPS; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + + // Primary pass + LOAD_DIR(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + PRI_0_UPDATE_SUM(p); + + // Store + STORE8_UNCLAMPED; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + + // Primary pass + LOAD_DIR4(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + PRI_0_UPDATE_SUM(p); + + // Store + STORE4_UNCLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_2_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + + // Secondary pass 1 + LOAD_DIR(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + UPDATE_SUM(s2); + + // Store + STORE8_UNCLAMPED; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + + // Secondary pass 1 + LOAD_DIR4(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR4(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + UPDATE_SUM(s2); + + // Store + STORE4_UNCLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_3_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl); + const vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(px, vl); + __riscv_vse8_v_u8mf2(dst8, vdst, vl); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + const vint16m1_t px = + load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); + vuint8mf2_t vdst = + __riscv_vncvt_x_x_w_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(px), vl); + store_strided_u8_4x2(dst8, vdst, dstride, vl); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_0_rvv(void *dest, int 
dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + MAKE_TAPS; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + SETUP_MINMAX; + + // Primary pass + LOAD_DIR(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + MIN_MAX(p); + PRI_0_UPDATE_SUM(p); + + // Secondary pass 1 + LOAD_DIR(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + MIN_MAX(s); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + MIN_MAX(s2); + UPDATE_SUM(s2); + + // Store + STORE16_CLAMPED; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + SETUP_MINMAX; + + // Primary pass + LOAD_DIR4(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + MIN_MAX(p); + PRI_0_UPDATE_SUM(p); + + // Secondary pass 1 + LOAD_DIR4(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + MIN_MAX(s); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR4(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + MIN_MAX(s2); + UPDATE_SUM(s2); + + // Store + STORE16_4_CLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_1_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + MAKE_TAPS; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + + // Primary pass + LOAD_DIR(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + PRI_0_UPDATE_SUM(p); + + // Store + STORE16_UNCLAMPED; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + + // Primary pass + LOAD_DIR4(p, in, po1, po2); + CONSTRAIN(p, pri_strength, pri_damping); + PRI_0_UPDATE_SUM(p); + + // Store + STORE16_4_UNCLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_2_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if 
(block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + LOAD_PIX(in); + + // Secondary pass 1 + LOAD_DIR(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + UPDATE_SUM(s2); + + // Store + STORE16_UNCLAMPED; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + LOAD_PIX4(in); + + // Secondary pass 1 + LOAD_DIR4(s, in, s1o1, s2o1); + CONSTRAIN(s, sec_strength, sec_damping); + SEC_0_UPDATE_SUM(s); + + // Secondary pass 2 + LOAD_DIR4(s2, in, s1o2, s2o2); + CONSTRAIN(s2, sec_strength, sec_damping); + UPDATE_SUM(s2); + + // Store + STORE16_4_UNCLAMPED; + + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_3_rvv(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width; + do { + const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl); + __riscv_vse16_v_u16m1(dst16, px, vl); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + const size_t vl = block_width << 1; + do { + const vint16m1_t px = + load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); + vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(px); + store_strided_u16_4x2(dst16, vdst, dstride, vl); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c index 4e41cc44489e..ecfc86551181 100644 --- a/third_party/aom/av1/common/warped_motion.c +++ b/third_party/aom/av1/common/warped_motion.c @@ -27,7 +27,8 @@ // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS. // We need an extra 2 taps to fit this in, for a total of 8 taps. 
/* clang-format off */ -const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { +const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1] + [8] = { // [-1, 0) { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 }, { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 }, @@ -344,7 +345,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = av1_warped_filter[offs]; + const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { @@ -365,7 +366,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = av1_warped_filter[offs]; + const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { @@ -575,7 +576,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = av1_warped_filter[offs]; + const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { @@ -599,7 +600,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); - const int16_t *coeffs = av1_warped_filter[offs]; + const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h index 00ede2afa77c..fbd24d1adc6d 100644 --- a/third_party/aom/av1/common/warped_motion.h +++ b/third_party/aom/av1/common/warped_motion.h @@ -33,7 +33,14 @@ #define WARP_ERROR_BLOCK_LOG 5 #define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG) -extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; +#if AOM_ARCH_ARM || AOM_ARCH_AARCH64 || AOM_ARCH_X86 || AOM_ARCH_X86_64 +typedef int16_t WarpedFilterCoeff; +#else +typedef int8_t WarpedFilterCoeff; +#endif + +extern const WarpedFilterCoeff + av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; DECLARE_ALIGNED(8, extern const int8_t, av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]); diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c index ae92471b634e..5f3b206e980f 100644 --- a/third_party/aom/av1/encoder/ratectrl.c +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -3822,6 +3822,10 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type, resize_reset_rc(cpi, resize_pending_params->width, resize_pending_params->height, cm->width, cm->height); } + if (svc->temporal_layer_id == 0) { + rc->num_col_blscroll_last_tl0 = 0; + rc->num_row_blscroll_last_tl0 = 0; + } // Set the GF interval and update flag. 
if (!rc->rtc_external_ratectrl) set_gf_interval_update_onepass_rt(cpi, *frame_type); diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h index 0e7ede9b8444..88c048fb7e2b 100644 --- a/third_party/aom/av1/encoder/ratectrl.h +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -200,6 +200,8 @@ typedef struct { int last_target_size_keyframe; int frames_since_scene_change; int perc_spatial_flat_blocks; + int num_col_blscroll_last_tl0; + int num_row_blscroll_last_tl0; int avg_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation used for any frame diff --git a/third_party/aom/av1/encoder/var_based_part.c b/third_party/aom/av1/encoder/var_based_part.c index bc00cf05bfbe..37b295dba632 100644 --- a/third_party/aom/av1/encoder/var_based_part.c +++ b/third_party/aom/av1/encoder/var_based_part.c @@ -1325,6 +1325,53 @@ static inline void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x, } } +static void do_int_pro_motion_estimation(AV1_COMP *cpi, MACROBLOCK *x, + unsigned int *y_sad, int mi_row, + int mi_col, int source_sad_nonrd) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mi = xd->mi[0]; + const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; + const int increase_col_sw = source_sad_nonrd > kMedSad && + !cpi->rc.high_motion_content_screen_rtc && + (cpi->svc.temporal_layer_id == 0 || + cpi->rc.num_col_blscroll_last_tl0 > 2); + int me_search_size_col = is_screen + ? increase_col_sw ? 512 : 96 + : block_size_wide[cm->seq_params->sb_size] >> 1; + // For screen use larger search size row motion to capture + // vertical scroll, which can be larger motion. + int me_search_size_row = is_screen + ? source_sad_nonrd > kMedSad ? 512 : 192 + : block_size_high[cm->seq_params->sb_size] >> 1; + unsigned int y_sad_zero; + *y_sad = av1_int_pro_motion_estimation( + cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv, &y_sad_zero, + me_search_size_col, me_search_size_row); + // The logic below selects whether the motion estimated in the + // int_pro_motion() will be used in nonrd_pickmode. Only do this + // for screen for now. + if (is_screen) { + unsigned int thresh_sad = + (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000; + if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) { + x->sb_me_partition = 1; + x->sb_me_mv.as_int = mi->mv[0].as_int; + if (cpi->svc.temporal_layer_id == 0) { + if (abs(mi->mv[0].as_mv.col) > 16 && abs(mi->mv[0].as_mv.row) == 0) + cpi->rc.num_col_blscroll_last_tl0++; + else if (abs(mi->mv[0].as_mv.row) > 16 && abs(mi->mv[0].as_mv.col) == 0) + cpi->rc.num_row_blscroll_last_tl0++; + } + } else { + x->sb_me_partition = 0; + // Fall back to using zero motion. + *y_sad = y_sad_zero; + mi->mv[0].as_int = 0; + } + } +} + static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, unsigned int *y_sad_g, unsigned int *y_sad_alt, unsigned int *y_sad_last, @@ -1418,42 +1465,11 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, // so for now force it to 2 based on superblock sad. if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2; - if (est_motion == 1 || est_motion == 2) { - if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) { - // For screen only do int_pro_motion for spatial variance above - // threshold and motion level above LowSad. 
-        if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
-          int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
-          int me_search_size_col =
-              is_screen ? source_sad_nonrd > kMedSad ? 160 : 96
-                        : block_size_wide[cm->seq_params->sb_size] >> 1;
-          // For screen use larger search size row motion to capture
-          // vertical scroll, which can be larger motion.
-          int me_search_size_row =
-              is_screen ? source_sad_nonrd > kMedSad ? 512 : 192
-                        : block_size_high[cm->seq_params->sb_size] >> 1;
-          unsigned int y_sad_zero;
-          *y_sad = av1_int_pro_motion_estimation(
-              cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
-              &y_sad_zero, me_search_size_col, me_search_size_row);
-          // The logic below selects whether the motion estimated in the
-          // int_pro_motion() will be used in nonrd_pickmode. Only do this
-          // for screen for now.
-          if (is_screen) {
-            unsigned int thresh_sad =
-                (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
-            if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
-              x->sb_me_partition = 1;
-              x->sb_me_mv.as_int = mi->mv[0].as_int;
-            } else {
-              x->sb_me_partition = 0;
-              // Fall back to using zero motion.
-              *y_sad = y_sad_zero;
-              mi->mv[0].as_int = 0;
-            }
-          }
-        }
-      }
-    }
+    if ((est_motion == 1 || est_motion == 2) && xd->mb_to_right_edge >= 0 &&
+        xd->mb_to_bottom_edge >= 0 && x->source_variance > 100 &&
+        source_sad_nonrd > kLowSad) {
+      do_int_pro_motion_estimation(cpi, x, y_sad, mi_row, mi_col,
+                                   source_sad_nonrd);
     }
 
     if (*y_sad == UINT_MAX) {
diff --git a/third_party/aom/build/cmake/aom_config_defaults.cmake b/third_party/aom/build/cmake/aom_config_defaults.cmake
index 43ada3626461..b78c9ec98fba 100644
--- a/third_party/aom/build/cmake/aom_config_defaults.cmake
+++ b/third_party/aom/build/cmake/aom_config_defaults.cmake
@@ -26,6 +26,7 @@ set_aom_detect_var(AOM_ARCH_ARM 0 "Enables ARM architecture.")
 set_aom_detect_var(AOM_ARCH_PPC 0 "Enables PPC architecture.")
 set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.")
 set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.")
+set_aom_detect_var(AOM_ARCH_RISCV 0 "Enables RISC-V architecture.")
 
 # Arm/AArch64 feature flags.
 set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.")
@@ -51,6 +52,9 @@ set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.")
 set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.")
 set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.")
 
+# RISC-V64 feature flags.
+set_aom_detect_var(HAVE_RVV 0 "Enables RVV optimizations.")
+
 # Flags describing the build environment.
 set_aom_detect_var(HAVE_FEXCEPT 0
                    "Internal flag, GNU fenv.h present for target.")
@@ -241,3 +245,6 @@ set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets."
                    ON)
 set_aom_option_var(ENABLE_AVX2
                    "Enables AVX2 optimizations on x86/x86_64 targets." ON)
+
+# RVV intrinsics flags.
+set_aom_option_var(ENABLE_RVV "Enables RVV optimizations on RISC-V targets." ON)
diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake
index 7350652f0040..572429545923 100644
--- a/third_party/aom/build/cmake/aom_configure.cmake
+++ b/third_party/aom/build/cmake/aom_configure.cmake
@@ -75,6 +75,8 @@ if(NOT AOM_TARGET_CPU)
     set(AOM_TARGET_CPU "arm64")
   elseif(cpu_lowercase MATCHES "^ppc")
     set(AOM_TARGET_CPU "ppc")
+  elseif(cpu_lowercase MATCHES "^riscv")
+    set(AOM_TARGET_CPU "riscv")
   else()
     message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
                     "supported, falling back to the generic target")
diff --git a/third_party/aom/build/cmake/cpu.cmake b/third_party/aom/build/cmake/cpu.cmake
index 84a3ae9b49bb..59826617ad24 100644
--- a/third_party/aom/build/cmake/cpu.cmake
+++ b/third_party/aom/build/cmake/cpu.cmake
@@ -132,4 +132,15 @@ elseif("${AOM_TARGET_CPU}" MATCHES "^x86")
       set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
     endif()
   endforeach()
+elseif("${AOM_TARGET_CPU}" MATCHES "riscv")
+  set(AOM_ARCH_RISCV64 1)
+  set(RTCD_ARCH_RISCV64 "yes")
+
+  if(ENABLE_RVV)
+    set(HAVE_RVV 1)
+    set(RTCD_HAVE_RVV "yes")
+  else()
+    set(HAVE_RVV 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-rvv)
+  endif()
 endif()
diff --git a/third_party/aom/build/cmake/rtcd.pl b/third_party/aom/build/cmake/rtcd.pl
index 464d1986c78c..6cd6a5ca3674 100755
--- a/third_party/aom/build/cmake/rtcd.pl
+++ b/third_party/aom/build/cmake/rtcd.pl
@@ -370,6 +370,36 @@ EOF
   common_bottom;
 }
 
+sub riscv() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <