Bug 1945604 - Update aom to 3990233fc06a35944d6d33797e63931802122a95 r=padenot

Differential Revision: https://phabricator.services.mozilla.com/D236581
Updatebot
2025-02-04 13:56:57 +00:00
parent 6817e402c2
commit 82b87cc7a3
52 changed files with 2695 additions and 995 deletions

View File

@@ -12,6 +12,7 @@
AOM_ARCH_AARCH64 equ 0
AOM_ARCH_ARM equ 0
AOM_ARCH_PPC equ 0
AOM_ARCH_RISCV equ 0
AOM_ARCH_X86 equ 0
AOM_ARCH_X86_64 equ 0
CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 0
HAVE_NEON equ 0
HAVE_NEON_DOTPROD equ 0
HAVE_NEON_I8MM equ 0
HAVE_RVV equ 0
HAVE_SSE equ 0
HAVE_SSE2 equ 0
HAVE_SSE3 equ 0

View File

@@ -14,6 +14,7 @@
#define AOM_ARCH_AARCH64 0
#define AOM_ARCH_ARM 0
#define AOM_ARCH_PPC 0
#define AOM_ARCH_RISCV 0
#define AOM_ARCH_X86 0
#define AOM_ARCH_X86_64 0
#define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
#define HAVE_NEON 0
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_RVV 0
#define HAVE_SSE 0
#define HAVE_SSE2 0
#define HAVE_SSE3 0

View File

@@ -12,6 +12,7 @@
.equ AOM_ARCH_AARCH64, 0
.equ AOM_ARCH_ARM, 1
.equ AOM_ARCH_PPC, 0
.equ AOM_ARCH_RISCV, 0
.equ AOM_ARCH_X86, 0
.equ AOM_ARCH_X86_64, 0
.equ CONFIG_ACCOUNTING, 0
@@ -82,6 +83,7 @@
.equ HAVE_NEON, 1
.equ HAVE_NEON_DOTPROD, 0
.equ HAVE_NEON_I8MM, 0
.equ HAVE_RVV, 0
.equ HAVE_SSE, 0
.equ HAVE_SSE2, 0
.equ HAVE_SSE3, 0

View File

@@ -14,6 +14,7 @@
#define AOM_ARCH_AARCH64 0
#define AOM_ARCH_ARM 1
#define AOM_ARCH_PPC 0
#define AOM_ARCH_RISCV 0
#define AOM_ARCH_X86 0
#define AOM_ARCH_X86_64 0
#define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
#define HAVE_NEON 1
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_RVV 0
#define HAVE_SSE 0
#define HAVE_SSE2 0
#define HAVE_SSE3 0

View File

@@ -12,6 +12,7 @@
AOM_ARCH_AARCH64 equ 0
AOM_ARCH_ARM equ 0
AOM_ARCH_PPC equ 0
AOM_ARCH_RISCV equ 0
AOM_ARCH_X86 equ 1
AOM_ARCH_X86_64 equ 0
CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
HAVE_NEON equ 0
HAVE_NEON_DOTPROD equ 0
HAVE_NEON_I8MM equ 0
HAVE_RVV equ 0
HAVE_SSE equ 1
HAVE_SSE2 equ 1
HAVE_SSE3 equ 1

View File

@@ -14,6 +14,7 @@
#define AOM_ARCH_AARCH64 0
#define AOM_ARCH_ARM 0
#define AOM_ARCH_PPC 0
#define AOM_ARCH_RISCV 0
#define AOM_ARCH_X86 1
#define AOM_ARCH_X86_64 0
#define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
#define HAVE_NEON 0
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_RVV 0
#define HAVE_SSE 1
#define HAVE_SSE2 1
#define HAVE_SSE3 1

View File

@@ -12,6 +12,7 @@
AOM_ARCH_AARCH64 equ 0
AOM_ARCH_ARM equ 0
AOM_ARCH_PPC equ 0
AOM_ARCH_RISCV equ 0
AOM_ARCH_X86 equ 0
AOM_ARCH_X86_64 equ 1
CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
HAVE_NEON equ 0
HAVE_NEON_DOTPROD equ 0
HAVE_NEON_I8MM equ 0
HAVE_RVV equ 0
HAVE_SSE equ 1
HAVE_SSE2 equ 1
HAVE_SSE3 equ 1

View File

@@ -14,6 +14,7 @@
#define AOM_ARCH_AARCH64 0
#define AOM_ARCH_ARM 0
#define AOM_ARCH_PPC 0
#define AOM_ARCH_RISCV 0
#define AOM_ARCH_X86 0
#define AOM_ARCH_X86_64 1
#define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
#define HAVE_NEON 0
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_RVV 0
#define HAVE_SSE 1
#define HAVE_SSE2 1
#define HAVE_SSE3 1

View File

@@ -12,6 +12,7 @@
AOM_ARCH_AARCH64 equ 1
AOM_ARCH_ARM equ 1
AOM_ARCH_PPC equ 0
AOM_ARCH_RISCV equ 0
AOM_ARCH_X86 equ 0
AOM_ARCH_X86_64 equ 0
CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 0
HAVE_NEON equ 1
HAVE_NEON_DOTPROD equ 1
HAVE_NEON_I8MM equ 1
HAVE_RVV equ 0
HAVE_SSE equ 0
HAVE_SSE2 equ 0
HAVE_SSE3 equ 0

View File

@@ -14,6 +14,7 @@
#define AOM_ARCH_AARCH64 1
#define AOM_ARCH_ARM 1
#define AOM_ARCH_PPC 0
#define AOM_ARCH_RISCV 0
#define AOM_ARCH_X86 0
#define AOM_ARCH_X86_64 0
#define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
#define HAVE_NEON 1
#define HAVE_NEON_DOTPROD 1
#define HAVE_NEON_I8MM 1
#define HAVE_RVV 0
#define HAVE_SSE 0
#define HAVE_SSE2 0
#define HAVE_SSE3 0

View File

@@ -12,6 +12,7 @@
AOM_ARCH_AARCH64 equ 0
AOM_ARCH_ARM equ 0
AOM_ARCH_PPC equ 0
AOM_ARCH_RISCV equ 0
AOM_ARCH_X86 equ 0
AOM_ARCH_X86_64 equ 1
CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
HAVE_NEON equ 0
HAVE_NEON_DOTPROD equ 0
HAVE_NEON_I8MM equ 0
HAVE_RVV equ 0
HAVE_SSE equ 1
HAVE_SSE2 equ 1
HAVE_SSE3 equ 1

View File

@@ -14,6 +14,7 @@
#define AOM_ARCH_AARCH64 0
#define AOM_ARCH_ARM 0
#define AOM_ARCH_PPC 0
#define AOM_ARCH_RISCV 0
#define AOM_ARCH_X86 0
#define AOM_ARCH_X86_64 1
#define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
#define HAVE_NEON 0
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_RVV 0
#define HAVE_SSE 1
#define HAVE_SSE2 1
#define HAVE_SSE3 1

View File

@@ -12,6 +12,7 @@
AOM_ARCH_AARCH64 equ 0
AOM_ARCH_ARM equ 0
AOM_ARCH_PPC equ 0
AOM_ARCH_RISCV equ 0
AOM_ARCH_X86 equ 1
AOM_ARCH_X86_64 equ 0
CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
HAVE_NEON equ 0
HAVE_NEON_DOTPROD equ 0
HAVE_NEON_I8MM equ 0
HAVE_RVV equ 0
HAVE_SSE equ 1
HAVE_SSE2 equ 1
HAVE_SSE3 equ 1

View File

@@ -14,6 +14,7 @@
#define AOM_ARCH_AARCH64 0
#define AOM_ARCH_ARM 0
#define AOM_ARCH_PPC 0
#define AOM_ARCH_RISCV 0
#define AOM_ARCH_X86 1
#define AOM_ARCH_X86_64 0
#define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
#define HAVE_NEON 0
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_RVV 0
#define HAVE_SSE 1
#define HAVE_SSE2 1
#define HAVE_SSE3 1

View File

@@ -12,6 +12,7 @@
AOM_ARCH_AARCH64 equ 0
AOM_ARCH_ARM equ 0
AOM_ARCH_PPC equ 0
AOM_ARCH_RISCV equ 0
AOM_ARCH_X86 equ 0
AOM_ARCH_X86_64 equ 1
CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
HAVE_NEON equ 0
HAVE_NEON_DOTPROD equ 0
HAVE_NEON_I8MM equ 0
HAVE_RVV equ 0
HAVE_SSE equ 1
HAVE_SSE2 equ 1
HAVE_SSE3 equ 1

View File

@@ -14,6 +14,7 @@
#define AOM_ARCH_AARCH64 0
#define AOM_ARCH_ARM 0
#define AOM_ARCH_PPC 0
#define AOM_ARCH_RISCV 0
#define AOM_ARCH_X86 0
#define AOM_ARCH_X86_64 1
#define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
#define HAVE_NEON 0
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_RVV 0
#define HAVE_SSE 1
#define HAVE_SSE2 1
#define HAVE_SSE3 1

View File

@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
-release: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed (Sun Jan 05 09:13:09 2025 -0800).
+release: 3990233fc06a35944d6d33797e63931802122a95 (Thu Jan 30 11:32:16 2025 -0800).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
-revision: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed
+revision: 3990233fc06a35944d6d33797e63931802122a95
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View File

@@ -333,6 +333,12 @@ if(CONFIG_AV1_ENCODER)
# libaom static library.
if(BUILD_SHARED_LIBS)
target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom_static)
# TODO: https://aomedia.issues.chromium.org/391715078 - This condition can
# be removed after aom_av1_rc restricts its symbol visibility.
if(CYGWIN OR MINGW)
target_link_options(aom_av1_rc ${AOM_LIB_LINK_TYPE}
LINKER:--allow-multiple-definition)
endif()
else()
target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom)
endif()
@@ -858,8 +864,8 @@ if(BUILD_SHARED_LIBS)
# errors (don't use it with AddressSanitizer)." See
# https://clang.llvm.org/docs/AddressSanitizer.html#usage. Similarly, see
# https://clang.llvm.org/docs/MemorySanitizer.html#usage.
-if(NOT WIN32
-AND NOT APPLE
+if(NOT
+(APPLE OR CYGWIN OR WIN32)
AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE))
# The -z defs linker option reports unresolved symbol references from object
# files when building a shared library.

View File

@@ -60,7 +60,9 @@ README.md {#LREADME}
present, yasm will be used by default. Pass -DENABLE_NASM=ON to cmake to
select nasm.) If you download yasm with the intention to work with Visual
Studio, please download win32.exe or win64.exe and rename it into yasm.exe.
-DO NOT download or use vsyasm.exe.
+DO NOT download or use vsyasm.exe. The MSYS2 version of the yasm binary can
+also be used and avoids an issue caused by a missing Visual C++
+Redistributable install (Visual Studio 2010, MSVCR100.dll).
6. Building the documentation requires
[doxygen version 1.8.10 or newer](http://doxygen.org).
7. Emscripten builds require the portable

View File

@@ -10,7 +10,6 @@ text aom_codec_set_option
text aom_codec_version
text aom_codec_version_extra_str
text aom_codec_version_str
text aom_free
text aom_img_add_metadata
text aom_img_alloc
text aom_img_alloc_with_border
@@ -25,7 +24,6 @@ text aom_img_plane_width
text aom_img_remove_metadata
text aom_img_set_rect
text aom_img_wrap
text aom_malloc
text aom_rb_bytes_read
text aom_rb_read_bit
text aom_rb_read_literal

File diff suppressed because it is too large

View File

@@ -146,473 +146,393 @@ static inline uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
return mask_8x8;
}
-static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
-uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
-uint8x8_t *p0q0, const uint8_t blimit,
-const uint8_t limit, const uint8_t thresh) {
+static inline void filter4(const uint8x8_t p0q0, const uint8x8_t p1q1,
+uint8x8_t *p0q0_output, uint8x8_t *p1q1_output,
+uint8x8_t mask_8x8, const uint8_t thresh) {
uint16x8_t out;
uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
out_f14_pq5;
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
uint8x8_t out_f4_pq0, out_f4_pq1;
uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
uint8x8_t q0p0, q1p1, q2p2;
// Calculate filter masks
mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
{
// filter 4
int32x2x2_t ps0_qs0, ps1_qs1;
int16x8_t filter_s16;
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
uint8x8_t temp0_8x8, temp1_8x8;
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
int8x8_t op0, oq0, op1, oq1;
int8x8_t pq_s0, pq_s1;
int8x8_t filter_s8, filter1_s8, filter2_s8;
int8x8_t hev_8x8;
const int8x8_t sign_mask = vdup_n_s8(0x80);
const int8x8_t val_4 = vdup_n_s8(4);
const int8x8_t val_3 = vdup_n_s8(3);
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
// hev_mask
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
// add outer taps if we have high edge variance
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
filter_s8 = vand_s8(filter_s8, hev_8x8);
// inner taps
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
filter_s16 = vmovl_s8(filter_s8);
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
filter_s8 = vqmovn_s16(filter_s16);
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
filter1_s8 = vqadd_s8(filter_s8, val_4);
filter2_s8 = vqadd_s8(filter_s8, val_3);
filter1_s8 = vshr_n_s8(filter1_s8, 3);
filter2_s8 = vshr_n_s8(filter2_s8, 3);
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
hev_8x8 = vmvn_s8(hev_8x8);
filter_s8 = vrshr_n_s8(filter1_s8, 1);
filter_s8 = vand_s8(filter_s8, hev_8x8);
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
}
// reverse p and q
q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
{
// filter 8
uint16x8_t out_pq0, out_pq1, out_pq2;
out = vaddl_u8(*p3q3, *p2q2);
out = vaddw_u8(out, *p1q1);
out = vaddw_u8(out, *p0q0);
out = vaddw_u8(out, q0p0);
out_pq1 = vaddw_u8(out, *p3q3);
out_pq2 = vaddw_u8(out_pq1, *p3q3);
out_pq2 = vaddw_u8(out_pq2, *p2q2);
out_pq1 = vaddw_u8(out_pq1, *p1q1);
out_pq1 = vaddw_u8(out_pq1, q1p1);
out_pq0 = vaddw_u8(out, *p0q0);
out_pq0 = vaddw_u8(out_pq0, q1p1);
out_pq0 = vaddw_u8(out_pq0, q2p2);
out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
}
{
// filter 14
uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
uint16x8_t p6q6_2, p6q6_temp, qp_sum;
uint8x8_t qp_rev;
out = vaddw_u8(out, *p4q4);
out = vaddw_u8(out, *p5q5);
out = vaddw_u8(out, *p6q6);
out_pq5 = vaddw_u8(out, *p4q4);
out_pq4 = vaddw_u8(out_pq5, *p3q3);
out_pq3 = vaddw_u8(out_pq4, *p2q2);
out_pq5 = vaddw_u8(out_pq5, *p5q5);
out_pq4 = vaddw_u8(out_pq4, *p5q5);
out_pq0 = vaddw_u8(out, *p1q1);
out_pq1 = vaddw_u8(out_pq0, *p2q2);
out_pq2 = vaddw_u8(out_pq1, *p3q3);
out_pq0 = vaddw_u8(out_pq0, *p0q0);
out_pq1 = vaddw_u8(out_pq1, *p0q0);
out_pq1 = vaddw_u8(out_pq1, *p6q6);
p6q6_2 = vaddl_u8(*p6q6, *p6q6);
out_pq2 = vaddq_u16(out_pq2, p6q6_2);
p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
out_pq4 = vaddw_u8(out_pq4, q1p1);
qp_sum = vaddl_u8(q2p2, q1p1);
out_pq3 = vaddq_u16(out_pq3, qp_sum);
qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
qp_sum = vaddw_u8(qp_sum, qp_rev);
out_pq2 = vaddq_u16(out_pq2, qp_sum);
qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
qp_sum = vaddw_u8(qp_sum, qp_rev);
out_pq1 = vaddq_u16(out_pq1, qp_sum);
qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
qp_sum = vaddw_u8(qp_sum, qp_rev);
out_pq0 = vaddq_u16(out_pq0, qp_sum);
out_pq0 = vaddw_u8(out_pq0, q0p0);
out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
}
{
uint8x8_t filter4_cond, filter8_cond, filter14_cond;
filter8_cond = vand_u8(flat_8x8, mask_8x8);
filter4_cond = vmvn_u8(filter8_cond);
filter14_cond = vand_u8(filter8_cond, flat2_8x8);
// filter4 outputs
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
// filter8 outputs
*p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
// filter14 outputs
*p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
*p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
*p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
*p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
*p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
*p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
}
}
static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
uint8x8_t *p0q0, const uint8_t blimit,
const uint8_t limit, const uint8_t thresh) {
uint16x8_t out;
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
uint8x8_t out_f4_pq0, out_f4_pq1;
uint8x8_t mask_8x8, flat_8x8;
// Calculate filter masks
mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
{
// filter 4
int32x2x2_t ps0_qs0, ps1_qs1;
int16x8_t filter_s16;
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
uint8x8_t temp0_8x8, temp1_8x8;
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
int8x8_t op0, oq0, op1, oq1;
int8x8_t pq_s0, pq_s1;
int8x8_t filter_s8, filter1_s8, filter2_s8;
int8x8_t hev_8x8;
const int8x8_t sign_mask = vdup_n_s8(0x80);
const int8x8_t val_4 = vdup_n_s8(4);
const int8x8_t val_3 = vdup_n_s8(3);
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
// hev_mask
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
// add outer taps if we have high edge variance
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
filter_s8 = vand_s8(filter_s8, hev_8x8);
// inner taps
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
filter_s16 = vmovl_s8(filter_s8);
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
filter_s8 = vqmovn_s16(filter_s16);
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
filter1_s8 = vqadd_s8(filter_s8, val_4);
filter2_s8 = vqadd_s8(filter_s8, val_3);
filter1_s8 = vshr_n_s8(filter1_s8, 3);
filter2_s8 = vshr_n_s8(filter2_s8, 3);
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
hev_8x8 = vmvn_s8(hev_8x8);
filter_s8 = vrshr_n_s8(filter1_s8, 1);
filter_s8 = vand_s8(filter_s8, hev_8x8);
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
}
{
// filter 8
uint16x8_t out_pq0, out_pq1, out_pq2;
uint8x8_t q0p0, q1p1, q2p2;
out = vaddl_u8(*p3q3, *p2q2);
out = vaddw_u8(out, *p1q1);
out = vaddw_u8(out, *p0q0);
// reverse p and q
q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
out = vaddw_u8(out, q0p0);
out_pq1 = vaddw_u8(out, *p3q3);
out_pq2 = vaddw_u8(out_pq1, *p3q3);
out_pq2 = vaddw_u8(out_pq2, *p2q2);
out_pq1 = vaddw_u8(out_pq1, *p1q1);
out_pq1 = vaddw_u8(out_pq1, q1p1);
out_pq0 = vaddw_u8(out, *p0q0);
out_pq0 = vaddw_u8(out_pq0, q1p1);
out_pq0 = vaddw_u8(out_pq0, q2p2);
out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
}
{
uint8x8_t filter4_cond, filter8_cond;
filter8_cond = vand_u8(flat_8x8, mask_8x8);
filter4_cond = vmvn_u8(filter8_cond);
// filter4 outputs
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
// filter8 outputs
*p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
}
}
static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
const uint8_t blimit, const uint8_t limit,
const uint8_t thresh) {
uint16x8_t out;
uint8x8_t out_f6_pq0, out_f6_pq1;
uint8x8_t out_f4_pq0, out_f4_pq1;
uint8x8_t mask_8x8, flat_8x8;
// Calculate filter masks
mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
{
// filter 4
int32x2x2_t ps0_qs0, ps1_qs1;
int16x8_t filter_s16;
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
uint8x8_t temp0_8x8, temp1_8x8;
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
int8x8_t op0, oq0, op1, oq1;
int8x8_t pq_s0, pq_s1;
int8x8_t filter_s8, filter1_s8, filter2_s8;
int8x8_t hev_8x8;
const int8x8_t sign_mask = vdup_n_s8(0x80);
const int8x8_t val_4 = vdup_n_s8(4);
const int8x8_t val_3 = vdup_n_s8(3);
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
// hev_mask
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
// add outer taps if we have high edge variance
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
filter_s8 = vand_s8(filter_s8, hev_8x8);
// inner taps
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
filter_s16 = vmovl_s8(filter_s8);
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
filter_s8 = vqmovn_s16(filter_s16);
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
filter1_s8 = vqadd_s8(filter_s8, val_4);
filter2_s8 = vqadd_s8(filter_s8, val_3);
filter1_s8 = vshr_n_s8(filter1_s8, 3);
filter2_s8 = vshr_n_s8(filter2_s8, 3);
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
filter_s8 = vrshr_n_s8(filter1_s8, 1);
filter_s8 = vbic_s8(filter_s8, hev_8x8);
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
}
{
// filter 6
uint16x8_t out_pq0, out_pq1;
uint8x8_t pq_rev;
out = vaddl_u8(*p0q0, *p1q1);
out = vaddq_u16(out, out);
out = vaddw_u8(out, *p2q2);
pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
out = vaddw_u8(out, pq_rev);
out_pq0 = vaddw_u8(out, pq_rev);
pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
out_pq0 = vaddw_u8(out_pq0, pq_rev);
out_pq1 = vaddw_u8(out, *p2q2);
out_pq1 = vaddw_u8(out_pq1, *p2q2);
out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
}
{
uint8x8_t filter4_cond, filter6_cond;
filter6_cond = vand_u8(flat_8x8, mask_8x8);
filter4_cond = vmvn_u8(filter6_cond);
// filter4 outputs
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
// filter6 outputs
*p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
*p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
}
}
static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
const uint8_t limit, const uint8_t thresh) {
int32x2x2_t ps0_qs0, ps1_qs1;
int16x8_t filter_s16;
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
-uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
-int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
-int8x8_t op0, oq0, op1, oq1;
-int8x8_t pq_s0, pq_s1;
-int8x8_t filter_s8, filter1_s8, filter2_s8;
-int8x8_t hev_8x8;
const int8x8_t sign_mask = vdup_n_s8(0x80);
const int8x8_t val_4 = vdup_n_s8(4);
const int8x8_t val_3 = vdup_n_s8(3);
-// Calculate filter mask
-mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
-pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
-pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
-ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
-ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
-ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
-qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
-ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
-qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+int8x8_t pq_s0 = veor_s8(vreinterpret_s8_u8(p0q0), sign_mask);
+int8x8_t pq_s1 = veor_s8(vreinterpret_s8_u8(p1q1), sign_mask);
+int32x2x2_t ps0_qs0 =
+vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+int32x2x2_t ps1_qs1 =
+vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+int8x8_t ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+int8x8_t qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+int8x8_t ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+int8x8_t qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
// hev_mask
-temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
-temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
-hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+uint8x8_t temp0_8x8 = vcgt_u8(vabd_u8(p0q0, p1q1), thresh_f4);
+uint8x8_t temp1_8x8 =
+vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+int8x8_t hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
// add outer taps if we have high edge variance
-filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+int8x8_t filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
filter_s8 = vand_s8(filter_s8, hev_8x8);
// inner taps
-temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
-filter_s16 = vmovl_s8(filter_s8);
+int8x8_t temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+int16x8_t filter_s16 = vmovl_s8(filter_s8);
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
filter_s8 = vqmovn_s16(filter_s16);
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
-filter1_s8 = vqadd_s8(filter_s8, val_4);
-filter2_s8 = vqadd_s8(filter_s8, val_3);
+int8x8_t filter1_s8 = vqadd_s8(filter_s8, val_4);
+int8x8_t filter2_s8 = vqadd_s8(filter_s8, val_3);
filter1_s8 = vshr_n_s8(filter1_s8, 3);
filter2_s8 = vshr_n_s8(filter2_s8, 3);
-oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
-op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+int8x8_t oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+int8x8_t op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
filter_s8 = vrshr_n_s8(filter1_s8, 1);
filter_s8 = vbic_s8(filter_s8, hev_8x8);
-oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
-op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
-*p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
-*p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+int8x8_t oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+int8x8_t op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+*p0q0_output = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+*p1q1_output = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
}
static inline void filter8(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8x8_t p2q2, const uint8x8_t p3q3,
uint8x8_t *p0q0_output, uint8x8_t *p1q1_output,
uint8x8_t *p2q2_output) {
// Reverse p and q.
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4);
uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4);
uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4);
uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1);
uint16x8_t p2q2_p3q3 = vaddl_u8(p3q3, p2q2);
uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3);
uint16x8_t q0p0_p3q3 = vaddl_u8(q0p0, p3q3);
uint16x8_t out_q0p0_p3q3 = vaddq_u16(out, q0p0_p3q3);
uint16x8_t out_pq2 = vaddq_u16(out_q0p0_p3q3, p2q2_p3q3);
uint16x8_t p1q1_q1p1 = vaddl_u8(p1q1, q1p1);
uint16x8_t out_pq1 = vaddq_u16(out_q0p0_p3q3, p1q1_q1p1);
uint16x8_t q0p0_p0q0 = vaddl_u8(q0p0, p0q0);
uint16x8_t q1p1_q2p2 = vaddl_u8(q1p1, q2p2);
uint16x8_t out_pq0 = vaddq_u16(q0p0_p0q0, q1p1_q2p2);
out_pq0 = vaddq_u16(out_pq0, out);
*p0q0_output = vrshrn_n_u16(out_pq0, 3);
*p1q1_output = vrshrn_n_u16(out_pq1, 3);
*p2q2_output = vrshrn_n_u16(out_pq2, 3);
}
static inline void filter14(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8x8_t p2q2, const uint8x8_t p3q3,
const uint8x8_t p4q4, const uint8x8_t p5q5,
const uint8x8_t p6q6, uint8x8_t *p0q0_output,
uint8x8_t *p1q1_output, uint8x8_t *p2q2_output,
uint8x8_t *p3q3_output, uint8x8_t *p4q4_output,
uint8x8_t *p5q5_output) {
// Reverse p and q.
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4);
uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4);
uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4);
uint8x8_t q3p3 = vext_u8(p3q3, p3q3, 4);
uint8x8_t q4p4 = vext_u8(p4q4, p4q4, 4);
uint8x8_t q5p5 = vext_u8(p5q5, p5q5, 4);
uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1);
uint16x8_t p2q2_p3q3 = vaddl_u8(p2q2, p3q3);
uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3);
uint16x8_t q0p0_p4q4 = vaddl_u8(q0p0, p4q4);
uint16x8_t p5q5_p6q6 = vaddl_u8(p5q5, p6q6);
uint16x8_t tmp = vaddq_u16(q0p0_p4q4, p5q5_p6q6);
// This offset removes the need for a rounding shift at the end.
uint16x8_t tmp_offset = vaddq_u16(tmp, vdupq_n_u16(1 << 3));
out = vaddq_u16(out, tmp_offset);
uint16x8_t out_pq5 = vaddw_u8(out, p4q4);
uint16x8_t out_pq4 = vaddw_u8(out_pq5, p3q3);
uint16x8_t out_pq3 = vaddw_u8(out_pq4, p2q2);
out_pq5 = vaddw_u8(out_pq5, p5q5);
uint16x8_t out_pq0 = vaddw_u8(out, p1q1);
uint16x8_t out_pq1 = vaddw_u8(out_pq0, p2q2);
uint16x8_t out_pq2 = vaddw_u8(out_pq1, p3q3);
uint16x8_t p0q0_q0p0 = vaddl_u8(p0q0, q0p0);
out_pq0 = vaddq_u16(out_pq0, p0q0_q0p0);
uint16x8_t p0q0_p6q6 = vaddl_u8(p0q0, p6q6);
out_pq1 = vaddq_u16(out_pq1, p0q0_p6q6);
uint16x8_t p5q5_q1p1 = vaddl_u8(p5q5, q1p1);
out_pq4 = vaddq_u16(out_pq4, p5q5_q1p1);
uint16x8_t p6q6_p6q6 = vaddl_u8(p6q6, p6q6);
out_pq2 = vaddq_u16(out_pq2, p6q6_p6q6);
uint16x8_t p6q6_temp = vaddw_u8(p6q6_p6q6, p6q6);
out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
p6q6_temp = vaddw_u8(p6q6_temp, p6q6);
out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
p6q6_temp = vaddq_u16(p6q6_temp, p6q6_p6q6);
out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
uint16x8_t qp_sum = vaddl_u8(q2p2, q1p1);
out_pq3 = vaddq_u16(out_pq3, qp_sum);
qp_sum = vaddw_u8(qp_sum, q3p3);
out_pq2 = vaddq_u16(out_pq2, qp_sum);
qp_sum = vaddw_u8(qp_sum, q4p4);
out_pq1 = vaddq_u16(out_pq1, qp_sum);
qp_sum = vaddw_u8(qp_sum, q5p5);
out_pq0 = vaddq_u16(out_pq0, qp_sum);
*p0q0_output = vshrn_n_u16(out_pq0, 4);
*p1q1_output = vshrn_n_u16(out_pq1, 4);
*p2q2_output = vshrn_n_u16(out_pq2, 4);
*p3q3_output = vshrn_n_u16(out_pq3, 4);
*p4q4_output = vshrn_n_u16(out_pq4, 4);
*p5q5_output = vshrn_n_u16(out_pq5, 4);
}
static inline void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5,
uint8x8_t *p4q4, uint8x8_t *p3q3,
uint8x8_t *p2q2, uint8x8_t *p1q1,
uint8x8_t *p0q0, const uint8_t blimit,
const uint8_t limit, const uint8_t thresh) {
uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
out_f14_pq5;
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
uint8x8_t out_f4_pq0, out_f4_pq1;
// Calculate filter masks.
uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
uint8x8_t flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
// No filtering.
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
return;
}
uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8);
uint8x8_t filter4_cond = vmvn_u8(filter8_cond);
uint8x8_t filter14_cond = vand_u8(filter8_cond, flat2_8x8);
if (vget_lane_s64(vreinterpret_s64_u8(filter14_cond), 0) == -1) {
// Only filter14() applies.
filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0,
&out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4,
&out_f14_pq5);
*p0q0 = out_f14_pq0;
*p1q1 = out_f14_pq1;
*p2q2 = out_f14_pq2;
*p3q3 = out_f14_pq3;
*p4q4 = out_f14_pq4;
*p5q5 = out_f14_pq5;
} else if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 &&
vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) {
// Only filter8() applies.
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2);
*p0q0 = out_f7_pq0;
*p1q1 = out_f7_pq1;
*p2q2 = out_f7_pq2;
} else {
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 &&
vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) {
// filter8() and filter14() do not apply, but filter4() applies to one or
// more values.
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
} else {
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1,
&out_f7_pq2);
if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0) {
// filter14() does not apply, but filter8() and filter4() apply to one
// or more values. filter4 outputs
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
// filter8 outputs
*p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
} else {
// All filters may contribute values to final outputs.
filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0,
&out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4,
&out_f14_pq5);
// filter4 outputs
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
// filter8 outputs
*p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
// filter14 outputs
*p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
*p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
*p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
*p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
*p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
*p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
}
}
}
}
static inline void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
uint8x8_t *p0q0, const uint8_t blimit,
const uint8_t limit, const uint8_t thresh) {
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
uint8x8_t out_f4_pq0, out_f4_pq1;
// Calculate filter masks.
uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
// No filtering.
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
return;
}
uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8);
uint8x8_t filter4_cond = vmvn_u8(filter8_cond);
// Not needing filter4() at all is a very common case, so isolate it to avoid
// needlessly computing filter4().
if (vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) {
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2);
*p0q0 = out_f7_pq0;
*p1q1 = out_f7_pq1;
*p2q2 = out_f7_pq2;
} else {
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
if (vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) {
// filter8() does not apply, but filter4() applies to one or more values.
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
} else {
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1,
&out_f7_pq2);
// filter4 outputs
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
// filter8 outputs
*p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
}
}
}
static inline void filter6(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8x8_t p2q2, uint8x8_t *p0q0_output,
uint8x8_t *p1q1_output) {
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4);
uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1);
uint16x8_t out = vaddq_u16(p0q0_p1q1, p0q0_p1q1);
uint16x8_t q0p0_p2q2 = vaddl_u8(q0p0, p2q2);
out = vaddq_u16(out, q0p0_p2q2);
uint16x8_t q0p0_q1p1 = vextq_u16(p0q0_p1q1, p0q0_p1q1, 4);
uint16x8_t out_pq0 = vaddq_u16(out, q0p0_q1p1);
uint16x8_t p2q2_p2q2 = vaddl_u8(p2q2, p2q2);
uint16x8_t out_pq1 = vaddq_u16(out, p2q2_p2q2);
*p0q0_output = vrshrn_n_u16(out_pq0, 3);
*p1q1_output = vrshrn_n_u16(out_pq1, 3);
}
static inline void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
const uint8_t blimit, const uint8_t limit,
const uint8_t thresh) {
uint8x8_t out_f6_pq0, out_f6_pq1;
uint8x8_t out_f4_pq0, out_f4_pq1;
// Calculate filter masks.
uint8x8_t mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
uint8x8_t flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
// No filtering.
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
return;
}
uint8x8_t filter6_cond = vand_u8(flat_8x8, mask_8x8);
uint8x8_t filter4_cond = vmvn_u8(filter6_cond);
// Not needing filter4 at all is a very common case, so isolate it to avoid
// needlessly computing filter4.
if (vget_lane_s64(vreinterpret_s64_u8(filter6_cond), 0) == -1) {
filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1);
*p0q0 = out_f6_pq0;
*p1q1 = out_f6_pq1;
} else {
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
if (vget_lane_u64(vreinterpret_u64_u8(filter6_cond), 0) == 0) {
// filter6 does not apply, but filter4 applies to one or more values.
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
} else {
// All filters may contribute to the final output.
filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1);
// filter4 outputs
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
// filter6 outputs
*p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
*p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
}
}
}
static inline void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0,
const uint8_t blimit, const uint8_t limit,
const uint8_t thresh) {
uint8x8_t out_f4_pq0, out_f4_pq1;
// Calculate filter mask
uint8x8_t mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
// No filtering.
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
return;
}
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
*p0q0 = out_f4_pq0;
*p1q1 = out_f4_pq1;
}
void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
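
A note on the early exits introduced above: every lane of a NEON comparison mask is either 0x00 or 0xFF, so a single 64-bit lane read classifies the whole uint8x8_t. A minimal standalone sketch of the idiom (illustration only, not part of the diff):

#include <arm_neon.h>
#include <stdbool.h>

// "No filtering" early exit: all eight mask lanes are zero.
static bool mask_all_zero(uint8x8_t mask) {
  return vget_lane_u64(vreinterpret_u64_u8(mask), 0) == 0;
}

// "Filter applies to every lane": an all-0xFF vector reinterpreted as a
// signed 64-bit value compares equal to -1.
static bool mask_all_set(uint8x8_t mask) {
  return vget_lane_s64(vreinterpret_s64_u8(mask), 0) == -1;
}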

View File

@@ -55,12 +55,52 @@ static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
return res;
}
static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
return res;
}
static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
return res;
}
static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
vst1_u8(ptr + 0 * 8, a.val[0]);
vst1_u8(ptr + 1 * 8, a.val[1]);
}
static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
vst1_u8(ptr + 0 * 8, a.val[0]);
vst1_u8(ptr + 1 * 8, a.val[1]);
vst1_u8(ptr + 2 * 8, a.val[2]);
vst1_u8(ptr + 3 * 8, a.val[3]);
}
static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) {
vst1q_u16(ptr + 0 * 8, a.val[0]);
vst1q_u16(ptr + 1 * 8, a.val[1]);
}
static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) {
vst1q_u16(ptr + 0 * 8, a.val[0]);
vst1q_u16(ptr + 1 * 8, a.val[1]);
vst1q_u16(ptr + 2 * 8, a.val[2]);
vst1q_u16(ptr + 3 * 8, a.val[3]);
}
#elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit.
#if __GNUC__ < 8
static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
return res;
}
static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
return res;
}
#endif // __GNUC__ < 8
#if __GNUC__ < 9
@@ -71,13 +111,30 @@ static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
}
#endif // __GNUC__ < 9
// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards.
#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
return res;
}
static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
return res;
}
static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
vst1_u8(ptr + 0 * 8, a.val[0]);
vst1_u8(ptr + 1 * 8, a.val[1]);
}
static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
vst1_u8(ptr + 0 * 8, a.val[0]);
vst1_u8(ptr + 1 * 8, a.val[1]);
vst1_u8(ptr + 2 * 8, a.val[2]);
vst1_u8(ptr + 3 * 8, a.val[3]);
}
#endif // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
#endif // defined(__GNUC__) && !defined(__clang__)
@@ -215,6 +272,23 @@ static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
s += p;
}
static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p,
uint16x4_t *const s0, uint16x4_t *const s1,
uint16x4_t *const s2, uint16x4_t *const s3,
uint16x4_t *const s4, uint16x4_t *const s5) {
*s0 = vld1_u16(s);
s += p;
*s1 = vld1_u16(s);
s += p;
*s2 = vld1_u16(s);
s += p;
*s3 = vld1_u16(s);
s += p;
*s4 = vld1_u16(s);
s += p;
*s5 = vld1_u16(s);
}
static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
uint16x4_t *const s0, uint16x4_t *const s1,
uint16x4_t *const s2, uint16x4_t *const s3,
@@ -235,6 +309,65 @@ static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
*s6 = vld1_u16(s);
}
static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p,
uint16x4_t *const s0, uint16x4_t *const s1,
uint16x4_t *const s2, uint16x4_t *const s3,
uint16x4_t *const s4, uint16x4_t *const s5,
uint16x4_t *const s6, uint16x4_t *const s7) {
*s0 = vld1_u16(s);
s += p;
*s1 = vld1_u16(s);
s += p;
*s2 = vld1_u16(s);
s += p;
*s3 = vld1_u16(s);
s += p;
*s4 = vld1_u16(s);
s += p;
*s5 = vld1_u16(s);
s += p;
*s6 = vld1_u16(s);
s += p;
*s7 = vld1_u16(s);
}
static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p,
uint16x4_t *const s0, uint16x4_t *const s1,
uint16x4_t *const s2, uint16x4_t *const s3,
uint16x4_t *const s4, uint16x4_t *const s5,
uint16x4_t *const s6, uint16x4_t *const s7,
uint16x4_t *const s8, uint16x4_t *const s9,
uint16x4_t *const s10, uint16x4_t *const s11,
uint16x4_t *const s12, uint16x4_t *const s13) {
*s0 = vld1_u16(s);
s += p;
*s1 = vld1_u16(s);
s += p;
*s2 = vld1_u16(s);
s += p;
*s3 = vld1_u16(s);
s += p;
*s4 = vld1_u16(s);
s += p;
*s5 = vld1_u16(s);
s += p;
*s6 = vld1_u16(s);
s += p;
*s7 = vld1_u16(s);
s += p;
*s8 = vld1_u16(s);
s += p;
*s9 = vld1_u16(s);
s += p;
*s10 = vld1_u16(s);
s += p;
*s11 = vld1_u16(s);
s += p;
*s12 = vld1_u16(s);
s += p;
*s13 = vld1_u16(s);
}
static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
int16x8_t *const s0, int16x8_t *const s1) {
*s0 = vld1q_s16(s);
@@ -597,6 +730,56 @@ static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
vst1_u16(s, s3);
}
static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride,
const uint16x4_t s0, const uint16x4_t s1,
const uint16x4_t s2, const uint16x4_t s3,
const uint16x4_t s4, const uint16x4_t s5) {
vst1_u16(s, s0);
s += dst_stride;
vst1_u16(s, s1);
s += dst_stride;
vst1_u16(s, s2);
s += dst_stride;
vst1_u16(s, s3);
s += dst_stride;
vst1_u16(s, s4);
s += dst_stride;
vst1_u16(s, s5);
}
static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride,
const uint16x4_t s0, const uint16x4_t s1,
const uint16x4_t s2, const uint16x4_t s3,
const uint16x4_t s4, const uint16x4_t s5,
const uint16x4_t s6, const uint16x4_t s7,
const uint16x4_t s8, const uint16x4_t s9,
const uint16x4_t s10, const uint16x4_t s11) {
vst1_u16(s, s0);
s += dst_stride;
vst1_u16(s, s1);
s += dst_stride;
vst1_u16(s, s2);
s += dst_stride;
vst1_u16(s, s3);
s += dst_stride;
vst1_u16(s, s4);
s += dst_stride;
vst1_u16(s, s5);
s += dst_stride;
vst1_u16(s, s6);
s += dst_stride;
vst1_u16(s, s7);
s += dst_stride;
vst1_u16(s, s8);
s += dst_stride;
vst1_u16(s, s9);
s += dst_stride;
vst1_u16(s, s10);
s += dst_stride;
vst1_u16(s, s11);
s += dst_stride;
}
static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
const uint16x8_t s0, const uint16x8_t s1) {
vst1q_u16(s, s0);

View File

@@ -46,16 +46,6 @@ static inline __m128i xx_loadu_128(const void *a) {
return _mm_loadu_si128((const __m128i *)a);
}
// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
// manually on older compilers.
#if !defined(__clang__) && __GNUC_MAJOR__ < 9
static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
__m64 hi_, lo_;
memcpy(&hi_, hi, sizeof(hi_));
memcpy(&lo_, lo, sizeof(lo_));
return _mm_set_epi64(hi_, lo_);
}
#else
// Load 64 bits from each of hi and low, and pack into an SSE register
// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
// the strict aliasing rule, this takes a different approach
@@ -63,7 +53,6 @@ static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
_mm_loadl_epi64((const __m128i *)hi));
}
#endif
static inline void xx_storel_32(void *const a, const __m128i v) {
const int val = _mm_cvtsi128_si32(v);
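
The retained xx_loadu_2x64 above is the aliasing-safe variant. A minimal sketch of the distinction, assuming only SSE2; loadu_2x64_sketch is an illustrative name:

#include <emmintrin.h>

// Undefined behavior unless hi/lo really point at int64_t objects:
//   return _mm_set_epi64x(*(const int64_t *)hi, *(const int64_t *)lo);
// Aliasing-safe: the intrinsic itself performs the raw 64-bit loads, so no
// int64_t lvalue is ever formed.
static __m128i loadu_2x64_sketch(const void *hi, const void *lo) {
  return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
                            _mm_loadl_epi64((const __m128i *)hi));
}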

View File

@@ -76,26 +76,11 @@ static inline __m256i yy_loadu_4x64(const void *e3, const void *e2,
return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
}
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
+ __GNUC_PATCHLEVEL__)
// _mm256_loadu2_m128i has been introduced in GCC 10.1
#if !defined(__clang__) && GCC_VERSION < 101000
static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
__m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
__m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
return _mm256_set_m128i(mhi, mlo);
}
#else
static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
__m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
__m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
return yy_set_m128i(mhi, mlo);
}
#endif
#undef GCC_VERSION
static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
_mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));

View File

@@ -38,6 +38,9 @@ endif()
list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h"
"${AOM_ROOT}/aom_ports/ppc_cpudetect.c")
list(APPEND AOM_PORTS_SOURCES_RISCV "${AOM_ROOT}/aom_ports/riscv.h"
"${AOM_ROOT}/aom_ports/riscv_cpudetect.c")
# For arm and x86 targets:
#
# * Creates the aom_ports build target, adds the includes in aom_ports to the
@@ -68,9 +71,12 @@ function(setup_aom_ports_targets)
elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
set(aom_ports_has_symbols 1)
elseif("${AOM_TARGET_CPU}" MATCHES "riscv")
add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_RISCV})
set(aom_ports_has_symbols 1)
endif()
-if("${AOM_TARGET_CPU}" MATCHES "arm|ppc")
+if("${AOM_TARGET_CPU}" MATCHES "arm|ppc|riscv")
target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
if(BUILD_SHARED_LIBS)
target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_ports>)

third_party/aom/aom_ports/riscv.h (new vendored file, 30 lines)
View File

@@ -0,0 +1,30 @@
/*
* Copyright (c) 2025, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_PORTS_RISCV_H_
#define AOM_AOM_PORTS_RISCV_H_
#include <stdlib.h>
#include "config/aom_config.h"
#ifdef __cplusplus
extern "C" {
#endif
#define HAS_RVV 0x01
int riscv_simd_caps(void);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_AOM_PORTS_RISCV_H_

View File

@@ -0,0 +1,38 @@
/*
* Copyright (c) 2025, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include "config/aom_config.h"
#include "aom_ports/riscv.h"
#if CONFIG_RUNTIME_CPU_DETECT
#include <sys/auxv.h>
#define HWCAP_RVV (1 << ('v' - 'a'))
int riscv_simd_caps(void) {
int flags = 0;
#if HAVE_RVV
unsigned long hwcap = getauxval(AT_HWCAP);
if (hwcap & HWCAP_RVV) flags |= HAS_RVV;
#endif
return flags;
}
#else
// If there is no RTCD the function pointers are not used and can not be
// changed.
int riscv_simd_caps(void) { return 0; }
#endif // CONFIG_RUNTIME_CPU_DETECT
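
For context, riscv_simd_caps() is what runtime dispatch consults when CONFIG_RUNTIME_CPU_DETECT is enabled. A hedged sketch of that pattern; the cdef function names below are illustrative stand-ins, since libaom's real dispatch tables are generated by its rtcd scripts:

#include "aom_ports/riscv.h"

void cdef_filter_block_c(void);    // illustrative prototype
void cdef_filter_block_rvv(void);  // illustrative prototype

static void (*cdef_filter_block)(void) = cdef_filter_block_c;

static void setup_dispatch(void) {
  // HAS_RVV is set only when the kernel reports the 'V' extension via
  // getauxval(AT_HWCAP), as in riscv_cpudetect.c above.
  if (riscv_simd_caps() & HAS_RVV) cdef_filter_block = cdef_filter_block_rvv;
}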

View File

@@ -2318,8 +2318,9 @@ int main(int argc, const char **argv_) {
"match input format.\n", "match input format.\n",
stream->config.cfg.g_profile); stream->config.cfg.g_profile);
} }
if ((global.show_psnr == 2) && (stream->config.cfg.g_input_bit_depth == if (global.show_psnr == 2 &&
stream->config.cfg.g_bit_depth)) { stream->config.cfg.g_input_bit_depth ==
(unsigned int)stream->config.cfg.g_bit_depth) {
fprintf(stderr, fprintf(stderr,
"Warning: --psnr==2 and --psnr==1 will provide same " "Warning: --psnr==2 and --psnr==1 will provide same "
"results when input bit-depth == stream bit-depth, " "results when input bit-depth == stream bit-depth, "

View File

@@ -445,6 +445,9 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
 list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c")

+list(APPEND AOM_AV1_COMMON_INTRIN_RVV
+     "${AOM_ROOT}/av1/common/riscv/cdef_block_rvv.c")
+
 if(CONFIG_THREE_PASS)
   list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/thirdpass.c"
               "${AOM_ROOT}/av1/encoder/thirdpass.h")
@@ -822,6 +825,13 @@ function(setup_av1_targets)
     endif()
   endif()

+  if(HAVE_RVV)
+    if(AOM_AV1_COMMON_INTRIN_RVV)
+      add_intrinsics_object_library("-march=rv64gcv" "rvv" "aom_av1_common"
+                                    "AOM_AV1_COMMON_INTRIN_RVV")
+    endif()
+  endif()
+
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)


@@ -1084,7 +1084,6 @@ static void set_encoder_config(AV1EncoderConfig *oxcf,
   AlgoCfg *const algo_cfg = &oxcf->algo_cfg;
   ToolCfg *const tool_cfg = &oxcf->tool_cfg;
-  const int is_vbr = cfg->rc_end_usage == AOM_VBR;

   oxcf->profile = cfg->g_profile;
   oxcf->max_threads = (int)cfg->g_threads;
@@ -1167,9 +1166,9 @@ static void set_encoder_config(AV1EncoderConfig *oxcf,
   rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
   rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct;
   rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct;
-  rc_cfg->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
-  rc_cfg->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
-  rc_cfg->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+  rc_cfg->maximum_buffer_size_ms = cfg->rc_buf_sz;
+  rc_cfg->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
+  rc_cfg->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz;
   // Convert target bandwidth from Kbit/s to Bit/s
   rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate;
   rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh;


@@ -13,6 +13,7 @@
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"

+#include "aom_dsp/arm/mem_neon.h"
 #include "av1/common/cfl.h"

 static inline void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
@@ -428,10 +429,7 @@ static inline int16x8_t predict_w8(const int16_t *pred_buf_q3,
 static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
                                       int16x8_t alpha_sign, int abs_alpha_q12,
                                       int16x8_t dc) {
-  // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2
-  // does not interleave, but is not currently available in the compiler used
-  // by the AOM build system.
-  const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3);
+  const int16x8x2_t ac_q3 = vld1q_s16_x2(pred_buf_q3);
   const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
   const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
   const int16x8_t scaled_luma_0 =
@@ -447,10 +445,7 @@ static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
 static inline int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
                                       int16x8_t alpha_sign, int abs_alpha_q12,
                                       int16x8_t dc) {
-  // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4
-  // does not interleave, but is not currently available in the compiler used
-  // by the AOM build system.
-  const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3);
+  const int16x8x4_t ac_q3 = vld1q_s16_x4(pred_buf_q3);
   const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
   const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
   const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]);
@@ -497,7 +492,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
           predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
       const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]),
                                      vqmovun_s16(pred.val[1]) } };
-      vst2_u8(dst, predun);
+      vst1_u8_x2(dst, predun);
     } else {
       const int16x8x4_t pred =
           predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
@@ -505,7 +500,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
         { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]),
           vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) }
       };
-      vst4_u8(dst, predun);
+      vst1_u8_x4(dst, predun);
     }
     dst += dst_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
@@ -574,11 +569,11 @@ static inline void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
   } else if (width == 16) {
     const int16x8x2_t pred =
         predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
-    vst2q_u16(dst, clamp2q_s16(pred, max_16x8));
+    vst1q_u16_x2(dst, clamp2q_s16(pred, max_16x8));
   } else {
     const int16x8x4_t pred =
         predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
-    vst4q_u16(dst, clamp4q_s16(pred, max_16x8));
+    vst1q_u16_x4(dst, clamp4q_s16(pred, max_16x8));
   }
   dst += dst_stride;
 } while ((pred_buf_q3 += CFL_BUF_LINE) < end);


@@ -53,8 +53,7 @@ static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp,
 static AOM_FORCE_INLINE int16x8_t load_filters_1(int ofs) {
   const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS);

-  const int16_t *base =
-      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS];
   return vld1q_s16(base + ofs0 * 8);
 }

@@ -65,8 +64,7 @@ static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int ofs,
   const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS);
   const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS);

-  const int16_t *base =
-      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS];
   out[0] = vld1q_s16(base + ofs0 * 8);
   out[1] = vld1q_s16(base + ofs1 * 8);
   out[2] = vld1q_s16(base + ofs2 * 8);
@@ -84,8 +82,7 @@ static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int ofs,
   const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS);
   const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS);

-  const int16_t *base =
-      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS];
   out[0] = vld1q_s16(base + ofs0 * 8);
   out[1] = vld1q_s16(base + ofs1 * 8);
   out[2] = vld1q_s16(base + ofs2 * 8);


@@ -101,8 +101,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_4x1_f1_beta0(in, f_s16);
 }

@@ -140,8 +139,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }

@@ -156,8 +154,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
   int16x4_t s6 = vget_low_s16(src[6]);
   int16x4_t s7 = vget_low_s16(src[7]);

-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);

   int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
@@ -210,8 +207,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
   int16x8_t s6 = src[6];
   int16x8_t s7 = src[7];

-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);

   int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);


@@ -61,34 +61,34 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
 static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset,
                                             int stride) {
-  out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
+  out[0] = vld1q_s16(
+      av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[1] = vld1q_s16(
+      av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[2] = vld1q_s16(
+      av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[3] = vld1q_s16(
+      av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]);
 }

 static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int offset,
                                             int stride) {
-  out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[4] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 4 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[5] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 5 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[6] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 6 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[7] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 7 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
+  out[0] = vld1q_s16(
+      av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[1] = vld1q_s16(
+      av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[2] = vld1q_s16(
+      av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[3] = vld1q_s16(
+      av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[4] = vld1q_s16(
+      av1_warped_filter[(offset + 4 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[5] = vld1q_s16(
+      av1_warped_filter[(offset + 5 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[6] = vld1q_s16(
+      av1_warped_filter[(offset + 6 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[7] = vld1q_s16(
+      av1_warped_filter[(offset + 7 * stride) >> WARPEDDIFF_PREC_BITS]);
 }

 static AOM_FORCE_INLINE int clamp_iy(int iy, int height) {
@@ -175,8 +175,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(
   if (p_width == 4) {
     if (beta == 0) {
       if (alpha == 0) {
-        int16x8_t f_s16 = vld1q_s16(
-            (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS)));
+        int16x8_t f_s16 =
+            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16);
       } else {
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha);
@@ -193,8 +193,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(
   } else {
     if (beta == 0) {
       if (alpha == 0) {
-        int16x8_t f_s16 = vld1q_s16(
-            (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS)));
+        int16x8_t f_s16 =
+            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16);
       } else {
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha);


@@ -109,8 +109,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_4x1_f1_beta0(in, f_s16);
 }

@@ -145,8 +144,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }

@@ -161,8 +159,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
   int16x4_t s6 = vget_low_s16(src[6]);
   int16x4_t s7 = vget_low_s16(src[7]);

-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);

   int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
@@ -215,8 +212,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
   int16x8_t s6 = src[6];
   int16x8_t s7 = src[7];

-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);

   int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);


@@ -112,8 +112,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_4x1_f1_beta0(in, f_s16);
 }

@@ -148,8 +147,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }

@@ -164,8 +162,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
   int16x4_t s6 = vget_low_s16(src[6]);
   int16x4_t s7 = vget_low_s16(src[7]);

-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);

   int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
@@ -215,8 +212,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
   int16x8_t s6 = src[6];
   int16x8_t s7 = src[7];

-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);

   int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);


@@ -495,22 +495,22 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   # structs as arguments, which makes the v256 type of the intrinsics
   # hard to support, so optimizations for this target are disabled.
   if ($opts{config} !~ /libs-x86-win32-vs.*/) {
-    specialize qw/cdef_find_dir sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_find_dir sse4_1 avx2 neon rvv/, "$ssse3_x86";
     specialize qw/cdef_find_dir_dual sse4_1 avx2 neon/, "$ssse3_x86";

-    specialize qw/cdef_filter_8_0 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_8_1 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_8_2 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_8_3 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_0 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_1 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_2 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_3 sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_0 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_1 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_2 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_3 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_0 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_1 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_2 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_3 sse4_1 avx2 neon rvv/, "$ssse3_x86";

-    specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86";
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86";
+      specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86";
     }
   }
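
For orientation: each specialize line registers the SIMD flavors available for
one function, and the generated av1_rtcd.h turns that into a function pointer
that setup_rtcd_internal() retargets at runtime. Roughly, for a hypothetical
function foo specialized with rvv (sketch only; real signatures elided):

  void foo_c(void);
  void foo_rvv(void);
  void (*foo)(void) = foo_c;  // RTCD dispatch pointer

  // inside the generated setup_rtcd_internal():
  foo = foo_c;
  if (flags & HAS_RVV) foo = foo_rvv;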

(diff suppressed: file too large to display)


@@ -27,7 +27,8 @@
 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
 // We need an extra 2 taps to fit this in, for a total of 8 taps.
 /* clang-format off */
-const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
+const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1]
+                                         [8] = {
   // [-1, 0)
   { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
   { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
@@ -344,7 +345,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = av1_warped_filter[offs];
+          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

           int32_t sum = 1 << offset_bits_horiz;
           for (int m = 0; m < 8; ++m) {
@@ -365,7 +366,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = av1_warped_filter[offs];
+          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

           int32_t sum = 1 << offset_bits_vert;
           for (int m = 0; m < 8; ++m) {
@@ -575,7 +576,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = av1_warped_filter[offs];
+          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

           int32_t sum = 1 << offset_bits_horiz;
           for (int m = 0; m < 8; ++m) {
@@ -599,7 +600,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = av1_warped_filter[offs];
+          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

           int32_t sum = 1 << offset_bits_vert;
           for (int m = 0; m < 8; ++m) {


@@ -33,7 +33,14 @@
 #define WARP_ERROR_BLOCK_LOG 5
 #define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)

-extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+#if AOM_ARCH_ARM || AOM_ARCH_AARCH64 || AOM_ARCH_X86 || AOM_ARCH_X86_64
+typedef int16_t WarpedFilterCoeff;
+#else
+typedef int8_t WarpedFilterCoeff;
+#endif
+
+extern const WarpedFilterCoeff
+    av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];

 DECLARE_ALIGNED(8, extern const int8_t,
                 av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
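
For scale (assuming WARPEDPIXEL_PREC_SHIFTS is 64, its usual value in libaom),
av1_warped_filter holds (64 * 3 + 1) * 8 = 1544 coefficients, so the new
WarpedFilterCoeff typedef halves the table from 3088 bytes (int16_t, kept on
Arm and x86 where the SIMD loads expect 16-bit taps) to 1544 bytes (int8_t)
on other targets; the coefficient values themselves fit in int8_t.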


@@ -3822,6 +3822,10 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type,
     resize_reset_rc(cpi, resize_pending_params->width,
                     resize_pending_params->height, cm->width, cm->height);
   }
+  if (svc->temporal_layer_id == 0) {
+    rc->num_col_blscroll_last_tl0 = 0;
+    rc->num_row_blscroll_last_tl0 = 0;
+  }
   // Set the GF interval and update flag.
   if (!rc->rtc_external_ratectrl)
     set_gf_interval_update_onepass_rt(cpi, *frame_type);


@@ -200,6 +200,8 @@ typedef struct {
   int last_target_size_keyframe;
   int frames_since_scene_change;
   int perc_spatial_flat_blocks;
+  int num_col_blscroll_last_tl0;
+  int num_row_blscroll_last_tl0;
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame


@@ -1325,6 +1325,53 @@ static inline void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x,
   }
 }

+static void do_int_pro_motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
+                                         unsigned int *y_sad, int mi_row,
+                                         int mi_col, int source_sad_nonrd) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mi = xd->mi[0];
+  const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+  const int increase_col_sw = source_sad_nonrd > kMedSad &&
+                              !cpi->rc.high_motion_content_screen_rtc &&
+                              (cpi->svc.temporal_layer_id == 0 ||
+                               cpi->rc.num_col_blscroll_last_tl0 > 2);
+  int me_search_size_col = is_screen
+                               ? increase_col_sw ? 512 : 96
+                               : block_size_wide[cm->seq_params->sb_size] >> 1;
+  // For screen use a larger row search size to capture vertical scroll,
+  // which can be larger motion.
+  int me_search_size_row = is_screen
+                               ? source_sad_nonrd > kMedSad ? 512 : 192
+                               : block_size_high[cm->seq_params->sb_size] >> 1;
+  unsigned int y_sad_zero;
+  *y_sad = av1_int_pro_motion_estimation(
+      cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv, &y_sad_zero,
+      me_search_size_col, me_search_size_row);
+  // The logic below selects whether the motion estimated in the
+  // int_pro_motion() will be used in nonrd_pickmode. Only do this
+  // for screen for now.
+  if (is_screen) {
+    unsigned int thresh_sad =
+        (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
+    if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
+      x->sb_me_partition = 1;
+      x->sb_me_mv.as_int = mi->mv[0].as_int;
+      if (cpi->svc.temporal_layer_id == 0) {
+        if (abs(mi->mv[0].as_mv.col) > 16 && abs(mi->mv[0].as_mv.row) == 0)
+          cpi->rc.num_col_blscroll_last_tl0++;
+        else if (abs(mi->mv[0].as_mv.row) > 16 && abs(mi->mv[0].as_mv.col) == 0)
+          cpi->rc.num_row_blscroll_last_tl0++;
+      }
+    } else {
+      x->sb_me_partition = 0;
+      // Fall back to using zero motion.
+      *y_sad = y_sad_zero;
+      mi->mv[0].as_int = 0;
+    }
+  }
+}
+
 static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
                          unsigned int *y_sad_g, unsigned int *y_sad_alt,
                          unsigned int *y_sad_last,
@@ -1418,42 +1465,11 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
   // so for now force it to 2 based on superblock sad.
   if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2;

-  if (est_motion == 1 || est_motion == 2) {
-    if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
-      // For screen only do int_pro_motion for spatial variance above
-      // threshold and motion level above LowSad.
-      if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
-        int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
-        int me_search_size_col =
-            is_screen ? source_sad_nonrd > kMedSad ? 160 : 96
-                      : block_size_wide[cm->seq_params->sb_size] >> 1;
-        // For screen use larger search size row motion to capture
-        // vertical scroll, which can be larger motion.
-        int me_search_size_row =
-            is_screen ? source_sad_nonrd > kMedSad ? 512 : 192
-                      : block_size_high[cm->seq_params->sb_size] >> 1;
-        unsigned int y_sad_zero;
-        *y_sad = av1_int_pro_motion_estimation(
-            cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
-            &y_sad_zero, me_search_size_col, me_search_size_row);
-        // The logic below selects whether the motion estimated in the
-        // int_pro_motion() will be used in nonrd_pickmode. Only do this
-        // for screen for now.
-        if (is_screen) {
-          unsigned int thresh_sad =
-              (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
-          if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
-            x->sb_me_partition = 1;
-            x->sb_me_mv.as_int = mi->mv[0].as_int;
-          } else {
-            x->sb_me_partition = 0;
-            // Fall back to using zero motion.
-            *y_sad = y_sad_zero;
-            mi->mv[0].as_int = 0;
-          }
-        }
-      }
-    }
-  }
+  if ((est_motion == 1 || est_motion == 2) && xd->mb_to_right_edge >= 0 &&
+      xd->mb_to_bottom_edge >= 0 && x->source_variance > 100 &&
+      source_sad_nonrd > kLowSad) {
+    do_int_pro_motion_estimation(cpi, x, y_sad, mi_row, mi_col,
+                                 source_sad_nonrd);
+  }

   if (*y_sad == UINT_MAX) {
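
In effect, the new counters classify each screen-content superblock's selected
motion vector as horizontal or vertical block scroll. The rule in isolation
(hypothetical helpers, assuming aom's 1/8-pel MV layout):

  #include <stdint.h>
  #include <stdlib.h>

  typedef struct { int16_t row, col; } MV;  // mirrors aom's MV struct

  // Purely horizontal motion larger than 16 units (2 pixels at 1/8-pel).
  static int is_col_scroll(const MV mv) {
    return abs(mv.col) > 16 && mv.row == 0;
  }
  // Purely vertical motion larger than 16 units.
  static int is_row_scroll(const MV mv) {
    return abs(mv.row) > 16 && mv.col == 0;
  }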


@@ -26,6 +26,7 @@ set_aom_detect_var(AOM_ARCH_ARM 0 "Enables ARM architecture.")
 set_aom_detect_var(AOM_ARCH_PPC 0 "Enables PPC architecture.")
 set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.")
 set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.")
+set_aom_detect_var(AOM_ARCH_RISCV 0 "Enables RISC-V architecture.")

 # Arm/AArch64 feature flags.
 set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.")
@@ -51,6 +52,9 @@ set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.")
 set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.")
 set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.")

+# RISC-V64 feature flags.
+set_aom_detect_var(HAVE_RVV 0 "Enables RVV optimizations.")
+
 # Flags describing the build environment.
 set_aom_detect_var(HAVE_FEXCEPT 0
                    "Internal flag, GNU fenv.h present for target.")
@@ -241,3 +245,6 @@ set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets."
                    ON)
 set_aom_option_var(ENABLE_AVX2
                    "Enables AVX2 optimizations on x86/x86_64 targets." ON)
+
+# RVV intrinsics flags.
+set_aom_option_var(ENABLE_RVV "Enables RVV optimizations on RISC-V targets." ON)


@@ -75,6 +75,8 @@ if(NOT AOM_TARGET_CPU)
       set(AOM_TARGET_CPU "arm64")
     elseif(cpu_lowercase MATCHES "^ppc")
       set(AOM_TARGET_CPU "ppc")
+    elseif(cpu_lowercase MATCHES "^riscv")
+      set(AOM_TARGET_CPU "riscv")
     else()
       message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
                       "supported, falling back to the generic target")


@@ -132,4 +132,15 @@ elseif("${AOM_TARGET_CPU}" MATCHES "^x86")
       set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
     endif()
   endforeach()
+elseif("${AOM_TARGET_CPU}" MATCHES "riscv")
+  set(AOM_ARCH_RISCV64 1)
+  set(RTCD_ARCH_RISCV64 "yes")
+  if(ENABLE_RVV)
+    set(HAVE_RVV 1)
+    set(RTCD_HAVE_RVV "yes")
+  else()
+    set(HAVE_RVV 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-rvv)
+  endif()
 endif()
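
Taken together, the RISC-V build plumbing mirrors the other architectures:
ENABLE_RVV is the user-facing CMake option (default ON), configure-time
detection turns it into HAVE_RVV and RTCD_HAVE_RVV, and when it is off the
RTCD generator is invoked with --disable-rvv so no rvv specializations are
emitted.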


@@ -370,6 +370,36 @@ EOF
   common_bottom;
 }

+sub riscv() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#ifdef RTCD_C
+#include "aom_ports/riscv.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = riscv_simd_caps();
+    (void)flags;
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
 sub unoptimized() {
   determine_indirection "c";
   common_top;
@@ -415,6 +445,9 @@ if ($opts{arch} eq 'x86') {
 } elsif ($opts{arch} eq 'ppc') {
   @ALL_ARCHS = filter(qw/vsx/);
   ppc;
+} elsif ($opts{arch} eq 'riscv') {
+  @ALL_ARCHS = filter(qw/rvv/);
+  riscv;
 } else {
   unoptimized;
 }
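
Putting the heredocs and set_function_pointers() together, the generated
header's RISC-V block comes out roughly like this for a hypothetical function
foo (sketch, not verbatim generator output):

  #ifdef RTCD_C
  #include "aom_ports/riscv.h"
  static void setup_rtcd_internal(void)
  {
      int flags = riscv_simd_caps();
      (void)flags;
      foo = foo_c;
      if (flags & HAS_RVV) foo = foo_rvv;
  }
  #endif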


@@ -618,7 +618,8 @@ TEST_P(CDEFCopyRect16to16Test, TestSIMDNoMismatch) {
 using std::make_tuple;

-#if ((AOM_ARCH_X86 && HAVE_SSSE3) || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON)
+#if ((AOM_ARCH_X86 && HAVE_SSSE3) || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON || \
+     HAVE_RVV)
 static const CdefFilterBlockFunctions kCdefFilterFuncC[] = {
   { &cdef_filter_8_0_c, &cdef_filter_8_1_c, &cdef_filter_8_2_c,
     &cdef_filter_8_3_c }
@@ -811,6 +812,46 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif

+#if HAVE_RVV
+static const CdefFilterBlockFunctions kCdefFilterFuncRvv[] = {
+  { &cdef_filter_8_0_rvv, &cdef_filter_8_1_rvv, &cdef_filter_8_2_rvv,
+    &cdef_filter_8_3_rvv }
+};
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncRvv[] = {
+  { &cdef_filter_16_0_rvv, &cdef_filter_16_1_rvv, &cdef_filter_16_2_rvv,
+    &cdef_filter_16_3_rvv }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
+
+INSTANTIATE_TEST_SUITE_P(RVV, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_rvv,
+                                                      &cdef_find_dir_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+                                 &cdef_copy_rect8_8bit_to_16bit_rvv)));
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+                                 &cdef_copy_rect8_16bit_to_16bit_rvv)));
+#endif
+
 // Test speed for all supported architectures
 #if AOM_ARCH_X86 && HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(
@@ -905,4 +946,24 @@ INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirDualSpeedTest,
                                                       &cdef_find_dir_dual_c)));
 #endif

+#if HAVE_RVV
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFSpeedTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFSpeedHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(10)));
+
+INSTANTIATE_TEST_SUITE_P(RVV, CDEFFindDirSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_rvv,
+                                                      &cdef_find_dir_c)));
+#endif
+
 }  // namespace


@@ -1078,6 +1078,39 @@ class DatarateTestSVC
 #endif
   }

+  virtual void BasicRateTargetingSVC2TL1SLScreenDropFrame1920x1080Test() {
+    cfg_.rc_buf_initial_sz = 50;
+    cfg_.rc_buf_optimal_sz = 50;
+    cfg_.rc_buf_sz = 100;
+    cfg_.rc_dropframe_thresh = 30;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 52;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::Y4mVideoSource video("screendata.1920_1080.y4m", 0, 60);
+    const int bitrate_array[2] = { 60, 100 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    screen_mode_ = 1;
+    number_temporal_layers_ = 2;
+    number_spatial_layers_ = 1;
+    target_layer_bitrate_[0] = 60 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_AV1_DECODER
+    // Top temporal layers are non_reference, so exclude them from
+    // mismatch count, since loopfilter/cdef is not applied for these on
+    // encoder side, but is always applied on decoder.
+    // This means 150 = #frames(300) - #TL2_frames(150).
+    // We use LE for screen since loopfilter level can become very small
+    // or zero and then the frame is not a mismatch.
+    EXPECT_LE(GetMismatchFrames(), 150u);
+#endif
+  }
+
   virtual void BasicRateTargetingSVC1TL3SLScreenTest() {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 500;
@@ -2651,6 +2684,14 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLScreenDropFrame) {
   BasicRateTargetingSVC2TL1SLScreenDropFrameTest();
 }

+// Check basic rate targeting for CBR, for 2 temporal layers, 1 spatial
+// for screen mode, with frame dropper on at low bitrates. Use small
+// values of rc_buf_initial/optimal/sz to trigger postencode frame drop.
+// Use 1920x1080 clip.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLScreenDropFrame1920x1080) {
+  BasicRateTargetingSVC2TL1SLScreenDropFrame1920x1080Test();
+}
+
 // Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal
 // for screen mode.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLScreen) {


@@ -573,3 +573,4 @@ c7f336958e7af6162c20ddc84d67c7dfa9826910 *av1-1-b8-16-intra_only-intrabc-extreme
 4d1ad6d3070268ccb000d7fc3ae0f5a9447bfe82 *test_input_w1h1.yuv
 ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m
 9c2aa2d0f63f706f775bf661dfa81e8bb3089d8b *wikipedia_420_360p_60f.y4m
+9e4d2ba84ba62f7ea4b617a13af5db9c39e7f0f9 *screendata.1920_1080.y4m


@@ -35,6 +35,7 @@ list(APPEND AOM_TEST_DATA_FILE_NAMES
     "niklas_1280_720_30.y4m"
     "rush_hour_444.y4m"
     "screendata.y4m"
+    "screendata.1920_1080.y4m"
     "niklas_640_480_30.yuv"
     "vase10x10.yuv"
     "vase10x10_tiles.txt"