Bug 1945604 - Update aom to 3990233fc06a35944d6d33797e63931802122a95 r=padenot
Differential Revision: https://phabricator.services.mozilla.com/D236581
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 0
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 0
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 0
 HAVE_SSE2 equ 0
 HAVE_SSE3 equ 0
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
 #define HAVE_SSE3 0
@@ -12,6 +12,7 @@
 .equ AOM_ARCH_AARCH64, 0
 .equ AOM_ARCH_ARM, 1
 .equ AOM_ARCH_PPC, 0
+.equ AOM_ARCH_RISCV, 0
 .equ AOM_ARCH_X86, 0
 .equ AOM_ARCH_X86_64, 0
 .equ CONFIG_ACCOUNTING, 0
@@ -82,6 +83,7 @@
 .equ HAVE_NEON, 1
 .equ HAVE_NEON_DOTPROD, 0
 .equ HAVE_NEON_I8MM, 0
+.equ HAVE_RVV, 0
 .equ HAVE_SSE, 0
 .equ HAVE_SSE2, 0
 .equ HAVE_SSE3, 0
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 1
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 1
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
 #define HAVE_SSE3 0
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 1
 AOM_ARCH_X86_64 equ 0
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 1
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 1
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 1
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 1
 AOM_ARCH_ARM equ 1
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 0
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 0
 HAVE_NEON equ 1
 HAVE_NEON_DOTPROD equ 1
 HAVE_NEON_I8MM equ 1
+HAVE_RVV equ 0
 HAVE_SSE equ 0
 HAVE_SSE2 equ 0
 HAVE_SSE3 equ 0
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 1
 #define AOM_ARCH_ARM 1
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 1
 #define HAVE_NEON_DOTPROD 1
 #define HAVE_NEON_I8MM 1
+#define HAVE_RVV 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
 #define HAVE_SSE3 0
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 1
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 1
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 1
 AOM_ARCH_X86_64 equ 0
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 1
 #define AOM_ARCH_X86_64 0
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
@@ -12,6 +12,7 @@
 AOM_ARCH_AARCH64 equ 0
 AOM_ARCH_ARM equ 0
 AOM_ARCH_PPC equ 0
+AOM_ARCH_RISCV equ 0
 AOM_ARCH_X86 equ 0
 AOM_ARCH_X86_64 equ 1
 CONFIG_ACCOUNTING equ 0
@@ -82,6 +83,7 @@ HAVE_MMX equ 1
 HAVE_NEON equ 0
 HAVE_NEON_DOTPROD equ 0
 HAVE_NEON_I8MM equ 0
+HAVE_RVV equ 0
 HAVE_SSE equ 1
 HAVE_SSE2 equ 1
 HAVE_SSE3 equ 1
@@ -14,6 +14,7 @@
 #define AOM_ARCH_AARCH64 0
 #define AOM_ARCH_ARM 0
 #define AOM_ARCH_PPC 0
+#define AOM_ARCH_RISCV 0
 #define AOM_ARCH_X86 0
 #define AOM_ARCH_X86_64 1
 #define CONFIG_ACCOUNTING 0
@@ -84,6 +85,7 @@
 #define HAVE_NEON 0
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
+#define HAVE_RVV 0
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
 #define HAVE_SSE3 1
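The sixteen hunks above are the regenerated per-platform build configuration (assembly equ/.equ variants and the matching C headers); each one simply gains the two new flags AOM_ARCH_RISCV and HAVE_RVV, set to 0 on every target shown here. As a minimal, hypothetical sketch of how such generated 0/1 flags are consumed (the names below are placeholders, not symbols from this patch):

/* Hypothetical illustration, not code from the patch: the generated config
 * flags are plain 0/1 macros, so a new one such as HAVE_RVV slots into
 * ordinary preprocessor dispatch. */
#define HAVE_RVV 0 /* stand-in for the per-platform aom_config.h value */

static int sum_bytes_c(const unsigned char *p, int n) {
  int total = 0;
  for (int i = 0; i < n; ++i) total += p[i];
  return total;
}

#if HAVE_RVV
/* A RISC-V Vector implementation would be declared and selected here. */
#endif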
@@ -20,11 +20,11 @@ origin:

 # Human-readable identifier for this version/release
 # Generally "version NNN", "tag SSS", "bookmark SSS"
-release: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed (Sun Jan 05 09:13:09 2025 -0800).
+release: 3990233fc06a35944d6d33797e63931802122a95 (Thu Jan 30 11:32:16 2025 -0800).

 # Revision to pull in
 # Must be a long or short commit SHA (long preferred)
-revision: 0c13a5d54053f82bf8500b421b5cdefb1cc1b3ed
+revision: 3990233fc06a35944d6d33797e63931802122a95

 # The package's license, where possible using the mnemonic from
 # https://spdx.org/licenses/
third_party/aom/CMakeLists.txt (vendored, 10 changed lines)
@@ -333,6 +333,12 @@ if(CONFIG_AV1_ENCODER)
 # libaom static library.
 if(BUILD_SHARED_LIBS)
 target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom_static)
+# TODO: https://aomedia.issues.chromium.org/391715078 - This condition can
+# be removed after aom_av1_rc restricts its symbol visibility.
+if(CYGWIN OR MINGW)
+target_link_options(aom_av1_rc ${AOM_LIB_LINK_TYPE}
+LINKER:--allow-multiple-definition)
+endif()
 else()
 target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom)
 endif()
@@ -858,8 +864,8 @@ if(BUILD_SHARED_LIBS)
 # errors (don't use it with AddressSanitizer)." See
 # https://clang.llvm.org/docs/AddressSanitizer.html#usage. Similarly, see
 # https://clang.llvm.org/docs/MemorySanitizer.html#usage.
-if(NOT WIN32
-AND NOT APPLE
+if(NOT
+(APPLE OR CYGWIN OR WIN32)
 AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE))
 # The -z defs linker option reports unresolved symbol references from object
 # files when building a shared library.
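The second CMakeLists.txt hunk reworks the platform test around the -z defs check; the surrounding comment explains that -z defs turns unresolved symbol references into link-time errors when building a shared library. A tiny illustration of the failure mode it exists to catch (my own example, not from the repository): without -z defs this object links into a shared library and the missing symbol only surfaces at load time.

/* example.c: references a symbol it never defines. Linking this into a
 * shared library with -Wl,-z,defs fails immediately; without it, the
 * unresolved reference is only discovered when the library is loaded. */
extern int helper_that_nobody_defines(void);

int call_it(void) { return helper_that_nobody_defines(); }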
third_party/aom/README.md (vendored, 4 changed lines)
@@ -60,7 +60,9 @@ README.md {#LREADME}
 present, yasm will be used by default. Pass -DENABLE_NASM=ON to cmake to
 select nasm.) If you download yasm with the intention to work with Visual
 Studio, please download win32.exe or win64.exe and rename it into yasm.exe.
-DO NOT download or use vsyasm.exe.
+DO NOT download or use vsyasm.exe. The MSYS2 version of the yasm binary can
+also be used and avoids an issue caused by a missing Visual C++
+Redistributable install (Visual Studio 2010, MSVCR100.dll).
 6. Building the documentation requires
 [doxygen version 1.8.10 or newer](http://doxygen.org).
 7. Emscripten builds require the portable
third_party/aom/aom/exports_com (vendored, 2 changed lines)
@@ -10,7 +10,6 @@ text aom_codec_set_option
 text aom_codec_version
 text aom_codec_version_extra_str
 text aom_codec_version_str
-text aom_free
 text aom_img_add_metadata
 text aom_img_alloc
 text aom_img_alloc_with_border
@@ -25,7 +24,6 @@ text aom_img_plane_width
 text aom_img_remove_metadata
 text aom_img_set_rect
 text aom_img_wrap
-text aom_malloc
 text aom_rb_bytes_read
 text aom_rb_read_bit
 text aom_rb_read_literal
third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c (vendored, 426 changed lines)
@@ -15,6 +15,7 @@

 #include "aom/aom_integer.h"
 #include "aom_dsp/arm/transpose_neon.h"
+#include "mem_neon.h"

 static inline int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low,
 const int16x4_t high) {
@@ -226,13 +227,8 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
 const uint8_t *blimit,
 const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst_p1 = (uint16_t *)(s - 2 * pitch);
-uint16_t *const dst_p0 = (uint16_t *)(s - pitch);
-uint16_t *const dst_q0 = (uint16_t *)(s);
-uint16_t *const dst_q1 = (uint16_t *)(s + pitch);
-
-const uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0),
-vld1_u16(dst_q0), vld1_u16(dst_q1) };
+uint16x4_t src[4];
+load_u16_4x4(s - 2 * pitch, pitch, &src[0], &src[1], &src[2], &src[3]);

 // Adjust thresholds to bitdepth.
 const int outer_thresh = *blimit << (bd - 8);
@@ -247,12 +243,10 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
 filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
 &needs_filter4_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter4_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
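A note on the early-return change in the hunk above: vaddv_u16 is an AArch64-only horizontal add, which is why the old check had to sit inside #if AOM_ARCH_AARCH64. The replacement reinterprets the four-lane mask as a single 64-bit lane and compares it against zero, which plain Armv7 NEON also supports, so the guard can be dropped. A self-contained sketch of the two forms (my illustration, not code from the patch):

#include <arm_neon.h>
#include <stdbool.h>

/* Mask lanes are either 0x0000 or 0xffff, as produced by the filter*_masks()
 * helpers, so "no lane set" is equivalent to "the 64-bit view is zero". */
static inline bool mask_is_all_zero(uint16x4_t mask) {
#if defined(__aarch64__)
  return vaddv_u16(mask) == 0; /* AArch64 only: horizontal add */
#else
  return vget_lane_u64(vreinterpret_u64_u16(mask), 0) == 0;
#endif
}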
@@ -272,10 +266,9 @@ void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
 const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
 const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);

-vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output),
+vget_low_u16(p0q0_output), vget_high_u16(p0q0_output),
+vget_high_u16(p1q1_output));
 }

 void aom_highbd_lpf_horizontal_4_dual_neon(
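The loads and stores in the two hunks above are folded into the strided helpers pulled in by the new #include "mem_neon.h". As a rough sketch of what the 4x4 pair does (an assumption: this mirrors the helpers in aom's mem_neon.h, whose bodies are not part of this diff), each call covers four rows spaced pitch elements apart:

#include <arm_neon.h>

static inline void load_u16_4x4_sketch(const uint16_t *s, int pitch,
                                        uint16x4_t *s0, uint16x4_t *s1,
                                        uint16x4_t *s2, uint16x4_t *s3) {
  *s0 = vld1_u16(s);
  *s1 = vld1_u16(s + pitch);
  *s2 = vld1_u16(s + 2 * pitch);
  *s3 = vld1_u16(s + 3 * pitch);
}

static inline void store_u16_4x4_sketch(uint16_t *s, int pitch, uint16x4_t s0,
                                         uint16x4_t s1, uint16x4_t s2,
                                         uint16x4_t s3) {
  vst1_u16(s, s0);
  vst1_u16(s + pitch, s1);
  vst1_u16(s + 2 * pitch, s2);
  vst1_u16(s + 3 * pitch, s3);
}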
@@ -290,14 +283,8 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
 const uint8_t *blimit, const uint8_t *limit,
 const uint8_t *thresh, int bd) {
 // Offset by 2 uint16_t values to load from first p1 position.
-uint16_t *dst = s - 2;
-uint16_t *dst_p1 = dst;
-uint16_t *dst_p0 = dst + pitch;
-uint16_t *dst_q0 = dst + pitch * 2;
-uint16_t *dst_q1 = dst + pitch * 3;
-
-uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
-vld1_u16(dst_q1) };
+uint16x4_t src[4];
+load_u16_4x4(s - 2, pitch, &src[0], &src[1], &src[2], &src[3]);
 transpose_array_inplace_u16_4x4(src);

 // Adjust thresholds to bitdepth.
@@ -313,12 +300,10 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
 filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
 &needs_filter4_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter4_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
@@ -346,10 +331,7 @@ void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
 };
 transpose_array_inplace_u16_4x4(output);

-vst1_u16(dst_p1, output[0]);
-vst1_u16(dst_p0, output[1]);
-vst1_u16(dst_q0, output[2]);
-vst1_u16(dst_q1, output[3]);
+store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]);
 }

 void aom_highbd_lpf_vertical_4_dual_neon(
@@ -379,16 +361,14 @@ static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
 // ^^^^^^
 sum = vaddq_u16(sum, p0q0);

-// p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
-// ^^^^^
-sum = vshlq_n_u16(sum, 1);
-
 // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
 // ^^^^^^ ^^^^^^
 // Should dual issue with the left shift.
 const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
 const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
-sum = vaddq_u16(sum, outer_sum);
+// p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+// ^^^^^^^^^^^ ^^^^
+sum = vmlaq_n_u16(outer_sum, sum, 2);

 *p1q1_output = vrshrq_n_u16(sum, 3);

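Several hunks in this file (here in filter6, later in filter8 and filter14) make the same micro-optimization: a left shift by one followed by an add becomes a single multiply-accumulate, since vmlaq_n_u16(a, b, 2) computes a + 2 * b per lane. A worked illustration of the equivalence (my example, not part of the patch):

#include <arm_neon.h>

/* Old shape: sum = (sum << 1) + outer_sum, two data-processing ops. */
static inline uint16x8_t shift_then_add(uint16x8_t sum, uint16x8_t outer_sum) {
  return vaddq_u16(vshlq_n_u16(sum, 1), outer_sum);
}

/* New shape: one fused multiply-accumulate producing the same lanes. */
static inline uint16x8_t multiply_accumulate(uint16x8_t sum,
                                             uint16x8_t outer_sum) {
  return vmlaq_n_u16(outer_sum, sum, 2); /* outer_sum + sum * 2 */
}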
@@ -396,11 +376,8 @@ static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
 // p0 = p1 - (2 * p2) + q0 + q1
 // q0 = q1 - (2 * q2) + p0 + p1
 // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
-// ^^^^^^^^
-const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
-// p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
-// ^^^^^^^^
-sum = vsubq_u16(sum, p2q2_double);
+// ^^^^^^^^^^^^^^^^^
+sum = vmlsq_n_u16(sum, p2q2, 2);
 const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
 sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));

@@ -411,16 +388,9 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
 const uint8_t *blimit,
 const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst_p2 = s - 3 * pitch;
-uint16_t *const dst_p1 = s - 2 * pitch;
-uint16_t *const dst_p0 = s - pitch;
-uint16_t *const dst_q0 = s;
-uint16_t *const dst_q1 = s + pitch;
-uint16_t *const dst_q2 = s + 2 * pitch;
-
-const uint16x4_t src[6] = { vld1_u16(dst_p2), vld1_u16(dst_p1),
-vld1_u16(dst_p0), vld1_u16(dst_q0),
-vld1_u16(dst_q1), vld1_u16(dst_q2) };
+uint16x4_t src[6];
+load_u16_4x6(s - 3 * pitch, pitch, &src[0], &src[1], &src[2], &src[3],
+&src[4], &src[5]);

 // Adjust thresholds to bitdepth.
 const int outer_thresh = *blimit << (bd - 8);
@@ -437,32 +407,38 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
 filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
 &needs_filter_mask, &is_flat3_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

+uint16x8_t p0q0_output, p1q1_output;
+uint16x8_t f6_p1q1, f6_p0q0;
+// Not needing filter4() at all is a very common case, so isolate it to avoid
+// needlessly computing filter4().
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 &&
+vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
+filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+p1q1_output = f6_p1q1;
+p0q0_output = f6_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+const uint16x8_t is_flat3_mask_8 =
+vcombine_u16(is_flat3_mask, is_flat3_mask);
 const uint16x8_t needs_filter_mask_8 =
 vcombine_u16(needs_filter_mask, needs_filter_mask);

 uint16x8_t f4_p1q1;
 uint16x8_t f4_p0q0;
-// ZIP1 p0q0, p1q1 may perform better here.
 const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);

-uint16x8_t p0q0_output, p1q1_output;
 // Because we did not return after testing |needs_filter_mask| we know it is
 // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
-// filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
-// output is not used.
-uint16x8_t f6_p1q1, f6_p0q0;
+// filter6. Therefore if it is false when |needs_filter_mask| is true,
+// filter6 output is not used.
 const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
 if (vget_lane_u64(need_filter6, 0) == 0) {
 // filter6() does not apply, but filter4() applies to one or more values.
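The fast path added in the hunk above keys off a second lane trick: viewing the four-lane mask as a signed 64-bit lane and comparing it with -1 is true exactly when every 16-bit lane is 0xffff, i.e. when filter6 applies to all values and filter4 never needs to run. A small self-contained illustration (mine, not from the patch):

#include <arm_neon.h>
#include <stdbool.h>

/* True only when all four lanes are 0xffff: the 64-bit two's-complement view
 * of such a mask is exactly -1. */
static inline bool mask_is_all_set(uint16x4_t mask) {
  return vget_lane_s64(vreinterpret_s64_u16(mask), 0) == -1;
}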
@@ -476,11 +452,11 @@ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
+}

-vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output),
+vget_low_u16(p0q0_output), vget_high_u16(p0q0_output),
+vget_high_u16(p1q1_output));
 }

 void aom_highbd_lpf_horizontal_6_dual_neon(
@@ -494,17 +470,12 @@ void aom_highbd_lpf_horizontal_6_dual_neon(
 void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 const uint8_t *blimit, const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-// Left side of the filter window.
-uint16_t *const dst = s - 3;
-uint16_t *const dst_0 = dst;
-uint16_t *const dst_1 = dst + pitch;
-uint16_t *const dst_2 = dst + 2 * pitch;
-uint16_t *const dst_3 = dst + 3 * pitch;
-
 // Overread by 2 values. These overreads become the high halves of src_raw[2]
 // and src_raw[3] after transpose.
-uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
-vld1q_u16(dst_2), vld1q_u16(dst_3) };
+uint16x8_t src_raw[4];
+load_u16_8x4(s - 3, pitch, &src_raw[0], &src_raw[1], &src_raw[2],
+&src_raw[3]);
+
 transpose_array_inplace_u16_4x8(src_raw);
 // p2, p1, p0, q0, q1, q2
 const uint16x4_t src[6] = {
@@ -528,16 +499,30 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
 &needs_filter_mask, &is_flat3_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

+uint16x8_t p0q0_output, p1q1_output;
+// Because we did not return after testing |needs_filter_mask| we know it is
+// nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
+// filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
+// output is not used.
+uint16x8_t f6_p1q1, f6_p0q0;
+const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+// Not needing filter4() at all is a very common case, so isolate it to avoid
+// needlessly computing filter4().
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 &&
+vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
+filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+p1q1_output = f6_p1q1;
+p0q0_output = f6_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
-const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+const uint16x8_t is_flat3_mask_8 =
+vcombine_u16(is_flat3_mask, is_flat3_mask);
 const uint16x8_t needs_filter_mask_8 =
 vcombine_u16(needs_filter_mask, needs_filter_mask);

@@ -547,14 +532,6 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-uint16x8_t p0q0_output, p1q1_output;
-// Because we did not return after testing |needs_filter_mask| we know it is
-// nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
-// filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
-// output is not used.
-uint16x8_t f6_p1q1, f6_p0q0;
-const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
 if (vget_lane_u64(need_filter6, 0) == 0) {
 // filter6() does not apply, but filter4() applies to one or more values.
 p0q0_output = p0q0;
@@ -567,6 +544,7 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
+}

 uint16x4_t output[4] = {
 vget_low_u16(p1q1_output),
@@ -576,11 +554,7 @@ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
 };
 transpose_array_inplace_u16_4x4(output);

-// dst_n starts at p2, so adjust to p1.
-vst1_u16(dst_0 + 1, output[0]);
-vst1_u16(dst_1 + 1, output[1]);
-vst1_u16(dst_2 + 1, output[2]);
-vst1_u16(dst_3 + 1, output[3]);
+store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]);
 }

 void aom_highbd_lpf_vertical_6_dual_neon(
@@ -607,18 +581,14 @@ static inline void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
 // ^^^^^^^^^^^
 const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);

-// p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-// ^^^^^
-uint16x8_t sum = vshlq_n_u16(p23q23, 1);
-
 // Add two other terms to make dual issue with shift more likely.
 // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
 // ^^^^^^^^^^^
 const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);

 // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
-// ^^^^^^^^^^^^^
-sum = vaddq_u16(sum, p01q01);
+// ^^^^^ ^^^^^^^^^^^^^
+uint16x8_t sum = vmlaq_n_u16(p01q01, p23q23, 2);

 // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
 // ^^^^^^
@@ -654,19 +624,9 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
 const uint8_t *blimit,
 const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst_p3 = s - 4 * pitch;
-uint16_t *const dst_p2 = s - 3 * pitch;
-uint16_t *const dst_p1 = s - 2 * pitch;
-uint16_t *const dst_p0 = s - pitch;
-uint16_t *const dst_q0 = s;
-uint16_t *const dst_q1 = s + pitch;
-uint16_t *const dst_q2 = s + 2 * pitch;
-uint16_t *const dst_q3 = s + 3 * pitch;
-
-const uint16x4_t src[8] = { vld1_u16(dst_p3), vld1_u16(dst_p2),
-vld1_u16(dst_p1), vld1_u16(dst_p0),
-vld1_u16(dst_q0), vld1_u16(dst_q1),
-vld1_u16(dst_q2), vld1_u16(dst_q3) };
+uint16x4_t src[8];
+load_u16_4x8(s - 4 * pitch, pitch, &src[0], &src[1], &src[2], &src[3],
+&src[4], &src[5], &src[6], &src[7]);

 // Adjust thresholds to bitdepth.
 const int outer_thresh = *blimit << (bd - 8);
@@ -684,13 +644,22 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
 filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
 bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

+uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+// Not needing filter4() at all is a very common case, so isolate it to avoid
+// needlessly computing filter4().
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
+vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
+filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+p2q2_output = f8_p2q2;
+p1q1_output = f8_p1q1;
+p0q0_output = f8_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
 const uint16x8_t needs_filter_mask_8 =
@@ -698,17 +667,14 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,

 uint16x8_t f4_p1q1;
 uint16x8_t f4_p0q0;
-// ZIP1 p0q0, p1q1 may perform better here.
 const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);

-uint16x8_t p0q0_output, p1q1_output, p2q2_output;
 // Because we did not return after testing |needs_filter_mask| we know it is
 // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
-// filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
-// output is not used.
-uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+// filter8. Therefore if it is false when |needs_filter_mask| is true,
+// filter8 output is not used.
 const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
 if (vget_lane_u64(need_filter8, 0) == 0) {
 // filter8() does not apply, but filter4() applies to one or more values.
@@ -725,13 +691,12 @@ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
+}

-vst1_u16(dst_p2, vget_low_u16(p2q2_output));
-vst1_u16(dst_p1, vget_low_u16(p1q1_output));
-vst1_u16(dst_p0, vget_low_u16(p0q0_output));
-vst1_u16(dst_q0, vget_high_u16(p0q0_output));
-vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+store_u16_4x6(s - 3 * pitch, pitch, vget_low_u16(p2q2_output),
+vget_low_u16(p1q1_output), vget_low_u16(p0q0_output),
+vget_high_u16(p0q0_output), vget_high_u16(p1q1_output),
+vget_high_u16(p2q2_output));
 }

 void aom_highbd_lpf_horizontal_8_dual_neon(
@@ -749,16 +714,10 @@ static inline uint16x8_t reverse_low_half(const uint16x8_t a) {
 void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
 const uint8_t *blimit, const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst = s - 4;
-uint16_t *const dst_0 = dst;
-uint16_t *const dst_1 = dst + pitch;
-uint16_t *const dst_2 = dst + 2 * pitch;
-uint16_t *const dst_3 = dst + 3 * pitch;
-
 // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
 // To get desired pairs after transpose, one half should be reversed.
-uint16x8_t src[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
-vld1q_u16(dst_3) };
+uint16x8_t src[4];
+load_u16_8x4(s - 4, pitch, &src[0], &src[1], &src[2], &src[3]);

 // src[0] = p0q0
 // src[1] = p1q1
@@ -783,13 +742,22 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
 filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
 bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64

+uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+// Not needing filter4() at all is a very common case, so isolate it to avoid
+// needlessly computing filter4().
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
+vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
+filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+p2q2_output = f8_p2q2;
+p1q1_output = f8_p1q1;
+p0q0_output = f8_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
 const uint16x8_t needs_filter_mask_8 =
@@ -797,15 +765,15 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,

 uint16x8_t f4_p1q1;
 uint16x8_t f4_p0q0;
-const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+const uint16x8_t p0q1 =
+vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);

-uint16x8_t p0q0_output, p1q1_output, p2q2_output;
 // Because we did not return after testing |needs_filter_mask| we know it is
 // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
-// filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
-// output is not used.
+// filter8. Therefore if it is false when |needs_filter_mask| is true,
+// filter8 output is not used.
 const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
 if (vget_lane_u64(need_filter8, 0) == 0) {
 // filter8() does not apply, but filter4() applies to one or more values.
@@ -815,7 +783,6 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
 } else {
 const uint16x8_t is_flat4_mask_8 =
 vcombine_u16(is_flat4_mask, is_flat4_mask);
-uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
 filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
 p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
 p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
@@ -823,6 +790,7 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
+}

 uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
 // After transpose, |output| will contain rows of the form:
@@ -831,10 +799,9 @@ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,

 // Reverse p values to produce original order:
 // p3 p2 p1 p0 q0 q1 q2 q3
-vst1q_u16(dst_0, reverse_low_half(output[0]));
-vst1q_u16(dst_1, reverse_low_half(output[1]));
-vst1q_u16(dst_2, reverse_low_half(output[2]));
-vst1q_u16(dst_3, reverse_low_half(output[3]));
+store_u16_8x4(s - 4, pitch, reverse_low_half(output[0]),
+reverse_low_half(output[1]), reverse_low_half(output[2]),
+reverse_low_half(output[3]));
 }

 void aom_highbd_lpf_vertical_8_dual_neon(
@@ -864,8 +831,8 @@ static inline void filter14(
 // ^^^^^^^^^^^^^^^^^^^
 // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
 // ^^^^^^^^^^^^^^^^^^^
-uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
-sum = vaddq_u16(sum, p6q6_x7);
+const uint16x8_t p45q45 = vaddq_u16(p5q5, p4q4);
+uint16x8_t sum = vmlaq_n_u16(p6q6_x7, p45q45, 2);

 // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
 // ^^^^^^^
@@ -938,27 +905,10 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 const uint8_t *blimit,
 const uint8_t *limit,
 const uint8_t *thresh, int bd) {
-uint16_t *const dst_p6 = s - 7 * pitch;
-uint16_t *const dst_p5 = s - 6 * pitch;
-uint16_t *const dst_p4 = s - 5 * pitch;
-uint16_t *const dst_p3 = s - 4 * pitch;
-uint16_t *const dst_p2 = s - 3 * pitch;
-uint16_t *const dst_p1 = s - 2 * pitch;
-uint16_t *const dst_p0 = s - pitch;
-uint16_t *const dst_q0 = s;
-uint16_t *const dst_q1 = s + pitch;
-uint16_t *const dst_q2 = s + 2 * pitch;
-uint16_t *const dst_q3 = s + 3 * pitch;
-uint16_t *const dst_q4 = s + 4 * pitch;
-uint16_t *const dst_q5 = s + 5 * pitch;
-uint16_t *const dst_q6 = s + 6 * pitch;
-
-const uint16x4_t src[14] = {
-vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
-vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
-vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
-vld1_u16(dst_q5), vld1_u16(dst_q6)
-};
+uint16x4_t src[14];
+load_u16_4x14(s - 7 * pitch, pitch, &src[0], &src[1], &src[2], &src[3],
+&src[4], &src[5], &src[6], &src[7], &src[8], &src[9], &src[10],
+&src[11], &src[12], &src[13]);

 // Adjust thresholds to bitdepth.
 const int outer_thresh = *blimit << (bd - 8);
@@ -976,12 +926,10 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
 bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);

-#if AOM_ARCH_AARCH64
-if (vaddv_u16(needs_filter_mask) == 0) {
+if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
 // None of the values will be filtered.
 return;
 }
-#endif  // AOM_ARCH_AARCH64
 const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
 const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
 const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
@@ -991,6 +939,32 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 const uint16x4_t is_flat4_outer_mask = vand_u16(
 is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
 vabdq_u16(p0q0, p6q6), bd));
+
+uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+p5q5_output;
+uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) {
+// filter14() applies to all values.
+filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+p5q5_output = f14_p5q5;
+p4q4_output = f14_p4q4;
+p3q3_output = f14_p3q3;
+p2q2_output = f14_p2q2;
+p1q1_output = f14_p1q1;
+p0q0_output = f14_p0q0;
+} else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
+vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) {
+// filter8() applies to all values.
+filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+p5q5_output = p5q5;
+p4q4_output = p4q4;
+p3q3_output = p3q3;
+p2q2_output = f8_p2q2;
+p1q1_output = f8_p1q1;
+p0q0_output = f8_p0q0;
+} else {
 // Copy the masks to the high bits for packed comparisons later.
 const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
 const uint16x8_t needs_filter_mask_8 =
@@ -998,18 +972,13 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,

 uint16x8_t f4_p1q1;
 uint16x8_t f4_p0q0;
-// ZIP1 p0q0, p1q1 may perform better here.
 const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
 filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
 f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
-uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
-p5q5_output;
 // Because we did not return after testing |needs_filter_mask| we know it is
 // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
-// filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
-// output is not used.
-uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+// filter8. Therefore if it is false when |needs_filter_mask| is true,
+// filter8 output is not used.
 const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
 if (vget_lane_u64(need_filter8, 0) == 0) {
 // filter8() and filter14() do not apply, but filter4() applies to one or
@@ -1024,10 +993,11 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 const uint16x8_t use_filter8_mask =
 vcombine_u16(is_flat4_mask, is_flat4_mask);
 filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
-const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+const uint64x1_t need_filter14 =
+vreinterpret_u64_u16(is_flat4_outer_mask);
 if (vget_lane_u64(need_filter14, 0) == 0) {
-// filter14() does not apply, but filter8() and filter4() apply to one or
-// more values.
+// filter14() does not apply, but filter8() and filter4() apply to one
+// or more values.
 p5q5_output = p5q5;
 p4q4_output = p4q4;
 p3q3_output = p3q3;
@@ -1040,7 +1010,6 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 // All filters may contribute values to final outputs.
 const uint16x8_t use_filter14_mask =
 vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
-uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
 filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
 &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
 p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
@@ -1057,19 +1026,15 @@ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
 p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
 }
 }
+}

-vst1_u16(dst_p5, vget_low_u16(p5q5_output));
-vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+store_u16_4x12(s - 6 * pitch, pitch, vget_low_u16(p5q5_output),
+vget_low_u16(p4q4_output), vget_low_u16(p3q3_output),
|
||||||
vst1_u16(dst_p3, vget_low_u16(p3q3_output));
|
vget_low_u16(p2q2_output), vget_low_u16(p1q1_output),
|
||||||
vst1_u16(dst_p2, vget_low_u16(p2q2_output));
|
vget_low_u16(p0q0_output), vget_high_u16(p0q0_output),
|
||||||
vst1_u16(dst_p1, vget_low_u16(p1q1_output));
|
vget_high_u16(p1q1_output), vget_high_u16(p2q2_output),
|
||||||
vst1_u16(dst_p0, vget_low_u16(p0q0_output));
|
vget_high_u16(p3q3_output), vget_high_u16(p4q4_output),
|
||||||
vst1_u16(dst_q0, vget_high_u16(p0q0_output));
|
vget_high_u16(p5q5_output));
|
||||||
vst1_u16(dst_q1, vget_high_u16(p1q1_output));
|
|
||||||
vst1_u16(dst_q2, vget_high_u16(p2q2_output));
|
|
||||||
vst1_u16(dst_q3, vget_high_u16(p3q3_output));
|
|
||||||
vst1_u16(dst_q4, vget_high_u16(p4q4_output));
|
|
||||||
vst1_u16(dst_q5, vget_high_u16(p5q5_output));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_highbd_lpf_horizontal_14_dual_neon(
|
void aom_highbd_lpf_horizontal_14_dual_neon(
|
||||||
@@ -1107,23 +1072,17 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
const uint8_t *blimit,
|
const uint8_t *blimit,
|
||||||
const uint8_t *limit,
|
const uint8_t *limit,
|
||||||
const uint8_t *thresh, int bd) {
|
const uint8_t *thresh, int bd) {
|
||||||
uint16_t *const dst = s - 8;
|
|
||||||
uint16_t *const dst_0 = dst;
|
|
||||||
uint16_t *const dst_1 = dst + pitch;
|
|
||||||
uint16_t *const dst_2 = dst + 2 * pitch;
|
|
||||||
uint16_t *const dst_3 = dst + 3 * pitch;
|
|
||||||
|
|
||||||
// Low halves: p7 p6 p5 p4
|
// Low halves: p7 p6 p5 p4
|
||||||
// High halves: p3 p2 p1 p0
|
// High halves: p3 p2 p1 p0
|
||||||
uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
|
uint16x8_t src_p[4];
|
||||||
vld1q_u16(dst_3) };
|
load_u16_8x4(s - 8, pitch, &src_p[0], &src_p[1], &src_p[2], &src_p[3]);
|
||||||
// p7 will be the low half of src_p[0]. Not used until the end.
|
// p7 will be the low half of src_p[0]. Not used until the end.
|
||||||
transpose_array_inplace_u16_4x8(src_p);
|
transpose_array_inplace_u16_4x8(src_p);
|
||||||
|
|
||||||
// Low halves: q0 q1 q2 q3
|
// Low halves: q0 q1 q2 q3
|
||||||
// High halves: q4 q5 q6 q7
|
// High halves: q4 q5 q6 q7
|
||||||
uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
|
uint16x8_t src_q[4];
|
||||||
vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
|
load_u16_8x4(s, pitch, &src_q[0], &src_q[1], &src_q[2], &src_q[3]);
|
||||||
// q7 will be the high half of src_q[3]. Not used until the end.
|
// q7 will be the high half of src_q[3]. Not used until the end.
|
||||||
transpose_array_inplace_u16_4x8(src_q);
|
transpose_array_inplace_u16_4x8(src_q);
|
||||||
|
|
||||||
@@ -1144,12 +1103,11 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
   filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
                 bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
 
-#if AOM_ARCH_AARCH64
-  if (vaddv_u16(needs_filter_mask) == 0) {
+  if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
     // None of the values will be filtered.
     return;
   }
-#endif  // AOM_ARCH_AARCH64
   const uint16x8_t p4q4 =
       vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
   const uint16x8_t p5q5 =
@@ -1164,6 +1122,32 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
const uint16x4_t is_flat4_outer_mask = vand_u16(
|
const uint16x4_t is_flat4_outer_mask = vand_u16(
|
||||||
is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
|
is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
|
||||||
vabdq_u16(p0q0, p6q6), bd));
|
vabdq_u16(p0q0, p6q6), bd));
|
||||||
|
|
||||||
|
uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
|
||||||
|
p5q5_output;
|
||||||
|
uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
|
||||||
|
uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
|
||||||
|
if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) {
|
||||||
|
// filter14() applies to all values.
|
||||||
|
filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
|
||||||
|
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
|
||||||
|
p5q5_output = f14_p5q5;
|
||||||
|
p4q4_output = f14_p4q4;
|
||||||
|
p3q3_output = f14_p3q3;
|
||||||
|
p2q2_output = f14_p2q2;
|
||||||
|
p1q1_output = f14_p1q1;
|
||||||
|
p0q0_output = f14_p0q0;
|
||||||
|
} else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
|
||||||
|
vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) {
|
||||||
|
// filter8() applies to all values.
|
||||||
|
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
|
||||||
|
p5q5_output = p5q5;
|
||||||
|
p4q4_output = p4q4;
|
||||||
|
p3q3_output = p3q3;
|
||||||
|
p2q2_output = f8_p2q2;
|
||||||
|
p1q1_output = f8_p1q1;
|
||||||
|
p0q0_output = f8_p0q0;
|
||||||
|
} else {
|
||||||
// Copy the masks to the high bits for packed comparisons later.
|
// Copy the masks to the high bits for packed comparisons later.
|
||||||
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
|
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
|
||||||
const uint16x8_t needs_filter_mask_8 =
|
const uint16x8_t needs_filter_mask_8 =
|
||||||
@@ -1171,17 +1155,14 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
|
|
||||||
uint16x8_t f4_p1q1;
|
uint16x8_t f4_p1q1;
|
||||||
uint16x8_t f4_p0q0;
|
uint16x8_t f4_p0q0;
|
||||||
const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
|
const uint16x8_t p0q1 =
|
||||||
|
vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
|
||||||
filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
|
filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
|
||||||
f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
|
f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
|
||||||
|
|
||||||
uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
|
|
||||||
p5q5_output;
|
|
||||||
// Because we did not return after testing |needs_filter_mask| we know it is
|
// Because we did not return after testing |needs_filter_mask| we know it is
|
||||||
// nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
|
// nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
|
||||||
// filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
|
// filter8. Therefore if it is false when |needs_filter_mask| is true,
|
||||||
// output is not used.
|
// filter8 output is not used.
|
||||||
uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
|
|
||||||
const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
|
const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
|
||||||
if (vget_lane_u64(need_filter8, 0) == 0) {
|
if (vget_lane_u64(need_filter8, 0) == 0) {
|
||||||
// filter8() and filter14() do not apply, but filter4() applies to one or
|
// filter8() and filter14() do not apply, but filter4() applies to one or
|
||||||
@@ -1196,10 +1177,11 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
const uint16x8_t use_filter8_mask =
|
const uint16x8_t use_filter8_mask =
|
||||||
vcombine_u16(is_flat4_mask, is_flat4_mask);
|
vcombine_u16(is_flat4_mask, is_flat4_mask);
|
||||||
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
|
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
|
||||||
const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
|
const uint64x1_t need_filter14 =
|
||||||
|
vreinterpret_u64_u16(is_flat4_outer_mask);
|
||||||
if (vget_lane_u64(need_filter14, 0) == 0) {
|
if (vget_lane_u64(need_filter14, 0) == 0) {
|
||||||
// filter14() does not apply, but filter8() and filter4() apply to one or
|
// filter14() does not apply, but filter8() and filter4() apply to one
|
||||||
// more values.
|
// or more values.
|
||||||
p5q5_output = p5q5;
|
p5q5_output = p5q5;
|
||||||
p4q4_output = p4q4;
|
p4q4_output = p4q4;
|
||||||
p3q3_output = p3q3;
|
p3q3_output = p3q3;
|
||||||
@@ -1212,7 +1194,6 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
// All filters may contribute values to final outputs.
|
// All filters may contribute values to final outputs.
|
||||||
const uint16x8_t use_filter14_mask =
|
const uint16x8_t use_filter14_mask =
|
||||||
vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
|
vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
|
||||||
uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
|
|
||||||
filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
|
filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
|
||||||
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
|
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
|
||||||
p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
|
p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
|
||||||
@@ -1229,6 +1210,8 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
|
p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// To get the correctly ordered rows from the transpose, we need:
|
// To get the correctly ordered rows from the transpose, we need:
|
||||||
// p7p3 p6p2 p5p1 p4p0
|
// p7p3 p6p2 p5p1 p4p0
|
||||||
// q0q4 q1q5 q2q6 q3q7
|
// q0q4 q1q5 q2q6 q3q7
|
||||||
@@ -1236,23 +1219,20 @@ void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
|
|||||||
const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output);
|
const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output);
|
||||||
const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output);
|
const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output);
|
||||||
const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
|
const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
|
||||||
|
|
||||||
uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
|
uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
|
||||||
p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
|
p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
|
||||||
transpose_array_inplace_u16_4x8(output_p);
|
|
||||||
uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
|
uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
|
||||||
p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
|
p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
|
||||||
|
|
||||||
|
transpose_array_inplace_u16_4x8(output_p);
|
||||||
transpose_array_inplace_u16_4x8(output_q);
|
transpose_array_inplace_u16_4x8(output_q);
|
||||||
|
|
||||||
// Reverse p values to produce original order:
|
// Reverse p values to produce original order:
|
||||||
// p3 p2 p1 p0 q0 q1 q2 q3
|
// p3 p2 p1 p0 q0 q1 q2 q3
|
||||||
vst1q_u16(dst_0, output_p[0]);
|
store_u16_8x4(s - 8, pitch, output_p[0], output_p[1], output_p[2],
|
||||||
vst1q_u16(dst_0 + 8, output_q[0]);
|
output_p[3]);
|
||||||
vst1q_u16(dst_1, output_p[1]);
|
store_u16_8x4(s, pitch, output_q[0], output_q[1], output_q[2], output_q[3]);
|
||||||
vst1q_u16(dst_1 + 8, output_q[1]);
|
|
||||||
vst1q_u16(dst_2, output_p[2]);
|
|
||||||
vst1q_u16(dst_2 + 8, output_q[2]);
|
|
||||||
vst1q_u16(dst_3, output_p[3]);
|
|
||||||
vst1q_u16(dst_3 + 8, output_q[3]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_highbd_lpf_vertical_14_dual_neon(
|
void aom_highbd_lpf_vertical_14_dual_neon(
|
||||||
|
|||||||
third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -146,172 +146,237 @@ static inline uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
|
|||||||
return mask_8x8;
|
return mask_8x8;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
|
static inline void filter4(const uint8x8_t p0q0, const uint8x8_t p1q1,
|
||||||
uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
|
uint8x8_t *p0q0_output, uint8x8_t *p1q1_output,
|
||||||
uint8x8_t *p0q0, const uint8_t blimit,
|
uint8x8_t mask_8x8, const uint8_t thresh) {
|
||||||
const uint8_t limit, const uint8_t thresh) {
|
|
||||||
uint16x8_t out;
|
|
||||||
uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
|
|
||||||
out_f14_pq5;
|
|
||||||
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
|
|
||||||
uint8x8_t out_f4_pq0, out_f4_pq1;
|
|
||||||
uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
|
|
||||||
uint8x8_t q0p0, q1p1, q2p2;
|
|
||||||
|
|
||||||
// Calculate filter masks
|
|
||||||
mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
|
|
||||||
flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
|
|
||||||
flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
|
|
||||||
{
|
|
||||||
// filter 4
|
|
||||||
int32x2x2_t ps0_qs0, ps1_qs1;
|
|
||||||
int16x8_t filter_s16;
|
|
||||||
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
||||||
uint8x8_t temp0_8x8, temp1_8x8;
|
|
||||||
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
|
|
||||||
int8x8_t op0, oq0, op1, oq1;
|
|
||||||
int8x8_t pq_s0, pq_s1;
|
|
||||||
int8x8_t filter_s8, filter1_s8, filter2_s8;
|
|
||||||
int8x8_t hev_8x8;
|
|
||||||
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
||||||
const int8x8_t val_4 = vdup_n_s8(4);
|
const int8x8_t val_4 = vdup_n_s8(4);
|
||||||
const int8x8_t val_3 = vdup_n_s8(3);
|
const int8x8_t val_3 = vdup_n_s8(3);
|
||||||
|
|
||||||
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
|
int8x8_t pq_s0 = veor_s8(vreinterpret_s8_u8(p0q0), sign_mask);
|
||||||
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
|
int8x8_t pq_s1 = veor_s8(vreinterpret_s8_u8(p1q1), sign_mask);
|
||||||
|
|
||||||
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
int32x2x2_t ps0_qs0 =
|
||||||
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
||||||
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
int32x2x2_t ps1_qs1 =
|
||||||
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
||||||
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
int8x8_t ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
||||||
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
int8x8_t qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
||||||
|
int8x8_t ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
||||||
|
int8x8_t qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
||||||
|
|
||||||
// hev_mask
|
// hev_mask
|
||||||
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
|
uint8x8_t temp0_8x8 = vcgt_u8(vabd_u8(p0q0, p1q1), thresh_f4);
|
||||||
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
uint8x8_t temp1_8x8 =
|
||||||
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
||||||
|
int8x8_t hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
||||||
|
|
||||||
// add outer taps if we have high edge variance
|
// add outer taps if we have high edge variance
|
||||||
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
int8x8_t filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
||||||
|
|
||||||
// inner taps
|
// inner taps
|
||||||
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
int8x8_t temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
||||||
filter_s16 = vmovl_s8(filter_s8);
|
int16x8_t filter_s16 = vmovl_s8(filter_s8);
|
||||||
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
||||||
filter_s8 = vqmovn_s16(filter_s16);
|
filter_s8 = vqmovn_s16(filter_s16);
|
||||||
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
||||||
|
|
||||||
filter1_s8 = vqadd_s8(filter_s8, val_4);
|
int8x8_t filter1_s8 = vqadd_s8(filter_s8, val_4);
|
||||||
filter2_s8 = vqadd_s8(filter_s8, val_3);
|
int8x8_t filter2_s8 = vqadd_s8(filter_s8, val_3);
|
||||||
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
||||||
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
||||||
|
|
||||||
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
int8x8_t oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
||||||
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
int8x8_t op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
||||||
|
|
||||||
hev_8x8 = vmvn_s8(hev_8x8);
|
|
||||||
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
filter_s8 = vbic_s8(filter_s8, hev_8x8);
|
||||||
|
|
||||||
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
int8x8_t oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
||||||
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
int8x8_t op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
||||||
|
|
||||||
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
*p0q0_output = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
||||||
out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
*p1q1_output = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
||||||
}
|
}
|
||||||
// reverse p and q
|
|
||||||
q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
|
|
||||||
q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
|
|
||||||
q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
|
|
||||||
{
|
|
||||||
// filter 8
|
|
||||||
uint16x8_t out_pq0, out_pq1, out_pq2;
|
|
||||||
out = vaddl_u8(*p3q3, *p2q2);
|
|
||||||
out = vaddw_u8(out, *p1q1);
|
|
||||||
out = vaddw_u8(out, *p0q0);
|
|
||||||
|
|
||||||
out = vaddw_u8(out, q0p0);
|
static inline void filter8(const uint8x8_t p0q0, const uint8x8_t p1q1,
|
||||||
out_pq1 = vaddw_u8(out, *p3q3);
|
const uint8x8_t p2q2, const uint8x8_t p3q3,
|
||||||
out_pq2 = vaddw_u8(out_pq1, *p3q3);
|
uint8x8_t *p0q0_output, uint8x8_t *p1q1_output,
|
||||||
out_pq2 = vaddw_u8(out_pq2, *p2q2);
|
uint8x8_t *p2q2_output) {
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p1q1);
|
// Reverse p and q.
|
||||||
out_pq1 = vaddw_u8(out_pq1, q1p1);
|
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4);
|
||||||
|
uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4);
|
||||||
|
uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4);
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out, *p0q0);
|
uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1);
|
||||||
out_pq0 = vaddw_u8(out_pq0, q1p1);
|
uint16x8_t p2q2_p3q3 = vaddl_u8(p3q3, p2q2);
|
||||||
out_pq0 = vaddw_u8(out_pq0, q2p2);
|
uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3);
|
||||||
|
|
||||||
out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
|
uint16x8_t q0p0_p3q3 = vaddl_u8(q0p0, p3q3);
|
||||||
out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
|
uint16x8_t out_q0p0_p3q3 = vaddq_u16(out, q0p0_p3q3);
|
||||||
out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
// filter 14
|
|
||||||
uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
|
|
||||||
uint16x8_t p6q6_2, p6q6_temp, qp_sum;
|
|
||||||
uint8x8_t qp_rev;
|
|
||||||
|
|
||||||
out = vaddw_u8(out, *p4q4);
|
uint16x8_t out_pq2 = vaddq_u16(out_q0p0_p3q3, p2q2_p3q3);
|
||||||
out = vaddw_u8(out, *p5q5);
|
|
||||||
out = vaddw_u8(out, *p6q6);
|
|
||||||
|
|
||||||
out_pq5 = vaddw_u8(out, *p4q4);
|
uint16x8_t p1q1_q1p1 = vaddl_u8(p1q1, q1p1);
|
||||||
out_pq4 = vaddw_u8(out_pq5, *p3q3);
|
uint16x8_t out_pq1 = vaddq_u16(out_q0p0_p3q3, p1q1_q1p1);
|
||||||
out_pq3 = vaddw_u8(out_pq4, *p2q2);
|
|
||||||
|
|
||||||
out_pq5 = vaddw_u8(out_pq5, *p5q5);
|
uint16x8_t q0p0_p0q0 = vaddl_u8(q0p0, p0q0);
|
||||||
out_pq4 = vaddw_u8(out_pq4, *p5q5);
|
uint16x8_t q1p1_q2p2 = vaddl_u8(q1p1, q2p2);
|
||||||
|
uint16x8_t out_pq0 = vaddq_u16(q0p0_p0q0, q1p1_q2p2);
|
||||||
|
out_pq0 = vaddq_u16(out_pq0, out);
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out, *p1q1);
|
*p0q0_output = vrshrn_n_u16(out_pq0, 3);
|
||||||
out_pq1 = vaddw_u8(out_pq0, *p2q2);
|
*p1q1_output = vrshrn_n_u16(out_pq1, 3);
|
||||||
out_pq2 = vaddw_u8(out_pq1, *p3q3);
|
*p2q2_output = vrshrn_n_u16(out_pq2, 3);
|
||||||
|
}
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out_pq0, *p0q0);
|
static inline void filter14(const uint8x8_t p0q0, const uint8x8_t p1q1,
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p0q0);
|
const uint8x8_t p2q2, const uint8x8_t p3q3,
|
||||||
|
const uint8x8_t p4q4, const uint8x8_t p5q5,
|
||||||
|
const uint8x8_t p6q6, uint8x8_t *p0q0_output,
|
||||||
|
uint8x8_t *p1q1_output, uint8x8_t *p2q2_output,
|
||||||
|
uint8x8_t *p3q3_output, uint8x8_t *p4q4_output,
|
||||||
|
uint8x8_t *p5q5_output) {
|
||||||
|
// Reverse p and q.
|
||||||
|
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4);
|
||||||
|
uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4);
|
||||||
|
uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4);
|
||||||
|
uint8x8_t q3p3 = vext_u8(p3q3, p3q3, 4);
|
||||||
|
uint8x8_t q4p4 = vext_u8(p4q4, p4q4, 4);
|
||||||
|
uint8x8_t q5p5 = vext_u8(p5q5, p5q5, 4);
|
||||||
|
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p6q6);
|
uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1);
|
||||||
p6q6_2 = vaddl_u8(*p6q6, *p6q6);
|
uint16x8_t p2q2_p3q3 = vaddl_u8(p2q2, p3q3);
|
||||||
out_pq2 = vaddq_u16(out_pq2, p6q6_2);
|
uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3);
|
||||||
p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
|
|
||||||
|
uint16x8_t q0p0_p4q4 = vaddl_u8(q0p0, p4q4);
|
||||||
|
uint16x8_t p5q5_p6q6 = vaddl_u8(p5q5, p6q6);
|
||||||
|
uint16x8_t tmp = vaddq_u16(q0p0_p4q4, p5q5_p6q6);
|
||||||
|
// This offset removes the need for a rounding shift at the end.
|
||||||
|
uint16x8_t tmp_offset = vaddq_u16(tmp, vdupq_n_u16(1 << 3));
|
||||||
|
out = vaddq_u16(out, tmp_offset);
|
||||||
|
|
||||||
|
uint16x8_t out_pq5 = vaddw_u8(out, p4q4);
|
||||||
|
uint16x8_t out_pq4 = vaddw_u8(out_pq5, p3q3);
|
||||||
|
uint16x8_t out_pq3 = vaddw_u8(out_pq4, p2q2);
|
||||||
|
|
||||||
|
out_pq5 = vaddw_u8(out_pq5, p5q5);
|
||||||
|
|
||||||
|
uint16x8_t out_pq0 = vaddw_u8(out, p1q1);
|
||||||
|
uint16x8_t out_pq1 = vaddw_u8(out_pq0, p2q2);
|
||||||
|
uint16x8_t out_pq2 = vaddw_u8(out_pq1, p3q3);
|
||||||
|
|
||||||
|
uint16x8_t p0q0_q0p0 = vaddl_u8(p0q0, q0p0);
|
||||||
|
out_pq0 = vaddq_u16(out_pq0, p0q0_q0p0);
|
||||||
|
|
||||||
|
uint16x8_t p0q0_p6q6 = vaddl_u8(p0q0, p6q6);
|
||||||
|
out_pq1 = vaddq_u16(out_pq1, p0q0_p6q6);
|
||||||
|
uint16x8_t p5q5_q1p1 = vaddl_u8(p5q5, q1p1);
|
||||||
|
out_pq4 = vaddq_u16(out_pq4, p5q5_q1p1);
|
||||||
|
|
||||||
|
uint16x8_t p6q6_p6q6 = vaddl_u8(p6q6, p6q6);
|
||||||
|
out_pq2 = vaddq_u16(out_pq2, p6q6_p6q6);
|
||||||
|
uint16x8_t p6q6_temp = vaddw_u8(p6q6_p6q6, p6q6);
|
||||||
out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
|
out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
|
||||||
p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
|
p6q6_temp = vaddw_u8(p6q6_temp, p6q6);
|
||||||
out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
|
out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
|
||||||
p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
|
p6q6_temp = vaddq_u16(p6q6_temp, p6q6_p6q6);
|
||||||
out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
|
out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
|
||||||
|
|
||||||
out_pq4 = vaddw_u8(out_pq4, q1p1);
|
uint16x8_t qp_sum = vaddl_u8(q2p2, q1p1);
|
||||||
|
|
||||||
qp_sum = vaddl_u8(q2p2, q1p1);
|
|
||||||
out_pq3 = vaddq_u16(out_pq3, qp_sum);
|
out_pq3 = vaddq_u16(out_pq3, qp_sum);
|
||||||
|
|
||||||
qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
|
qp_sum = vaddw_u8(qp_sum, q3p3);
|
||||||
qp_sum = vaddw_u8(qp_sum, qp_rev);
|
|
||||||
out_pq2 = vaddq_u16(out_pq2, qp_sum);
|
out_pq2 = vaddq_u16(out_pq2, qp_sum);
|
||||||
|
|
||||||
qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
|
qp_sum = vaddw_u8(qp_sum, q4p4);
|
||||||
qp_sum = vaddw_u8(qp_sum, qp_rev);
|
|
||||||
out_pq1 = vaddq_u16(out_pq1, qp_sum);
|
out_pq1 = vaddq_u16(out_pq1, qp_sum);
|
||||||
|
|
||||||
qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
|
qp_sum = vaddw_u8(qp_sum, q5p5);
|
||||||
qp_sum = vaddw_u8(qp_sum, qp_rev);
|
|
||||||
out_pq0 = vaddq_u16(out_pq0, qp_sum);
|
out_pq0 = vaddq_u16(out_pq0, qp_sum);
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out_pq0, q0p0);
|
*p0q0_output = vshrn_n_u16(out_pq0, 4);
|
||||||
|
*p1q1_output = vshrn_n_u16(out_pq1, 4);
|
||||||
|
*p2q2_output = vshrn_n_u16(out_pq2, 4);
|
||||||
|
*p3q3_output = vshrn_n_u16(out_pq3, 4);
|
||||||
|
*p4q4_output = vshrn_n_u16(out_pq4, 4);
|
||||||
|
*p5q5_output = vshrn_n_u16(out_pq5, 4);
|
||||||
|
}
|
||||||
|
|
||||||
out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
|
static inline void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5,
|
||||||
out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
|
uint8x8_t *p4q4, uint8x8_t *p3q3,
|
||||||
out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
|
uint8x8_t *p2q2, uint8x8_t *p1q1,
|
||||||
out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
|
uint8x8_t *p0q0, const uint8_t blimit,
|
||||||
out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
|
const uint8_t limit, const uint8_t thresh) {
|
||||||
out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
|
uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
|
||||||
|
out_f14_pq5;
|
||||||
|
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
|
||||||
|
uint8x8_t out_f4_pq0, out_f4_pq1;
|
||||||
|
|
||||||
|
// Calculate filter masks.
|
||||||
|
uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
|
||||||
|
uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
|
||||||
|
uint8x8_t flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
|
||||||
|
|
||||||
|
// No filtering.
|
||||||
|
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
{
|
|
||||||
uint8x8_t filter4_cond, filter8_cond, filter14_cond;
|
uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8);
|
||||||
filter8_cond = vand_u8(flat_8x8, mask_8x8);
|
uint8x8_t filter4_cond = vmvn_u8(filter8_cond);
|
||||||
filter4_cond = vmvn_u8(filter8_cond);
|
uint8x8_t filter14_cond = vand_u8(filter8_cond, flat2_8x8);
|
||||||
filter14_cond = vand_u8(filter8_cond, flat2_8x8);
|
|
||||||
|
if (vget_lane_s64(vreinterpret_s64_u8(filter14_cond), 0) == -1) {
|
||||||
|
// Only filter14() applies.
|
||||||
|
filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0,
|
||||||
|
&out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4,
|
||||||
|
&out_f14_pq5);
|
||||||
|
|
||||||
|
*p0q0 = out_f14_pq0;
|
||||||
|
*p1q1 = out_f14_pq1;
|
||||||
|
*p2q2 = out_f14_pq2;
|
||||||
|
*p3q3 = out_f14_pq3;
|
||||||
|
*p4q4 = out_f14_pq4;
|
||||||
|
*p5q5 = out_f14_pq5;
|
||||||
|
} else if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 &&
|
||||||
|
vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) {
|
||||||
|
// Only filter8() applies.
|
||||||
|
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2);
|
||||||
|
|
||||||
|
*p0q0 = out_f7_pq0;
|
||||||
|
*p1q1 = out_f7_pq1;
|
||||||
|
*p2q2 = out_f7_pq2;
|
||||||
|
} else {
|
||||||
|
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
|
||||||
|
|
||||||
|
if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 &&
|
||||||
|
vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) {
|
||||||
|
// filter8() and filter14() do not apply, but filter4() applies to one or
|
||||||
|
// more values.
|
||||||
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
|
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
|
||||||
|
} else {
|
||||||
|
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1,
|
||||||
|
&out_f7_pq2);
|
||||||
|
|
||||||
|
if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0) {
|
||||||
|
// filter14() does not apply, but filter8() and filter4() apply to one
|
||||||
|
// or more values. filter4 outputs
|
||||||
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
|
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
|
||||||
|
|
||||||
|
// filter8 outputs
|
||||||
|
*p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
|
||||||
|
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
|
||||||
|
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
|
||||||
|
} else {
|
||||||
|
// All filters may contribute values to final outputs.
|
||||||
|
filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0,
|
||||||
|
&out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4,
|
||||||
|
&out_f14_pq5);
|
||||||
|
|
||||||
// filter4 outputs
|
// filter4 outputs
|
||||||
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
@@ -330,111 +395,46 @@ static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
|
|||||||
*p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
|
*p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
|
||||||
*p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
|
*p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
|
static inline void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
|
||||||
uint8x8_t *p0q0, const uint8_t blimit,
|
uint8x8_t *p0q0, const uint8_t blimit,
|
||||||
const uint8_t limit, const uint8_t thresh) {
|
const uint8_t limit, const uint8_t thresh) {
|
||||||
uint16x8_t out;
|
|
||||||
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
|
uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
|
||||||
uint8x8_t out_f4_pq0, out_f4_pq1;
|
uint8x8_t out_f4_pq0, out_f4_pq1;
|
||||||
uint8x8_t mask_8x8, flat_8x8;
|
|
||||||
|
|
||||||
// Calculate filter masks
|
// Calculate filter masks.
|
||||||
mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
|
uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
|
||||||
flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
|
uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
|
||||||
{
|
|
||||||
// filter 4
|
|
||||||
int32x2x2_t ps0_qs0, ps1_qs1;
|
|
||||||
int16x8_t filter_s16;
|
|
||||||
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
|
||||||
uint8x8_t temp0_8x8, temp1_8x8;
|
|
||||||
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
|
|
||||||
int8x8_t op0, oq0, op1, oq1;
|
|
||||||
int8x8_t pq_s0, pq_s1;
|
|
||||||
int8x8_t filter_s8, filter1_s8, filter2_s8;
|
|
||||||
int8x8_t hev_8x8;
|
|
||||||
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
|
||||||
const int8x8_t val_4 = vdup_n_s8(4);
|
|
||||||
const int8x8_t val_3 = vdup_n_s8(3);
|
|
||||||
|
|
||||||
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
|
// No filtering.
|
||||||
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
|
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
|
||||||
|
return;
|
||||||
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
|
||||||
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
|
||||||
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
|
||||||
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
|
||||||
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
|
||||||
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
|
||||||
|
|
||||||
// hev_mask
|
|
||||||
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
|
|
||||||
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
|
||||||
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
|
||||||
|
|
||||||
// add outer taps if we have high edge variance
|
|
||||||
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
// inner taps
|
|
||||||
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
|
||||||
filter_s16 = vmovl_s8(filter_s8);
|
|
||||||
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
|
||||||
filter_s8 = vqmovn_s16(filter_s16);
|
|
||||||
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
|
||||||
|
|
||||||
filter1_s8 = vqadd_s8(filter_s8, val_4);
|
|
||||||
filter2_s8 = vqadd_s8(filter_s8, val_3);
|
|
||||||
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
|
||||||
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
|
||||||
|
|
||||||
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
|
||||||
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
|
||||||
|
|
||||||
hev_8x8 = vmvn_s8(hev_8x8);
|
|
||||||
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
|
||||||
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
|
||||||
|
|
||||||
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
|
||||||
out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
|
||||||
}
|
}
|
||||||
{
|
|
||||||
// filter 8
|
|
||||||
uint16x8_t out_pq0, out_pq1, out_pq2;
|
|
||||||
uint8x8_t q0p0, q1p1, q2p2;
|
|
||||||
|
|
||||||
out = vaddl_u8(*p3q3, *p2q2);
|
uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8);
|
||||||
out = vaddw_u8(out, *p1q1);
|
uint8x8_t filter4_cond = vmvn_u8(filter8_cond);
|
||||||
out = vaddw_u8(out, *p0q0);
|
|
||||||
|
|
||||||
// reverse p and q
|
// Not needing filter4() at all is a very common case, so isolate it to avoid
|
||||||
q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
|
// needlessly computing filter4().
|
||||||
q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
|
if (vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) {
|
||||||
q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
|
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2);
|
||||||
|
|
||||||
out = vaddw_u8(out, q0p0);
|
*p0q0 = out_f7_pq0;
|
||||||
out_pq1 = vaddw_u8(out, *p3q3);
|
*p1q1 = out_f7_pq1;
|
||||||
out_pq2 = vaddw_u8(out_pq1, *p3q3);
|
*p2q2 = out_f7_pq2;
|
||||||
out_pq2 = vaddw_u8(out_pq2, *p2q2);
|
} else {
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p1q1);
|
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
|
||||||
out_pq1 = vaddw_u8(out_pq1, q1p1);
|
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out, *p0q0);
|
if (vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) {
|
||||||
out_pq0 = vaddw_u8(out_pq0, q1p1);
|
// filter8() does not apply, but filter4() applies to one or more values.
|
||||||
out_pq0 = vaddw_u8(out_pq0, q2p2);
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
|
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
|
||||||
out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
|
} else {
|
||||||
out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
|
filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1,
|
||||||
out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
|
&out_f7_pq2);
|
||||||
}
|
|
||||||
{
|
|
||||||
uint8x8_t filter4_cond, filter8_cond;
|
|
||||||
filter8_cond = vand_u8(flat_8x8, mask_8x8);
|
|
||||||
filter4_cond = vmvn_u8(filter8_cond);
|
|
||||||
|
|
||||||
// filter4 outputs
|
// filter4 outputs
|
||||||
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
@@ -445,103 +445,65 @@ static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
|
|||||||
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
|
*p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
|
||||||
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
|
*p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
|
static inline void filter6(const uint8x8_t p0q0, const uint8x8_t p1q1,
|
||||||
|
const uint8x8_t p2q2, uint8x8_t *p0q0_output,
|
||||||
|
uint8x8_t *p1q1_output) {
|
||||||
|
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4);
|
||||||
|
|
||||||
|
uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1);
|
||||||
|
uint16x8_t out = vaddq_u16(p0q0_p1q1, p0q0_p1q1);
|
||||||
|
|
||||||
|
uint16x8_t q0p0_p2q2 = vaddl_u8(q0p0, p2q2);
|
||||||
|
out = vaddq_u16(out, q0p0_p2q2);
|
||||||
|
|
||||||
|
uint16x8_t q0p0_q1p1 = vextq_u16(p0q0_p1q1, p0q0_p1q1, 4);
|
||||||
|
uint16x8_t out_pq0 = vaddq_u16(out, q0p0_q1p1);
|
||||||
|
|
||||||
|
uint16x8_t p2q2_p2q2 = vaddl_u8(p2q2, p2q2);
|
||||||
|
uint16x8_t out_pq1 = vaddq_u16(out, p2q2_p2q2);
|
||||||
|
|
||||||
|
*p0q0_output = vrshrn_n_u16(out_pq0, 3);
|
||||||
|
*p1q1_output = vrshrn_n_u16(out_pq1, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
|
||||||
const uint8_t blimit, const uint8_t limit,
|
const uint8_t blimit, const uint8_t limit,
|
||||||
const uint8_t thresh) {
|
const uint8_t thresh) {
|
||||||
uint16x8_t out;
|
|
||||||
uint8x8_t out_f6_pq0, out_f6_pq1;
|
uint8x8_t out_f6_pq0, out_f6_pq1;
|
||||||
uint8x8_t out_f4_pq0, out_f4_pq1;
|
uint8x8_t out_f4_pq0, out_f4_pq1;
|
||||||
uint8x8_t mask_8x8, flat_8x8;
|
|
||||||
|
|
||||||
// Calculate filter masks
|
// Calculate filter masks.
|
||||||
mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
|
uint8x8_t mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
|
||||||
flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
|
uint8x8_t flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
|
||||||
{
|
|
||||||
// filter 4
|
|
||||||
int32x2x2_t ps0_qs0, ps1_qs1;
|
|
||||||
int16x8_t filter_s16;
|
|
||||||
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
|
||||||
uint8x8_t temp0_8x8, temp1_8x8;
|
|
||||||
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
|
|
||||||
int8x8_t op0, oq0, op1, oq1;
|
|
||||||
int8x8_t pq_s0, pq_s1;
|
|
||||||
int8x8_t filter_s8, filter1_s8, filter2_s8;
|
|
||||||
int8x8_t hev_8x8;
|
|
||||||
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
|
||||||
const int8x8_t val_4 = vdup_n_s8(4);
|
|
||||||
const int8x8_t val_3 = vdup_n_s8(3);
|
|
||||||
|
|
||||||
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
|
// No filtering.
|
||||||
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
|
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
|
||||||
|
return;
|
||||||
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
|
||||||
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
|
||||||
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
|
||||||
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
|
||||||
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
|
||||||
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
|
||||||
|
|
||||||
// hev_mask
|
|
||||||
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
|
|
||||||
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
|
||||||
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
|
||||||
|
|
||||||
// add outer taps if we have high edge variance
|
|
||||||
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
// inner taps
|
|
||||||
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
|
||||||
filter_s16 = vmovl_s8(filter_s8);
|
|
||||||
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
|
||||||
filter_s8 = vqmovn_s16(filter_s16);
|
|
||||||
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
|
||||||
|
|
||||||
filter1_s8 = vqadd_s8(filter_s8, val_4);
|
|
||||||
filter2_s8 = vqadd_s8(filter_s8, val_3);
|
|
||||||
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
|
||||||
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
|
||||||
|
|
||||||
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
|
||||||
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
|
||||||
|
|
||||||
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
|
||||||
filter_s8 = vbic_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
|
||||||
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
|
||||||
|
|
||||||
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
|
||||||
out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
|
||||||
}
|
}
|
||||||
{
|
|
||||||
// filter 6
|
|
||||||
uint16x8_t out_pq0, out_pq1;
|
|
||||||
uint8x8_t pq_rev;
|
|
||||||
|
|
||||||
out = vaddl_u8(*p0q0, *p1q1);
|
uint8x8_t filter6_cond = vand_u8(flat_8x8, mask_8x8);
|
||||||
out = vaddq_u16(out, out);
|
uint8x8_t filter4_cond = vmvn_u8(filter6_cond);
|
||||||
out = vaddw_u8(out, *p2q2);
|
|
||||||
|
|
||||||
pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
|
// Not needing filter4 at all is a very common case, so isolate it to avoid
|
||||||
out = vaddw_u8(out, pq_rev);
|
// needlessly computing filter4.
|
||||||
|
if (vget_lane_s64(vreinterpret_s64_u8(filter6_cond), 0) == -1) {
|
||||||
|
filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1);
|
||||||
|
|
||||||
out_pq0 = vaddw_u8(out, pq_rev);
|
*p0q0 = out_f6_pq0;
|
||||||
pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
|
*p1q1 = out_f6_pq1;
|
||||||
out_pq0 = vaddw_u8(out_pq0, pq_rev);
|
} else {
|
||||||
|
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
|
||||||
|
|
||||||
out_pq1 = vaddw_u8(out, *p2q2);
|
if (vget_lane_u64(vreinterpret_u64_u8(filter6_cond), 0) == 0) {
|
||||||
out_pq1 = vaddw_u8(out_pq1, *p2q2);
|
// filter6 does not apply, but filter4 applies to one or more values.
|
||||||
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
|
*p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
|
||||||
out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
|
} else {
|
||||||
}
|
// All filters may contribute to the final output.
|
||||||
{
|
filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1);
|
||||||
uint8x8_t filter4_cond, filter6_cond;
|
|
||||||
filter6_cond = vand_u8(flat_8x8, mask_8x8);
|
|
||||||
filter4_cond = vmvn_u8(filter6_cond);
|
|
||||||
|
|
||||||
// filter4 outputs
|
// filter4 outputs
|
||||||
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
*p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
|
||||||
@@ -551,68 +513,26 @@ static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
|
|||||||
*p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
|
*p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
|
||||||
*p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
|
*p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
|
static inline void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0,
|
||||||
const uint8_t limit, const uint8_t thresh) {
|
const uint8_t blimit, const uint8_t limit,
|
||||||
int32x2x2_t ps0_qs0, ps1_qs1;
|
const uint8_t thresh) {
|
||||||
int16x8_t filter_s16;
|
uint8x8_t out_f4_pq0, out_f4_pq1;
|
||||||
const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
|
|
||||||
uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
|
|
||||||
int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
|
|
||||||
int8x8_t op0, oq0, op1, oq1;
|
|
||||||
int8x8_t pq_s0, pq_s1;
|
|
||||||
int8x8_t filter_s8, filter1_s8, filter2_s8;
|
|
||||||
int8x8_t hev_8x8;
|
|
||||||
const int8x8_t sign_mask = vdup_n_s8(0x80);
|
|
||||||
const int8x8_t val_4 = vdup_n_s8(4);
|
|
||||||
const int8x8_t val_3 = vdup_n_s8(3);
|
|
||||||
|
|
||||||
// Calculate filter mask
|
// Calculate filter mask
|
||||||
mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
|
uint8x8_t mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
|
||||||
|
|
||||||
pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
|
// No filtering.
|
||||||
pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
|
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
|
filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh);
|
||||||
ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
|
|
||||||
ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
|
|
||||||
qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
|
|
||||||
ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
|
|
||||||
qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
|
|
||||||
|
|
||||||
// hev_mask
|
*p0q0 = out_f4_pq0;
|
||||||
temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
|
*p1q1 = out_f4_pq1;
|
||||||
temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
|
|
||||||
hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
|
|
||||||
|
|
||||||
// add outer taps if we have high edge variance
|
|
||||||
filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
|
|
||||||
filter_s8 = vand_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
// inner taps
|
|
||||||
temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
|
|
||||||
filter_s16 = vmovl_s8(filter_s8);
|
|
||||||
filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
|
|
||||||
filter_s8 = vqmovn_s16(filter_s16);
|
|
||||||
filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
|
|
||||||
|
|
||||||
filter1_s8 = vqadd_s8(filter_s8, val_4);
|
|
||||||
filter2_s8 = vqadd_s8(filter_s8, val_3);
|
|
||||||
filter1_s8 = vshr_n_s8(filter1_s8, 3);
|
|
||||||
filter2_s8 = vshr_n_s8(filter2_s8, 3);
|
|
||||||
|
|
||||||
oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
|
|
||||||
op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
|
|
||||||
|
|
||||||
filter_s8 = vrshr_n_s8(filter1_s8, 1);
|
|
||||||
filter_s8 = vbic_s8(filter_s8, hev_8x8);
|
|
||||||
|
|
||||||
oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
|
|
||||||
op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
|
|
||||||
|
|
||||||
*p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
|
|
||||||
*p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
|
void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
|
||||||
|
|||||||
third_party/aom/aom_dsp/arm/mem_neon.h
@@ -55,12 +55,52 @@ static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   return res;
 }
 
+static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
+  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
+  return res;
+}
+
+static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
+  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
+                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
+  return res;
+}
+
+static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
+  vst1_u8(ptr + 0 * 8, a.val[0]);
+  vst1_u8(ptr + 1 * 8, a.val[1]);
+}
+
+static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
+  vst1_u8(ptr + 0 * 8, a.val[0]);
+  vst1_u8(ptr + 1 * 8, a.val[1]);
+  vst1_u8(ptr + 2 * 8, a.val[2]);
+  vst1_u8(ptr + 3 * 8, a.val[3]);
+}
+
+static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) {
+  vst1q_u16(ptr + 0 * 8, a.val[0]);
+  vst1q_u16(ptr + 1 * 8, a.val[1]);
+}
+
+static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) {
+  vst1q_u16(ptr + 0 * 8, a.val[0]);
+  vst1q_u16(ptr + 1 * 8, a.val[1]);
+  vst1q_u16(ptr + 2 * 8, a.val[2]);
+  vst1q_u16(ptr + 3 * 8, a.val[3]);
+}
+
 #elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
 #if __GNUC__ < 8
 static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
   uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
   return res;
 }
 
+static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
+  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
+  return res;
+}
 #endif  // __GNUC__ < 8
 
 #if __GNUC__ < 9
@@ -71,13 +111,30 @@ static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
 }
 #endif  // __GNUC__ < 9
 
-// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards.
 #if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
 static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
   uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                          vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
   return res;
 }
 
+static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
+  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
+                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
+  return res;
+}
+
+static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
+  vst1_u8(ptr + 0 * 8, a.val[0]);
+  vst1_u8(ptr + 1 * 8, a.val[1]);
+}
+
+static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
+  vst1_u8(ptr + 0 * 8, a.val[0]);
+  vst1_u8(ptr + 1 * 8, a.val[1]);
+  vst1_u8(ptr + 2 * 8, a.val[2]);
+  vst1_u8(ptr + 3 * 8, a.val[3]);
+}
 #endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
 #endif  // defined(__GNUC__) && !defined(__clang__)
 
@@ -215,6 +272,23 @@ static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
   s += p;
 }
 
+static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3,
+                                uint16x4_t *const s4, uint16x4_t *const s5) {
+  *s0 = vld1_u16(s);
+  s += p;
+  *s1 = vld1_u16(s);
+  s += p;
+  *s2 = vld1_u16(s);
+  s += p;
+  *s3 = vld1_u16(s);
+  s += p;
+  *s4 = vld1_u16(s);
+  s += p;
+  *s5 = vld1_u16(s);
+}
+
 static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
@@ -235,6 +309,65 @@ static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
   *s6 = vld1_u16(s);
 }
 
+static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3,
+                                uint16x4_t *const s4, uint16x4_t *const s5,
+                                uint16x4_t *const s6, uint16x4_t *const s7) {
+  *s0 = vld1_u16(s);
+  s += p;
+  *s1 = vld1_u16(s);
+  s += p;
+  *s2 = vld1_u16(s);
+  s += p;
+  *s3 = vld1_u16(s);
+  s += p;
+  *s4 = vld1_u16(s);
+  s += p;
+  *s5 = vld1_u16(s);
+  s += p;
+  *s6 = vld1_u16(s);
+  s += p;
+  *s7 = vld1_u16(s);
+}
+
+static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p,
+                                 uint16x4_t *const s0, uint16x4_t *const s1,
+                                 uint16x4_t *const s2, uint16x4_t *const s3,
+                                 uint16x4_t *const s4, uint16x4_t *const s5,
+                                 uint16x4_t *const s6, uint16x4_t *const s7,
+                                 uint16x4_t *const s8, uint16x4_t *const s9,
+                                 uint16x4_t *const s10, uint16x4_t *const s11,
+                                 uint16x4_t *const s12, uint16x4_t *const s13) {
+  *s0 = vld1_u16(s);
+  s += p;
+  *s1 = vld1_u16(s);
+  s += p;
+  *s2 = vld1_u16(s);
+  s += p;
+  *s3 = vld1_u16(s);
+  s += p;
+  *s4 = vld1_u16(s);
+  s += p;
+  *s5 = vld1_u16(s);
+  s += p;
+  *s6 = vld1_u16(s);
+  s += p;
+  *s7 = vld1_u16(s);
+  s += p;
+  *s8 = vld1_u16(s);
+  s += p;
+  *s9 = vld1_u16(s);
+  s += p;
+  *s10 = vld1_u16(s);
+  s += p;
+  *s11 = vld1_u16(s);
+  s += p;
+  *s12 = vld1_u16(s);
+  s += p;
+  *s13 = vld1_u16(s);
+}
+
 static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1) {
   *s0 = vld1q_s16(s);
@@ -597,6 +730,56 @@ static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
|
|||||||
vst1_u16(s, s3);
|
vst1_u16(s, s3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride,
|
||||||
|
const uint16x4_t s0, const uint16x4_t s1,
|
||||||
|
const uint16x4_t s2, const uint16x4_t s3,
|
||||||
|
const uint16x4_t s4, const uint16x4_t s5) {
|
||||||
|
vst1_u16(s, s0);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s1);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s2);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s3);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s4);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s5);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride,
|
||||||
|
const uint16x4_t s0, const uint16x4_t s1,
|
||||||
|
const uint16x4_t s2, const uint16x4_t s3,
|
||||||
|
const uint16x4_t s4, const uint16x4_t s5,
|
||||||
|
const uint16x4_t s6, const uint16x4_t s7,
|
||||||
|
const uint16x4_t s8, const uint16x4_t s9,
|
||||||
|
const uint16x4_t s10, const uint16x4_t s11) {
|
||||||
|
vst1_u16(s, s0);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s1);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s2);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s3);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s4);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s5);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s6);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s7);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s8);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s9);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s10);
|
||||||
|
s += dst_stride;
|
||||||
|
vst1_u16(s, s11);
|
||||||
|
s += dst_stride;
|
||||||
|
}
|
||||||
|
|
||||||
static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
|
static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
|
||||||
const uint16x8_t s0, const uint16x8_t s1) {
|
const uint16x8_t s0, const uint16x8_t s1) {
|
||||||
vst1q_u16(s, s0);
|
vst1q_u16(s, s0);
|
||||||
|
|||||||

third_party/aom/aom_dsp/x86/synonyms.h | 11 (vendored)
@@ -46,16 +46,6 @@ static inline __m128i xx_loadu_128(const void *a) {
   return _mm_loadu_si128((const __m128i *)a);
 }
 
-// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
-// manually on older compilers.
-#if !defined(__clang__) && __GNUC_MAJOR__ < 9
-static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
-  __m64 hi_, lo_;
-  memcpy(&hi_, hi, sizeof(hi_));
-  memcpy(&lo_, lo, sizeof(lo_));
-  return _mm_set_epi64(hi_, lo_);
-}
-#else
 // Load 64 bits from each of hi and low, and pack into an SSE register
 // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
 // the strict aliasing rule, this takes a different approach
@@ -63,7 +53,6 @@ static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
   return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
                             _mm_loadl_epi64((const __m128i *)hi));
 }
-#endif
 
 static inline void xx_storel_32(void *const a, const __m128i v) {
   const int val = _mm_cvtsi128_si32(v);

third_party/aom/aom_dsp/x86/synonyms_avx2.h | 15 (vendored)
@@ -76,26 +76,11 @@ static inline __m256i yy_loadu_4x64(const void *e3, const void *e2,
   return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
 }
 
-#define GCC_VERSION (__GNUC__ * 10000 \
-                     + __GNUC_MINOR__ * 100 \
-                     + __GNUC_PATCHLEVEL__)
-
-// _mm256_loadu2_m128i has been introduced in GCC 10.1
-#if !defined(__clang__) && GCC_VERSION < 101000
-static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
-  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
-  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
-  return _mm256_set_m128i(mhi, mlo);
-}
-#else
 static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
   __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
   __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
   return yy_set_m128i(mhi, mlo);
 }
-#endif
-
-#undef GCC_VERSION
 
 static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
   _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));

third_party/aom/aom_ports/aom_ports.cmake | 8 (vendored)
@@ -38,6 +38,9 @@ endif()
 list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h"
             "${AOM_ROOT}/aom_ports/ppc_cpudetect.c")
 
+list(APPEND AOM_PORTS_SOURCES_RISCV "${AOM_ROOT}/aom_ports/riscv.h"
+            "${AOM_ROOT}/aom_ports/riscv_cpudetect.c")
+
 # For arm and x86 targets:
 #
 # * Creates the aom_ports build target, adds the includes in aom_ports to the
@@ -68,9 +71,12 @@ function(setup_aom_ports_targets)
   elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
     add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
     set(aom_ports_has_symbols 1)
+  elseif("${AOM_TARGET_CPU}" MATCHES "riscv")
+    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_RISCV})
+    set(aom_ports_has_symbols 1)
   endif()
 
-  if("${AOM_TARGET_CPU}" MATCHES "arm|ppc")
+  if("${AOM_TARGET_CPU}" MATCHES "arm|ppc|riscv")
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
     if(BUILD_SHARED_LIBS)
       target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_ports>)

third_party/aom/aom_ports/riscv.h | 30 (new file, vendored)
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_RISCV_H_
+#define AOM_AOM_PORTS_RISCV_H_
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_RVV 0x01
+
+int riscv_simd_caps(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_PORTS_RISCV_H_

third_party/aom/aom_ports/riscv_cpudetect.c | 38 (new file, vendored)
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/riscv.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+
+#include <sys/auxv.h>
+
+#define HWCAP_RVV (1 << ('v' - 'a'))
+
+int riscv_simd_caps(void) {
+  int flags = 0;
+#if HAVE_RVV
+  unsigned long hwcap = getauxval(AT_HWCAP);
+  if (hwcap & HWCAP_RVV) flags |= HAS_RVV;
+#endif
+  return flags;
+}
+#else
+// If there is no RTCD the function pointers are not used and can not be
+// changed.
+int riscv_simd_caps(void) { return 0; }
+#endif  // CONFIG_RUNTIME_CPU_DETECT

third_party/aom/apps/aomenc.c | 5 (vendored)
@@ -2318,8 +2318,9 @@ int main(int argc, const char **argv_) {
               "match input format.\n",
               stream->config.cfg.g_profile);
     }
-    if ((global.show_psnr == 2) && (stream->config.cfg.g_input_bit_depth ==
-                                    stream->config.cfg.g_bit_depth)) {
+    if (global.show_psnr == 2 &&
+        stream->config.cfg.g_input_bit_depth ==
+            (unsigned int)stream->config.cfg.g_bit_depth) {
       fprintf(stderr,
               "Warning: --psnr==2 and --psnr==1 will provide same "
               "results when input bit-depth == stream bit-depth, "

third_party/aom/av1/av1.cmake | 10 (vendored)
@@ -445,6 +445,9 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
 
 list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c")
 
+list(APPEND AOM_AV1_COMMON_INTRIN_RVV
+            "${AOM_ROOT}/av1/common/riscv/cdef_block_rvv.c")
+
 if(CONFIG_THREE_PASS)
   list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/thirdpass.c"
               "${AOM_ROOT}/av1/encoder/thirdpass.h")
@@ -822,6 +825,13 @@ function(setup_av1_targets)
     endif()
   endif()
 
+  if(HAVE_RVV)
+    if(AOM_AV1_COMMON_INTRIN_RVV)
+      add_intrinsics_object_library("-march=rv64gcv" "rvv" "aom_av1_common"
+                                    "AOM_AV1_COMMON_INTRIN_RVV")
+    endif()
+  endif()
+
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)

third_party/aom/av1/av1_cx_iface.c | 7 (vendored)
@@ -1084,7 +1084,6 @@ static void set_encoder_config(AV1EncoderConfig *oxcf,
   AlgoCfg *const algo_cfg = &oxcf->algo_cfg;
   ToolCfg *const tool_cfg = &oxcf->tool_cfg;
 
-  const int is_vbr = cfg->rc_end_usage == AOM_VBR;
   oxcf->profile = cfg->g_profile;
   oxcf->max_threads = (int)cfg->g_threads;
 
@@ -1167,9 +1166,9 @@ static void set_encoder_config(AV1EncoderConfig *oxcf,
   rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
   rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct;
   rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct;
-  rc_cfg->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
-  rc_cfg->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
-  rc_cfg->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+  rc_cfg->maximum_buffer_size_ms = cfg->rc_buf_sz;
+  rc_cfg->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
+  rc_cfg->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz;
   // Convert target bandwidth from Kbit/s to Bit/s
   rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate;
   rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh;

third_party/aom/av1/common/arm/cfl_neon.c | 19 (vendored)
@@ -13,6 +13,7 @@
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/arm/mem_neon.h"
 #include "av1/common/cfl.h"
 
 static inline void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
@@ -428,10 +429,7 @@ static inline int16x8_t predict_w8(const int16_t *pred_buf_q3,
 static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
                                       int16x8_t alpha_sign, int abs_alpha_q12,
                                       int16x8_t dc) {
-  // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2
-  // does not interleave, but is not currently available in the compilier used
-  // by the AOM build system.
-  const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3);
+  const int16x8x2_t ac_q3 = vld1q_s16_x2(pred_buf_q3);
   const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
   const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
   const int16x8_t scaled_luma_0 =
@@ -447,10 +445,7 @@ static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
 static inline int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
                                       int16x8_t alpha_sign, int abs_alpha_q12,
                                       int16x8_t dc) {
-  // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4
-  // does not interleave, but is not currently available in the compilier used
-  // by the AOM build system.
-  const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3);
+  const int16x8x4_t ac_q3 = vld1q_s16_x4(pred_buf_q3);
   const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
   const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
   const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]);
@@ -497,7 +492,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
           predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
       const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]),
                                      vqmovun_s16(pred.val[1]) } };
-      vst2_u8(dst, predun);
+      vst1_u8_x2(dst, predun);
     } else {
       const int16x8x4_t pred =
          predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
@@ -505,7 +500,7 @@ static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
        { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]),
          vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) }
       };
-      vst4_u8(dst, predun);
+      vst1_u8_x4(dst, predun);
     }
     dst += dst_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
@@ -574,11 +569,11 @@ static inline void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
     } else if (width == 16) {
       const int16x8x2_t pred =
          predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
-      vst2q_u16(dst, clamp2q_s16(pred, max_16x8));
+      vst1q_u16_x2(dst, clamp2q_s16(pred, max_16x8));
     } else {
      const int16x8x4_t pred =
          predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
-      vst4q_u16(dst, clamp4q_s16(pred, max_16x8));
+      vst1q_u16_x4(dst, clamp4q_s16(pred, max_16x8));
     }
     dst += dst_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);

@@ -53,8 +53,7 @@ static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp,
 static AOM_FORCE_INLINE int16x8_t load_filters_1(int ofs) {
   const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS);
 
-  const int16_t *base =
-      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS];
   return vld1q_s16(base + ofs0 * 8);
 }
 
@@ -65,8 +64,7 @@ static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int ofs,
   const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS);
   const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS);
 
-  const int16_t *base =
-      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS];
   out[0] = vld1q_s16(base + ofs0 * 8);
   out[1] = vld1q_s16(base + ofs1 * 8);
   out[2] = vld1q_s16(base + ofs2 * 8);
@@ -84,8 +82,7 @@ static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int ofs,
   const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS);
   const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS);
 
-  const int16_t *base =
-      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS];
   out[0] = vld1q_s16(base + ofs0 * 8);
   out[1] = vld1q_s16(base + ofs1 * 8);
   out[2] = vld1q_s16(base + ofs2 * 8);

third_party/aom/av1/common/arm/warp_plane_neon.c | 12 (vendored)
@@ -101,8 +101,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_4x1_f1_beta0(in, f_s16);
 }
 
@@ -140,8 +139,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }
 
@@ -156,8 +154,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
   int16x4_t s6 = vget_low_s16(src[6]);
   int16x4_t s7 = vget_low_s16(src[7]);
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
@@ -210,8 +207,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
   int16x8_t s6 = src[6];
   int16x8_t s7 = src[7];
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);

third_party/aom/av1/common/arm/warp_plane_neon.h | 56 (vendored)
@@ -61,34 +61,34 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
 
 static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset,
                                             int stride) {
-  out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
+  out[0] = vld1q_s16(
+      av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[1] = vld1q_s16(
+      av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[2] = vld1q_s16(
+      av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[3] = vld1q_s16(
+      av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]);
 }
 
 static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int offset,
                                             int stride) {
-  out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[4] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 4 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[5] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 5 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[6] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 6 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
-  out[7] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 7 * stride) >>
-                                                      WARPEDDIFF_PREC_BITS)));
+  out[0] = vld1q_s16(
+      av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[1] = vld1q_s16(
+      av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[2] = vld1q_s16(
+      av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[3] = vld1q_s16(
+      av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[4] = vld1q_s16(
+      av1_warped_filter[(offset + 4 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[5] = vld1q_s16(
+      av1_warped_filter[(offset + 5 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[6] = vld1q_s16(
+      av1_warped_filter[(offset + 6 * stride) >> WARPEDDIFF_PREC_BITS]);
+  out[7] = vld1q_s16(
+      av1_warped_filter[(offset + 7 * stride) >> WARPEDDIFF_PREC_BITS]);
 }
 
 static AOM_FORCE_INLINE int clamp_iy(int iy, int height) {
@@ -175,8 +175,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(
   if (p_width == 4) {
     if (beta == 0) {
       if (alpha == 0) {
-        int16x8_t f_s16 = vld1q_s16(
-            (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS)));
+        int16x8_t f_s16 =
+            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16);
       } else {
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha);
@@ -193,8 +193,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(
   } else {
     if (beta == 0) {
      if (alpha == 0) {
-        int16x8_t f_s16 = vld1q_s16(
-            (int16_t *)(av1_warped_filter + (sx4 >> WARPEDDIFF_PREC_BITS)));
+        int16x8_t f_s16 =
+            vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16);
       } else {
         APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha);

@@ -109,8 +109,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_4x1_f1_beta0(in, f_s16);
 }
 
@@ -145,8 +144,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }
 
@@ -161,8 +159,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
   int16x4_t s6 = vget_low_s16(src[6]);
   int16x4_t s7 = vget_low_s16(src[7]);
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
@@ -215,8 +212,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
   int16x8_t s6 = src[6];
   int16x8_t s7 = src[7];
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);

third_party/aom/av1/common/arm/warp_plane_sve.c | 12 (vendored)
@@ -112,8 +112,7 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_4x1_f1_beta0(in, f_s16);
 }
 
@@ -148,8 +147,7 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
 
 static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                            int sx) {
-  int16x8_t f_s16 =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
   return horizontal_filter_8x1_f1_beta0(in, f_s16);
 }
 
@@ -164,8 +162,7 @@ static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
   int16x4_t s6 = vget_low_s16(src[6]);
   int16x4_t s7 = vget_low_s16(src[7]);
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
@@ -215,8 +212,7 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
   int16x8_t s6 = src[6];
   int16x8_t s7 = src[7];
 
-  int16x8_t f =
-      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);
 
   int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
   m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);

third_party/aom/av1/common/av1_rtcd_defs.pl | 22 (vendored)
@@ -495,22 +495,22 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   # structs as arguments, which makes the v256 type of the intrinsics
   # hard to support, so optimizations for this target are disabled.
   if ($opts{config} !~ /libs-x86-win32-vs.*/) {
-    specialize qw/cdef_find_dir sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_find_dir sse4_1 avx2 neon rvv/, "$ssse3_x86";
     specialize qw/cdef_find_dir_dual sse4_1 avx2 neon/, "$ssse3_x86";
 
-    specialize qw/cdef_filter_8_0 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_8_1 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_8_2 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_8_3 sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_0 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_1 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_2 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_8_3 sse4_1 avx2 neon rvv/, "$ssse3_x86";
 
-    specialize qw/cdef_filter_16_0 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_1 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_2 sse4_1 avx2 neon/, "$ssse3_x86";
-    specialize qw/cdef_filter_16_3 sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_0 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_1 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_2 sse4_1 avx2 neon rvv/, "$ssse3_x86";
+    specialize qw/cdef_filter_16_3 sse4_1 avx2 neon rvv/, "$ssse3_x86";
 
-    specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86";
+    specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86";
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86";
+      specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon rvv/, "$ssse3_x86";
     }
   }
 

third_party/aom/av1/common/riscv/cdef_block_rvv.c | 1354 (new file, vendored)
(File diff suppressed because it is too large)

third_party/aom/av1/common/warped_motion.c | 11 (vendored)
@@ -27,7 +27,8 @@
 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
 // We need an extra 2 taps to fit this in, for a total of 8 taps.
 /* clang-format off */
-const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
+const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1]
+                                         [8] = {
   // [-1, 0)
   { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
   { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
@@ -344,7 +345,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
         const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                          WARPEDPIXEL_PREC_SHIFTS;
         assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-        const int16_t *coeffs = av1_warped_filter[offs];
+        const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
 
         int32_t sum = 1 << offset_bits_horiz;
         for (int m = 0; m < 8; ++m) {
@@ -365,7 +366,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
         const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                          WARPEDPIXEL_PREC_SHIFTS;
         assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-        const int16_t *coeffs = av1_warped_filter[offs];
+        const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
 
         int32_t sum = 1 << offset_bits_vert;
         for (int m = 0; m < 8; ++m) {
@@ -575,7 +576,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                          WARPEDPIXEL_PREC_SHIFTS;
         assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-        const int16_t *coeffs = av1_warped_filter[offs];
+        const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
 
         int32_t sum = 1 << offset_bits_horiz;
         for (int m = 0; m < 8; ++m) {
@@ -599,7 +600,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                          WARPEDPIXEL_PREC_SHIFTS;
         assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-        const int16_t *coeffs = av1_warped_filter[offs];
+        const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
 
         int32_t sum = 1 << offset_bits_vert;
         for (int m = 0; m < 8; ++m) {

third_party/aom/av1/common/warped_motion.h | 9 (vendored)
@@ -33,7 +33,14 @@
 #define WARP_ERROR_BLOCK_LOG 5
 #define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)
 
-extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+#if AOM_ARCH_ARM || AOM_ARCH_AARCH64 || AOM_ARCH_X86 || AOM_ARCH_X86_64
+typedef int16_t WarpedFilterCoeff;
+#else
+typedef int8_t WarpedFilterCoeff;
+#endif
+
+extern const WarpedFilterCoeff
+    av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
 
 DECLARE_ALIGNED(8, extern const int8_t,
                 av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);

third_party/aom/av1/encoder/ratectrl.c | 4 (vendored)
@@ -3822,6 +3822,10 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type,
     resize_reset_rc(cpi, resize_pending_params->width,
                     resize_pending_params->height, cm->width, cm->height);
   }
+  if (svc->temporal_layer_id == 0) {
+    rc->num_col_blscroll_last_tl0 = 0;
+    rc->num_row_blscroll_last_tl0 = 0;
+  }
   // Set the GF interval and update flag.
   if (!rc->rtc_external_ratectrl)
     set_gf_interval_update_onepass_rt(cpi, *frame_type);

third_party/aom/av1/encoder/ratectrl.h | 2 (vendored)
@@ -200,6 +200,8 @@ typedef struct {
   int last_target_size_keyframe;
   int frames_since_scene_change;
   int perc_spatial_flat_blocks;
+  int num_col_blscroll_last_tl0;
+  int num_row_blscroll_last_tl0;
 
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame

third_party/aom/av1/encoder/var_based_part.c | 88 (vendored)
@@ -1325,6 +1325,53 @@ static inline void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x,
   }
 }
 
+static void do_int_pro_motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
+                                         unsigned int *y_sad, int mi_row,
+                                         int mi_col, int source_sad_nonrd) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mi = xd->mi[0];
+  const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+  const int increase_col_sw = source_sad_nonrd > kMedSad &&
+                              !cpi->rc.high_motion_content_screen_rtc &&
+                              (cpi->svc.temporal_layer_id == 0 ||
+                               cpi->rc.num_col_blscroll_last_tl0 > 2);
+  int me_search_size_col = is_screen
+                               ? increase_col_sw ? 512 : 96
+                               : block_size_wide[cm->seq_params->sb_size] >> 1;
+  // For screen use larger search size row motion to capture
+  // vertical scroll, which can be larger motion.
+  int me_search_size_row = is_screen
+                               ? source_sad_nonrd > kMedSad ? 512 : 192
+                               : block_size_high[cm->seq_params->sb_size] >> 1;
+  unsigned int y_sad_zero;
+  *y_sad = av1_int_pro_motion_estimation(
+      cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv, &y_sad_zero,
+      me_search_size_col, me_search_size_row);
+  // The logic below selects whether the motion estimated in the
+  // int_pro_motion() will be used in nonrd_pickmode. Only do this
+  // for screen for now.
+  if (is_screen) {
+    unsigned int thresh_sad =
+        (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
+    if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
+      x->sb_me_partition = 1;
+      x->sb_me_mv.as_int = mi->mv[0].as_int;
+      if (cpi->svc.temporal_layer_id == 0) {
+        if (abs(mi->mv[0].as_mv.col) > 16 && abs(mi->mv[0].as_mv.row) == 0)
+          cpi->rc.num_col_blscroll_last_tl0++;
+        else if (abs(mi->mv[0].as_mv.row) > 16 && abs(mi->mv[0].as_mv.col) == 0)
+          cpi->rc.num_row_blscroll_last_tl0++;
+      }
+    } else {
+      x->sb_me_partition = 0;
+      // Fall back to using zero motion.
+      *y_sad = y_sad_zero;
+      mi->mv[0].as_int = 0;
+    }
+  }
+}
+
 static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
                          unsigned int *y_sad_g, unsigned int *y_sad_alt,
                          unsigned int *y_sad_last,
@@ -1418,42 +1465,11 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
     // so for now force it to 2 based on superblock sad.
     if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2;
 
-    if (est_motion == 1 || est_motion == 2) {
-      if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
-        // For screen only do int_pro_motion for spatial variance above
-        // threshold and motion level above LowSad.
-        if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
-          int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
-          int me_search_size_col =
-              is_screen ? source_sad_nonrd > kMedSad ? 160 : 96
-                        : block_size_wide[cm->seq_params->sb_size] >> 1;
-          // For screen use larger search size row motion to capture
-          // vertical scroll, which can be larger motion.
-          int me_search_size_row =
-              is_screen ? source_sad_nonrd > kMedSad ? 512 : 192
-                        : block_size_high[cm->seq_params->sb_size] >> 1;
-          unsigned int y_sad_zero;
-          *y_sad = av1_int_pro_motion_estimation(
-              cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
-              &y_sad_zero, me_search_size_col, me_search_size_row);
-          // The logic below selects whether the motion estimated in the
-          // int_pro_motion() will be used in nonrd_pickmode. Only do this
-          // for screen for now.
-          if (is_screen) {
-            unsigned int thresh_sad =
-                (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
-            if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
-              x->sb_me_partition = 1;
-              x->sb_me_mv.as_int = mi->mv[0].as_int;
-            } else {
-              x->sb_me_partition = 0;
-              // Fall back to using zero motion.
-              *y_sad = y_sad_zero;
-              mi->mv[0].as_int = 0;
-            }
-          }
-        }
-      }
+    if ((est_motion == 1 || est_motion == 2) && xd->mb_to_right_edge >= 0 &&
+        xd->mb_to_bottom_edge >= 0 && x->source_variance > 100 &&
+        source_sad_nonrd > kLowSad) {
+      do_int_pro_motion_estimation(cpi, x, y_sad, mi_row, mi_col,
+                                   source_sad_nonrd);
     }
 
   if (*y_sad == UINT_MAX) {

@@ -26,6 +26,7 @@ set_aom_detect_var(AOM_ARCH_ARM 0 "Enables ARM architecture.")
 set_aom_detect_var(AOM_ARCH_PPC 0 "Enables PPC architecture.")
 set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.")
 set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.")
+set_aom_detect_var(AOM_ARCH_RISCV 0 "Enables RISC-V architecture.")
 
 # Arm/AArch64 feature flags.
 set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.")
@@ -51,6 +52,9 @@ set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.")
 set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.")
 set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.")
 
+# RISC-V64 feature flags.
+set_aom_detect_var(HAVE_RVV 0 "Enables RVV optimizations.")
+
 # Flags describing the build environment.
 set_aom_detect_var(HAVE_FEXCEPT 0
                    "Internal flag, GNU fenv.h present for target.")
@@ -241,3 +245,6 @@ set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets."
                    ON)
 set_aom_option_var(ENABLE_AVX2
                    "Enables AVX2 optimizations on x86/x86_64 targets." ON)
+
+# RVV intrinsics flags.
+set_aom_option_var(ENABLE_RVV "Enables RVV optimizations on RISC-V targets." ON)

@@ -75,6 +75,8 @@ if(NOT AOM_TARGET_CPU)
     set(AOM_TARGET_CPU "arm64")
   elseif(cpu_lowercase MATCHES "^ppc")
     set(AOM_TARGET_CPU "ppc")
+  elseif(cpu_lowercase MATCHES "^riscv")
+    set(AOM_TARGET_CPU "riscv")
   else()
     message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
                     "supported, falling back to the generic target")

third_party/aom/build/cmake/cpu.cmake | 11 (vendored)
@@ -132,4 +132,15 @@ elseif("${AOM_TARGET_CPU}" MATCHES "^x86")
       set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
     endif()
   endforeach()
+elseif("${AOM_TARGET_CPU}" MATCHES "riscv")
+  set(AOM_ARCH_RISCV64 1)
+  set(RTCD_ARCH_RISCV64 "yes")
+
+  if(ENABLE_RVV)
+    set(HAVE_RVV 1)
+    set(RTCD_HAVE_RVV "yes")
+  else()
+    set(HAVE_RVV 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-rvv)
+  endif()
 endif()

third_party/aom/build/cmake/rtcd.pl | 33 (vendored)
@@ -370,6 +370,36 @@ EOF
   common_bottom;
 }
 
+sub riscv() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#ifdef RTCD_C
+#include "aom_ports/riscv.h"
+static void setup_rtcd_internal(void)
+{
+  int flags = riscv_simd_caps();
+
+  (void)flags;
+
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
 sub unoptimized() {
   determine_indirection "c";
   common_top;
@@ -415,6 +445,9 @@ if ($opts{arch} eq 'x86') {
 } elsif ($opts{arch} eq 'ppc') {
   @ALL_ARCHS = filter(qw/vsx/);
   ppc;
+} elsif ($opts{arch} eq 'riscv') {
+  @ALL_ARCHS = filter(qw/rvv/);
+  riscv;
 } else {
   unoptimized;
 }
63
third_party/aom/test/cdef_test.cc
vendored
63
third_party/aom/test/cdef_test.cc
vendored
@@ -618,7 +618,8 @@ TEST_P(CDEFCopyRect16to16Test, TestSIMDNoMismatch) {
 
 using std::make_tuple;
 
-#if ((AOM_ARCH_X86 && HAVE_SSSE3) || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON)
+#if ((AOM_ARCH_X86 && HAVE_SSSE3) || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON || \
+     HAVE_RVV)
 static const CdefFilterBlockFunctions kCdefFilterFuncC[] = {
   { &cdef_filter_8_0_c, &cdef_filter_8_1_c, &cdef_filter_8_2_c,
     &cdef_filter_8_3_c }
@@ -811,6 +812,46 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif
 
+#if HAVE_RVV
+static const CdefFilterBlockFunctions kCdefFilterFuncRvv[] = {
+  { &cdef_filter_8_0_rvv, &cdef_filter_8_1_rvv, &cdef_filter_8_2_rvv,
+    &cdef_filter_8_3_rvv }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncRvv[] = {
+  { &cdef_filter_16_0_rvv, &cdef_filter_16_1_rvv, &cdef_filter_16_2_rvv,
+    &cdef_filter_16_3_rvv }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
+INSTANTIATE_TEST_SUITE_P(RVV, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_rvv,
+                                                      &cdef_find_dir_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c,
+                                 &cdef_copy_rect8_8bit_to_16bit_rvv)));
+
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c,
+                                 &cdef_copy_rect8_16bit_to_16bit_rvv)));
+#endif
+
 // Test speed for all supported architectures
 #if AOM_ARCH_X86 && HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(
@@ -905,4 +946,24 @@ INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirDualSpeedTest,
                                                       &cdef_find_dir_dual_c)));
 #endif
 
+#if HAVE_RVV
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFSpeedTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    RVV, CDEFSpeedHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncRvv),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(10)));
+INSTANTIATE_TEST_SUITE_P(RVV, CDEFFindDirSpeedTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_rvv,
+                                                      &cdef_find_dir_c)));
+#endif
+
 }  // namespace

third_party/aom/test/svc_datarate_test.cc (vendored): 41 changed lines
@@ -1078,6 +1078,39 @@ class DatarateTestSVC
 #endif
   }
 
+  virtual void BasicRateTargetingSVC2TL1SLScreenDropFrame1920x1080Test() {
+    cfg_.rc_buf_initial_sz = 50;
+    cfg_.rc_buf_optimal_sz = 50;
+    cfg_.rc_buf_sz = 100;
+    cfg_.rc_dropframe_thresh = 30;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 52;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::Y4mVideoSource video("screendata.1920_1080.y4m", 0, 60);
+
+    const int bitrate_array[2] = { 60, 100 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    screen_mode_ = 1;
+    number_temporal_layers_ = 2;
+    number_spatial_layers_ = 1;
+    target_layer_bitrate_[0] = 60 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_AV1_DECODER
+    // Top temporal layers are non_reference, so exclude them from
+    // mismatch count, since loopfilter/cdef is not applied for these on
+    // encoder side, but is always applied on decoder.
+    // This means 150 = #frames(300) - #TL2_frames(150).
+    // We use LE for screen since loopfilter level can become very small
+    // or zero and then the frame is not a mismatch.
+    EXPECT_LE(GetMismatchFrames(), 150u);
+#endif
+  }
+
   virtual void BasicRateTargetingSVC1TL3SLScreenTest() {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 500;
@@ -2651,6 +2684,14 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLScreenDropFrame) {
   BasicRateTargetingSVC2TL1SLScreenDropFrameTest();
 }
 
+// Check basic rate targeting for CBR, for 2 temporal layers, 1 spatial
+// for screen mode, with frame dropper on at low bitrates. Use small
+// values of rc_buf_initial/optimal/sz to trigger postencode frame drop.
+// Use 1920x1080 clip.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLScreenDropFrame1920x1080) {
+  BasicRateTargetingSVC2TL1SLScreenDropFrame1920x1080Test();
+}
+
 // Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal
 // for screen mode.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLScreen) {
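The new test sets target_layer_bitrate_[0] to 60% of rc_target_bitrate and target_layer_bitrate_[1] to the full target, and it is parameterized over the two bitrates in bitrate_array (60 and 100). A small standalone check of that arithmetic follows; treating the values as kbps matches the rest of the SVC datarate tests, but the program itself is illustrative only and not part of the commit.

    #include <stdio.h>

    int main(void) {
      /* Same values as bitrate_array in the new test. */
      const int bitrate_array[2] = { 60, 100 };
      for (int i = 0; i < 2; ++i) {
        const int target = bitrate_array[i];
        const int tl0 = 60 * target / 100; /* target_layer_bitrate_[0] */
        const int tl1 = target;            /* target_layer_bitrate_[1] */
        printf("rc_target_bitrate=%d: layer 0 = %d, layer 1 = %d\n",
               target, tl0, tl1);
      }
      return 0;
    }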

third_party/aom/test/test-data.sha1 (vendored): 1 changed line
@@ -573,3 +573,4 @@ c7f336958e7af6162c20ddc84d67c7dfa9826910 *av1-1-b8-16-intra_only-intrabc-extreme
 4d1ad6d3070268ccb000d7fc3ae0f5a9447bfe82 *test_input_w1h1.yuv
 ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m
 9c2aa2d0f63f706f775bf661dfa81e8bb3089d8b *wikipedia_420_360p_60f.y4m
+9e4d2ba84ba62f7ea4b617a13af5db9c39e7f0f9 *screendata.1920_1080.y4m

third_party/aom/test/test_data_util.cmake (vendored): 1 changed line
@@ -35,6 +35,7 @@ list(APPEND AOM_TEST_DATA_FILE_NAMES
      "niklas_1280_720_30.y4m"
      "rush_hour_444.y4m"
      "screendata.y4m"
+     "screendata.1920_1080.y4m"
      "niklas_640_480_30.yuv"
      "vase10x10.yuv"
      "vase10x10_tiles.txt"
|
|||||||
Reference in New Issue
Block a user