Bug 1661093 - Update libdav1d to 0243c3ff for Firefox 82. r=mjf
Differential Revision: https://phabricator.services.mozilla.com/D92534
This commit is contained in:
@@ -186,7 +186,9 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
|
|||||||
'../../../third_party/dav1d/src/arm/32/itx.S',
|
'../../../third_party/dav1d/src/arm/32/itx.S',
|
||||||
'../../../third_party/dav1d/src/arm/32/loopfilter.S',
|
'../../../third_party/dav1d/src/arm/32/loopfilter.S',
|
||||||
'../../../third_party/dav1d/src/arm/32/looprestoration.S',
|
'../../../third_party/dav1d/src/arm/32/looprestoration.S',
|
||||||
|
'../../../third_party/dav1d/src/arm/32/looprestoration16.S',
|
||||||
'../../../third_party/dav1d/src/arm/32/mc.S',
|
'../../../third_party/dav1d/src/arm/32/mc.S',
|
||||||
|
'../../../third_party/dav1d/src/arm/32/mc16.S',
|
||||||
'../../../third_party/dav1d/src/arm/32/msac.S',
|
'../../../third_party/dav1d/src/arm/32/msac.S',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -20,11 +20,11 @@ origin:
|
|||||||
|
|
||||||
# Human-readable identifier for this version/release
|
# Human-readable identifier for this version/release
|
||||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||||
release: commit d0e50cacead63e9904dde184580ce9a746374bd5 (2020-08-21T15:13:49.000+02:00).
|
release: commit 0243c3ffb644e61848b82f24f5e4a7324669d76e (2020-09-27T15:38:45.000+02:00).
|
||||||
|
|
||||||
# Revision to pull in
|
# Revision to pull in
|
||||||
# Must be a long or short commit SHA (long preferred)
|
# Must be a long or short commit SHA (long preferred)
|
||||||
revision: d0e50cacead63e9904dde184580ce9a746374bd5
|
revision: 0243c3ffb644e61848b82f24f5e4a7324669d76e
|
||||||
|
|
||||||
# The package's license, where possible using the mnemonic from
|
# The package's license, where possible using the mnemonic from
|
||||||
# https://spdx.org/licenses/
|
# https://spdx.org/licenses/
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
/* auto-generated, do not edit */
|
/* auto-generated, do not edit */
|
||||||
#define DAV1D_VERSION "0.7.1-49-gd0e50ca"
|
#define DAV1D_VERSION "0.7.1-81-g0243c3f"
|
||||||
|
|||||||
@@ -27,8 +27,8 @@
|
|||||||
#ifndef DAV1D_VERSION_H
|
#ifndef DAV1D_VERSION_H
|
||||||
#define DAV1D_VERSION_H
|
#define DAV1D_VERSION_H
|
||||||
|
|
||||||
#define DAV1D_API_VERSION_MAJOR 4
|
#define DAV1D_API_VERSION_MAJOR 5
|
||||||
#define DAV1D_API_VERSION_MINOR 0
|
#define DAV1D_API_VERSION_MINOR 0
|
||||||
#define DAV1D_API_VERSION_PATCH 2
|
#define DAV1D_API_VERSION_PATCH 0
|
||||||
|
|
||||||
#endif /* DAV1D_VERSION_H */
|
#endif /* DAV1D_VERSION_H */
|
||||||
|
|||||||
2
third_party/dav1d/CONTRIBUTING.md
vendored
2
third_party/dav1d/CONTRIBUTING.md
vendored
@@ -12,7 +12,7 @@ The todo list can be found [on the wiki](https://code.videolan.org/videolan/dav1
|
|||||||
The codebase is developed with the following assumptions:
|
The codebase is developed with the following assumptions:
|
||||||
|
|
||||||
For the library:
|
For the library:
|
||||||
- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extension,
|
- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extensions. Anonymous structures and unions are the only allowed compiler extensions for internal code.
|
||||||
- x86 asm in .asm files, using the NASM syntax,
|
- x86 asm in .asm files, using the NASM syntax,
|
||||||
- arm/arm64 in .S files, using the GAS syntax limited to the subset that llvm 5.0's internal assembler supports,
|
- arm/arm64 in .S files, using the GAS syntax limited to the subset that llvm 5.0's internal assembler supports,
|
||||||
- no C++ is allowed, whatever the version.
|
- no C++ is allowed, whatever the version.
|
||||||
|
|||||||
2
third_party/dav1d/include/dav1d/dav1d.h
vendored
2
third_party/dav1d/include/dav1d/dav1d.h
vendored
@@ -65,9 +65,9 @@ typedef struct Dav1dSettings {
|
|||||||
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
|
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
|
||||||
int all_layers; ///< output all spatial layers of a scalable AV1 bitstream
|
int all_layers; ///< output all spatial layers of a scalable AV1 bitstream
|
||||||
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
|
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
|
||||||
uint8_t reserved[32]; ///< reserved for future use
|
|
||||||
Dav1dPicAllocator allocator; ///< Picture allocator callback.
|
Dav1dPicAllocator allocator; ///< Picture allocator callback.
|
||||||
Dav1dLogger logger; ///< Logger callback.
|
Dav1dLogger logger; ///< Logger callback.
|
||||||
|
uint8_t reserved[32]; ///< reserved for future use
|
||||||
} Dav1dSettings;
|
} Dav1dSettings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
5
third_party/dav1d/include/dav1d/headers.h
vendored
5
third_party/dav1d/include/dav1d/headers.h
vendored
@@ -28,6 +28,7 @@
|
|||||||
#ifndef DAV1D_HEADERS_H
|
#ifndef DAV1D_HEADERS_H
|
||||||
#define DAV1D_HEADERS_H
|
#define DAV1D_HEADERS_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
|
||||||
// Constants from Section 3. "Symbols and abbreviated terms"
|
// Constants from Section 3. "Symbols and abbreviated terms"
|
||||||
@@ -95,9 +96,9 @@ typedef struct Dav1dWarpedMotionParams {
|
|||||||
union {
|
union {
|
||||||
struct {
|
struct {
|
||||||
int16_t alpha, beta, gamma, delta;
|
int16_t alpha, beta, gamma, delta;
|
||||||
};
|
} p;
|
||||||
int16_t abcd[4];
|
int16_t abcd[4];
|
||||||
};
|
} u;
|
||||||
} Dav1dWarpedMotionParams;
|
} Dav1dWarpedMotionParams;
|
||||||
|
|
||||||
enum Dav1dPixelLayout {
|
enum Dav1dPixelLayout {
|
||||||
|
|||||||
14
third_party/dav1d/include/dav1d/meson.build
vendored
14
third_party/dav1d/include/dav1d/meson.build
vendored
@@ -31,11 +31,15 @@ version_h_target = configure_file(input: 'version.h.in',
|
|||||||
output: 'version.h',
|
output: 'version.h',
|
||||||
configuration: version_h_data)
|
configuration: version_h_data)
|
||||||
|
|
||||||
|
dav1d_api_headers = [
|
||||||
|
'common.h',
|
||||||
|
'data.h',
|
||||||
|
'dav1d.h',
|
||||||
|
'headers.h',
|
||||||
|
'picture.h',
|
||||||
|
]
|
||||||
|
|
||||||
# install headers
|
# install headers
|
||||||
install_headers('common.h',
|
install_headers(dav1d_api_headers,
|
||||||
'data.h',
|
|
||||||
'dav1d.h',
|
|
||||||
'headers.h',
|
|
||||||
'picture.h',
|
|
||||||
version_h_target,
|
version_h_target,
|
||||||
subdir : 'dav1d')
|
subdir : 'dav1d')
|
||||||
|
|||||||
39
third_party/dav1d/meson.build
vendored
39
third_party/dav1d/meson.build
vendored
@@ -28,9 +28,9 @@ project('dav1d', ['c'],
|
|||||||
'warning_level=2',
|
'warning_level=2',
|
||||||
'buildtype=release',
|
'buildtype=release',
|
||||||
'b_ndebug=if-release'],
|
'b_ndebug=if-release'],
|
||||||
meson_version: '>= 0.47.0')
|
meson_version: '>= 0.49.0')
|
||||||
|
|
||||||
dav1d_soname_version = '4.0.2'
|
dav1d_soname_version = '5.0.0'
|
||||||
dav1d_api_version_array = dav1d_soname_version.split('.')
|
dav1d_api_version_array = dav1d_soname_version.split('.')
|
||||||
dav1d_api_version_major = dav1d_api_version_array[0]
|
dav1d_api_version_major = dav1d_api_version_array[0]
|
||||||
dav1d_api_version_minor = dav1d_api_version_array[1]
|
dav1d_api_version_minor = dav1d_api_version_array[1]
|
||||||
@@ -118,6 +118,17 @@ if host_machine.system() == 'windows'
|
|||||||
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
|
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
|
||||||
|
|
||||||
rt_dependency = []
|
rt_dependency = []
|
||||||
|
|
||||||
|
rc_version_array = meson.project_version().split('.')
|
||||||
|
winmod = import('windows')
|
||||||
|
rc_data = configuration_data()
|
||||||
|
rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
|
||||||
|
rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
|
||||||
|
rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
|
||||||
|
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
|
||||||
|
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
|
||||||
|
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
|
||||||
|
rc_data.set('COPYRIGHT_YEARS', '2020')
|
||||||
else
|
else
|
||||||
thread_dependency = dependency('threads')
|
thread_dependency = dependency('threads')
|
||||||
thread_compat_dep = []
|
thread_compat_dep = []
|
||||||
@@ -227,7 +238,7 @@ endif
|
|||||||
# Compiler flags that should be set
|
# Compiler flags that should be set
|
||||||
# But when the compiler does not support them
|
# But when the compiler does not support them
|
||||||
# it is not an error and silently tolerated
|
# it is not an error and silently tolerated
|
||||||
if cc.get_id() != 'msvc'
|
if cc.get_argument_syntax() != 'msvc'
|
||||||
optional_arguments += [
|
optional_arguments += [
|
||||||
'-Wundef',
|
'-Wundef',
|
||||||
'-Werror=vla',
|
'-Werror=vla',
|
||||||
@@ -426,6 +437,28 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
|
|||||||
])
|
])
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
use_gaspp = false
|
||||||
|
if (is_asm_enabled and
|
||||||
|
(host_machine.cpu_family() == 'aarch64' or
|
||||||
|
host_machine.cpu_family().startswith('arm')) and
|
||||||
|
cc.get_argument_syntax() == 'msvc')
|
||||||
|
gaspp = find_program('gas-preprocessor.pl')
|
||||||
|
use_gaspp = true
|
||||||
|
gaspp_gen = generator(gaspp,
|
||||||
|
output: '@BASENAME@.obj',
|
||||||
|
arguments: [
|
||||||
|
'-as-type', 'armasm',
|
||||||
|
'-arch', host_machine.cpu_family(),
|
||||||
|
'--',
|
||||||
|
host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm',
|
||||||
|
'-nologo',
|
||||||
|
'-I@0@'.format(dav1d_src_root),
|
||||||
|
'-I@0@/'.format(meson.current_build_dir()),
|
||||||
|
'@INPUT@',
|
||||||
|
'-c',
|
||||||
|
'-o', '@OUTPUT@'
|
||||||
|
])
|
||||||
|
endif
|
||||||
|
|
||||||
# Generate config.h
|
# Generate config.h
|
||||||
config_h_target = configure_file(output: 'config.h', configuration: cdata)
|
config_h_target = configure_file(output: 'config.h', configuration: cdata)
|
||||||
|
|||||||
143
third_party/dav1d/src/arm/32/looprestoration.S
vendored
143
third_party/dav1d/src/arm/32/looprestoration.S
vendored
@@ -40,8 +40,8 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||||||
mov r8, r5
|
mov r8, r5
|
||||||
vld1.16 {q0}, [r4]
|
vld1.16 {q0}, [r4]
|
||||||
movw r9, #(1 << 14) - (1 << 2)
|
movw r9, #(1 << 14) - (1 << 2)
|
||||||
vdup.16 q14, r9
|
vdup.16 q14, r9
|
||||||
vmov.s16 q15, #2048
|
vmov.s16 q15, #2048
|
||||||
// Calculate mid_stride
|
// Calculate mid_stride
|
||||||
add r10, r5, #7
|
add r10, r5, #7
|
||||||
bic r10, r10, #7
|
bic r10, r10, #7
|
||||||
@@ -108,8 +108,8 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||||||
0:
|
0:
|
||||||
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
|
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
|
||||||
// and shift q2 to have 3x the first byte at the front.
|
// and shift q2 to have 3x the first byte at the front.
|
||||||
vdup.8 q1, d4[0]
|
vdup.8 q1, d4[0]
|
||||||
vdup.8 q8, d18[0]
|
vdup.8 q8, d18[0]
|
||||||
// Move r2 back to account for the last 3 bytes we loaded before,
|
// Move r2 back to account for the last 3 bytes we loaded before,
|
||||||
// which we shifted out.
|
// which we shifted out.
|
||||||
sub r2, r2, #3
|
sub r2, r2, #3
|
||||||
@@ -127,7 +127,7 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||||||
bne 4f
|
bne 4f
|
||||||
// If we'll need to pad the right edge, load that byte to pad with
|
// If we'll need to pad the right edge, load that byte to pad with
|
||||||
// here since we can find it pretty easily from here.
|
// here since we can find it pretty easily from here.
|
||||||
sub r9, r5, #14
|
sub r9, r5, #14
|
||||||
ldrb r11, [r2, r9]
|
ldrb r11, [r2, r9]
|
||||||
ldrb r9, [lr, r9]
|
ldrb r9, [lr, r9]
|
||||||
// Fill q12/q13 with the right padding pixel
|
// Fill q12/q13 with the right padding pixel
|
||||||
@@ -144,7 +144,6 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||||||
b 6f
|
b 6f
|
||||||
|
|
||||||
4: // Loop horizontally
|
4: // Loop horizontally
|
||||||
.macro filter_8
|
|
||||||
// This is tuned as some sort of compromise between Cortex A7, A8,
|
// This is tuned as some sort of compromise between Cortex A7, A8,
|
||||||
// A9 and A53.
|
// A9 and A53.
|
||||||
vmul.s16 q3, q1, d0[0]
|
vmul.s16 q3, q1, d0[0]
|
||||||
@@ -187,8 +186,6 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||||||
vshr.s16 q10, q10, #3
|
vshr.s16 q10, q10, #3
|
||||||
vadd.s16 q3, q3, q15
|
vadd.s16 q3, q3, q15
|
||||||
vadd.s16 q10, q10, q15
|
vadd.s16 q10, q10, q15
|
||||||
.endm
|
|
||||||
filter_8
|
|
||||||
vst1.16 {q3}, [r0, :128]!
|
vst1.16 {q3}, [r0, :128]!
|
||||||
vst1.16 {q10}, [r12, :128]!
|
vst1.16 {q10}, [r12, :128]!
|
||||||
|
|
||||||
@@ -206,50 +203,43 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||||||
|
|
||||||
5: // Filter 4 pixels, 7 <= w < 11
|
5: // Filter 4 pixels, 7 <= w < 11
|
||||||
.macro filter_4
|
.macro filter_4
|
||||||
|
vext.8 d20, d2, d3, #2
|
||||||
|
vext.8 d21, d2, d3, #4
|
||||||
|
vext.8 d22, d2, d3, #6
|
||||||
|
vext.8 d23, d3, d4, #2
|
||||||
|
vext.8 d8, d3, d4, #4
|
||||||
vmul.s16 d6, d2, d0[0]
|
vmul.s16 d6, d2, d0[0]
|
||||||
vext.8 q10, q1, q2, #2
|
|
||||||
vext.8 q11, q1, q2, #4
|
|
||||||
vmla.s16 d6, d20, d0[1]
|
vmla.s16 d6, d20, d0[1]
|
||||||
vmla.s16 d6, d22, d0[2]
|
vmla.s16 d6, d21, d0[2]
|
||||||
vext.8 q10, q1, q2, #6
|
vmla.s16 d6, d22, d0[3]
|
||||||
vext.8 q11, q1, q2, #8
|
vmla.s16 d6, d3, d1[0]
|
||||||
vmla.s16 d6, d20, d0[3]
|
vmla.s16 d6, d23, d1[1]
|
||||||
vmla.s16 d6, d22, d1[0]
|
vmla.s16 d6, d8, d1[2]
|
||||||
vext.8 q10, q1, q2, #10
|
|
||||||
vext.8 q11, q1, q2, #12
|
|
||||||
vmla.s16 d6, d20, d1[1]
|
|
||||||
vmla.s16 d6, d22, d1[2]
|
|
||||||
|
|
||||||
vmul.s16 d20, d16, d0[0]
|
vext.8 d20, d16, d17, #2
|
||||||
vext.8 q11, q8, q9, #2
|
vext.8 d21, d16, d17, #4
|
||||||
vext.8 q4, q8, q9, #4
|
vext.8 d22, d16, d17, #6
|
||||||
vmla.s16 d20, d22, d0[1]
|
vext.8 d23, d17, d18, #2
|
||||||
vmla.s16 d20, d8, d0[2]
|
vext.8 d8, d17, d18, #4
|
||||||
vext.8 q11, q8, q9, #6
|
vmul.s16 d7, d16, d0[0]
|
||||||
vext.8 q4, q8, q9, #8
|
vmla.s16 d7, d20, d0[1]
|
||||||
vmla.s16 d20, d22, d0[3]
|
vmla.s16 d7, d21, d0[2]
|
||||||
vmla.s16 d20, d8, d1[0]
|
vmla.s16 d7, d22, d0[3]
|
||||||
vext.8 q11, q8, q9, #10
|
vmla.s16 d7, d17, d1[0]
|
||||||
vext.8 q4, q8, q9, #12
|
vmla.s16 d7, d23, d1[1]
|
||||||
vmla.s16 d20, d22, d1[1]
|
vmla.s16 d7, d8, d1[2]
|
||||||
vmla.s16 d20, d8, d1[2]
|
|
||||||
|
|
||||||
vext.8 q11, q1, q2, #6
|
vext.8 d22, d2, d3, #6
|
||||||
vshl.s16 d22, d22, #7
|
vext.8 d23, d16, d17, #6
|
||||||
vsub.s16 d22, d22, d28
|
vshl.s16 q11, q11, #7
|
||||||
vqadd.s16 d6, d6, d22
|
vsub.s16 q11, q11, q14
|
||||||
vext.8 q11, q8, q9, #6
|
vqadd.s16 q3, q3, q11
|
||||||
vshl.s16 d22, d22, #7
|
vshr.s16 q3, q3, #3
|
||||||
vsub.s16 d22, d22, d28
|
vadd.s16 q3, q3, q15
|
||||||
vqadd.s16 d20, d20, d22
|
|
||||||
vshr.s16 d6, d6, #3
|
|
||||||
vshr.s16 d20, d20, #3
|
|
||||||
vadd.s16 d6, d6, d30
|
|
||||||
vadd.s16 d20, d20, d30
|
|
||||||
.endm
|
.endm
|
||||||
filter_4
|
filter_4
|
||||||
vst1.16 {d6}, [r0, :64]!
|
vst1.16 {d6}, [r0, :64]!
|
||||||
vst1.16 {d20}, [r12, :64]!
|
vst1.16 {d7}, [r12, :64]!
|
||||||
|
|
||||||
subs r5, r5, #4 // 3 <= w < 7
|
subs r5, r5, #4 // 3 <= w < 7
|
||||||
vext.8 q1, q1, q2, #8
|
vext.8 q1, q1, q2, #8
|
||||||
@@ -323,7 +313,7 @@ L(variable_shift_tbl):
|
|||||||
// w >= 4, filter 4 pixels
|
// w >= 4, filter 4 pixels
|
||||||
filter_4
|
filter_4
|
||||||
vst1.16 {d6}, [r0, :64]!
|
vst1.16 {d6}, [r0, :64]!
|
||||||
vst1.16 {d20}, [r12, :64]!
|
vst1.16 {d7}, [r12, :64]!
|
||||||
subs r5, r5, #4 // 0 <= w < 4
|
subs r5, r5, #4 // 0 <= w < 4
|
||||||
vext.8 q1, q1, q2, #8
|
vext.8 q1, q1, q2, #8
|
||||||
vext.8 q8, q8, q9, #8
|
vext.8 q8, q8, q9, #8
|
||||||
@@ -338,11 +328,11 @@ L(variable_shift_tbl):
|
|||||||
vdup.16 d25, d16[3]
|
vdup.16 d25, d16[3]
|
||||||
vpadd.s16 d6, d6, d6
|
vpadd.s16 d6, d6, d6
|
||||||
vtrn.16 d24, d25
|
vtrn.16 d24, d25
|
||||||
vshl.s16 d24, d24, #7
|
vshl.s16 d24, d24, #7
|
||||||
vsub.s16 d24, d24, d28
|
vsub.s16 d24, d24, d28
|
||||||
vqadd.s16 d6, d6, d24
|
vqadd.s16 d6, d6, d24
|
||||||
vshr.s16 d6, d6, #3
|
vshr.s16 d6, d6, #3
|
||||||
vadd.s16 d6, d6, d30
|
vadd.s16 d6, d6, d30
|
||||||
vst1.s16 {d6[0]}, [r0, :16]!
|
vst1.s16 {d6[0]}, [r0, :16]!
|
||||||
vst1.s16 {d6[1]}, [r12, :16]!
|
vst1.s16 {d6[1]}, [r12, :16]!
|
||||||
subs r5, r5, #1
|
subs r5, r5, #1
|
||||||
@@ -363,7 +353,6 @@ L(variable_shift_tbl):
|
|||||||
0:
|
0:
|
||||||
vpop {q4}
|
vpop {q4}
|
||||||
pop {r4-r11,pc}
|
pop {r4-r11,pc}
|
||||||
.purgem filter_8
|
|
||||||
.purgem filter_4
|
.purgem filter_4
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
@@ -422,22 +411,22 @@ function wiener_filter_v_8bpc_neon, export=1
|
|||||||
// Interleaving the mul/mla chains actually hurts performance
|
// Interleaving the mul/mla chains actually hurts performance
|
||||||
// significantly on Cortex A53, thus keeping mul/mla tightly
|
// significantly on Cortex A53, thus keeping mul/mla tightly
|
||||||
// chained like this.
|
// chained like this.
|
||||||
vmull.s16 q2, d16, d0[0]
|
vmull.s16 q2, d16, d0[0]
|
||||||
vmlal.s16 q2, d18, d0[1]
|
vmlal.s16 q2, d18, d0[1]
|
||||||
vmlal.s16 q2, d20, d0[2]
|
vmlal.s16 q2, d20, d0[2]
|
||||||
vmlal.s16 q2, d22, d0[3]
|
vmlal.s16 q2, d22, d0[3]
|
||||||
vmlal.s16 q2, d24, d1[0]
|
vmlal.s16 q2, d24, d1[0]
|
||||||
vmlal.s16 q2, d26, d1[1]
|
vmlal.s16 q2, d26, d1[1]
|
||||||
vmlal.s16 q2, d28, d1[2]
|
vmlal.s16 q2, d28, d1[2]
|
||||||
vmull.s16 q3, d17, d0[0]
|
vmull.s16 q3, d17, d0[0]
|
||||||
vmlal.s16 q3, d19, d0[1]
|
vmlal.s16 q3, d19, d0[1]
|
||||||
vmlal.s16 q3, d21, d0[2]
|
vmlal.s16 q3, d21, d0[2]
|
||||||
vmlal.s16 q3, d23, d0[3]
|
vmlal.s16 q3, d23, d0[3]
|
||||||
vmlal.s16 q3, d25, d1[0]
|
vmlal.s16 q3, d25, d1[0]
|
||||||
vmlal.s16 q3, d27, d1[1]
|
vmlal.s16 q3, d27, d1[1]
|
||||||
vmlal.s16 q3, d29, d1[2]
|
vmlal.s16 q3, d29, d1[2]
|
||||||
vqrshrun.s32 d4, q2, #11
|
vqrshrun.s32 d4, q2, #11
|
||||||
vqrshrun.s32 d5, q3, #11
|
vqrshrun.s32 d5, q3, #11
|
||||||
vqmovun.s16 d4, q2
|
vqmovun.s16 d4, q2
|
||||||
vst1.8 {d4}, [r0], r1
|
vst1.8 {d4}, [r0], r1
|
||||||
.if \compare
|
.if \compare
|
||||||
@@ -473,7 +462,7 @@ function wiener_filter_v_8bpc_neon, export=1
|
|||||||
52: // 2 rows in total, q11 already loaded, load q12 with content data
|
52: // 2 rows in total, q11 already loaded, load q12 with content data
|
||||||
// and 2 rows of edge.
|
// and 2 rows of edge.
|
||||||
vld1.16 {q14}, [r2, :128], r7
|
vld1.16 {q14}, [r2, :128], r7
|
||||||
vmov q15, q14
|
vmov q15, q14
|
||||||
b 8f
|
b 8f
|
||||||
53:
|
53:
|
||||||
// 3 rows in total, q11 already loaded, load q12 and q13 with content
|
// 3 rows in total, q11 already loaded, load q12 and q13 with content
|
||||||
@@ -615,8 +604,8 @@ L(copy_narrow_tbl):
|
|||||||
asr r1, r1, #1
|
asr r1, r1, #1
|
||||||
22:
|
22:
|
||||||
subs r4, r4, #1
|
subs r4, r4, #1
|
||||||
vld1.16 {d0[]}, [r2]!
|
vld1.16 {d0[]}, [r2, :16]!
|
||||||
vst1.16 {d0[0]}, [r0], r1
|
vst1.16 {d0[0]}, [r0, :16], r1
|
||||||
bgt 22b
|
bgt 22b
|
||||||
0:
|
0:
|
||||||
pop {r4,pc}
|
pop {r4,pc}
|
||||||
@@ -644,8 +633,8 @@ L(copy_narrow_tbl):
|
|||||||
ble 0f
|
ble 0f
|
||||||
b 42b
|
b 42b
|
||||||
41:
|
41:
|
||||||
vld1.32 {d0[]}, [r2]
|
vld1.32 {d0[]}, [r2, :32]
|
||||||
vst1.32 {d0[0]}, [r0]
|
vst1.32 {d0[0]}, [r0, :32]
|
||||||
0:
|
0:
|
||||||
pop {r4,pc}
|
pop {r4,pc}
|
||||||
|
|
||||||
@@ -785,7 +774,7 @@ function sgr_box3_h_8bpc_neon, export=1
|
|||||||
bne 4f
|
bne 4f
|
||||||
// If we'll need to pad the right edge, load that byte to pad with
|
// If we'll need to pad the right edge, load that byte to pad with
|
||||||
// here since we can find it pretty easily from here.
|
// here since we can find it pretty easily from here.
|
||||||
sub lr, r5, #(2 + 16 - 2 + 1)
|
sub lr, r5, #(2 + 16 - 2 + 1)
|
||||||
ldrb r11, [r3, lr]
|
ldrb r11, [r3, lr]
|
||||||
ldrb lr, [r12, lr]
|
ldrb lr, [r12, lr]
|
||||||
// Fill q14/q15 with the right padding pixel
|
// Fill q14/q15 with the right padding pixel
|
||||||
@@ -1058,7 +1047,7 @@ function sgr_box5_h_8bpc_neon, export=1
|
|||||||
bne 4f
|
bne 4f
|
||||||
// If we'll need to pad the right edge, load that byte to pad with
|
// If we'll need to pad the right edge, load that byte to pad with
|
||||||
// here since we can find it pretty easily from here.
|
// here since we can find it pretty easily from here.
|
||||||
sub lr, r5, #(2 + 16 - 3 + 1)
|
sub lr, r5, #(2 + 16 - 3 + 1)
|
||||||
ldrb r11, [r3, lr]
|
ldrb r11, [r3, lr]
|
||||||
ldrb lr, [r12, lr]
|
ldrb lr, [r12, lr]
|
||||||
// Fill q14/q15 with the right padding pixel
|
// Fill q14/q15 with the right padding pixel
|
||||||
@@ -1100,7 +1089,7 @@ function sgr_box5_h_8bpc_neon, export=1
|
|||||||
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
|
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
|
||||||
vaddl_u16_n q8, q9, d18, d19, d20, d21, \w
|
vaddl_u16_n q8, q9, d18, d19, d20, d21, \w
|
||||||
vaddw_u16_n q12, q13, d22, d23, \w
|
vaddw_u16_n q12, q13, d22, d23, \w
|
||||||
vadd_i32_n q12, q13, q8, q9, \w
|
vadd_i32_n q12, q13, q8, q9, \w
|
||||||
vext.8 q8, q5, q6, #2
|
vext.8 q8, q5, q6, #2
|
||||||
vext.8 q9, q5, q6, #4
|
vext.8 q9, q5, q6, #4
|
||||||
vext.8 q10, q5, q6, #6
|
vext.8 q10, q5, q6, #6
|
||||||
@@ -1152,7 +1141,7 @@ function sgr_box5_h_8bpc_neon, export=1
|
|||||||
|
|
||||||
6: // Pad the right edge and produce the last few pixels.
|
6: // Pad the right edge and produce the last few pixels.
|
||||||
// w < 7, w+1 pixels valid in q0/q4
|
// w < 7, w+1 pixels valid in q0/q4
|
||||||
sub lr, r5, #1
|
sub lr, r5, #1
|
||||||
// lr = pixels valid - 2
|
// lr = pixels valid - 2
|
||||||
adr r11, L(box5_variable_shift_tbl)
|
adr r11, L(box5_variable_shift_tbl)
|
||||||
ldr lr, [r11, lr, lsl #2]
|
ldr lr, [r11, lr, lsl #2]
|
||||||
|
|||||||
720
third_party/dav1d/src/arm/32/looprestoration16.S
vendored
Normal file
720
third_party/dav1d/src/arm/32/looprestoration16.S
vendored
Normal file
@@ -0,0 +1,720 @@
|
|||||||
|
/*
|
||||||
|
* Copyright © 2018, VideoLAN and dav1d authors
|
||||||
|
* Copyright © 2020, Martin Storsjo
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "src/arm/asm.S"
|
||||||
|
#include "util.S"
|
||||||
|
|
||||||
|
// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
|
||||||
|
// const pixel *src, ptrdiff_t stride,
|
||||||
|
// const int16_t fh[7], const intptr_t w,
|
||||||
|
// int h, enum LrEdgeFlags edges,
|
||||||
|
// const int bitdepth_max);
|
||||||
|
function wiener_filter_h_16bpc_neon, export=1
|
||||||
|
push {r4-r11,lr}
|
||||||
|
vpush {q4-q7}
|
||||||
|
ldrd r4, r5, [sp, #100]
|
||||||
|
ldrd r6, r7, [sp, #108]
|
||||||
|
ldr r8, [sp, #116] // bitdepth_max
|
||||||
|
vld1.16 {q0}, [r4]
|
||||||
|
clz r8, r8
|
||||||
|
vmov.i32 q14, #1
|
||||||
|
sub r9, r8, #38 // -(bitdepth + 6)
|
||||||
|
sub r8, r8, #25 // -round_bits_h
|
||||||
|
neg r9, r9 // bitdepth + 6
|
||||||
|
vdup.32 q1, r9
|
||||||
|
vdup.32 q13, r8 // -round_bits_h
|
||||||
|
vmov.i16 q15, #8192
|
||||||
|
vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6)
|
||||||
|
mov r8, r5
|
||||||
|
// Calculate mid_stride
|
||||||
|
add r10, r5, #7
|
||||||
|
bic r10, r10, #7
|
||||||
|
lsl r10, r10, #1
|
||||||
|
|
||||||
|
// Clear the last unused element of q0, to allow filtering a single
|
||||||
|
// pixel with one plain vmul+vpadd.
|
||||||
|
mov r12, #0
|
||||||
|
vmov.16 d1[3], r12
|
||||||
|
|
||||||
|
// Set up pointers for reading/writing alternate rows
|
||||||
|
add r12, r0, r10
|
||||||
|
lsl r10, r10, #1
|
||||||
|
add lr, r2, r3
|
||||||
|
lsl r3, r3, #1
|
||||||
|
|
||||||
|
// Subtract the width from mid_stride
|
||||||
|
sub r10, r10, r5, lsl #1
|
||||||
|
|
||||||
|
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
|
||||||
|
cmp r5, #8
|
||||||
|
add r11, r5, #13
|
||||||
|
bic r11, r11, #7
|
||||||
|
bge 1f
|
||||||
|
mov r11, #16
|
||||||
|
1:
|
||||||
|
sub r3, r3, r11, lsl #1
|
||||||
|
|
||||||
|
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
|
||||||
|
tst r7, #1 // LR_HAVE_LEFT
|
||||||
|
beq 2f
|
||||||
|
// LR_HAVE_LEFT
|
||||||
|
cmp r1, #0
|
||||||
|
bne 0f
|
||||||
|
// left == NULL
|
||||||
|
sub r2, r2, #6
|
||||||
|
sub lr, lr, #6
|
||||||
|
b 1f
|
||||||
|
0: // LR_HAVE_LEFT, left != NULL
|
||||||
|
2: // !LR_HAVE_LEFT, increase the stride.
|
||||||
|
// For this case we don't read the left 3 pixels from the src pointer,
|
||||||
|
// but shift it as if we had done that.
|
||||||
|
add r3, r3, #6
|
||||||
|
|
||||||
|
|
||||||
|
1: // Loop vertically
|
||||||
|
vld1.16 {q2, q3}, [r2]!
|
||||||
|
vld1.16 {q4, q5}, [lr]!
|
||||||
|
|
||||||
|
tst r7, #1 // LR_HAVE_LEFT
|
||||||
|
beq 0f
|
||||||
|
cmp r1, #0
|
||||||
|
beq 2f
|
||||||
|
// LR_HAVE_LEFT, left != NULL
|
||||||
|
vld1.16 {d3}, [r1]!
|
||||||
|
// Move r2/lr back to account for the last 3 pixels we loaded earlier,
|
||||||
|
// which we'll shift out.
|
||||||
|
sub r2, r2, #6
|
||||||
|
sub lr, lr, #6
|
||||||
|
vld1.16 {d13}, [r1]!
|
||||||
|
vext.8 q3, q2, q3, #10
|
||||||
|
vext.8 q2, q1, q2, #10
|
||||||
|
vext.8 q5, q4, q5, #10
|
||||||
|
vext.8 q4, q6, q4, #10
|
||||||
|
b 2f
|
||||||
|
0:
|
||||||
|
// !LR_HAVE_LEFT, fill q1 with the leftmost pixel
|
||||||
|
// and shift q2/q3 to have 3x the first pixel at the front.
|
||||||
|
vdup.16 q1, d4[0]
|
||||||
|
vdup.16 q6, d8[0]
|
||||||
|
// Move r2 back to account for the last 3 pixels we loaded before,
|
||||||
|
// which we shifted out.
|
||||||
|
sub r2, r2, #6
|
||||||
|
sub lr, lr, #6
|
||||||
|
vext.8 q3, q2, q3, #10
|
||||||
|
vext.8 q2, q1, q2, #10
|
||||||
|
vext.8 q5, q4, q5, #10
|
||||||
|
vext.8 q4, q6, q4, #10
|
||||||
|
|
||||||
|
2:
|
||||||
|
|
||||||
|
tst r7, #2 // LR_HAVE_RIGHT
|
||||||
|
bne 4f
|
||||||
|
// If we'll need to pad the right edge, load that pixel to pad with
|
||||||
|
// here since we can find it pretty easily from here.
|
||||||
|
sub r9, r5, #14
|
||||||
|
lsl r9, r9, #1
|
||||||
|
ldrh r11, [r2, r9]
|
||||||
|
ldrh r9, [lr, r9]
|
||||||
|
// Fill q11/q12 with the right padding pixel
|
||||||
|
vdup.16 q11, r11
|
||||||
|
vdup.16 q12, r9
|
||||||
|
3: // !LR_HAVE_RIGHT
|
||||||
|
// If we'll have to pad the right edge we need to quit early here.
|
||||||
|
cmp r5, #11
|
||||||
|
bge 4f // If w >= 11, all used input pixels are valid
|
||||||
|
cmp r5, #7
|
||||||
|
bge 5f // If w >= 7, we can filter 4 pixels
|
||||||
|
b 6f
|
||||||
|
|
||||||
|
4: // Loop horizontally
|
||||||
|
vext.8 q10, q2, q3, #6
|
||||||
|
vext.8 q8, q2, q3, #2
|
||||||
|
vext.8 q9, q2, q3, #4
|
||||||
|
vshll.u16 q6, d20, #7
|
||||||
|
vshll.u16 q7, d21, #7
|
||||||
|
vmlal.s16 q6, d4, d0[0]
|
||||||
|
vmlal.s16 q6, d16, d0[1]
|
||||||
|
vmlal.s16 q6, d18, d0[2]
|
||||||
|
vmlal.s16 q6, d20, d0[3]
|
||||||
|
vmlal.s16 q7, d5, d0[0]
|
||||||
|
vmlal.s16 q7, d17, d0[1]
|
||||||
|
vmlal.s16 q7, d19, d0[2]
|
||||||
|
vmlal.s16 q7, d21, d0[3]
|
||||||
|
vext.8 q8, q2, q3, #8
|
||||||
|
vext.8 q9, q2, q3, #10
|
||||||
|
vext.8 q10, q2, q3, #12
|
||||||
|
vmlal.s16 q6, d16, d1[0]
|
||||||
|
vmlal.s16 q6, d18, d1[1]
|
||||||
|
vmlal.s16 q6, d20, d1[2]
|
||||||
|
vmlal.s16 q7, d17, d1[0]
|
||||||
|
vmlal.s16 q7, d19, d1[1]
|
||||||
|
vmlal.s16 q7, d21, d1[2]
|
||||||
|
vext.8 q10, q4, q5, #6
|
||||||
|
vext.8 q2, q4, q5, #2
|
||||||
|
vshll.u16 q8, d20, #7
|
||||||
|
vshll.u16 q9, d21, #7
|
||||||
|
vmlal.s16 q8, d8, d0[0]
|
||||||
|
vmlal.s16 q8, d4, d0[1]
|
||||||
|
vmlal.s16 q8, d20, d0[3]
|
||||||
|
vmlal.s16 q9, d9, d0[0]
|
||||||
|
vmlal.s16 q9, d5, d0[1]
|
||||||
|
vmlal.s16 q9, d21, d0[3]
|
||||||
|
vext.8 q2, q4, q5, #4
|
||||||
|
vext.8 q10, q4, q5, #8
|
||||||
|
vmlal.s16 q8, d4, d0[2]
|
||||||
|
vmlal.s16 q8, d20, d1[0]
|
||||||
|
vmlal.s16 q9, d5, d0[2]
|
||||||
|
vmlal.s16 q9, d21, d1[0]
|
||||||
|
vext.8 q2, q4, q5, #10
|
||||||
|
vext.8 q10, q4, q5, #12
|
||||||
|
vmlal.s16 q8, d4, d1[1]
|
||||||
|
vmlal.s16 q8, d20, d1[2]
|
||||||
|
vmlal.s16 q9, d5, d1[1]
|
||||||
|
vmlal.s16 q9, d21, d1[2]
|
||||||
|
|
||||||
|
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
|
||||||
|
vadd.i32 q6, q6, q14
|
||||||
|
vadd.i32 q7, q7, q14
|
||||||
|
vadd.i32 q8, q8, q14
|
||||||
|
vadd.i32 q9, q9, q14
|
||||||
|
vrshl.s32 q6, q6, q13
|
||||||
|
vrshl.s32 q7, q7, q13
|
||||||
|
vrshl.s32 q8, q8, q13
|
||||||
|
vrshl.s32 q9, q9, q13
|
||||||
|
vqmovun.s32 d12, q6
|
||||||
|
vqmovun.s32 d13, q7
|
||||||
|
vqmovun.s32 d14, q8
|
||||||
|
vqmovun.s32 d15, q9
|
||||||
|
vmin.u16 q6, q6, q10
|
||||||
|
vmin.u16 q7, q7, q10
|
||||||
|
vsub.i16 q6, q6, q15
|
||||||
|
vsub.i16 q7, q7, q15
|
||||||
|
vst1.16 {q6}, [r0, :128]!
|
||||||
|
vst1.16 {q7}, [r12, :128]!
|
||||||
|
|
||||||
|
subs r5, r5, #8
|
||||||
|
ble 9f
|
||||||
|
tst r7, #2 // LR_HAVE_RIGHT
|
||||||
|
vmov q2, q3
|
||||||
|
vmov q4, q5
|
||||||
|
vld1.16 {q3}, [r2]!
|
||||||
|
vld1.16 {q5}, [lr]!
|
||||||
|
bne 4b // If we don't need to pad, just keep filtering.
|
||||||
|
b 3b // If we need to pad, check how many pixels we have left.
|
||||||
|
|
||||||
|
5: // Filter 4 pixels, 7 <= w < 11
|
||||||
|
.macro filter_4
|
||||||
|
vext.8 d18, d4, d5, #6
|
||||||
|
vext.8 d16, d4, d5, #2
|
||||||
|
vext.8 d17, d4, d5, #4
|
||||||
|
vext.8 d19, d5, d6, #2
|
||||||
|
vext.8 d20, d5, d6, #4
|
||||||
|
vshll.u16 q6, d18, #7
|
||||||
|
vmlal.s16 q6, d4, d0[0]
|
||||||
|
vmlal.s16 q6, d16, d0[1]
|
||||||
|
vmlal.s16 q6, d17, d0[2]
|
||||||
|
vmlal.s16 q6, d18, d0[3]
|
||||||
|
vmlal.s16 q6, d5, d1[0]
|
||||||
|
vmlal.s16 q6, d19, d1[1]
|
||||||
|
vmlal.s16 q6, d20, d1[2]
|
||||||
|
|
||||||
|
vext.8 d18, d8, d9, #6
|
||||||
|
vext.8 d16, d8, d9, #2
|
||||||
|
vext.8 d17, d8, d9, #4
|
||||||
|
vext.8 d19, d9, d10, #2
|
||||||
|
vext.8 d20, d9, d10, #4
|
||||||
|
vshll.u16 q7, d18, #7
|
||||||
|
vmlal.s16 q7, d8, d0[0]
|
||||||
|
vmlal.s16 q7, d16, d0[1]
|
||||||
|
vmlal.s16 q7, d17, d0[2]
|
||||||
|
vmlal.s16 q7, d18, d0[3]
|
||||||
|
vmlal.s16 q7, d9, d1[0]
|
||||||
|
vmlal.s16 q7, d19, d1[1]
|
||||||
|
vmlal.s16 q7, d20, d1[2]
|
||||||
|
|
||||||
|
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
|
||||||
|
vadd.i32 q6, q6, q14
|
||||||
|
vadd.i32 q7, q7, q14
|
||||||
|
vrshl.s32 q6, q6, q13
|
||||||
|
vrshl.s32 q7, q7, q13
|
||||||
|
vqmovun.s32 d12, q6
|
||||||
|
vqmovun.s32 d13, q7
|
||||||
|
vmin.u16 q6, q6, q10
|
||||||
|
vsub.i16 q6, q6, q15
|
||||||
|
.endm
|
||||||
|
filter_4
|
||||||
|
vst1.16 {d12}, [r0, :64]!
|
||||||
|
vst1.16 {d13}, [r12, :64]!
|
||||||
|
|
||||||
|
subs r5, r5, #4 // 3 <= w < 7
|
||||||
|
vext.8 q2, q2, q3, #8
|
||||||
|
vext.8 q3, q3, q3, #8
|
||||||
|
vext.8 q4, q4, q5, #8
|
||||||
|
vext.8 q5, q5, q5, #8
|
||||||
|
|
||||||
|
6: // Pad the right edge and filter the last few pixels.
|
||||||
|
// w < 7, w+3 pixels valid in q2-q3
|
||||||
|
cmp r5, #5
|
||||||
|
blt 7f
|
||||||
|
bgt 8f
|
||||||
|
// w == 5, 8 pixels valid in q2, q3 invalid
|
||||||
|
vmov q3, q11
|
||||||
|
vmov q5, q12
|
||||||
|
b 88f
|
||||||
|
|
||||||
|
7: // 1 <= w < 5, 4-7 pixels valid in q2
|
||||||
|
sub r9, r5, #1
|
||||||
|
// r9 = (pixels valid - 4)
|
||||||
|
adr r11, L(variable_shift_tbl)
|
||||||
|
ldr r9, [r11, r9, lsl #2]
|
||||||
|
add r11, r11, r9
|
||||||
|
vmov q3, q11
|
||||||
|
vmov q5, q12
|
||||||
|
bx r11
|
||||||
|
|
||||||
|
.align 2
|
||||||
|
L(variable_shift_tbl):
|
||||||
|
.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||||
|
.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||||
|
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||||
|
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||||
|
|
||||||
|
44: // 4 pixels valid in q2/q4, fill the high half with padding.
|
||||||
|
vmov d5, d6
|
||||||
|
vmov d9, d10
|
||||||
|
b 88f
|
||||||
|
// Shift q2 right, shifting out invalid pixels,
|
||||||
|
// shift q2 left to the original offset, shifting in padding pixels.
|
||||||
|
55: // 5 pixels valid
|
||||||
|
vext.8 q2, q2, q2, #10
|
||||||
|
vext.8 q2, q2, q3, #6
|
||||||
|
vext.8 q4, q4, q4, #10
|
||||||
|
vext.8 q4, q4, q5, #6
|
||||||
|
b 88f
|
||||||
|
66: // 6 pixels valid
|
||||||
|
vext.8 q2, q2, q2, #12
|
||||||
|
vext.8 q2, q2, q3, #4
|
||||||
|
vext.8 q4, q4, q4, #12
|
||||||
|
vext.8 q4, q4, q5, #4
|
||||||
|
b 88f
|
||||||
|
77: // 7 pixels valid
|
||||||
|
vext.8 q2, q2, q2, #14
|
||||||
|
vext.8 q2, q2, q3, #2
|
||||||
|
vext.8 q4, q4, q4, #14
|
||||||
|
vext.8 q4, q4, q5, #2
|
||||||
|
b 88f
|
||||||
|
|
||||||
|
8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3
|
||||||
|
vext.8 q3, q3, q3, #2
|
||||||
|
vext.8 q3, q3, q11, #14
|
||||||
|
vext.8 q5, q5, q5, #2
|
||||||
|
vext.8 q5, q5, q12, #14
|
||||||
|
|
||||||
|
88:
|
||||||
|
// w < 7, q2-q3 padded properly
|
||||||
|
cmp r5, #4
|
||||||
|
blt 888f
|
||||||
|
|
||||||
|
// w >= 4, filter 4 pixels
|
||||||
|
filter_4
|
||||||
|
vst1.16 {d12}, [r0, :64]!
|
||||||
|
vst1.16 {d13}, [r12, :64]!
|
||||||
|
subs r5, r5, #4 // 0 <= w < 4
|
||||||
|
vext.8 q2, q2, q3, #8
|
||||||
|
vext.8 q4, q4, q5, #8
|
||||||
|
beq 9f
|
||||||
|
888: // 1 <= w < 4, filter 1 pixel at a time
|
||||||
|
vmull.s16 q6, d4, d0
|
||||||
|
vmull.s16 q7, d5, d1
|
||||||
|
vmull.s16 q8, d8, d0
|
||||||
|
vmull.s16 q9, d9, d1
|
||||||
|
vadd.i32 q6, q7
|
||||||
|
vadd.i32 q8, q9
|
||||||
|
vpadd.i32 d12, d12, d13
|
||||||
|
vpadd.i32 d13, d16, d17
|
||||||
|
vdup.16 d14, d4[3]
|
||||||
|
vdup.16 d15, d8[3]
|
||||||
|
vpadd.i32 d12, d12, d13
|
||||||
|
vtrn.16 d14, d15
|
||||||
|
vadd.i32 d12, d12, d28
|
||||||
|
vshll.u16 q7, d14, #7
|
||||||
|
vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1
|
||||||
|
vadd.i32 d12, d12, d14
|
||||||
|
vrshl.s32 d12, d12, d26
|
||||||
|
vqmovun.s32 d12, q6
|
||||||
|
vmin.u16 d12, d12, d20
|
||||||
|
vsub.i16 d12, d12, d30
|
||||||
|
vst1.16 {d12[0]}, [r0, :16]!
|
||||||
|
vst1.16 {d12[1]}, [r12, :16]!
|
||||||
|
subs r5, r5, #1
|
||||||
|
vext.8 q2, q2, q3, #2
|
||||||
|
vext.8 q4, q4, q5, #2
|
||||||
|
bgt 888b
|
||||||
|
|
||||||
|
9:
|
||||||
|
subs r6, r6, #2
|
||||||
|
ble 0f
|
||||||
|
// Jump to the next row and loop horizontally
|
||||||
|
add r0, r0, r10
|
||||||
|
add r12, r12, r10
|
||||||
|
add r2, r2, r3
|
||||||
|
add lr, lr, r3
|
||||||
|
mov r5, r8
|
||||||
|
b 1b
|
||||||
|
0:
|
||||||
|
vpop {q4-q7}
|
||||||
|
pop {r4-r11,pc}
|
||||||
|
.purgem filter_4
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
//                                       const int16_t *mid, int w, int h,
//                                       const int16_t fv[7], enum LrEdgeFlags edges,
//                                       ptrdiff_t mid_stride, const int bitdepth_max);
//
// Vertical 7-tap pass of the Wiener filter for 16 bpc.
//
// The image is processed in vertical slices that are 8 pixels (16 bytes)
// wide. Within a slice, seven consecutive rows of the intermediate buffer
// are kept in q8-q14 (q11 is the row being output) and are shifted down by
// one register after each output row. Rows above/below the slice come from
// the intermediate buffer when LR_HAVE_TOP/LR_HAVE_BOTTOM are set in edges,
// otherwise the nearest row is replicated.
//
// Register use:
//   r0  = dst            r1  = stride        r2 = mid      r3 = w
//   r4  = rows left      r5  = fv            r6 = edges    r7 = mid_stride
//   r12 = number of mid rows consumed per slice (for rewinding r2)
//   lr  = h (kept for rewinding r0 and reloading r4)
//   q0  = filter coefficients, with 128 added to fv[3]
//   q4  = -round_bits_v (shift count for vrshl)
//   q5  = bitdepth_max (upper clamp)
//   q15, q1 = extra bottom edge/padding rows queued behind q14
function wiener_filter_v_16bpc_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q5}
        // 20 bytes of core registers + 32 bytes of NEON registers were
        // pushed, so the stack arguments start at sp + 52.
        ldrd            r4,  r5,  [sp, #52]     // h, fv
        ldrd            r6,  r7,  [sp, #60]     // edges, mid_stride
        ldr             lr,  [sp, #68]          // bitdepth_max
        vmov.i16        q1,  #0
        mov             r12, #128
        vld1.16         {q0},  [r5]             // 8 lanes loaded; only d0[0]-d1[2] are used
        vdup.16         q5,  lr
        clz             lr,  lr
        vmov.i16        d2[3], r12
        sub             lr,  lr,  #11           // round_bits_v
        vadd.i16        q0,  q0,  q1            // fv[3] += 128
        vdup.32         q4,  lr
        mov             lr,  r4
        vneg.s32        q4,  q4                 // -round_bits_v

        // Calculate the number of rows to move back when looping vertically
        mov             r12, r4
        tst             r6,  #4                 // LR_HAVE_TOP
        beq             0f
        sub             r2,  r2,  r7,  lsl #1   // start two rows above the first output row
        add             r12, r12, #2
0:
        tst             r6,  #8                 // LR_HAVE_BOTTOM
        beq             1f
        add             r12, r12, #2

1:      // Start of horizontal loop; start one vertical filter slice.
        // Load rows into q8-q11 and pad properly.
        tst             r6,  #4                 // LR_HAVE_TOP
        vld1.16         {q8},  [r2, :128], r7
        beq             2f
        // LR_HAVE_TOP
        vld1.16         {q10}, [r2, :128], r7
        vmov            q9,  q8
        vld1.16         {q11}, [r2, :128], r7
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q9,  q8
        vmov            q10, q8
        vmov            q11, q8

3:
        cmp             r4,  #4
        blt             5f
        // Start filtering normally; fill in q12-q14 with unique rows.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vld1.16         {q14}, [r2, :128], r7

4:
// Filter one output row of 8 pixels from q8-q14, store it and shift the
// row window down by one register. With compare=1 the macro leaves the
// flags of "cmp r4, #4" for the caller; with compare=0 it branches to 9f
// once the row counter reaches zero (using the flags from the subs, which
// none of the NEON instructions in between modify).
.macro filter compare
        subs            r4,  r4,  #1
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        vmull.s16       q2,  d16, d0[0]
        vmlal.s16       q2,  d18, d0[1]
        vmlal.s16       q2,  d20, d0[2]
        vmlal.s16       q2,  d22, d0[3]
        vmlal.s16       q2,  d24, d1[0]
        vmlal.s16       q2,  d26, d1[1]
        vmlal.s16       q2,  d28, d1[2]
        vmull.s16       q3,  d17, d0[0]
        vmlal.s16       q3,  d19, d0[1]
        vmlal.s16       q3,  d21, d0[2]
        vmlal.s16       q3,  d23, d0[3]
        vmlal.s16       q3,  d25, d1[0]
        vmlal.s16       q3,  d27, d1[1]
        vmlal.s16       q3,  d29, d1[2]
        vrshl.s32       q2,  q2,  q4            // round_bits_v
        vrshl.s32       q3,  q3,  q4
        vqmovun.s32     d4,  q2
        vqmovun.s32     d5,  q3
        vmin.u16        q2,  q2,  q5            // bitdepth_max
        vst1.16         {q2},  [r0], r1
.if \compare
        cmp             r4,  #4
.else
        ble             9f
.endif
        vmov            q8,  q9
        vmov            q9,  q10
        vmov            q10, q11
        vmov            q11, q12
        vmov            q12, q13
        vmov            q13, q14
.endm
        filter          1
        blt             7f
        vld1.16         {q14}, [r2, :128], r7
        b               4b

5:      // Less than 4 rows in total; not all of q12-q13 are filled yet.
        tst             r6,  #8                 // LR_HAVE_BOTTOM
        beq             6f
        // LR_HAVE_BOTTOM
        cmp             r4,  #2
        // We load at least 2 rows in all cases.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        bgt             53f // 3 rows in total
        beq             52f // 2 rows in total
51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
        // q12 and q13 hold the two edge rows; replicate the last one into
        // q14, which is the seventh tap read by the filter macro. (Copying
        // q12 into q13 instead would drop the second edge row and leave q14
        // unset.)
        vmov            q14, q13
        b               8f
52:     // 2 rows in total, q11 already loaded, load q12 with content data
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vmov            q15, q14
        b               8f
53:
        // 3 rows in total, q11 already loaded, load q12 and q13 with content
        // and 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f

6:
        // !LR_HAVE_BOTTOM
        cmp             r4,  #2
        bgt             63f // 3 rows in total
        beq             62f // 2 rows in total
61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
        vmov            q12, q11
        vmov            q13, q11
        vmov            q14, q11
        b               8f
62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
        vld1.16         {q12}, [r2, :128], r7
        vmov            q13, q12
        vmov            q14, q12
        vmov            q15, q12
        b               8f
63:
        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
        vld1.16         {q12}, [r2, :128], r7
        vld1.16         {q13}, [r2, :128], r7
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13
        b               8f

7:
        // All registers up to q13 are filled already, 3 valid rows left.
        // < 4 valid rows left; fill in padding and filter the last
        // few rows.
        tst             r6,  #8                 // LR_HAVE_BOTTOM
        beq             71f
        // LR_HAVE_BOTTOM; load 2 rows of edge.
        vld1.16         {q14}, [r2, :128], r7
        vld1.16         {q15}, [r2, :128], r7
        vmov            q1,  q15
        b               8f
71:
        // !LR_HAVE_BOTTOM, pad 3 rows
        vmov            q14, q13
        vmov            q15, q13
        vmov            q1,  q13

8:      // At this point, all registers up to q14-q15,q1 are loaded with
        // edge/padding (depending on how many rows are left).
        filter          0 // This branches to 9f when done
        vmov            q14, q15
        vmov            q15, q1
        b               8b

9:      // End of one vertical slice.
        subs            r3,  r3,  #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
        mls             r0,  r1,  lr,  r0       // dst -= stride * h
        mls             r2,  r7,  r12, r2       // mid -= mid_stride * rows consumed
        add             r0,  r0,  #16
        add             r2,  r2,  #16
        mov             r4,  lr
        b               1b

0:
        vpop            {q4-q5}
        pop             {r4-r7,pc}
.purgem filter
endfunc
|
||||||
|
|
||||||
|
// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
//                                   const pixel *src, int w, int h);
//
// Copies h rows of w 16-bit pixels from src to dst. The source rows are
// read back to back (src advances by 2*w bytes per row), while the
// destination rows are stride bytes apart. w selects one of seven
// specialised copy loops (w = 1..7) through an offset table.
//
//   r0 = dst, r1 = stride, r2 = src, r3 = w (then scratch), r4 = rows left
function copy_narrow_16bpc_neon, export=1
        push            {r4,lr}
        ldr             r4,  [sp, #8]           // h, above the two pushed registers
        // Jump to the handler for this width; the table holds offsets
        // relative to its own address.
        adr             r12, L(copy_narrow_tbl)
        ldr             r3,  [r12, r3, lsl #2]
        add             r12, r12, r3
        bx              r12

        .align 2
L(copy_narrow_tbl):
        .word 0                                 // slot for w == 0, no handler
        .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
        .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
        .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
        .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
        .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
        .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
        .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB

10:     // w == 1: one pixel per row
        add             r3,  r0,  r1            // r3 walks the odd rows, r0 the even ones
        lsl             r1,  r1,  #1
1:      // Eight rows per iteration while at least eight remain.
        subs            r4,  r4,  #8
        blt             2f
        vld1.16         {q0},  [r2, :128]!
        vst1.16         {d0[0]}, [r0, :16], r1
        vst1.16         {d0[1]}, [r3, :16], r1
        vst1.16         {d0[2]}, [r0, :16], r1
        vst1.16         {d0[3]}, [r3, :16], r1
        vst1.16         {d1[0]}, [r0, :16], r1
        vst1.16         {d1[1]}, [r3, :16], r1
        vst1.16         {d1[2]}, [r0, :16], r1
        vst1.16         {d1[3]}, [r3, :16], r1
        bgt             1b                      // flags still from the subs above
        pop             {r4,pc}
2:      // Fewer than eight rows left: undo the subtraction, restore the
        // single-row stride and copy row by row.
        add             r4,  r4,  #8
        asr             r1,  r1,  #1
3:
        subs            r4,  r4,  #1
        vld1.16         {d0[]},  [r2]!
        vst1.16         {d0[0]}, [r0], r1
        bgt             3b
        pop             {r4,pc}

20:     // w == 2: one 32-bit element per row
        add             r3,  r0,  r1
        lsl             r1,  r1,  #1
1:      // Four rows per iteration while at least four remain.
        subs            r4,  r4,  #4
        blt             2f
        vld1.32         {q0},  [r2, :128]!
        vst1.32         {d0[0]}, [r0, :32], r1
        vst1.32         {d0[1]}, [r3, :32], r1
        vst1.32         {d1[0]}, [r0, :32], r1
        vst1.32         {d1[1]}, [r3, :32], r1
        bgt             1b
        pop             {r4,pc}
2:      // Fewer than four rows left.
        add             r4,  r4,  #4
        asr             r1,  r1,  #1
3:
        subs            r4,  r4,  #1
        vld1.32         {d0[]},  [r2, :32]!
        vst1.32         {d0[0]}, [r0, :32], r1
        bgt             3b
        pop             {r4,pc}

30:     // w == 3: 4 + 2 bytes per row
        ldr             r3,  [r2]
        ldrh            r12, [r2, #4]
        add             r2,  r2,  #6
        str             r3,  [r0]
        strh            r12, [r0, #4]
        add             r0,  r0,  r1
        subs            r4,  r4,  #1
        bgt             30b
        pop             {r4,pc}

40:     // w == 4: one d register per row
        add             r3,  r0,  r1
        lsl             r1,  r1,  #1
1:      // Two rows per iteration.
        subs            r4,  r4,  #2
        blt             2f
        vld1.16         {q0},  [r2, :128]!
        vst1.16         {d0},  [r0, :64], r1
        vst1.16         {d1},  [r3, :64], r1
        bgt             1b
        pop             {r4,pc}
2:      // One final row.
        vld1.16         {d0},  [r2, :64]
        vst1.16         {d0},  [r0, :64]
        pop             {r4,pc}

50:     // w == 5: 8 + 2 bytes per row
        vld1.16         {d0},  [r2]
        ldrh            r12, [r2, #8]
        add             r2,  r2,  #10
        vst1.16         {d0},  [r0]
        strh            r12, [r0, #8]
        add             r0,  r0,  r1
        subs            r4,  r4,  #1
        bgt             50b
        pop             {r4,pc}

60:     // w == 6: 8 + 4 bytes per row
        vld1.16         {d0},  [r2]
        ldr             r12, [r2, #8]
        add             r2,  r2,  #12
        vst1.16         {d0},  [r0]
        str             r12, [r0, #8]
        add             r0,  r0,  r1
        subs            r4,  r4,  #1
        bgt             60b
        pop             {r4,pc}

70:     // w == 7: 8 + 4 + 2 bytes per row
        vld1.16         {d0},  [r2]
        ldr             r12, [r2, #8]
        ldrh            lr,  [r2, #12]
        add             r2,  r2,  #14
        vst1.16         {d0},  [r0]
        str             r12, [r0, #8]
        strh            lr,  [r0, #12]
        add             r0,  r0,  r1
        subs            r4,  r4,  #1
        bgt             70b
        pop             {r4,pc}
endfunc
|
||||||
42
third_party/dav1d/src/arm/32/mc.S
vendored
42
third_party/dav1d/src/arm/32/mc.S
vendored
@@ -1403,12 +1403,12 @@ L(\type\()_8tap_h_tbl):
|
|||||||
vld1.8 {d24}, [\sr2], \s_strd
|
vld1.8 {d24}, [\sr2], \s_strd
|
||||||
vmovl.u8 q8, d16
|
vmovl.u8 q8, d16
|
||||||
vmovl.u8 q12, d24
|
vmovl.u8 q12, d24
|
||||||
vext.8 q9, q8, q8, #2
|
vext.8 d18, d16, d17, #2
|
||||||
vext.8 q10, q8, q8, #4
|
vext.8 d20, d16, d17, #4
|
||||||
vext.8 q11, q8, q8, #6
|
vext.8 d22, d16, d17, #6
|
||||||
vext.8 q13, q12, q12, #2
|
vext.8 d26, d24, d25, #2
|
||||||
vext.8 q14, q12, q12, #4
|
vext.8 d28, d24, d25, #4
|
||||||
vext.8 q15, q12, q12, #6
|
vext.8 d30, d24, d25, #6
|
||||||
subs \h, \h, #2
|
subs \h, \h, #2
|
||||||
vmul.s16 d4, d16, d0[0]
|
vmul.s16 d4, d16, d0[0]
|
||||||
vmla.s16 d4, d18, d0[1]
|
vmla.s16 d4, d18, d0[1]
|
||||||
@@ -1431,7 +1431,7 @@ L(\type\()_8tap_h_tbl):
|
|||||||
pop {r4-r11,pc}
|
pop {r4-r11,pc}
|
||||||
|
|
||||||
80: // 8xN h
|
80: // 8xN h
|
||||||
vld1.8 {d0}, [\mx]
|
vld1.8 {d0}, [\mx, :64]
|
||||||
sub \src, \src, #3
|
sub \src, \src, #3
|
||||||
add \ds2, \dst, \d_strd
|
add \ds2, \dst, \d_strd
|
||||||
add \sr2, \src, \s_strd
|
add \sr2, \src, \s_strd
|
||||||
@@ -1482,7 +1482,7 @@ L(\type\()_8tap_h_tbl):
|
|||||||
// one temporary for vext in the loop. That's slower on A7 and A53,
|
// one temporary for vext in the loop. That's slower on A7 and A53,
|
||||||
// (but surprisingly, marginally faster on A8 and A73).
|
// (but surprisingly, marginally faster on A8 and A73).
|
||||||
vpush {q4-q6}
|
vpush {q4-q6}
|
||||||
vld1.8 {d0}, [\mx]
|
vld1.8 {d0}, [\mx, :64]
|
||||||
sub \src, \src, #3
|
sub \src, \src, #3
|
||||||
add \ds2, \dst, \d_strd
|
add \ds2, \dst, \d_strd
|
||||||
add \sr2, \src, \s_strd
|
add \sr2, \src, \s_strd
|
||||||
@@ -1629,7 +1629,7 @@ L(\type\()_8tap_v_tbl):
|
|||||||
|
|
||||||
28: // 2x8, 2x16 v
|
28: // 2x8, 2x16 v
|
||||||
vpush {q4-q7}
|
vpush {q4-q7}
|
||||||
vld1.8 {d0}, [\my]
|
vld1.8 {d0}, [\my, :64]
|
||||||
sub \sr2, \src, \s_strd, lsl #1
|
sub \sr2, \src, \s_strd, lsl #1
|
||||||
add \ds2, \dst, \d_strd
|
add \ds2, \dst, \d_strd
|
||||||
sub \src, \sr2, \s_strd
|
sub \src, \sr2, \s_strd
|
||||||
@@ -1709,7 +1709,7 @@ L(\type\()_8tap_v_tbl):
|
|||||||
|
|
||||||
480: // 4x8, 4x16 v
|
480: // 4x8, 4x16 v
|
||||||
vpush {q4}
|
vpush {q4}
|
||||||
vld1.8 {d0}, [\my]
|
vld1.8 {d0}, [\my, :64]
|
||||||
sub \sr2, \src, \s_strd, lsl #1
|
sub \sr2, \src, \s_strd, lsl #1
|
||||||
add \ds2, \dst, \d_strd
|
add \ds2, \dst, \d_strd
|
||||||
sub \src, \sr2, \s_strd
|
sub \src, \sr2, \s_strd
|
||||||
@@ -1782,7 +1782,7 @@ L(\type\()_8tap_v_tbl):
|
|||||||
640:
|
640:
|
||||||
1280:
|
1280:
|
||||||
vpush {q4}
|
vpush {q4}
|
||||||
vld1.8 {d0}, [\my]
|
vld1.8 {d0}, [\my, :64]
|
||||||
sub \src, \src, \s_strd
|
sub \src, \src, \s_strd
|
||||||
sub \src, \src, \s_strd, lsl #1
|
sub \src, \src, \s_strd, lsl #1
|
||||||
vmovl.s8 q0, d0
|
vmovl.s8 q0, d0
|
||||||
@@ -1951,11 +1951,10 @@ L(\type\()_8tap_hv_tbl):
|
|||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
|
|
||||||
vext.8 d18, d17, d26, #4
|
vext.8 d18, d17, d26, #4
|
||||||
vmov d19, d26
|
|
||||||
vmull.s16 q2, d16, d2[0]
|
vmull.s16 q2, d16, d2[0]
|
||||||
vmlal.s16 q2, d17, d2[1]
|
vmlal.s16 q2, d17, d2[1]
|
||||||
vmlal.s16 q2, d18, d2[2]
|
vmlal.s16 q2, d18, d2[2]
|
||||||
vmlal.s16 q2, d19, d2[3]
|
vmlal.s16 q2, d26, d2[3]
|
||||||
|
|
||||||
vqrshrn.s32 d4, q2, #\shift_hv
|
vqrshrn.s32 d4, q2, #\shift_hv
|
||||||
vqmovun.s16 d4, q2
|
vqmovun.s16 d4, q2
|
||||||
@@ -1964,11 +1963,11 @@ L(\type\()_8tap_hv_tbl):
|
|||||||
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
|
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
|
||||||
ble 0f
|
ble 0f
|
||||||
vmov d16, d18
|
vmov d16, d18
|
||||||
vmov d17, d19
|
vmov d17, d26
|
||||||
b 2b
|
b 2b
|
||||||
|
|
||||||
280: // 2x8, 2x16, 2x32 hv
|
280: // 2x8, 2x16, 2x32 hv
|
||||||
vld1.8 {d2}, [\my]
|
vld1.8 {d2}, [\my, :64]
|
||||||
sub \src, \src, #1
|
sub \src, \src, #1
|
||||||
sub \sr2, \src, \s_strd, lsl #1
|
sub \sr2, \src, \s_strd, lsl #1
|
||||||
sub \src, \sr2, \s_strd
|
sub \src, \sr2, \s_strd
|
||||||
@@ -2001,7 +2000,6 @@ L(\type\()_8tap_hv_tbl):
|
|||||||
28:
|
28:
|
||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
vext.8 d22, d21, d26, #4
|
vext.8 d22, d21, d26, #4
|
||||||
vmov d23, d26
|
|
||||||
vmull.s16 q2, d16, d2[0]
|
vmull.s16 q2, d16, d2[0]
|
||||||
vmlal.s16 q2, d17, d2[1]
|
vmlal.s16 q2, d17, d2[1]
|
||||||
vmlal.s16 q2, d18, d2[2]
|
vmlal.s16 q2, d18, d2[2]
|
||||||
@@ -2009,7 +2007,7 @@ L(\type\()_8tap_hv_tbl):
|
|||||||
vmlal.s16 q2, d20, d3[0]
|
vmlal.s16 q2, d20, d3[0]
|
||||||
vmlal.s16 q2, d21, d3[1]
|
vmlal.s16 q2, d21, d3[1]
|
||||||
vmlal.s16 q2, d22, d3[2]
|
vmlal.s16 q2, d22, d3[2]
|
||||||
vmlal.s16 q2, d23, d3[3]
|
vmlal.s16 q2, d26, d3[3]
|
||||||
|
|
||||||
vqrshrn.s32 d4, q2, #\shift_hv
|
vqrshrn.s32 d4, q2, #\shift_hv
|
||||||
vqmovun.s16 d4, q2
|
vqmovun.s16 d4, q2
|
||||||
@@ -2022,7 +2020,7 @@ L(\type\()_8tap_hv_tbl):
|
|||||||
vmov d18, d20
|
vmov d18, d20
|
||||||
vmov d19, d21
|
vmov d19, d21
|
||||||
vmov d20, d22
|
vmov d20, d22
|
||||||
vmov d21, d23
|
vmov d21, d26
|
||||||
b 28b
|
b 28b
|
||||||
|
|
||||||
0:
|
0:
|
||||||
@@ -2108,7 +2106,7 @@ L(\type\()_8tap_filter_2):
|
|||||||
b 4b
|
b 4b
|
||||||
|
|
||||||
480: // 4x8, 4x16, 4x32 hv
|
480: // 4x8, 4x16, 4x32 hv
|
||||||
vld1.8 {d2}, [\my]
|
vld1.8 {d2}, [\my, :64]
|
||||||
sub \src, \src, #1
|
sub \src, \src, #1
|
||||||
sub \sr2, \src, \s_strd, lsl #1
|
sub \sr2, \src, \s_strd, lsl #1
|
||||||
sub \src, \sr2, \s_strd
|
sub \src, \sr2, \s_strd
|
||||||
@@ -2211,7 +2209,7 @@ L(\type\()_8tap_filter_4):
|
|||||||
bgt 880f
|
bgt 880f
|
||||||
vpush {q4-q7}
|
vpush {q4-q7}
|
||||||
add \my, \my, #2
|
add \my, \my, #2
|
||||||
vld1.8 {d0}, [\mx]
|
vld1.8 {d0}, [\mx, :64]
|
||||||
vld1.32 {d2[]}, [\my]
|
vld1.32 {d2[]}, [\my]
|
||||||
sub \src, \src, #3
|
sub \src, \src, #3
|
||||||
sub \src, \src, \s_strd
|
sub \src, \src, \s_strd
|
||||||
@@ -2301,8 +2299,8 @@ L(\type\()_8tap_filter_4):
|
|||||||
640:
|
640:
|
||||||
1280:
|
1280:
|
||||||
vpush {q4-q7}
|
vpush {q4-q7}
|
||||||
vld1.8 {d0}, [\mx]
|
vld1.8 {d0}, [\mx, :64]
|
||||||
vld1.8 {d2}, [\my]
|
vld1.8 {d2}, [\my, :64]
|
||||||
sub \src, \src, #3
|
sub \src, \src, #3
|
||||||
sub \src, \src, \s_strd
|
sub \src, \src, \s_strd
|
||||||
sub \src, \src, \s_strd, lsl #1
|
sub \src, \src, \s_strd, lsl #1
|
||||||
|
|||||||
2429
third_party/dav1d/src/arm/32/mc16.S
vendored
Normal file
2429
third_party/dav1d/src/arm/32/mc16.S
vendored
Normal file
File diff suppressed because it is too large
Load Diff
14
third_party/dav1d/src/arm/64/looprestoration16.S
vendored
14
third_party/dav1d/src/arm/64/looprestoration16.S
vendored
@@ -172,13 +172,13 @@ function wiener_filter_h_16bpc_neon, export=1
|
|||||||
// Interleaving the mul/mla chains actually hurts performance
|
// Interleaving the mul/mla chains actually hurts performance
|
||||||
// significantly on Cortex A53, thus keeping mul/mla tightly
|
// significantly on Cortex A53, thus keeping mul/mla tightly
|
||||||
// chained like this.
|
// chained like this.
|
||||||
|
ext v18.16b, v2.16b, v3.16b, #6
|
||||||
ext v16.16b, v2.16b, v3.16b, #2
|
ext v16.16b, v2.16b, v3.16b, #2
|
||||||
ext v17.16b, v2.16b, v3.16b, #4
|
ext v17.16b, v2.16b, v3.16b, #4
|
||||||
ext v18.16b, v2.16b, v3.16b, #6
|
|
||||||
ext v19.16b, v2.16b, v3.16b, #8
|
ext v19.16b, v2.16b, v3.16b, #8
|
||||||
ext v20.16b, v2.16b, v3.16b, #10
|
ext v20.16b, v2.16b, v3.16b, #10
|
||||||
ext v21.16b, v2.16b, v3.16b, #12
|
|
||||||
ushll_sz v6, v7, v18, #7, \wd
|
ushll_sz v6, v7, v18, #7, \wd
|
||||||
|
ext v21.16b, v2.16b, v3.16b, #12
|
||||||
smlal v6.4s, v2.4h, v0.h[0]
|
smlal v6.4s, v2.4h, v0.h[0]
|
||||||
smlal v6.4s, v16.4h, v0.h[1]
|
smlal v6.4s, v16.4h, v0.h[1]
|
||||||
smlal v6.4s, v17.4h, v0.h[2]
|
smlal v6.4s, v17.4h, v0.h[2]
|
||||||
@@ -195,13 +195,13 @@ function wiener_filter_h_16bpc_neon, export=1
|
|||||||
smlal2 v7.4s, v20.8h, v0.h[5]
|
smlal2 v7.4s, v20.8h, v0.h[5]
|
||||||
smlal2 v7.4s, v21.8h, v0.h[6]
|
smlal2 v7.4s, v21.8h, v0.h[6]
|
||||||
.endif
|
.endif
|
||||||
|
ext v21.16b, v4.16b, v5.16b, #6
|
||||||
ext v19.16b, v4.16b, v5.16b, #2
|
ext v19.16b, v4.16b, v5.16b, #2
|
||||||
ext v20.16b, v4.16b, v5.16b, #4
|
ext v20.16b, v4.16b, v5.16b, #4
|
||||||
ext v21.16b, v4.16b, v5.16b, #6
|
|
||||||
ext v22.16b, v4.16b, v5.16b, #8
|
ext v22.16b, v4.16b, v5.16b, #8
|
||||||
ext v23.16b, v4.16b, v5.16b, #10
|
ext v23.16b, v4.16b, v5.16b, #10
|
||||||
ext v24.16b, v4.16b, v5.16b, #12
|
|
||||||
ushll_sz v16, v17, v21, #7, \wd
|
ushll_sz v16, v17, v21, #7, \wd
|
||||||
|
ext v24.16b, v4.16b, v5.16b, #12
|
||||||
smlal v16.4s, v4.4h, v0.h[0]
|
smlal v16.4s, v4.4h, v0.h[0]
|
||||||
smlal v16.4s, v19.4h, v0.h[1]
|
smlal v16.4s, v19.4h, v0.h[1]
|
||||||
smlal v16.4s, v20.4h, v0.h[2]
|
smlal v16.4s, v20.4h, v0.h[2]
|
||||||
@@ -334,9 +334,9 @@ L(variable_shift_tbl):
|
|||||||
ins v6.s[1], v7.s[0]
|
ins v6.s[1], v7.s[0]
|
||||||
mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
|
mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
|
||||||
ushll v16.4s, v16.4h, #7
|
ushll v16.4s, v16.4h, #7
|
||||||
add v6.4s, v6.4s, v30.4s
|
add v6.2s, v6.2s, v30.2s
|
||||||
add v6.4s, v6.4s, v16.4s
|
add v6.2s, v6.2s, v16.2s
|
||||||
srshl v6.4s, v6.4s, v29.4s
|
srshl v6.2s, v6.2s, v29.2s
|
||||||
sqxtun v6.4h, v6.4s
|
sqxtun v6.4h, v6.4s
|
||||||
umin v6.4h, v6.4h, v24.4h
|
umin v6.4h, v6.4h, v24.4h
|
||||||
sub v6.4h, v6.4h, v31.4h
|
sub v6.4h, v6.4h, v31.4h
|
||||||
|
|||||||
10
third_party/dav1d/src/arm/64/mc.S
vendored
10
third_party/dav1d/src/arm/64/mc.S
vendored
@@ -1906,11 +1906,10 @@ L(\type\()_8tap_hv):
|
|||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
|
|
||||||
ext v18.8b, v17.8b, v28.8b, #4
|
ext v18.8b, v17.8b, v28.8b, #4
|
||||||
mov v19.8b, v28.8b
|
|
||||||
smull v2.4s, v16.4h, v1.h[0]
|
smull v2.4s, v16.4h, v1.h[0]
|
||||||
smlal v2.4s, v17.4h, v1.h[1]
|
smlal v2.4s, v17.4h, v1.h[1]
|
||||||
smlal v2.4s, v18.4h, v1.h[2]
|
smlal v2.4s, v18.4h, v1.h[2]
|
||||||
smlal v2.4s, v19.4h, v1.h[3]
|
smlal v2.4s, v28.4h, v1.h[3]
|
||||||
|
|
||||||
sqrshrn v2.4h, v2.4s, #\shift_hv
|
sqrshrn v2.4h, v2.4s, #\shift_hv
|
||||||
sqxtun v2.8b, v2.8h
|
sqxtun v2.8b, v2.8h
|
||||||
@@ -1919,7 +1918,7 @@ L(\type\()_8tap_hv):
|
|||||||
st1 {v2.h}[1], [\ds2], \d_strd
|
st1 {v2.h}[1], [\ds2], \d_strd
|
||||||
b.le 0f
|
b.le 0f
|
||||||
mov v16.8b, v18.8b
|
mov v16.8b, v18.8b
|
||||||
mov v17.8b, v19.8b
|
mov v17.8b, v28.8b
|
||||||
b 2b
|
b 2b
|
||||||
|
|
||||||
280: // 2x8, 2x16, 2x32 hv
|
280: // 2x8, 2x16, 2x32 hv
|
||||||
@@ -1956,7 +1955,6 @@ L(\type\()_8tap_hv):
|
|||||||
28:
|
28:
|
||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
ext v22.8b, v21.8b, v28.8b, #4
|
ext v22.8b, v21.8b, v28.8b, #4
|
||||||
mov v23.8b, v28.8b
|
|
||||||
smull v2.4s, v16.4h, v1.h[0]
|
smull v2.4s, v16.4h, v1.h[0]
|
||||||
smlal v2.4s, v17.4h, v1.h[1]
|
smlal v2.4s, v17.4h, v1.h[1]
|
||||||
smlal v2.4s, v18.4h, v1.h[2]
|
smlal v2.4s, v18.4h, v1.h[2]
|
||||||
@@ -1964,7 +1962,7 @@ L(\type\()_8tap_hv):
|
|||||||
smlal v2.4s, v20.4h, v1.h[4]
|
smlal v2.4s, v20.4h, v1.h[4]
|
||||||
smlal v2.4s, v21.4h, v1.h[5]
|
smlal v2.4s, v21.4h, v1.h[5]
|
||||||
smlal v2.4s, v22.4h, v1.h[6]
|
smlal v2.4s, v22.4h, v1.h[6]
|
||||||
smlal v2.4s, v23.4h, v1.h[7]
|
smlal v2.4s, v28.4h, v1.h[7]
|
||||||
|
|
||||||
sqrshrn v2.4h, v2.4s, #\shift_hv
|
sqrshrn v2.4h, v2.4s, #\shift_hv
|
||||||
sqxtun v2.8b, v2.8h
|
sqxtun v2.8b, v2.8h
|
||||||
@@ -1977,7 +1975,7 @@ L(\type\()_8tap_hv):
|
|||||||
mov v18.8b, v20.8b
|
mov v18.8b, v20.8b
|
||||||
mov v19.8b, v21.8b
|
mov v19.8b, v21.8b
|
||||||
mov v20.8b, v22.8b
|
mov v20.8b, v22.8b
|
||||||
mov v21.8b, v23.8b
|
mov v21.8b, v28.8b
|
||||||
b 28b
|
b 28b
|
||||||
|
|
||||||
0:
|
0:
|
||||||
|
|||||||
92
third_party/dav1d/src/arm/64/mc16.S
vendored
92
third_party/dav1d/src/arm/64/mc16.S
vendored
@@ -1004,11 +1004,11 @@ function put_neon
|
|||||||
b.gt 2b
|
b.gt 2b
|
||||||
ret
|
ret
|
||||||
4:
|
4:
|
||||||
ld1 {v0.8b}, [x2], x3
|
ld1 {v0.4h}, [x2], x3
|
||||||
ld1 {v1.8b}, [x2], x3
|
ld1 {v1.4h}, [x2], x3
|
||||||
subs w5, w5, #2
|
subs w5, w5, #2
|
||||||
st1 {v0.8b}, [x0], x1
|
st1 {v0.4h}, [x0], x1
|
||||||
st1 {v1.8b}, [x0], x1
|
st1 {v1.4h}, [x0], x1
|
||||||
b.gt 4b
|
b.gt 4b
|
||||||
ret
|
ret
|
||||||
80:
|
80:
|
||||||
@@ -1017,11 +1017,11 @@ function put_neon
|
|||||||
add x9, x2, x3
|
add x9, x2, x3
|
||||||
lsl x3, x3, #1
|
lsl x3, x3, #1
|
||||||
8:
|
8:
|
||||||
ld1 {v0.16b}, [x2], x3
|
ld1 {v0.8h}, [x2], x3
|
||||||
ld1 {v1.16b}, [x9], x3
|
ld1 {v1.8h}, [x9], x3
|
||||||
subs w5, w5, #2
|
subs w5, w5, #2
|
||||||
st1 {v0.16b}, [x0], x1
|
st1 {v0.8h}, [x0], x1
|
||||||
st1 {v1.16b}, [x8], x1
|
st1 {v1.8h}, [x8], x1
|
||||||
b.gt 8b
|
b.gt 8b
|
||||||
ret
|
ret
|
||||||
16:
|
16:
|
||||||
@@ -2039,7 +2039,6 @@ L(\type\()_8tap_hv):
|
|||||||
sxtl v0.8h, v0.8b
|
sxtl v0.8h, v0.8b
|
||||||
sxtl v1.8h, v1.8b
|
sxtl v1.8h, v1.8b
|
||||||
mov x15, x30
|
mov x15, x30
|
||||||
sxtl v1.4s, v1.4h
|
|
||||||
|
|
||||||
ld1 {v27.8h}, [\src], \s_strd
|
ld1 {v27.8h}, [\src], \s_strd
|
||||||
ext v28.16b, v27.16b, v27.16b, #2
|
ext v28.16b, v27.16b, v27.16b, #2
|
||||||
@@ -2049,19 +2048,23 @@ L(\type\()_8tap_hv):
|
|||||||
addp v16.4s, v27.4s, v27.4s
|
addp v16.4s, v27.4s, v27.4s
|
||||||
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
|
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
|
||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
|
// The intermediates from the horizontal pass fit in 16 bit without
|
||||||
|
// any bias; we could just as well keep them as .4s, but narrowing
|
||||||
|
// them to .4h gives a significant speedup on out of order cores
|
||||||
|
// (at the cost of a smaller slowdown on in-order cores such as A53).
|
||||||
|
xtn v16.4h, v16.4s
|
||||||
|
|
||||||
trn1 v16.2d, v16.2d, v24.2d
|
trn1 v16.2s, v16.2s, v24.2s
|
||||||
mov v17.16b, v24.16b
|
mov v17.8b, v24.8b
|
||||||
|
|
||||||
2:
|
2:
|
||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
|
|
||||||
ext v18.16b, v17.16b, v24.16b, #8
|
ext v18.8b, v17.8b, v24.8b, #4
|
||||||
mov v19.16b, v24.16b
|
smull v2.4s, v16.4h, v1.h[0]
|
||||||
mul v2.4s, v16.4s, v1.s[0]
|
smlal v2.4s, v17.4h, v1.h[1]
|
||||||
mla v2.4s, v17.4s, v1.s[1]
|
smlal v2.4s, v18.4h, v1.h[2]
|
||||||
mla v2.4s, v18.4s, v1.s[2]
|
smlal v2.4s, v24.4h, v1.h[3]
|
||||||
mla v2.4s, v19.4s, v1.s[3]
|
|
||||||
|
|
||||||
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
|
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
|
||||||
sqxtun v2.4h, v2.4s
|
sqxtun v2.4h, v2.4s
|
||||||
@@ -2070,8 +2073,8 @@ L(\type\()_8tap_hv):
|
|||||||
st1 {v2.s}[0], [\dst], \d_strd
|
st1 {v2.s}[0], [\dst], \d_strd
|
||||||
st1 {v2.s}[1], [\ds2], \d_strd
|
st1 {v2.s}[1], [\ds2], \d_strd
|
||||||
b.le 0f
|
b.le 0f
|
||||||
mov v16.16b, v18.16b
|
mov v16.8b, v18.8b
|
||||||
mov v17.16b, v19.16b
|
mov v17.8b, v24.8b
|
||||||
b 2b
|
b 2b
|
||||||
|
|
||||||
280: // 2x8, 2x16, 2x32 hv
|
280: // 2x8, 2x16, 2x32 hv
|
||||||
@@ -2085,8 +2088,6 @@ L(\type\()_8tap_hv):
|
|||||||
sxtl v0.8h, v0.8b
|
sxtl v0.8h, v0.8b
|
||||||
sxtl v1.8h, v1.8b
|
sxtl v1.8h, v1.8b
|
||||||
mov x15, x30
|
mov x15, x30
|
||||||
sxtl2 v2.4s, v1.8h
|
|
||||||
sxtl v1.4s, v1.4h
|
|
||||||
|
|
||||||
ld1 {v27.8h}, [\src], \s_strd
|
ld1 {v27.8h}, [\src], \s_strd
|
||||||
ext v28.16b, v27.16b, v27.16b, #2
|
ext v28.16b, v27.16b, v27.16b, #2
|
||||||
@@ -2095,29 +2096,33 @@ L(\type\()_8tap_hv):
|
|||||||
addp v27.4s, v27.4s, v28.4s
|
addp v27.4s, v27.4s, v28.4s
|
||||||
addp v16.4s, v27.4s, v27.4s
|
addp v16.4s, v27.4s, v27.4s
|
||||||
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
|
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
|
||||||
|
// The intermediates from the horizontal pass fit in 16 bit without
|
||||||
|
// any bias; we could just as well keep them as .4s, but narrowing
|
||||||
|
// them to .4h gives a significant speedup on out of order cores
|
||||||
|
// (at the cost of a smaller slowdown on in-order cores such as A53).
|
||||||
|
|
||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
trn1 v16.2d, v16.2d, v24.2d
|
xtn v16.4h, v16.4s
|
||||||
mov v17.16b, v24.16b
|
trn1 v16.2s, v16.2s, v24.2s
|
||||||
|
mov v17.8b, v24.8b
|
||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
ext v18.16b, v17.16b, v24.16b, #8
|
ext v18.8b, v17.8b, v24.8b, #4
|
||||||
mov v19.16b, v24.16b
|
mov v19.8b, v24.8b
|
||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
ext v20.16b, v19.16b, v24.16b, #8
|
ext v20.8b, v19.8b, v24.8b, #4
|
||||||
mov v21.16b, v24.16b
|
mov v21.8b, v24.8b
|
||||||
|
|
||||||
28:
|
28:
|
||||||
bl L(\type\()_8tap_filter_2)
|
bl L(\type\()_8tap_filter_2)
|
||||||
ext v22.16b, v21.16b, v24.16b, #8
|
ext v22.8b, v21.8b, v24.8b, #4
|
||||||
mov v23.16b, v24.16b
|
smull v3.4s, v16.4h, v1.h[0]
|
||||||
mul v3.4s, v16.4s, v1.s[0]
|
smlal v3.4s, v17.4h, v1.h[1]
|
||||||
mla v3.4s, v17.4s, v1.s[1]
|
smlal v3.4s, v18.4h, v1.h[2]
|
||||||
mla v3.4s, v18.4s, v1.s[2]
|
smlal v3.4s, v19.4h, v1.h[3]
|
||||||
mla v3.4s, v19.4s, v1.s[3]
|
smlal v3.4s, v20.4h, v1.h[4]
|
||||||
mla v3.4s, v20.4s, v2.s[0]
|
smlal v3.4s, v21.4h, v1.h[5]
|
||||||
mla v3.4s, v21.4s, v2.s[1]
|
smlal v3.4s, v22.4h, v1.h[6]
|
||||||
mla v3.4s, v22.4s, v2.s[2]
|
smlal v3.4s, v24.4h, v1.h[7]
|
||||||
mla v3.4s, v23.4s, v2.s[3]
|
|
||||||
|
|
||||||
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
|
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
|
||||||
sqxtun v3.4h, v3.4s
|
sqxtun v3.4h, v3.4s
|
||||||
@@ -2126,12 +2131,12 @@ L(\type\()_8tap_hv):
|
|||||||
st1 {v3.s}[0], [\dst], \d_strd
|
st1 {v3.s}[0], [\dst], \d_strd
|
||||||
st1 {v3.s}[1], [\ds2], \d_strd
|
st1 {v3.s}[1], [\ds2], \d_strd
|
||||||
b.le 0f
|
b.le 0f
|
||||||
mov v16.16b, v18.16b
|
mov v16.8b, v18.8b
|
||||||
mov v17.16b, v19.16b
|
mov v17.8b, v19.8b
|
||||||
mov v18.16b, v20.16b
|
mov v18.8b, v20.8b
|
||||||
mov v19.16b, v21.16b
|
mov v19.8b, v21.8b
|
||||||
mov v20.16b, v22.16b
|
mov v20.8b, v22.8b
|
||||||
mov v21.16b, v23.16b
|
mov v21.8b, v24.8b
|
||||||
b 28b
|
b 28b
|
||||||
|
|
||||||
0:
|
0:
|
||||||
@@ -2151,6 +2156,7 @@ L(\type\()_8tap_filter_2):
|
|||||||
smlal v24.4s, v27.4h, v0.h[2]
|
smlal v24.4s, v27.4h, v0.h[2]
|
||||||
smlal v24.4s, v28.4h, v0.h[3]
|
smlal v24.4s, v28.4h, v0.h[3]
|
||||||
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
|
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
|
||||||
|
xtn v24.4h, v24.4s
|
||||||
ret
|
ret
|
||||||
.endif
|
.endif
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,6 @@
|
|||||||
#include "src/looprestoration.h"
|
#include "src/looprestoration.h"
|
||||||
#include "src/tables.h"
|
#include "src/tables.h"
|
||||||
|
|
||||||
#if BITDEPTH == 8 || ARCH_AARCH64
|
|
||||||
// The 8bpc version calculates things slightly differently than the reference
|
// The 8bpc version calculates things slightly differently than the reference
|
||||||
// C version. That version calculates roughly this:
|
// C version. That version calculates roughly this:
|
||||||
// int16_t sum = 0;
|
// int16_t sum = 0;
|
||||||
@@ -105,6 +104,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if BITDEPTH == 8 || ARCH_AARCH64
|
||||||
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
|
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
|
||||||
const pixel (*left)[4],
|
const pixel (*left)[4],
|
||||||
const pixel *src, const ptrdiff_t stride,
|
const pixel *src, const ptrdiff_t stride,
|
||||||
@@ -290,8 +290,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont
|
|||||||
|
|
||||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||||
|
|
||||||
#if BITDEPTH == 8 || ARCH_AARCH64
|
|
||||||
c->wiener = wiener_filter_neon;
|
c->wiener = wiener_filter_neon;
|
||||||
|
#if BITDEPTH == 8 || ARCH_AARCH64
|
||||||
if (bpc <= 10)
|
if (bpc <= 10)
|
||||||
c->selfguided = sgr_filter_neon;
|
c->selfguided = sgr_filter_neon;
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
2
third_party/dav1d/src/arm/mc_init_tmpl.c
vendored
2
third_party/dav1d/src/arm/mc_init_tmpl.c
vendored
@@ -77,7 +77,6 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
|
|||||||
|
|
||||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||||
|
|
||||||
#if BITDEPTH == 8 || ARCH_AARCH64
|
|
||||||
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
|
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
|
||||||
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
|
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
|
||||||
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
|
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
|
||||||
@@ -103,6 +102,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
|
|||||||
c->avg = BF(dav1d_avg, neon);
|
c->avg = BF(dav1d_avg, neon);
|
||||||
c->w_avg = BF(dav1d_w_avg, neon);
|
c->w_avg = BF(dav1d_w_avg, neon);
|
||||||
c->mask = BF(dav1d_mask, neon);
|
c->mask = BF(dav1d_mask, neon);
|
||||||
|
#if BITDEPTH == 8 || ARCH_AARCH64
|
||||||
c->blend = BF(dav1d_blend, neon);
|
c->blend = BF(dav1d_blend, neon);
|
||||||
c->blend_h = BF(dav1d_blend_h, neon);
|
c->blend_h = BF(dav1d_blend_h, neon);
|
||||||
c->blend_v = BF(dav1d_blend_v, neon);
|
c->blend_v = BF(dav1d_blend_v, neon);
|
||||||
|
|||||||
16
third_party/dav1d/src/decode.c
vendored
16
third_party/dav1d/src/decode.c
vendored
@@ -773,10 +773,10 @@ static int decode_b(Dav1dTileContext *const t,
|
|||||||
signabs(t->warpmv.matrix[3]),
|
signabs(t->warpmv.matrix[3]),
|
||||||
signabs(t->warpmv.matrix[4]),
|
signabs(t->warpmv.matrix[4]),
|
||||||
signabs(t->warpmv.matrix[5]),
|
signabs(t->warpmv.matrix[5]),
|
||||||
signabs(t->warpmv.alpha),
|
signabs(t->warpmv.u.p.alpha),
|
||||||
signabs(t->warpmv.beta),
|
signabs(t->warpmv.u.p.beta),
|
||||||
signabs(t->warpmv.gamma),
|
signabs(t->warpmv.u.p.gamma),
|
||||||
signabs(t->warpmv.delta),
|
signabs(t->warpmv.u.p.delta),
|
||||||
b->mv2d.y, b->mv2d.x);
|
b->mv2d.y, b->mv2d.x);
|
||||||
#undef signabs
|
#undef signabs
|
||||||
}
|
}
|
||||||
@@ -1843,10 +1843,10 @@ static int decode_b(Dav1dTileContext *const t,
|
|||||||
signabs(t->warpmv.matrix[3]),
|
signabs(t->warpmv.matrix[3]),
|
||||||
signabs(t->warpmv.matrix[4]),
|
signabs(t->warpmv.matrix[4]),
|
||||||
signabs(t->warpmv.matrix[5]),
|
signabs(t->warpmv.matrix[5]),
|
||||||
signabs(t->warpmv.alpha),
|
signabs(t->warpmv.u.p.alpha),
|
||||||
signabs(t->warpmv.beta),
|
signabs(t->warpmv.u.p.beta),
|
||||||
signabs(t->warpmv.gamma),
|
signabs(t->warpmv.u.p.gamma),
|
||||||
signabs(t->warpmv.delta),
|
signabs(t->warpmv.u.p.delta),
|
||||||
b->mv[0].y, b->mv[0].x);
|
b->mv[0].y, b->mv[0].x);
|
||||||
#undef signabs
|
#undef signabs
|
||||||
if (f->frame_thread.pass) {
|
if (f->frame_thread.pass) {
|
||||||
|
|||||||
37
third_party/dav1d/src/meson.build
vendored
37
third_party/dav1d/src/meson.build
vendored
@@ -82,7 +82,7 @@ libdav1d_entrypoints_sources = files(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# ASM specific sources
|
# ASM specific sources
|
||||||
libdav1d_nasm_objs = []
|
libdav1d_asm_objs = []
|
||||||
# Arch-specific flags
|
# Arch-specific flags
|
||||||
arch_flags = []
|
arch_flags = []
|
||||||
if is_asm_enabled
|
if is_asm_enabled
|
||||||
@@ -102,7 +102,7 @@ if is_asm_enabled
|
|||||||
)
|
)
|
||||||
if (host_machine.cpu_family() == 'aarch64' or
|
if (host_machine.cpu_family() == 'aarch64' or
|
||||||
host_machine.cpu() == 'arm64')
|
host_machine.cpu() == 'arm64')
|
||||||
libdav1d_sources += files(
|
libdav1d_sources_asm = files(
|
||||||
# itx.S is used for both 8 and 16 bpc.
|
# itx.S is used for both 8 and 16 bpc.
|
||||||
'arm/64/itx.S',
|
'arm/64/itx.S',
|
||||||
'arm/64/looprestoration_common.S',
|
'arm/64/looprestoration_common.S',
|
||||||
@@ -110,7 +110,7 @@ if is_asm_enabled
|
|||||||
)
|
)
|
||||||
|
|
||||||
if dav1d_bitdepths.contains('8')
|
if dav1d_bitdepths.contains('8')
|
||||||
libdav1d_sources += files(
|
libdav1d_sources_asm += files(
|
||||||
'arm/64/cdef.S',
|
'arm/64/cdef.S',
|
||||||
'arm/64/ipred.S',
|
'arm/64/ipred.S',
|
||||||
'arm/64/loopfilter.S',
|
'arm/64/loopfilter.S',
|
||||||
@@ -120,7 +120,7 @@ if is_asm_enabled
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
if dav1d_bitdepths.contains('16')
|
if dav1d_bitdepths.contains('16')
|
||||||
libdav1d_sources += files(
|
libdav1d_sources_asm += files(
|
||||||
'arm/64/cdef16.S',
|
'arm/64/cdef16.S',
|
||||||
'arm/64/ipred16.S',
|
'arm/64/ipred16.S',
|
||||||
'arm/64/itx16.S',
|
'arm/64/itx16.S',
|
||||||
@@ -130,12 +130,12 @@ if is_asm_enabled
|
|||||||
)
|
)
|
||||||
endif
|
endif
|
||||||
elif host_machine.cpu_family().startswith('arm')
|
elif host_machine.cpu_family().startswith('arm')
|
||||||
libdav1d_sources += files(
|
libdav1d_sources_asm = files(
|
||||||
'arm/32/msac.S',
|
'arm/32/msac.S',
|
||||||
)
|
)
|
||||||
|
|
||||||
if dav1d_bitdepths.contains('8')
|
if dav1d_bitdepths.contains('8')
|
||||||
libdav1d_sources += files(
|
libdav1d_sources_asm += files(
|
||||||
'arm/32/cdef.S',
|
'arm/32/cdef.S',
|
||||||
'arm/32/ipred.S',
|
'arm/32/ipred.S',
|
||||||
'arm/32/itx.S',
|
'arm/32/itx.S',
|
||||||
@@ -146,10 +146,18 @@ if is_asm_enabled
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
if dav1d_bitdepths.contains('16')
|
if dav1d_bitdepths.contains('16')
|
||||||
libdav1d_sources += files(
|
libdav1d_sources_asm += files(
|
||||||
|
'arm/32/looprestoration16.S',
|
||||||
|
'arm/32/mc16.S',
|
||||||
)
|
)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
if use_gaspp
|
||||||
|
libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm)
|
||||||
|
else
|
||||||
|
libdav1d_sources += libdav1d_sources_asm
|
||||||
|
endif
|
||||||
elif host_machine.cpu_family().startswith('x86')
|
elif host_machine.cpu_family().startswith('x86')
|
||||||
|
|
||||||
libdav1d_sources += files(
|
libdav1d_sources += files(
|
||||||
@@ -200,7 +208,7 @@ if is_asm_enabled
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
# Compile the ASM sources with NASM
|
# Compile the ASM sources with NASM
|
||||||
libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
|
libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
|
||||||
elif host_machine.cpu() == 'ppc64le'
|
elif host_machine.cpu() == 'ppc64le'
|
||||||
arch_flags = ['-maltivec', '-mvsx']
|
arch_flags = ['-maltivec', '-mvsx']
|
||||||
libdav1d_sources += files(
|
libdav1d_sources += files(
|
||||||
@@ -222,17 +230,6 @@ api_export_flags = []
|
|||||||
#
|
#
|
||||||
|
|
||||||
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
|
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
|
||||||
rc_version_array = meson.project_version().split('.')
|
|
||||||
winmod = import('windows')
|
|
||||||
rc_data = configuration_data()
|
|
||||||
rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
|
|
||||||
rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
|
|
||||||
rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
|
|
||||||
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
|
|
||||||
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
|
|
||||||
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
|
|
||||||
rc_data.set('COPYRIGHT_YEARS', '2019')
|
|
||||||
|
|
||||||
rc_file = configure_file(
|
rc_file = configure_file(
|
||||||
input : 'dav1d.rc.in',
|
input : 'dav1d.rc.in',
|
||||||
output : 'dav1d.rc',
|
output : 'dav1d.rc',
|
||||||
@@ -301,7 +298,7 @@ endif
|
|||||||
|
|
||||||
libdav1d = library('dav1d',
|
libdav1d = library('dav1d',
|
||||||
libdav1d_sources,
|
libdav1d_sources,
|
||||||
libdav1d_nasm_objs,
|
libdav1d_asm_objs,
|
||||||
libdav1d_rc_obj,
|
libdav1d_rc_obj,
|
||||||
|
|
||||||
objects : [
|
objects : [
|
||||||
|
|||||||
2
third_party/dav1d/src/obu.c
vendored
2
third_party/dav1d/src/obu.c
vendored
@@ -112,6 +112,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
|
|||||||
struct Dav1dSequenceHeaderOperatingPoint *const op =
|
struct Dav1dSequenceHeaderOperatingPoint *const op =
|
||||||
&hdr->operating_points[i];
|
&hdr->operating_points[i];
|
||||||
op->idc = dav1d_get_bits(gb, 12);
|
op->idc = dav1d_get_bits(gb, 12);
|
||||||
|
if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00)))
|
||||||
|
goto error;
|
||||||
op->major_level = 2 + dav1d_get_bits(gb, 3);
|
op->major_level = 2 + dav1d_get_bits(gb, 3);
|
||||||
op->minor_level = dav1d_get_bits(gb, 2);
|
op->minor_level = dav1d_get_bits(gb, 2);
|
||||||
op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0;
|
op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0;
|
||||||
|
|||||||
12
third_party/dav1d/src/recon_tmpl.c
vendored
12
third_party/dav1d/src/recon_tmpl.c
vendored
@@ -1082,11 +1082,11 @@ static int warp_affine(Dav1dTileContext *const t,
|
|||||||
const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
|
const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
|
||||||
|
|
||||||
const int dx = (int) (mvx >> 16) - 4;
|
const int dx = (int) (mvx >> 16) - 4;
|
||||||
const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
|
const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
|
||||||
wmp->beta * 7) & ~0x3f;
|
wmp->u.p.beta * 7) & ~0x3f;
|
||||||
const int dy = (int) (mvy >> 16) - 4;
|
const int dy = (int) (mvy >> 16) - 4;
|
||||||
const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 -
|
const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
|
||||||
wmp->delta * 4) & ~0x3f;
|
wmp->u.p.delta * 4) & ~0x3f;
|
||||||
|
|
||||||
const pixel *ref_ptr;
|
const pixel *ref_ptr;
|
||||||
ptrdiff_t ref_stride = refp->p.stride[!!pl];
|
ptrdiff_t ref_stride = refp->p.stride[!!pl];
|
||||||
@@ -1108,10 +1108,10 @@ static int warp_affine(Dav1dTileContext *const t,
|
|||||||
}
|
}
|
||||||
if (dst16 != NULL)
|
if (dst16 != NULL)
|
||||||
dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
|
dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
|
||||||
wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
|
wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
|
||||||
else
|
else
|
||||||
dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
|
dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
|
||||||
wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
|
wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
|
||||||
}
|
}
|
||||||
if (dst8) dst8 += 8 * PXSTRIDE(dstride);
|
if (dst8) dst8 += 8 * PXSTRIDE(dstride);
|
||||||
else dst16 += 8 * dstride;
|
else dst16 += 8 * dstride;
|
||||||
|
|||||||
8
third_party/dav1d/src/tables.c
vendored
8
third_party/dav1d/src/tables.c
vendored
@@ -391,10 +391,10 @@ const Dav1dWarpedMotionParams dav1d_default_wm_params = {
|
|||||||
0, 0, 1 << 16,
|
0, 0, 1 << 16,
|
||||||
0, 0, 1 << 16,
|
0, 0, 1 << 16,
|
||||||
},
|
},
|
||||||
.alpha = 0,
|
.u.p.alpha = 0,
|
||||||
.beta = 0,
|
.u.p.beta = 0,
|
||||||
.gamma = 0,
|
.u.p.gamma = 0,
|
||||||
.delta = 0,
|
.u.p.delta = 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
|
const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
|
||||||
|
|||||||
12
third_party/dav1d/src/warpmv.c
vendored
12
third_party/dav1d/src/warpmv.c
vendored
@@ -82,21 +82,21 @@ int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) {
|
|||||||
|
|
||||||
if (mat[2] <= 0) return 1;
|
if (mat[2] <= 0) return 1;
|
||||||
|
|
||||||
wm->alpha = iclip_wmp(mat[2] - 0x10000);
|
wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000);
|
||||||
wm->beta = iclip_wmp(mat[3]);
|
wm->u.p.beta = iclip_wmp(mat[3]);
|
||||||
|
|
||||||
int shift;
|
int shift;
|
||||||
const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
|
const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
|
||||||
const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
|
const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
|
||||||
const int rnd = (1 << shift) >> 1;
|
const int rnd = (1 << shift) >> 1;
|
||||||
wm->gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
|
wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
|
||||||
const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
|
const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
|
||||||
wm->delta = iclip_wmp(mat[5] -
|
wm->u.p.delta = iclip_wmp(mat[5] -
|
||||||
apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
|
apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
|
||||||
0x10000);
|
0x10000);
|
||||||
|
|
||||||
return (4 * abs(wm->alpha) + 7 * abs(wm->beta) >= 0x10000) ||
|
return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) ||
|
||||||
(4 * abs(wm->gamma) + 4 * abs(wm->delta) >= 0x10000);
|
(4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int resolve_divisor_64(const uint64_t d, int *const shift) {
|
static int resolve_divisor_64(const uint64_t d, int *const shift) {
|
||||||
|
|||||||
734
third_party/dav1d/src/x86/mc_avx2.asm
vendored
734
third_party/dav1d/src/x86/mc_avx2.asm
vendored
File diff suppressed because it is too large
Load Diff
1600
third_party/dav1d/src/x86/mc_sse.asm
vendored
1600
third_party/dav1d/src/x86/mc_sse.asm
vendored
File diff suppressed because it is too large
Load Diff
33
third_party/dav1d/tests/header_test.c
vendored
Normal file
33
third_party/dav1d/tests/header_test.c
vendored
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
/*
|
||||||
|
* Copyright © 2018, VideoLAN and dav1d authors
|
||||||
|
* Copyright © 2018, Two Orioles, LLC
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||||
|
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include DAV1D_TEST_HEADER
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
36
third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c
vendored
36
third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c
vendored
@@ -31,6 +31,7 @@
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
#include <dav1d/dav1d.h>
|
#include <dav1d/dav1d.h>
|
||||||
#include "src/cpu.h"
|
#include "src/cpu.h"
|
||||||
@@ -38,8 +39,6 @@
|
|||||||
|
|
||||||
#ifdef DAV1D_ALLOC_FAIL
|
#ifdef DAV1D_ALLOC_FAIL
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
#include "alloc_fail.h"
|
#include "alloc_fail.h"
|
||||||
|
|
||||||
static unsigned djb_xor(const uint8_t * c, size_t len) {
|
static unsigned djb_xor(const uint8_t * c, size_t len) {
|
||||||
@@ -56,6 +55,39 @@ static unsigned r32le(const uint8_t *const p) {
|
|||||||
|
|
||||||
#define DAV1D_FUZZ_MAX_SIZE 4096 * 4096
|
#define DAV1D_FUZZ_MAX_SIZE 4096 * 4096
|
||||||
|
|
||||||
|
// search for "--cpumask xxx" in argv and remove both parameters
|
||||||
|
int LLVMFuzzerInitialize(int *argc, char ***argv) {
|
||||||
|
int i = 1;
|
||||||
|
for (; i < *argc; i++) {
|
||||||
|
if (!strcmp((*argv)[i], "--cpumask")) {
|
||||||
|
const char * cpumask = (*argv)[i+1];
|
||||||
|
if (cpumask) {
|
||||||
|
char *end;
|
||||||
|
unsigned res;
|
||||||
|
if (!strncmp(cpumask, "0x", 2)) {
|
||||||
|
cpumask += 2;
|
||||||
|
res = (unsigned) strtoul(cpumask, &end, 16);
|
||||||
|
} else {
|
||||||
|
res = (unsigned) strtoul(cpumask, &end, 0);
|
||||||
|
}
|
||||||
|
if (end != cpumask && !end[0]) {
|
||||||
|
dav1d_set_cpu_flags_mask(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; i < *argc - 2; i++) {
|
||||||
|
(*argv)[i] = (*argv)[i + 2];
|
||||||
|
}
|
||||||
|
|
||||||
|
*argc = i;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// expects ivf input
|
// expects ivf input
|
||||||
|
|
||||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
|
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
|
||||||
|
|||||||
@@ -31,6 +31,7 @@
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
int LLVMFuzzerInitialize(int *argc, char ***argv);
|
||||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
|
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
|
||||||
|
|
||||||
#endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */
|
#endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */
|
||||||
|
|||||||
6
third_party/dav1d/tests/libfuzzer/main.c
vendored
6
third_party/dav1d/tests/libfuzzer/main.c
vendored
@@ -40,7 +40,7 @@
|
|||||||
|
|
||||||
// expects ivf input
|
// expects ivf input
|
||||||
|
|
||||||
int main(const int argc, char *const *const argv) {
|
int main(int argc, char *argv[]) {
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
FILE *f = NULL;
|
FILE *f = NULL;
|
||||||
int64_t fsize;
|
int64_t fsize;
|
||||||
@@ -48,6 +48,10 @@ int main(const int argc, char *const *const argv) {
|
|||||||
uint8_t *data = NULL;
|
uint8_t *data = NULL;
|
||||||
size_t size = 0;
|
size_t size = 0;
|
||||||
|
|
||||||
|
if (LLVMFuzzerInitialize(&argc, &argv)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (argc != 2) {
|
if (argc != 2) {
|
||||||
fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]);
|
fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]);
|
||||||
return -1;
|
return -1;
|
||||||
|
|||||||
50
third_party/dav1d/tests/meson.build
vendored
50
third_party/dav1d/tests/meson.build
vendored
@@ -31,8 +31,6 @@ if not get_option('enable_tests')
|
|||||||
subdir_done()
|
subdir_done()
|
||||||
endif
|
endif
|
||||||
|
|
||||||
libdav1d_nasm_objs_if_needed = []
|
|
||||||
|
|
||||||
if is_asm_enabled
|
if is_asm_enabled
|
||||||
checkasm_sources = files(
|
checkasm_sources = files(
|
||||||
'checkasm/checkasm.c',
|
'checkasm/checkasm.c',
|
||||||
@@ -62,25 +60,27 @@ if is_asm_enabled
|
|||||||
checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects()
|
checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects()
|
||||||
endforeach
|
endforeach
|
||||||
|
|
||||||
checkasm_nasm_objs = []
|
checkasm_asm_objs = []
|
||||||
|
checkasm_asm_sources = []
|
||||||
if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64'
|
if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64'
|
||||||
checkasm_sources += files('checkasm/arm/checkasm_64.S')
|
checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
|
||||||
elif host_machine.cpu_family().startswith('arm')
|
elif host_machine.cpu_family().startswith('arm')
|
||||||
checkasm_sources += files('checkasm/arm/checkasm_32.S')
|
checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
|
||||||
elif host_machine.cpu_family().startswith('x86')
|
elif host_machine.cpu_family().startswith('x86')
|
||||||
checkasm_nasm_objs = nasm_gen.process(files('checkasm/x86/checkasm.asm'))
|
checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
|
||||||
|
endif
|
||||||
|
|
||||||
|
if use_gaspp
|
||||||
|
checkasm_asm_objs += gaspp_gen.process(checkasm_asm_sources)
|
||||||
|
else
|
||||||
|
checkasm_sources += checkasm_asm_sources
|
||||||
endif
|
endif
|
||||||
|
|
||||||
m_lib = cc.find_library('m', required: false)
|
m_lib = cc.find_library('m', required: false)
|
||||||
|
|
||||||
if meson.version().version_compare('< 0.48.999')
|
|
||||||
libdav1d_nasm_objs_if_needed = libdav1d_nasm_objs
|
|
||||||
endif
|
|
||||||
|
|
||||||
checkasm = executable('checkasm',
|
checkasm = executable('checkasm',
|
||||||
checkasm_sources,
|
checkasm_sources,
|
||||||
checkasm_nasm_objs,
|
checkasm_asm_objs,
|
||||||
libdav1d_nasm_objs_if_needed,
|
|
||||||
|
|
||||||
objects: [
|
objects: [
|
||||||
checkasm_bitdepth_objs,
|
checkasm_bitdepth_objs,
|
||||||
@@ -101,10 +101,30 @@ if is_asm_enabled
|
|||||||
test('checkasm', checkasm, is_parallel: false)
|
test('checkasm', checkasm, is_parallel: false)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
c99_extension_flag = cc.first_supported_argument(
|
||||||
|
'-Werror=c11-extensions',
|
||||||
|
'-Werror=c99-c11-compat',
|
||||||
|
'-Wc11-extensions',
|
||||||
|
'-Wc99-c11-compat',
|
||||||
|
)
|
||||||
|
|
||||||
|
# dav1d_api_headers
|
||||||
|
foreach header : dav1d_api_headers
|
||||||
|
target = header + '_test'
|
||||||
|
|
||||||
|
header_test_exe = executable(target,
|
||||||
|
'header_test.c',
|
||||||
|
include_directories: dav1d_inc_dirs,
|
||||||
|
c_args: ['-DDAV1D_TEST_HEADER="@0@"'.format(header), c99_extension_flag],
|
||||||
|
build_by_default: true
|
||||||
|
)
|
||||||
|
|
||||||
|
test(target, header_test_exe)
|
||||||
|
endforeach
|
||||||
|
|
||||||
|
|
||||||
# fuzzing binaries
|
# fuzzing binaries
|
||||||
if meson.version().version_compare('>=0.49')
|
subdir('libfuzzer')
|
||||||
subdir('libfuzzer')
|
|
||||||
endif
|
|
||||||
|
|
||||||
# Include dav1d test data repository with additional tests
|
# Include dav1d test data repository with additional tests
|
||||||
if get_option('testdata_tests')
|
if get_option('testdata_tests')
|
||||||
|
|||||||
12
third_party/dav1d/tools/dav1d.c
vendored
12
third_party/dav1d/tools/dav1d.c
vendored
@@ -124,11 +124,15 @@ static void print_stats(const int istty, const unsigned n, const unsigned num,
|
|||||||
else
|
else
|
||||||
b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
|
b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
|
||||||
n, num, 100.0 * n / num);
|
n, num, 100.0 * n / num);
|
||||||
if (i_fps && b < end) {
|
if (b < end) {
|
||||||
const double d_fps = 1e9 * n / elapsed;
|
const double d_fps = 1e9 * n / elapsed;
|
||||||
const double speed = d_fps / i_fps;
|
if (i_fps) {
|
||||||
b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
|
const double speed = d_fps / i_fps;
|
||||||
d_fps, i_fps, speed);
|
b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
|
||||||
|
d_fps, i_fps, speed);
|
||||||
|
} else {
|
||||||
|
b += snprintf(b, end - b, " - %.2lf fps", d_fps);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (!istty)
|
if (!istty)
|
||||||
strcpy(b > end - 2 ? end - 2 : b, "\n");
|
strcpy(b > end - 2 ? end - 2 : b, "\n");
|
||||||
|
|||||||
10
third_party/dav1d/tools/dav1d.manifest
vendored
Normal file
10
third_party/dav1d/tools/dav1d.manifest
vendored
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
|
||||||
|
<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
|
||||||
|
<assemblyIdentity type="win32" name="VideoLAN.dav1d" version="1.0.0.0"/>
|
||||||
|
<application xmlns="urn:schemas-microsoft-com:asm.v3">
|
||||||
|
<windowsSettings>
|
||||||
|
<longPathAware xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">true</longPathAware>
|
||||||
|
<activeCodePage xmlns="http://schemas.microsoft.com/SMI/2019/WindowsSettings">UTF-8</activeCodePage>
|
||||||
|
</windowsSettings>
|
||||||
|
</application>
|
||||||
|
</assembly>
|
||||||
33
third_party/dav1d/tools/dav1d.rc.in
vendored
Normal file
33
third_party/dav1d/tools/dav1d.rc.in
vendored
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
|
||||||
|
#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
|
||||||
|
#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
|
||||||
|
#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
|
||||||
|
|
||||||
|
#include <windows.h>
|
||||||
|
|
||||||
|
1 RT_MANIFEST "dav1d.manifest"
|
||||||
|
1 VERSIONINFO
|
||||||
|
FILETYPE VFT_APP
|
||||||
|
FILEOS VOS_NT_WINDOWS32
|
||||||
|
PRODUCTVERSION PROJECT_VERSION_NUMBER
|
||||||
|
FILEVERSION API_VERSION_NUMBER
|
||||||
|
BEGIN
|
||||||
|
BLOCK "StringFileInfo"
|
||||||
|
BEGIN
|
||||||
|
BLOCK "040904E4"
|
||||||
|
BEGIN
|
||||||
|
VALUE "CompanyName", "VideoLAN"
|
||||||
|
VALUE "ProductName", "dav1d"
|
||||||
|
VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
|
||||||
|
VALUE "FileVersion", API_VERSION_NUMBER_STR
|
||||||
|
VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
|
||||||
|
VALUE "InternalName", "dav1d"
|
||||||
|
VALUE "OriginalFilename", "dav1d.exe"
|
||||||
|
VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
|
||||||
|
END
|
||||||
|
END
|
||||||
|
BLOCK "VarFileInfo"
|
||||||
|
BEGIN
|
||||||
|
VALUE "Translation", 0x409, 1252
|
||||||
|
END
|
||||||
|
END
|
||||||
16
third_party/dav1d/tools/meson.build
vendored
16
third_party/dav1d/tools/meson.build
vendored
@@ -77,8 +77,24 @@ dav1d_sources = files(
|
|||||||
'dav1d_cli_parse.c',
|
'dav1d_cli_parse.c',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if host_machine.system() == 'windows'
|
||||||
|
rc_file = configure_file(
|
||||||
|
input : 'dav1d.rc.in',
|
||||||
|
output : 'dav1d.rc',
|
||||||
|
configuration : rc_data
|
||||||
|
)
|
||||||
|
|
||||||
|
dav1d_rc_obj = winmod.compile_resources(rc_file,
|
||||||
|
depend_files : files('dav1d.manifest'),
|
||||||
|
include_directories : include_directories('.')
|
||||||
|
)
|
||||||
|
else
|
||||||
|
dav1d_rc_obj = []
|
||||||
|
endif
|
||||||
|
|
||||||
dav1d = executable('dav1d',
|
dav1d = executable('dav1d',
|
||||||
dav1d_sources,
|
dav1d_sources,
|
||||||
|
dav1d_rc_obj,
|
||||||
rev_target, cli_config_h_target,
|
rev_target, cli_config_h_target,
|
||||||
|
|
||||||
link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],
|
link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],
|
||||||
|
|||||||
14
third_party/dav1d/tools/output/y4m2.c
vendored
14
third_party/dav1d/tools/output/y4m2.c
vendored
@@ -28,6 +28,7 @@
|
|||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
#include <inttypes.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -77,8 +78,17 @@ static int write_header(Y4m2OutputContext *const c, const Dav1dPicture *const p)
|
|||||||
chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] :
|
chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] :
|
||||||
ss_names[p->p.layout][p->seq_hdr->hbd];
|
ss_names[p->p.layout][p->seq_hdr->hbd];
|
||||||
|
|
||||||
fprintf(c->f, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n",
|
const unsigned fw = p->p.w;
|
||||||
p->p.w, p->p.h, c->fps[0], c->fps[1], ss_name);
|
const unsigned fh = p->p.h;
|
||||||
|
uint64_t aw = (uint64_t)fh * p->frame_hdr->render_width;
|
||||||
|
uint64_t ah = (uint64_t)fw * p->frame_hdr->render_height;
|
||||||
|
uint64_t gcd = ah;
|
||||||
|
for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b);
|
||||||
|
aw /= gcd;
|
||||||
|
ah /= gcd;
|
||||||
|
|
||||||
|
fprintf(c->f, "YUV4MPEG2 W%u H%u F%u:%u Ip A%"PRIu64":%"PRIu64" C%s\n",
|
||||||
|
fw, fh, c->fps[0], c->fps[1], aw, ah, ss_name);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user