From df26df6e40528e96d82c8d3dcea447efaa07322b Mon Sep 17 00:00:00 2001 From: Jon Bauman Date: Tue, 6 Oct 2020 15:53:50 +0000 Subject: [PATCH] Bug 1661093 - Update libdav1d to 0243c3ff for Firefox 82. r=mjf Differential Revision: https://phabricator.services.mozilla.com/D92534 --- media/libdav1d/asm/moz.build | 2 + media/libdav1d/moz.yaml | 4 +- media/libdav1d/vcs_version.h | 2 +- media/libdav1d/version.h | 4 +- third_party/dav1d/CONTRIBUTING.md | 2 +- third_party/dav1d/include/dav1d/dav1d.h | 2 +- third_party/dav1d/include/dav1d/headers.h | 5 +- third_party/dav1d/include/dav1d/meson.build | 14 +- third_party/dav1d/meson.build | 39 +- .../dav1d/src/arm/32/looprestoration.S | 143 +- .../dav1d/src/arm/32/looprestoration16.S | 720 +++++ third_party/dav1d/src/arm/32/mc.S | 42 +- third_party/dav1d/src/arm/32/mc16.S | 2429 +++++++++++++++++ .../dav1d/src/arm/64/looprestoration16.S | 14 +- third_party/dav1d/src/arm/64/mc.S | 10 +- third_party/dav1d/src/arm/64/mc16.S | 92 +- .../dav1d/src/arm/looprestoration_init_tmpl.c | 4 +- third_party/dav1d/src/arm/mc_init_tmpl.c | 2 +- third_party/dav1d/src/decode.c | 16 +- third_party/dav1d/src/meson.build | 37 +- third_party/dav1d/src/obu.c | 2 + third_party/dav1d/src/recon_tmpl.c | 12 +- third_party/dav1d/src/tables.c | 8 +- third_party/dav1d/src/warpmv.c | 12 +- third_party/dav1d/src/x86/mc_avx2.asm | 734 +++-- third_party/dav1d/src/x86/mc_sse.asm | 1600 +++++------ third_party/dav1d/tests/header_test.c | 33 + .../dav1d/tests/libfuzzer/dav1d_fuzzer.c | 36 +- .../dav1d/tests/libfuzzer/dav1d_fuzzer.h | 1 + third_party/dav1d/tests/libfuzzer/main.c | 6 +- third_party/dav1d/tests/meson.build | 50 +- third_party/dav1d/tools/dav1d.c | 12 +- third_party/dav1d/tools/dav1d.manifest | 10 + third_party/dav1d/tools/dav1d.rc.in | 33 + third_party/dav1d/tools/meson.build | 16 + third_party/dav1d/tools/output/y4m2.c | 14 +- 36 files changed, 4671 insertions(+), 1491 deletions(-) create mode 100644 third_party/dav1d/src/arm/32/looprestoration16.S create mode 100644 third_party/dav1d/src/arm/32/mc16.S create mode 100644 third_party/dav1d/tests/header_test.c create mode 100644 third_party/dav1d/tools/dav1d.manifest create mode 100644 third_party/dav1d/tools/dav1d.rc.in diff --git a/media/libdav1d/asm/moz.build b/media/libdav1d/asm/moz.build index 510c1780ab00..6f6e69df9046 100644 --- a/media/libdav1d/asm/moz.build +++ b/media/libdav1d/asm/moz.build @@ -186,7 +186,9 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64': '../../../third_party/dav1d/src/arm/32/itx.S', '../../../third_party/dav1d/src/arm/32/loopfilter.S', '../../../third_party/dav1d/src/arm/32/looprestoration.S', + '../../../third_party/dav1d/src/arm/32/looprestoration16.S', '../../../third_party/dav1d/src/arm/32/mc.S', + '../../../third_party/dav1d/src/arm/32/mc16.S', '../../../third_party/dav1d/src/arm/32/msac.S', ] diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml index 59b9ee72ec11..7f79dd36114c 100644 --- a/media/libdav1d/moz.yaml +++ b/media/libdav1d/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: commit d0e50cacead63e9904dde184580ce9a746374bd5 (2020-08-21T15:13:49.000+02:00). + release: commit 0243c3ffb644e61848b82f24f5e4a7324669d76e (2020-09-27T15:38:45.000+02:00). 
  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
-  revision: d0e50cacead63e9904dde184580ce9a746374bd5
+  revision: 0243c3ffb644e61848b82f24f5e4a7324669d76e
 
  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h
index dc0eb56b7a6a..28c54ea6b281 100644
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "0.7.1-49-gd0e50ca"
+#define DAV1D_VERSION "0.7.1-81-g0243c3f"
diff --git a/media/libdav1d/version.h b/media/libdav1d/version.h
index f6f33e7e9b39..cf6ca7b36f91 100644
--- a/media/libdav1d/version.h
+++ b/media/libdav1d/version.h
@@ -27,8 +27,8 @@
 #ifndef DAV1D_VERSION_H
 #define DAV1D_VERSION_H
 
-#define DAV1D_API_VERSION_MAJOR 4
+#define DAV1D_API_VERSION_MAJOR 5
 #define DAV1D_API_VERSION_MINOR 0
-#define DAV1D_API_VERSION_PATCH 2
+#define DAV1D_API_VERSION_PATCH 0
 
 #endif /* DAV1D_VERSION_H */
diff --git a/third_party/dav1d/CONTRIBUTING.md b/third_party/dav1d/CONTRIBUTING.md
index 347741f32907..cdbc98be5385 100644
--- a/third_party/dav1d/CONTRIBUTING.md
+++ b/third_party/dav1d/CONTRIBUTING.md
@@ -12,7 +12,7 @@ The todo list can be found [on the wiki](https://code.videolan.org/videolan/dav1
 The codebase is developed with the following assumptions:
 
 For the library:
-- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extension,
+- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extensions. Anonymous structures and unions are the only allowed compiler extensions for internal code.
 - x86 asm in .asm files, using the NASM syntax,
 - arm/arm64 in .S files, using the GAS syntax limited to subset llvm 5.0's internal assembler supports,
 - no C++ is allowed, whatever the version.
diff --git a/third_party/dav1d/include/dav1d/dav1d.h b/third_party/dav1d/include/dav1d/dav1d.h
index 32fe8c3cb528..9d484e58e401 100644
--- a/third_party/dav1d/include/dav1d/dav1d.h
+++ b/third_party/dav1d/include/dav1d/dav1d.h
@@ -65,9 +65,9 @@ typedef struct Dav1dSettings {
     int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
     int all_layers; ///< output all spatial layers of a scalable AV1 biststream
     unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
-    uint8_t reserved[32]; ///< reserved for future use
     Dav1dPicAllocator allocator; ///< Picture allocator callback.
     Dav1dLogger logger; ///< Logger callback.
+    uint8_t reserved[32]; ///< reserved for future use
 } Dav1dSettings;
 
 /**
diff --git a/third_party/dav1d/include/dav1d/headers.h b/third_party/dav1d/include/dav1d/headers.h
index f9b89f346969..1fba40358de0 100644
--- a/third_party/dav1d/include/dav1d/headers.h
+++ b/third_party/dav1d/include/dav1d/headers.h
@@ -28,6 +28,7 @@
 #ifndef DAV1D_HEADERS_H
 #define DAV1D_HEADERS_H
 
+#include <stddef.h>
 #include <stdint.h>
 
 // Constants from Section 3. 
"Symbols and abbreviated terms" @@ -95,9 +96,9 @@ typedef struct Dav1dWarpedMotionParams { union { struct { int16_t alpha, beta, gamma, delta; - }; + } p; int16_t abcd[4]; - }; + } u; } Dav1dWarpedMotionParams; enum Dav1dPixelLayout { diff --git a/third_party/dav1d/include/dav1d/meson.build b/third_party/dav1d/include/dav1d/meson.build index b5649d398ba3..68faaf9a3695 100644 --- a/third_party/dav1d/include/dav1d/meson.build +++ b/third_party/dav1d/include/dav1d/meson.build @@ -31,11 +31,15 @@ version_h_target = configure_file(input: 'version.h.in', output: 'version.h', configuration: version_h_data) +dav1d_api_headers = [ + 'common.h', + 'data.h', + 'dav1d.h', + 'headers.h', + 'picture.h', +] + # install headers -install_headers('common.h', - 'data.h', - 'dav1d.h', - 'headers.h', - 'picture.h', +install_headers(dav1d_api_headers, version_h_target, subdir : 'dav1d') diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build index e354f096bf08..4b72388ada56 100644 --- a/third_party/dav1d/meson.build +++ b/third_party/dav1d/meson.build @@ -28,9 +28,9 @@ project('dav1d', ['c'], 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], - meson_version: '>= 0.47.0') + meson_version: '>= 0.49.0') -dav1d_soname_version = '4.0.2' +dav1d_soname_version = '5.0.0' dav1d_api_version_array = dav1d_soname_version.split('.') dav1d_api_version_major = dav1d_api_version_array[0] dav1d_api_version_minor = dav1d_api_version_array[1] @@ -118,6 +118,17 @@ if host_machine.system() == 'windows' thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c')) rt_dependency = [] + + rc_version_array = meson.project_version().split('.') + winmod = import('windows') + rc_data = configuration_data() + rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0]) + rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1]) + rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2]) + rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major) + rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor) + rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision) + rc_data.set('COPYRIGHT_YEARS', '2020') else thread_dependency = dependency('threads') thread_compat_dep = [] @@ -227,7 +238,7 @@ endif # Compiler flags that should be set # But when the compiler does not supports them # it is not an error and silently tolerated -if cc.get_id() != 'msvc' +if cc.get_argument_syntax() != 'msvc' optional_arguments += [ '-Wundef', '-Werror=vla', @@ -426,6 +437,28 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86') ]) endif +use_gaspp = false +if (is_asm_enabled and + (host_machine.cpu_family() == 'aarch64' or + host_machine.cpu_family().startswith('arm')) and + cc.get_argument_syntax() == 'msvc') + gaspp = find_program('gas-preprocessor.pl') + use_gaspp = true + gaspp_gen = generator(gaspp, + output: '@BASENAME@.obj', + arguments: [ + '-as-type', 'armasm', + '-arch', host_machine.cpu_family(), + '--', + host_machine.cpu_family() == 'aarch64' ? 
'armasm64' : 'armasm', + '-nologo', + '-I@0@'.format(dav1d_src_root), + '-I@0@/'.format(meson.current_build_dir()), + '@INPUT@', + '-c', + '-o', '@OUTPUT@' + ]) +endif # Generate config.h config_h_target = configure_file(output: 'config.h', configuration: cdata) diff --git a/third_party/dav1d/src/arm/32/looprestoration.S b/third_party/dav1d/src/arm/32/looprestoration.S index ea32d63f265f..073525a5a5a5 100644 --- a/third_party/dav1d/src/arm/32/looprestoration.S +++ b/third_party/dav1d/src/arm/32/looprestoration.S @@ -40,8 +40,8 @@ function wiener_filter_h_8bpc_neon, export=1 mov r8, r5 vld1.16 {q0}, [r4] movw r9, #(1 << 14) - (1 << 2) - vdup.16 q14, r9 - vmov.s16 q15, #2048 + vdup.16 q14, r9 + vmov.s16 q15, #2048 // Calculate mid_stride add r10, r5, #7 bic r10, r10, #7 @@ -108,8 +108,8 @@ function wiener_filter_h_8bpc_neon, export=1 0: // !LR_HAVE_LEFT, fill q1 with the leftmost byte // and shift q2 to have 3x the first byte at the front. - vdup.8 q1, d4[0] - vdup.8 q8, d18[0] + vdup.8 q1, d4[0] + vdup.8 q8, d18[0] // Move r2 back to account for the last 3 bytes we loaded before, // which we shifted out. sub r2, r2, #3 @@ -127,7 +127,7 @@ function wiener_filter_h_8bpc_neon, export=1 bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub r9, r5, #14 + sub r9, r5, #14 ldrb r11, [r2, r9] ldrb r9, [lr, r9] // Fill q12/q13 with the right padding pixel @@ -144,7 +144,6 @@ function wiener_filter_h_8bpc_neon, export=1 b 6f 4: // Loop horizontally -.macro filter_8 // This is tuned as some sort of compromise between Cortex A7, A8, // A9 and A53. vmul.s16 q3, q1, d0[0] @@ -187,8 +186,6 @@ function wiener_filter_h_8bpc_neon, export=1 vshr.s16 q10, q10, #3 vadd.s16 q3, q3, q15 vadd.s16 q10, q10, q15 -.endm - filter_8 vst1.16 {q3}, [r0, :128]! vst1.16 {q10}, [r12, :128]! 
@@ -206,50 +203,43 @@ function wiener_filter_h_8bpc_neon, export=1 5: // Filter 4 pixels, 7 <= w < 11 .macro filter_4 + vext.8 d20, d2, d3, #2 + vext.8 d21, d2, d3, #4 + vext.8 d22, d2, d3, #6 + vext.8 d23, d3, d4, #2 + vext.8 d8, d3, d4, #4 vmul.s16 d6, d2, d0[0] - vext.8 q10, q1, q2, #2 - vext.8 q11, q1, q2, #4 vmla.s16 d6, d20, d0[1] - vmla.s16 d6, d22, d0[2] - vext.8 q10, q1, q2, #6 - vext.8 q11, q1, q2, #8 - vmla.s16 d6, d20, d0[3] - vmla.s16 d6, d22, d1[0] - vext.8 q10, q1, q2, #10 - vext.8 q11, q1, q2, #12 - vmla.s16 d6, d20, d1[1] - vmla.s16 d6, d22, d1[2] + vmla.s16 d6, d21, d0[2] + vmla.s16 d6, d22, d0[3] + vmla.s16 d6, d3, d1[0] + vmla.s16 d6, d23, d1[1] + vmla.s16 d6, d8, d1[2] - vmul.s16 d20, d16, d0[0] - vext.8 q11, q8, q9, #2 - vext.8 q4, q8, q9, #4 - vmla.s16 d20, d22, d0[1] - vmla.s16 d20, d8, d0[2] - vext.8 q11, q8, q9, #6 - vext.8 q4, q8, q9, #8 - vmla.s16 d20, d22, d0[3] - vmla.s16 d20, d8, d1[0] - vext.8 q11, q8, q9, #10 - vext.8 q4, q8, q9, #12 - vmla.s16 d20, d22, d1[1] - vmla.s16 d20, d8, d1[2] + vext.8 d20, d16, d17, #2 + vext.8 d21, d16, d17, #4 + vext.8 d22, d16, d17, #6 + vext.8 d23, d17, d18, #2 + vext.8 d8, d17, d18, #4 + vmul.s16 d7, d16, d0[0] + vmla.s16 d7, d20, d0[1] + vmla.s16 d7, d21, d0[2] + vmla.s16 d7, d22, d0[3] + vmla.s16 d7, d17, d1[0] + vmla.s16 d7, d23, d1[1] + vmla.s16 d7, d8, d1[2] - vext.8 q11, q1, q2, #6 - vshl.s16 d22, d22, #7 - vsub.s16 d22, d22, d28 - vqadd.s16 d6, d6, d22 - vext.8 q11, q8, q9, #6 - vshl.s16 d22, d22, #7 - vsub.s16 d22, d22, d28 - vqadd.s16 d20, d20, d22 - vshr.s16 d6, d6, #3 - vshr.s16 d20, d20, #3 - vadd.s16 d6, d6, d30 - vadd.s16 d20, d20, d30 + vext.8 d22, d2, d3, #6 + vext.8 d23, d16, d17, #6 + vshl.s16 q11, q11, #7 + vsub.s16 q11, q11, q14 + vqadd.s16 q3, q3, q11 + vshr.s16 q3, q3, #3 + vadd.s16 q3, q3, q15 .endm filter_4 vst1.16 {d6}, [r0, :64]! - vst1.16 {d20}, [r12, :64]! + vst1.16 {d7}, [r12, :64]! subs r5, r5, #4 // 3 <= w < 7 vext.8 q1, q1, q2, #8 @@ -323,7 +313,7 @@ L(variable_shift_tbl): // w >= 4, filter 4 pixels filter_4 vst1.16 {d6}, [r0, :64]! - vst1.16 {d20}, [r12, :64]! + vst1.16 {d7}, [r12, :64]! subs r5, r5, #4 // 0 <= w < 4 vext.8 q1, q1, q2, #8 vext.8 q8, q8, q9, #8 @@ -338,11 +328,11 @@ L(variable_shift_tbl): vdup.16 d25, d16[3] vpadd.s16 d6, d6, d6 vtrn.16 d24, d25 - vshl.s16 d24, d24, #7 - vsub.s16 d24, d24, d28 - vqadd.s16 d6, d6, d24 - vshr.s16 d6, d6, #3 - vadd.s16 d6, d6, d30 + vshl.s16 d24, d24, #7 + vsub.s16 d24, d24, d28 + vqadd.s16 d6, d6, d24 + vshr.s16 d6, d6, #3 + vadd.s16 d6, d6, d30 vst1.s16 {d6[0]}, [r0, :16]! vst1.s16 {d6[1]}, [r12, :16]! subs r5, r5, #1 @@ -363,7 +353,6 @@ L(variable_shift_tbl): 0: vpop {q4} pop {r4-r11,pc} -.purgem filter_8 .purgem filter_4 endfunc @@ -422,22 +411,22 @@ function wiener_filter_v_8bpc_neon, export=1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
- vmull.s16 q2, d16, d0[0] - vmlal.s16 q2, d18, d0[1] - vmlal.s16 q2, d20, d0[2] - vmlal.s16 q2, d22, d0[3] - vmlal.s16 q2, d24, d1[0] - vmlal.s16 q2, d26, d1[1] - vmlal.s16 q2, d28, d1[2] - vmull.s16 q3, d17, d0[0] - vmlal.s16 q3, d19, d0[1] - vmlal.s16 q3, d21, d0[2] - vmlal.s16 q3, d23, d0[3] - vmlal.s16 q3, d25, d1[0] - vmlal.s16 q3, d27, d1[1] - vmlal.s16 q3, d29, d1[2] - vqrshrun.s32 d4, q2, #11 - vqrshrun.s32 d5, q3, #11 + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d20, d0[2] + vmlal.s16 q2, d22, d0[3] + vmlal.s16 q2, d24, d1[0] + vmlal.s16 q2, d26, d1[1] + vmlal.s16 q2, d28, d1[2] + vmull.s16 q3, d17, d0[0] + vmlal.s16 q3, d19, d0[1] + vmlal.s16 q3, d21, d0[2] + vmlal.s16 q3, d23, d0[3] + vmlal.s16 q3, d25, d1[0] + vmlal.s16 q3, d27, d1[1] + vmlal.s16 q3, d29, d1[2] + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 vqmovun.s16 d4, q2 vst1.8 {d4}, [r0], r1 .if \compare @@ -473,7 +462,7 @@ function wiener_filter_v_8bpc_neon, export=1 52: // 2 rows in total, q11 already loaded, load q12 with content data // and 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 - vmov q15, q14 + vmov q15, q14 b 8f 53: // 3 rows in total, q11 already loaded, load q12 and q13 with content @@ -615,8 +604,8 @@ L(copy_narrow_tbl): asr r1, r1, #1 22: subs r4, r4, #1 - vld1.16 {d0[]}, [r2]! - vst1.16 {d0[0]}, [r0], r1 + vld1.16 {d0[]}, [r2, :16]! + vst1.16 {d0[0]}, [r0, :16], r1 bgt 22b 0: pop {r4,pc} @@ -644,8 +633,8 @@ L(copy_narrow_tbl): ble 0f b 42b 41: - vld1.32 {d0[]}, [r2] - vst1.32 {d0[0]}, [r0] + vld1.32 {d0[]}, [r2, :32] + vst1.32 {d0[0]}, [r0, :32] 0: pop {r4,pc} @@ -785,7 +774,7 @@ function sgr_box3_h_8bpc_neon, export=1 bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub lr, r5, #(2 + 16 - 2 + 1) + sub lr, r5, #(2 + 16 - 2 + 1) ldrb r11, [r3, lr] ldrb lr, [r12, lr] // Fill q14/q15 with the right padding pixel @@ -1058,7 +1047,7 @@ function sgr_box5_h_8bpc_neon, export=1 bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub lr, r5, #(2 + 16 - 3 + 1) + sub lr, r5, #(2 + 16 - 3 + 1) ldrb r11, [r3, lr] ldrb lr, [r12, lr] // Fill q14/q15 with the right padding pixel @@ -1100,7 +1089,7 @@ function sgr_box5_h_8bpc_neon, export=1 vaddl_u16_n q12, q13, d2, d3, d16, d17, \w vaddl_u16_n q8, q9, d18, d19, d20, d21, \w vaddw_u16_n q12, q13, d22, d23, \w - vadd_i32_n q12, q13, q8, q9, \w + vadd_i32_n q12, q13, q8, q9, \w vext.8 q8, q5, q6, #2 vext.8 q9, q5, q6, #4 vext.8 q10, q5, q6, #6 @@ -1152,7 +1141,7 @@ function sgr_box5_h_8bpc_neon, export=1 6: // Pad the right edge and produce the last few pixels. // w < 7, w+1 pixels valid in q0/q4 - sub lr, r5, #1 + sub lr, r5, #1 // lr = pixels valid - 2 adr r11, L(box5_variable_shift_tbl) ldr lr, [r11, lr, lsl #2] diff --git a/third_party/dav1d/src/arm/32/looprestoration16.S b/third_party/dav1d/src/arm/32/looprestoration16.S new file mode 100644 index 000000000000..39c248f8b527 --- /dev/null +++ b/third_party/dav1d/src/arm/32/looprestoration16.S @@ -0,0 +1,720 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], +// const pixel *src, ptrdiff_t stride, +// const int16_t fh[7], const intptr_t w, +// int h, enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + ldr r8, [sp, #116] // bitdepth_max + vld1.16 {q0}, [r4] + clz r8, r8 + vmov.i32 q14, #1 + sub r9, r8, #38 // -(bitdepth + 6) + sub r8, r8, #25 // -round_bits_h + neg r9, r9 // bitdepth + 6 + vdup.32 q1, r9 + vdup.32 q13, r8 // -round_bits_h + vmov.i16 q15, #8192 + vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6) + mov r8, r5 + // Calculate mid_stride + add r10, r5, #7 + bic r10, r10, #7 + lsl r10, r10, #1 + + // Clear the last unused element of q0, to allow filtering a single + // pixel with one plain vmul+vpadd. + mov r12, #0 + vmov.16 d1[3], r12 + + // Set up pointers for reading/writing alternate rows + add r12, r0, r10 + lsl r10, r10, #1 + add lr, r2, r3 + lsl r3, r3, #1 + + // Subtract the width from mid_stride + sub r10, r10, r5, lsl #1 + + // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. + cmp r5, #8 + add r11, r5, #13 + bic r11, r11, #7 + bge 1f + mov r11, #16 +1: + sub r3, r3, r11, lsl #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r1, #0 + bne 0f + // left == NULL + sub r2, r2, #6 + sub lr, lr, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r3, r3, #6 + + +1: // Loop vertically + vld1.16 {q2, q3}, [r2]! + vld1.16 {q4, q5}, [lr]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r1, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d3}, [r1]! + // Move r2/lr back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub r2, r2, #6 + sub lr, lr, #6 + vld1.16 {d13}, [r1]! + vext.8 q3, q2, q3, #10 + vext.8 q2, q1, q2, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost pixel + // and shift q2/q3 to have 3x the first pixel at the front. + vdup.16 q1, d4[0] + vdup.16 q6, d8[0] + // Move r2 back to account for the last 3 pixels we loaded before, + // which we shifted out. 
+ sub r2, r2, #6 + sub lr, lr, #6 + vext.8 q3, q2, q3, #10 + vext.8 q2, q1, q2, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + +2: + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub r9, r5, #14 + lsl r9, r9, #1 + ldrh r11, [r2, r9] + ldrh r9, [lr, r9] + // Fill q11/q12 with the right padding pixel + vdup.16 q11, r11 + vdup.16 q12, r9 +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. + cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + cmp r5, #7 + bge 5f // If w >= 7, we can filter 4 pixels + b 6f + +4: // Loop horizontally + vext.8 q10, q2, q3, #6 + vext.8 q8, q2, q3, #2 + vext.8 q9, q2, q3, #4 + vshll.u16 q6, d20, #7 + vshll.u16 q7, d21, #7 + vmlal.s16 q6, d4, d0[0] + vmlal.s16 q6, d16, d0[1] + vmlal.s16 q6, d18, d0[2] + vmlal.s16 q6, d20, d0[3] + vmlal.s16 q7, d5, d0[0] + vmlal.s16 q7, d17, d0[1] + vmlal.s16 q7, d19, d0[2] + vmlal.s16 q7, d21, d0[3] + vext.8 q8, q2, q3, #8 + vext.8 q9, q2, q3, #10 + vext.8 q10, q2, q3, #12 + vmlal.s16 q6, d16, d1[0] + vmlal.s16 q6, d18, d1[1] + vmlal.s16 q6, d20, d1[2] + vmlal.s16 q7, d17, d1[0] + vmlal.s16 q7, d19, d1[1] + vmlal.s16 q7, d21, d1[2] + vext.8 q10, q4, q5, #6 + vext.8 q2, q4, q5, #2 + vshll.u16 q8, d20, #7 + vshll.u16 q9, d21, #7 + vmlal.s16 q8, d8, d0[0] + vmlal.s16 q8, d4, d0[1] + vmlal.s16 q8, d20, d0[3] + vmlal.s16 q9, d9, d0[0] + vmlal.s16 q9, d5, d0[1] + vmlal.s16 q9, d21, d0[3] + vext.8 q2, q4, q5, #4 + vext.8 q10, q4, q5, #8 + vmlal.s16 q8, d4, d0[2] + vmlal.s16 q8, d20, d1[0] + vmlal.s16 q9, d5, d0[2] + vmlal.s16 q9, d21, d1[0] + vext.8 q2, q4, q5, #10 + vext.8 q10, q4, q5, #12 + vmlal.s16 q8, d4, d1[1] + vmlal.s16 q8, d20, d1[2] + vmlal.s16 q9, d5, d1[1] + vmlal.s16 q9, d21, d1[2] + + vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q14 + vadd.i32 q8, q8, q14 + vadd.i32 q9, q9, q14 + vrshl.s32 q6, q6, q13 + vrshl.s32 q7, q7, q13 + vrshl.s32 q8, q8, q13 + vrshl.s32 q9, q9, q13 + vqmovun.s32 d12, q6 + vqmovun.s32 d13, q7 + vqmovun.s32 d14, q8 + vqmovun.s32 d15, q9 + vmin.u16 q6, q6, q10 + vmin.u16 q7, q7, q10 + vsub.i16 q6, q6, q15 + vsub.i16 q7, q7, q15 + vst1.16 {q6}, [r0, :128]! + vst1.16 {q7}, [r12, :128]! + + subs r5, r5, #8 + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q2, q3 + vmov q4, q5 + vld1.16 {q3}, [r2]! + vld1.16 {q5}, [lr]! + bne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 
+ +5: // Filter 4 pixels, 7 <= w < 11 +.macro filter_4 + vext.8 d18, d4, d5, #6 + vext.8 d16, d4, d5, #2 + vext.8 d17, d4, d5, #4 + vext.8 d19, d5, d6, #2 + vext.8 d20, d5, d6, #4 + vshll.u16 q6, d18, #7 + vmlal.s16 q6, d4, d0[0] + vmlal.s16 q6, d16, d0[1] + vmlal.s16 q6, d17, d0[2] + vmlal.s16 q6, d18, d0[3] + vmlal.s16 q6, d5, d1[0] + vmlal.s16 q6, d19, d1[1] + vmlal.s16 q6, d20, d1[2] + + vext.8 d18, d8, d9, #6 + vext.8 d16, d8, d9, #2 + vext.8 d17, d8, d9, #4 + vext.8 d19, d9, d10, #2 + vext.8 d20, d9, d10, #4 + vshll.u16 q7, d18, #7 + vmlal.s16 q7, d8, d0[0] + vmlal.s16 q7, d16, d0[1] + vmlal.s16 q7, d17, d0[2] + vmlal.s16 q7, d18, d0[3] + vmlal.s16 q7, d9, d1[0] + vmlal.s16 q7, d19, d1[1] + vmlal.s16 q7, d20, d1[2] + + vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q14 + vrshl.s32 q6, q6, q13 + vrshl.s32 q7, q7, q13 + vqmovun.s32 d12, q6 + vqmovun.s32 d13, q7 + vmin.u16 q6, q6, q10 + vsub.i16 q6, q6, q15 +.endm + filter_4 + vst1.16 {d12}, [r0, :64]! + vst1.16 {d13}, [r12, :64]! + + subs r5, r5, #4 // 3 <= w < 7 + vext.8 q2, q2, q3, #8 + vext.8 q3, q3, q3, #8 + vext.8 q4, q4, q5, #8 + vext.8 q5, q5, q5, #8 + +6: // Pad the right edge and filter the last few pixels. + // w < 7, w+3 pixels valid in q2-q3 + cmp r5, #5 + blt 7f + bgt 8f + // w == 5, 8 pixels valid in q2, q3 invalid + vmov q3, q11 + vmov q5, q12 + b 88f + +7: // 1 <= w < 5, 4-7 pixels valid in q2 + sub r9, r5, #1 + // r9 = (pixels valid - 4) + adr r11, L(variable_shift_tbl) + ldr r9, [r11, r9, lsl #2] + add r11, r11, r9 + vmov q3, q11 + vmov q5, q12 + bx r11 + + .align 2 +L(variable_shift_tbl): + .word 44f - L(variable_shift_tbl) + CONFIG_THUMB + .word 55f - L(variable_shift_tbl) + CONFIG_THUMB + .word 66f - L(variable_shift_tbl) + CONFIG_THUMB + .word 77f - L(variable_shift_tbl) + CONFIG_THUMB + +44: // 4 pixels valid in q2/q4, fill the high half with padding. + vmov d5, d6 + vmov d9, d10 + b 88f + // Shift q2 right, shifting out invalid pixels, + // shift q2 left to the original offset, shifting in padding pixels. +55: // 5 pixels valid + vext.8 q2, q2, q2, #10 + vext.8 q2, q2, q3, #6 + vext.8 q4, q4, q4, #10 + vext.8 q4, q4, q5, #6 + b 88f +66: // 6 pixels valid + vext.8 q2, q2, q2, #12 + vext.8 q2, q2, q3, #4 + vext.8 q4, q4, q4, #12 + vext.8 q4, q4, q5, #4 + b 88f +77: // 7 pixels valid + vext.8 q2, q2, q2, #14 + vext.8 q2, q2, q3, #2 + vext.8 q4, q4, q4, #14 + vext.8 q4, q4, q5, #2 + b 88f + +8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3 + vext.8 q3, q3, q3, #2 + vext.8 q3, q3, q11, #14 + vext.8 q5, q5, q5, #2 + vext.8 q5, q5, q12, #14 + +88: + // w < 7, q2-q3 padded properly + cmp r5, #4 + blt 888f + + // w >= 4, filter 4 pixels + filter_4 + vst1.16 {d12}, [r0, :64]! + vst1.16 {d13}, [r12, :64]! + subs r5, r5, #4 // 0 <= w < 4 + vext.8 q2, q2, q3, #8 + vext.8 q4, q4, q5, #8 + beq 9f +888: // 1 <= w < 4, filter 1 pixel at a time + vmull.s16 q6, d4, d0 + vmull.s16 q7, d5, d1 + vmull.s16 q8, d8, d0 + vmull.s16 q9, d9, d1 + vadd.i32 q6, q7 + vadd.i32 q8, q9 + vpadd.i32 d12, d12, d13 + vpadd.i32 d13, d16, d17 + vdup.16 d14, d4[3] + vdup.16 d15, d8[3] + vpadd.i32 d12, d12, d13 + vtrn.16 d14, d15 + vadd.i32 d12, d12, d28 + vshll.u16 q7, d14, #7 + vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1 + vadd.i32 d12, d12, d14 + vrshl.s32 d12, d12, d26 + vqmovun.s32 d12, q6 + vmin.u16 d12, d12, d20 + vsub.i16 d12, d12, d30 + vst1.16 {d12[0]}, [r0, :16]! + vst1.16 {d12[1]}, [r12, :16]! 
+ subs r5, r5, #1 + vext.8 q2, q2, q3, #2 + vext.8 q4, q4, q5, #2 + bgt 888b + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r10 + add r12, r12, r10 + add r2, r2, r3 + add lr, lr, r3 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +.purgem filter_4 +endfunc + +// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const int16_t *mid, int w, int h, +// const int16_t fv[7], enum LrEdgeFlags edges, +// ptrdiff_t mid_stride, const int bitdepth_max); +function wiener_filter_v_16bpc_neon, export=1 + push {r4-r7,lr} + vpush {q4-q5} + ldrd r4, r5, [sp, #52] + ldrd r6, r7, [sp, #60] + ldr lr, [sp, #68] // bitdepth_max + vmov.i16 q1, #0 + mov r12, #128 + vld1.16 {q0}, [r5] + vdup.16 q5, lr + clz lr, lr + vmov.i16 d2[3], r12 + sub lr, lr, #11 // round_bits_v + vadd.i16 q0, q0, q1 + vdup.32 q4, lr + mov lr, r4 + vneg.s32 q4, q4 // -round_bits_v + + // Calculate the number of rows to move back when looping vertically + mov r12, r4 + tst r6, #4 // LR_HAVE_TOP + beq 0f + sub r2, r2, r7, lsl #1 + add r12, r12, #2 +0: + tst r6, #8 // LR_HAVE_BOTTOM + beq 1f + add r12, r12, #2 + +1: // Start of horizontal loop; start one vertical filter slice. + // Load rows into q8-q11 and pad properly. + tst r6, #4 // LR_HAVE_TOP + vld1.16 {q8}, [r2, :128], r7 + beq 2f + // LR_HAVE_TOP + vld1.16 {q10}, [r2, :128], r7 + vmov q9, q8 + vld1.16 {q11}, [r2, :128], r7 + b 3f +2: // !LR_HAVE_TOP + vmov q9, q8 + vmov q10, q8 + vmov q11, q8 + +3: + cmp r4, #4 + blt 5f + // Start filtering normally; fill in q12-q14 with unique rows. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vld1.16 {q14}, [r2, :128], r7 + +4: +.macro filter compare + subs r4, r4, #1 + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d20, d0[2] + vmlal.s16 q2, d22, d0[3] + vmlal.s16 q2, d24, d1[0] + vmlal.s16 q2, d26, d1[1] + vmlal.s16 q2, d28, d1[2] + vmull.s16 q3, d17, d0[0] + vmlal.s16 q3, d19, d0[1] + vmlal.s16 q3, d21, d0[2] + vmlal.s16 q3, d23, d0[3] + vmlal.s16 q3, d25, d1[0] + vmlal.s16 q3, d27, d1[1] + vmlal.s16 q3, d29, d1[2] + vrshl.s32 q2, q2, q4 // round_bits_v + vrshl.s32 q3, q3, q4 + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q5 // bitdepth_max + vst1.16 {q2}, [r0], r1 +.if \compare + cmp r4, #4 +.else + ble 9f +.endif + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 + vmov q11, q12 + vmov q12, q13 + vmov q13, q14 +.endm + filter 1 + blt 7f + vld1.16 {q14}, [r2, :128], r7 + b 4b + +5: // Less than 4 rows in total; not all of q12-q13 are filled yet. + tst r6, #8 // LR_HAVE_BOTTOM + beq 6f + // LR_HAVE_BOTTOM + cmp r4, #2 + // We load at least 2 rows in all cases. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + bgt 53f // 3 rows in total + beq 52f // 2 rows in total +51: // 1 row in total, q11 already loaded, load edge into q12-q14. + vmov q13, q12 + b 8f +52: // 2 rows in total, q11 already loaded, load q12 with content data + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vmov q15, q14 + b 8f +53: + // 3 rows in total, q11 already loaded, load q12 and q13 with content + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f + +6: + // !LR_HAVE_BOTTOM + cmp r4, #2 + bgt 63f // 3 rows in total + beq 62f // 2 rows in total +61: // 1 row in total, q11 already loaded, pad that into q12-q14. 
+ vmov q12, q11 + vmov q13, q11 + vmov q14, q11 + b 8f +62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15. + vld1.16 {q12}, [r2, :128], r7 + vmov q13, q12 + vmov q14, q12 + vmov q15, q12 + b 8f +63: + // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + b 8f + +7: + // All registers up to q13 are filled already, 3 valid rows left. + // < 4 valid rows left; fill in padding and filter the last + // few rows. + tst r6, #8 // LR_HAVE_BOTTOM + beq 71f + // LR_HAVE_BOTTOM; load 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f +71: + // !LR_HAVE_BOTTOM, pad 3 rows + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + +8: // At this point, all registers up to q14-q15,q1 are loaded with + // edge/padding (depending on how many rows are left). + filter 0 // This branches to 9f when done + vmov q14, q15 + vmov q15, q1 + b 8b + +9: // End of one vertical slice. + subs r3, r3, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + mls r0, r1, lr, r0 + mls r2, r7, r12, r2 + add r0, r0, #16 + add r2, r2, #16 + mov r4, lr + b 1b + +0: + vpop {q4-q5} + pop {r4-r7,pc} +.purgem filter +endfunc + +// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const pixel *src, int w, int h); +function copy_narrow_16bpc_neon, export=1 + push {r4,lr} + ldr r4, [sp, #8] + adr r12, L(copy_narrow_tbl) + ldr r3, [r12, r3, lsl #2] + add r12, r12, r3 + bx r12 + + .align 2 +L(copy_narrow_tbl): + .word 0 + .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB + .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB + +10: + add r3, r0, r1 + lsl r1, r1, #1 +18: + subs r4, r4, #8 + blt 110f + vld1.16 {q0}, [r2, :128]! + vst1.16 {d0[0]}, [r0, :16], r1 + vst1.16 {d0[1]}, [r3, :16], r1 + vst1.16 {d0[2]}, [r0, :16], r1 + vst1.16 {d0[3]}, [r3, :16], r1 + vst1.16 {d1[0]}, [r0, :16], r1 + vst1.16 {d1[1]}, [r3, :16], r1 + vst1.16 {d1[2]}, [r0, :16], r1 + vst1.16 {d1[3]}, [r3, :16], r1 + ble 0f + b 18b +110: + add r4, r4, #8 + asr r1, r1, #1 +11: + subs r4, r4, #1 + vld1.16 {d0[]}, [r2]! + vst1.16 {d0[0]}, [r0], r1 + bgt 11b +0: + pop {r4,pc} + +20: + add r3, r0, r1 + lsl r1, r1, #1 +24: + subs r4, r4, #4 + blt 210f + vld1.32 {q0}, [r2, :128]! + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[1]}, [r3, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r3, :32], r1 + ble 0f + b 24b +210: + add r4, r4, #4 + asr r1, r1, #1 +22: + subs r4, r4, #1 + vld1.32 {d0[]}, [r2, :32]! + vst1.32 {d0[0]}, [r0, :32], r1 + bgt 22b +0: + pop {r4,pc} + +30: + ldr r3, [r2] + ldrh r12, [r2, #4] + add r2, r2, #6 + subs r4, r4, #1 + str r3, [r0] + strh r12, [r0, #4] + add r0, r0, r1 + bgt 30b + pop {r4,pc} + +40: + add r3, r0, r1 + lsl r1, r1, #1 +42: + subs r4, r4, #2 + blt 41f + vld1.16 {q0}, [r2, :128]! 
+ vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r3, :64], r1 + ble 0f + b 42b +41: + vld1.16 {d0}, [r2, :64] + vst1.16 {d0}, [r0, :64] +0: + pop {r4,pc} + +50: + vld1.16 {d0}, [r2] + ldrh r12, [r2, #8] + add r2, r2, #10 + subs r4, r4, #1 + vst1.16 {d0}, [r0] + strh r12, [r0, #8] + add r0, r0, r1 + bgt 50b + pop {r4,pc} + +60: + vld1.16 {d0}, [r2] + ldr r12, [r2, #8] + add r2, r2, #12 + subs r4, r4, #1 + vst1.16 {d0}, [r0] + str r12, [r0, #8] + add r0, r0, r1 + bgt 60b + pop {r4,pc} + +70: + vld1.16 {d0}, [r2] + ldr r12, [r2, #8] + ldrh lr, [r2, #12] + add r2, r2, #14 + subs r4, r4, #1 + vst1.16 {d0}, [r0] + str r12, [r0, #8] + strh lr, [r0, #12] + add r0, r0, r1 + bgt 70b + pop {r4,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/mc.S b/third_party/dav1d/src/arm/32/mc.S index 47631c071487..1a12d93ad9bc 100644 --- a/third_party/dav1d/src/arm/32/mc.S +++ b/third_party/dav1d/src/arm/32/mc.S @@ -1403,12 +1403,12 @@ L(\type\()_8tap_h_tbl): vld1.8 {d24}, [\sr2], \s_strd vmovl.u8 q8, d16 vmovl.u8 q12, d24 - vext.8 q9, q8, q8, #2 - vext.8 q10, q8, q8, #4 - vext.8 q11, q8, q8, #6 - vext.8 q13, q12, q12, #2 - vext.8 q14, q12, q12, #4 - vext.8 q15, q12, q12, #6 + vext.8 d18, d16, d17, #2 + vext.8 d20, d16, d17, #4 + vext.8 d22, d16, d17, #6 + vext.8 d26, d24, d25, #2 + vext.8 d28, d24, d25, #4 + vext.8 d30, d24, d25, #6 subs \h, \h, #2 vmul.s16 d4, d16, d0[0] vmla.s16 d4, d18, d0[1] @@ -1431,7 +1431,7 @@ L(\type\()_8tap_h_tbl): pop {r4-r11,pc} 80: // 8xN h - vld1.8 {d0}, [\mx] + vld1.8 {d0}, [\mx, :64] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1482,7 +1482,7 @@ L(\type\()_8tap_h_tbl): // one temporary for vext in the loop. That's slower on A7 and A53, // (but surprisingly, marginally faster on A8 and A73). vpush {q4-q6} - vld1.8 {d0}, [\mx] + vld1.8 {d0}, [\mx, :64] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1629,7 +1629,7 @@ L(\type\()_8tap_v_tbl): 28: // 2x8, 2x16 v vpush {q4-q7} - vld1.8 {d0}, [\my] + vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd @@ -1709,7 +1709,7 @@ L(\type\()_8tap_v_tbl): 480: // 4x8, 4x16 v vpush {q4} - vld1.8 {d0}, [\my] + vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd @@ -1782,7 +1782,7 @@ L(\type\()_8tap_v_tbl): 640: 1280: vpush {q4} - vld1.8 {d0}, [\my] + vld1.8 {d0}, [\my, :64] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 @@ -1951,11 +1951,10 @@ L(\type\()_8tap_hv_tbl): bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d26, #4 - vmov d19, d26 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] - vmlal.s16 q2, d19, d2[3] + vmlal.s16 q2, d26, d2[3] vqrshrn.s32 d4, q2, #\shift_hv vqmovun.s16 d4, q2 @@ -1964,11 +1963,11 @@ L(\type\()_8tap_hv_tbl): vst1.16 {d4[1]}, [\ds2, :16], \d_strd ble 0f vmov d16, d18 - vmov d17, d19 + vmov d17, d26 b 2b 280: // 2x8, 2x16, 2x32 hv - vld1.8 {d2}, [\my] + vld1.8 {d2}, [\my, :64] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd @@ -2001,7 +2000,6 @@ L(\type\()_8tap_hv_tbl): 28: bl L(\type\()_8tap_filter_2) vext.8 d22, d21, d26, #4 - vmov d23, d26 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] @@ -2009,7 +2007,7 @@ L(\type\()_8tap_hv_tbl): vmlal.s16 q2, d20, d3[0] vmlal.s16 q2, d21, d3[1] vmlal.s16 q2, d22, d3[2] - vmlal.s16 q2, d23, d3[3] + vmlal.s16 q2, d26, d3[3] vqrshrn.s32 d4, q2, #\shift_hv vqmovun.s16 d4, q2 @@ -2022,7 +2020,7 @@ L(\type\()_8tap_hv_tbl): vmov d18, 
d20 vmov d19, d21 vmov d20, d22 - vmov d21, d23 + vmov d21, d26 b 28b 0: @@ -2108,7 +2106,7 @@ L(\type\()_8tap_filter_2): b 4b 480: // 4x8, 4x16, 4x32 hv - vld1.8 {d2}, [\my] + vld1.8 {d2}, [\my, :64] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd @@ -2211,7 +2209,7 @@ L(\type\()_8tap_filter_4): bgt 880f vpush {q4-q7} add \my, \my, #2 - vld1.8 {d0}, [\mx] + vld1.8 {d0}, [\mx, :64] vld1.32 {d2[]}, [\my] sub \src, \src, #3 sub \src, \src, \s_strd @@ -2301,8 +2299,8 @@ L(\type\()_8tap_filter_4): 640: 1280: vpush {q4-q7} - vld1.8 {d0}, [\mx] - vld1.8 {d2}, [\my] + vld1.8 {d0}, [\mx, :64] + vld1.8 {d2}, [\my, :64] sub \src, \src, #3 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 diff --git a/third_party/dav1d/src/arm/32/mc16.S b/third_party/dav1d/src/arm/32/mc16.S new file mode 100644 index 000000000000..da4ebe4cac46 --- /dev/null +++ b/third_party/dav1d/src/arm/32/mc16.S @@ -0,0 +1,2429 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define PREP_BIAS 8192 + +.macro avg d0, d00, d01, d1, d10, d11 + vld1.16 {q0, q1}, [r2, :128]! + vld1.16 {q2, q3}, [r3, :128]! + vqadd.s16 q0, q0, q2 + vqadd.s16 q1, q1, q3 + vmax.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vmax.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vqsub.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vqsub.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vshl.s16 \d0, q0, q13 // -(intermediate_bits+1) + vshl.s16 \d1, q1, q13 // -(intermediate_bits+1) +.endm + +.macro w_avg d0, d00, d01, d1, d10, d11 + vld1.16 {q0, q1}, [r2, :128]! + vld1.16 {q2, q3}, [r3, :128]! + // This difference requires a 17 bit range, and all bits are + // significant for the following multiplication. 
+ vsubl.s16 \d0, d4, d0 + vsubl.s16 q0, d5, d1 + vsubl.s16 \d1, d6, d2 + vsubl.s16 q1, d7, d3 + vmul.s32 \d0, \d0, q4 + vmul.s32 q0, q0, q4 + vmul.s32 \d1, \d1, q4 + vmul.s32 q1, q1, q4 + vshr.s32 \d0, \d0, #4 + vshr.s32 q0, q0, #4 + vshr.s32 \d1, \d1, #4 + vshr.s32 q1, q1, #4 + vaddw.s16 \d0, \d0, d4 + vaddw.s16 q0, q0, d5 + vaddw.s16 \d1, \d1, d6 + vaddw.s16 q1, q1, d7 + vmovn.i32 \d00, \d0 + vmovn.i32 \d01, q0 + vmovn.i32 \d10, \d1 + vmovn.i32 \d11, q1 + vrshl.s16 \d0, \d0, q13 // -intermediate_bits + vrshl.s16 \d1, \d1, q13 // -intermediate_bits + vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits + vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits + vmin.s16 \d0, \d0, q15 // bitdepth_max + vmin.s16 \d1, \d1, q15 // bitdepth_max + vmax.s16 \d0, \d0, q14 // 0 + vmax.s16 \d1, \d1, q14 // 0 +.endm + +.macro mask d0, d00, d01, d1, d10, d11 + vld1.8 {q7}, [r6, :128]! + vld1.16 {q0, q1}, [r2, :128]! + vneg.s8 q7, q7 + vld1.16 {q2, q3}, [r3, :128]! + vmovl.s8 q6, d14 + vmovl.s8 q7, d15 + vmovl.s16 q4, d12 + vmovl.s16 q5, d13 + vmovl.s16 q6, d14 + vmovl.s16 q7, d15 + vsubl.s16 \d0, d4, d0 + vsubl.s16 q0, d5, d1 + vsubl.s16 \d1, d6, d2 + vsubl.s16 q1, d7, d3 + vmul.s32 \d0, \d0, q4 + vmul.s32 q0, q0, q5 + vmul.s32 \d1, \d1, q6 + vmul.s32 q1, q1, q7 + vshr.s32 \d0, \d0, #6 + vshr.s32 q0, q0, #6 + vshr.s32 \d1, \d1, #6 + vshr.s32 q1, q1, #6 + vaddw.s16 \d0, \d0, d4 + vaddw.s16 q0, q0, d5 + vaddw.s16 \d1, \d1, d6 + vaddw.s16 q1, q1, d7 + vmovn.i32 \d00, \d0 + vmovn.i32 \d01, q0 + vmovn.i32 \d10, \d1 + vmovn.i32 \d11, q1 + vrshl.s16 \d0, \d0, q13 // -intermediate_bits + vrshl.s16 \d1, \d1, q13 // -intermediate_bits + vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits + vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits + vmin.s16 \d0, \d0, q15 // bitdepth_max + vmin.s16 \d1, \d1, q15 // bitdepth_max + vmax.s16 \d0, \d0, q14 // 0 + vmax.s16 \d1, \d1, q14 // 0 +.endm + +.macro bidir_fn type, bdmax +function \type\()_16bpc_neon, export=1 + push {r4-r7,lr} + ldr r4, [sp, #20] + ldr r5, [sp, #24] + ldr r6, [sp, #28] + clz r4, r4 +.ifnc \type, avg + ldr r7, [sp, #32] + vmov.i16 q14, #0 + vdup.16 q15, r7 // bitdepth_max +.endif +.ifc \type, w_avg + vpush {q4} +.endif +.ifc \type, mask + vpush {q4-q7} +.endif + clz r7, \bdmax + sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18 +.ifc \type, avg + mov lr, #1 + movw r12, #2*PREP_BIAS + lsl lr, lr, r7 // 1 << intermediate_bits + neg r12, r12 // -2*PREP_BIAS + add r7, r7, #1 + sub r12, r12, lr // -2*PREP_BIAS - 1 << intermediate_bits + neg r7, r7 // -(intermediate_bits+1) + vdup.16 q12, r12 // -2*PREP_BIAS - 1 << intermediate_bits + vdup.16 q13, r7 // -(intermediate_bits+1) +.else + mov r12, #PREP_BIAS + lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits + neg r7, r7 // -intermediate_bits + vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits + vdup.16 q13, r7 // -intermediate_bits +.endif +.ifc \type, w_avg + vdup.32 q4, r6 + vneg.s32 q4, q4 +.endif + adr r7, L(\type\()_tbl) + sub r4, r4, #24 + \type q8, d16, d17, q9, d18, d19 + ldr r4, [r7, r4, lsl #2] + add r7, r7, r4 + bx r7 + + .align 2 +L(\type\()_tbl): + .word 1280f - L(\type\()_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_tbl) + CONFIG_THUMB + +40: + add r7, r0, r1 + lsl r1, r1, #1 +4: + subs r5, r5, #4 + vst1.16 {d16}, [r0, :64], r1 + vst1.16 {d17}, [r7, :64], r1 + vst1.16 {d18}, [r0, 
:64], r1 + vst1.16 {d19}, [r7, :64], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 4b +80: + add r7, r0, r1 + lsl r1, r1, #1 +8: + vst1.16 {q8}, [r0, :128], r1 + subs r5, r5, #2 + vst1.16 {q9}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 8b +160: +16: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #2 + vst1.16 {q10, q11}, [r0, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 16b +320: + add r7, r0, #32 +32: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #1 + vst1.16 {q10, q11}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 32b +640: + add r7, r0, #32 + mov r12, #64 + sub r1, r1, #64 +64: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #1 + vst1.16 {q10, q11}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 64b +1280: + add r7, r0, #32 + mov r12, #64 + sub r1, r1, #192 +128: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #1 + vst1.16 {q10, q11}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 128b +0: +.ifc \type, mask + vpop {q4-q7} +.endif +.ifc \type, w_avg + vpop {q4} +.endif + pop {r4-r7,pc} +endfunc +.endm + +bidir_fn avg, r6 +bidir_fn w_avg, r7 +bidir_fn mask, r7 + + +// This has got the same signature as the put_8tap functions, +// and assumes that r9 is set to (clz(w)-24). +function put_neon + adr r10, L(put_tbl) + ldr r9, [r10, r9, lsl #2] + add r10, r10, r9 + bx r10 + + .align 2 +L(put_tbl): + .word 1280f - L(put_tbl) + CONFIG_THUMB + .word 640f - L(put_tbl) + CONFIG_THUMB + .word 320f - L(put_tbl) + CONFIG_THUMB + .word 16f - L(put_tbl) + CONFIG_THUMB + .word 80f - L(put_tbl) + CONFIG_THUMB + .word 4f - L(put_tbl) + CONFIG_THUMB + .word 2f - L(put_tbl) + CONFIG_THUMB + +2: + vld1.32 {d0[]}, [r2], r3 + vld1.32 {d1[]}, [r2], r3 + subs r5, r5, #2 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + bgt 2b + pop {r4-r11,pc} +4: + vld1.16 {d0}, [r2], r3 + vld1.16 {d1}, [r2], r3 + subs r5, r5, #2 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r0, :64], r1 + bgt 4b + pop {r4-r11,pc} +80: + add r8, r0, r1 + lsl r1, r1, #1 + add r9, r2, r3 + lsl r3, r3, #1 +8: + vld1.16 {q0}, [r2], r3 + vld1.16 {q1}, [r9], r3 + subs r5, r5, #2 + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r8, :128], r1 + bgt 8b + pop {r4-r11,pc} +16: + vld1.16 {q0, q1}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q0, q1}, [r0, :128], r1 + bgt 16b + pop {r4-r11,pc} +320: + sub r1, r1, #32 + sub r3, r3, #32 +32: + vld1.16 {q0, q1}, [r2]! + vst1.16 {q0, q1}, [r0, :128]! + vld1.16 {q2, q3}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q2, q3}, [r0, :128], r1 + bgt 32b + pop {r4-r11,pc} +640: + sub r1, r1, #96 + sub r3, r3, #96 +64: + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q12, q13}, [r2]! + vst1.16 {q12, q13}, [r0, :128]! 
+ vld1.16 {q14, q15}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q14, q15}, [r0, :128], r1 + bgt 64b + pop {r4-r11,pc} +1280: + sub r1, r1, #224 + sub r3, r3, #224 +128: + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q12, q13}, [r2]! + vst1.16 {q12, q13}, [r0, :128]! + vld1.16 {q14, q15}, [r2]! + vst1.16 {q14, q15}, [r0, :128]! + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q12, q13}, [r2]! + vst1.16 {q12, q13}, [r0, :128]! + vld1.16 {q14, q15}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q14, q15}, [r0, :128], r1 + bgt 128b + pop {r4-r11,pc} +endfunc + +// This has got the same signature as the prep_8tap functions, +// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and +// r8 to w*2. +function prep_neon + adr r10, L(prep_tbl) + ldr r9, [r10, r9, lsl #2] + vdup.16 q15, r7 // intermediate_bits + vmov.i16 q14, #PREP_BIAS + add r10, r10, r9 + bx r10 + + .align 2 +L(prep_tbl): + .word 1280f - L(prep_tbl) + CONFIG_THUMB + .word 640f - L(prep_tbl) + CONFIG_THUMB + .word 320f - L(prep_tbl) + CONFIG_THUMB + .word 16f - L(prep_tbl) + CONFIG_THUMB + .word 80f - L(prep_tbl) + CONFIG_THUMB + .word 40f - L(prep_tbl) + CONFIG_THUMB + +40: + add r9, r1, r2 + lsl r2, r2, #1 +4: + vld1.16 {d0}, [r1], r2 + vld1.16 {d1}, [r9], r2 + subs r4, r4, #2 + vshl.s16 q0, q0, q15 + vsub.i16 q0, q0, q14 + vst1.16 {q0}, [r0, :128]! + bgt 4b + pop {r4-r11,pc} +80: + add r9, r1, r2 + lsl r2, r2, #1 +8: + vld1.16 {q0}, [r1], r2 + vld1.16 {q1}, [r9], r2 + subs r4, r4, #2 + vshl.s16 q0, q0, q15 + vshl.s16 q1, q1, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vst1.16 {q0, q1}, [r0, :128]! + bgt 8b + pop {r4-r11,pc} +16: + vld1.16 {q0, q1}, [r1], r2 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1], r2 + subs r4, r4, #2 + vshl.s16 q1, q1, q15 + vshl.s16 q2, q2, q15 + vshl.s16 q3, q3, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q3, q3, q14 + vst1.16 {q2, q3}, [r0, :128]! + bgt 16b + pop {r4-r11,pc} +320: + sub r2, r2, #32 +32: + vld1.16 {q0, q1}, [r1]! + subs r4, r4, #1 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1], r2 + vshl.s16 q1, q1, q15 + vshl.s16 q2, q2, q15 + vshl.s16 q3, q3, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q3, q3, q14 + vst1.16 {q2, q3}, [r0, :128]! + bgt 32b + pop {r4-r11,pc} +640: + sub r2, r2, #96 +64: + vld1.16 {q0, q1}, [r1]! + subs r4, r4, #1 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1]! + vshl.s16 q1, q1, q15 + vld1.16 {q8, q9}, [r1]! + vshl.s16 q2, q2, q15 + vld1.16 {q10, q11}, [r1], r2 + vshl.s16 q3, q3, q15 + vshl.s16 q8, q8, q15 + vshl.s16 q9, q9, q15 + vshl.s16 q10, q10, q15 + vshl.s16 q11, q11, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vsub.i16 q3, q3, q14 + vsub.i16 q8, q8, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q9, q9, q14 + vst1.16 {q2, q3}, [r0, :128]! + vsub.i16 q10, q10, q14 + vst1.16 {q8, q9}, [r0, :128]! + vsub.i16 q11, q11, q14 + vst1.16 {q10, q11}, [r0, :128]! + bgt 64b + pop {r4-r11,pc} +1280: + sub r2, r2, #224 +128: + vld1.16 {q0, q1}, [r1]! + subs r4, r4, #1 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1]! + vshl.s16 q1, q1, q15 + vld1.16 {q8, q9}, [r1]! + vshl.s16 q2, q2, q15 + vld1.16 {q10, q11}, [r1]! 
+ vshl.s16 q3, q3, q15 + vshl.s16 q8, q8, q15 + vshl.s16 q9, q9, q15 + vshl.s16 q10, q10, q15 + vshl.s16 q11, q11, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vsub.i16 q3, q3, q14 + vsub.i16 q8, q8, q14 + vst1.16 {q0, q1}, [r0, :128]! + vld1.16 {q0, q1}, [r1]! + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 + vst1.16 {q2, q3}, [r0, :128]! + vld1.16 {q2, q3}, [r1]! + vsub.i16 q11, q11, q14 + vshl.s16 q0, q0, q15 + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q8, q9}, [r1]! + vshl.s16 q1, q1, q15 + vshl.s16 q2, q2, q15 + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q10, q11}, [r1], r2 + vshl.s16 q3, q3, q15 + vshl.s16 q8, q8, q15 + vshl.s16 q9, q9, q15 + vshl.s16 q10, q10, q15 + vshl.s16 q11, q11, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vsub.i16 q3, q3, q14 + vsub.i16 q8, q8, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q9, q9, q14 + vst1.16 {q2, q3}, [r0, :128]! + vsub.i16 q10, q10, q14 + vst1.16 {q8, q9}, [r0, :128]! + vsub.i16 q11, q11, q14 + vst1.16 {q10, q11}, [r0, :128]! + bgt 128b + pop {r4-r11,pc} +endfunc + +.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + vld1.\wd {\d0[]}, [\s0], \strd + vld1.\wd {\d1[]}, [\s1], \strd +.ifnb \d2 + vld1.\wd {\d2[]}, [\s0], \strd + vld1.\wd {\d3[]}, [\s1], \strd +.endif +.ifnb \d4 + vld1.\wd {\d4[]}, [\s0], \strd +.endif +.ifnb \d5 + vld1.\wd {\d5[]}, [\s1], \strd +.endif +.ifnb \d6 + vld1.\wd {\d6[]}, [\s0], \strd +.endif +.endm +.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + vld1.16 {\d0}, [\s0], \strd + vld1.16 {\d1}, [\s1], \strd +.ifnb \d2 + vld1.16 {\d2}, [\s0], \strd + vld1.16 {\d3}, [\s1], \strd +.endif +.ifnb \d4 + vld1.16 {\d4}, [\s0], \strd +.endif +.ifnb \d5 + vld1.16 {\d5}, [\s1], \strd +.endif +.ifnb \d6 + vld1.16 {\d6}, [\s0], \strd +.endif +.endm +.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5 + vld1.16 {\d0, \d1}, [\s0], \strd +.ifnb \d2 + vld1.16 {\d2, \d3}, [\s1], \strd +.endif +.ifnb \d4 + vld1.16 {\d4, \d5}, [\s0], \strd +.endif +.endm +.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5 + load_regpair \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5 +.endm +.macro interleave_1_32 r0, r1, r2, r3, r4 + vext.8 \r0, \r0, \r1, #4 + vext.8 \r1, \r1, \r2, #4 +.ifnb \r3 + vext.8 \r2, \r2, \r3, #4 + vext.8 \r3, \r3, \r4, #4 +.endif +.endm +.macro vmin_u16 c, r0, r1, r2, r3 + vmin.u16 \r0, \r0, \c +.ifnb \r1 + vmin.u16 \r1, \r1, \c +.endif +.ifnb \r2 + vmin.u16 \r2, \r2, \c + vmin.u16 \r3, \r3, \c +.endif +.endm +.macro vsub_i16 c, r0, r1, r2, r3 + vsub.i16 \r0, \r0, \c +.ifnb \r1 + vsub.i16 \r1, \r1, \c +.endif +.ifnb \r2 + vsub.i16 \r2, \r2, \c + vsub.i16 \r3, \r3, \c +.endif +.endm +.macro vmull_vmlal_4 d, s0, s1, s2, s3 + vmull.s16 \d, \s0, d0[0] + vmlal.s16 \d, \s1, d0[1] + vmlal.s16 \d, \s2, d0[2] + vmlal.s16 \d, \s3, d0[3] +.endm +.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 + vmull.s16 \d, \s0, d0[0] + vmlal.s16 \d, \s1, d0[1] + vmlal.s16 \d, \s2, d0[2] + vmlal.s16 \d, \s3, d0[3] + vmlal.s16 \d, \s4, d1[0] + vmlal.s16 \d, \s5, d1[1] + vmlal.s16 \d, \s6, d1[2] + vmlal.s16 \d, \s7, d1[3] +.endm +.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3 + vqrshrun.s32 \d0, \q0, #\shift +.ifnb \q1 + vqrshrun.s32 \d1, \q1, #\shift +.endif +.ifnb \q2 + vqrshrun.s32 \d2, \q2, #\shift + vqrshrun.s32 \d3, \q3, #\shift +.endif +.endm +.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3 + vmovn.i32 \d0, \q0 
+.ifnb \q1 + vmovn.i32 \d1, \q1 +.endif +.ifnb \q2 + vmovn.i32 \d2, \q2 + vmovn.i32 \d3, \q3 +.endif +.endm +.macro vrshl_s32 shift, r0, r1, r2, r3 + vrshl.s32 \r0, \r0, \shift + vrshl.s32 \r1, \r1, \shift +.ifnb \r2 + vrshl.s32 \r2, \r2, \shift + vrshl.s32 \r3, \r3, \shift +.endif +.endm +.macro vst1_32 strd, r0, r1 + vst1.32 {\r0[0]}, [r0, :32], \strd + vst1.32 {\r0[1]}, [r9, :32], \strd +.ifnb \r1 + vst1.32 {\r1[0]}, [r0, :32], \strd + vst1.32 {\r1[1]}, [r9, :32], \strd +.endif +.endm +.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 + vst1.16 {\r0}, [r0, \align], \strd + vst1.16 {\r1}, [r9, \align], \strd +.ifnb \r2 + vst1.16 {\r2}, [r0, \align], \strd + vst1.16 {\r3}, [r9, \align], \strd +.endif +.ifnb \r4 + vst1.16 {\r4}, [r0, \align], \strd + vst1.16 {\r5}, [r9, \align], \strd + vst1.16 {\r6}, [r0, \align], \strd + vst1.16 {\r7}, [r9, \align], \strd +.endif +.endm +.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3 +.ifc \type, put + vqrshrun_s32 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 + vmin_u16 q15, \q0, \q1 +.else + vrshl_s32 q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits) + vmovn_i32 \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 + vsub_i16 q15, \q0, \q1 // PREP_BIAS +.endif +.endm +.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1_reg \strd, :64, \d0, \d1, \d2, \d3 +.endm +.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1_reg \strd, :128, \q0, \q1 +.endm +.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1.16 {\q0, \q1}, [r0, :128], \strd +.endm + +.macro make_8tap_fn op, type, type_h, type_v +function \op\()_8tap_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + movw r9, \type_h + movw r10, \type_v + b \op\()_8tap_neon +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. 
+#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2 +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.ifc \bdmax, r8 + ldr r8, [sp, #52] +.endif + movw r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, r11 + mul \my, \my, r11 + add \mx, \mx, r9 // mx, 8tap_h, 4tap_h + add \my, \my, r10 // my, 8tap_v, 4tap_v + +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + + vdup.16 q15, \bdmax // bitdepth_max + clz \bdmax, \bdmax + clz r9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + tst \mx, #(0x7f << 14) + sub r9, r9, #24 + add lr, \bdmax, #6 // 6 + intermediate_bits + rsb r12, \bdmax, #6 // 6 - intermediate_bits + movrel r11, X(mc_subpel_filters), -8 + bne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + bne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx r10, \mx, #7, #7 + and \mx, \mx, #0x7f + it gt + movgt \mx, r10 + tst \my, #(0x7f << 14) + add \mx, r11, \mx, lsl #3 + bne L(\type\()_8tap_hv) + + adr r10, L(\type\()_8tap_h_tbl) + vdup.32 q14, r12 // 6 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s32 q14, q14 // -(6-intermediate_bits) +.ifc \type, put + vdup.16 q13, \bdmax // intermediate_bits +.else + vmov.i16 q13, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s16 q13, q13 // -intermediate_bits +.endif + bx r10 + + .align 2 +L(\type\()_8tap_h_tbl): + .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +2: + vld1.16 {q2}, [\src], \s_strd + vld1.16 {q3}, [\sr2], \s_strd + vext.8 d5, d4, d5, #2 + vext.8 d7, d6, d7, #2 + subs \h, \h, #2 + vtrn.32 d4, d6 + vtrn.32 d5, d7 + vmull.s16 q1, d4, d0[0] + vmlal.s16 q1, d5, d0[1] + vmlal.s16 q1, d6, d0[2] + vmlal.s16 q1, d7, d0[3] + vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) + vqmovun.s32 d2, q1 + vrshl.s16 d2, d2, d26 // -intermediate_bits + vmin.u16 d2, d2, d30 + vst1.32 {d2[0]}, [\dst, :32], \d_strd + vst1.32 {d2[1]}, [\ds2, :32], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +4: + vld1.16 {q8}, [\src], \s_strd + vld1.16 {q11}, [\sr2], \s_strd + vext.8 d18, d16, d17, #2 + vext.8 d19, d16, d17, #4 + vext.8 d20, d16, d17, #6 + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d21, d22, d23, #6 + subs \h, \h, #2 + vmull.s16 q2, d16, 
d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d19, d0[2] + vmlal.s16 q2, d20, d0[3] + vmull.s16 q3, d22, d0[0] + vmlal.s16 q3, d24, d0[1] + vmlal.s16 q3, d25, d0[2] + vmlal.s16 q3, d21, d0[3] + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) +.ifc \type, put + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vrshl.s16 q2, q2, q13 // -intermediate_bits + vmin.u16 q2, q2, q15 +.else + vmovn.s32 d4, q2 + vmovn.s32 d5, q3 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + bgt 4b + pop {r4-r11,pc} + +80: +160: +320: +640: +1280: // 8xN, 16xN, 32xN, ... h + vpush {q4-q5} + vld1.8 {d0}, [\mx, :64] + sub \src, \src, #6 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + sub \s_strd, \s_strd, \w, lsl #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, lsl #1 +.endif +81: + vld1.16 {q8, q9}, [\src]! + vld1.16 {q10, q11}, [\sr2]! + mov \mx, \w + +8: + vmull.s16 q1, d16, d0[0] + vmull.s16 q2, d17, d0[0] + vmull.s16 q3, d20, d0[0] + vmull.s16 q4, d21, d0[0] +.irpc i, 1234567 + vext.8 q12, q8, q9, #(2*\i) + vext.8 q5, q10, q11, #(2*\i) +.if \i < 4 + vmlal.s16 q1, d24, d0[\i] + vmlal.s16 q2, d25, d0[\i] + vmlal.s16 q3, d10, d0[\i] + vmlal.s16 q4, d11, d0[\i] +.else + vmlal.s16 q1, d24, d1[\i-4] + vmlal.s16 q2, d25, d1[\i-4] + vmlal.s16 q3, d10, d1[\i-4] + vmlal.s16 q4, d11, d1[\i-4] +.endif +.endr + subs \mx, \mx, #8 + vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vrshl.s32 q4, q4, q14 // -(6-intermediate_bits) +.ifc \type, put + vqmovun.s32 d2, q1 + vqmovun.s32 d3, q2 + vqmovun.s32 d4, q3 + vqmovun.s32 d5, q4 + vrshl.s16 q1, q1, q13 // -intermediate_bits + vrshl.s16 q2, q2, q13 // -intermediate_bits + vmin.u16 q1, q1, q15 + vmin.u16 q2, q2, q15 +.else + vmovn.s32 d2, q1 + vmovn.s32 d3, q2 + vmovn.s32 d4, q3 + vmovn.s32 d5, q4 + vsub.i16 q1, q1, q13 // PREP_BIAS + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + vst1.16 {q1}, [\dst, :128]! + vst1.16 {q2}, [\ds2, :128]! + ble 9f + + vmov q8, q9 + vmov q10, q11 + vld1.16 {q9}, [\src]! + vld1.16 {q11}, [\sr2]! 
+ b 8b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 81b + vpop {q4-q5} + pop {r4-r11,pc} + + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx r10, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r10 + add \my, r11, \my, lsl #3 + +.ifc \type, prep + vdup.32 q14, r12 // 6 - intermediate_bits + vmov.i16 q15, #PREP_BIAS +.endif + adr r10, L(\type\()_8tap_v_tbl) + ldr r9, [r10, r9, lsl #2] +.ifc \type, prep + vneg.s32 q14, q14 // -(6-intermediate_bits) +.endif + add r10, r10, r9 + bx r10 + + .align 2 +L(\type\()_8tap_v_tbl): + .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + bgt 28f + + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + // 2x2 v + load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + interleave_1_32 d1, d2, d3, d4, d5 + bgt 24f + vmull_vmlal_4 q8, d1, d2, d3, d4 + vqrshrun_s32 6, q8, d16 + vmin_u16 d30, d16 + vst1_32 \d_strd, d16 + pop {r4-r11,pc} + +24: // 2x4 v + load_32 \sr2, \src, \s_strd, d6, d7 + interleave_1_32 d5, d6, d7 + vmull_vmlal_4 q8, d1, d2, d3, d4 + vmull_vmlal_4 q9, d3, d4, d5, d6 + vqrshrun_s32 6, q8, d16, q9, d17 + vmin_u16 q15, q8 + vst1_32 \d_strd, d16, d17 + pop {r4-r11,pc} + +28: // 2x8, 2x16 v + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16 + interleave_1_32 d2, d3, d4, d5, d6 + interleave_1_32 d6, d7, d16 +216: + subs \h, \h, #8 + load_32 \sr2, \src, \s_strd, d17, d18, d19, d20 + load_32 \sr2, \src, \s_strd, d21, d22, d23, d24 + interleave_1_32 d16, d17, d18, d19, d20 + interleave_1_32 d20, d21, d22, d23, d24 + vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 + vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19 + vmull_vmlal_8 q2, d6, d7, d16, d17, d18, d19, d20, d21 + vmull_vmlal_8 q3, d16, d17, d18, d19, d20, d21, d22, d23 + vqrshrun_s32 6, q13, d26, q1, d27, q2, d2, q3, d3 + vmin_u16 q15, q13, q1 + vst1_32 \d_strd, d26, d27 + vst1_32 \d_strd, d2, d3 + ble 0f + vmov q1, q9 + vmov q2, q10 + vmov q3, q11 + vmov d16, d24 + b 216b +0: + pop {r4-r11,pc} +.endif + +40: + bgt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + vmull_vmlal_4 q8, d1, d2, d3, d4 + vmull_vmlal_4 q9, d2, d3, d4, d5 + shift_store_4 \type, \d_strd, q8, q9, d16, d17 + ble 0f + load_reg \sr2, \src, \s_strd, d6, d7 + vmull_vmlal_4 q8, d3, d4, d5, d6 + vmull_vmlal_4 q9, d4, d5, d6, d7 + shift_store_4 \type, \d_strd, q8, q9, d16, d17 +0: + pop {r4-r11,pc} + +480: // 4x8, 4x16 v + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, 
\sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22 + +48: + subs \h, \h, #4 + load_reg \sr2, \src, \s_strd, d23, d24, d25, d26 + vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 + vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 + vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25 + vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26 + shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5 + ble 0f + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + vmov d22, d26 + b 48b +0: + pop {r4-r11,pc} + +80: + bgt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9 + vmull_vmlal_4 q10, d2, d4, d6, d16 + vmull_vmlal_4 q11, d3, d5, d7, d17 + vmull_vmlal_4 q12, d4, d6, d16, d18 + vmull_vmlal_4 q13, d5, d7, d17, d19 + shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23 + ble 0f + load_reg \sr2, \src, \s_strd, q10, q11 + vmull_vmlal_4 q1, d6, d16, d18, d20 + vmull_vmlal_4 q2, d7, d17, d19, d21 + vmull_vmlal_4 q12, d16, d18, d20, d22 + vmull_vmlal_4 q13, d17, d19, d21, d23 + shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5 +0: + pop {r4-r11,pc} + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\my, :64] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11 + +88: + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, q12, q13 + vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24 + vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25 + vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26 + vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27 + shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, q1, q2 + vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2 + vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3 + vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4 + vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5 + shift_store_8 \type, \d_strd, q3, q4, d6, d7, q5, q6, d8, d9 + ble 9f + vmov q5, q9 + vmov q6, q10 + vmov q7, q11 + vmov q8, q12 + vmov q9, q13 + vmov q10, q1 + vmov q11, q2 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + +160: + bgt 1680b + + // 16x2, 16x4 v + vpush {q6-q7} + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + + load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11 +16: + load_16s16 \src, \src, \s_strd, q12, q13 + subs \h, \h, #1 + vmull_vmlal_4 q1, d12, d16, d20, d24 + vmull_vmlal_4 q2, d13, d17, d21, d25 + vmull_vmlal_4 q3, d14, d18, d22, d26 + vmull_vmlal_4 q6, d15, d19, d23, d27 + shift_store_16 \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5 + ble 0f + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + vmov q11, q13 + b 16b +0: + vpop {q6-q7} + pop {r4-r11,pc} + + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx 
r10, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r10 +4: + add \my, r11, \my, lsl #3 + + adr r10, L(\type\()_8tap_hv_tbl) + neg r12, r12 // -(6-intermediate_bits) + ldr r9, [r10, r9, lsl #2] + vdup.32 q14, r12 // -(6-intermediate_bits) +.ifc \type, put + neg r8, lr // -(6+intermeidate_bits) +.else + vmov.i16 q13, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vdup.32 q13, r8 // -(6+intermediate_bits) +.endif + bx r10 + + .align 2 +L(\type\()_8tap_hv_tbl): + .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + +20: +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 280f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + + // 2x2, 2x4 hv + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vmull.s16 q11, d22, d0 + vmull.s16 q12, d24, d0 + vpadd.s32 d22, d22, d23 + vpadd.s32 d23, d24, d25 + vpadd.s32 d22, d22, d23 + vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) + vmovn.i32 d16, q8 + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vext.8 d16, d16, d24, #4 + vmov d17, d24 + +2: + bl L(\type\()_8tap_filter_2) + + vext.8 d18, d17, d24, #4 + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d24, d2[3] + + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vmin.u16 d4, d4, d30 + subs \h, \h, #2 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d18 + vmov d17, d24 + b 2b + +280: // 2x8, 2x16, 2x32 hv + vld1.8 {d2}, [\my, :64] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vmull.s16 q11, d22, d0 + vmull.s16 q12, d24, d0 + vpadd.s32 d22, d22, d23 + vpadd.s32 d23, d24, d25 + vpadd.s32 d22, d22, d23 + vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) + vmovn.i32 d16, q8 + + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vext.8 d16, d16, d24, #4 + vmov d17, d24 + bl L(\type\()_8tap_filter_2) + vext.8 d18, d17, d24, #4 + vmov d19, d24 + bl L(\type\()_8tap_filter_2) + vext.8 d20, d19, d24, #4 + vmov d21, d24 + +28: + bl L(\type\()_8tap_filter_2) + vext.8 d22, d21, d24, #4 + vmull.s16 q3, d16, d2[0] + vmlal.s16 q3, d17, d2[1] + vmlal.s16 q3, d18, d2[2] + vmlal.s16 q3, d19, d2[3] + vmlal.s16 q3, d20, d3[0] + vmlal.s16 q3, d21, d3[1] + vmlal.s16 q3, d22, d3[2] + vmlal.s16 q3, d24, d3[3] + + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d6, q3 + vmin.u16 d6, d6, d30 + subs \h, \h, #2 + vst1.32 {d6[0]}, [\dst, :32], \d_strd + vst1.32 {d6[1]}, [\ds2, :32], \d_strd + ble 0f + vmov q8, q9 + vmov q9, q10 + vmov d20, d22 + vmov d21, d24 + b 28b +0: + pop {r4-r11,pc} + +L(\type\()_8tap_filter_2): + vld1.16 {q11}, [\sr2], \s_strd + vld1.16 {q12}, [\src], \s_strd + vext.8 d23, d22, d23, #2 + vext.8 d25, d24, d25, #2 + vtrn.32 q11, q12 + vmull.s16 q3, d22, d0[0] + vmlal.s16 q3, d23, d0[1] + vmlal.s16 q3, d24, d0[2] + vmlal.s16 q3, d25, 
d0[3] + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d24, q3 + bx lr +.endif + +40: + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 480f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + // 4x2, 4x4 hv + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d17, q10 + + bl L(\type\()_8tap_filter_4) + vmov q9, q12 + +4: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d17, d2[0] + vmlal.s16 q2, d18, d2[1] + vmlal.s16 q2, d19, d2[2] + vmlal.s16 q2, d24, d2[3] + vmull.s16 q3, d18, d2[0] + vmlal.s16 q3, d19, d2[1] + vmlal.s16 q3, d24, d2[2] + vmlal.s16 q3, d25, d2[3] +.ifc \type, put + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q15 +.else + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + subs \h, \h, #2 + + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + ble 0f + vmov d17, d19 + vmov q9, q12 + b 4b +0: + pop {r4-r11,pc} + +480: // 4x8, 4x16, 4x32 hv + vpush {d13-d15} + vld1.8 {d2}, [\my, :64] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d13, q10 + + bl L(\type\()_8tap_filter_4) + vmov q7, q12 + bl L(\type\()_8tap_filter_4) + vmov q8, q12 + bl L(\type\()_8tap_filter_4) + vmov q9, q12 + +48: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d13, d2[0] + vmlal.s16 q2, d14, d2[1] + vmlal.s16 q2, d15, d2[2] + vmlal.s16 q2, d16, d2[3] + vmlal.s16 q2, d17, d3[0] + vmlal.s16 q2, d18, d3[1] + vmlal.s16 q2, d19, d3[2] + vmlal.s16 q2, d24, d3[3] + vmull.s16 q3, d14, d2[0] + vmlal.s16 q3, d15, d2[1] + vmlal.s16 q3, d16, d2[2] + vmlal.s16 q3, d17, d2[3] + vmlal.s16 q3, d18, d3[0] + vmlal.s16 q3, d19, d3[1] + vmlal.s16 q3, d24, d3[2] + vmlal.s16 q3, d25, d3[3] +.ifc \type, put + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q15 +.else + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + ble 0f + vmov d13, d15 + vmov q7, q8 + vmov q8, q9 + vmov q9, q12 + b 48b +0: + vpop {d13-d15} + pop {r4-r11,pc} + +L(\type\()_8tap_filter_4): + vld1.16 {q10}, [\sr2], \s_strd + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d20, d21, #2 + vext.8 d25, d20, d21, #4 + vext.8 d21, d20, d21, #6 + vmull.s16 q3, d20, d0[0] + vmlal.s16 q3, d24, d0[1] + vmlal.s16 q3, d25, d0[2] + vmlal.s16 q3, d21, d0[3] + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 
q10, d23, d0[3] + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d24, q3 + vmovn.i32 d25, q10 + bx lr + +80: +160: +320: + bgt 880f + add \my, \my, #2 + vld1.8 {d0}, [\mx, :64] + vld1.32 {d2[]}, [\my] + sub \src, \src, #6 + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.16 {q11, q12}, [\src], \s_strd + vmull.s16 q2, d22, d0[0] + vmull.s16 q3, d23, d0[0] + vdup.32 q14, r12 // -(6-intermediate_bits) +.irpc i, 1234567 + vext.8 q10, q11, q12, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d20, d0[\i] + vmlal.s16 q3, d21, d0[\i] +.else + vmlal.s16 q2, d20, d1[\i - 4] + vmlal.s16 q3, d21, d1[\i - 4] +.endif +.endr + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d16, q2 + vmovn.i32 d17, q3 + + bl L(\type\()_8tap_filter_8) + vmov q9, q11 + vmov q10, q12 + +8: + bl L(\type\()_8tap_filter_8) + vmull.s16 q2, d16, d2[0] + vmull.s16 q3, d17, d2[0] + vmull.s16 q13, d18, d2[0] + vmull.s16 q14, d19, d2[0] +.ifc \type, put + vdup.32 q8, r8 // -(6+intermediate_bits) +.endif + vmlal.s16 q2, d18, d2[1] + vmlal.s16 q3, d19, d2[1] + vmlal.s16 q13, d20, d2[1] + vmlal.s16 q14, d21, d2[1] + vmlal.s16 q2, d20, d2[2] + vmlal.s16 q3, d21, d2[2] + vmlal.s16 q13, d22, d2[2] + vmlal.s16 q14, d23, d2[2] + vmlal.s16 q2, d22, d2[3] + vmlal.s16 q3, d23, d2[3] + vmlal.s16 q13, d24, d2[3] + vmlal.s16 q14, d25, d2[3] +.ifc \type, put + vdup.16 q9, \bdmax // bitdepth_max + vrshl.s32 q2, q2, q8 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q8 // -(6+intermediate_bits) + vrshl.s32 q13, q13, q8 // -(6+intermediate_bits) + vrshl.s32 q14, q14, q8 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q13 + vqmovun.s32 d7, q14 + vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 +.else + vmov.i16 q9, #PREP_BIAS + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q13, #6 + vrshrn.i32 d7, q14, #6 + vsub.i16 q2, q2, q9 // PREP_BIAS + vsub.i16 q3, q3, q9 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + b 8b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 164b +0: + pop {r4-r11,pc} + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\mx, :64] + vld1.8 {d2}, [\my, :64] + sub \src, \src, #6 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.16 {q11, q12}, [\src], \s_strd + vmull.s16 q2, d22, d0[0] + vmull.s16 q3, d23, d0[0] + vdup.32 q14, r12 // -(6-intermediate_bits) +.irpc i, 1234567 + vext.8 q10, q11, q12, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d20, d0[\i] + vmlal.s16 q3, d21, d0[\i] +.else + vmlal.s16 q2, d20, d1[\i - 4] + vmlal.s16 q3, d21, d1[\i - 4] +.endif +.endr + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d8, q2 + vmovn.i32 d9, q3 + + bl L(\type\()_8tap_filter_8) + vmov q5, q11 + vmov q6, q12 + bl L(\type\()_8tap_filter_8) + vmov q7, q11 + vmov q8, q12 + bl L(\type\()_8tap_filter_8) + vmov q9, q11 + vmov q10, q12 + +88: + bl L(\type\()_8tap_filter_8) + vmull.s16 q2, d8, d2[0] + vmull.s16 q3, d9, d2[0] + vmull.s16 q13, d10, d2[0] + vmull.s16 q14, d11, d2[0] +.ifc \type, put + vdup.32 q4, r8 // -(6+intermediate_bits) +.endif + vmlal.s16 q2, d10, d2[1] + vmlal.s16 q3, d11, d2[1] + vmlal.s16 q13, d12, d2[1] + vmlal.s16 q14, d13, d2[1] + vmlal.s16 q2, d12, d2[2] + vmlal.s16 q3, d13, d2[2] + vmlal.s16 q13, d14, d2[2] + vmlal.s16 q14, d15, d2[2] + vmlal.s16 q2, d14, d2[3] + vmlal.s16 q3, d15, d2[3] + vmlal.s16 q13, d16, d2[3] + vmlal.s16 q14, d17, d2[3] + vmlal.s16 q2, d16, d3[0] + vmlal.s16 q3, d17, d3[0] + vmlal.s16 q13, d18, d3[0] + vmlal.s16 q14, d19, d3[0] + vmlal.s16 q2, d18, d3[1] + vmlal.s16 q3, d19, d3[1] + vmlal.s16 q13, d20, d3[1] + vmlal.s16 q14, d21, d3[1] + vmlal.s16 q2, d20, d3[2] + vmlal.s16 q3, d21, d3[2] + vmlal.s16 q13, d22, d3[2] + vmlal.s16 q14, d23, d3[2] + vmlal.s16 q2, d22, d3[3] + vmlal.s16 q3, d23, d3[3] + vmlal.s16 q13, d24, d3[3] + vmlal.s16 q14, d25, d3[3] +.ifc \type, put + vrshl.s32 q2, q2, q4 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q4 // -(6+intermediate_bits) + vrshl.s32 q13, q13, q4 // -(6+intermediate_bits) + vrshl.s32 q14, q14, q4 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q13 + vqmovun.s32 d7, q14 + vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 +.else + vmov.i16 q5, #PREP_BIAS + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q13, #6 + vrshrn.i32 d7, q14, #6 + vsub.i16 q2, q2, q5 // PREP_BIAS + vsub.i16 q3, q3, q5 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd + ble 9f + vmov q4, q6 + vmov q5, q7 + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + +L(\type\()_8tap_filter_8): + vld1.16 {q13, q14}, [\sr2], \s_strd + vmull.s16 q2, d26, d0[0] + vmull.s16 q3, d27, d0[0] +.irpc i, 1234567 + vext.8 q12, q13, q14, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d24, d0[\i] + vmlal.s16 q3, d25, d0[\i] +.else + vmlal.s16 q2, d24, d1[\i - 4] + vmlal.s16 q3, d25, d1[\i - 4] +.endif +.endr + vdup.32 q12, r12 // -(6-intermediate_bits) + vld1.16 {q13, q14}, [\src], \s_strd + vrshl.s32 q2, q2, q12 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q12 // -(6-intermediate_bits) + 
vmovn.i32 d4, q2 + vmovn.i32 d5, q3 + + vmull.s16 q3, d26, d0[0] + vmull.s16 q11, d27, d0[0] +.irpc i, 1234567 + vext.8 q12, q13, q14, #(2*\i) +.if \i < 4 + vmlal.s16 q3, d24, d0[\i] + vmlal.s16 q11, d25, d0[\i] +.else + vmlal.s16 q3, d24, d1[\i - 4] + vmlal.s16 q11, d25, d1[\i - 4] +.endif +.endr + vdup.32 q13, r12 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6-intermediate_bits) + vrshl.s32 q11, q11, q13 // -(6-intermediate_bits) + + vmovn.i32 d24, q3 + vmovn.i32 d25, q11 + vmov q11, q2 + bx lr +endfunc + +function \type\()_bilin_16bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.ifc \bdmax, r8 + ldr r8, [sp, #52] +.endif + vdup.16 q1, \mx + vdup.16 q3, \my + rsb r9, \mx, #16 + rsb r10, \my, #16 + vdup.16 q0, r9 + vdup.16 q2, r10 +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + clz \bdmax, \bdmax // bitdepth_max + clz r9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + cmp \mx, #0 + sub r9, r9, #24 + rsb r11, \bdmax, #4 // 4 - intermediate_bits + add r12, \bdmax, #4 // 4 + intermediate_bits + bne L(\type\()_bilin_h) + cmp \my, #0 + bne L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cmp \my, #0 + bne L(\type\()_bilin_hv) + + adr r10, L(\type\()_bilin_h_tbl) + vdup.16 q15, r11 // 4 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s16 q15, q15 // -(4-intermediate_bits) +.ifc \type, put + vdup.16 q14, \bdmax // intermediate_bits +.else + vmov.i16 q14, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s16 q14, q14 // -intermediate_bits +.endif + bx r10 + + .align 2 +L(\type\()_bilin_h_tbl): + .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + vld1.16 {d16}, [\src], \s_strd + vld1.16 {d18}, [\sr2], \s_strd + vext.8 d17, d16, d16, #2 + vext.8 d19, d18, d18, #2 + vtrn.32 d16, d18 + vtrn.32 d17, d19 + subs \h, \h, #2 + vmul.i16 d16, d16, d0 + vmla.i16 d16, d17, d2 + vrshl.u16 d16, d16, d30 + vrshl.u16 d16, d16, d28 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + vld1.16 {q8}, [\src], \s_strd + vld1.16 {q10}, [\sr2], \s_strd + vext.8 q9, q8, q8, #2 + vext.8 q11, q10, q10, #2 + vmov d17, d20 + vmov d19, d22 + subs \h, \h, #2 + vmul.i16 q8, q8, q0 + vmla.i16 q8, q9, q1 + vrshl.u16 q8, q8, q15 +.ifc \type, put + vrshl.u16 q8, q8, q14 +.else + vsub.i16 q8, q8, q14 +.endif + vst1.16 {d16}, [\dst, :64], \d_strd + vst1.16 {d17}, [\ds2, :64], \d_strd + bgt 4b + pop {r4-r11,pc} + +80: // 8xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + vld1.16 {d16, d17, d18}, [\src], \s_strd + vld1.16 {d20, d21, d22}, [\sr2], \s_strd + vext.8 q9, q8, q9, #2 + vext.8 q11, q10, q11, #2 + subs \h, \h, #2 + vmul.i16 q8, q8, q0 + vmla.i16 q8, q9, q1 + vmul.i16 q10, q10, q0 + vmla.i16 q10, q11, q1 + vrshl.u16 q8, q8, q15 + vrshl.u16 q10, q10, q15 +.ifc \type, put + vrshl.u16 q8, q8, q14 + 
vrshl.u16 q10, q10, q14 +.else + vsub.i16 q8, q8, q14 + vsub.i16 q10, q10, q14 +.endif + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q10}, [\ds2, :128], \d_strd + bgt 8b + pop {r4-r11,pc} +160: +320: +640: +1280: // 16xN, 32xN, ... h + vpush {q4-q7} + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w, lsl #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, lsl #1 +.endif +161: + vld1.16 {q4}, [\src]! + vld1.16 {q9}, [\sr2]! + mov \mx, \w + +16: + vld1.16 {q5, q6}, [\src]! + vld1.16 {q10, q11}, [\sr2]! + vext.8 q7, q4, q5, #2 + vext.8 q8, q5, q6, #2 + vext.8 q12, q9, q10, #2 + vext.8 q13, q10, q11, #2 + vmul.i16 q4, q4, q0 + vmla.i16 q4, q7, q1 + vmul.i16 q5, q5, q0 + vmla.i16 q5, q8, q1 + vmul.i16 q9, q9, q0 + vmla.i16 q9, q12, q1 + vmul.i16 q10, q10, q0 + vmla.i16 q10, q13, q1 + vrshl.u16 q4, q4, q15 + vrshl.u16 q5, q5, q15 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + subs \mx, \mx, #16 +.ifc \type, put + vrshl.u16 q4, q4, q14 + vrshl.u16 q5, q5, q14 + vrshl.u16 q9, q9, q14 + vrshl.u16 q10, q10, q14 +.else + vsub.i16 q4, q4, q14 + vsub.i16 q5, q5, q14 + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 +.endif + vst1.16 {q4, q5}, [\dst, :128]! + vst1.16 {q9, q10}, [\ds2, :128]! + ble 9f + + vmov q4, q6 + vmov q9, q11 + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 161b + vpop {q4-q7} + pop {r4-r11,pc} + + +L(\type\()_bilin_v): + cmp \h, #4 + adr r10, L(\type\()_bilin_v_tbl) +.ifc \type, prep + vdup.16 q15, r11 // 4 - intermediate_bits +.endif + ldr r9, [r10, r9, lsl #2] +.ifc \type, prep + vmov.i16 q14, #PREP_BIAS + vneg.s16 q15, q15 // -(4-intermediate_bits) +.endif + add r10, r10, r9 + bx r10 + + .align 2 +L(\type\()_bilin_v_tbl): + .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + vld1.32 {d16[]}, [\src], \s_strd + bgt 24f + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vmul.i16 d16, d16, d4 + vmla.i16 d16, d17, d6 + vrshr.u16 d16, d16, #4 + vst1.32 {d16[0]}, [\dst, :32] + vst1.32 {d16[1]}, [\ds2, :32] + pop {r4-r11,pc} +24: // 2x4, 2x8, ... 
v + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vld1.32 {d19[]}, [\sr2], \s_strd + vld1.32 {d20[]}, [\src], \s_strd + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vext.8 d18, d18, d19, #4 + vext.8 d19, d19, d20, #4 + vswp d17, d18 + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + subs \h, \h, #4 + vrshr.u16 q8, q8, #4 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + vst1.32 {d17[0]}, [\dst, :32], \d_strd + vst1.32 {d17[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d20 + b 24b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.16 {d16}, [\src], \s_strd +4: + vld1.16 {d17}, [\sr2], \s_strd + vld1.16 {d19}, [\src], \s_strd + vmov d18, d17 + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 +.else + vrshl.u16 q8, q8, q15 + vsub.i16 q8, q8, q14 +.endif + vst1.16 {d16}, [\dst, :64], \d_strd + vst1.16 {d17}, [\ds2, :64], \d_strd + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.16 {q8}, [\src], \s_strd +8: + vld1.16 {q9}, [\sr2], \s_strd + vld1.16 {q10}, [\src], \s_strd + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + vmul.i16 q9, q9, q2 + vmla.i16 q9, q10, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 + vrshr.u16 q9, q9, #4 +.else + vrshl.u16 q8, q8, q15 + vrshl.u16 q9, q9, q15 + vsub.i16 q8, q8, q14 + vsub.i16 q9, q9, q14 +.endif + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q9}, [\ds2, :128], \d_strd + ble 0f + vmov q8, q10 + b 8b +0: + pop {r4-r11,pc} + +160: // 16xN, 32xN, ... +320: +640: +1280: + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {q8, q9}, [\src], \s_strd +2: + vld1.16 {q10, q11}, [\sr2], \s_strd + vld1.16 {q12, q13}, [\src], \s_strd + vmul.i16 q8, q8, q2 + vmla.i16 q8, q10, q3 + vmul.i16 q9, q9, q2 + vmla.i16 q9, q11, q3 + vmul.i16 q10, q10, q2 + vmla.i16 q10, q12, q3 + vmul.i16 q11, q11, q2 + vmla.i16 q11, q13, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 + vrshr.u16 q9, q9, #4 + vrshr.u16 q10, q10, #4 + vrshr.u16 q11, q11, #4 +.else + vrshl.u16 q8, q8, q15 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + vrshl.u16 q11, q11, q15 + vsub.i16 q8, q8, q14 + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 + vsub.i16 q11, q11, q14 +.endif + vst1.16 {q8, q9}, [\dst, :128], \d_strd + vst1.16 {q10, q11}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q12 + vmov q9, q13 + b 2b +9: + subs \w, \w, #16 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #32 + add \dst, \dst, #32 + b 1b +0: + pop {r4-r11,pc} + +L(\type\()_bilin_hv): + adr r10, L(\type\()_bilin_hv_tbl) + vdup.16 q15, r11 // 4 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s16 q15, q15 // -(4-intermediate_bits) +.ifc \type, put + vdup.32 q14, r12 // 4 + intermediate_bits +.else + vmov.i16 q14, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s32 q14, q14 // -(4+intermediate_bits) +.endif + bx r10 + + .align 2 +L(\type\()_bilin_hv_tbl): + .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + 
.word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + +20: // 2xN hv +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {d20}, [\src], \s_strd + vext.8 d21, d20, d20, #2 + vmul.i16 d16, d20, d0 + vmla.i16 d16, d21, d2 + vrshl.u16 d16, d16, d30 + vext.8 d16, d16, d16, #4 + +2: + vld1.16 {d20}, [\sr2], \s_strd + vld1.16 {d22}, [\src], \s_strd + vext.8 d21, d20, d20, #2 + vext.8 d23, d22, d22, #2 + vtrn.32 d20, d22 + vtrn.32 d21, d23 + vmul.i16 d18, d20, d0 + vmla.i16 d18, d21, d2 + vrshl.u16 d18, d18, d30 + + vext.8 d16, d16, d18, #4 + + vmull.u16 q8, d16, d4 + vmlal.u16 q8, d18, d6 + vrshl.u32 q8, q8, q14 + vmovn.i32 d16, q8 + subs \h, \h, #2 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d18 + b 2b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN hv + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {q10}, [\src], \s_strd + vext.8 d21, d20, d21, #2 + vmul.i16 d16, d20, d0 + vmla.i16 d16, d21, d2 + vrshl.u16 d16, d16, d30 + +4: + vld1.16 {q10}, [\sr2], \s_strd + vld1.16 {q11}, [\src], \s_strd + vext.8 d21, d20, d21, #2 + vext.8 d23, d22, d23, #2 + vswp d21, d22 + vmul.i16 q9, q10, q0 + vmla.i16 q9, q11, q1 + vrshl.u16 q9, q9, q15 + + vmull.u16 q10, d16, d4 + vmlal.u16 q10, d18, d6 + vmull.u16 q11, d18, d4 + vmlal.u16 q11, d19, d6 +.ifc \type, put + vrshl.u32 q10, q10, q14 + vrshl.u32 q11, q11, q14 + vmovn.i32 d20, q10 + vmovn.i32 d21, q11 +.else + vrshrn.i32 d20, q10, #4 + vrshrn.i32 d21, q11, #4 + vsub.i16 q10, q10, q14 +.endif + subs \h, \h, #2 + vst1.16 {d20}, [\dst, :64], \d_strd + vst1.16 {d21}, [\ds2, :64], \d_strd + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN, 16xN, ... 
hv +160: +320: +640: +1280: + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {d20, d21, d22}, [\src], \s_strd + vext.8 q11, q10, q11, #2 + vmul.i16 q8, q10, q0 + vmla.i16 q8, q11, q1 + vrshl.u16 q8, q8, q15 + +2: + vld1.16 {d20, d21, d22}, [\sr2], \s_strd + vld1.16 {d24, d25, d26}, [\src], \s_strd + vext.8 q11, q10, q11, #2 + vext.8 q13, q12, q13, #2 + vmul.i16 q9, q10, q0 + vmla.i16 q9, q11, q1 + vmul.i16 q10, q12, q0 + vmla.i16 q10, q13, q1 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + + vmull.u16 q11, d16, d4 + vmlal.u16 q11, d18, d6 + vmull.u16 q12, d17, d4 + vmlal.u16 q12, d19, d6 + vmull.u16 q8, d18, d4 + vmlal.u16 q8, d20, d6 + vmull.u16 q9, d19, d4 + vmlal.u16 q9, d21, d6 +.ifc \type, put + vrshl.u32 q11, q11, q14 + vrshl.u32 q12, q12, q14 + vrshl.u32 q8, q8, q14 + vrshl.u32 q9, q9, q14 + vmovn.i32 d22, q11 + vmovn.i32 d23, q12 + vmovn.i32 d16, q8 + vmovn.i32 d17, q9 +.else + vrshrn.i32 d22, q11, #4 + vrshrn.i32 d23, q12, #4 + vrshrn.i32 d16, q8, #4 + vrshrn.i32 d17, q9, #4 + vsub.i16 q11, q11, q14 + vsub.i16 q8, q8, q14 +.endif + subs \h, \h, #2 + vst1.16 {q11}, [\dst, :128], \d_strd + vst1.16 {q8}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q10 + b 2b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 1b +0: + pop {r4-r11,pc} +endfunc +.endm + +filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 +filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10 diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S index 95f24fc5bd61..437988cfac7d 100644 --- a/third_party/dav1d/src/arm/64/looprestoration16.S +++ b/third_party/dav1d/src/arm/64/looprestoration16.S @@ -172,13 +172,13 @@ function wiener_filter_h_16bpc_neon, export=1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
+ ext v18.16b, v2.16b, v3.16b, #6 ext v16.16b, v2.16b, v3.16b, #2 ext v17.16b, v2.16b, v3.16b, #4 - ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 ext v20.16b, v2.16b, v3.16b, #10 - ext v21.16b, v2.16b, v3.16b, #12 ushll_sz v6, v7, v18, #7, \wd + ext v21.16b, v2.16b, v3.16b, #12 smlal v6.4s, v2.4h, v0.h[0] smlal v6.4s, v16.4h, v0.h[1] smlal v6.4s, v17.4h, v0.h[2] @@ -195,13 +195,13 @@ function wiener_filter_h_16bpc_neon, export=1 smlal2 v7.4s, v20.8h, v0.h[5] smlal2 v7.4s, v21.8h, v0.h[6] .endif + ext v21.16b, v4.16b, v5.16b, #6 ext v19.16b, v4.16b, v5.16b, #2 ext v20.16b, v4.16b, v5.16b, #4 - ext v21.16b, v4.16b, v5.16b, #6 ext v22.16b, v4.16b, v5.16b, #8 ext v23.16b, v4.16b, v5.16b, #10 - ext v24.16b, v4.16b, v5.16b, #12 ushll_sz v16, v17, v21, #7, \wd + ext v24.16b, v4.16b, v5.16b, #12 smlal v16.4s, v4.4h, v0.h[0] smlal v16.4s, v19.4h, v0.h[1] smlal v16.4s, v20.4h, v0.h[2] @@ -334,9 +334,9 @@ L(variable_shift_tbl): ins v6.s[1], v7.s[0] mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 ushll v16.4s, v16.4h, #7 - add v6.4s, v6.4s, v30.4s - add v6.4s, v6.4s, v16.4s - srshl v6.4s, v6.4s, v29.4s + add v6.2s, v6.2s, v30.2s + add v6.2s, v6.2s, v16.2s + srshl v6.2s, v6.2s, v29.2s sqxtun v6.4h, v6.4s umin v6.4h, v6.4h, v24.4h sub v6.4h, v6.4h, v31.4h diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S index f6970de3c163..32ed6e901ab6 100644 --- a/third_party/dav1d/src/arm/64/mc.S +++ b/third_party/dav1d/src/arm/64/mc.S @@ -1906,11 +1906,10 @@ L(\type\()_8tap_hv): bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v28.8b, #4 - mov v19.8b, v28.8b smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] - smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v28.4h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h @@ -1919,7 +1918,7 @@ L(\type\()_8tap_hv): st1 {v2.h}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b - mov v17.8b, v19.8b + mov v17.8b, v28.8b b 2b 280: // 2x8, 2x16, 2x32 hv @@ -1956,7 +1955,6 @@ L(\type\()_8tap_hv): 28: bl L(\type\()_8tap_filter_2) ext v22.8b, v21.8b, v28.8b, #4 - mov v23.8b, v28.8b smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -1964,7 +1962,7 @@ L(\type\()_8tap_hv): smlal v2.4s, v20.4h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] - smlal v2.4s, v23.4h, v1.h[7] + smlal v2.4s, v28.4h, v1.h[7] sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h @@ -1977,7 +1975,7 @@ L(\type\()_8tap_hv): mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b - mov v21.8b, v23.8b + mov v21.8b, v28.8b b 28b 0: diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S index 7ac186302afa..c00b709e6859 100644 --- a/third_party/dav1d/src/arm/64/mc16.S +++ b/third_party/dav1d/src/arm/64/mc16.S @@ -1004,11 +1004,11 @@ function put_neon b.gt 2b ret 4: - ld1 {v0.8b}, [x2], x3 - ld1 {v1.8b}, [x2], x3 + ld1 {v0.4h}, [x2], x3 + ld1 {v1.4h}, [x2], x3 subs w5, w5, #2 - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x0], x1 + st1 {v0.4h}, [x0], x1 + st1 {v1.4h}, [x0], x1 b.gt 4b ret 80: @@ -1017,11 +1017,11 @@ function put_neon add x9, x2, x3 lsl x3, x3, #1 8: - ld1 {v0.16b}, [x2], x3 - ld1 {v1.16b}, [x9], x3 + ld1 {v0.8h}, [x2], x3 + ld1 {v1.8h}, [x9], x3 subs w5, w5, #2 - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x8], x1 + st1 {v0.8h}, [x0], x1 + st1 {v1.8h}, [x8], x1 b.gt 8b ret 16: @@ -2039,7 +2039,6 @@ L(\type\()_8tap_hv): sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 - sxtl v1.4s, v1.4h ld1 {v27.8h}, [\src], \s_strd ext v28.16b, 
v27.16b, v27.16b, #2 @@ -2049,19 +2048,23 @@ L(\type\()_8tap_hv): addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) bl L(\type\()_8tap_filter_2) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). + xtn v16.4h, v16.4s - trn1 v16.2d, v16.2d, v24.2d - mov v17.16b, v24.16b + trn1 v16.2s, v16.2s, v24.2s + mov v17.8b, v24.8b 2: bl L(\type\()_8tap_filter_2) - ext v18.16b, v17.16b, v24.16b, #8 - mov v19.16b, v24.16b - mul v2.4s, v16.4s, v1.s[0] - mla v2.4s, v17.4s, v1.s[1] - mla v2.4s, v18.4s, v1.s[2] - mla v2.4s, v19.4s, v1.s[3] + ext v18.8b, v17.8b, v24.8b, #4 + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v24.4h, v1.h[3] srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s @@ -2070,8 +2073,8 @@ L(\type\()_8tap_hv): st1 {v2.s}[0], [\dst], \d_strd st1 {v2.s}[1], [\ds2], \d_strd b.le 0f - mov v16.16b, v18.16b - mov v17.16b, v19.16b + mov v16.8b, v18.8b + mov v17.8b, v24.8b b 2b 280: // 2x8, 2x16, 2x32 hv @@ -2085,8 +2088,6 @@ L(\type\()_8tap_hv): sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 - sxtl2 v2.4s, v1.8h - sxtl v1.4s, v1.4h ld1 {v27.8h}, [\src], \s_strd ext v28.16b, v27.16b, v27.16b, #2 @@ -2095,29 +2096,33 @@ L(\type\()_8tap_hv): addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). 
bl L(\type\()_8tap_filter_2) - trn1 v16.2d, v16.2d, v24.2d - mov v17.16b, v24.16b + xtn v16.4h, v16.4s + trn1 v16.2s, v16.2s, v24.2s + mov v17.8b, v24.8b bl L(\type\()_8tap_filter_2) - ext v18.16b, v17.16b, v24.16b, #8 - mov v19.16b, v24.16b + ext v18.8b, v17.8b, v24.8b, #4 + mov v19.8b, v24.8b bl L(\type\()_8tap_filter_2) - ext v20.16b, v19.16b, v24.16b, #8 - mov v21.16b, v24.16b + ext v20.8b, v19.8b, v24.8b, #4 + mov v21.8b, v24.8b 28: bl L(\type\()_8tap_filter_2) - ext v22.16b, v21.16b, v24.16b, #8 - mov v23.16b, v24.16b - mul v3.4s, v16.4s, v1.s[0] - mla v3.4s, v17.4s, v1.s[1] - mla v3.4s, v18.4s, v1.s[2] - mla v3.4s, v19.4s, v1.s[3] - mla v3.4s, v20.4s, v2.s[0] - mla v3.4s, v21.4s, v2.s[1] - mla v3.4s, v22.4s, v2.s[2] - mla v3.4s, v23.4s, v2.s[3] + ext v22.8b, v21.8b, v24.8b, #4 + smull v3.4s, v16.4h, v1.h[0] + smlal v3.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[6] + smlal v3.4s, v24.4h, v1.h[7] srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s @@ -2126,12 +2131,12 @@ L(\type\()_8tap_hv): st1 {v3.s}[0], [\dst], \d_strd st1 {v3.s}[1], [\ds2], \d_strd b.le 0f - mov v16.16b, v18.16b - mov v17.16b, v19.16b - mov v18.16b, v20.16b - mov v19.16b, v21.16b - mov v20.16b, v22.16b - mov v21.16b, v23.16b + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v24.8b b 28b 0: @@ -2151,6 +2156,7 @@ L(\type\()_8tap_filter_2): smlal v24.4s, v27.4h, v0.h[2] smlal v24.4s, v28.4h, v0.h[3] srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + xtn v24.4h, v24.4s ret .endif diff --git a/third_party/dav1d/src/arm/looprestoration_init_tmpl.c b/third_party/dav1d/src/arm/looprestoration_init_tmpl.c index 1f18d62fc2c5..125f28c62e59 100644 --- a/third_party/dav1d/src/arm/looprestoration_init_tmpl.c +++ b/third_party/dav1d/src/arm/looprestoration_init_tmpl.c @@ -29,7 +29,6 @@ #include "src/looprestoration.h" #include "src/tables.h" -#if BITDEPTH == 8 || ARCH_AARCH64 // The 8bpc version calculates things slightly differently than the reference // C version. 
That version calculates roughly this: // int16_t sum = 0; @@ -105,6 +104,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride, } } +#if BITDEPTH == 8 || ARCH_AARCH64 void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum, const pixel (*left)[4], const pixel *src, const ptrdiff_t stride, @@ -290,8 +290,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 || ARCH_AARCH64 c->wiener = wiener_filter_neon; +#if BITDEPTH == 8 || ARCH_AARCH64 if (bpc <= 10) c->selfguided = sgr_filter_neon; #endif diff --git a/third_party/dav1d/src/arm/mc_init_tmpl.c b/third_party/dav1d/src/arm/mc_init_tmpl.c index 399ad41a45c8..75fbe3d7b08d 100644 --- a/third_party/dav1d/src/arm/mc_init_tmpl.c +++ b/third_party/dav1d/src/arm/mc_init_tmpl.c @@ -77,7 +77,6 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 || ARCH_AARCH64 init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); @@ -103,6 +102,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { c->avg = BF(dav1d_avg, neon); c->w_avg = BF(dav1d_w_avg, neon); c->mask = BF(dav1d_mask, neon); +#if BITDEPTH == 8 || ARCH_AARCH64 c->blend = BF(dav1d_blend, neon); c->blend_h = BF(dav1d_blend_h, neon); c->blend_v = BF(dav1d_blend_v, neon); diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c index f6782153c146..104dd827d256 100644 --- a/third_party/dav1d/src/decode.c +++ b/third_party/dav1d/src/decode.c @@ -773,10 +773,10 @@ static int decode_b(Dav1dTileContext *const t, signabs(t->warpmv.matrix[3]), signabs(t->warpmv.matrix[4]), signabs(t->warpmv.matrix[5]), - signabs(t->warpmv.alpha), - signabs(t->warpmv.beta), - signabs(t->warpmv.gamma), - signabs(t->warpmv.delta), + signabs(t->warpmv.u.p.alpha), + signabs(t->warpmv.u.p.beta), + signabs(t->warpmv.u.p.gamma), + signabs(t->warpmv.u.p.delta), b->mv2d.y, b->mv2d.x); #undef signabs } @@ -1843,10 +1843,10 @@ static int decode_b(Dav1dTileContext *const t, signabs(t->warpmv.matrix[3]), signabs(t->warpmv.matrix[4]), signabs(t->warpmv.matrix[5]), - signabs(t->warpmv.alpha), - signabs(t->warpmv.beta), - signabs(t->warpmv.gamma), - signabs(t->warpmv.delta), + signabs(t->warpmv.u.p.alpha), + signabs(t->warpmv.u.p.beta), + signabs(t->warpmv.u.p.gamma), + signabs(t->warpmv.u.p.delta), b->mv[0].y, b->mv[0].x); #undef signabs if (f->frame_thread.pass) { diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build index 706d4cd76f3c..61dcc9e563b5 100644 --- a/third_party/dav1d/src/meson.build +++ b/third_party/dav1d/src/meson.build @@ -82,7 +82,7 @@ libdav1d_entrypoints_sources = files( ) # ASM specific sources -libdav1d_nasm_objs = [] +libdav1d_asm_objs = [] # Arch-specific flags arch_flags = [] if is_asm_enabled @@ -102,7 +102,7 @@ if is_asm_enabled ) if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64') - libdav1d_sources += files( + libdav1d_sources_asm = files( # itx.S is used for both 8 and 16 bpc. 
'arm/64/itx.S', 'arm/64/looprestoration_common.S', @@ -110,7 +110,7 @@ if is_asm_enabled ) if dav1d_bitdepths.contains('8') - libdav1d_sources += files( + libdav1d_sources_asm += files( 'arm/64/cdef.S', 'arm/64/ipred.S', 'arm/64/loopfilter.S', @@ -120,7 +120,7 @@ if is_asm_enabled endif if dav1d_bitdepths.contains('16') - libdav1d_sources += files( + libdav1d_sources_asm += files( 'arm/64/cdef16.S', 'arm/64/ipred16.S', 'arm/64/itx16.S', @@ -130,12 +130,12 @@ if is_asm_enabled ) endif elif host_machine.cpu_family().startswith('arm') - libdav1d_sources += files( + libdav1d_sources_asm = files( 'arm/32/msac.S', ) if dav1d_bitdepths.contains('8') - libdav1d_sources += files( + libdav1d_sources_asm += files( 'arm/32/cdef.S', 'arm/32/ipred.S', 'arm/32/itx.S', @@ -146,10 +146,18 @@ if is_asm_enabled endif if dav1d_bitdepths.contains('16') - libdav1d_sources += files( + libdav1d_sources_asm += files( + 'arm/32/looprestoration16.S', + 'arm/32/mc16.S', ) endif endif + + if use_gaspp + libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm) + else + libdav1d_sources += libdav1d_sources_asm + endif elif host_machine.cpu_family().startswith('x86') libdav1d_sources += files( @@ -200,7 +208,7 @@ if is_asm_enabled endif # Compile the ASM sources with NASM - libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm) + libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm) elif host_machine.cpu() == 'ppc64le' arch_flags = ['-maltivec', '-mvsx'] libdav1d_sources += files( @@ -222,17 +230,6 @@ api_export_flags = [] # if host_machine.system() == 'windows' and get_option('default_library') != 'static' - rc_version_array = meson.project_version().split('.') - winmod = import('windows') - rc_data = configuration_data() - rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0]) - rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1]) - rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2]) - rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major) - rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor) - rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision) - rc_data.set('COPYRIGHT_YEARS', '2019') - rc_file = configure_file( input : 'dav1d.rc.in', output : 'dav1d.rc', @@ -301,7 +298,7 @@ endif libdav1d = library('dav1d', libdav1d_sources, - libdav1d_nasm_objs, + libdav1d_asm_objs, libdav1d_rc_obj, objects : [ diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c index 255c2a6198cc..27d83b53525b 100644 --- a/third_party/dav1d/src/obu.c +++ b/third_party/dav1d/src/obu.c @@ -112,6 +112,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, struct Dav1dSequenceHeaderOperatingPoint *const op = &hdr->operating_points[i]; op->idc = dav1d_get_bits(gb, 12); + if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00))) + goto error; op->major_level = 2 + dav1d_get_bits(gb, 3); op->minor_level = dav1d_get_bits(gb, 2); op->tier = op->major_level > 3 ? 
dav1d_get_bits(gb, 1) : 0; diff --git a/third_party/dav1d/src/recon_tmpl.c b/third_party/dav1d/src/recon_tmpl.c index 8e96f8e16adf..987cccb3afd7 100644 --- a/third_party/dav1d/src/recon_tmpl.c +++ b/third_party/dav1d/src/recon_tmpl.c @@ -1082,11 +1082,11 @@ static int warp_affine(Dav1dTileContext *const t, const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; const int dx = (int) (mvx >> 16) - 4; - const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 - - wmp->beta * 7) & ~0x3f; + const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 - + wmp->u.p.beta * 7) & ~0x3f; const int dy = (int) (mvy >> 16) - 4; - const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 - - wmp->delta * 4) & ~0x3f; + const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 - + wmp->u.p.delta * 4) & ~0x3f; const pixel *ref_ptr; ptrdiff_t ref_stride = refp->p.stride[!!pl]; @@ -1108,10 +1108,10 @@ static int warp_affine(Dav1dTileContext *const t, } if (dst16 != NULL) dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride, - wmp->abcd, mx, my HIGHBD_CALL_SUFFIX); + wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); else dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride, - wmp->abcd, mx, my HIGHBD_CALL_SUFFIX); + wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); } if (dst8) dst8 += 8 * PXSTRIDE(dstride); else dst16 += 8 * dstride; diff --git a/third_party/dav1d/src/tables.c b/third_party/dav1d/src/tables.c index a50bba731d6c..840b409518ca 100644 --- a/third_party/dav1d/src/tables.c +++ b/third_party/dav1d/src/tables.c @@ -391,10 +391,10 @@ const Dav1dWarpedMotionParams dav1d_default_wm_params = { 0, 0, 1 << 16, 0, 0, 1 << 16, }, - .alpha = 0, - .beta = 0, - .gamma = 0, - .delta = 0, + .u.p.alpha = 0, + .u.p.beta = 0, + .u.p.gamma = 0, + .u.p.delta = 0, }; const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = { diff --git a/third_party/dav1d/src/warpmv.c b/third_party/dav1d/src/warpmv.c index a933044f8b7b..439c4304c7a3 100644 --- a/third_party/dav1d/src/warpmv.c +++ b/third_party/dav1d/src/warpmv.c @@ -82,21 +82,21 @@ int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) { if (mat[2] <= 0) return 1; - wm->alpha = iclip_wmp(mat[2] - 0x10000); - wm->beta = iclip_wmp(mat[3]); + wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000); + wm->u.p.beta = iclip_wmp(mat[3]); int shift; const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]); const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y; const int rnd = (1 << shift) >> 1; - wm->gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1)); + wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1)); const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y; - wm->delta = iclip_wmp(mat[5] - + wm->u.p.delta = iclip_wmp(mat[5] - apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) - 0x10000); - return (4 * abs(wm->alpha) + 7 * abs(wm->beta) >= 0x10000) || - (4 * abs(wm->gamma) + 4 * abs(wm->delta) >= 0x10000); + return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) || + (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000); } static int resolve_divisor_64(const uint64_t d, int *const shift) { diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm index d1de9ba240b0..dda8234f1390 100644 --- a/third_party/dav1d/src/x86/mc_avx2.asm +++ b/third_party/dav1d/src/x86/mc_avx2.asm @@ -59,8 +59,8 @@ subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 1 subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 subpel_s_shuf2: db 0, 1, 2, 
3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 -bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 @@ -76,6 +76,7 @@ wm_422_sign: dd 0x80808080, 0x7f7f7f7f pb_64: times 4 db 64 pw_m256: times 2 dw -256 +pw_15: times 2 dw 15 pw_32: times 2 dw 32 pw_34: times 2 dw 34 pw_258: times 2 dw 258 @@ -201,10 +202,9 @@ BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 SECTION .text INIT_XMM avx2 -DECLARE_REG_TMP 4, 6, 7 cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx - lea t2, [put_avx2] + lea r7, [put_avx2] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd @@ -213,35 +213,35 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy test mxyd, mxyd jnz .v .put: - movzx wd, word [t2+wq*2+table_offset(put,)] - add wq, t2 + movzx wd, word [r7+wq*2+table_offset(put,)] + add wq, r7 jmp wq .put_w2: - movzx t0d, word [srcq+ssq*0] - movzx t1d, word [srcq+ssq*1] + movzx r6d, word [srcq+ssq*0] + movzx r7d, word [srcq+ssq*1] lea srcq, [srcq+ssq*2] - mov [dstq+dsq*0], t0w - mov [dstq+dsq*1], t1w + mov [dstq+dsq*0], r6w + mov [dstq+dsq*1], r7w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: - mov t0d, [srcq+ssq*0] - mov t1d, [srcq+ssq*1] + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - mov [dstq+dsq*0], t0d - mov [dstq+dsq*1], t1d + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: - mov t0, [srcq+ssq*0] - mov t1, [srcq+ssq*1] + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - mov [dstq+dsq*0], t0 - mov [dstq+dsq*1], t1 + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 @@ -298,17 +298,17 @@ INIT_YMM avx2 .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 - imul mxyd, 0xff01 + imul mxyd, 255 vbroadcasti128 m4, [bilin_h_shuf8] - add mxyd, 16 << 8 + add mxyd, 16 movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastw m5, xm5 test mxyd, mxyd jnz .hv - movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)] + movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] vpbroadcastd m3, [pw_2048] - add wq, t2 + add wq, r7 jmp wq .h_w2: movd xm0, [srcq+ssq*0] @@ -419,10 +419,10 @@ INIT_YMM avx2 jg .h_w64 RET .h_w128: - mov t1, -32*3 + mov r6, -32*3 .h_w128_loop: - movu m0, [srcq+t1+32*3+8*0] - movu m1, [srcq+t1+32*3+8*1] + movu m0, [srcq+r6+32*3+8*0] + movu m1, [srcq+r6+32*3+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 @@ -430,8 +430,8 @@ INIT_YMM avx2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 - mova [dstq+t1+32*3], m0 - add t1, 32 + mova [dstq+r6+32*3], m0 + add r6, 32 jle .h_w128_loop add srcq, ssq add dstq, dsq @@ -439,11 +439,11 @@ INIT_YMM avx2 jg .h_w128 RET .v: - movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)] - imul mxyd, 0xff01 + movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] + imul mxyd, 255 vpbroadcastd m5, [pw_2048] - add mxyd, 16 << 8 - add wq, t2 + add mxyd, 16 + add wq, r7 movd xm4, mxyd vpbroadcastw m4, xm4 jmp wq @@ -454,7 +454,7 @@ INIT_YMM avx2 lea srcq, [srcq+ssq*2] pinsrw xm0, xm1, 
[srcq+ssq*0], 0 ; 2 1 pshuflw xm1, xm1, q2301 ; 1 0 - punpcklbw xm1, xm0, xm1 + punpcklbw xm1, xm0 pmaddubsw xm1, xm4 pmulhrsw xm1, xm5 packuswb xm1, xm1 @@ -467,11 +467,11 @@ INIT_YMM avx2 .v_w4: movd xm0, [srcq+ssq*0] .v_w4_loop: - vpbroadcastd xm1, [srcq+ssq*1] + vpbroadcastd xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd xm2, xm1, xm0, 0x01 ; 0 1 + vpblendd xm1, xm2, xm0, 0x01 ; 0 1 vpbroadcastd xm0, [srcq+ssq*0] - vpblendd xm1, xm0, 0x02 ; 1 2 + vpblendd xm2, xm0, 0x02 ; 1 2 punpcklbw xm1, xm2 pmaddubsw xm1, xm4 pmulhrsw xm1, xm5 @@ -485,11 +485,11 @@ INIT_YMM avx2 .v_w8: movq xm0, [srcq+ssq*0] .v_w8_loop: - movq xm3, [srcq+ssq*1] + movq xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw xm1, xm3, xm0 + punpcklbw xm1, xm0, xm2 movq xm0, [srcq+ssq*0] - punpcklbw xm2, xm0, xm3 + punpcklbw xm2, xm0 pmaddubsw xm1, xm4 pmaddubsw xm2, xm4 pmulhrsw xm1, xm5 @@ -504,11 +504,11 @@ INIT_YMM avx2 .v_w16: movu xm0, [srcq+ssq*0] .v_w16_loop: - vbroadcasti128 m2, [srcq+ssq*1] + vbroadcasti128 m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd m3, m2, m0, 0x0f ; 0 1 + vpblendd m2, m3, m0, 0x0f ; 0 1 vbroadcasti128 m0, [srcq+ssq*0] - vpblendd m2, m0, 0xf0 ; 1 2 + vpblendd m3, m0, 0xf0 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 @@ -528,23 +528,23 @@ INIT_YMM avx2 %%loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw m1, m3, m0 - punpckhbw m2, m3, m0 + punpcklbw m1, m0, m3 + punpckhbw m2, m0, m3 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 - mova [dstq+dsq*0], m1 - punpcklbw m1, m0, m3 - punpckhbw m2, m0, m3 - pmaddubsw m1, m4 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 pmaddubsw m2, m4 - pmulhrsw m1, m5 + pmaddubsw m3, m4 pmulhrsw m2, m5 - packuswb m1, m2 - mova [dstq+dsq*1], m1 + pmulhrsw m3, m5 + packuswb m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg %%loop @@ -557,8 +557,8 @@ INIT_YMM avx2 .v_w64_loop: add srcq, ssq movu m3, [srcq+32*0] - punpcklbw m2, m3, m0 - punpckhbw m0, m3, m0 + punpcklbw m2, m0, m3 + punpckhbw m0, m3 pmaddubsw m2, m4 pmaddubsw m0, m4 pmulhrsw m2, m5 @@ -567,8 +567,8 @@ INIT_YMM avx2 mova m0, m3 movu m3, [srcq+32*1] mova [dstq+32*0], m2 - punpcklbw m2, m3, m1 - punpckhbw m1, m3, m1 + punpcklbw m2, m1, m3 + punpckhbw m1, m3 pmaddubsw m2, m4 pmaddubsw m1, m4 pmulhrsw m2, m5 @@ -581,28 +581,29 @@ INIT_YMM avx2 jg .v_w64_loop RET .v_w128: - mov t0, dstq - mov t1, srcq - lea t2d, [hq+(3<<8)] + lea r6d, [hq+(3<<8)] + mov r4, srcq + mov r7, dstq .v_w128_loop: PUT_BILIN_V_W32 - movzx hd, t2b - add t0, 32 - add t1, 32 - mov dstq, t0 - mov srcq, t1 - sub t2d, 1<<8 + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 jg .v_w128_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 - movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)] + movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow - vpbroadcastd m7, [pw_2048] + vpbroadcastd m7, [pw_15] movd xm6, mxyd - add wq, t2 + add wq, r7 + paddb m5, m5 vpbroadcastw m6, xm6 jmp wq .hv_w2: @@ -618,10 +619,10 @@ INIT_YMM avx2 shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _ mova xm0, xm1 psubw xm1, xm2 - paddw xm1, xm1 pmulhw xm1, xm6 + pavgw xm2, xm7 paddw xm1, xm2 - pmulhrsw xm1, xm7 + psrlw xm1, 4 packuswb xm1, xm1 pextrw [dstq+dsq*0], xm1, 0 pextrw [dstq+dsq*1], xm1, 2 @@ -643,10 +644,10 @@ INIT_YMM avx2 shufps 
xm2, xm0, xm1, q1032 ; 0 1 mova xm0, xm1 psubw xm1, xm2 - paddw xm1, xm1 pmulhw xm1, xm6 + pavgw xm2, xm7 paddw xm1, xm2 - pmulhrsw xm1, xm7 + psrlw xm1, 4 packuswb xm1, xm1 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 @@ -667,10 +668,10 @@ INIT_YMM avx2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 - paddw m1, m1 pmulhw m1, m6 + pavgw m2, m7 paddw m1, m2 - pmulhrsw m1, m7 + psrlw m1, 4 vextracti128 xm2, m1, 1 packuswb xm1, xm2 movq [dstq+dsq*0], xm1 @@ -694,16 +695,16 @@ INIT_YMM avx2 pshufb m3, m4 pmaddubsw m2, m5 psubw m1, m2, m0 - paddw m1, m1 pmulhw m1, m6 + pavgw m0, m7 paddw m1, m0 pmaddubsw m0, m3, m5 psubw m3, m0, m2 - paddw m3, m3 pmulhw m3, m6 + pavgw m2, m7 paddw m3, m2 - pmulhrsw m1, m7 - pmulhrsw m3, m7 + psrlw m1, 4 + psrlw m3, 4 packuswb m1, m3 vpermq m1, m1, q3120 mova [dstq+dsq*0], xm1 @@ -712,72 +713,65 @@ INIT_YMM avx2 sub hd, 2 jg .hv_w16_loop RET +.hv_w128: + lea r6d, [hq+(3<<16)] + jmp .hv_w32_start +.hv_w64: + lea r6d, [hq+(1<<16)] +.hv_w32_start: + mov r4, srcq + mov r7, dstq .hv_w32: - xor t2d, t2d -.hv_w32gt: - mov t0, dstq - mov t1, srcq %if WIN64 movaps r4m, xmm8 %endif .hv_w32_loop0: movu m0, [srcq+8*0] - vinserti128 m0, [srcq+8*2], 1 movu m1, [srcq+8*1] - vinserti128 m1, [srcq+8*3], 1 pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w32_loop: add srcq, ssq - movu xm2, [srcq+8*1] - vinserti128 m2, [srcq+8*3], 1 + movu m2, [srcq+8*0] + movu m3, [srcq+8*1] pshufb m2, m4 + pshufb m3, m4 pmaddubsw m2, m5 - psubw m3, m2, m1 - paddw m3, m3 - pmulhw m3, m6 - paddw m3, m1 - mova m1, m2 - pmulhrsw m8, m3, m7 - movu xm2, [srcq+8*0] - vinserti128 m2, [srcq+8*2], 1 - pshufb m2, m4 - pmaddubsw m2, m5 - psubw m3, m2, m0 - paddw m3, m3 - pmulhw m3, m6 - paddw m3, m0 + pmaddubsw m3, m5 + psubw m8, m2, m0 + pmulhw m8, m6 + pavgw m0, m7 + paddw m8, m0 mova m0, m2 - pmulhrsw m3, m7 - packuswb m3, m8 - mova [dstq], m3 + psubw m2, m3, m1 + pmulhw m2, m6 + pavgw m1, m7 + paddw m2, m1 + mova m1, m3 + psrlw m8, 4 + psrlw m2, 4 + packuswb m8, m2 + mova [dstq], m8 add dstq, dsq dec hd jg .hv_w32_loop - movzx hd, t2b - add t0, 32 - add t1, 32 - mov dstq, t0 - mov srcq, t1 - sub t2d, 1<<8 + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<16 jg .hv_w32_loop0 %if WIN64 movaps xmm8, r4m %endif RET -.hv_w64: - lea t2d, [hq+(1<<8)] - jmp .hv_w32gt -.hv_w128: - lea t2d, [hq+(3<<8)] - jmp .hv_w32gt -DECLARE_REG_TMP 3, 5, 6 cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx - lea t2, [prep%+SUFFIX] + lea r6, [prep%+SUFFIX] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd @@ -786,8 +780,8 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 test mxyd, mxyd jnz .v .prep: - movzx wd, word [t2+wq*2+table_offset(prep,)] - add wq, t2 + movzx wd, word [r6+wq*2+table_offset(prep,)] + add wq, r6 lea stride3q, [strideq*3] jmp wq .prep_w4: @@ -906,16 +900,16 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] - imul mxyd, 0xff01 + imul mxyd, 255 vbroadcasti128 m4, [bilin_h_shuf8] - add mxyd, 16 << 8 + add mxyd, 16 movd xm5, mxyd mov mxyd, r6m ; my vpbroadcastw m5, xm5 test mxyd, mxyd jnz .hv - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] - add wq, t2 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] + add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: @@ -1079,10 +1073,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 RET .v: WIN64_SPILL_XMM 
7 - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] - imul mxyd, 0xff01 - add mxyd, 16 << 8 - add wq, t2 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] + imul mxyd, 255 + add mxyd, 16 + add wq, r6 lea stride3q, [strideq*3] movd xm6, mxyd vpbroadcastw m6, xm6 @@ -1100,9 +1094,9 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4 vpblendd m1, m3, 0xaa ; 0 1 2 3 vpblendd m2, m3, 0x55 ; 1 2 3 4 - punpcklbw m2, m1 - pmaddubsw m2, m6 - mova [tmpq], m2 + punpcklbw m1, m2 + pmaddubsw m1, m6 + mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .v_w4_loop @@ -1116,15 +1110,15 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 lea srcq, [srcq+strideq*4] vpblendd m1, m0, 0x03 ; 0 2 2 2 vpbroadcastq m0, [srcq+strideq*0] - vpblendd m3, m2, 0x33 ; 1 3 1 3 - vpblendd m2, m1, m3, 0x0f ; 1 3 2 2 - vpblendd m1, m3, 0xf0 ; 0 2 1 3 - vpblendd m2, m0, 0xc0 ; 1 3 2 4 - punpcklbw m3, m2, m1 - punpckhbw m2, m1 - pmaddubsw m3, m6 + vpblendd m2, m3, 0xcc ; 1 3 1 3 + vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2 + vpblendd m2, m1, 0x0f ; 0 2 1 3 + vpblendd m3, m0, 0xc0 ; 1 3 2 4 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m1, m6 pmaddubsw m2, m6 - mova [tmpq+32*0], m3 + mova [tmpq+32*0], m1 mova [tmpq+32*1], m2 add tmpq, 32*2 sub hd, 4 @@ -1133,25 +1127,25 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v_w16: vbroadcasti128 m0, [srcq+strideq*0] .v_w16_loop: - vbroadcasti128 m1, [srcq+strideq*2] - vbroadcasti128 m2, [srcq+strideq*1] + vbroadcasti128 m1, [srcq+strideq*1] + vbroadcasti128 m2, [srcq+strideq*2] vbroadcasti128 m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - shufpd m4, m0, m1, 0x0c ; 0 2 ; 0l2l 0h2h + shufpd m4, m0, m2, 0x0c ; 0 2 vbroadcasti128 m0, [srcq+strideq*0] - shufpd m2, m2, m3, 0x0c ; 1 3 ; 1l3l 1h3h - shufpd m1, m1, m0, 0x0c ; 2 4 ; 2l4l 2h4h - punpcklbw m3, m2, m4 + shufpd m1, m3, 0x0c ; 1 3 + shufpd m2, m0, 0x0c ; 2 4 + punpcklbw m3, m4, m1 punpcklbw m5, m1, m2 + punpckhbw m4, m1 punpckhbw m1, m2 - punpckhbw m2, m4 pmaddubsw m3, m6 pmaddubsw m5, m6 - pmaddubsw m2, m6 + pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+32*0], m3 mova [tmpq+32*1], m5 - mova [tmpq+32*2], m2 + mova [tmpq+32*2], m4 mova [tmpq+32*3], m1 add tmpq, 32*4 sub hd, 4 @@ -1164,32 +1158,32 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 vpermq m2, [srcq+strideq*2], q3120 vpermq m3, [srcq+stride3q ], q3120 lea srcq, [srcq+strideq*4] - punpcklbw m4, m1, m0 - punpckhbw m5, m1, m0 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 vpermq m0, [srcq+strideq*0], q3120 pmaddubsw m4, m6 pmaddubsw m5, m6 mova [tmpq+32*0], m4 mova [tmpq+32*1], m5 - punpcklbw m4, m2, m1 - punpckhbw m5, m2, m1 + punpcklbw m4, m1, m2 + punpckhbw m1, m2 pmaddubsw m4, m6 - pmaddubsw m5, m6 - mova [tmpq+32*2], m4 - mova [tmpq+32*3], m5 - add tmpq, 32*8 - punpcklbw m4, m3, m2 - punpckhbw m5, m3, m2 - punpcklbw m1, m0, m3 - punpckhbw m2, m0, m3 - pmaddubsw m4, m6 - pmaddubsw m5, m6 pmaddubsw m1, m6 + punpcklbw m5, m2, m3 + punpckhbw m2, m3 + pmaddubsw m5, m6 pmaddubsw m2, m6 - mova [tmpq-32*4], m4 - mova [tmpq-32*3], m5 + mova [tmpq+32*2], m4 + mova [tmpq+32*3], m1 + add tmpq, 32*8 + punpcklbw m1, m3, m0 + punpckhbw m3, m0 + pmaddubsw m1, m6 + pmaddubsw m3, m6 + mova [tmpq-32*4], m5 + mova [tmpq-32*3], m2 mova [tmpq-32*2], m1 - mova [tmpq-32*1], m2 + mova [tmpq-32*1], m3 sub hd, 4 jg .v_w32_loop RET @@ -1200,14 +1194,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 vpermq m2, [srcq+strideq*1+32*0], q3120 vpermq m3, 
[srcq+strideq*1+32*1], q3120 lea srcq, [srcq+strideq*2] - punpcklbw m4, m2, m0 - punpckhbw m5, m2, m0 + punpcklbw m4, m0, m2 + punpckhbw m0, m2 pmaddubsw m4, m6 - pmaddubsw m5, m6 + pmaddubsw m0, m6 mova [tmpq+32*0], m4 - mova [tmpq+32*1], m5 - punpcklbw m4, m3, m1 - punpckhbw m5, m3, m1 + mova [tmpq+32*1], m0 + punpcklbw m4, m1, m3 + punpckhbw m5, m1, m3 vpermq m0, [srcq+strideq*0+32*0], q3120 vpermq m1, [srcq+strideq*0+32*1], q3120 pmaddubsw m4, m6 @@ -1215,52 +1209,52 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mova [tmpq+32*2], m4 mova [tmpq+32*3], m5 add tmpq, 32*8 - punpcklbw m4, m0, m2 - punpckhbw m5, m0, m2 - punpcklbw m2, m1, m3 - punpckhbw m3, m1, m3 + punpcklbw m4, m2, m0 + punpckhbw m2, m0 + punpcklbw m5, m3, m1 + punpckhbw m3, m1 pmaddubsw m4, m6 - pmaddubsw m5, m6 pmaddubsw m2, m6 + pmaddubsw m5, m6 pmaddubsw m3, m6 mova [tmpq-32*4], m4 - mova [tmpq-32*3], m5 - mova [tmpq-32*2], m2 + mova [tmpq-32*3], m2 + mova [tmpq-32*2], m5 mova [tmpq-32*1], m3 sub hd, 2 jg .v_w64_loop RET .v_w128: - mov t0, tmpq - mov t1, srcq - lea t2d, [hq+(3<<8)] + lea r6d, [hq+(3<<8)] + mov r3, srcq + mov r5, tmpq .v_w128_loop0: vpermq m0, [srcq+strideq*0], q3120 .v_w128_loop: vpermq m1, [srcq+strideq*1], q3120 lea srcq, [srcq+strideq*2] - punpcklbw m2, m1, m0 - punpckhbw m3, m1, m0 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 vpermq m0, [srcq+strideq*0], q3120 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 pmaddubsw m2, m6 pmaddubsw m3, m6 + punpcklbw m4, m1, m0 + punpckhbw m1, m0 pmaddubsw m4, m6 - pmaddubsw m5, m6 + pmaddubsw m1, m6 mova [tmpq+32*0], m2 mova [tmpq+32*1], m3 mova [tmpq+32*8], m4 - mova [tmpq+32*9], m5 + mova [tmpq+32*9], m1 add tmpq, 32*16 sub hd, 2 jg .v_w128_loop - movzx hd, t2b - add t0, 64 - add t1, 32 - mov tmpq, t0 - mov srcq, t1 - sub t2d, 1<<8 + add r3, 32 + add r5, 64 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 jg .v_w128_loop0 RET .hv: @@ -1268,11 +1262,11 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 7 - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 movd xm6, mxyd vpbroadcastw m6, xm6 - add wq, t2 + add wq, r6 lea stride3q, [strideq*3] jmp wq .hv_w4: @@ -1388,10 +1382,19 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 dec hd jg .hv_w32_loop RET +.hv_w128: + lea r3d, [hq+(7<<8)] + mov r6d, 256 + jmp .hv_w64_start .hv_w64: - mov t0, tmpq - mov t1, srcq - lea t2d, [hq+(3<<8)] + lea r3d, [hq+(3<<8)] + mov r6d, 128 +.hv_w64_start: +%if WIN64 + PUSH r7 +%endif + mov r5, srcq + mov r7, tmpq .hv_w64_loop0: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 @@ -1413,56 +1416,21 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 - mova [tmpq+32*0], m3 - add tmpq, 32*8 - mova [tmpq-32*4], m2 + mova [tmpq+r6*0], m3 + mova [tmpq+r6*1], m2 + lea tmpq, [tmpq+r6*2] sub hd, 2 jg .hv_w64_loop - movzx hd, t2b - add t0, 32 - add t1, 16 - mov tmpq, t0 - mov srcq, t1 - sub t2d, 1<<8 + add r5, 16 + add r7, 32 + movzx hd, r3b + mov srcq, r5 + mov tmpq, r7 + sub r3d, 1<<8 jg .hv_w64_loop0 - RET -.hv_w128: - mov t0, tmpq - mov t1, srcq - lea t2d, [hq+(7<<8)] -.hv_w128_loop0: - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, [srcq+strideq*0+8*1], 1 - pshufb m0, m4 - pmaddubsw m0, m5 -.hv_w128_loop: - movu xm1, 
[srcq+strideq*1+8*0] - vinserti128 m1, [srcq+strideq*1+8*1], 1 - lea srcq, [srcq+strideq*2] - movu xm2, [srcq+strideq*0+8*0] - vinserti128 m2, [srcq+strideq*0+8*1], 1 - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 - psubw m3, m1, m0 - pmulhrsw m3, m6 - paddw m3, m0 - pmaddubsw m0, m2, m5 - psubw m2, m0, m1 - pmulhrsw m2, m6 - paddw m2, m1 - mova [tmpq+32*0], m3 - mova [tmpq+32*8], m2 - add tmpq, 32*16 - sub hd, 2 - jg .hv_w128_loop - movzx hd, t2b - add t0, 32 - add t1, 16 - mov tmpq, t0 - mov srcq, t1 - sub t2d, 1<<8 - jg .hv_w128_loop0 +%if WIN64 + POP r7 +%endif RET ; int8_t subpel_filters[5][15][8] @@ -1676,12 +1644,12 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movd xm2, [srcq+ssq*0] pinsrw xm2, [srcq+ssq*1], 2 pinsrw xm2, [srcq+ssq*2], 4 - pinsrw xm2, [srcq+ss3q ], 6 ; 0 1 2 3 - lea srcq, [srcq+ssq*4] - movd xm3, [srcq+ssq*0] - vpbroadcastd xm1, [srcq+ssq*1] - vpbroadcastd xm0, [srcq+ssq*2] add srcq, ss3q + pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3 + movd xm3, [srcq+ssq*1] + vpbroadcastd xm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm3, xm1, 0x02 ; 4 5 vpblendd xm1, xm0, 0x02 ; 5 6 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 @@ -1696,10 +1664,10 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova xm2, xm3 pmaddubsw xm3, xm10 ; a2 b2 paddw xm5, xm3 - vpbroadcastd xm4, [srcq+ssq*0] - vpblendd xm3, xm0, xm4, 0x02 ; 6 7 - vpbroadcastd xm0, [srcq+ssq*1] + vpbroadcastd xm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] + vpblendd xm3, xm0, xm4, 0x02 ; 6 7 + vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm4, xm0, 0x02 ; 7 8 punpcklbw xm3, xm4 ; 67 78 pmaddubsw xm4, xm3, xm11 ; a3 b3 @@ -1716,12 +1684,12 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movd xm2, [srcq+ssq*0] pinsrd xm2, [srcq+ssq*1], 1 pinsrd xm2, [srcq+ssq*2], 2 - pinsrd xm2, [srcq+ss3q ], 3 ; 0 1 2 3 - lea srcq, [srcq+ssq*4] - movd xm3, [srcq+ssq*0] - vpbroadcastd xm1, [srcq+ssq*1] - vpbroadcastd xm0, [srcq+ssq*2] add srcq, ss3q + pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xm3, [srcq+ssq*1] + vpbroadcastd xm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm3, xm1, 0x02 ; 4 5 vpblendd xm1, xm0, 0x02 ; 5 6 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 @@ -1736,10 +1704,10 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova xm2, xm3 pmaddubsw xm3, xm10 ; a2 b2 paddw xm5, xm3 - vpbroadcastd xm4, [srcq+ssq*0] - vpblendd xm3, xm0, xm4, 0x02 ; 6 7 - vpbroadcastd xm0, [srcq+ssq*1] + vpbroadcastd xm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] + vpblendd xm3, xm0, xm4, 0x02 ; 6 7 + vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm4, xm0, 0x02 ; 7 8 punpcklbw xm3, xm4 ; 67 78 pmaddubsw xm4, xm3, xm11 ; a3 b3 @@ -1756,12 +1724,12 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movq xm1, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] vpbroadcastq m2, [srcq+ssq*2] - vpbroadcastq m5, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - vpbroadcastq m3, [srcq+ssq*0] - vpbroadcastq m6, [srcq+ssq*1] - vpbroadcastq m0, [srcq+ssq*2] add srcq, ss3q + vpbroadcastq m5, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vpbroadcastq m6, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m0, [srcq+ssq*0] vpblendd m1, m4, 0x30 vpblendd m4, m2, 0x30 punpcklbw m1, m4 ; 01 12 @@ -1772,6 +1740,8 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vpblendd m6, m0, 0x30 punpcklbw m3, m6 ; 45 56 .v_w8_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] pmaddubsw m5, m1, m8 ; a0 b0 mova m1, m2 pmaddubsw m2, m9 ; a1 b1 @@ -1779,10 +1749,8 @@ cglobal 
put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova m2, m3 pmaddubsw m3, m10 ; a2 b2 paddw m5, m3 - vpbroadcastq m4, [srcq+ssq*0] vpblendd m3, m0, m4, 0x30 - vpbroadcastq m0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] vpblendd m4, m0, 0x30 punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, m11 ; a3 b3 @@ -1800,30 +1768,28 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 .v_w32: .v_w64: .v_w128: - lea r6d, [wq-16] - mov r4, dstq - mov r7, srcq - shl r6d, 4 - mov r6b, hb + lea r6d, [wq*8-128] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*2] .v_w16_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vbroadcasti128 m0, [srcq+ssq*1] - vbroadcasti128 m6, [srcq+ssq*0] - lea srcq, [srcq+ssq*2] - vbroadcasti128 m1, [srcq+ssq*0] - vbroadcasti128 m2, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + vbroadcasti128 m6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + vbroadcasti128 m2, [srcq+ssq*2] + add srcq, ss3q vbroadcasti128 m3, [srcq+ssq*0] - shufpd m4, m4, m0, 0x0c - shufpd m5, m5, m1, 0x0c + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 punpckhbw m4, m5 ; 34 - shufpd m6, m6, m2, 0x0c + shufpd m6, m2, 0x0c punpcklbw m2, m5, m6 ; 12 punpckhbw m5, m6 ; 45 - shufpd m0, m0, m3, 0x0c + shufpd m0, m3, 0x0c punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: @@ -1861,11 +1827,11 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop - movzx hd, r6b add r4, 16 add r7, 16 - mov dstq, r4 - mov srcq, r7 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET @@ -1898,12 +1864,12 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movq xm2, [srcq+ssq*0] movhps xm2, [srcq+ssq*1] movq xm0, [srcq+ssq*2] - movhps xm0, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - vpbroadcastq m3, [srcq+ssq*0] - vpbroadcastq m4, [srcq+ssq*1] - vpbroadcastq m1, [srcq+ssq*2] add srcq, ss3q + movhps xm0, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vpbroadcastq m4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m1, [srcq+ssq*0] vpblendd m2, m3, 0x30 vpblendd m0, m1, 0x30 vpblendd m2, m4, 0xc0 @@ -1920,20 +1886,20 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pshufd xm0, xm3, q2121 punpcklwd xm3, xm0 ; 45 56 .hv_w2_loop: + movq xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm4, [srcq+ssq*0] + pshufb xm4, xm6 + pmaddubsw xm4, xm7 pmaddwd xm5, xm1, xm10 ; a0 b0 mova xm1, xm2 pmaddwd xm2, xm11 ; a1 b1 paddd xm5, xm2 mova xm2, xm3 pmaddwd xm3, xm12 ; a2 b2 - paddd xm5, xm3 - movq xm4, [srcq+ssq*0] - movhps xm4, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xm4, xm6 - pmaddubsw xm4, xm7 phaddw xm4, xm4 pmulhrsw xm4, xm8 + paddd xm5, xm3 palignr xm3, xm4, xm0, 12 mova xm0, xm4 punpcklwd xm3, xm0 ; 67 78 @@ -1954,13 +1920,13 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vpbroadcastq m2, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] vpbroadcastq m0, [srcq+ssq*2] - vpbroadcastq m5, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - vpbroadcastq m3, [srcq+ssq*0] - vpblendd m2, m4, 0xcc ; 0 1 - vpbroadcastq m4, [srcq+ssq*1] - vpbroadcastq m1, [srcq+ssq*2] add srcq, ss3q + vpbroadcastq m5, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vpblendd m2, m4, 0xcc ; 0 1 + vpbroadcastq m4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m1, [srcq+ssq*0] vpblendd m0, m5, 0xcc ; 2 3 vpblendd m3, m4, 0xcc ; 4 5 pshufb m2, m6 @@ -1981,6 +1947,8 @@ cglobal put_8tap, 4, 9, 0, 
dst, ds, src, ss, w, h, mx, my, ss3 pshufd m0, m3, q2121 punpcklwd m3, m0 ; 45 56 .hv_w4_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] pmaddwd m5, m1, m10 ; a0 b0 mova m1, m2 pmaddwd m2, m11 ; a1 b1 @@ -1988,9 +1956,7 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova m2, m3 pmaddwd m3, m12 ; a2 b2 paddd m5, m3 - vpbroadcastq m4, [srcq+ssq*0] - vpbroadcastq m3, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + vpbroadcastq m3, [srcq+ssq*0] vpblendd m4, m3, 0xcc ; 7 8 pshufb m4, m6 pmaddubsw m4, m7 @@ -2031,25 +1997,23 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 - lea r6d, [wq-8] - mov r4, dstq - mov r7, srcq - shl r6d, 5 - mov r6b, hb + lea r6d, [wq*8-64] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] - vbroadcasti128 m8, [subpel_h_shufB] - vbroadcasti128 m9, [subpel_h_shufC] movu xm4, [srcq+ssq*0] + vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - movu xm6, [srcq+ssq*0] - vbroadcasti128 m0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + vbroadcasti128 m9, [subpel_h_shufC] + movu xm6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti128 m0, [srcq+ssq*0] vpblendd m4, m0, 0xf0 ; 0 3 - vinserti128 m5, [srcq+ssq*0], 1 ; 1 4 - vinserti128 m6, [srcq+ssq*1], 1 ; 2 5 - lea srcq, [srcq+ssq*2] + vinserti128 m5, [srcq+ssq*1], 1 ; 1 4 + vinserti128 m6, [srcq+ssq*2], 1 ; 2 5 + add srcq, ss3q vinserti128 m0, [srcq+ssq*0], 1 ; 3 6 %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] pshufb %3, %1, %6 @@ -2130,11 +2094,11 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop - movzx hd, r6b add r4, 8 add r7, 8 - mov dstq, r4 - mov srcq, r7 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET @@ -2153,48 +2117,6 @@ cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pmulhrsw m0, m4 %endmacro -%macro PREP_8TAP_V_W4 5 ; round, weights - movd xm0, [srcq+strideq*0] - vpbroadcastd m1, [srcq+strideq*2] - vpbroadcastd xm2, [srcq+strideq*1] - vpbroadcastd m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _ - vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _ - vpbroadcastd m0, [srcq+strideq*0] - vpbroadcastd m2, [srcq+strideq*1] - vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _ - vpbroadcastd m0, [srcq+strideq*2] - vbroadcasti128 m5, [deint_shuf4] - vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5 - vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5 - vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _ - punpcklbw m1, m2, m3 ; 01 12 23 34 - vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6 - punpckhbw m2, m3 ; 23 34 45 56 -.v_w4_loop: - pinsrd xm0, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - vpbroadcastd m3, [srcq+strideq*0] - vpbroadcastd m4, [srcq+strideq*1] - vpblendd m3, m4, 0x20 ; _ _ 8 _ 8 9 _ _ - vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 9 _ _ - vpbroadcastd m0, [srcq+strideq*2] - vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ - pshufb m3, m5 ; 67 78 89 9a - pmaddubsw m4, m1, m%2 - vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 - pmaddubsw m2, m%3 - paddw m4, m2 - mova m2, m3 - pmaddubsw m3, m%5 - paddw m3, m4 - pmaddubsw m4, m1, m%4 - paddw m3, m4 - pmulhrsw m3, m%1 - mova [tmpq], m3 -%endmacro - %if WIN64 DECLARE_REG_TMP 6, 4 %else @@ -2347,7 +2269,45 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_w16 je .v_w8 .v_w4: - PREP_8TAP_V_W4 7, 8, 9, 10, 11 + movd xm0, [srcq+strideq*0] + vpbroadcastd m1, [srcq+strideq*2] + 
vpbroadcastd xm2, [srcq+strideq*1] + add srcq, stride3q + vpbroadcastd m3, [srcq+strideq*0] + vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _ + vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _ + vpbroadcastd m0, [srcq+strideq*1] + vpbroadcastd m2, [srcq+strideq*2] + vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _ + vpbroadcastd m0, [srcq+stride3q ] + vbroadcasti128 m5, [deint_shuf4] + vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5 + vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5 + vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _ + punpcklbw m1, m2, m3 ; 01 12 23 34 + vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6 + punpckhbw m2, m3 ; 23 34 45 56 +.v_w4_loop: + lea srcq, [srcq+strideq*4] + pinsrd xm0, [srcq+strideq*0], 1 + vpbroadcastd m3, [srcq+strideq*1] + vpbroadcastd m4, [srcq+strideq*2] + vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _ + vpbroadcastd m0, [srcq+stride3q ] + vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _ + vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ + pshufb m3, m5 ; 67 78 89 9a + pmaddubsw m4, m1, m8 + vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 + pmaddubsw m2, m9 + paddw m4, m2 + mova m2, m3 + pmaddubsw m3, m11 + paddw m3, m4 + pmaddubsw m4, m1, m10 + paddw m3, m4 + pmulhrsw m3, m7 + mova [tmpq], m3 add tmpq, 32 sub hd, 4 jg .v_w4_loop @@ -2406,11 +2366,10 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_w8_loop RET .v_w16: - lea r6d, [wq-16] - mov r5, tmpq - mov r7, srcq - shl r6d, 4 - mov r6b, hb + add wd, wd + mov r5, srcq + mov r7, tmpq + lea r6d, [hq+wq*8-256] .v_w16_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] @@ -2461,15 +2420,15 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 pmulhrsw m14, m7 pmulhrsw m15, m7 mova [tmpq+wq*0], m14 - mova [tmpq+wq*2], m15 - lea tmpq, [tmpq+wq*4] + mova [tmpq+wq*1], m15 + lea tmpq, [tmpq+wq*2] sub hd, 2 jg .v_w16_loop + add r5, 16 + add r7, 32 movzx hd, r6b - add r5, 32 - add r7, 16 - mov tmpq, r5 - mov srcq, r7 + mov srcq, r5 + mov tmpq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET @@ -2557,8 +2516,8 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vpbroadcastq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] paddd m6, m4 - paddd m5, m3 vpbroadcastq m4, [srcq+strideq*0] + paddd m5, m3 vpbroadcastq m3, [srcq+strideq*1] vpblendd m2, m4, 0xcc vpbroadcastq m4, [srcq+strideq*2] @@ -2591,18 +2550,17 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .hv_w4_loop RET .hv_w8: - lea r6d, [wq-8] - mov r5, tmpq - mov r7, srcq - shl r6d, 5 - mov r6b, hb + lea r6d, [wq*8-64] + mov r5, srcq + mov r7, tmpq + lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] - vbroadcasti128 m8, [subpel_h_shufB] - vbroadcasti128 m9, [subpel_h_shufC] movu xm4, [srcq+strideq*0] + vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] + vbroadcasti128 m9, [subpel_h_shufC] movu xm6, [srcq+strideq*0] vbroadcasti128 m0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] @@ -2676,11 +2634,11 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w8_loop + add r5, 8 + add r7, 16 movzx hd, r6b - add r5, 16 - add r7, 8 - mov tmpq, r5 - mov srcq, r7 + mov srcq, r5 + mov tmpq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm index 9ee70615f556..edbd1865645c 100644 --- a/third_party/dav1d/src/x86/mc_sse.asm +++ b/third_party/dav1d/src/x86/mc_sse.asm @@ -57,8 +57,8 @@ subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 -bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 pb_8x0_8x8: times 8 db 0 @@ -77,6 +77,7 @@ pw_m256: times 8 dw -256 pw_1: times 8 dw 1 pw_2: times 8 dw 2 pw_8: times 8 dw 8 +pw_15: times 8 dw 15 pw_26: times 8 dw 26 pw_34: times 8 dw 34 pw_512: times 8 dw 512 @@ -220,16 +221,18 @@ INIT_XMM ssse3 DECLARE_REG_TMP 7 %define base 0 %endif -; + %macro RESTORE_DSQ_32 1 %if ARCH_X86_32 mov %1, dsm ; restore dsq %endif %endmacro -; -cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak + +cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx LEA t0, put_ssse3 + movifnidn srcq, srcmp + movifnidn ssq, ssmp tzcnt wd, wm mov hd, hm test mxyd, mxyd @@ -335,20 +338,19 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 - imul mxyd, 0xff01 + imul mxyd, 0x00ff00ff mova m4, [base+bilin_h_shuf8] mova m0, [base+bilin_h_shuf4] - add mxyd, 16 << 8 + add mxyd, 0x00100010 movd m5, mxyd mov mxyd, r7m ; my - pshuflw m5, m5, q0000 - punpcklqdq m5, m5 + pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)] mova m3, [base+pw_2048] add wq, t0 - RESTORE_DSQ_32 t0 + movifnidn dsq, dsmp jmp wq .h_w2: pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5} @@ -485,23 +487,22 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak RET .v: movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)] - imul mxyd, 0xff01 + imul mxyd, 0x00ff00ff mova m5, [base+pw_2048] - add mxyd, 16 << 8 + add mxyd, 0x00100010 add wq, t0 movd m4, mxyd - pshuflw m4, m4, q0000 - punpcklqdq m4, m4 - RESTORE_DSQ_32 t0 + pshufd m4, m4, q0000 + movifnidn dsq, dsmp jmp wq .v_w2: movd m0, [srcq+ssq*0] .v_w2_loop: pinsrw m0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] - pshuflw m2, m0, q2301 + pshuflw m1, m0, q2301 pinsrw m0, [srcq+ssq*0], 0 ; 2 1 - punpcklbw m1, m0, m2 + punpcklbw m1, m0 pmaddubsw m1, m4 pmulhrsw m1, m5 packuswb m1, m1 @@ -516,11 +517,12 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak .v_w4: movd m0, [srcq+ssq*0] .v_w4_loop: - movd m1, [srcq+ssq*1] + movd m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpckldq m2, m0, m1 ; 0 1 + mova m1, m0 movd m0, [srcq+ssq*0] - punpckldq m1, m0 ; 1 2 + punpckldq m1, m2 ; 0 1 + punpckldq m2, m0 ; 1 2 punpcklbw m1, m2 pmaddubsw m1, m4 pmulhrsw m1, m5 @@ -536,11 +538,12 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak .v_w8: movq m0, [srcq+ssq*0] .v_w8_loop: - movq m3, [srcq+ssq*1] + movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw m1, m3, m0 + mova m1, m0 movq m0, [srcq+ssq*0] - punpcklbw m2, m0, m3 + punpcklbw m1, m2 + punpcklbw m2, m0 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 @@ -552,98 +555,102 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak sub hd, 2 jg .v_w8_loop RET - ; %macro PUT_BILIN_V_W16 0 movu m0, [srcq+ssq*0] %%loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw m1, m3, m0 - punpckhbw m2, m3, m0 + mova m1, m0 + mova m2, m0 movu m0, 
[srcq+ssq*0] + punpcklbw m1, m3 + punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + pmaddubsw m2, m4 + pmaddubsw m3, m4 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + packuswb m2, m3 mova [dstq+dsq*0], m1 - punpcklbw m1, m0, m3 - punpckhbw m2, m0, m3 - pmaddubsw m1, m4 - pmaddubsw m2, m4 - pmulhrsw m1, m5 - pmulhrsw m2, m5 - packuswb m1, m2 - mova [dstq+dsq*1], m1 + mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg %%loop %endmacro - ; .v_w16: PUT_BILIN_V_W16 RET -.v_w16gt: - mov r4, dstq - mov r6, srcq -.v_w16gt_loop: -%if ARCH_X86_32 - mov bakm, t0q - RESTORE_DSQ_32 t0 - PUT_BILIN_V_W16 - mov t0q, bakm -%else - PUT_BILIN_V_W16 -%endif - mov hw, t0w - add r4, mmsize - add r6, mmsize - mov dstq, r4 - mov srcq, r6 - sub t0d, 1<<16 - jg .v_w16gt - RET -.v_w32: - lea t0d, [hq+(1<<16)] +.v_w128: + lea r6d, [hq+(7<<16)] jmp .v_w16gt .v_w64: - lea t0d, [hq+(3<<16)] - jmp .v_w16gt -.v_w128: - lea t0d, [hq+(7<<16)] + lea r6d, [hq+(3<<16)] jmp .v_w16gt +.v_w32: + lea r6d, [hq+(1<<16)] +.v_w16gt: + mov r4, srcq +%if ARCH_X86_64 + mov r7, dstq +%endif +.v_w16gt_loop: + PUT_BILIN_V_W16 +%if ARCH_X86_64 + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 +%else + mov dstq, dstmp + add r4, 16 + movzx hd, r6w + add dstq, 16 + mov srcq, r4 + mov dstmp, dstq +%endif + sub r6d, 1<<16 + jg .v_w16gt + RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow - mova m7, [base+pw_2048] + mova m7, [base+pw_15] movd m6, mxyd add wq, t0 pshuflw m6, m6, q0000 + paddb m5, m5 punpcklqdq m6, m6 jmp wq .hv_w2: RESTORE_DSQ_32 t0 movd m0, [srcq+ssq*0] - pshufd m0, m0, q0000 ; src[x - src_stride] + punpckldq m0, m0 pshufb m0, m4 pmaddubsw m0, m5 .hv_w2_loop: - movd m1, [srcq+ssq*1] ; src[x] + movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - movhps m1, [srcq+ssq*0] ; src[x + src_stride] - pshufd m1, m1, q3120 + movd m2, [srcq+ssq*0] + punpckldq m1, m2 pshufb m1, m4 pmaddubsw m1, m5 ; 1 _ 2 _ shufps m2, m0, m1, q1032 ; 0 _ 1 _ mova m0, m1 - psubw m1, m2 ; src[x + src_stride] - src[x] - paddw m1, m1 - pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) - paddw m1, m2 ; src[x] + (my * (src[x + src_stride] - src[x]) - pmulhrsw m1, m7 + psubw m1, m2 ; 2 * (src[x + src_stride] - src[x]) + pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) >> 4 + pavgw m2, m7 ; src[x] + 8 + paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8 + psrlw m1, 4 packuswb m1, m1 %if ARCH_X86_64 movq r6, m1 @@ -660,8 +667,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak RET .hv_w4: mova m4, [base+bilin_h_shuf4] - RESTORE_DSQ_32 t0 movddup xm0, [srcq+ssq*0] + movifnidn dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 .hv_w4_loop: @@ -669,14 +676,14 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak lea srcq, [srcq+ssq*2] movhps m1, [srcq+ssq*0] pshufb m1, m4 - pmaddubsw m1, m5 ; 1 2 + pmaddubsw m1, m5 ; 1 2 shufps m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 - paddw m1, m1 pmulhw m1, m6 + pavgw m2, m7 paddw m1, m2 - pmulhrsw m1, m7 + psrlw m1, 4 packuswb m1, m1 movd [dstq+dsq*0], m1 psrlq m1, 32 @@ -686,28 +693,28 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak jg .hv_w4_loop RET .hv_w8: - RESTORE_DSQ_32 t0 - movu m0, [srcq+ssq*0+8*0] + movu m0, [srcq+ssq*0] + movifnidn 
dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: - movu m2, [srcq+ssq*1+8*0] + movu m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m2, m4 pmaddubsw m2, m5 psubw m1, m2, m0 - paddw m1, m1 pmulhw m1, m6 + pavgw m0, m7 paddw m1, m0 - movu m0, [srcq+ssq*0+8*0] + movu m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 psubw m3, m0, m2 - paddw m3, m3 pmulhw m3, m6 + pavgw m2, m7 paddw m3, m2 - pmulhrsw m1, m7 - pmulhrsw m3, m7 + psrlw m1, 4 + psrlw m3, 4 packuswb m1, m3 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 @@ -715,27 +722,34 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak sub hd, 2 jg .hv_w8_loop RET +.hv_w128: + lea r6d, [hq+(7<<16)] + jmp .hv_w16_start +.hv_w64: + lea r6d, [hq+(3<<16)] + jmp .hv_w16_start +.hv_w32: + lea r6d, [hq+(1<<16)] +.hv_w16_start: + mov r4, srcq +%if ARCH_X86_32 + %define m8 [dstq] +%else + mov r7, dstq +%endif .hv_w16: - xor t0d, t0d -.hv_w16gt: - mov r4, dstq - mov r6, srcq - %if WIN64 - movaps r4m, xmm8 - %endif + movifnidn dsq, dsmp +%if WIN64 + movaps r4m, m8 +%endif .hv_w16_loop0: - movu m0, [srcq+8*0] - movu m1, [srcq+8*1] + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w16_loop: -%if ARCH_X86_32 - %define m0tmp [dstq] -%else - %define m0tmp m8 -%endif add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] @@ -743,62 +757,51 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 - mova m0tmp, m2 + mova m8, m2 psubw m2, m0 - paddw m2, m2 pmulhw m2, m6 + pavgw m0, m7 paddw m2, m0 mova m0, m3 psubw m3, m1 - paddw m3, m3 pmulhw m3, m6 + pavgw m1, m7 paddw m3, m1 mova m1, m0 - mova m0, m0tmp - pmulhrsw m2, m7 - pmulhrsw m3, m7 + mova m0, m8 + psrlw m2, 4 + psrlw m3, 4 packuswb m2, m3 mova [dstq], m2 add dstq, dsmp dec hd jg .hv_w16_loop - movzx hd, t0w - add r4, mmsize - add r6, mmsize - mov dstq, r4 - mov srcq, r6 - sub t0d, 1<<16 +%if ARCH_X86_32 + mov dstq, dstm + add r4, 16 + movzx hd, r6w + add dstq, 16 + mov srcq, r4 + mov dstm, dstq +%else + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 +%endif + sub r6d, 1<<16 jg .hv_w16_loop0 - %if WIN64 - movaps xmm8, r4m - %endif +%if WIN64 + movaps m8, r4m +%endif RET -.hv_w32: - lea t0d, [hq+(1<<16)] - jmp .hv_w16gt -.hv_w64: - lea t0d, [hq+(3<<16)] - jmp .hv_w16gt -.hv_w128: - lea t0d, [hq+(7<<16)] - jmp .hv_w16gt - -%macro PSHUFB_0X1X 1-2 ; dst[, src] - %if cpuflag(ssse3) - pshufb %1, %2 - %else - punpcklbw %1, %1 - psraw %1, 8 - pshufd %1, %1, q0000 - %endif -%endmacro %macro PSHUFB_BILIN_H8 2 ; dst, src %if cpuflag(ssse3) pshufb %1, %2 %else - mova %2, %1 - psrldq %1, 1 + psrldq %2, %1, 1 punpcklbw %1, %2 %endif %endmacro @@ -807,8 +810,7 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak %if cpuflag(ssse3) pshufb %1, %2 %else - mova %2, %1 - psrldq %1, 1 + psrldq %2, %1, 1 punpckhbw %3, %1, %2 punpcklbw %1, %2 punpcklqdq %1, %3 @@ -845,17 +847,15 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak %endmacro %macro PREP_BILIN 0 - -DECLARE_REG_TMP 3, 5, 6 %if ARCH_X86_32 - %define base t2-prep%+SUFFIX + %define base r6-prep%+SUFFIX %else - %define base 0 + %define base 0 %endif cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx - LEA t2, prep%+SUFFIX + LEA r6, prep%+SUFFIX tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd @@ -865,11 +865,12 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jnz .v .prep: %if notcpuflag(ssse3) - add t2, prep_ssse3 - prep_sse2 + add r6, 
prep_ssse3 - prep_sse2 jmp prep_ssse3 %else - movzx wd, word [t2+wq*2+table_offset(prep,)] - add wq, t2 + movzx wd, word [r6+wq*2+table_offset(prep,)] + pxor m4, m4 + add wq, r6 lea stride3q, [strideq*3] jmp wq .prep_w4: @@ -877,17 +878,16 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movd m1, [srcq+strideq*1] movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] punpckldq m0, m1 punpckldq m2, m3 - lea srcq, [srcq+strideq*4] - pxor m1, m1 - punpcklbw m0, m1 - punpcklbw m2, m1 + punpcklbw m0, m4 + punpcklbw m2, m4 psllw m0, 4 psllw m2, 4 - mova [tmpq+mmsize*0], m0 - mova [tmpq+mmsize*1], m2 - add tmpq, 32 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m2 + add tmpq, 16*2 sub hd, 4 jg .prep_w4 RET @@ -897,7 +897,6 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - pxor m4, m4 punpcklbw m0, m4 punpcklbw m1, m4 punpcklbw m2, m4 @@ -915,16 +914,13 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .prep_w8 RET .prep_w16: - movq m0, [srcq+strideq*0+8*0] - movq m1, [srcq+strideq*0+8*1] - movq m2, [srcq+strideq*1+8*0] - movq m3, [srcq+strideq*1+8*1] + movu m1, [srcq+strideq*0] + movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - pxor m4, m4 - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpcklbw m3, m4 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 @@ -937,27 +933,25 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 sub hd, 2 jg .prep_w16 RET -.prep_w32: - mov t2d, 1 - jmp .prep_w32_vloop -.prep_w64: - mov t2d, 2 - jmp .prep_w32_vloop .prep_w128: - mov t2d, 4 + mov r3, -128 + jmp .prep_w32_start +.prep_w64: + mov r3, -64 + jmp .prep_w32_start +.prep_w32: + mov r3, -32 +.prep_w32_start: + sub srcq, r3 .prep_w32_vloop: - mov t1q, srcq - mov r3d, t2d + mov r6, r3 .prep_w32_hloop: - movq m0, [t1q+8*0] - movq m1, [t1q+8*1] - movq m2, [t1q+8*2] - movq m3, [t1q+8*3] - pxor m4, m4 - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpcklbw m3, m4 + movu m1, [srcq+r6+16*0] + movu m3, [srcq+r6+16*1] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 @@ -967,10 +961,9 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 - add t1q, 32 - dec r3d - jg .prep_w32_hloop - lea srcq, [srcq+strideq] + add r6, 32 + jl .prep_w32_hloop + add srcq, strideq dec hd jg .prep_w32_vloop RET @@ -978,40 +971,31 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] - imul mxyd, 0xff01 %if cpuflag(ssse3) + imul mxyd, 0x00ff00ff mova m4, [base+bilin_h_shuf8] + add mxyd, 0x00100010 +%else + imul mxyd, 0xffff + add mxyd, 16 %endif - add mxyd, 16 << 8 movd m5, mxyd mov mxyd, r6m ; my -%if cpuflag(ssse3) - pshuflw m5, m5, q0000 - punpcklqdq m5, m5 -%else - PSHUFB_0X1X m5 -%endif + pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv -%if ARCH_X86_32 - mov t1, t2 ; save base reg for w4 -%endif - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] %if notcpuflag(ssse3) WIN64_SPILL_XMM 8 pxor m6, m6 %endif - add wq, t2 - lea stride3q, [strideq*3] + add wq, r6 jmp wq .h_w4: %if cpuflag(ssse3) - %if ARCH_X86_32 - mova m4, [t1-prep_ssse3+bilin_h_shuf4] - %else - mova m4, 
[bilin_h_shuf4] - %endif + mova m4, [base+bilin_h_shuf4] %endif + lea stride3q, [strideq*3] .h_w4_loop: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] @@ -1029,6 +1013,8 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .h_w4_loop RET .h_w8: + lea stride3q, [strideq*3] +.h_w8_loop: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] @@ -1048,7 +1034,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 - jg .h_w8 + jg .h_w8_loop RET .h_w16: movu m0, [srcq+strideq*0+8*0] @@ -1072,22 +1058,23 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 sub hd, 2 jg .h_w16 RET -.h_w32: - mov t2d, 1 << 0 - jmp .h_w32_vloop -.h_w64: - mov t2d, 1 << 1 - jmp .h_w32_vloop .h_w128: - mov t2d, 1 << 3 + mov r3, -128 + jmp .h_w32_start +.h_w64: + mov r3, -64 + jmp .h_w32_start +.h_w32: + mov r3, -32 +.h_w32_start: + sub srcq, r3 .h_w32_vloop: - mov t1q, srcq - mov r3d, t2d + mov r6, r3 .h_w32_hloop: - movu m0, [t1q+8*0] - movu m1, [t1q+8*1] - movu m2, [t1q+8*2] - movu m3, [t1q+8*3] + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + movu m2, [srcq+r6+8*2] + movu m3, [srcq+r6+8*3] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 @@ -1101,11 +1088,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 - add t1q, 32 - shr r3d, 1 - jnz .h_w32_hloop - lea srcq, [srcq+strideq] - sub hd, 1 + add r6, 32 + jl .h_w32_hloop + add srcq, strideq + dec hd jg .h_w32_vloop RET .v: @@ -1113,19 +1099,19 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 %endif - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] - imul mxyd, 0xff01 - add mxyd, 16 << 8 - add wq, t2 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] +%if cpuflag(ssse3) + imul mxyd, 0x00ff00ff + add mxyd, 0x00100010 +%else + imul mxyd, 0xffff + pxor m6, m6 + add mxyd, 16 +%endif + add wq, r6 lea stride3q, [strideq*3] movd m5, mxyd -%if cpuflag(ssse3) - pshuflw m5, m5, q0000 - punpcklqdq m5, m5 -%else - PSHUFB_0X1X m5 - pxor m6, m6 -%endif + pshufd m5, m5, q0000 jmp wq .v_w4: movd m0, [srcq+strideq*0] @@ -1134,46 +1120,41 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - punpcklwd m0, m1 ; 0 1 _ _ - punpcklwd m1, m2 ; 1 2 _ _ - punpcklbw m1, m0 - PMADDUBSW m1, m5, m6, m7, 0 - pshufd m1, m1, q3120 - mova [tmpq+16*0], m1 + punpckldq m0, m1 + punpckldq m1, m2 + punpcklbw m0, m1 ; 01 12 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m0 movd m0, [srcq+strideq*0] - punpcklwd m2, m3 ; 2 3 _ _ - punpcklwd m3, m0 ; 3 4 _ _ - punpcklbw m3, m2 - PMADDUBSW m3, m5, m6, m7, 0 - pshufd m3, m3, q3120 - mova [tmpq+16*1], m3 - add tmpq, 32 + punpckldq m2, m3 + punpckldq m3, m0 + punpcklbw m2, m3 ; 23 34 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*1], m2 + add tmpq, 16*2 sub hd, 4 jg .v_w4_loop RET .v_w8: movq m0, [srcq+strideq*0] .v_w8_loop: - movq m1, [srcq+strideq*2] - movq m2, [srcq+strideq*1] + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - shufpd m4, m0, m1, 0x0c ; 0 2 - movq m0, [srcq+strideq*0] - shufpd m2, m3, 0x0c ; 1 3 - shufpd m1, m0, 0x0c ; 2 4 - punpcklbw m3, m2, m4 - PMADDUBSW m3, m5, m6, m7, 0 - mova [tmpq+16*0], m3 - punpckhbw m3, m2, m4 - PMADDUBSW m3, m5, m6, m7, 0 - mova 
[tmpq+16*2], m3 - punpcklbw m3, m1, m2 - punpckhbw m1, m2 - PMADDUBSW m3, m5, m6, m7, 0 + punpcklbw m0, m1 ; 01 + punpcklbw m1, m2 ; 12 + PMADDUBSW m0, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 - mova [tmpq+16*1], m3 - mova [tmpq+16*3], m1 + mova [tmpq+16*0], m0 + movq m0, [srcq+strideq*0] + punpcklbw m2, m3 ; 23 + punpcklbw m3, m0 ; 34 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*1], m1 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .v_w8_loop @@ -1183,48 +1164,48 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v_w16_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] - punpcklbw m3, m1, m0 - punpckhbw m4, m1, m0 - PMADDUBSW m3, m5, m6, m7, 0 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*0], m3 - mova [tmpq+16*1], m4 - punpcklbw m3, m2, m1 - punpckhbw m4, m2, m1 - PMADDUBSW m3, m5, m6, m7, 0 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*2], m3 - mova [tmpq+16*3], m4 movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - movu m0, [srcq+strideq*0] - add tmpq, 16*8 - punpcklbw m1, m3, m2 - punpckhbw m4, m3, m2 - PMADDUBSW m1, m5, m6, m7, 0 + punpcklbw m4, m0, m1 + punpckhbw m0, m1 PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq-16*4], m1 - mova [tmpq-16*3], m4 - punpcklbw m1, m0, m3 - punpckhbw m2, m0, m3 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m4 + punpcklbw m4, m1, m2 + punpckhbw m1, m2 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*1], m0 + movu m0, [srcq+strideq*0] PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*2], m4 + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*3], m1 PMADDUBSW m2, m5, m6, m7, 0 - mova [tmpq-16*2], m1 - mova [tmpq-16*1], m2 + mova [tmpq+16*4], m4 + punpcklbw m4, m3, m0 + punpckhbw m3, m0 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*5], m2 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*6], m4 + mova [tmpq+16*7], m3 + add tmpq, 16*8 sub hd, 4 jg .v_w16_loop RET -.v_w32: - lea t2d, [hq+(0<<16)] - mov t0d, 64 +.v_w128: + lea r3d, [hq+(3<<8)] + mov r6d, 256 jmp .v_w32_start .v_w64: - lea t2d, [hq+(1<<16)] - mov t0d, 128 + lea r3d, [hq+(1<<8)] + mov r6d, 128 jmp .v_w32_start -.v_w128: - lea t2d, [hq+(3<<16)] - mov t0d, 256 +.v_w32: + xor r3d, r3d + mov r6d, 64 .v_w32_start: %if ARCH_X86_64 %if WIN64 @@ -1232,7 +1213,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %endif mov r7, tmpq %endif - mov t1, srcq + mov r5, srcq .v_w32_hloop: movu m0, [srcq+strideq*0+16*0] movu m1, [srcq+strideq*0+16*1] @@ -1240,48 +1221,48 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m2, [srcq+strideq*1+16*0] movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] - punpcklbw m4, m2, m0 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*0], m4 - punpckhbw m4, m2, m0 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*1], m4 - punpcklbw m4, m3, m1 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*2], m4 - punpckhbw m4, m3, m1 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*3], m4 - add tmpq, t0q - movu m0, [srcq+strideq*0+16*0] - movu m1, [srcq+strideq*0+16*1] punpcklbw m4, m0, m2 + punpckhbw m0, m2 PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m0, m5, m6, m7, 0 mova [tmpq+16*0], m4 - punpckhbw m4, m0, m2 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*1], m4 + mova [tmpq+16*1], m0 + movu m0, [srcq+strideq*0+16*0] punpcklbw m4, m1, m3 + punpckhbw m1, m3 PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*2], m4 - punpckhbw m4, m1, m3 + mova [tmpq+16*3], m1 + movu m1, [srcq+strideq*0+16*1] + add tmpq, r6 + punpcklbw m4, m2, m0 + 
punpckhbw m2, m0 PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*3], m4 - add tmpq, t0q + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*0], m4 + mova [tmpq+16*1], m2 + punpcklbw m4, m3, m1 + punpckhbw m3, m1 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*2], m4 + mova [tmpq+16*3], m3 + add tmpq, r6 sub hd, 2 jg .v_w32_vloop - movzx hd, t2w - add t1, 32 - mov srcq, t1 + add r5, 32 + movzx hd, r3b + mov srcq, r5 %if ARCH_X86_64 - add r7, 2*16*2 + add r7, 16*4 mov tmpq, r7 %else mov tmpq, tmpmp - add tmpq, 2*16*2 + add tmpq, 16*4 mov tmpmp, tmpq %endif - sub t2d, 1<<16 + sub r3d, 1<<8 jg .v_w32_hloop %if WIN64 POP r7 @@ -1290,71 +1271,56 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] %assign stack_offset stack_offset - stack_size_padded %if cpuflag(ssse3) + imul mxyd, 0x08000800 WIN64_SPILL_XMM 8 %else - WIN64_SPILL_XMM 10 -%endif - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] -%if cpuflag(ssse3) - shl mxyd, 11 -%else + or mxyd, 1<<16 + WIN64_SPILL_XMM 9 %if ARCH_X86_64 - mova m8, [pw_8] + mova m8, [base+pw_8] %else - %define m8 [t1-prep_sse2+pw_8] + %define m8 [base+pw_8] %endif pxor m7, m7 %endif movd m6, mxyd - add wq, t2 - pshuflw m6, m6, q0000 -%if cpuflag(ssse3) - punpcklqdq m6, m6 -%elif ARCH_X86_64 - psrlw m0, m8, 3 - punpcklwd m6, m0 -%else - punpcklwd m6, [base+pw_1] -%endif -%if ARCH_X86_32 - mov t1, t2 ; save base reg for w4 -%endif - lea stride3q, [strideq*3] + add wq, r6 + pshufd m6, m6, q0000 jmp wq .hv_w4: %if cpuflag(ssse3) - %if ARCH_X86_32 - mova m4, [t1-prep_ssse3+bilin_h_shuf4] - %else - mova m4, [bilin_h_shuf4] - %endif -%endif + mova m4, [base+bilin_h_shuf4] + movddup m0, [srcq+strideq*0] +%else movhps m0, [srcq+strideq*0] +%endif + lea r3, [strideq*3] PSHUFB_BILIN_H4 m0, m4, m3 PMADDUBSW m0, m5, m7, m4, 0 ; _ 0 .hv_w4_loop: movq m1, [srcq+strideq*1] movhps m1, [srcq+strideq*2] - movq m2, [srcq+stride3q ] + movq m2, [srcq+r3 ] lea srcq, [srcq+strideq*4] movhps m2, [srcq+strideq*0] PSHUFB_BILIN_H4 m1, m4, m3 PSHUFB_BILIN_H4 m2, m4, m3 PMADDUBSW m1, m5, m7, m4, 0 ; 1 2 - shufpd m3, m0, m1, 0x01 ; 0 1 - mova m0, m2 - PMADDUBSW m0, m5, m7, m4, 0 ; 3 4 - shufpd m2, m1, m0, 0x01 ; 2 3 - psubw m1, m3 + PMADDUBSW m2, m5, m7, m4, 0 ; 3 4 + shufpd m0, m1, 0x01 ; 0 1 + shufpd m3, m1, m2, 0x01 ; 2 3 + psubw m1, m0 PMULHRSW m1, m6, m4, m8, 4 - paddw m1, m3 - psubw m3, m0, m2 - PMULHRSW m3, m6, m4, m8, 4 - paddw m3, m2 + paddw m1, m0 + mova m0, m2 + psubw m2, m3 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m3 mova [tmpq+16*0], m1 - mova [tmpq+16*1], m3 + mova [tmpq+16*1], m2 add tmpq, 32 sub hd, 4 jg .hv_w4_loop @@ -1365,7 +1331,8 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 PMADDUBSW m0, m5, m7, m4, 0 ; 0 .hv_w8_loop: movu m1, [srcq+strideq*1] - movu m2, [srcq+strideq*2] + lea srcq, [srcq+strideq*2] + movu m2, [srcq+strideq*0] PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PMADDUBSW m1, m5, m7, m4, 0 ; 1 @@ -1373,68 +1340,40 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 psubw m3, m1, m0 PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 -%if notcpuflag(ssse3) && ARCH_X86_64 - SWAP m9, m7 -%endif - psubw m7, m2, m1 - PMULHRSW m7, m6, m4, m8, 4 - paddw m7, m1 + mova m0, m2 + psubw m2, m1 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m1 mova [tmpq+16*0], m3 - mova [tmpq+16*1], m7 -%if notcpuflag(ssse3) && ARCH_X86_64 - 
SWAP m7, m9 -%endif - movu m1, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - movu m0, [srcq+strideq*0] - PSHUFB_BILIN_H8 m1, m4 - PSHUFB_BILIN_H8 m0, m4 - PMADDUBSW m1, m5, m7, m4, ARCH_X86_32 ; 3 - PMADDUBSW m0, m5, m7, m4, 0 ; 4 - psubw m3, m1, m2 - PMULHRSW m3, m6, m4, m8, 4 - paddw m3, m2 -%if notcpuflag(ssse3) && ARCH_X86_64 - SWAP m9, m7 -%endif - psubw m7, m0, m1 - PMULHRSW m7, m6, m4, m8, 4 - paddw m7, m1 - mova [tmpq+16*2], m3 - mova [tmpq+16*3], m7 -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m7, m9 - %else - pxor m7, m7 - %endif -%endif - add tmpq, 16*4 - sub hd, 4 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 2 jg .hv_w8_loop RET -.hv_w16: - mov t2d, hd - mov t0d, 32 - jmp .hv_w16_start -.hv_w32: - lea t2d, [hq+(1<<16)] - mov t0d, 64 +.hv_w128: + lea r3d, [hq+(7<<8)] + mov r5d, 256 jmp .hv_w16_start .hv_w64: - lea t2d, [hq+(3<<16)] - mov t0d, 128 + lea r3d, [hq+(3<<8)] + mov r5d, 128 jmp .hv_w16_start -.hv_w128: - lea t2d, [hq+(7<<16)] - mov t0d, 256 +.hv_w32: + lea r3d, [hq+(1<<8)] + mov r5d, 64 + jmp .hv_w16_start +.hv_w16: + xor r3d, r3d + mov r5d, 32 .hv_w16_start: +%if ARCH_X86_64 || cpuflag(ssse3) + mov r6, srcq +%endif %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r7, tmpq - mov r5, srcq %endif .hv_w16_hloop: movu m0, [srcq+strideq*0+8*0] @@ -1459,7 +1398,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 PMULHRSW m0, m6, m4, m8, 4 paddw m0, m1 mova [tmpq+16*1], m0 - add tmpq, t0q + add tmpq, r5 movu m0, [srcq+strideq*0+8*0] PSHUFB_BILIN_H8 m0, m4 PMADDUBSW m0, m5, m7, m4, 0 ; 2a @@ -1474,24 +1413,30 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 PMULHRSW m2, m6, m4, m8, 4 paddw m2, m3 mova [tmpq+16*1], m2 - add tmpq, t0q + add tmpq, r5 sub hd, 2 jg .hv_w16_vloop - movzx hd, t2w + movzx hd, r3b %if ARCH_X86_64 - add r5, 16 + add r6, 16 add r7, 2*16 - mov srcq, r5 + mov srcq, r6 mov tmpq, r7 +%elif cpuflag(ssse3) + mov tmpq, tmpm + add r6, 16 + add tmpq, 2*16 + mov srcq, r6 + mov tmpm, tmpq %else - mov srcq, srcmp - mov tmpq, tmpmp + mov srcq, srcm + mov tmpq, tmpm add srcq, 16 add tmpq, 2*16 - mov srcmp, srcq - mov tmpmp, tmpq + mov srcm, srcq + mov tmpm, tmpq %endif - sub t2d, 1<<16 + sub r3d, 1<<8 jg .hv_w16_hloop %if WIN64 POP r7 @@ -1538,13 +1483,9 @@ FN put_8tap, regular, REGULAR, REGULAR %if ARCH_X86_32 %define base_reg r1 %define base base_reg-put_ssse3 - %define W32_RESTORE_DSQ mov dsq, dsm - %define W32_RESTORE_SSQ mov ssq, ssm %else %define base_reg r8 %define base 0 - %define W32_RESTORE_DSQ - %define W32_RESTORE_SSQ %endif cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 @@ -1575,10 +1516,9 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 add wq, base_reg ; put_bilin mangling jump %assign stack_offset org_stack_offset -%if ARCH_X86_32 - mov dsq, dsm - mov ssq, ssm -%elif WIN64 + movifnidn dsq, dsmp + movifnidn ssq, ssmp +%if WIN64 pop r8 %endif lea r6, [ssq*3] @@ -1590,7 +1530,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 test myd, 0xf00 %endif jnz .hv - W32_RESTORE_SSQ + movifnidn ssq, ssmp WIN64_SPILL_XMM 12 cmp wd, 4 jl .h_w2 @@ -1604,11 +1544,10 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)] - movd m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0] - pshufd m5, m5, q0000 - movd m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4] - pshufd m6, m6, q0000 + movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3] mova m7, [base+pw_34] ; 2 + (8 << 2) + pshufd m5, m6, q0000 
+ pshufd m6, m6, q1111 add wq, base_reg jmp wq .h_w2: @@ -1620,9 +1559,9 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 dec srcq mova m4, [base+subpel_h_shuf4] movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] - pshufd m3, m3, q0000 mova m5, [base+pw_34] ; 2 + (8 << 2) - W32_RESTORE_DSQ + pshufd m3, m3, q0000 + movifnidn dsq, dsmp .h_w2_loop: movq m0, [srcq+ssq*0] movhps m0, [srcq+ssq*1] @@ -1633,10 +1572,10 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 paddw m0, m5 ; pw34 psraw m0, 6 packuswb m0, m0 - movd r4d, m0 - mov [dstq+dsq*0], r4w - shr r4d, 16 - mov [dstq+dsq*1], r4w + movd r6d, m0 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop @@ -1649,10 +1588,10 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %endif dec srcq movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] - pshufd m3, m3, q0000 - mova m5, [base+pw_34] ; 2 + (8 << 2) mova m6, [base+subpel_h_shufA] - W32_RESTORE_DSQ + mova m5, [base+pw_34] ; 2 + (8 << 2) + pshufd m3, m3, q0000 + movifnidn dsq, dsmp .h_w4_loop: movq m0, [srcq+ssq*0] ; 1 movq m1, [srcq+ssq*1] ; 2 @@ -1672,7 +1611,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 sub hd, 2 jg .h_w4_loop RET - ; %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] %if ARCH_X86_32 pshufb %2, %1, [base+subpel_h_shufB] @@ -1693,18 +1631,17 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 paddw %1, m7 ; pw34 psraw %1, 6 %endmacro - ; .h_w8: - movu m0, [srcq+ssq*0] - movu m1, [srcq+ssq*1] - PUT_8TAP_H m0, m2, m3, m4 + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] + PUT_8TAP_H m0, m2, m3, m4 PUT_8TAP_H m1, m2, m3, m4 packuswb m0, m1 %if ARCH_X86_32 - movq [dstq ], m0 + movq [dstq], m0 add dstq, dsm - movhps [dstq ], m0 + movhps [dstq], m0 add dstq, dsm %else movq [dstq+dsq*0], m0 @@ -1714,39 +1651,35 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 sub hd, 2 jg .h_w8 RET -.h_w16: - xor r6d, r6d - jmp .h_start -.h_w32: - mov r6, -16*1 - jmp .h_start -.h_w64: - mov r6, -16*3 - jmp .h_start .h_w128: - mov r6, -16*7 -.h_start: - sub srcq, r6 - sub dstq, r6 - mov r4, r6 -.h_loop: + mov r4, -16*7 + jmp .h_w16_start +.h_w64: + mov r4, -16*3 + jmp .h_w16_start +.h_w32: + mov r4, -16*1 + jmp .h_w16_start +.h_w16: + xor r4d, r4d +.h_w16_start: + sub srcq, r4 + sub dstq, r4 +.h_w16_loop_v: + mov r6, r4 +.h_w16_loop_h: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] PUT_8TAP_H m0, m2, m3, m4 PUT_8TAP_H m1, m2, m3, m4 packuswb m0, m1 mova [dstq+r6], m0 - add r6, mmsize - jle .h_loop + add r6, 16 + jle .h_w16_loop_h add srcq, ssq -%if ARCH_X86_32 - add dstq, dsm -%else - add dstq, dsq -%endif - mov r6, r4 + add dstq, dsmp dec hd - jg .h_loop + jg .h_w16_loop_v RET .v: %if ARCH_X86_32 @@ -1754,7 +1687,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 shr ssd, 16 cmp hd, 6 cmovs ssd, mxd - lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3] + movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] %else %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 @@ -1762,12 +1695,12 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - lea myq, [base_reg+myq*8+subpel_filters-put_ssse3] + movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] %endif tzcnt r6d, wd movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)] + punpcklwd m0, m0 mova m7, [base+pw_512] - psrlw m2, m7, 1 ; 0x0100 add r6, base_reg %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] @@ -1775,20 
+1708,16 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed - ALLOC_STACK -mmsize*4 + ALLOC_STACK -16*4 %assign regs_used 7 - movd m0, [ssq+0] - pshufb m0, m2 - mova subpel0, m0 - movd m0, [ssq+2] - pshufb m0, m2 - mova subpel1, m0 - movd m0, [ssq+4] - pshufb m0, m2 - mova subpel2, m0 - movd m0, [ssq+6] - pshufb m0, m2 - mova subpel3, m0 + pshufd m1, m0, q0000 + mova subpel0, m1 + pshufd m1, m0, q1111 + mova subpel1, m1 + pshufd m1, m0, q2222 + mova subpel2, m1 + pshufd m1, m0, q3333 + mova subpel3, m1 mov ssq, [rstk+stack_offset+gprsize*4] lea ssq, [ssq*3] sub srcq, ssq @@ -1799,47 +1728,46 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %define subpel1 m9 %define subpel2 m10 %define subpel3 m11 - movd subpel0, [myq+0] - pshufb subpel0, m2 - movd subpel1, [myq+2] - pshufb subpel1, m2 - movd subpel2, [myq+4] - pshufb subpel2, m2 - movd subpel3, [myq+6] - pshufb subpel3, m2 lea ss3q, [ssq*3] + pshufd m8, m0, q0000 sub srcq, ss3q + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 %endif jmp r6 .v_w2: - movd m2, [srcq+ssq*0] ; 0 - pinsrw m2, [srcq+ssq*1], 2 ; 0 1 - pinsrw m2, [srcq+ssq*2], 4 ; 0 1 2 + movd m1, [srcq+ssq*0] + movd m0, [srcq+ssq*1] %if ARCH_X86_32 lea srcq, [srcq+ssq*2] - add srcq, ssq - pinsrw m2, [srcq+ssq*0], 6 ; 0 1 2 3 - add srcq, ssq -%else - pinsrw m2, [srcq+ss3q ], 6 ; 0 1 2 3 - lea srcq, [srcq+ssq*4] -%endif - movd m3, [srcq+ssq*0] ; 4 - movd m1, [srcq+ssq*1] ; 5 - movd m0, [srcq+ssq*2] ; 6 -%if ARCH_X86_32 + movd m2, [srcq+ssq*0] + movd m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m3, [srcq+ssq*0] + movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - add srcq, ssq %else + movd m2, [srcq+ssq*2] + add srcq, ss3q + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m4, [srcq+ssq*2] add srcq, ss3q %endif - punpckldq m3, m1 ; 4 5 _ _ - punpckldq m1, m0 ; 5 6 _ _ - palignr m4, m3, m2, 4 ; 1 2 3 4 - punpcklbw m3, m1 ; 45 56 - punpcklbw m1, m2, m4 ; 01 12 - punpckhbw m2, m4 ; 23 34 + punpcklwd m1, m0 ; 0 1 + punpcklwd m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+ssq*0] + punpcklwd m2, m5 ; 2 3 + punpcklwd m5, m3 ; 3 4 + punpcklwd m3, m4 ; 4 5 + punpcklwd m4, m0 ; 5 6 + punpcklbw m2, m5 ; 23 34 + punpcklbw m3, m4 ; 45 56 .v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] pmaddubsw m5, m1, subpel0 ; a0 b0 mova m1, m2 pmaddubsw m2, subpel1 ; a1 b1 @@ -1847,17 +1775,14 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova m2, m3 pmaddubsw m3, subpel2 ; a2 b2 paddw m5, m3 - movd m4, [srcq+ssq*0] ; 7 - punpckldq m3, m0, m4 ; 6 7 _ _ - movd m0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - punpckldq m4, m0 ; 7 8 _ _ + punpcklwd m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpcklwd m4, m0 ; 7 8 punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, subpel3 ; a3 b3 paddw m5, m4 pmulhrsw m5, m7 packuswb m5, m5 - pshuflw m5, m5, q2020 movd r6d, m5 mov [dstq+dsq*0], r6w shr r6d, 16 @@ -1873,51 +1798,46 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 .v_w32: .v_w64: .v_w128: -%endif ; ARCH_X86_32 - lea r6d, [wq - 4] ; horizontal loop - mov r4, dstq -%if ARCH_X86_32 -%if STACK_ALIGNMENT < mmsize - %define srcm [rsp+mmsize*4+gprsize] + shl wd, 14 +%if STACK_ALIGNMENT < 16 + %define dstm [rsp+mmsize*4+gprsize] + mov dstm, dstq %endif - mov srcm, srcq -%else - mov r7, srcq -%endif - shl r6d, (16 - 2) ; (wq / 4) << 16 - mov r6w, hw + lea r6d, [hq+wq-(1<<16)] + mov r4, srcq 
.v_w4_loop0: - movd m2, [srcq+ssq*0] ; 0 - movhps m2, [srcq+ssq*2] ; 0 _ 2 - movd m3, [srcq+ssq*1] ; 1 -%if ARCH_X86_32 - lea srcq, [srcq+ssq*2] - add srcq, ssq - movhps m3, [srcq+ssq*0] ; 1 _ 3 - lea srcq, [srcq+ssq*1] -%else - movhps m3, [srcq+ss3q ] ; 1 _ 3 - lea srcq, [srcq+ssq*4] %endif - pshufd m2, m2, q2020 ; 0 2 0 2 - pshufd m3, m3, q2020 ; 1 3 1 3 - punpckldq m2, m3 ; 0 1 2 3 - movd m3, [srcq+ssq*0] ; 4 - movd m1, [srcq+ssq*1] ; 5 - movd m0, [srcq+ssq*2] ; 6 + movd m1, [srcq+ssq*0] + movd m0, [srcq+ssq*1] %if ARCH_X86_32 lea srcq, [srcq+ssq*2] - add srcq, ssq + movd m2, [srcq+ssq*0] + movd m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m3, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] %else + movd m2, [srcq+ssq*2] + add srcq, ss3q + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m4, [srcq+ssq*2] add srcq, ss3q %endif - punpckldq m3, m1 ; 4 5 _ _ - punpckldq m1, m0 ; 5 6 _ _ - palignr m4, m3, m2, 4 ; 1 2 3 4 - punpcklbw m3, m1 ; 45 56 - punpcklbw m1, m2, m4 ; 01 12 - punpckhbw m2, m4 ; 23 34 + punpckldq m1, m0 ; 0 1 + punpckldq m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+ssq*0] + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m4 ; 4 5 + punpckldq m4, m0 ; 5 6 + punpcklbw m2, m5 ; 23 34 + punpcklbw m3, m4 ; 45 56 .v_w4_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] pmaddubsw m5, m1, subpel0 ; a0 b0 mova m1, m2 pmaddubsw m2, subpel1 ; a1 b1 @@ -1925,10 +1845,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova m2, m3 pmaddubsw m3, subpel2 ; a2 b2 paddw m5, m3 - movd m4, [srcq+ssq*0] punpckldq m3, m0, m4 ; 6 7 _ _ - movd m0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + movd m0, [srcq+ssq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, subpel3 ; a3 b3 @@ -1936,24 +1854,21 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pmulhrsw m5, m7 packuswb m5, m5 movd [dstq+dsq*0], m5 - pshufd m5, m5, q0101 + psrlq m5, 32 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop - mov hw, r6w ; reset vertical loop - add r4, 4 - mov dstq, r4 %if ARCH_X86_32 - mov srcq, srcm - add srcq, 4 - mov srcm, srcq -%else - add r7, 4 - mov srcq, r7 -%endif - sub r6d, 1<<16 ; horizontal-- + mov dstq, dstm + add r4, 4 + movzx hd, r6w + add dstq, 4 + mov srcq, r4 + mov dstm, dstq + sub r6d, 1<<16 jg .v_w4_loop0 +%endif RET %if ARCH_X86_64 .v_w8: @@ -1961,56 +1876,51 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 .v_w32: .v_w64: .v_w128: - lea r6d, [wq - 8] ; horizontal loop - mov r4, dstq - mov r7, srcq - shl r6d, 8 - 3; (wq / 8) << 8 - mov r6b, hb + lea r6d, [wq*8-64] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*4] .v_w8_loop0: - movq m4, [srcq+ssq*0] ; 0 - movq m5, [srcq+ssq*1] ; 1 - lea srcq, [srcq+ssq*2] - movq m6, [srcq+ssq*0] ; 2 - movq m0, [srcq+ssq*1] ; 3 - lea srcq, [srcq+ssq*2] - movq m1, [srcq+ssq*0] ; 4 - movq m2, [srcq+ssq*1] ; 5 - lea srcq, [srcq+ssq*2] ; - movq m3, [srcq+ssq*0] ; 6 - shufpd m4, m0, 0x0c - shufpd m5, m1, 0x0c - punpcklbw m1, m4, m5 ; 01 - punpckhbw m4, m5 ; 34 - shufpd m6, m2, 0x0c - punpcklbw m2, m5, m6 ; 12 - punpckhbw m5, m6 ; 45 - shufpd m0, m3, 0x0c - punpcklbw m3, m6, m0 ; 23 - punpckhbw m6, m0 ; 56 + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, ss3q + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, ss3q + movq m0, [srcq+ssq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw m4, m5 ; 34 + punpcklbw m5, m6 ; 45 + punpcklbw 
m6, m0 ; 56 .v_w8_loop: - movq m12, [srcq+ssq*1] ; 8 + movq m13, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - movq m13, [srcq+ssq*0] ; 9 pmaddubsw m14, m1, subpel0 ; a0 - pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 + pmaddubsw m15, m2, subpel0 ; b0 mova m2, m4 pmaddubsw m3, subpel1 ; a1 + mova m12, m0 pmaddubsw m4, subpel1 ; b1 + movq m0, [srcq+ssq*0] paddw m14, m3 paddw m15, m4 mova m3, m5 - mova m4, m6 pmaddubsw m5, subpel2 ; a2 + mova m4, m6 pmaddubsw m6, subpel2 ; b2 + punpcklbw m12, m13 ; 67 + punpcklbw m13, m0 ; 78 paddw m14, m5 + mova m5, m12 + pmaddubsw m12, subpel3 ; a3 paddw m15, m6 - shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c - punpcklbw m5, m6, m0 ; 67 - punpckhbw m6, m0 ; 78 - pmaddubsw m12, m5, subpel3 ; a3 - pmaddubsw m13, m6, subpel3 ; b3 + mova m6, m13 + pmaddubsw m13, subpel3 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 @@ -2021,12 +1931,12 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop - movzx hd, r6b ; reset vertical loop add r4, 8 add r7, 8 - mov dstq, r4 - mov srcq, r7 - sub r6d, 1<<8 ; horizontal-- + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 @@ -2051,7 +1961,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 cmp hd, 6 cmovs ssd, mxd movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] - W32_RESTORE_SSQ + mov ssq, ssmp lea r6, [ssq*3] sub srcq, r6 %define base_reg r6 @@ -2064,7 +1974,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %define subpelv1 [rsp+mmsize*1] %define subpelv2 [rsp+mmsize*2] %define subpelv3 [rsp+mmsize*3] - punpcklqdq m0, m0 punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m6, m0, q0000 @@ -2088,7 +1997,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %define subpelv1 m11 %define subpelv2 m12 %define subpelv3 m13 - punpcklqdq m0, m0 punpcklbw m0, m0 psraw m0, 8 ; sign-extend mova m8, [base+pw_8192] @@ -2103,22 +2011,21 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 je .hv_w4 .hv_w2: mova m6, [base+subpel_h_shuf4] - ; movq m2, [srcq+ssq*0] ; 0 movhps m2, [srcq+ssq*1] ; 0 _ 1 - movq m0, [srcq+ssq*2] ; 2 %if ARCH_X86_32 %define w8192reg [base+pw_8192] %define d512reg [base+pd_512] lea srcq, [srcq+ssq*2] - add srcq, ssq - movhps m0, [srcq+ssq*0] ; 2 _ 3 - lea srcq, [srcq+ssq*1] + movq m0, [srcq+ssq*0] ; 2 + movhps m0, [srcq+ssq*1] ; 2 _ 3 + lea srcq, [srcq+ssq*2] %else %define w8192reg m8 %define d512reg m9 - movhps m0, [srcq+ss3q ] ; 2 _ 3 - lea srcq, [srcq+ssq*4] + movq m0, [srcq+ssq*2] ; 2 + add srcq, ss3q + movhps m0, [srcq+ssq*0] ; 2 _ 3 %endif pshufb m2, m6 ; 0 ~ 1 ~ pshufb m0, m6 ; 2 ~ 3 ~ @@ -2126,43 +2033,42 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pmaddubsw m0, m7 ; subpel_filters phaddw m2, m0 ; 0 1 2 3 pmulhrsw m2, w8192reg - ; +%if ARCH_X86_32 movq m3, [srcq+ssq*0] ; 4 movhps m3, [srcq+ssq*1] ; 4 _ 5 - movq m0, [srcq+ssq*2] ; 6 -%if ARCH_X86_32 lea srcq, [srcq+ssq*2] - add srcq, ssq %else + movq m3, [srcq+ssq*1] ; 4 + movhps m3, [srcq+ssq*2] ; 4 _ 5 add srcq, ss3q %endif + movq m0, [srcq+ssq*0] ; 6 pshufb m3, m6 ; 4 ~ 5 ~ pshufb m0, m6 ; 6 ~ pmaddubsw m3, m7 ; subpel_filters pmaddubsw m0, m7 ; subpel_filters phaddw m3, m0 ; 4 5 6 _ pmulhrsw m3, w8192reg - ; palignr m4, m3, m2, 4; V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2 punpckhwd m2, m4 ; V 23 34 2 3 3 4 pshufd m0, m3, q2121; V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 4 5 5 6 .hv_w2_loop: + movq m4, [srcq+ssq*1] ; V 7 + lea srcq, [srcq+ssq*2] ; V + movhps m4, 
[srcq+ssq*0] ; V 7 8 + pshufb m4, m6 + pmaddubsw m4, m7 pmaddwd m5, m1, subpelv0; V a0 b0 mova m1, m2 ; V pmaddwd m2, subpelv1 ; V a1 b1 paddd m5, m2 ; V mova m2, m3 ; V pmaddwd m3, subpelv2 ; a2 b2 - paddd m5, m3 ; V - movq m4, [srcq+ssq*0] ; V 7 - movhps m4, [srcq+ssq*1] ; V 7 8 - lea srcq, [srcq+ssq*2] ; V - pshufb m4, m6 - pmaddubsw m4, m7 phaddw m4, m4 pmulhrsw m4, w8192reg + paddd m5, m3 ; V palignr m3, m4, m0, 12 mova m0, m4 punpcklwd m3, m0 ; V 67 78 @@ -2182,7 +2088,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 RET %undef w8192reg %undef d512reg - ; .hv_w4: %define hv4_line_0_0 4 %define hv4_line_0_1 5 @@ -2194,14 +2099,12 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %define hv4_line_1_1 11 %define hv4_line_1_2 12 %define hv4_line_1_3 13 - ; %macro SAVELINE_W4 3 mova [rsp+mmsize*hv4_line_%3_%2], %1 %endmacro %macro RESTORELINE_W4 3 mova %1, [rsp+mmsize*hv4_line_%3_%2] %endmacro - ; %if ARCH_X86_32 %define w8192reg [base+pw_8192] %define d512reg [base+pd_512] @@ -2213,13 +2116,13 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova m6, [base+subpel_h_shuf4] movq m5, [srcq+ssq*0] ; 0 _ _ _ movhps m5, [srcq+ssq*1] ; 0 _ 1 _ - movq m4, [srcq+ssq*2] ; 2 _ _ _ %if ARCH_X86_32 lea srcq, [srcq+ssq*2] - add srcq, ssq - movhps m4, [srcq+ssq*0] ; 2 _ 3 _ - add srcq, ssq + movq m4, [srcq+ssq*0] ; 2 _ _ _ + movhps m4, [srcq+ssq*1] ; 2 _ 3 _ + lea srcq, [srcq+ssq*2] %else + movq m4, [srcq+ssq*2] ; 2 _ _ _ movhps m4, [srcq+ss3q ] ; 2 _ 3 _ lea srcq, [srcq+ssq*4] %endif @@ -2243,7 +2146,14 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova m6, [base+subpel_h_shuf4] movq m5, [srcq+ssq*0] ; 4 _ _ _ movhps m5, [srcq+ssq*1] ; 4 _ 5 _ +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movq m4, [srcq+ssq*0] ; 6 _ _ _ + add srcq, ssq +%else movq m4, [srcq+ssq*2] ; 6 _ _ _ + add srcq, ss3q +%endif pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ pmaddubsw m3, m7 ;H subpel_filters @@ -2259,13 +2169,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pmaddubsw m0, m7 ;H subpel_filters phaddw m3, m0 ;H 4 5 6 7 pmulhrsw m3, w8192reg ;H pw_8192 - ; -%if ARCH_X86_32 - lea srcq, [srcq+ssq*2] - add srcq, ssq -%else - add srcq, ss3q -%endif ;process high palignr m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 @@ -2293,7 +2196,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 - ; mova m6, [base+subpel_h_shuf4] movq m4, [srcq+ssq*0] ; 7 movhps m4, [srcq+ssq*1] ; 7 _ 8 _ @@ -2325,10 +2227,10 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 - ; mova m6, [base+subpel_h_shuf4+16] movq m4, [srcq+ssq*0] ; 7 movhps m4, [srcq+ssq*1] ; 7 _ 8 _ + lea srcq, [srcq+ssq*2] pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ pmaddubsw m4, m7 ;H subpel_filters phaddw m4, m4 ;H 7 8 7 8 @@ -2340,12 +2242,10 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 paddd m5, d512reg ; pd_512 paddd m5, m4 psrad m4, m5, 10 - ; RESTORELINE_W4 m5, 5, 0 packssdw m5, m4 ; d -> w packuswb m5, m5 ; w -> b pshuflw m5, m5, q3120 - lea srcq, [srcq+ssq*2] movd [dstq+dsq*0], m5 psrlq m5, 32 movd [dstq+dsq*1], m5 @@ -2365,7 +2265,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %undef subpelv1 %undef subpelv2 %undef subpelv3 - ; .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 @@ -2400,7 +2299,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, 
ss, w, h, mx, my, ss3 mov ssq, ssmp ALLOC_STACK -mmsize*13 %if STACK_ALIGNMENT < 16 - %define srcm [rsp+mmsize*13+gprsize*1] + %define dstm [rsp+mmsize*13+gprsize*1] %define dsm [rsp+mmsize*13+gprsize*2] mov r6, [rstk+stack_offset+gprsize*2] mov dsm, r6 @@ -2420,10 +2319,10 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 mova subpelv2, m4 mova subpelv3, m5 lea r6, [ssq*3] + mov dstm, dstq sub srcq, r6 - mov srcm, srcq %else - ALLOC_STACK mmsize*5, 16 + ALLOC_STACK 16*5, 16 %define subpelh0 m10 %define subpelh1 m11 %define subpelv0 m12 @@ -2440,7 +2339,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movq m1, [base_reg+myq*8+subpel_filters-put_ssse3] pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 - punpcklqdq m1, m1 punpcklbw m1, m1 psraw m1, 8 ; sign-extend pshufd subpelv0, m1, q0000 @@ -2448,18 +2346,18 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 lea ss3q, [ssq*3] + mov r7, dstq sub srcq, ss3q - mov r7, srcq %endif - lea r6d, [wq-4] - mov r4, dstq - shl r6d, (16 - 2) - mov r6w, hw + shl wd, 14 + lea r6d, [hq+wq-(1<<16)] + mov r4, srcq .hv_w8_loop0: movu m4, [srcq+ssq*0] ; 0 = _ _ movu m5, [srcq+ssq*1] ; 1 = _ _ +%if ARCH_X86_32 lea srcq, [srcq+ssq*2] - ; +%endif %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] %if ARCH_X86_32 pshufb %3, %1, [base+subpel_h_shufB] @@ -2478,7 +2376,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 paddw %1, %3 ; A0+C4 phaddw %1, %2 %endmacro - ; %if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] @@ -2486,12 +2383,17 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %endif HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ +%if ARCH_X86_32 movu m6, [srcq+ssq*0] ; 2 = _ _ movu m0, [srcq+ssq*1] ; 3 = _ _ lea srcq, [srcq+ssq*2] +%else + movu m6, [srcq+ssq*2] ; 2 = _ _ + add srcq, ss3q + movu m0, [srcq+ssq*0] ; 3 = _ _ +%endif HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ - ; mova m7, [base+pw_8192] pmulhrsw m4, m7 ; H pw_8192 pmulhrsw m5, m7 ; H pw_8192 @@ -2503,11 +2405,16 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 - ; mova m7, [base+subpel_h_shufA] +%if ARCH_X86_32 movu m4, [srcq+ssq*0] ; 4 = _ _ movu m5, [srcq+ssq*1] ; 5 = _ _ lea srcq, [srcq+ssq*2] +%else + movu m4, [srcq+ssq*1] ; 4 = _ _ + movu m5, [srcq+ssq*2] ; 5 = _ _ + add srcq, ss3q +%endif movu m6, [srcq+ssq*0] ; 6 = _ _ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ @@ -2519,7 +2426,6 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 punpcklwd m4, m0, m1 ; 3 4 ~ punpcklwd m5, m1, m2 ; 4 5 ~ punpcklwd m6, m2, m3 ; 5 6 ~ - ; SAVELINE_W8 6, m3 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 @@ -2603,16 +2509,19 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 RESTORELINE_W8 4, m4 jmp .hv_w8_loop .hv_w8_outer: - movzx hd, r6w - add r4, 4 - mov dstq, r4 %if ARCH_X86_32 - mov srcq, srcm - add srcq, 4 - mov srcm, srcq + mov dstq, dstm + add r4, 4 + movzx hd, r6w + add dstq, 4 + mov srcq, r4 + mov dstm, dstq %else + add r4, 4 add r7, 4 - mov srcq, r7 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 %endif sub r6d, 1<<16 jg .hv_w8_loop0 @@ -2836,7 +2745,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 
4tap_v - movsxd wq, wm + mov wd, wm movifnidn srcd, srcm movifnidn hd, hm test mxd, 0xf00 @@ -2846,6 +2755,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 LEA base_reg, prep_ssse3 tzcnt wd, wd movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] + pxor m4, m4 add wq, base_reg movifnidn strided, stridem lea r6, [strideq*3] @@ -2885,16 +2795,13 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] - movd m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0] - pshufd m5, m5, q0000 - movd m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4] - pshufd m6, m6, q0000 + movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) mova m7, [base+pw_8192] + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 %else - punpcklbw m5, m5 punpcklbw m6, m6 - psraw m5, 8 psraw m6, 8 %if ARCH_X86_64 mova m7, [pw_2] @@ -2902,6 +2809,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %else %define m15 m4 %endif + pshufd m5, m6, q1010 + punpckhqdq m6, m6 %endif add wq, base_reg jmp wq @@ -2913,10 +2822,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif dec srcq movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] - pshufd m4, m4, q0000 %if cpuflag(ssse3) mova m6, [base+pw_8192] mova m5, [base+subpel_h_shufA] + pshufd m4, m4, q0000 %else mova m6, [base+pw_2] %if ARCH_X86_64 @@ -2926,6 +2835,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif punpcklbw m4, m4 psraw m4, 8 + punpcklqdq m4, m4 %endif %if ARCH_X86_64 lea stride3q, [strideq*3] @@ -3089,11 +2999,14 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - lea myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) mova m2, [base+pw_512] - psrlw m2, m2, 1 ; 0x0100 mova m7, [base+pw_8192] + punpcklwd m0, m0 +%else + punpcklbw m0, m0 + psraw m0, 8 %endif %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] @@ -3107,35 +3020,27 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 ALLOC_STACK -mmsize*5 %endif %assign regs_used 7 - movd m0, [myq+0] - PSHUFB_0X1X m0, m2 - mova subpel0, m0 - movd m0, [myq+2] - PSHUFB_0X1X m0, m2 - mova subpel1, m0 - movd m0, [myq+4] - PSHUFB_0X1X m0, m2 - mova subpel2, m0 - movd m0, [myq+6] - PSHUFB_0X1X m0, m2 - mova subpel3, m0 mov strideq, [rstk+stack_offset+gprsize*3] + pshufd m1, m0, q0000 + mova subpel0, m1 + pshufd m1, m0, q1111 + mova subpel1, m1 lea r5, [strideq*3] + pshufd m1, m0, q2222 + mova subpel2, m1 + pshufd m1, m0, q3333 + mova subpel3, m1 sub srcq, r5 %else %define subpel0 m8 %define subpel1 m9 %define subpel2 m10 %define subpel3 m11 - movd subpel0, [myq+0] - PSHUFB_0X1X subpel0, m2 - movd subpel1, [myq+2] - PSHUFB_0X1X subpel1, m2 - movd subpel2, [myq+4] - PSHUFB_0X1X subpel2, m2 - movd subpel3, [myq+6] - PSHUFB_0X1X subpel3, m2 + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 lea stride3q, [strideq*3] + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 sub srcq, stride3q cmp wd, 8 jns .v_w8 @@ -3159,35 +3064,34 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mov r5w, hw .v_w4_loop0: %endif - movd m2, [srcq+strideq*0] ; 0 - movhps m2, [srcq+strideq*2] ; 0 _ 2 - movd m3, [srcq+strideq*1] ; 1 + movd m1, [srcq+strideq*0] + movd m0, [srcq+strideq*1] %if ARCH_X86_32 lea srcq, [srcq+strideq*2] - movhps m3, [srcq+strideq*1] ; 1 _ 3 + movd m2, [srcq+strideq*0] + 
movd m4, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movd m3, [srcq+strideq*0] + movd m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] %else - movhps m3, [srcq+stride3q ] ; 1 _ 3 - lea srcq, [srcq+strideq*4] -%endif - pshufd m2, m2, q2020 ; 0 2 0 2 - pshufd m3, m3, q2020 ; 1 3 1 3 - punpckldq m2, m3 ; 0 1 2 3 - movd m3, [srcq+strideq*0] ; 4 - movd m1, [srcq+strideq*1] ; 5 - movd m0, [srcq+strideq*2] ; 6 -%if ARCH_X86_32 - lea srcq, [srcq+strideq*2] - add srcq, strideq -%else + movd m2, [srcq+strideq*2] + add srcq, stride3q + movd m4, [srcq+strideq*0] + movd m3, [srcq+strideq*1] + movd m5, [srcq+strideq*2] add srcq, stride3q %endif - punpckldq m3, m1 ; 4 5 _ _ - punpckldq m1, m0 ; 5 6 _ _ - PALIGNR m4, m3, m2, 4 ; 1 2 3 4 - punpcklbw m3, m1 ; 45 56 - punpcklbw m1, m2, m4 ; 01 12 - punpckhbw m2, m4 ; 23 34 + punpckldq m1, m0 ; 0 1 + punpckldq m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+strideq*0] + punpckldq m2, m4 ; 2 3 + punpckldq m4, m3 ; 3 4 + punpckldq m3, m5 ; 4 5 + punpckldq m5, m0 ; 5 6 + punpcklbw m2, m4 ; 23 34 + punpcklbw m3, m5 ; 45 56 .v_w4_loop: %if ARCH_X86_32 && notcpuflag(ssse3) mova m7, subpel0 @@ -3208,11 +3112,11 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif mova m2, m3 PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2 - paddw m5, m3 - movd m4, [srcq+strideq*0] - punpckldq m3, m0, m4 ; 6 7 _ _ - movd m0, [srcq+strideq*1] + movd m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] + paddw m5, m3 + punpckldq m3, m0, m4 ; 6 7 _ _ + movd m0, [srcq+strideq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 %if notcpuflag(ssse3) @@ -3242,50 +3146,43 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 sub hd, 2 jg .v_w4_loop %if ARCH_X86_32 - mov hw, r5w ; reset vertical loop - mov tmpq, tmpm mov srcq, srcm - add tmpq, 8 + mov tmpq, tmpm + movzx hd, r5w add srcq, 4 - mov tmpm, tmpq + add tmpq, 8 mov srcm, srcq + mov tmpm, tmpq sub r5d, 1<<16 ; horizontal-- jg .v_w4_loop0 %endif RET %if ARCH_X86_64 .v_w8: - lea r5d, [wq - 8] ; horizontal loop + lea r6d, [wq*8-64] + mov r5, srcq mov r8, tmpq - mov r6, srcq - shl r5d, 8 - 3; (wq / 8) << 8 - mov r5b, hb + lea r6d, [hq+r6*4] .v_w8_loop0: - movq m4, [srcq+strideq*0] - movq m5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movq m6, [srcq+strideq*0] - movq m0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] movq m1, [srcq+strideq*0] movq m2, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movq m3, [srcq+strideq*0] - shufpd m4, m0, 0x0c - shufpd m5, m1, 0x0c - punpcklbw m1, m4, m5 ; 01 - punpckhbw m4, m5 ; 34 - shufpd m6, m2, 0x0c - punpcklbw m2, m5, m6 ; 12 - punpckhbw m5, m6 ; 45 - shufpd m0, m3, 0x0c - punpcklbw m3, m6, m0 ; 23 - punpckhbw m6, m0 ; 56 + movq m3, [srcq+strideq*2] + add srcq, stride3q + movq m4, [srcq+strideq*0] + movq m5, [srcq+strideq*1] + movq m6, [srcq+strideq*2] + add srcq, stride3q + movq m0, [srcq+strideq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw m4, m5 ; 34 + punpcklbw m5, m6 ; 45 + punpcklbw m6, m0 ; 56 .v_w8_loop: -%if cpuflag(ssse3) - movq m12, [srcq+strideq*1] + movq m13, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - movq m13, [srcq+strideq*0] +%if cpuflag(ssse3) pmaddubsw m14, m1, subpel0 ; a0 pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 @@ -3298,64 +3195,59 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mova m4, m6 pmaddubsw m5, subpel2 ; a2 pmaddubsw m6, subpel2 ; b2 + punpcklbw m12, m0, m13 ; 67 + movq m0, [srcq+strideq*0] + punpcklbw m13, m0 ; 78 paddw m14, m5 + mova m5, m12 + 
pmaddubsw m12, subpel3 ; a3 paddw m15, m6 - shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c - punpcklbw m5, m6, m0 ; 67 - punpckhbw m6, m0 ; 78 - pmaddubsw m12, m5, subpel3 ; a3 - pmaddubsw m13, m6, subpel3 ; b3 + mova m6, m13 + pmaddubsw m13, subpel3 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 - movu [tmpq+wq*0], m14 - movu [tmpq+wq*2], m15 %else mova m14, m1 PMADDUBSW m14, subpel0, m7, m12, 1 ; a0 + mova m15, m2 + PMADDUBSW m15, subpel0, m7, m12, 0 ; b0 mova m1, m3 PMADDUBSW m3, subpel1, m7, m12, 0 ; a1 + mova m2, m4 + PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 paddw m14, m3 mova m3, m5 PMADDUBSW m5, subpel2, m7, m12, 0 ; a2 - paddw m14, m5 - movq m12, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movq m13, [srcq+strideq*0] - shufpd m15, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c - punpcklbw m5, m15, m0 ; 67 - punpckhbw m15, m0 ; 78 - mova m13, m5 - PMADDUBSW m13, subpel3, m7, m12, 0 ; a3 - paddw m14, m13 - PMULHRSW_8192 m14, m14, [base+pw_2] - movu [tmpq+wq*0], m14 - mova m14, m2 - PMADDUBSW m14, subpel0, m7, m12, 0 ; b0 - mova m2, m4 - PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 - paddw m14, m4 + paddw m15, m4 mova m4, m6 PMADDUBSW m6, subpel2, m7, m12, 0 ; b2 - paddw m14, m6 - mova m6, m15 - PMADDUBSW m15, subpel3, m7, m12, 0 ; b3 - paddw m14, m15 + paddw m15, m6 + punpcklbw m12, m0, m13 ; 67 + movq m0, [srcq+strideq*0] + punpcklbw m13, m0 ; 78 + paddw m14, m5 + mova m5, m12 + PMADDUBSW m12, subpel3, m7, m6, 0 ; a3 + paddw m14, m12 + mova m6, m13 + PMADDUBSW m13, subpel3, m7, m12, 0 ; b3 + paddw m15, m13 PMULHRSW_8192 m14, m14, [base+pw_2] - movu [tmpq+wq*2], m14 + PMULHRSW_8192 m15, m15, [base+pw_2] %endif + movu [tmpq+wq*0], m14 + movu [tmpq+wq*2], m15 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w8_loop - movzx hd, r5b ; reset vertical loop + add r5, 8 add r8, 16 - add r6, 8 + movzx hd, r6b + mov srcq, r5 mov tmpq, r8 - mov srcq, r6 - sub r5d, 1<<8 ; horizontal-- + sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 @@ -3363,7 +3255,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpel1 %undef subpel2 %undef subpel3 - ; .hv: %assign stack_offset org_stack_offset cmp wd, 4 @@ -3466,13 +3357,13 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif movq m5, [srcq+strideq*0] ; 0 _ _ _ movhps m5, [srcq+strideq*1] ; 0 _ 1 _ - movq m4, [srcq+strideq*2] ; 2 _ _ _ %if ARCH_X86_32 lea srcq, [srcq+strideq*2] - add srcq, strideq - movhps m4, [srcq+strideq*0] ; 2 _ 3 _ - add srcq, strideq + movq m4, [srcq+strideq*0] ; 2 _ _ _ + movhps m4, [srcq+strideq*1] ; 2 _ 3 _ + lea srcq, [srcq+strideq*2] %else + movq m4, [srcq+strideq*2] ; 2 _ _ _ movhps m4, [srcq+stride3q ] ; 2 _ 3 _ lea srcq, [srcq+strideq*4] %endif @@ -3506,7 +3397,14 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif movq m5, [srcq+strideq*0] ; 4 _ _ _ movhps m5, [srcq+strideq*1] ; 4 _ 5 _ +%if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movq m4, [srcq+strideq*0] ; 6 _ _ _ + add srcq, strideq +%else movq m4, [srcq+strideq*2] ; 6 _ _ _ + add srcq, stride3q +%endif PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters @@ -3530,12 +3428,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %else mova m2, [esp+mmsize*4] %endif -%endif -%if ARCH_X86_32 - lea srcq, [srcq+strideq*2] - add srcq, strideq -%else - add srcq, stride3q %endif ;process high PALIGNR m4, m3, m2, 4;V 1 2 3 4 @@ -3572,7 +3464,6 @@ 
cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define m15 m3 %endif %endif - ; %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] %endif @@ -3620,7 +3511,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mova [esp+0xA0], m5 %endif %endif - ; %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] %endif @@ -3644,7 +3534,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m4, m5, 6 - ; RESTORELINE_W4 m5, 5, 0 packssdw m5, m4 pshufd m5, m5, q3120 @@ -3666,7 +3555,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpelv1 %undef subpelv2 %undef subpelv3 - ; .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 @@ -3699,20 +3587,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define tmpm [rsp+mmsize*13+gprsize*1] %define srcm [rsp+mmsize*13+gprsize*2] %define stridem [rsp+mmsize*13+gprsize*3] + mov tmpm, tmpq mov stridem, strideq %endif + %if cpuflag(ssse3) pshufd m0, m1, q0000 pshufd m1, m1, q1111 - punpcklbw m5, m5 - %if notcpuflag(ssse3) - punpcklbw m0, m0 + %else punpcklbw m1, m1 - %endif - psraw m5, 8 - %if notcpuflag(ssse3) - psraw m0, 8 psraw m1, 8 + pshufd m0, m1, q1010 + punpckhqdq m1, m1 %endif + punpcklbw m5, m5 + psraw m5, 8 pshufd m2, m5, q0000 pshufd m3, m5, q1111 pshufd m4, m5, q2222 @@ -3742,38 +3630,31 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 cmp hd, 6 cmovs myd, mxd movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + %if cpuflag(ssse3) pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 + %else + punpcklbw m0, m0 + psraw m0, 8 + pshufd subpelh0, m0, q1010 + pshufd subpelh1, m0, q3232 + mova m7, [base+pw_2] + %endif punpcklbw m1, m1 - %if notcpuflag(ssse3) - punpcklbw subpelh0, subpelh0 - punpcklbw subpelh1, subpelh1 - %endif psraw m1, 8 - %if notcpuflag(ssse3) - psraw subpelh0, 8 - psraw subpelh1, 8 - %endif pshufd subpelv0, m1, q0000 pshufd subpelv1, m1, q1111 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 - %if notcpuflag(ssse3) - mova m7, [base+pw_2] - %endif lea stride3q, [strideq*3] sub srcq, 3 sub srcq, stride3q mov r6, srcq + mov r8, tmpq %endif lea r5d, [wq-4] -%if ARCH_X86_64 - mov r8, tmpq -%else - mov tmpm, tmpq -%endif - shl r5d, (16 - 2) - mov r5w, hw + shl r5d, 14 + add r5d, hd .hv_w8_loop0: %if cpuflag(ssse3) %if ARCH_X86_64 @@ -3791,24 +3672,24 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 +%if ARCH_X86_64 + PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 + add srcq, stride3q + PREP_8TAP_HV m0, srcq+strideq*0, m7, m9 +%else lea srcq, [srcq+strideq*2] -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m9, m4 - %else + %if notcpuflag(ssse3) mova [esp], m4 %endif -%endif PREP_8TAP_HV m6, srcq+strideq*0, m7, m4 PREP_8TAP_HV m0, srcq+strideq*1, m7, m4 lea srcq, [srcq+strideq*2] +%endif %if cpuflag(ssse3) mova m7, [base+pw_8192] %else mova m7, [base+pw_2] - %if ARCH_X86_64 - SWAP m4, m9 - %else + %if ARCH_X86_32 mova m4, [esp] %endif %endif @@ -3824,28 +3705,26 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 SAVELINE_W8 3, m3 %if cpuflag(ssse3) mova m7, [base+subpel_h_shufA] +%endif +%if ARCH_X86_64 + PREP_8TAP_HV m4, srcq+strideq*1, m8, m9 + PREP_8TAP_HV m5, srcq+strideq*2, m8, m9 + add srcq, stride3q + PREP_8TAP_HV m6, srcq+strideq*0, m8, m9 %else - %if ARCH_X86_64 - SWAP m8, m7 - SWAP m9, m0 - %else + %if notcpuflag(ssse3) 
mova [esp+0x30], m0 %endif -%endif PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 - PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 lea srcq, [srcq+strideq*2] + PREP_8TAP_HV m6, srcq+strideq*0, m7, m0 +%endif %if cpuflag(ssse3) mova m7, [base+pw_8192] -%else - %if ARCH_X86_64 - SWAP m0, m9 - SWAP m7, m8 - %else +%elif ARCH_X86_32 mova m0, [esp+0x30] mova m7, [base+pw_2] - %endif %endif PMULHRSW_8192 m1, m4, m7 PMULHRSW_8192 m2, m5, m7 @@ -3902,8 +3781,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif %endif PREP_8TAP_HV m0, srcq+strideq*1, m5, m6 - PREP_8TAP_HV m4, srcq+strideq*2, m5, m6 lea srcq, [srcq+strideq*2] + PREP_8TAP_HV m4, srcq+strideq*0, m5, m6 %if cpuflag(ssse3) mova m5, [base+pw_8192] %else @@ -3933,19 +3812,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 RESTORELINE_W8 4, m4 jmp .hv_w8_loop .hv_w8_outer: - movzx hd, r5w %if ARCH_X86_32 mov srcq, srcm mov tmpq, tmpm + movzx hd, r5w add srcq, 4 add tmpq, 8 mov srcm, srcq mov tmpm, tmpq %else - add r8, 8 - mov tmpq, r8 add r6, 4 + add r8, 8 + movzx hd, r5b mov srcq, r6 + mov tmpq, r8 %endif sub r5d, 1<<16 jg .hv_w8_loop0 diff --git a/third_party/dav1d/tests/header_test.c b/third_party/dav1d/tests/header_test.c new file mode 100644 index 000000000000..dfe0dfb43192 --- /dev/null +++ b/third_party/dav1d/tests/header_test.c @@ -0,0 +1,33 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include DAV1D_TEST_HEADER + +int main() +{ + return 0; +} diff --git a/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c b/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c index 4506d2f9fa24..bd040a861aa1 100644 --- a/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c +++ b/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "src/cpu.h" @@ -38,8 +39,6 @@ #ifdef DAV1D_ALLOC_FAIL -#include - #include "alloc_fail.h" static unsigned djb_xor(const uint8_t * c, size_t len) { @@ -56,6 +55,39 @@ static unsigned r32le(const uint8_t *const p) { #define DAV1D_FUZZ_MAX_SIZE 4096 * 4096 +// search for "--cpumask xxx" in argv and remove both parameters +int LLVMFuzzerInitialize(int *argc, char ***argv) { + int i = 1; + for (; i < *argc; i++) { + if (!strcmp((*argv)[i], "--cpumask")) { + const char * cpumask = (*argv)[i+1]; + if (cpumask) { + char *end; + unsigned res; + if (!strncmp(cpumask, "0x", 2)) { + cpumask += 2; + res = (unsigned) strtoul(cpumask, &end, 16); + } else { + res = (unsigned) strtoul(cpumask, &end, 0); + } + if (end != cpumask && !end[0]) { + dav1d_set_cpu_flags_mask(res); + } + } + break; + } + } + + for (; i < *argc - 2; i++) { + (*argv)[i] = (*argv)[i + 2]; + } + + *argc = i; + + return 0; +} + + // expects ivf input int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) diff --git a/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h b/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h index 5d9329973e9f..0cbbad46b0b3 100644 --- a/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h +++ b/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h @@ -31,6 +31,7 @@ #include #include +int LLVMFuzzerInitialize(int *argc, char ***argv); int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size); #endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */ diff --git a/third_party/dav1d/tests/libfuzzer/main.c b/third_party/dav1d/tests/libfuzzer/main.c index 985ebba4cf31..8647738666f0 100644 --- a/third_party/dav1d/tests/libfuzzer/main.c +++ b/third_party/dav1d/tests/libfuzzer/main.c @@ -40,7 +40,7 @@ // expects ivf input -int main(const int argc, char *const *const argv) { +int main(int argc, char *argv[]) { int ret = -1; FILE *f = NULL; int64_t fsize; @@ -48,6 +48,10 @@ int main(const int argc, char *const *const argv) { uint8_t *data = NULL; size_t size = 0; + if (LLVMFuzzerInitialize(&argc, &argv)) { + return 1; + } + if (argc != 2) { fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]); return -1; diff --git a/third_party/dav1d/tests/meson.build b/third_party/dav1d/tests/meson.build index 51de562a2de5..e26358f737bc 100644 --- a/third_party/dav1d/tests/meson.build +++ b/third_party/dav1d/tests/meson.build @@ -31,8 +31,6 @@ if not get_option('enable_tests') subdir_done() endif -libdav1d_nasm_objs_if_needed = [] - if is_asm_enabled checkasm_sources = files( 'checkasm/checkasm.c', @@ -62,25 +60,27 @@ if is_asm_enabled checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects() endforeach - checkasm_nasm_objs = [] + checkasm_asm_objs = [] + checkasm_asm_sources = [] if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64' - checkasm_sources += files('checkasm/arm/checkasm_64.S') + checkasm_asm_sources += files('checkasm/arm/checkasm_64.S') elif host_machine.cpu_family().startswith('arm') - checkasm_sources += files('checkasm/arm/checkasm_32.S') + checkasm_asm_sources += files('checkasm/arm/checkasm_32.S') elif host_machine.cpu_family().startswith('x86') - checkasm_nasm_objs = 
nasm_gen.process(files('checkasm/x86/checkasm.asm')) + checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm')) + endif + + if use_gaspp + checkasm_asm_objs += gaspp_gen.process(checkasm_asm_sources) + else + checkasm_sources += checkasm_asm_sources endif m_lib = cc.find_library('m', required: false) - if meson.version().version_compare('< 0.48.999') - libdav1d_nasm_objs_if_needed = libdav1d_nasm_objs - endif - checkasm = executable('checkasm', checkasm_sources, - checkasm_nasm_objs, - libdav1d_nasm_objs_if_needed, + checkasm_asm_objs, objects: [ checkasm_bitdepth_objs, @@ -101,10 +101,30 @@ if is_asm_enabled test('checkasm', checkasm, is_parallel: false) endif +c99_extension_flag = cc.first_supported_argument( + '-Werror=c11-extensions', + '-Werror=c99-c11-compat', + '-Wc11-extensions', + '-Wc99-c11-compat', +) + +# dav1d_api_headers +foreach header : dav1d_api_headers + target = header + '_test' + + header_test_exe = executable(target, + 'header_test.c', + include_directories: dav1d_inc_dirs, + c_args: ['-DDAV1D_TEST_HEADER="@0@"'.format(header), c99_extension_flag], + build_by_default: true + ) + + test(target, header_test_exe) +endforeach + + # fuzzing binaries -if meson.version().version_compare('>=0.49') - subdir('libfuzzer') -endif +subdir('libfuzzer') # Include dav1d test data repository with additional tests if get_option('testdata_tests') diff --git a/third_party/dav1d/tools/dav1d.c b/third_party/dav1d/tools/dav1d.c index 4b97a9f20f31..907af3f8efde 100644 --- a/third_party/dav1d/tools/dav1d.c +++ b/third_party/dav1d/tools/dav1d.c @@ -124,11 +124,15 @@ static void print_stats(const int istty, const unsigned n, const unsigned num, else b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)", n, num, 100.0 * n / num); - if (i_fps && b < end) { + if (b < end) { const double d_fps = 1e9 * n / elapsed; - const double speed = d_fps / i_fps; - b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)", - d_fps, i_fps, speed); + if (i_fps) { + const double speed = d_fps / i_fps; + b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)", + d_fps, i_fps, speed); + } else { + b += snprintf(b, end - b, " - %.2lf fps", d_fps); + } } if (!istty) strcpy(b > end - 2 ? 
end - 2 : b, "\n"); diff --git a/third_party/dav1d/tools/dav1d.manifest b/third_party/dav1d/tools/dav1d.manifest new file mode 100644 index 000000000000..68cd1856d784 --- /dev/null +++ b/third_party/dav1d/tools/dav1d.manifest @@ -0,0 +1,10 @@ + + + + + + true + UTF-8 + + + diff --git a/third_party/dav1d/tools/dav1d.rc.in b/third_party/dav1d/tools/dav1d.rc.in new file mode 100644 index 000000000000..a4b49009923f --- /dev/null +++ b/third_party/dav1d/tools/dav1d.rc.in @@ -0,0 +1,33 @@ +#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0 +#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@" +#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0 +#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@" + +#include + +1 RT_MANIFEST "dav1d.manifest" +1 VERSIONINFO +FILETYPE VFT_APP +FILEOS VOS_NT_WINDOWS32 +PRODUCTVERSION PROJECT_VERSION_NUMBER +FILEVERSION API_VERSION_NUMBER +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904E4" + BEGIN + VALUE "CompanyName", "VideoLAN" + VALUE "ProductName", "dav1d" + VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR + VALUE "FileVersion", API_VERSION_NUMBER_STR + VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder" + VALUE "InternalName", "dav1d" + VALUE "OriginalFilename", "dav1d.exe" + VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1252 + END +END diff --git a/third_party/dav1d/tools/meson.build b/third_party/dav1d/tools/meson.build index 4b4217a1e694..76fa1e0bb52e 100644 --- a/third_party/dav1d/tools/meson.build +++ b/third_party/dav1d/tools/meson.build @@ -77,8 +77,24 @@ dav1d_sources = files( 'dav1d_cli_parse.c', ) +if host_machine.system() == 'windows' + rc_file = configure_file( + input : 'dav1d.rc.in', + output : 'dav1d.rc', + configuration : rc_data + ) + + dav1d_rc_obj = winmod.compile_resources(rc_file, + depend_files : files('dav1d.manifest'), + include_directories : include_directories('.') + ) +else + dav1d_rc_obj = [] +endif + dav1d = executable('dav1d', dav1d_sources, + dav1d_rc_obj, rev_target, cli_config_h_target, link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs], diff --git a/third_party/dav1d/tools/output/y4m2.c b/third_party/dav1d/tools/output/y4m2.c index bcd40346e541..8766f6486823 100644 --- a/third_party/dav1d/tools/output/y4m2.c +++ b/third_party/dav1d/tools/output/y4m2.c @@ -28,6 +28,7 @@ #include "config.h" #include +#include #include #include #include @@ -77,8 +78,17 @@ static int write_header(Y4m2OutputContext *const c, const Dav1dPicture *const p) chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] : ss_names[p->p.layout][p->seq_hdr->hbd]; - fprintf(c->f, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n", - p->p.w, p->p.h, c->fps[0], c->fps[1], ss_name); + const unsigned fw = p->p.w; + const unsigned fh = p->p.h; + uint64_t aw = (uint64_t)fh * p->frame_hdr->render_width; + uint64_t ah = (uint64_t)fw * p->frame_hdr->render_height; + uint64_t gcd = ah; + for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b); + aw /= gcd; + ah /= gcd; + + fprintf(c->f, "YUV4MPEG2 W%u H%u F%u:%u Ip A%"PRIu64":%"PRIu64" C%s\n", + fw, fh, c->fps[0], c->fps[1], aw, ah, ss_name); return 0; }
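
For reference, the y4m2.c change above derives the Y4M "A" (pixel aspect ratio) parameter by cross-multiplying the render and coded dimensions, reducing the ratio with a GCD loop, and emitting it alongside the frame size and frame rate. The following is a minimal standalone sketch of that calculation only; the helper name gcd_u64, the variable names frame_w/frame_h/render_w/render_h, and the 1280x720-to-1920x1080 example values are illustrative assumptions and not part of the patch, which reads these fields from the Dav1dPicture as shown in the hunk.

    /* Minimal sketch of the aspect-ratio reduction used by the new Y4M header
     * code: sar = (render_w * frame_h) : (render_h * frame_w), reduced by the
     * greatest common divisor. Standalone illustration, not patch code. */
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t gcd_u64(uint64_t a, uint64_t b) {
        while (b) {                 /* Euclid's algorithm */
            const uint64_t t = a % b;
            a = b;
            b = t;
        }
        return a;
    }

    int main(void) {
        /* Hypothetical example: 1280x720 coded frame displayed as 1920x1080. */
        const unsigned frame_w = 1280, frame_h = 720;
        const unsigned render_w = 1920, render_h = 1080;

        uint64_t aw = (uint64_t)frame_h * render_w;
        uint64_t ah = (uint64_t)frame_w * render_h;
        const uint64_t g = gcd_u64(aw, ah);
        aw /= g;
        ah /= g;

        printf("A%" PRIu64 ":%" PRIu64 "\n", aw, ah);
        return 0;
    }

Compiled on its own this prints "A1:1", since scaling 1280x720 up to 1920x1080 is a uniform scale and therefore keeps square pixels.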
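
Similarly, the libfuzzer change above adds a "--cpumask <value>" option to the harness so that specific SIMD code paths can be pinned during fuzzing. The sketch below isolates the argv parsing and stripping logic in a standalone program; the wrapper name strip_cpumask and the printf are illustrative assumptions, standing in for the dav1d_set_cpu_flags_mask() call that the real LLVMFuzzerInitialize() makes.

    /* Minimal sketch: find "--cpumask <value>", parse it (hex with an 0x
     * prefix, otherwise strtoul base-0 auto-detection), report the mask, and
     * remove both tokens so the remaining arguments are untouched. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void strip_cpumask(int *argc, char **argv) {
        int i = 1;
        for (; i < *argc; i++) {
            if (!strcmp(argv[i], "--cpumask")) {
                const char *cpumask = argv[i + 1];
                if (cpumask) {
                    char *end;
                    unsigned res;
                    if (!strncmp(cpumask, "0x", 2)) {
                        cpumask += 2;
                        res = (unsigned)strtoul(cpumask, &end, 16);
                    } else {
                        res = (unsigned)strtoul(cpumask, &end, 0);
                    }
                    if (end != cpumask && !end[0])
                        printf("cpu flags mask: 0x%x\n", res); /* fuzzer calls dav1d_set_cpu_flags_mask(res) here */
                }
                break;
            }
        }
        /* close the gap left by "--cpumask <value>" */
        for (; i < *argc - 2; i++)
            argv[i] = argv[i + 2];
        *argc = i;
    }

    int main(int argc, char *argv[]) {
        strip_cpumask(&argc, argv);
        for (int i = 0; i < argc; i++)
            printf("argv[%d] = %s\n", i, argv[i]);
        return 0;
    }

Run as "./a.out --cpumask 0x10 input.ivf", this reports mask 0x10 and leaves only the program name and input path in argv, which mirrors how the fuzzer keeps the option invisible to libFuzzer's own argument handling.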