Bug 1793238 - Update libjxl r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D158771
This commit is contained in:
Kagami Sascha Rosylight
2022-10-06 17:31:29 +00:00
parent 9cf3d82c8c
commit db107725b2
123 changed files with 4471 additions and 3040 deletions

View File

@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 7f2e26854086fba4255220fd6c77e9141f1f87cc
release: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 7f2e26854086fba4255220fd6c77e9141f1f87cc
revision: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View File

@@ -0,0 +1,14 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Stub of libjxl's generated <jxl/version.h>. All three version components
 * are pinned to 0 — presumably because the in-tree build does not run
 * libjxl's CMake version generation; confirm against the vendoring setup
 * (the same values are mirrored as DEFINES in moz.build). */
#ifndef JXL_VERSION_H_
#define JXL_VERSION_H_
#define JPEGXL_MAJOR_VERSION 0
#define JPEGXL_MINOR_VERSION 0
#define JPEGXL_PATCH_VERSION 0
#endif /* JXL_VERSION_H_ */

View File

@@ -103,13 +103,10 @@ SOURCES += [
"/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc",
]
DEFINES["JPEGXL_MAJOR_VERSION"] = "0"
DEFINES["JPEGXL_MINOR_VERSION"] = "0"
DEFINES["JPEGXL_PATCH_VERSION"] = "0"
EXPORTS.jxl += [
"./include/jxl/jxl_export.h",
"./include/jxl/jxl_threads_export.h",
"./include/jxl/version.h",
"/third_party/jpeg-xl/lib/include/jxl/butteraugli.h",
"/third_party/jpeg-xl/lib/include/jxl/butteraugli_cxx.h",
"/third_party/jpeg-xl/lib/include/jxl/cms_interface.h",

View File

@@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl
release: 3e0b08d4ee53a08f9b58739e088c5bdecebae74d (2022-09-09T11:59:45Z).
release: 19e36b964cd966e2408bad87182faa38b7de3e9e
revision: 3e0b08d4ee53a08f9b58739e088c5bdecebae74d
revision: 19e36b964cd966e2408bad87182faa38b7de3e9e
license: Apache-2.0

View File

@@ -19,7 +19,7 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
project(hwy VERSION 1.0.0) # Keep in sync with highway.h version
project(hwy VERSION 1.0.1) # Keep in sync with highway.h version
# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
@@ -89,6 +89,9 @@ list(APPEND HWY_CONTRIB_SOURCES
hwy/contrib/sort/vqsort-inl.h
hwy/contrib/sort/vqsort.cc
hwy/contrib/sort/vqsort.h
hwy/contrib/algo/copy-inl.h
hwy/contrib/algo/find-inl.h
hwy/contrib/algo/transform-inl.h
)
endif() # HWY_ENABLE_CONTRIB

View File

@@ -1,3 +1,15 @@
highway (1.0.1-1) UNRELEASED; urgency=medium
* Add Eq128, i64 Mul, unsigned->float ConvertTo
* Faster sort for few unique keys, more robust pivot selection
* Fix: floating-point generator for sort tests, Min/MaxOfLanes for i16
* Fix: avoid always_inline in debug, link atomic
* GCC warnings: string.h, maybe-uninitialized, ignored-attributes
* GCC warnings: preprocessor int overflow, spurious use-after-free/overflow
* Doc: <=HWY_AVX3, Full32/64/128, how to use generic-inl
-- Jan Wassenberg <janwas@google.com> Tue, 23 Aug 2022 10:00:00 +0200
highway (1.0.0-1) UNRELEASED; urgency=medium
* ABI change: 64-bit target values, more room for expansion

View File

@@ -24,6 +24,9 @@
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
#if HWY_COMPILER_MSVC
#include <string.h> // memcpy
#endif
#if HWY_ARCH_X86
#include <atomic>
#endif
@@ -131,6 +134,19 @@
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#endif
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
@@ -863,10 +879,18 @@ HWY_API void CopyBytes(const From* from, To* to) {
#if HWY_COMPILER_MSVC
memcpy(to, from, kBytes);
#else
__builtin_memcpy(to, from, kBytes);
__builtin_memcpy(
static_cast<void*>(to), static_cast<const void*>(from), kBytes);
#endif
}
// Same as CopyBytes, but for same-sized objects; avoids a size argument.
// The static_assert rejects mismatched sizes at compile time, making this a
// safe bitwise copy between two equally-sized objects — e.g. the type punning
// between uint32_t and float in F32FromBF16/BF16FromF32 below.
template <typename From, typename To>
HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
static_assert(sizeof(From) == sizeof(To), "");
CopyBytes<sizeof(From)>(from, to);
}
template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if HWY_COMPILER_MSVC
@@ -880,13 +904,13 @@ HWY_API float F32FromBF16(bfloat16_t bf) {
uint32_t bits = bf.bits;
bits <<= 16;
float f;
CopyBytes<4>(&bits, &f);
CopySameSize(&bits, &f);
return f;
}
HWY_API bfloat16_t BF16FromF32(float f) {
uint32_t bits;
CopyBytes<4>(&f, &bits);
CopySameSize(&f, &bits);
bfloat16_t bf;
bf.bits = static_cast<uint16_t>(bits >> 16);
return bf;

View File

@@ -22,8 +22,6 @@
#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#endif
#include <string.h> // memcpy
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();

View File

@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include <string.h> // memcpy
#include "hwy/aligned_allocator.h"

View File

@@ -18,7 +18,6 @@
// SIMD/multicore-friendly planar image representation with row accessors.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
@@ -104,7 +103,7 @@ struct HWY_CONTRIB_DLLEXPORT ImageBase {
HWY_INLINE void* VoidRow(const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
if (y >= ysize_) {
HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_);
HWY_ABORT("Row(%d) >= %u\n", static_cast<int>(y), ysize_);
}
#endif
@@ -223,14 +222,11 @@ class Image3 {
Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
HWY_ABORT("Not same size: %" PRIu64 " x %" PRIu64 ", %" PRIu64
" x %" PRIu64 ", %" PRIu64 " x %" PRIu64 "\n",
static_cast<uint64_t>(plane0.xsize()),
static_cast<uint64_t>(plane0.ysize()),
static_cast<uint64_t>(plane1.xsize()),
static_cast<uint64_t>(plane1.ysize()),
static_cast<uint64_t>(plane2.xsize()),
static_cast<uint64_t>(plane2.ysize()));
HWY_ABORT(
"Not same size: %d x %d, %d x %d, %d x %d\n",
static_cast<int>(plane0.xsize()), static_cast<int>(plane0.ysize()),
static_cast<int>(plane1.xsize()), static_cast<int>(plane1.ysize()),
static_cast<int>(plane2.xsize()), static_cast<int>(plane2.ysize()));
}
planes_[0] = std::move(plane0);
planes_[1] = std::move(plane1);
@@ -294,9 +290,8 @@ class Image3 {
HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
if (c >= kNumPlanes || y >= ysize()) {
HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n",
static_cast<uint64_t>(c), static_cast<uint64_t>(y),
static_cast<uint64_t>(ysize()));
HWY_ABORT("PlaneRow(%d, %d) >= %d\n", static_cast<int>(c),
static_cast<int>(y), static_cast<int>(ysize()));
}
#endif
// Use the first plane's stride because the compiler might not realize they

View File

@@ -13,6 +13,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stdio.h>
#include <cfloat> // FLT_MAX

View File

@@ -99,6 +99,7 @@ cc_library(
"traits-inl.h",
"traits128-inl.h",
"vqsort-inl.h",
# Placeholder for internal instrumentation. Do not remove.
],
deps = [
# Only if VQSORT_SECURE_RNG is set.

View File

@@ -124,7 +124,7 @@ class InputStats {
// bit representations as the checksum.
uint64_t bits = 0;
static_assert(sizeof(T) <= 8, "Expected a built-in type");
CopyBytes<sizeof(T)>(&value, &bits);
CopyBytes<sizeof(T)>(&value, &bits); // not same size
sum_ += bits;
count_ += 1;
}

View File

@@ -15,7 +15,6 @@
#include <stdint.h>
#include <stdio.h>
#include <string.h> // memcpy
#include <vector>
@@ -51,6 +50,7 @@ using detail::SharedTraits;
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::Traits128;
template <class Traits>
@@ -81,8 +81,9 @@ HWY_NOINLINE void BenchPartition() {
// The pivot value can influence performance. Do exactly what vqsort will
// do so that the performance (influenced by prefetching and branch
// prediction) is likely to predict the actual performance inside vqsort.
const auto pivot = detail::ChoosePivot(d, st, aligned.get(), 0, num_lanes,
buf.get(), rng);
detail::PivotResult result;
const auto pivot = detail::ChoosePivot(d, st, aligned.get(), num_lanes,
buf.get(), rng, result);
const Timestamp t0;
detail::Partition(d, st, aligned.get(), 0, num_lanes - 1, pivot,
@@ -110,7 +111,7 @@ HWY_NOINLINE void BenchAllPartition() {
BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
BenchPartition<Traits128<OrderAscending128>>();
// BenchPartition<Traits128<OrderDescending128>>();
// BenchPartition<Traits128<OrderAscendingKV128>>();
BenchPartition<Traits128<OrderAscendingKV128>>();
}
template <class Traits>
@@ -258,12 +259,9 @@ HWY_NOINLINE void BenchSort(size_t num_keys) {
HWY_NOINLINE void BenchAllSort() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
HWY_TARGET == HWY_EMU128) {
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
return;
}
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
constexpr size_t K = 1000;
constexpr size_t M = K * K;
@@ -287,7 +285,7 @@ HWY_NOINLINE void BenchAllSort() {
#if !HAVE_VXSORT && VQSORT_ENABLED
BenchSort<Traits128<OrderAscending128>>(num_keys);
// BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
#endif
}
}

View File

@@ -13,6 +13,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h> // memcpy
@@ -218,9 +222,6 @@ HWY_NOINLINE void TestAllBaseCase() {
#if defined(_MSC_VER)
return;
#endif
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
TestBaseCase<Traits128<OrderAscending128> >();
@@ -356,9 +357,6 @@ static HWY_NOINLINE void TestPartition() {
}
HWY_NOINLINE void TestAllPartition() {
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderDescending<int32_t> > >();
TestPartition<TraitsLane<OrderAscending<int64_t> > >();
@@ -490,9 +488,6 @@ void TestSort(size_t num_lanes) {
#if defined(_MSC_VER)
return;
#endif
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
using Order = typename Traits::Order;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;

View File

@@ -41,7 +41,7 @@ namespace detail {
// independent of the order.
template <typename T>
struct KeyLane {
constexpr bool Is128() const { return false; }
static constexpr bool Is128() { return false; }
constexpr size_t LanesPerKey() const { return 1; }
// What type bench_sort should allocate for generating inputs.
@@ -130,7 +130,7 @@ struct KeyLane {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
const RepartitionToWide<RebindToUnsigned<D>> dw;
const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
}
@@ -146,7 +146,7 @@ struct KeyLane {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
const RepartitionToWide<RebindToUnsigned<D>> dw;
const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
}
@@ -160,7 +160,7 @@ struct KeyLane {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
const RepartitionToWide<RebindToUnsigned<D>> dw;
const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
}
@@ -181,9 +181,7 @@ template <typename T>
struct OrderAscending : public KeyLane<T> {
using Order = SortAscending;
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *a < *b;
}
HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
@@ -222,15 +220,18 @@ struct OrderAscending : public KeyLane<T> {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<T>());
}
// Returns the key(s) immediately preceding `v` in ascending sort order,
// i.e. v - 1 per lane. NOTE(review): no underflow/minimum handling is
// visible here; the caller presumably only applies this when `v` is not
// the first value in sort order — confirm.
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Sub(v, Set(d, 1));
}
};
template <typename T>
struct OrderDescending : public KeyLane<T> {
using Order = SortDescending;
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *b < *a;
}
HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
@@ -268,6 +269,11 @@ struct OrderDescending : public KeyLane<T> {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<T>());
}
// Returns the key(s) immediately preceding `v` in descending sort order:
// larger values sort first here, so the predecessor is v + 1 per lane.
// NOTE(review): no overflow/maximum handling is visible here — callers
// presumably guarantee `v` is not the first value in sort order; confirm.
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Add(v, Set(d, 1));
}
};
// Shared code that depends on Order.

View File

@@ -39,7 +39,7 @@ namespace detail {
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct KeyAny128 {
constexpr bool Is128() const { return true; }
static constexpr bool Is128() { return true; }
constexpr size_t LanesPerKey() const { return 2; }
// What type bench_sort should allocate for generating inputs.
@@ -130,8 +130,8 @@ struct Key128 : public KeyAny128 {
std::string KeyString() const { return "U128"; }
template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq128(a, b);
HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Eq128(d, a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
@@ -184,6 +184,12 @@ struct OrderAscending128 : public Key128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
// Returns the 128-bit key immediately preceding `v` in ascending order.
// `k1` places a 1 only in alternating lanes (presumably the low u64 lane of
// each 128-bit key — confirm OddEven lane semantics against Highway docs),
// so the Sub decrements the low half of each key. NOTE(review): no borrow
// into the high lane is computed; assumes the low lane is nonzero — confirm.
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Sub(v, k1);
}
};
struct OrderDescending128 : public Key128 {
@@ -224,6 +230,12 @@ struct OrderDescending128 : public Key128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
// Returns the 128-bit key immediately preceding `v` in descending order:
// larger keys sort first, so the predecessor increments the key. `k1` holds
// a 1 only in alternating lanes (presumably the low u64 lane of each 128-bit
// key — confirm OddEven lane semantics). NOTE(review): no carry into the
// high lane is computed; assumes the low lane does not wrap — confirm.
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Add(v, k1);
}
};
// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
@@ -234,8 +246,8 @@ struct KeyValue128 : public KeyAny128 {
std::string KeyString() const { return "KV128"; }
template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq128Upper(a, b);
HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Eq128Upper(d, a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
@@ -281,6 +293,12 @@ struct OrderAscendingKV128 : public KeyValue128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Sub(v, k1);
}
};
struct OrderDescendingKV128 : public KeyValue128 {
@@ -321,6 +339,12 @@ struct OrderDescendingKV128 : public KeyValue128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Add(v, k1);
}
};
// Shared code that depends on Order.

View File

@@ -61,6 +61,7 @@
#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"
// Placeholder for internal instrumentation. Do not remove.
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
@@ -573,14 +574,44 @@ HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
}
#endif // VQSORT_PRINT
// Folds the bitwise difference between x1 and x2 into the accumulator o:
// returns o | (x1 ^ x2). Used to build "sticky" difference bits because
// vector Xor/Or may be cheaper than comparisons, especially for 128-bit keys.
// TODO(janwas): ternlog?
template <class V>
V OrXor(const V o, const V x1, const V x2) {
  const V diff = Xor(x1, x2);
  return Or(o, diff);
}
// Returns a lower bound on the index of the first mismatch, or `num` if all
// are equal. `num` is const to ensure we don't change it, which would lead to
// bugs because the caller will check whether we return the original value.
template <class D, class Traits, typename T>
HWY_INLINE bool ScanEqual(D d, Traits st, const T* HWY_RESTRICT keys,
size_t num) {
HWY_NOINLINE size_t LowerBoundOfMismatch(D d, Traits st,
const T* HWY_RESTRICT keys,
const size_t num) {
using V = Vec<decltype(d)>;
const size_t N = Lanes(d);
HWY_DASSERT(num >= N); // See HandleSpecialCases
const V reference = st.SetKey(d, keys);
const V zero = Zero(d);
size_t i = 0;
// Vector-align keys + i.
const size_t misalign =
(reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (N - 1);
if (HWY_LIKELY(misalign != 0)) {
HWY_DASSERT(misalign % st.LanesPerKey() == 0);
const size_t consume = N - misalign;
const auto mask = FirstN(d, consume);
const V v0 = LoadU(d, keys);
// Only check masked lanes; consider others to be equal to the reference.
if (!AllTrue(d, Or(Not(mask), Eq(v0, reference)))) {
return 0; // not equal
}
i = consume;
}
HWY_DASSERT(((reinterpret_cast<uintptr_t>(keys + i) / sizeof(T)) & (N - 1)) ==
0);
// Sticky bits registering any difference between `keys` and the first key.
// We use vector XOR because it may be cheaper than comparisons, especially
// for 128-bit. 2x unrolled for more ILP.
@@ -592,81 +623,112 @@ HWY_INLINE bool ScanEqual(D d, Traits st, const T* HWY_RESTRICT keys,
// after a 'group', which consists of kLoops times two vectors.
constexpr size_t kLoops = 4;
const size_t lanes_per_group = kLoops * 2 * N;
size_t i = 0;
for (; i + lanes_per_group <= num; i += lanes_per_group) {
HWY_DEFAULT_UNROLL
for (size_t loop = 0; loop < kLoops; ++loop) {
const V v0 = LoadU(d, keys + i + loop * 2 * N);
const V v1 = LoadU(d, keys + i + loop * 2 * N + N);
// TODO(janwas): ternlog
diff0 = Or(diff0, Xor(v0, reference));
diff1 = Or(diff1, Xor(v1, reference));
const V v0 = Load(d, keys + i + loop * 2 * N);
const V v1 = Load(d, keys + i + loop * 2 * N + N);
diff0 = OrXor(diff0, v0, reference);
diff1 = OrXor(diff1, v1, reference);
}
diff0 = Or(diff0, diff1);
if (!AllTrue(d, Eq(diff0, zero))) {
return false;
return i; // not equal
}
}
// Whole vectors, no unrolling
// Whole vectors, no unrolling, compare directly
for (; i + N <= num; i += N) {
const V v0 = LoadU(d, keys + i);
// TODO(janwas): ternlog
diff0 = Or(diff0, Xor(v0, reference));
if (!AllTrue(d, Eq(diff0, zero))) {
return false;
const V v0 = Load(d, keys + i);
if (!AllTrue(d, Eq(v0, reference))) {
return i; // not equal
}
}
// If there are remainders, re-check the last whole vector.
if (HWY_LIKELY(i != num)) {
const V v0 = LoadU(d, keys + num - N);
// TODO(janwas): ternlog
diff0 = Or(diff0, Xor(v0, reference));
if (!AllTrue(d, Eq(diff0, zero))) {
return false;
if (!AllTrue(d, Eq(v0, reference))) {
return i; // not equal
}
}
return true;
return num; // all equal
}
// Returns key prior to reference in sort order.
// Outcome of pivot selection (set by ChoosePivot via CheckFirstLast).
// Tells Recurse whether to partition at all and which side(s) still need
// sorting: kIsFirst skips the left recursion, kWasLast the right one.
enum class PivotResult {
kAllEqual, // stop without partitioning
kNormal, // partition and recurse left and right
kIsFirst, // partition but skip left recursion
kWasLast, // partition but skip right recursion
};
// Classifies (and possibly modifies) `pivot` by scanning for the first/last
// key from index `idx_diff`, which is less than `num`.
template <class D, class Traits, typename T>
HWY_INLINE Vec<D> ScanForPrev(D d, Traits st, const T* HWY_RESTRICT keys,
size_t num, Vec<D> reference,
T* HWY_RESTRICT buf) {
HWY_NOINLINE PivotResult CheckFirstLast(D d, Traits st,
const T* HWY_RESTRICT keys, size_t num,
size_t idx_diff,
Vec<D>* HWY_RESTRICT pivot,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
HWY_DASSERT(num >= N); // See HandleSpecialCases
HWY_DASSERT(idx_diff < num);
Vec<D> prev = st.FirstValue(d);
Mask<D> any_found = st.Compare(d, prev, prev); // false
Vec<D> first = st.LastValue(d);
Vec<D> last = st.FirstValue(d);
// Early out for mostly-0 arrays, where pivot is often FirstValue.
if (AllTrue(d, st.EqualKeys(d, *pivot, last))) {
return PivotResult::kIsFirst;
}
size_t i = 0;
// We know keys[0, idx_diff) are equal, but they might be the first/last, so
// start scanning one vector before.
size_t i = static_cast<size_t>(
HWY_MAX(static_cast<intptr_t>(idx_diff) - static_cast<intptr_t>(N), 0));
constexpr size_t kLoops = 4;
const size_t lanes_per_group = kLoops * N;
// Whole group, unrolled
for (; i + lanes_per_group <= num; i += lanes_per_group) {
HWY_DEFAULT_UNROLL
for (size_t loop = 0; loop < kLoops; ++loop) {
const Vec<D> curr = LoadU(d, keys + i + loop * N);
first = st.First(d, first, curr);
last = st.Last(d, last, curr);
}
}
// Whole vectors, no unrolling
for (; i + N <= num; i += N) {
const Vec<D> curr = LoadU(d, keys + i);
const auto is_before = st.Compare(d, curr, reference);
any_found = Or(any_found, is_before);
prev = IfThenElse(is_before, st.Last(d, prev, curr), prev);
first = st.First(d, first, curr);
last = st.Last(d, last, curr);
}
// If there are remainders, re-check the last whole vector.
if (HWY_LIKELY(i != num)) {
const Vec<D> curr = LoadU(d, keys + num - N);
const auto is_before = st.Compare(d, curr, reference);
any_found = Or(any_found, is_before);
prev = IfThenElse(is_before, st.Last(d, prev, curr), prev);
first = st.First(d, first, curr);
last = st.Last(d, last, curr);
}
const Vec<D> candidate = st.LastOfLanes(d, prev, buf);
// If we didn't find any key less than reference, we're still stuck with
// FirstValue; replace that with reference. (We cannot compare directly to
// FirstValue because that might be the desired value of prev.)
return IfThenElse(any_found, candidate, reference);
first = st.FirstOfLanes(d, first, buf);
last = st.LastOfLanes(d, last, buf);
if (AllTrue(d, st.EqualKeys(d, first, *pivot))) {
return PivotResult::kIsFirst;
}
// Fixup required because keys equal to the pivot go to the left partition,
// and the pivot is the last, so Partition would not change anything.
// Instead use the previous value in sort order, which is not necessarily an
// actual key.
if (AllTrue(d, st.EqualKeys(d, last, *pivot))) {
*pivot = st.PrevValue(d, *pivot);
return PivotResult::kWasLast;
}
return PivotResult::kNormal;
}
enum class PivotResult {
kNormal, // use partition
kAllEqual, // already done
};
// Writes samples from `keys[0, num)` into `buf`.
template <class D, class Traits, typename T>
HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf, Generator& rng) {
@@ -732,27 +794,25 @@ HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
}
}
// Returns pivot, which is never the largest key (thus the right partition will
// never be empty).
// Returns pivot chosen from `keys[0, num)`. It will never be the largest key
// (thus the right partition will never be empty).
template <class D, class Traits, typename T>
HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
const size_t begin, const size_t end,
T* HWY_RESTRICT buf, Generator& rng,
PivotResult& result) {
const size_t num, T* HWY_RESTRICT buf,
Generator& rng, PivotResult& result) {
using V = decltype(Zero(d));
const size_t N = Lanes(d);
constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
constexpr size_t N1 = st.LanesPerKey();
const size_t num = end - begin;
#if VQSORT_PRINT
fprintf(stderr, "\nChoosePivot num %zu:\n", num);
#endif
DrawSamples(d, st, keys + begin, num, buf, rng);
DrawSamples(d, st, keys, num, buf, rng);
SortSamples(st, buf);
#if VQSORT_PRINT
const size_t N = Lanes(d);
for (size_t i = 0; i < kSampleLanes; i += N) {
Print(d, "", Load(d, buf + i), 0, N);
}
@@ -760,27 +820,22 @@ HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
// All samples are equal.
if (st.Equal1(buf, buf + kSampleLanes - N1)) {
const bool all_eq = ScanEqual(d, st, keys + begin, num);
const size_t idx_diff = LowerBoundOfMismatch(d, st, keys, num);
const bool all_eq = idx_diff == num;
#if VQSORT_PRINT
fprintf(stderr, "Pivot num=%zu all eq samples, keys also: %d\n", num,
all_eq);
fprintf(stderr, "Pivot num=%zu samplesEq, idxDiff %zu keysEq: %d\n", num,
idx_diff, all_eq);
#endif
if (all_eq) {
result = PivotResult::kAllEqual;
return Zero(d);
}
// If the sample is indeed the most common key and it is the largest, then
// the right partition will be empty. Prevent this by replacing the pivot
// with the previous key in sort order. By contrast, selecting the first key
// in sort order would guarantee (minimal) progress. We instead do a full
// scan to maximize load balance in case there are numerous keys that
// precede the most common key.
result = PivotResult::kNormal;
const V reference = st.SetKey(d, buf);
const V pivot = ScanForPrev(d, st, keys + begin, num, reference, buf);
V pivot = st.SetKey(d, buf); // the single unique sample
result = CheckFirstLast(d, st, keys, num, idx_diff, &pivot, buf);
#if VQSORT_PRINT
Print(d, "PREV pivot", pivot, 0, st.LanesPerKey());
fprintf(stderr, "PivotResult %d\n", static_cast<int>(result));
Print(d, "Adjusted pivot", pivot, 0, st.LanesPerKey());
#endif
return pivot;
}
@@ -796,19 +851,32 @@ HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
}
template <class D, class Traits, typename T>
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, T* HWY_RESTRICT keys_end,
const size_t begin, const size_t end, const Vec<D> pivot,
T* HWY_RESTRICT buf, Generator& rng, size_t remaining_levels) {
HWY_DASSERT(begin + 1 < end);
const size_t num = end - begin; // >= 2
HWY_NOINLINE void Recurse(D d, Traits st, T* HWY_RESTRICT keys,
T* HWY_RESTRICT keys_end, const size_t begin,
const size_t end, T* HWY_RESTRICT buf, Generator& rng,
size_t remaining_levels) {
const size_t num = end - begin; // >= 1
#if VQSORT_PRINT
fprintf(stderr, "- Recurse remaining %zu [%zu %zu) len %zu\n",
remaining_levels, begin, end, num);
Vec<D> first, last;
ScanMinMax(d, st, keys + begin, num, buf, first, last);
if (num >= Lanes(d)) {
ScanMinMax(d, st, keys + begin, num, buf, first, last);
}
Print(d, "first", first, 0, st.LanesPerKey());
Print(d, "last", last, 0, st.LanesPerKey());
#endif
HWY_DASSERT(begin < end);
if (HWY_UNLIKELY(num <= Constants::BaseCaseNum(Lanes(d)))) {
BaseCase(d, st, keys + begin, keys_end, num, buf);
return;
}
PivotResult result;
Vec<D> pivot = ChoosePivot(d, st, keys + begin, num, buf, rng, result);
if (HWY_UNLIKELY(result == PivotResult::kAllEqual)) {
return;
}
// Too many recursions. This is unlikely to happen because we select pivots
// from large (though still O(1)) samples.
@@ -820,47 +888,24 @@ void Recurse(D d, Traits st, T* HWY_RESTRICT keys, T* HWY_RESTRICT keys_end,
return;
}
const ptrdiff_t base_case_num =
static_cast<ptrdiff_t>(Constants::BaseCaseNum(Lanes(d)));
const size_t bound = Partition(d, st, keys, begin, end, pivot, buf);
const ptrdiff_t num_left =
static_cast<ptrdiff_t>(bound) - static_cast<ptrdiff_t>(begin);
const ptrdiff_t num_right =
static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound);
// ChoosePivot ensures pivot != largest key, so this should never happen.
HWY_ASSERT(num_right != 0);
if (HWY_UNLIKELY(num_left <= base_case_num)) {
BaseCase(d, st, keys + begin, keys_end, static_cast<size_t>(num_left), buf);
} else {
PivotResult result;
const Vec<D> next_pivot =
ChoosePivot(d, st, keys, begin, bound, buf, rng, result);
if (result != PivotResult::kAllEqual) {
Recurse(d, st, keys, keys_end, begin, bound, next_pivot, buf, rng,
remaining_levels - 1);
}
// ChoosePivot ensures pivot != last key, so the right partition is never
// empty. Nor is the left, because the pivot is either one of the keys, or
// the value prior to the last (which is not the only value).
HWY_ASSERT(begin != bound && bound != end);
if (HWY_LIKELY(result != PivotResult::kIsFirst)) {
Recurse(d, st, keys, keys_end, begin, bound, buf, rng,
remaining_levels - 1);
}
if (HWY_UNLIKELY(num_right <= base_case_num)) {
BaseCase(d, st, keys + bound, keys_end, static_cast<size_t>(num_right),
buf);
} else {
PivotResult result;
const Vec<D> next_pivot =
ChoosePivot(d, st, keys, bound, end, buf, rng, result);
if (result != PivotResult::kAllEqual) {
Recurse(d, st, keys, keys_end, bound, end, next_pivot, buf, rng,
remaining_levels - 1);
}
if (HWY_LIKELY(result != PivotResult::kWasLast)) {
Recurse(d, st, keys, keys_end, bound, end, buf, rng, remaining_levels - 1);
}
}
// Returns true if sorting is finished.
template <class D, class Traits, typename T>
bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf) {
HWY_INLINE bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys,
size_t num) {
const size_t N = Lanes(d);
const size_t base_case_num = Constants::BaseCaseNum(N);
@@ -876,16 +921,15 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
if (partial_128 || huge_vec) {
// PERFORMANCE WARNING: falling back to HeapSort.
#if VQSORT_PRINT
fprintf(stderr, "WARNING: using slow HeapSort: partial %d huge %d\n",
partial_128, huge_vec);
#endif
HeapSort(st, keys, num);
return true;
}
// Small arrays: use sorting network, no need for other checks.
if (HWY_UNLIKELY(num <= base_case_num)) {
BaseCase(d, st, keys, keys + num, num, buf);
return true;
}
// Small arrays are already handled by Recurse.
// We could also check for already sorted/reverse/equal, but that's probably
// counterproductive if vqsort is used as a base case.
@@ -925,31 +969,26 @@ void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
buf = storage;
#endif // !HWY_HAVE_SCALABLE
if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;
if (detail::HandleSpecialCases(d, st, keys, num)) return;
#if HWY_MAX_BYTES > 64
// sorting_networks-inl and traits assume no more than 512 bit vectors.
if (Lanes(d) > 64 / sizeof(T)) {
if (HWY_UNLIKELY(Lanes(d) > 64 / sizeof(T))) {
return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
}
#endif // HWY_MAX_BYTES > 64
// Pulled out of the recursion so we can special-case degenerate partitions.
detail::Generator rng(keys, num);
detail::PivotResult result;
const Vec<D> pivot =
detail::ChoosePivot(d, st, keys, 0, num, buf, rng, result);
if (result != detail::PivotResult::kAllEqual) {
// Introspection: switch to worst-case N*logN heapsort after this many.
const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
detail::Recurse(d, st, keys, keys + num, 0, num, pivot, buf, rng,
max_levels);
}
// Introspection: switch to worst-case N*logN heapsort after this many.
const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
detail::Recurse(d, st, keys, keys + num, 0, num, buf, rng, max_levels);
#else
(void)d;
(void)buf;
// PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
#if VQSORT_PRINT
fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n");
#endif
return detail::HeapSort(st, keys, num);
#endif // VQSORT_ENABLED
}

View File

@@ -50,6 +50,12 @@
#define HWY_COMPILER_ICC 0
#endif
#ifdef __INTEL_LLVM_COMPILER
#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
#else
#define HWY_COMPILER_ICX 0
#endif
// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
// compiler extensions (eg. Clang, Intel...)
#ifdef __GNUC__

View File

@@ -13,6 +13,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

View File

@@ -19,7 +19,9 @@
// splitting code into different files while still inlining instead of requiring
// calling through function pointers.
// Include guard (still compiled once per target)
// Per-target include guard. This is only required when using dynamic dispatch,
// i.e. including foreach_target.h. For static dispatch, a normal include
// guard would be fine because the header is only compiled once.
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
@@ -36,7 +38,8 @@ HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
using namespace hwy::HWY_NAMESPACE;
// Highway ops reside here; ADL does not find templates nor builtins.
namespace hn = hwy::HWY_NAMESPACE;
// Example of a type-agnostic (caller-specified lane type) and width-agnostic
// (uses best available instruction set) function in a header.
@@ -46,12 +49,12 @@ template <class D, typename T>
HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
const T* HWY_RESTRICT add_array,
const size_t size, T* HWY_RESTRICT x_array) {
for (size_t i = 0; i < size; i += Lanes(d)) {
const auto mul = Load(d, mul_array + i);
const auto add = Load(d, add_array + i);
auto x = Load(d, x_array + i);
x = MulAdd(mul, x, add);
Store(x, d, x_array + i);
for (size_t i = 0; i < size; i += hn::Lanes(d)) {
const auto mul = hn::Load(d, mul_array + i);
const auto add = hn::Load(d, add_array + i);
auto x = hn::Load(d, x_array + i);
x = hn::MulAdd(mul, x, add);
hn::Store(x, d, x_array + i);
}
}

View File

@@ -17,22 +17,28 @@
#include <stdio.h>
// >>>> for dynamic dispatch only, skip if you want static dispatch
// First undef to prevent error when re-included.
#undef HWY_TARGET_INCLUDE
// For runtime dispatch, specify the name of the current file (unfortunately
// For dynamic dispatch, specify the name of the current file (unfortunately
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h" // IWYU pragma: keep
// <<<< end of dynamic dispatch
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
// Optional, can instead add HWY_ATTR to all functions.
HWY_BEFORE_NAMESPACE();
namespace skeleton {
// This namespace name is unique per target, which allows code for multiple
// targets to co-exist in the same translation unit.
// targets to co-exist in the same translation unit. Required when using dynamic
// dispatch, otherwise optional.
namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
@@ -104,6 +110,7 @@ HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
uint8_t* HWY_RESTRICT out) {
// This must reside outside of HWY_NAMESPACE because it references (calls the
// appropriate one from) the per-target implementations there.
// For static dispatch, use HWY_STATIC_DISPATCH.
return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}

View File

@@ -62,7 +62,7 @@ struct TestFloorLog2 {
};
HWY_NOINLINE void TestAllFloorLog2() {
ForPartialVectors<TestFloorLog2>()(float());
hn::ForPartialVectors<TestFloorLog2>()(float());
}
// Calls function defined in skeleton-inl.h.
@@ -91,7 +91,7 @@ struct TestSumMulAdd {
};
HWY_NOINLINE void TestAllSumMulAdd() {
ForFloatTypes(ForPartialVectors<TestSumMulAdd>());
hn::ForFloatTypes(hn::ForPartialVectors<TestSumMulAdd>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)

View File

@@ -29,7 +29,7 @@ namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 0
#define HWY_PATCH 1
//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.

View File

@@ -15,7 +15,6 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <bitset>
@@ -224,7 +223,7 @@ HWY_INLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
// avoid truncating doubles.
uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
const T lane = GetLane(v);
memcpy(bytes, &lane, sizeof(T));
CopyBytes<sizeof(T)>(&lane, bytes);
Abort(file, line,
"Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
"%02x)",

View File

@@ -15,11 +15,13 @@
#include "hwy/nanobenchmark.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h> // abort
#include <string.h> // memcpy
#include <stdlib.h>
#include <time.h> // clock_gettime
#include <algorithm> // sort
@@ -414,7 +416,7 @@ std::string BrandString() {
for (size_t i = 0; i < 3; ++i) {
Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd.data());
memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
CopyBytes<sizeof(abcd)>(&abcd[0], brand_string + i * 16); // not same size
}
brand_string[48] = 0;
return brand_string;

View File

@@ -15,6 +15,9 @@
#include "hwy/nanobenchmark.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

View File

@@ -1030,6 +1030,9 @@ template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif
typename detail::Raw128<T, N>::type a;
return Vec128<T, N>(a);
HWY_DIAGNOSTICS(pop)
@@ -3285,6 +3288,16 @@ HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
return Vec128<float, N>(vcvt_f32_s32(v.raw));
}
HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
const Vec128<uint32_t> v) {
return Vec128<float>(vcvtq_f32_u32(v.raw));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
const Vec128<uint32_t, N> v) {
return Vec128<float, N>(vcvt_f32_u32(v.raw));
}
// Truncates (rounds toward zero).
HWY_API Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */,
const Vec128<float> v) {
@@ -3307,6 +3320,15 @@ HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
return Vec64<double>(vcvt_f64_s64(v.raw));
}
HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
const Vec128<uint64_t> v) {
return Vec128<double>(vcvtq_f64_u64(v.raw));
}
HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
const Vec64<uint64_t> v) {
return Vec64<double>(vcvt_f64_u64(v.raw));
}
// Truncates (rounds toward zero).
HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
const Vec128<double> v) {
@@ -4979,24 +5001,52 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
// u16/i16
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const Repartition<int32_t, Simd<T, N, 0>> d32;
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const Repartition<int32_t, Simd<T, N, 0>> d32;
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail
@@ -6356,64 +6406,6 @@ HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
return IfThenElse(Lt128Upper(d, b, a), a, b);
}
// ================================================== Operator wrapper
// These apply to all x86_*-inl.h because there are no restrictions on V.
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
namespace detail { // for code folding
#if HWY_ARCH_ARM_V7
#undef vuzp1_s8

View File

@@ -629,6 +629,13 @@ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, Mul, mul)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)
// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif
// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
namespace detail {
@@ -1497,11 +1504,18 @@ HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
// ------------------------------ ConvertTo F
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
/* signed integers */ \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(BASE, BITS) \
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
} \
/* unsigned integers */ \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(BASE, BITS) \
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
} \
/* Truncates (rounds toward zero). */ \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(int, BITS) \
@@ -2248,9 +2262,9 @@ HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) {
#endif
#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
uint64_t bits = 0; // predicate reg is 32-bit
CopyBytes<4>(&mask, &bits);
CopyBytes<4>(&mask, &bits); // not same size - 64-bit more efficient
// Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx.
const size_t offset = ((bits & 1) ? 4 : 0) + ((bits & 0x10000) ? 8 : 0);
const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u);
// See CompressIsPartition. Manually generated; flip halves if mask = [0, 1].
alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1,
0, 1, 2, 3, 0, 1, 2, 3};
@@ -2680,7 +2694,7 @@ HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
// Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
// The "at least 8 byte" guarantee in quick_reference ensures this is safe.
uint32_t mask_bits;
CopyBytes<4>(bits, &mask_bits);
CopyBytes<4>(bits, &mask_bits); // copy from bytes
const auto vbits = Set(du, mask_bits);
// 2 ^ {0,1, .., 31}, will not have more lanes than that.

View File

@@ -101,9 +101,7 @@ using TFromV = TFromD<DFromV<V>>;
template <typename T, size_t N, typename FromT, size_t FromN>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> /* tag */, Vec128<FromT, FromN> v) {
Vec128<T, N> to;
static_assert(sizeof(T) * N == sizeof(FromT) * FromN,
"Casting does not change size");
CopyBytes<sizeof(T) * N>(v.raw, to.raw);
CopySameSize(&v, &to);
return to;
}
@@ -285,8 +283,7 @@ template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
Mask128<TFrom, N> mask) {
Mask128<TTo, N> to;
static_assert(sizeof(TTo) * N == sizeof(TFrom) * N, "Must have same size");
CopyBytes<sizeof(TTo) * N>(mask.bits, to.bits);
CopySameSize(&mask, &to);
return to;
}
@@ -294,15 +291,14 @@ HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
Mask128<T, N> mask;
static_assert(sizeof(v) == sizeof(mask), "Must have same size");
CopyBytes<sizeof(T) * N>(v.raw, mask.bits);
CopySameSize(&v, &mask);
return mask;
}
template <typename T, size_t N>
Vec128<T, N> VecFromMask(const Mask128<T, N> mask) {
Vec128<T, N> v;
CopyBytes<sizeof(T) * N>(mask.bits, v.raw);
CopySameSize(&mask, &v);
return v;
}
@@ -926,10 +922,10 @@ HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
for (size_t i = 0; i < N; ++i) {
const float half = v.raw[i] * 0.5f;
uint32_t bits;
CopyBytes<4>(&v.raw[i], &bits);
CopySameSize(&v.raw[i], &bits);
// Initial guess based on log2(f)
bits = 0x5F3759DF - (bits >> 1);
CopyBytes<4>(&bits, &v.raw[i]);
CopySameSize(&bits, &v.raw[i]);
// One Newton-Raphson iteration
v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
}
@@ -1039,7 +1035,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
const bool positive = v.raw[i] > Float(0.0);
Bits bits;
CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);
CopySameSize(&v.raw[i], &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1059,7 +1055,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
if (positive) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
CopySameSize(&bits, &v.raw[i]);
}
return v;
}
@@ -1077,7 +1073,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
const bool negative = v.raw[i] < Float(0.0);
Bits bits;
CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);
CopySameSize(&v.raw[i], &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1097,7 +1093,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
if (negative) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
CopySameSize(&bits, &v.raw[i]);
}
return v;
}
@@ -1110,7 +1106,7 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
for (size_t i = 0; i < N; ++i) {
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
MakeUnsigned<T> bits;
memcpy(&bits, &v.raw[i], sizeof(T));
CopySameSize(&v.raw[i], &bits);
bits += bits;
bits >>= 1; // clear sign bit
// NaN if all exponent bits are set and the mantissa is not zero.
@@ -1278,7 +1274,7 @@ template <typename T, size_t N>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */,
const T* HWY_RESTRICT aligned) {
Vec128<T, N> v;
CopyBytes<sizeof(T) * N>(aligned, v.raw);
CopyBytes<sizeof(T) * N>(aligned, v.raw); // copy from array
return v;
}
@@ -1305,7 +1301,7 @@ HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
template <typename T, size_t N>
HWY_API void Store(const Vec128<T, N> v, Simd<T, N, 0> /* tag */,
T* HWY_RESTRICT aligned) {
CopyBytes<sizeof(T) * N>(v.raw, aligned);
CopyBytes<sizeof(T) * N>(v.raw, aligned); // copy to array
}
template <typename T, size_t N>
@@ -1434,7 +1430,7 @@ HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* base,
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
for (size_t i = 0; i < N; ++i) {
uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw[i];
CopyBytes<sizeof(T)>(&v.raw[i], base8);
CopyBytes<sizeof(T)>(&v.raw[i], base8); // copy to bytes
}
}
@@ -1457,7 +1453,7 @@ HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> /* tag */, const T* base,
for (size_t i = 0; i < N; ++i) {
const uint8_t* base8 =
reinterpret_cast<const uint8_t*>(base) + offset.raw[i];
CopyBytes<sizeof(T)>(base8, &v.raw[i]);
CopyBytes<sizeof(T)>(base8, &v.raw[i]); // copy from bytes
}
return v;
}
@@ -1556,12 +1552,12 @@ namespace detail {
HWY_INLINE void StoreU16ToF16(const uint16_t val,
hwy::float16_t* HWY_RESTRICT to) {
CopyBytes<2>(&val, to);
CopySameSize(&val, to);
}
HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) {
uint16_t bits16;
CopyBytes<2>(from, &bits16);
CopySameSize(from, &bits16);
return bits16;
}
@@ -1590,7 +1586,7 @@ HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
const uint32_t biased_exp32 = biased_exp + (127 - 15);
const uint32_t mantissa32 = mantissa << (23 - 10);
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
CopyBytes<4>(&bits32, &ret.raw[i]);
CopySameSize(&bits32, &ret.raw[i]);
}
return ret;
}
@@ -1611,7 +1607,7 @@ HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
Vec128<float16_t, N> ret;
for (size_t i = 0; i < N; ++i) {
uint32_t bits32;
CopyBytes<4>(&v.raw[i], &bits32);
CopySameSize(&v.raw[i], &bits32);
const uint32_t sign = bits32 >> 31;
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
const uint32_t mantissa32 = bits32 & 0x7FFFFF;
@@ -2446,62 +2442,6 @@ HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
return Load(Full128<uint64_t>(), mul);
}
// ================================================== Operator wrapper
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

View File

@@ -1209,7 +1209,8 @@ HWY_API V PopulationCount(V v) {
// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1), HWY_IF_LT128_D(D)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
HWY_IF_LT128_D(D)>
HWY_API V PopulationCount(V v) {
static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
const D d;
@@ -1251,6 +1252,105 @@ HWY_API V PopulationCount(V v) {
#endif // HWY_NATIVE_POPCNT
template <class V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8),
HWY_IF_LT128_D(D)>
HWY_API V operator*(V x, V y) {
return Set(D(), GetLane(x) * GetLane(y));
}
// "Include guard": skip if native 64-bit mul instructions are available.
#if (defined(HWY_NATIVE_I64MULLO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif
template <class V, class D64 = DFromV<V>, typename T = LaneType<V>,
HWY_IF_LANE_SIZE(T, 8), HWY_IF_UNSIGNED(T), HWY_IF_GE128_D(D64)>
HWY_API V operator*(V x, V y) {
RepartitionToNarrow<D64> d32;
auto x32 = BitCast(d32, x);
auto y32 = BitCast(d32, y);
auto lolo = BitCast(d32, MulEven(x32, y32));
auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
return BitCast(D64{}, lolo + hi);
}
template <class V, class DI64 = DFromV<V>, typename T = LaneType<V>,
HWY_IF_LANE_SIZE(T, 8), HWY_IF_SIGNED(T), HWY_IF_GE128_D(DI64)>
HWY_API V operator*(V x, V y) {
RebindToUnsigned<DI64> du64;
return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
}
#endif // HWY_NATIVE_I64MULLO
// ================================================== Operator wrapper
// These targets currently cannot define operators and have already defined
// (only) the corresponding functions such as Add.
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \
HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
HWY_TARGET != HWY_SVE2_128
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
#endif // HWY_TARGET for operators
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

View File

@@ -949,16 +949,16 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)
// ------------------------------ Mul
// Only for internal use (Highway only promises Mul for 16/32-bit inputs).
// Used by MulLower.
namespace detail {
HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
} // namespace detail
HWY_RVV_FOREACH_UI16(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_UI32(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_UI163264(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif
// ------------------------------ MulHigh
// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
@@ -2019,6 +2019,11 @@ HWY_API VFromD<Simd<uint16_t, N, kPow2>> DemoteTo(
HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \
return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \
} \
template <size_t N> \
HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) {\
return vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \
} \
/* Truncates (rounds toward zero). */ \
template <size_t N> \
HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
@@ -3069,14 +3074,14 @@ HWY_API VFromD<DW> MulEven(const V a, const V b) {
// There is no 64x64 vwmul.
template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
HWY_INLINE V MulEven(const V a, const V b) {
const auto lo = detail::Mul(a, b);
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
return OddEven(detail::Slide1Up(hi), lo);
}
template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
HWY_INLINE V MulOdd(const V a, const V b) {
const auto lo = detail::Mul(a, b);
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
return OddEven(hi, detail::Slide1Down(lo));
}

View File

@@ -102,7 +102,7 @@ template <typename T, typename FromT>
HWY_API Vec1<T> BitCast(Sisd<T> /* tag */, Vec1<FromT> v) {
static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
T to;
CopyBytes<sizeof(FromT)>(&v.raw, &to);
CopyBytes<sizeof(FromT)>(&v.raw, &to); // not same size - ok to shrink
return Vec1<T>(to);
}
@@ -260,21 +260,21 @@ HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
template <typename T>
HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
Mask1<T> mask;
CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
CopySameSize(&v, &mask);
return mask;
}
template <typename T>
Vec1<T> VecFromMask(const Mask1<T> mask) {
Vec1<T> v;
CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
CopySameSize(&mask, &v);
return v;
}
template <typename T>
Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
Vec1<T> v;
CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
CopySameSize(&mask, &v);
return v;
}
@@ -697,10 +697,10 @@ HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
float f = v.raw;
const float half = f * 0.5f;
uint32_t bits;
CopyBytes<4>(&f, &bits);
CopySameSize(&f, &bits);
// Initial guess based on log2(f)
bits = 0x5F3759DF - (bits >> 1);
CopyBytes<4>(&bits, &f);
CopySameSize(&bits, &f);
// One Newton-Raphson iteration
return Vec1<float>(f * (1.5f - (half * f * f)));
}
@@ -778,7 +778,7 @@ V Ceiling(const V v) {
const bool positive = f > Float(0.0);
Bits bits;
CopyBytes<sizeof(Bits)>(&v, &bits);
CopySameSize(&v, &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -795,7 +795,7 @@ V Ceiling(const V v) {
if (positive) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &f);
CopySameSize(&bits, &f);
return V(f);
}
@@ -810,7 +810,7 @@ V Floor(const V v) {
const bool negative = f < Float(0.0);
Bits bits;
CopyBytes<sizeof(Bits)>(&v, &bits);
CopySameSize(&v, &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -827,7 +827,7 @@ V Floor(const V v) {
if (negative) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &f);
CopySameSize(&bits, &f);
return V(f);
}
@@ -889,7 +889,7 @@ template <typename T>
HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
MakeUnsigned<T> bits;
memcpy(&bits, &v, sizeof(v));
CopySameSize(&v, &bits);
bits += bits;
bits >>= 1; // clear sign bit
// NaN if all exponent bits are set and the mantissa is not zero.
@@ -929,7 +929,7 @@ HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
template <typename T>
HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
T t;
CopyBytes<sizeof(T)>(aligned, &t);
CopySameSize(aligned, &t);
return Vec1<T>(t);
}
@@ -955,7 +955,7 @@ HWY_API Vec1<T> LoadDup128(Sisd<T> d, const T* HWY_RESTRICT aligned) {
template <typename T>
HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
T* HWY_RESTRICT aligned) {
CopyBytes<sizeof(T)>(&v.raw, aligned);
CopySameSize(&v.raw, aligned);
}
template <typename T>
@@ -1119,7 +1119,7 @@ HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
uint16_t bits16;
CopyBytes<2>(&v.raw, &bits16);
CopySameSize(&v.raw, &bits16);
const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
const uint32_t mantissa = bits16 & 0x3FF;
@@ -1136,7 +1136,7 @@ HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
const uint32_t mantissa32 = mantissa << (23 - 10);
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
float out;
CopyBytes<4>(&bits32, &out);
CopySameSize(&bits32, &out);
return Vec1<float>(out);
}
@@ -1147,7 +1147,7 @@ HWY_API Vec1<float> PromoteTo(Sisd<float> d, const Vec1<bfloat16_t> v) {
HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
const Vec1<float> v) {
uint32_t bits32;
CopyBytes<4>(&v.raw, &bits32);
CopySameSize(&v.raw, &bits32);
const uint32_t sign = bits32 >> 31;
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
const uint32_t mantissa32 = bits32 & 0x7FFFFF;
@@ -1158,7 +1158,7 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
Vec1<float16_t> out;
if (exp < -24) {
const uint16_t zero = 0;
CopyBytes<2>(&zero, &out.raw);
CopySameSize(&zero, &out.raw);
return out;
}
@@ -1182,7 +1182,7 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
HWY_DASSERT(bits16 < 0x10000);
const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
CopyBytes<2>(&narrowed, &out.raw);
CopySameSize(&narrowed, &out.raw);
return out;
}
@@ -1379,7 +1379,7 @@ HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) {
uint8_t in_bytes[sizeof(T)];
uint8_t idx_bytes[sizeof(T)];
uint8_t out_bytes[sizeof(T)];
CopyBytes<sizeof(T)>(&in, &in_bytes);
CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
CopyBytes<sizeof(T)>(&indices, &idx_bytes);
for (size_t i = 0; i < sizeof(T); ++i) {
out_bytes[i] = in_bytes[idx_bytes[i]];
@@ -1394,7 +1394,7 @@ HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) {
uint8_t in_bytes[sizeof(T)];
uint8_t idx_bytes[sizeof(T)];
uint8_t out_bytes[sizeof(T)];
CopyBytes<sizeof(T)>(&in, &in_bytes);
CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
CopyBytes<sizeof(T)>(&indices, &idx_bytes);
for (size_t i = 0; i < sizeof(T); ++i) {
out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
@@ -1546,62 +1546,6 @@ HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
// ================================================== Operator wrapper
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

View File

@@ -3367,6 +3367,11 @@ HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
const Vec128<uint32_t, N> v) {
return Vec128<float, N>{wasm_f32x4_convert_u32x4(v.raw)};
}
// Truncates (rounds toward zero).
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
@@ -4348,26 +4353,52 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
// u16/i16
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const Repartition<int32_t, decltype(d)> d32;
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const Repartition<int32_t, decltype(d)> d32;
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail
@@ -4463,62 +4494,6 @@ HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
return IfThenElse(Lt128Upper(d, b, a), a, b);
}
// ================================================== Operator wrapper
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

View File

@@ -592,7 +592,7 @@ HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
}
HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t>, Vec256<int16_t>) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
@@ -1043,7 +1043,7 @@ HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
template <typename T>
HWY_API Vec256 <
T IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
template <typename T, HWY_IF_FLOAT(T)>
@@ -1333,13 +1333,13 @@ HWY_API Vec256<T> GatherIndex(const Full256<T> d, const T* HWY_RESTRICT base,
// ------------------------------ ExtractLane
template <typename T, size_t N>
HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ InsertLane
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(const Vec128<T, N> v, size_t i, T t) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ GetLane
@@ -1846,21 +1846,21 @@ HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
template <typename T>
HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ Reverse4
template <typename T>
HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ Reverse8
template <typename T>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ InterleaveLower
@@ -2065,13 +2065,13 @@ HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
// ------------------------------ DupEven
template <typename T>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ DupOdd
template <typename T>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ OddEven
@@ -2354,6 +2354,10 @@ HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
const Vec256<int32_t> v) {
return Vec256<float>{wasm_f32x4_convert_i32x4(v.raw)};
}
HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
const Vec256<uint32_t> v) {
return Vec256<float>{wasm_f32x4_convert_u32x4(v.raw)};
}
// Truncates (rounds toward zero).
HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> /* tag */,
const Vec256<float> v) {
@@ -2811,7 +2815,7 @@ HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
// ------------------------------ CompressBlocksNot
HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
Mask256<uint64_t> mask) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ CompressBits
@@ -2968,22 +2972,12 @@ HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
// u16/i16
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
const Repartition<int32_t, Full256<T>> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> /*v*/) {
HWY_ASSERT(0); // Not implemented
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
const Repartition<int32_t, Full256<T>> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> /*v*/) {
HWY_ASSERT(0); // Not implemented
}
} // namespace detail

View File

@@ -17,6 +17,17 @@
// operations when compiling for those targets.
// External include guard in highway.h - see comment there.
// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
#include "hwy/base.h"
// Avoid uninitialized warnings in GCC's emmintrin.h - see
// https://github.com/google/highway/issues/710 and pull/902)
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
#endif
#include <emmintrin.h>
#include <stdio.h>
#if HWY_TARGET == HWY_SSSE3
@@ -27,8 +38,8 @@
#endif
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcpy
#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#if HWY_IS_MSAN
@@ -1910,7 +1921,7 @@ template <typename T>
HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128i v = _mm_setzero_si128();
CopyBytes<8>(p, &v);
CopyBytes<8>(p, &v); // not same size
return Vec64<T>{v};
#else
return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
@@ -1921,7 +1932,7 @@ HWY_API Vec128<float, 2> Load(Full64<float> /* tag */,
const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
CopyBytes<8>(p, &v);
CopyBytes<8>(p, &v); // not same size
return Vec128<float, 2>{v};
#else
const __m128 hi = _mm_setzero_ps();
@@ -1933,7 +1944,7 @@ HWY_API Vec64<double> Load(Full64<double> /* tag */,
const double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128d v = _mm_setzero_pd();
CopyBytes<8>(p, &v);
CopyBytes<8>(p, &v); // not same size
return Vec64<double>{v};
#else
return Vec64<double>{_mm_load_sd(p)};
@@ -1944,7 +1955,7 @@ HWY_API Vec128<float, 1> Load(Full32<float> /* tag */,
const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
CopyBytes<4>(p, &v);
CopyBytes<4>(p, &v); // not same size
return Vec128<float, 1>{v};
#else
return Vec128<float, 1>{_mm_load_ss(p)};
@@ -1957,11 +1968,11 @@ HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
constexpr size_t kSize = sizeof(T) * N;
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
CopyBytes<kSize>(p, &v);
CopyBytes<kSize>(p, &v); // not same size
return Vec128<T, N>{v};
#else
int32_t bits = 0;
CopyBytes<kSize>(p, &bits);
CopyBytes<kSize>(p, &bits); // not same size
return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
#endif
}
@@ -2111,7 +2122,7 @@ HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
template <typename T>
HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
CopyBytes<8>(&v, p);
CopyBytes<8>(&v, p); // not same size
#else
_mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
#endif
@@ -2119,7 +2130,7 @@ HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
CopyBytes<8>(&v, p);
CopyBytes<8>(&v, p); // not same size
#else
_mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
#endif
@@ -2127,7 +2138,7 @@ HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
CopyBytes<8>(&v, p);
CopyBytes<8>(&v, p); // not same size
#else
_mm_storel_pd(p, v.raw);
#endif
@@ -2136,12 +2147,12 @@ HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
// Any <= 32 bit except <float, 1>
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
CopyBytes<sizeof(T) * N>(&v, p);
CopyBytes<sizeof(T) * N>(&v, p); // not same size
}
HWY_API void Store(const Vec128<float, 1> v, Full32<float> /* tag */,
float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
CopyBytes<4>(&v, p);
CopyBytes<4>(&v, p); // not same size
#else
_mm_store_ss(p, v.raw);
#endif
@@ -2172,7 +2183,7 @@ HWY_API void ScalarMaskedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
Store(BitCast(di, VecFromMask(d, m)), di, mask);
for (size_t i = 0; i < N; ++i) {
if (mask[i]) {
CopyBytes<sizeof(T)>(buf + i, p + i);
CopySameSize(buf + i, p + i);
}
}
}
@@ -3635,9 +3646,9 @@ HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
return lanes[kLane];
#else
// Bug in the intrinsic, returns int but should be float.
const int bits = _mm_extract_ps(v.raw, kLane);
const int32_t bits = _mm_extract_ps(v.raw, kLane);
float ret;
CopyBytes<4>(&bits, &ret);
CopySameSize(&bits, &ret);
return ret;
#endif
}
@@ -3814,7 +3825,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
return Load(d, lanes);
#else
MakeSigned<T> ti;
CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
CopySameSize(&t, &ti); // don't just cast because T might be float.
return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
#endif
}
@@ -3830,7 +3841,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
return Load(d, lanes);
#else
MakeSigned<T> ti;
CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
CopySameSize(&t, &ti); // don't just cast because T might be float.
return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
#endif
}
@@ -5582,6 +5593,26 @@ HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> ConvertTo(HWY_MAYBE_UNUSED Simd<float, N, 0> df,
const Vec128<uint32_t, N> v) {
#if HWY_TARGET <= HWY_AVX3
return Vec128<float, N>{_mm_cvtepu32_ps(v.raw)};
#else
// Based on wim's approach (https://stackoverflow.com/questions/34066228/)
const RebindToUnsigned<decltype(df)> du32;
const RebindToSigned<decltype(df)> d32;
const auto msk_lo = Set(du32, 0xFFFF);
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
// Extract the 16 lowest/highest significant bits of v and cast to signed int
const auto v_lo = BitCast(d32, And(v, msk_lo));
const auto v_hi = BitCast(d32, ShiftRight<16>(v));
return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
#endif
}
template <size_t N>
HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
const Vec128<int64_t, N> v) {
@@ -5606,6 +5637,33 @@ HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
#endif
}
template <size_t N>
HWY_API Vec128<double, N> ConvertTo(HWY_MAYBE_UNUSED Simd<double, N, 0> dd,
const Vec128<uint64_t, N> v) {
#if HWY_TARGET <= HWY_AVX3
return Vec128<double, N>{_mm_cvtepu64_pd(v.raw)};
#else
// Based on wim's approach (https://stackoverflow.com/questions/41144668/)
const RebindToUnsigned<decltype(dd)> d64;
using VU = VFromD<decltype(d64)>;
const VU msk_lo = Set(d64, 0xFFFFFFFF);
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
// Extract the 32 lowest/highest significant bits of v
const VU v_lo = And(v, msk_lo);
const VU v_hi = ShiftRight<32>(v);
auto uint64_to_double128_fast = [&dd](VU w) HWY_ATTR {
w = Or(w, VU{detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
return BitCast(dd, w) - Set(dd, 0x0010000000000000);
};
const auto v_lo_dbl = uint64_to_double128_fast(v_lo);
return MulAdd(cnst2_32_dbl, uint64_to_double128_fast(v_hi), v_lo_dbl);
#endif
}
// Truncates (rounds toward zero).
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N, 0> di,
@@ -5959,8 +6017,8 @@ HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
// Non-full byte, need to clear the undefined upper bits.
if (N < 8) {
const int mask = (1 << N) - 1;
bits[0] = static_cast<uint8_t>(bits[0] & mask);
const int mask_bits = (1 << N) - 1;
bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
}
return kNumBytes;
@@ -7103,24 +7161,52 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
// u16/i16
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const Repartition<int32_t, Simd<T, N, 0>> d32;
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const Repartition<int32_t, Simd<T, N, 0>> d32;
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail
@@ -7237,65 +7323,11 @@ HWY_API V Max128Upper(D d, const V a, const V b) {
return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}
// ================================================== Operator wrapper
// These apply to all x86_*-inl.h because there are no restrictions on V.
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
// the warning seems to be issued at the call site of intrinsics, i.e. our code.
HWY_DIAGNOSTICS(pop)

View File

@@ -49,6 +49,7 @@ HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcpy
#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
@@ -2368,7 +2369,7 @@ HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
Store(BitCast(du, VecFromMask(d, m)), du, mask);
for (size_t i = 0; i < 32 / sizeof(T); ++i) {
if (mask[i]) {
CopyBytes<sizeof(T)>(buf + i, p + i);
CopySameSize(buf + i, p + i);
}
}
}
@@ -4207,6 +4208,53 @@ HWY_API Vec256<double> ConvertTo(Full256<double> dd, const Vec256<int64_t> v) {
#endif
}
HWY_API Vec256<float> ConvertTo(HWY_MAYBE_UNUSED Full256<float> df,
const Vec256<uint32_t> v) {
#if HWY_TARGET <= HWY_AVX3
return Vec256<float>{_mm256_cvtepu32_ps(v.raw)};
#else
// Based on wim's approach (https://stackoverflow.com/questions/34066228/)
const RebindToUnsigned<decltype(df)> du32;
const RebindToSigned<decltype(df)> d32;
const auto msk_lo = Set(du32, 0xFFFF);
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
// Extract the 16 lowest/highest significant bits of v and cast to signed int
const auto v_lo = BitCast(d32, And(v, msk_lo));
const auto v_hi = BitCast(d32, ShiftRight<16>(v));
return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
#endif
}
HWY_API Vec256<double> ConvertTo(HWY_MAYBE_UNUSED Full256<double> dd,
const Vec256<uint64_t> v) {
#if HWY_TARGET <= HWY_AVX3
return Vec256<double>{_mm256_cvtepu64_pd(v.raw)};
#else
// Based on wim's approach (https://stackoverflow.com/questions/41144668/)
const RebindToUnsigned<decltype(dd)> d64;
using VU = VFromD<decltype(d64)>;
const VU msk_lo = Set(d64, 0xFFFFFFFFULL);
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
// Extract the 32 lowest significant bits of v
const VU v_lo = And(v, msk_lo);
const VU v_hi = ShiftRight<32>(v);
auto uint64_to_double256_fast = [&dd](Vec256<uint64_t> w) HWY_ATTR {
w = Or(w, Vec256<uint64_t>{
detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
return BitCast(dd, w) - Set(dd, 0x0010000000000000);
};
const auto v_lo_dbl = uint64_to_double256_fast(v_lo);
return MulAdd(cnst2_32_dbl, uint64_to_double256_fast(v_hi), v_lo_dbl);
#endif
}
// Truncates (rounds toward zero).
HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> d, const Vec256<float> v) {
return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));
@@ -4396,8 +4444,8 @@ HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
// Non-full byte, need to clear the undefined upper bits.
if (N < 8) {
const int mask = static_cast<int>((1ull << N) - 1);
bits[0] = static_cast<uint8_t>(bits[0] & mask);
const int mask_bits = static_cast<int>((1ull << N) - 1);
bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
}
return kNumBytes;
}
@@ -5381,24 +5429,48 @@ HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
// u16/i16
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
const Repartition<int32_t, Full256<T>> d32;
HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<uint16_t> v) {
const Full256<uint16_t> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
const Repartition<int32_t, Full256<T>> d32;
HWY_API Vec256<int16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<int16_t> v) {
const Full256<int16_t> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec256<uint16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<uint16_t> v) {
const Full256<uint16_t> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<int16_t> v) {
const Full256<int16_t> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail

View File

@@ -1164,6 +1164,22 @@ HWY_API Vec512<uint16_t> operator*(Vec512<uint16_t> a, Vec512<uint16_t> b) {
HWY_API Vec512<uint32_t> operator*(Vec512<uint32_t> a, Vec512<uint32_t> b) {
return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
}
HWY_API Vec512<uint64_t> operator*(Vec512<uint64_t> a, Vec512<uint64_t> b) {
return Vec512<uint64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
}
HWY_API Vec128<uint64_t> operator*(Vec128<uint64_t> a, Vec128<uint64_t> b) {
return Vec128<uint64_t>{_mm_mullo_epi64(a.raw, b.raw)};
}
// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif
// Signed
HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) {
@@ -1172,7 +1188,15 @@ HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) {
HWY_API Vec512<int32_t> operator*(Vec512<int32_t> a, Vec512<int32_t> b) {
return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
}
HWY_API Vec512<int64_t> operator*(Vec512<int64_t> a, Vec512<int64_t> b) {
return Vec512<int64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
}
HWY_API Vec128<int64_t> operator*(Vec128<int64_t> a, Vec128<int64_t> b) {
return Vec128<int64_t>{_mm_mullo_epi64(a.raw, b.raw)};
}
// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec512<uint16_t> MulHigh(Vec512<uint16_t> a, Vec512<uint16_t> b) {
return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
@@ -3399,6 +3423,16 @@ HWY_API Vec512<double> ConvertTo(Full512<double> /* tag */,
return Vec512<double>{_mm512_cvtepi64_pd(v.raw)};
}
HWY_API Vec512<float> ConvertTo(Full512<float> /* tag*/,
const Vec512<uint32_t> v) {
return Vec512<float>{_mm512_cvtepu32_ps(v.raw)};
}
HWY_API Vec512<double> ConvertTo(Full512<double> /* tag*/,
const Vec512<uint64_t> v) {
return Vec512<double>{_mm512_cvtepu64_pd(v.raw)};
}
// Truncates (rounds toward zero).
HWY_API Vec512<int32_t> ConvertTo(Full512<int32_t> d, const Vec512<float> v) {
return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw));
@@ -4231,14 +4265,22 @@ HWY_API Vec512<float> MinOfLanes(Full512<float> d, Vec512<float> v) {
HWY_API Vec512<double> MinOfLanes(Full512<double> d, Vec512<double> v) {
return Set(d, _mm512_reduce_min_pd(v.raw));
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> MinOfLanes(Full512<T> d, Vec512<T> v) {
const Repartition<int32_t, decltype(d)> d32;
HWY_API Vec512<uint16_t> MinOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec512<int16_t> MinOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
// Returns the maximum in each lane.
@@ -4260,14 +4302,22 @@ HWY_API Vec512<float> MaxOfLanes(Full512<float> d, Vec512<float> v) {
HWY_API Vec512<double> MaxOfLanes(Full512<double> d, Vec512<double> v) {
return Set(d, _mm512_reduce_max_pd(v.raw));
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) {
const Repartition<int32_t, decltype(d)> d32;
HWY_API Vec512<uint16_t> MaxOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec512<int16_t> MaxOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)

View File

@@ -15,7 +15,6 @@
// Print() function
#include <inttypes.h>
#include <stdint.h>
#include "hwy/aligned_allocator.h"

View File

@@ -15,6 +15,9 @@
#include "hwy/print.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>

View File

@@ -15,6 +15,9 @@
#include "hwy/targets.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h> // PRIx64
#include <stdarg.h>
#include <stddef.h>
@@ -23,7 +26,7 @@
#include <atomic>
#include "hwy/per_target.h"
#include "hwy/per_target.h" // VectorBytes
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

View File

@@ -15,7 +15,7 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <string.h> // memcpy
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
@@ -63,17 +63,17 @@ struct TestShiftBytes {
auto expected = AllocateAligned<T>(N);
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
const size_t kBlockSize = HWY_MIN(N8, 16);
for (size_t block = 0; block < N8; block += kBlockSize) {
const size_t block_size = HWY_MIN(N8, 16);
for (size_t block = 0; block < N8; block += block_size) {
expected_bytes[block] = 0;
memcpy(expected_bytes + block + 1, in_bytes + block, kBlockSize - 1);
memcpy(expected_bytes + block + 1, in_bytes + block, block_size - 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));
for (size_t block = 0; block < N8; block += kBlockSize) {
memcpy(expected_bytes + block, in_bytes + block + 1, kBlockSize - 1);
expected_bytes[block + kBlockSize - 1] = 0;
for (size_t block = 0; block < N8; block += block_size) {
memcpy(expected_bytes + block, in_bytes + block + 1, block_size - 1);
expected_bytes[block + block_size - 1] = 0;
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
#else
@@ -152,7 +152,7 @@ template <int kBytes>
struct TestCombineShiftRightBytes {
template <class T, class D>
HWY_NOINLINE void operator()(T, D d) {
const size_t kBlockSize = 16;
constexpr size_t kBlockSize = 16;
static_assert(kBytes < kBlockSize, "Shift count is per block");
const Repartition<uint8_t, D> d8;
const size_t N8 = Lanes(d8);
@@ -170,6 +170,7 @@ struct TestCombineShiftRightBytes {
lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
}
for (size_t i = 0; i < N8; i += kBlockSize) {
// Arguments are not the same size.
CopyBytes<kBlockSize>(&lo_bytes[i], combined);
CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
@@ -194,7 +195,7 @@ struct TestCombineShiftRightLanes {
auto hi_bytes = AllocateAligned<uint8_t>(N8);
auto lo_bytes = AllocateAligned<uint8_t>(N8);
auto expected_bytes = AllocateAligned<uint8_t>(N8);
const size_t kBlockSize = 16;
constexpr size_t kBlockSize = 16;
uint8_t combined[2 * kBlockSize];
// Random inputs in each lane
@@ -205,6 +206,7 @@ struct TestCombineShiftRightLanes {
lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
}
for (size_t i = 0; i < N8; i += kBlockSize) {
// Arguments are not the same size.
CopyBytes<kBlockSize>(&lo_bytes[i], combined);
CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),

View File

@@ -15,6 +15,7 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcpy
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/combine_test.cc"

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h> // PRIu64
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memset
@@ -44,19 +43,16 @@ void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
int line) {
if (expected_pos != actual_pos) {
hwy::Abort(
__FILE__, line,
"Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
TypeName(T(), Lanes(d)).c_str(), static_cast<uint64_t>(expected_pos),
static_cast<uint64_t>(actual_pos));
hwy::Abort(__FILE__, line, "Size mismatch for %s: expected %d, actual %d\n",
TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
static_cast<int>(actual_pos));
}
// Modified from AssertVecEqual - we may not be checking all lanes.
for (size_t i = 0; i < num_to_check; ++i) {
if (!IsEqual(expected[i], actual_u[i])) {
const size_t N = Lanes(d);
fprintf(stderr, "Mismatch at i=%" PRIu64 " of %" PRIu64 ", line %d:\n\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(num_to_check),
line);
fprintf(stderr, "Mismatch at i=%d of %d, line %d:\n\n",
static_cast<int>(i), static_cast<int>(num_to_check), line);
Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
Print(d, "in", Load(d, in.get()), 0, N);
Print(d, "expect", Load(d, expected.get()), 0, N);
@@ -97,7 +93,7 @@ struct TestCompress {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = Random32(&rng);
in_lanes[i] = T(); // cannot initialize float16_t directly.
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
if (mask_lanes[i] > 0) {
expected[expected_pos++] = in_lanes[i];
@@ -203,8 +199,8 @@ struct TestCompressBlocks {
for (size_t i = 0; i < N; i += 2) {
const uint64_t bits = Random32(&rng);
in_lanes[i + 1] = in_lanes[i] = T(); // cannot set float16_t directly.
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]);
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]); // not same size
mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0};
if (mask_lanes[i] > 0) {
expected[expected_pos++] = in_lanes[i];
@@ -598,8 +594,7 @@ void PrintCompress32x4Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%" PRIu64 ",",
static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}
@@ -630,8 +625,7 @@ void PrintCompressNot32x4Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%" PRIu64 ",",
static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}
@@ -662,8 +656,7 @@ void PrintCompress64x2Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%" PRIu64 ",",
static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}
@@ -694,8 +687,7 @@ void PrintCompressNot64x2Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%" PRIu64 ",",
static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}

View File

@@ -16,6 +16,9 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <cmath> // std::isfinite
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
@@ -155,7 +158,7 @@ struct TestPromoteTo {
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
expected[i] = from[i];
}
@@ -235,13 +238,19 @@ AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
-2.00390625f, -3.99609375f,
// No infinity/NaN - implementation-defined due to ARM.
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
HWY_ASSERT(N != 0);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
std::copy(test_cases, test_cases + kNumTestCases, in.get());
std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
size_t i = 0;
for (; i < kNumTestCases; ++i) {
in[i] = test_cases[i];
}
for (; i < padded; ++i) {
in[i] = 0.0f;
}
return in;
}
@@ -250,10 +259,11 @@ struct TestF16 {
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_HAVE_FLOAT16
size_t padded;
const size_t N = Lanes(d32); // same count for f16
HWY_ASSERT(N != 0);
auto in = F16TestCases(d32, padded);
using TF16 = float16_t;
const Rebind<TF16, DF32> d16;
const size_t N = Lanes(d32); // same count for f16
auto temp16 = AllocateAligned<TF16>(N);
for (size_t i = 0; i < padded; i += N) {
@@ -289,13 +299,19 @@ AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
// negative +/- delta
-2.015625f, -3.984375f,
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
HWY_ASSERT(N != 0);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
std::copy(test_cases, test_cases + kNumTestCases, in.get());
std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
size_t i = 0;
for (; i < kNumTestCases; ++i) {
in[i] = test_cases[i];
}
for (; i < padded; ++i) {
in[i] = 0.0f;
}
return in;
}
@@ -387,10 +403,13 @@ HWY_NOINLINE void TestAllTruncate() {
struct TestIntFromFloatHuge {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
// Still does not work, although ARMv7 manual says that float->int
// saturates, i.e. chooses the nearest representable value. Also causes
// out-of-memory for MSVC.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC
// The ARMv7 manual says that float->int saturates, i.e. chooses the
// nearest representable value. This works correctly on armhf with GCC, but
// not with clang. For reasons unknown, MSVC also runs into an out-of-memory
// error here.
#if HWY_COMPILER_CLANG || HWY_COMPILER_MSVC
(void)df;
#else
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
@@ -406,8 +425,6 @@ struct TestIntFromFloatHuge {
// Huge negative
Store(Set(di, LimitsMin<TI>()), di, expected.get());
HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20))));
#else
(void)df;
#endif
}
};
@@ -451,7 +468,7 @@ class TestIntFromFloat {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(TF));
CopyBytes<sizeof(TF)>(&bits, &from[i]); // not same size
} while (!std::isfinite(from[i]));
if (from[i] >= max) {
expected[i] = LimitsMax<TI>();
@@ -532,6 +549,34 @@ HWY_NOINLINE void TestAllFloatFromInt() {
ForFloatTypes(ForPartialVectors<TestFloatFromInt>());
}
struct TestFloatFromUint {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
using TU = MakeUnsigned<TF>;
const RebindToUnsigned<DF> du;
// Integer positive
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(du, TU(4))));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(65535.0)),
ConvertTo(df, Iota(du, 65535))); // 2^16-1
if (sizeof(TF) > 4) {
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4294967295.0)),
ConvertTo(df, Iota(du, 4294967295ULL))); // 2^32-1
}
// Max positive
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TU>())),
ConvertTo(df, Set(du, LimitsMax<TU>())));
// Zero
HWY_ASSERT_VEC_EQ(df, Zero(df), ConvertTo(df, Zero(du)));
}
};
HWY_NOINLINE void TestAllFloatFromUint() {
ForFloatTypes(ForPartialVectors<TestFloatFromUint>());
}
struct TestI32F64 {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
@@ -591,6 +636,7 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllTruncate);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
} // namespace hwy

View File

@@ -492,8 +492,8 @@ struct TestCLMul {
const size_t padded = RoundUpTo(kCLMulNum, N);
auto expected_lower = AllocateAligned<T>(padded);
auto expected_upper = AllocateAligned<T>(padded);
memcpy(expected_lower.get(), kCLMulLower, kCLMulNum * sizeof(T));
memcpy(expected_upper.get(), kCLMulUpper, kCLMulNum * sizeof(T));
CopyBytes<kCLMulNum * sizeof(T)>(kCLMulLower, expected_lower.get());
CopyBytes<kCLMulNum * sizeof(T)>(kCLMulUpper, expected_upper.get());
const size_t padding_size = (padded - kCLMulNum) * sizeof(T);
memset(expected_lower.get() + kCLMulNum, 0, padding_size);
memset(expected_upper.get() + kCLMulNum, 0, padding_size);

View File

@@ -15,7 +15,6 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
@@ -66,7 +65,7 @@ struct TestDemoteTo {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
} while (!value_ok(from[i]));
expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
}
@@ -116,7 +115,7 @@ struct TestDemoteToFloat {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
} while (!IsFiniteT(from[i]));
const T magn = std::abs(from[i]);
const T max_abs = HighestValue<ToT>();

View File

@@ -15,7 +15,6 @@
// Tests some ops specific to floating-point types (Div, Round etc.)
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -113,9 +112,8 @@ struct TestReciprocalSquareRoot {
float err = lanes[i] - 0.090166f;
if (err < 0.0f) err = -err;
if (err >= 4E-4f) {
HWY_ABORT("Lane %" PRIu64 "(%" PRIu64 "): actual %f err %f\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(N), lanes[i],
err);
HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast<int>(i),
static_cast<int>(N), lanes[i], err);
}
}
}

View File

@@ -13,6 +13,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcmp

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -58,7 +57,7 @@ struct TestUnsignedMul {
HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
const size_t bits = sizeof(T) * 8;
const uint64_t mask = (1ull << bits) - 1;
const uint64_t mask = bits==64 ? (~uint64_t{0}) : (1ull << bits) - 1;
const T max2 = (static_cast<uint64_t>(max) * max) & mask;
HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
}
@@ -97,13 +96,13 @@ HWY_NOINLINE void TestAllMul() {
// No u8.
test_unsigned(uint16_t());
test_unsigned(uint32_t());
// No u64.
test_unsigned(uint64_t());
const ForPartialVectors<TestSignedMul> test_signed;
// No i8.
test_signed(int16_t());
test_signed(int32_t());
// No i64.
test_signed(int64_t());
}
struct TestMulHigh {

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -80,6 +79,35 @@ struct TestMinOfLanes {
min = HWY_MIN(min, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
// Bug #910: also check negative values
min = HighestValue<T>();
const T input_copy[] = {static_cast<T>(-1),
static_cast<T>(-2),
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14};
size_t i = 0;
for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
in_lanes[i] = input_copy[i];
min = HWY_MIN(min, input_copy[i]);
}
// Pad with neutral element to full vector (so we can load)
for (; i < N; ++i) {
in_lanes[i] = min;
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
}
};
@@ -105,6 +133,35 @@ struct TestMaxOfLanes {
max = HWY_MAX(max, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
// Bug #910: also check negative values
max = LowestValue<T>();
const T input_copy[] = {static_cast<T>(-1),
static_cast<T>(-2),
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14};
size_t i = 0;
for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
in_lanes[i] = input_copy[i];
max = HWY_MAX(max, in_lanes[i]);
}
// Pad with neutral element to full vector (so we can load)
for (; i < N; ++i) {
in_lanes[i] = max;
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
}
};

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -243,7 +242,7 @@ T RightShiftNegative(T val) {
// seen divisions replaced with shifts, so resort to bit operations.
using TU = hwy::MakeUnsigned<T>;
TU bits;
CopyBytes<sizeof(T)>(&val, &bits);
CopySameSize(&val, &bits);
const TU shifted = TU(bits >> kAmount);
@@ -252,7 +251,7 @@ T RightShiftNegative(T val) {
const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
bits = shifted | sign_extended;
CopyBytes<sizeof(T)>(&bits, &val);
CopySameSize(&bits, &val);
return val;
}
@@ -356,7 +355,7 @@ struct TestVariableSignedRightShifts {
for (size_t i = 0; i < N; ++i) {
const size_t amount = i & kMaxShift;
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
CopyBytes<sizeof(T)>(&shifted, &expected[i]);
CopySameSize(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));
@@ -364,7 +363,7 @@ struct TestVariableSignedRightShifts {
for (size_t i = 0; i < N; ++i) {
const size_t amount = kMaxShift - (i & kMaxShift);
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
CopyBytes<sizeof(T)>(&shifted, &expected[i]);
CopySameSize(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
}

View File

@@ -15,7 +15,6 @@
// Target-specific helper functions for use by *_test.cc.
#include <inttypes.h>
#include <stdint.h>
#include "hwy/base.h"
@@ -97,8 +96,8 @@ HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
// First check whole bytes (if that many elements are still valid)
for (; i < N / 8; ++i) {
if (bits_a[i] != bits_b[i]) {
fprintf(stderr, "Mismatch in byte %" PRIu64 ": %d != %d\n",
static_cast<uint64_t>(i), bits_a[i], bits_b[i]);
fprintf(stderr, "Mismatch in byte %d: %d != %d\n", static_cast<int>(i),
bits_a[i], bits_b[i]);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");
@@ -111,8 +110,8 @@ HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
const int valid_a = bits_a[i] & mask;
const int valid_b = bits_b[i] & mask;
if (valid_a != valid_b) {
fprintf(stderr, "Mismatch in last byte %" PRIu64 ": %d != %d\n",
static_cast<uint64_t>(i), valid_a, valid_b);
fprintf(stderr, "Mismatch in last byte %d: %d != %d\n",
static_cast<int>(i), valid_a, valid_b);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");

View File

@@ -15,7 +15,6 @@
#include "hwy/tests/test_util.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
@@ -71,8 +70,7 @@ HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
CopyBytes<8>(actual_ptr, &actual);
return ComputeUlpDelta(expected, actual) <= 1;
} else {
HWY_ABORT("Unexpected float size %" PRIu64 "\n",
static_cast<uint64_t>(info.sizeof_t));
HWY_ABORT("Unexpected float size %d\n", static_cast<int>(info.sizeof_t));
return false;
}
}
@@ -88,10 +86,9 @@ HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
char actual_str[100];
ToString(info, actual_ptr, actual_str);
Abort(filename, line,
"%s, %sx%" PRIu64 " lane %" PRIu64
" mismatch: expected '%s', got '%s'.\n",
target_name, type_name, static_cast<uint64_t>(num_lanes),
static_cast<uint64_t>(lane), expected_str, actual_str);
"%s, %sx%d lane %d mismatch: expected '%s', got '%s'.\n", target_name,
type_name, static_cast<int>(num_lanes), static_cast<int>(lane),
expected_str, actual_str);
}
HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,

View File

@@ -105,8 +105,8 @@ TU ComputeUlpDelta(const T expected, const T actual) {
// Compute the difference in units of last place. We do not need to check for
// differing signs; they will result in large differences, which is fine.
TU ux, uy;
CopyBytes<sizeof(T)>(&expected, &ux);
CopyBytes<sizeof(T)>(&actual, &uy);
CopySameSize(&expected, &ux);
CopySameSize(&actual, &uy);
// Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy);

View File

@@ -32,7 +32,7 @@ jobs:
with:
repository: libjxl/conformance
# TODO(eustas): move ref to a global variable / file?
ref: a6a44bbbd69830e1dc862174599ce5738a0a414f
ref: 43d8135b50e53167ee6fee0b842b9eb15cfc4aa2
path: conformance
- name: Cache
uses: actions/cache@v2
@@ -161,7 +161,7 @@ jobs:
uses: actions/checkout@v2
with:
repository: libjxl/conformance
ref: a6a44bbbd69830e1dc862174599ce5738a0a414f
ref: 43d8135b50e53167ee6fee0b842b9eb15cfc4aa2
path: conformance
- name: Cache
uses: actions/cache@v2

View File

@@ -48,6 +48,7 @@ roland-rollo
Samuel Leong <wvvwvvvvwvvw@gmail.com>
Sandro <sandro.jaeckel@gmail.com>
Stephan T. Lavavej <stl@nuwen.net>
Thomas Bonfort <thomas.bonfort@airbus.com>
Vincent Torri <vincent.torri@gmail.com>
xiota
Yonatan Nebenzhal <yonatan.nebenzhl@gmail.com>

View File

@@ -7,9 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased
### Added
- encoder API: new function `JxlEncoderSetFrameBitDepth` to set the bit depth
of the input buffer.
- decoder API: new function `JxlDecoderSetImageBitDepth` to set the bit depth
of the output buffer.
## [0.7] - 2022-07-21
### Added
- Export version information in headers.
- decoder API: Ability to decode the content of metadata boxes:
`JXL_DEC_BOX`, `JXL_DEC_BOX_NEED_MORE_OUTPUT`, `JxlDecoderSetBoxBuffer`,
`JxlDecoderGetBoxType`, `JxlDecoderGetBoxSizeRaw` and

View File

@@ -1394,10 +1394,8 @@ cmd_bump_version() {
fi
fi
newver="${major}.${minor}"
if [[ "${patch}" != "0" ]]; then
newver="${newver}.${patch}"
fi
newver="${major}.${minor}.${patch}"
echo "Bumping version to ${newver} (${major}.${minor}.${patch})"
sed -E \
-e "s/(set\\(JPEGXL_MAJOR_VERSION) [0-9]+\\)/\\1 ${major})/" \

View File

@@ -51,9 +51,9 @@ std::string ExtensionFromCodec(Codec codec, const bool is_gray,
case Codec::kPNG:
return ".png";
case Codec::kPNM:
if (bits_per_sample == 32) return ".pfm";
if (has_alpha) return ".pam";
if (is_gray) return ".pgm";
return (bits_per_sample == 32) ? ".pfm" : ".ppm";
return is_gray ? ".pgm" : ".ppm";
case Codec::kGIF:
return ".gif";
case Codec::kEXR:
@@ -173,10 +173,11 @@ struct TestImageParams {
bool is_gray;
bool add_alpha;
bool big_endian;
bool add_extra_channels;
bool ShouldTestRoundtrip() const {
if (codec == Codec::kPNG) {
return true;
return bits_per_sample <= 16;
} else if (codec == Codec::kPNM) {
// TODO(szabadka) Make PNM encoder endianness-aware.
return ((bits_per_sample <= 16 && big_endian) ||
@@ -213,7 +214,7 @@ struct TestImageParams {
std::string DebugString() const {
std::ostringstream os;
os << "bps:" << bits_per_sample << " gr:" << is_gray << " al:" << add_alpha
<< " be: " << big_endian;
<< " be: " << big_endian << " ec: " << add_extra_channels;
return os.str();
}
};
@@ -233,6 +234,19 @@ void CreateTestImage(const TestImageParams& params, PackedPixelFile* ppf) {
PackedFrame frame(params.xsize, params.ysize, params.PixelFormat());
FillPackedImage(params.bits_per_sample, &frame.color);
if (params.add_extra_channels) {
for (size_t i = 0; i < 7; ++i) {
JxlPixelFormat ec_format = params.PixelFormat();
ec_format.num_channels = 1;
PackedImage ec(params.xsize, params.ysize, ec_format);
FillPackedImage(params.bits_per_sample, &ec);
frame.extra_channels.emplace_back(std::move(ec));
PackedExtraChannel pec;
pec.ec_info.bits_per_sample = params.bits_per_sample;
pec.ec_info.type = static_cast<JxlExtraChannelType>(i);
ppf->extra_channels_info.emplace_back(std::move(pec));
}
}
ppf->frames.emplace_back(std::move(frame));
}
@@ -254,8 +268,13 @@ void TestRoundTrip(const TestImageParams& params, ThreadPool* pool) {
ASSERT_EQ(encoded.bitstreams.size(), 1);
PackedPixelFile ppf_out;
ColorHints color_hints;
if (params.codec == Codec::kPNM || params.codec == Codec::kPGX) {
color_hints.Add("color_space",
params.is_gray ? "Gra_D65_Rel_SRG" : "RGB_D65_SRG_Rel_SRG");
}
ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
ColorHints(), SizeConstraints(), &ppf_out));
color_hints, SizeConstraints(), &ppf_out));
if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
params.codec != Codec::kEXR) {
@@ -263,9 +282,21 @@ void TestRoundTrip(const TestImageParams& params, ThreadPool* pool) {
}
ASSERT_EQ(ppf_out.frames.size(), 1);
VerifySameImage(ppf_in.frames[0].color, ppf_in.info.bits_per_sample,
ppf_out.frames[0].color, ppf_out.info.bits_per_sample,
const auto& frame_in = ppf_in.frames[0];
const auto& frame_out = ppf_out.frames[0];
VerifySameImage(frame_in.color, ppf_in.info.bits_per_sample, frame_out.color,
ppf_out.info.bits_per_sample,
/*lossless=*/params.codec != Codec::kJPG);
ASSERT_EQ(frame_in.extra_channels.size(), frame_out.extra_channels.size());
ASSERT_EQ(ppf_out.extra_channels_info.size(),
frame_out.extra_channels.size());
for (size_t i = 0; i < frame_in.extra_channels.size(); ++i) {
VerifySameImage(frame_in.extra_channels[i], ppf_in.info.bits_per_sample,
frame_out.extra_channels[i], ppf_out.info.bits_per_sample,
/*lossless=*/true);
EXPECT_EQ(ppf_out.extra_channels_info[i].ec_info.type,
ppf_in.extra_channels_info[i].ec_info.type);
}
}
TEST(CodecTest, TestRoundTrip) {
@@ -285,7 +316,12 @@ TEST(CodecTest, TestRoundTrip) {
params.is_gray = is_gray;
params.add_alpha = add_alpha;
params.big_endian = big_endian;
params.add_extra_channels = false;
TestRoundTrip(params, &pool);
if (codec == Codec::kPNM && add_alpha) {
params.add_extra_channels = true;
TestRoundTrip(params, &pool);
}
}
}
}

View File

@@ -68,6 +68,39 @@ struct BoxProcessor {
}
};
void SetBitDepthFromDataType(JxlDataType data_type, uint32_t* bits_per_sample,
uint32_t* exponent_bits_per_sample) {
switch (data_type) {
case JXL_TYPE_UINT8:
*bits_per_sample = 8;
*exponent_bits_per_sample = 0;
break;
case JXL_TYPE_UINT16:
*bits_per_sample = 16;
*exponent_bits_per_sample = 0;
break;
case JXL_TYPE_FLOAT16:
*bits_per_sample = 16;
*exponent_bits_per_sample = 5;
break;
case JXL_TYPE_FLOAT:
*bits_per_sample = 32;
*exponent_bits_per_sample = 8;
break;
}
}
template <typename T>
void UpdateBitDepth(JxlBitDepth bit_depth, JxlDataType data_type, T* info) {
if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
SetBitDepthFromDataType(data_type, &info->bits_per_sample,
&info->exponent_bits_per_sample);
} else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
info->bits_per_sample = bit_depth.bits_per_sample;
info->exponent_bits_per_sample = bit_depth.exponent_bits_per_sample;
}
}
} // namespace
bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
@@ -185,8 +218,12 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
}
break;
}
size_t released_size = JxlDecoderReleaseInput(dec);
fprintf(stderr,
"Input file is truncated and allow_partial_input was disabled.");
"Input file is truncated (total bytes: %" PRIuS
", processed bytes: %" PRIuS
") and allow_partial_input was disabled.",
bytes_size, bytes_size - released_size);
return false;
} else if (status == JXL_DEC_BOX) {
boxes.FinalizeOutput();
@@ -254,9 +291,11 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
if (!have_alpha) {
// Mark in the basic info that alpha channel was dropped.
ppf->info.alpha_bits = 0;
} else if (dparams.unpremultiply_alpha) {
// Mark in the basic info that alpha was unpremultiplied.
ppf->info.alpha_premultiplied = false;
} else {
if (dparams.unpremultiply_alpha) {
// Mark in the basic info that alpha was unpremultiplied.
ppf->info.alpha_premultiplied = false;
}
}
bool alpha_found = false;
for (uint32_t i = 0; i < ppf->info.num_extra_channels; ++i) {
@@ -421,9 +460,21 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
return false;
}
}
if (JXL_DEC_SUCCESS !=
JxlDecoderSetImageOutBitDepth(dec, &dparams.output_bitdepth)) {
fprintf(stderr, "JxlDecoderSetImageOutBitDepth failed\n");
return false;
}
UpdateBitDepth(dparams.output_bitdepth, format.data_type, &ppf->info);
bool have_alpha = (format.num_channels == 2 || format.num_channels == 4);
if (have_alpha) {
// Interleaved alpha channels has the same bit depth as color channels.
ppf->info.alpha_bits = ppf->info.bits_per_sample;
ppf->info.alpha_exponent_bits = ppf->info.exponent_bits_per_sample;
}
JxlPixelFormat ec_format = format;
ec_format.num_channels = 1;
for (const auto& eci : ppf->extra_channels_info) {
for (auto& eci : ppf->extra_channels_info) {
frame.extra_channels.emplace_back(jxl::extras::PackedImage(
ppf->info.xsize, ppf->info.ysize, ec_format));
auto& ec = frame.extra_channels.back();
@@ -446,6 +497,8 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
fprintf(stderr, "JxlDecoderSetExtraChannelBuffer failed\n");
return false;
}
UpdateBitDepth(dparams.output_bitdepth, ec_format.data_type,
&eci.ec_info);
}
} else if (status == JXL_DEC_SUCCESS) {
// Decoding finished successfully.

View File

@@ -53,6 +53,9 @@ struct JXLDecompressParams {
bool use_image_callback = true;
// Whether to unpremultiply colors for associated alpha channels.
bool unpremultiply_alpha = false;
// Controls the effective bit depth of the output pixels.
JxlBitDepth output_bitdepth = {JXL_BIT_DEPTH_FROM_PIXEL_FORMAT, 0, 0};
};
bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,

View File

@@ -24,6 +24,7 @@ struct HeaderPNM {
size_t bits_per_sample;
bool floating_point;
bool big_endian;
std::vector<JxlExtraChannelType> ec_types; // PAM
};
class Parser {
@@ -183,16 +184,20 @@ class Parser {
Status ParseHeaderPAM(HeaderPNM* header, const uint8_t** pos) {
size_t depth = 3;
size_t max_val = 255;
JXL_RETURN_IF_ERROR(SkipWhitespace());
while (!MatchString("ENDHDR", /*skipws=*/false)) {
JXL_RETURN_IF_ERROR(SkipWhitespace());
if (MatchString("WIDTH")) {
JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
JXL_RETURN_IF_ERROR(SkipWhitespace());
} else if (MatchString("HEIGHT")) {
JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
JXL_RETURN_IF_ERROR(SkipWhitespace());
} else if (MatchString("DEPTH")) {
JXL_RETURN_IF_ERROR(ParseUnsigned(&depth));
JXL_RETURN_IF_ERROR(SkipWhitespace());
} else if (MatchString("MAXVAL")) {
JXL_RETURN_IF_ERROR(ParseUnsigned(&max_val));
JXL_RETURN_IF_ERROR(SkipWhitespace());
} else if (MatchString("TUPLTYPE")) {
if (MatchString("RGB_ALPHA")) {
header->has_alpha = true;
@@ -209,6 +214,20 @@ class Parser {
} else if (MatchString("BLACKANDWHITE")) {
header->is_gray = true;
max_val = 1;
} else if (MatchString("Alpha")) {
header->ec_types.push_back(JXL_CHANNEL_ALPHA);
} else if (MatchString("Depth")) {
header->ec_types.push_back(JXL_CHANNEL_DEPTH);
} else if (MatchString("SpotColor")) {
header->ec_types.push_back(JXL_CHANNEL_SPOT_COLOR);
} else if (MatchString("SelectionMask")) {
header->ec_types.push_back(JXL_CHANNEL_SELECTION_MASK);
} else if (MatchString("Black")) {
header->ec_types.push_back(JXL_CHANNEL_BLACK);
} else if (MatchString("CFA")) {
header->ec_types.push_back(JXL_CHANNEL_CFA);
} else if (MatchString("Thermal")) {
header->ec_types.push_back(JXL_CHANNEL_THERMAL);
} else {
return JXL_FAILURE("PAM: unknown TUPLTYPE");
}
@@ -223,7 +242,7 @@ class Parser {
}
size_t num_channels = header->is_gray ? 1 : 3;
if (header->has_alpha) num_channels++;
if (num_channels != depth) {
if (num_channels + header->ec_types.size() != depth) {
return JXL_FAILURE("PAM: bad DEPTH");
}
if (max_val == 0 || max_val >= 65536) {
@@ -341,7 +360,17 @@ Status DecodeImagePNM(const Span<const uint8_t> bytes,
ppf->info.alpha_bits = (header.has_alpha ? ppf->info.bits_per_sample : 0);
ppf->info.alpha_exponent_bits = 0;
ppf->info.num_color_channels = (header.is_gray ? 1 : 3);
ppf->info.num_extra_channels = (header.has_alpha ? 1 : 0);
uint32_t num_alpha_channels = (header.has_alpha ? 1 : 0);
uint32_t num_interleaved_channels =
ppf->info.num_color_channels + num_alpha_channels;
ppf->info.num_extra_channels = num_alpha_channels + header.ec_types.size();
for (auto type : header.ec_types) {
PackedExtraChannel pec;
pec.ec_info.bits_per_sample = ppf->info.bits_per_sample;
pec.ec_info.type = type;
ppf->extra_channels_info.emplace_back(std::move(pec));
}
JxlDataType data_type;
if (header.floating_point) {
@@ -356,27 +385,50 @@ Status DecodeImagePNM(const Span<const uint8_t> bytes,
}
const JxlPixelFormat format{
/*num_channels=*/ppf->info.num_color_channels +
ppf->info.num_extra_channels,
/*num_channels=*/num_interleaved_channels,
/*data_type=*/data_type,
/*endianness=*/header.big_endian ? JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN,
/*align=*/0,
};
const JxlPixelFormat ec_format{1, format.data_type, format.endianness, 0};
ppf->frames.clear();
ppf->frames.emplace_back(header.xsize, header.ysize, format);
auto* frame = &ppf->frames.back();
for (size_t i = 0; i < header.ec_types.size(); ++i) {
frame->extra_channels.emplace_back(header.xsize, header.ysize, ec_format);
}
size_t pnm_remaining_size = bytes.data() + bytes.size() - pos;
if (pnm_remaining_size < frame->color.pixels_size) {
return JXL_FAILURE("PNM file too small");
}
const bool flipped_y = header.bits_per_sample == 32; // PFMs are flipped
uint8_t* out = reinterpret_cast<uint8_t*>(frame->color.pixels());
for (size_t y = 0; y < header.ysize; ++y) {
size_t y_in = flipped_y ? header.ysize - 1 - y : y;
const uint8_t* row_in = &pos[y_in * frame->color.stride];
uint8_t* row_out = &out[y * frame->color.stride];
memcpy(row_out, row_in, frame->color.stride);
std::vector<uint8_t*> ec_out(header.ec_types.size());
for (size_t i = 0; i < ec_out.size(); ++i) {
ec_out[i] = reinterpret_cast<uint8_t*>(frame->extra_channels[i].pixels());
}
if (ec_out.empty()) {
const bool flipped_y = header.bits_per_sample == 32; // PFMs are flipped
for (size_t y = 0; y < header.ysize; ++y) {
size_t y_in = flipped_y ? header.ysize - 1 - y : y;
const uint8_t* row_in = &pos[y_in * frame->color.stride];
uint8_t* row_out = &out[y * frame->color.stride];
memcpy(row_out, row_in, frame->color.stride);
}
} else {
size_t pwidth = PackedImage::BitsPerChannel(data_type) / 8;
for (size_t y = 0; y < header.ysize; ++y) {
for (size_t x = 0; x < header.xsize; ++x) {
memcpy(out, pos, frame->color.pixel_stride());
out += frame->color.pixel_stride();
pos += frame->color.pixel_stride();
for (auto& p : ec_out) {
memcpy(p, pos, pwidth);
pos += pwidth;
p += pwidth;
}
}
}
}
return true;
}

View File

@@ -0,0 +1,159 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/dec_group_jpeg.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <memory>
#include <utility>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/extras/dec_group_jpeg.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>
#include "lib/jxl/base/status.h"
#include "lib/jxl/dct_scales.h"
#include "lib/jxl/dec_transforms-inl.h"
#include "lib/jxl/render_pipeline/render_pipeline.h"
HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::And;
using hwy::HWY_NAMESPACE::AndNot;
using hwy::HWY_NAMESPACE::ApproximateReciprocal;
using hwy::HWY_NAMESPACE::Gt;
using hwy::HWY_NAMESPACE::IfThenElse;
using hwy::HWY_NAMESPACE::IfThenElseZero;
using hwy::HWY_NAMESPACE::Lt;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::Vec;
using hwy::HWY_NAMESPACE::Xor;
using D = HWY_FULL(float);
using DI = HWY_FULL(int32_t);
constexpr D d;
constexpr DI di;
// Converts a vector of quantized integer coefficients to float and applies
// the decoder-side quantization bias for channel `c`: a zero coefficient
// stays zero, a coefficient of magnitude 1 becomes +/-biases[c], and larger
// magnitudes are pulled towards zero by biases[3] / quant (see the inline
// pseudo-code below). All branches are computed with SIMD masks/blends.
template <class DI>
HWY_INLINE HWY_MAYBE_UNUSED Vec<Rebind<float, DI>> AdjustQuantBias(
    DI di, const size_t c, const Vec<DI> quant_i,
    const float* HWY_RESTRICT biases) {
  const Rebind<float, DI> df;
  const auto quant = ConvertTo(df, quant_i);
  // Compare |quant|, keep sign bit for negating result.
  const auto kSign = BitCast(df, Set(di, INT32_MIN));
  const auto sign = And(quant, kSign);  // TODO(janwas): = abs ^ orig
  const auto abs_quant = AndNot(kSign, quant);
  // If |x| is 1, kZeroBias creates a different bias for each channel.
  // We're implementing the following:
  // if (quant == 0) return 0;
  // if (quant == 1) return biases[c];
  // if (quant == -1) return -biases[c];
  // return quant - biases[3] / quant;
  // Integer comparison is not helpful because Clang incurs bypass penalties
  // from unnecessarily mixing integer and float.
  // 1.125 separates |quant| in {0, 1} from |quant| >= 2 without an exact
  // integer compare.
  const auto is_01 = Lt(abs_quant, Set(df, 1.125f));
  const auto not_0 = Gt(abs_quant, Zero(df));
  // Bitwise logic is faster than quant * biases[c].
  const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign));
  // About 2E-5 worse than ReciprocalNR or division.
  const auto bias =
      NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant);
  return IfThenElse(is_01, one_bias, bias);
}
// Dequantizes one 8x8 coefficient block of channel `c`: promotes the int16
// quantized values to int32, applies AdjustQuantBias, then multiplies by the
// per-channel dequantization matrix, writing kDCTBlockSize floats to `block`.
void DequantBlock(const int16_t* JXL_RESTRICT qblock, size_t c,
                  const float* JXL_RESTRICT dequant_matrices,
                  const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
  for (size_t k = 0; k < kDCTBlockSize; k += Lanes(d)) {
    const auto mul = Load(d, dequant_matrices + c * kDCTBlockSize + k);
    Rebind<int16_t, DI> di16;
    Vec<DI> quantized = PromoteTo(di, Load(di16, qblock + k));
    const auto dequant = Mul(AdjustQuantBias(di, c, quantized, biases), mul);
    Store(dequant, d, block + k);
  }
}
// Dequantizes and inverse-DCTs every 8x8 block of AC group `group_idx` and
// writes the resulting samples into the render pipeline's input buffers.
// `group_dec_cache` supplies 2 * kDCTBlockSize floats of per-thread scratch
// (one dequantized block plus IDCT scratch). Note `thread` is not used in
// this function body; it is part of the shared dispatch signature.
Status DecodeGroupJpeg(const Image3S& coeffs, size_t group_idx,
                       const Rect block_rect, const YCbCrChromaSubsampling& cs,
                       const float* dequant_matrices,
                       float* JXL_RESTRICT group_dec_cache, size_t thread,
                       RenderPipelineInput& render_pipeline_input) {
  HWY_ALIGN float* const block = group_dec_cache;
  HWY_ALIGN float* const scratch_space = block + kDCTBlockSize;
  // Per-channel chroma subsampling shifts (log2 of the subsampling factor).
  size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)};
  size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)};
  // Per-channel DC/AC quantization biases; the fourth entry is the shared
  // shrink factor applied to |quant| >= 2 (see AdjustQuantBias).
  static constexpr float kDefaultQuantBias[4] = {
      1.0f - 0.05465007330715401f,
      1.0f - 0.07005449891748593f,
      1.0f - 0.049935103337343655f,
      0.145f,
  };
  for (size_t c = 0; c < 3; ++c) {
    ImageF* rpbuffer = render_pipeline_input.GetBuffer(c).first;
    Rect rect = render_pipeline_input.GetBuffer(c).second;
    // Number of 8x8 blocks this group covers in the (possibly subsampled)
    // channel plane.
    size_t xsize_blocks = DivCeil(block_rect.xsize(), 1 << hshift[c]);
    size_t ysize_blocks = DivCeil(block_rect.ysize(), 1 << vshift[c]);
    // Blocks are stored consecutively in the group's coefficient row.
    size_t offset = 0;
    for (size_t by = 0; by < ysize_blocks; ++by) {
      float* JXL_RESTRICT idct_row = rect.Row(rpbuffer, by * kBlockDim);
      size_t idct_stride = rpbuffer->PixelsPerRow();
      for (size_t bx = 0; bx < xsize_blocks; ++bx) {
        const int16_t* qblock = &coeffs.PlaneRow(c, group_idx)[offset];
        offset += kDCTBlockSize;
        DequantBlock(qblock, c, dequant_matrices, kDefaultQuantBias, block);
        // IDCT
        float* JXL_RESTRICT idct_pos = idct_row + bx * kBlockDim;
        // JPEG XL transposes the DCT, JPEG doesn't.
        Transpose<8, 8>::Run(DCTFrom(block, 8), DCTTo(scratch_space, 8));
        TransformToPixels(AcStrategy::DCT, scratch_space, idct_pos, idct_stride,
                          block);
      }
    }
  }
  return true;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace jxl
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace jxl {
namespace {
HWY_EXPORT(DecodeGroupJpeg);
} // namespace
namespace extras {
// Thin dispatcher: forwards to the SIMD implementation that Highway selected
// for the running CPU (see HWY_EXPORT above).
Status DecodeGroupJpeg(const Image3S& coeffs, size_t group_idx,
                       const Rect block_rect, const YCbCrChromaSubsampling& cs,
                       const float* dequant_matrices,
                       float* JXL_RESTRICT group_dec_cache, size_t thread,
                       RenderPipelineInput& render_pipeline_input) {
  return HWY_DYNAMIC_DISPATCH(DecodeGroupJpeg)(
      coeffs, group_idx, block_rect, cs, dequant_matrices, group_dec_cache,
      thread, render_pipeline_input);
}
} // namespace extras
} // namespace jxl
#endif // HWY_ONCE

View File

@@ -0,0 +1,31 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DEC_GROUP_JPEG_H_
#define LIB_EXTRAS_DEC_GROUP_JPEG_H_
#include <stddef.h>
#include <stdint.h>
#include <vector>
#include "lib/jxl/base/status.h"
#include "lib/jxl/frame_header.h"
#include "lib/jxl/image.h"
#include "lib/jxl/render_pipeline/render_pipeline.h"
namespace jxl {
namespace extras {
// Dequantizes and inverse-DCTs all 8x8 blocks of AC group `group_idx` from
// `coeffs` and feeds the resulting samples into `render_pipeline_input`.
// `group_dec_cache` must point to at least 2 * kDCTBlockSize floats of
// per-thread scratch space.
Status DecodeGroupJpeg(const Image3S& coeffs, size_t group_idx,
                       const Rect block_rect, const YCbCrChromaSubsampling& cs,
                       const float* dequant_matrices,
                       float* JXL_RESTRICT group_dec_cache, size_t thread,
                       RenderPipelineInput& render_pipeline_input);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DEC_GROUP_JPEG_H_

View File

@@ -0,0 +1,274 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/decode_jpeg.h"
#include "lib/extras/dec_group_jpeg.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/color_encoding_internal.h"
#include "lib/jxl/common.h"
#include "lib/jxl/frame_header.h"
#include "lib/jxl/image.h"
#include "lib/jxl/jpeg/enc_jpeg_data.h"
#include "lib/jxl/jpeg/enc_jpeg_data_reader.h"
#include "lib/jxl/render_pipeline/render_pipeline.h"
#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h"
#include "lib/jxl/render_pipeline/stage_write.h"
#include "lib/jxl/render_pipeline/stage_ycbcr.h"
namespace jxl {
namespace extras {
namespace {
// Returns the rectangle of 8x8-block coordinates covered by AC group
// `group_index`, clipped to the frame's block dimensions.
Rect BlockGroupRect(const FrameDimensions& frame_dim, size_t group_index) {
  // A group spans group_dim pixels, i.e. group_dim / 8 blocks per side.
  const size_t group_dim_blocks = frame_dim.group_dim >> 3;
  const size_t gx = group_index % frame_dim.xsize_groups;
  const size_t gy = group_index / frame_dim.xsize_groups;
  return Rect(gx * group_dim_blocks, gy * group_dim_blocks, group_dim_blocks,
              group_dim_blocks, frame_dim.xsize_blocks,
              frame_dim.ysize_blocks);
}
// Returns the rectangle of block coordinates covered by DC group
// `group_index`, clipped to the frame's block dimensions.
Rect DCGroupRect(const FrameDimensions& frame_dim, size_t group_index) {
  const size_t gx = group_index % frame_dim.xsize_dc_groups;
  const size_t gy = group_index / frame_dim.xsize_dc_groups;
  return Rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim,
              frame_dim.group_dim, frame_dim.group_dim,
              frame_dim.xsize_blocks, frame_dim.ysize_blocks);
}
// Derives the chroma subsampling configuration from the JPEG components'
// sampling factors. For a 3-component JPEG each component's factors are used
// directly; a 1-component (grayscale) JPEG replicates component 0's factors
// across all three channels. Other component counts leave `cs` untouched.
Status SetChromaSubsamplingFromJpegData(const jpeg::JPEGData& jpeg_data,
                                        YCbCrChromaSubsampling* cs) {
  const size_t num_components = jpeg_data.components.size();
  if (num_components != 1 && num_components != 3) {
    return true;
  }
  uint8_t hsample[3];
  uint8_t vsample[3];
  for (size_t i = 0; i < 3; i++) {
    const auto& comp =
        jpeg_data.components[num_components == 1 ? 0 : i];
    hsample[i] = comp.h_samp_factor;
    vsample[i] = comp.v_samp_factor;
  }
  return cs->Set(hsample, vsample);
}
// Heuristically decides whether the JPEG carries YCbCr (as opposed to RGB)
// samples: a JFIF APP0 marker implies YCbCr; otherwise an Adobe APP14
// marker's transform byte decides; failing both, component IDs 'R','G','B'
// indicate RGB. A 1-component (grayscale) JPEG always yields true.
bool IsYCbCrJpeg(const jpeg::JPEGData& jpeg_data) {
  size_t nbcomp = jpeg_data.components.size();
  bool is_rgb = false;
  const auto& markers = jpeg_data.marker_order;
  // If there is a JFIF marker, this is YCbCr. Otherwise...
  if (std::find(markers.begin(), markers.end(), 0xE0) == markers.end()) {
    // Try to find an 'Adobe' marker.
    size_t app_markers = 0;
    size_t i = 0;
    for (; i < markers.size(); i++) {
      // This is an APP marker.
      if ((markers[i] & 0xF0) == 0xE0) {
        JXL_CHECK(app_markers < jpeg_data.app_data.size());
        // APP14 marker
        if (markers[i] == 0xEE) {
          const auto& data = jpeg_data.app_data[app_markers];
          if (data.size() == 15 && data[3] == 'A' && data[4] == 'd' &&
              data[5] == 'o' && data[6] == 'b' && data[7] == 'e') {
            // 'Adobe' marker.
            // Transform byte 0 means RGB; nonzero means a YCbCr/YCCK
            // transform was applied.
            is_rgb = data[14] == 0;
            break;
          }
        }
        app_markers++;
      }
    }
    if (i == markers.size()) {
      // No 'Adobe' marker, guess from component IDs.
      is_rgb = nbcomp == 3 && jpeg_data.components[0].id == 'R' &&
               jpeg_data.components[1].id == 'G' &&
               jpeg_data.components[2].id == 'B';
    }
  }
  return (!is_rgb || nbcomp == 1);
}
// Maps the internal channel order to JPEG component indices. Grayscale uses
// component 0 for all three planes; a YCbCr JPEG stores Y first, so the
// internal luma slot (index 1) maps to component 0.
inline std::array<int, 3> JpegOrder(bool is_ycbcr, bool is_gray) {
  if (is_gray) return {{0, 0, 0}};
  return is_ycbcr ? std::array<int, 3>{{1, 0, 2}}
                  : std::array<int, 3>{{0, 1, 2}};
}
// Builds the 3 * kDCTBlockSize float dequantization matrix from the JPEG
// quantization tables, mapping internal channel order to JPEG components via
// JpegOrder. kDequantScale = 1 / (8 * 255) — presumably the DCT scaling
// factor combined with the 8-bit sample range; TODO confirm against the
// IDCT normalization used in dec_group_jpeg.cc.
void SetDequantWeightsFromJpegData(const jpeg::JPEGData& jpeg_data,
                                   const bool is_ycbcr, float* dequant) {
  auto jpeg_c_map = JpegOrder(is_ycbcr, jpeg_data.components.size() == 1);
  const float kDequantScale = 1.0f / (8 * 255);
  for (size_t c = 0; c < 3; c++) {
    size_t jpeg_c = jpeg_c_map[c];
    const int32_t* quant =
        jpeg_data.quant[jpeg_data.components[jpeg_c].quant_idx].values.data();
    for (size_t k = 0; k < kDCTBlockSize; ++k) {
      dequant[c * kDCTBlockSize + k] = quant[k] * kDequantScale;
    }
  }
}
// Repacks the JPEG's per-component coefficient planes into `coeffs`, one row
// per AC group, with the group's 8x8 blocks stored consecutively (the layout
// DecodeGroupJpeg reads back). Grayscale inputs fill only plane 1 and
// zero-fill the others. For non-YCbCr (RGB) inputs the DC coefficient of
// each block is shifted by dc_level = 1024 / dcquant — presumably
// compensating a DC level offset before dequantization; TODO confirm.
void SetCoefficientsFromJpegData(const jpeg::JPEGData& jpeg_data,
                                 const FrameDimensions& frame_dim,
                                 const YCbCrChromaSubsampling& cs,
                                 const bool is_ycbcr, Image3S* coeffs) {
  auto jpeg_c_map = JpegOrder(is_ycbcr, jpeg_data.components.size() == 1);
  *coeffs = Image3S(kGroupDim * kGroupDim, frame_dim.num_groups);
  for (size_t c = 0; c < 3; ++c) {
    if (jpeg_data.components.size() == 1 && c != 1) {
      // Grayscale: only the luma plane carries data.
      ZeroFillImage(&coeffs->Plane(c));
      continue;
    }
    const auto& comp = jpeg_data.components[jpeg_c_map[c]];
    size_t hshift = cs.HShift(c);
    size_t vshift = cs.VShift(c);
    int dcquant = jpeg_data.quant[comp.quant_idx].values.data()[0];
    int16_t dc_level = 1024 / dcquant;
    // Stride (in coefficients) of one block row in the JPEG's own layout.
    size_t jpeg_stride = comp.width_in_blocks * kDCTBlockSize;
    for (size_t group_index = 0; group_index < frame_dim.num_groups;
         group_index++) {
      Rect block_rect = BlockGroupRect(frame_dim, group_index);
      // Block extents of this group in the (possibly subsampled) component.
      size_t xsize_blocks = DivCeil(block_rect.xsize(), 1 << hshift);
      size_t ysize_blocks = DivCeil(block_rect.ysize(), 1 << vshift);
      size_t group_xsize = xsize_blocks * kDCTBlockSize;
      size_t bx0 = block_rect.x0() >> hshift;
      size_t by0 = block_rect.y0() >> vshift;
      size_t jpeg_offset = by0 * jpeg_stride + bx0 * kDCTBlockSize;
      const int16_t* JXL_RESTRICT jpeg_coeffs =
          comp.coeffs.data() + jpeg_offset;
      int16_t* JXL_RESTRICT coeff_row = coeffs->PlaneRow(c, group_index);
      // Copy one block-row of this group at a time.
      for (size_t by = 0; by < ysize_blocks; ++by) {
        memcpy(&coeff_row[by * group_xsize], &jpeg_coeffs[by * jpeg_stride],
               group_xsize * sizeof(coeff_row[0]));
      }
      if (!is_ycbcr) {
        // Stepping by kDCTBlockSize touches only each block's DC coefficient.
        for (size_t offset = 0; offset < coeffs->xsize();
             offset += kDCTBlockSize) {
          coeff_row[offset] += dc_level;
        }
      }
    }
  }
}
// Builds the render pipeline that turns decoded samples into the packed
// output image: optional per-channel chroma upsampling, optional YCbCr->RGB
// conversion, then a write-to-buffer stage targeting `output`'s pixel
// storage. No alpha handling — the pipeline is configured alpha-free.
std::unique_ptr<RenderPipeline> PreparePipeline(
    const YCbCrChromaSubsampling& cs, const bool is_ycbcr,
    const FrameDimensions& frame_dim, PackedImage* output) {
  RenderPipeline::Builder builder(3);
  if (!cs.Is444()) {
    // Upsample each subsampled chroma channel back to full resolution.
    for (size_t c = 0; c < 3; c++) {
      if (cs.HShift(c) != 0) {
        builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/true));
      }
      if (cs.VShift(c) != 0) {
        builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/false));
      }
    }
  }
  if (is_ycbcr) {
    builder.AddStage(GetYCbCrStage());
  }
  // Final stage writes directly into the caller-owned PackedImage buffer.
  ImageOutput main_output;
  main_output.format = output->format;
  main_output.bits_per_sample =
      PackedImage::BitsPerChannel(output->format.data_type);
  main_output.buffer = reinterpret_cast<uint8_t*>(output->pixels());
  main_output.buffer_size = output->pixels_size;
  main_output.stride = output->stride;
  std::vector<ImageOutput> extra_output;
  builder.AddStage(GetWriteToOutputStage(
      main_output, output->xsize, output->ysize,
      /*has_alpha=*/false,
      /*unpremul_alpha=*/false,
      /*alpha_c=*/0, Orientation::kIdentity, extra_output));
  return std::move(builder).Finalize(frame_dim);
}
} // namespace
// Decodes a complete JPEG bitstream into `ppf`: parses the stream, fills the
// basic info and ICC profile, repacks coefficients into per-group planes,
// then dequantizes/IDCTs all groups (optionally in parallel on `pool`)
// through a render pipeline writing into a single output frame.
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
                  JxlDataType output_data_type, ThreadPool* pool,
                  PackedPixelFile* ppf) {
  jpeg::JPEGData jpeg_data;
  JXL_RETURN_IF_ERROR(jpeg::ReadJpeg(compressed.data(), compressed.size(),
                                     jpeg::JpegReadMode::kReadAll, &jpeg_data));
  const size_t xsize = jpeg_data.width;
  const size_t ysize = jpeg_data.height;
  const uint32_t nbcomp = jpeg_data.components.size();
  const bool is_ycbcr = IsYCbCrJpeg(jpeg_data);
  ppf->info.xsize = xsize;
  ppf->info.ysize = ysize;
  ppf->info.num_color_channels = nbcomp;
  // Output bit depth follows the requested pixel data type.
  ppf->info.bits_per_sample = PackedImage::BitsPerChannel(output_data_type);
  ColorEncoding color_encoding;
  JXL_RETURN_IF_ERROR(SetColorEncodingFromJpegData(jpeg_data, &color_encoding));
  PaddedBytes icc = color_encoding.ICC();
  ppf->icc.assign(icc.data(), icc.data() + icc.size());
  ConvertInternalToExternalColorEncoding(color_encoding, &ppf->color_encoding);
  YCbCrChromaSubsampling cs;
  JXL_RETURN_IF_ERROR(SetChromaSubsamplingFromJpegData(jpeg_data, &cs));
  FrameDimensions frame_dim;
  frame_dim.Set(xsize, ysize, /*group_size_shift=*/1, cs.MaxHShift(),
                cs.MaxVShift(),
                /*modular_mode=*/false, /*upsampling=*/1);
  std::vector<float> dequant(3 * kDCTBlockSize);
  SetDequantWeightsFromJpegData(jpeg_data, is_ycbcr, &dequant[0]);
  Image3S coeffs;
  SetCoefficientsFromJpegData(jpeg_data, frame_dim, cs, is_ycbcr, &coeffs);
  JxlPixelFormat format = {nbcomp, output_data_type, JXL_LITTLE_ENDIAN, 0};
  ppf->frames.emplace_back(xsize, ysize, format);
  auto& frame = ppf->frames.back();
  std::unique_ptr<RenderPipeline> render_pipeline =
      PreparePipeline(cs, is_ycbcr, frame_dim, &frame.color);
  JXL_RETURN_IF_ERROR(render_pipeline->IsInitialized());
  hwy::AlignedFreeUniquePtr<float[]> float_memory;
  // Allocates per-thread scratch (2 * kDCTBlockSize floats each) once the
  // pool reports how many workers it will use.
  const auto allocate_storage = [&](const size_t num_threads) -> Status {
    JXL_RETURN_IF_ERROR(
        render_pipeline->PrepareForThreads(num_threads,
                                           /*use_group_ids=*/false));
    float_memory = hwy::AllocateAligned<float>(kDCTBlockSize * 2 * num_threads);
    return true;
  };
  // Decodes one AC group into the pipeline; groups are independent, so they
  // can run on any thread.
  const auto process_group = [&](const uint32_t group_index,
                                 const size_t thread) {
    RenderPipelineInput input =
        render_pipeline->GetInputBuffers(group_index, thread);
    float* group_dec_cache = float_memory.get() + thread * kDCTBlockSize * 2;
    const Rect block_rect = BlockGroupRect(frame_dim, group_index);
    JXL_CHECK(DecodeGroupJpeg(coeffs, group_index, block_rect, cs, &dequant[0],
                              group_dec_cache, thread, input));
    input.Done();
  };
  JXL_CHECK(RunOnPool(pool, 0, frame_dim.num_groups, allocate_storage,
                      process_group, "Decode Groups"));
  return true;
}
} // namespace extras
} // namespace jxl

View File

@@ -0,0 +1,26 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DECODE_JPEG_H_
#define LIB_EXTRAS_DECODE_JPEG_H_
#include <stdint.h>
#include <vector>
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/data_parallel.h"
namespace jxl {
namespace extras {
// Decodes the JPEG bitstream in `compressed` into pixels of
// `output_data_type`, filling `ppf` (basic info, ICC profile and one frame).
// `pool` may be null for single-threaded decoding — TODO confirm against
// RunOnPool's contract.
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
                  JxlDataType output_data_type, ThreadPool* pool,
                  PackedPixelFile* ppf);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DECODE_JPEG_H_

View File

@@ -58,8 +58,10 @@ class APNGEncoder : public Encoder {
std::vector<JxlPixelFormat> formats;
for (const uint32_t num_channels : {1, 2, 3, 4}) {
for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
formats.push_back(JxlPixelFormat{num_channels, data_type,
JXL_BIG_ENDIAN, /*align=*/0});
for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
formats.push_back(
JxlPixelFormat{num_channels, data_type, endianness, /*align=*/0});
}
}
}
return formats;
@@ -233,21 +235,7 @@ Status APNGEncoder::EncodePackedPixelFileToAPNG(
} else {
memcpy(&out[0], in, out_size);
}
} else if (format.data_type == JXL_TYPE_FLOAT) {
float mul = 65535.0;
const uint8_t* p_in = in;
uint8_t* p_out = out.data();
for (size_t i = 0; i < num_samples; ++i, p_in += 4, p_out += 2) {
uint32_t val = (format.endianness == JXL_BIG_ENDIAN ? LoadBE32(p_in)
: LoadLE32(p_in));
float fval;
memcpy(&fval, &val, 4);
StoreBE16(static_cast<uint32_t>(fval * mul + 0.5), p_out);
}
} else {
return JXL_FAILURE("Unsupported pixel data type");
}
png_structp png_ptr;
png_infop info_ptr;

View File

@@ -40,6 +40,34 @@ Status Encoder::VerifyBasicInfo(const JxlBasicInfo& info) const {
return true;
}
Status Encoder::VerifyFormat(const JxlPixelFormat& format) const {
for (auto f : AcceptedFormats()) {
if (f.num_channels != format.num_channels) continue;
if (f.data_type != format.data_type) continue;
if (f.data_type == JXL_TYPE_UINT8 || f.endianness == format.endianness) {
return true;
}
}
return JXL_FAILURE("Format is not in the list of accepted formats.");
}
Status Encoder::VerifyBitDepth(JxlDataType data_type, uint32_t bits_per_sample,
uint32_t exponent_bits) const {
if ((data_type == JXL_TYPE_UINT8 &&
(bits_per_sample == 0 || bits_per_sample > 8 || exponent_bits != 0)) ||
(data_type == JXL_TYPE_UINT16 &&
(bits_per_sample <= 8 || bits_per_sample > 16 || exponent_bits != 0)) ||
(data_type == JXL_TYPE_FLOAT16 &&
(bits_per_sample != 16 || exponent_bits != 5)) ||
(data_type == JXL_TYPE_FLOAT &&
(bits_per_sample != 32 || exponent_bits != 8))) {
return JXL_FAILURE(
"Incompatible data_type %d and bit depth %u with exponent bits %u",
(int)data_type, bits_per_sample, exponent_bits);
}
return true;
}
Status Encoder::VerifyPackedImage(const PackedImage& image,
const JxlBasicInfo& info) const {
if (image.pixels() == nullptr) {
@@ -57,10 +85,10 @@ Status Encoder::VerifyPackedImage(const PackedImage& image,
image.format.num_channels != info_num_channels) {
return JXL_FAILURE("Frame size does not match image size");
}
if (info.bits_per_sample >
PackedImage::BitsPerChannel(image.format.data_type)) {
return JXL_FAILURE("Bit depth does not fit pixel data type");
}
JXL_RETURN_IF_ERROR(VerifyFormat(image.format));
JXL_RETURN_IF_ERROR(VerifyBitDepth(image.format.data_type,
info.bits_per_sample,
info.exponent_bits_per_sample));
return true;
}

View File

@@ -60,6 +60,11 @@ class Encoder {
Status VerifyBasicInfo(const JxlBasicInfo& info) const;
Status VerifyFormat(const JxlPixelFormat& format) const;
Status VerifyBitDepth(JxlDataType data_type, uint32_t bits_per_sample,
uint32_t exponent_bits) const;
Status VerifyPackedImage(const PackedImage& image,
const JxlBasicInfo& info) const;

View File

@@ -111,7 +111,7 @@ Status EncodeWithLibJpeg(const PackedImage& image, const JxlBasicInfo& info,
const std::vector<uint8_t>& icc,
std::vector<uint8_t> exif, size_t quality,
const std::string& chroma_subsampling,
std::vector<uint8_t>* bytes) {
bool progressive, std::vector<uint8_t>* bytes) {
if (BITS_IN_JSAMPLE != 8 || sizeof(JSAMPLE) != 1) {
return JXL_FAILURE("Only 8 bit JSAMPLE is supported.");
}
@@ -139,6 +139,9 @@ Status EncodeWithLibJpeg(const PackedImage& image, const JxlBasicInfo& info,
jpeg_set_colorspace(&cinfo, JCS_RGB);
}
jpeg_set_quality(&cinfo, quality, TRUE);
if (progressive) {
jpeg_simple_progression(&cinfo);
}
jpeg_start_compress(&cinfo, TRUE);
if (!icc.empty()) {
WriteICCProfile(&cinfo, icc);
@@ -209,7 +212,8 @@ Status EncodeImageJPG(const PackedImage& image, const JxlBasicInfo& info,
const std::vector<uint8_t>& icc,
std::vector<uint8_t> exif, JpegEncoder encoder,
size_t quality, const std::string& chroma_subsampling,
ThreadPool* pool, std::vector<uint8_t>* bytes) {
bool progressive, ThreadPool* pool,
std::vector<uint8_t>* bytes) {
if (image.format.data_type != JXL_TYPE_UINT8) {
return JXL_FAILURE("Unsupported pixel data type");
}
@@ -222,9 +226,9 @@ Status EncodeImageJPG(const PackedImage& image, const JxlBasicInfo& info,
switch (encoder) {
case JpegEncoder::kLibJpeg:
JXL_RETURN_IF_ERROR(EncodeWithLibJpeg(image, info, color_encoding, icc,
std::move(exif), quality,
chroma_subsampling, bytes));
JXL_RETURN_IF_ERROR(
EncodeWithLibJpeg(image, info, color_encoding, icc, std::move(exif),
quality, chroma_subsampling, progressive, bytes));
break;
case JpegEncoder::kSJpeg:
JXL_RETURN_IF_ERROR(EncodeWithSJpeg(image, info, icc, std::move(exif),
@@ -253,28 +257,26 @@ class JPEGEncoder : public Encoder {
Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
ThreadPool* pool = nullptr) const override {
JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
const auto& options = this->options();
int quality = 100;
auto it_quality = options.find("q");
if (it_quality != options.end()) {
std::istringstream is(it_quality->second);
JXL_RETURN_IF_ERROR(static_cast<bool>(is >> quality));
}
std::string chroma_subsampling = "444";
auto it_chroma_subsampling = options.find("chroma_subsampling");
if (it_chroma_subsampling != options.end()) {
chroma_subsampling = it_chroma_subsampling->second;
}
JpegEncoder jpeg_encoder = JpegEncoder::kLibJpeg;
auto it_encoder = options.find("jpeg_encoder");
if (it_encoder != options.end()) {
if (it_encoder->second == "libjpeg") {
jpeg_encoder = JpegEncoder::kLibJpeg;
} else if (it_encoder->second == "sjpeg") {
jpeg_encoder = JpegEncoder::kSJpeg;
} else {
return JXL_FAILURE("unknown jpeg encoder \"%s\"",
it_encoder->second.c_str());
bool progressive = false;
for (const auto& it : options()) {
if (it.first == "q") {
std::istringstream is(it.second);
JXL_RETURN_IF_ERROR(static_cast<bool>(is >> quality));
} else if (it.first == "chroma_subsampling") {
chroma_subsampling = it.second;
} else if (it.first == "jpeg_encoder") {
if (it.second == "libjpeg") {
jpeg_encoder = JpegEncoder::kLibJpeg;
} else if (it.second == "sjpeg") {
jpeg_encoder = JpegEncoder::kSJpeg;
} else {
return JXL_FAILURE("unknown jpeg encoder \"%s\"", it.second.c_str());
}
} else if (it.first == "progressive") {
progressive = true;
}
}
std::vector<uint8_t> icc;
@@ -288,7 +290,7 @@ class JPEGEncoder : public Encoder {
encoded_image->bitstreams.emplace_back();
JXL_RETURN_IF_ERROR(EncodeImageJPG(
frame.color, ppf.info, ppf.color_encoding, icc, ppf.metadata.exif,
jpeg_encoder, quality, chroma_subsampling, pool,
jpeg_encoder, quality, chroma_subsampling, progressive, pool,
&encoded_image->bitstreams.back()));
}
return true;

View File

@@ -0,0 +1,231 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/enc/jxl.h"
#include "jxl/encode_cxx.h"
namespace jxl {
namespace extras {
// Applies a single encoder frame option, dispatching on whether the stored
// value is a float or an integer.
JxlEncoderStatus SetOption(const JXLOption& opt,
                           JxlEncoderFrameSettings* settings) {
  if (opt.is_float) {
    return JxlEncoderFrameSettingsSetFloatOption(settings, opt.id, opt.fval);
  }
  return JxlEncoderFrameSettingsSetOption(settings, opt.id, opt.ival);
}
bool SetFrameOptions(const std::vector<JXLOption>& options, size_t frame_index,
size_t* option_idx, JxlEncoderFrameSettings* settings) {
while (*option_idx < options.size()) {
const auto& opt = options[*option_idx];
if (opt.frame_index > frame_index) {
break;
}
if (JXL_ENC_SUCCESS != SetOption(opt, settings)) {
fprintf(stderr, "Setting option id %d failed.\n", opt.id);
return false;
}
(*option_idx)++;
}
return true;
}
// Encodes `ppf` (or, when `jpeg_bytes` is non-null, losslessly recompresses
// the given JPEG bitstream) into a JPEG XL codestream written to
// `compressed`. Errors are reported to stderr and return false.
bool EncodeImageJXL(const JXLCompressParams& params, const PackedPixelFile& ppf,
                    const std::vector<uint8_t>* jpeg_bytes,
                    std::vector<uint8_t>* compressed) {
  auto encoder = JxlEncoderMake(/*memory_manager=*/nullptr);
  JxlEncoder* enc = encoder.get();
  if (params.runner_opaque != nullptr &&
      JXL_ENC_SUCCESS != JxlEncoderSetParallelRunner(enc, params.runner,
                                                     params.runner_opaque)) {
    fprintf(stderr, "JxlEncoderSetParallelRunner failed\n");
    return false;
  }
  auto settings = JxlEncoderFrameSettingsCreate(enc, nullptr);
  // Apply options targeting frame 0 before any frame is added.
  size_t option_idx = 0;
  if (!SetFrameOptions(params.options, 0, &option_idx, settings)) {
    return false;
  }
  if (JXL_ENC_SUCCESS !=
      JxlEncoderSetFrameDistance(settings, params.distance)) {
    fprintf(stderr, "Setting frame distance failed.\n");
    return false;
  }
  // Metadata boxes and stored JPEG reconstruction data require the container
  // format even if the caller did not ask for it.
  bool use_container = params.use_container;
  if (!ppf.metadata.exif.empty() || !ppf.metadata.xmp.empty() ||
      !ppf.metadata.jumbf.empty() || !ppf.metadata.iptc.empty() ||
      (jpeg_bytes && params.jpeg_store_metadata)) {
    use_container = true;
  }
  if (JXL_ENC_SUCCESS !=
      JxlEncoderUseContainer(enc, static_cast<int>(use_container))) {
    fprintf(stderr, "JxlEncoderUseContainer failed.\n");
    return false;
  }
  if (jpeg_bytes) {
    // JPEG transcoding path: hand the original bitstream to the encoder.
    if (params.jpeg_store_metadata &&
        JXL_ENC_SUCCESS != JxlEncoderStoreJPEGMetadata(enc, JXL_TRUE)) {
      fprintf(stderr, "Storing JPEG metadata failed.\n");
      return false;
    }
    if (JXL_ENC_SUCCESS != JxlEncoderAddJPEGFrame(settings, jpeg_bytes->data(),
                                                  jpeg_bytes->size())) {
      fprintf(stderr, "JxlEncoderAddJPEGFrame() failed.\n");
      return false;
    }
  } else {
    // Pixel path: configure basic info, color encoding and extra channels,
    // then add every frame of the PackedPixelFile.
    size_t num_alpha_channels = 0;  // Adjusted below.
    JxlBasicInfo basic_info = ppf.info;
    if (basic_info.alpha_bits > 0) num_alpha_channels = 1;
    if (params.intensity_target > 0) {
      basic_info.intensity_target = params.intensity_target;
    }
    basic_info.num_extra_channels =
        std::max<uint32_t>(num_alpha_channels, ppf.info.num_extra_channels);
    basic_info.num_color_channels = ppf.info.num_color_channels;
    const bool lossless = params.distance == 0;
    basic_info.uses_original_profile = lossless;
    if (params.override_bitdepth != 0) {
      basic_info.bits_per_sample = params.override_bitdepth;
      // 32-bit override implies single-precision float (8 exponent bits).
      basic_info.exponent_bits_per_sample =
          params.override_bitdepth == 32 ? 8 : 0;
    }
    if (JXL_ENC_SUCCESS !=
        JxlEncoderSetCodestreamLevel(enc, params.codestream_level)) {
      fprintf(stderr, "Setting --codestream_level failed.\n");
      return false;
    }
    if (JXL_ENC_SUCCESS != JxlEncoderSetBasicInfo(enc, &basic_info)) {
      fprintf(stderr, "JxlEncoderSetBasicInfo() failed.\n");
      return false;
    }
    if (JXL_ENC_SUCCESS !=
        JxlEncoderSetFrameBitDepth(settings, &params.input_bitdepth)) {
      fprintf(stderr, "JxlEncoderSetFrameBitDepth() failed.\n");
      return false;
    }
    if (lossless &&
        JXL_ENC_SUCCESS != JxlEncoderSetFrameLossless(settings, JXL_TRUE)) {
      fprintf(stderr, "JxlEncoderSetFrameLossless() failed.\n");
      return false;
    }
    // Prefer an explicit ICC profile over the enum-based color encoding.
    if (!ppf.icc.empty()) {
      if (JXL_ENC_SUCCESS !=
          JxlEncoderSetICCProfile(enc, ppf.icc.data(), ppf.icc.size())) {
        fprintf(stderr, "JxlEncoderSetICCProfile() failed.\n");
        return false;
      }
    } else {
      if (JXL_ENC_SUCCESS !=
          JxlEncoderSetColorEncoding(enc, &ppf.color_encoding)) {
        fprintf(stderr, "JxlEncoderSetColorEncoding() failed.\n");
        return false;
      }
    }
    for (size_t num_frame = 0; num_frame < ppf.frames.size(); ++num_frame) {
      const jxl::extras::PackedFrame& pframe = ppf.frames[num_frame];
      const jxl::extras::PackedImage& pimage = pframe.color;
      JxlPixelFormat ppixelformat = pimage.format;
      if (JXL_ENC_SUCCESS !=
          JxlEncoderSetFrameHeader(settings, &pframe.frame_info)) {
        fprintf(stderr, "JxlEncoderSetFrameHeader() failed.\n");
        return false;
      }
      // Apply any options targeted at this specific frame.
      if (!SetFrameOptions(params.options, num_frame, &option_idx, settings)) {
        return false;
      }
      if (num_alpha_channels > 0) {
        JxlExtraChannelInfo extra_channel_info;
        JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &extra_channel_info);
        extra_channel_info.bits_per_sample = ppf.info.alpha_bits;
        extra_channel_info.exponent_bits_per_sample =
            ppf.info.alpha_exponent_bits;
        if (params.premultiply != -1) {
          if (params.premultiply != 0 && params.premultiply != 1) {
            fprintf(stderr, "premultiply must be one of: -1, 0, 1.\n");
            return false;
          }
          extra_channel_info.alpha_premultiplied = params.premultiply;
        }
        if (JXL_ENC_SUCCESS !=
            JxlEncoderSetExtraChannelInfo(enc, 0, &extra_channel_info)) {
          fprintf(stderr, "JxlEncoderSetExtraChannelInfo() failed.\n");
          return false;
        }
        // We take the extra channel blend info frame_info, but don't do
        // clamping.
        JxlBlendInfo extra_channel_blend_info =
            pframe.frame_info.layer_info.blend_info;
        extra_channel_blend_info.clamp = JXL_FALSE;
        JxlEncoderSetExtraChannelBlendInfo(settings, 0,
                                           &extra_channel_blend_info);
      }
      // Channels interleaved with the color data (i.e. alpha) come before
      // the separately-stored extra channels in the channel index space.
      size_t num_interleaved_alpha =
          (ppixelformat.num_channels - ppf.info.num_color_channels);
      // Add extra channel info for the rest of the extra channels.
      for (size_t i = 0; i < ppf.info.num_extra_channels; ++i) {
        if (i < ppf.extra_channels_info.size()) {
          const auto& ec_info = ppf.extra_channels_info[i].ec_info;
          if (JXL_ENC_SUCCESS !=
              JxlEncoderSetExtraChannelInfo(enc, num_interleaved_alpha + i,
                                            &ec_info)) {
            fprintf(stderr, "JxlEncoderSetExtraChannelInfo() failed.\n");
            return false;
          }
        }
      }
      if (JXL_ENC_SUCCESS != JxlEncoderAddImageFrame(settings, &ppixelformat,
                                                     pimage.pixels(),
                                                     pimage.pixels_size)) {
        fprintf(stderr, "JxlEncoderAddImageFrame() failed.\n");
        return false;
      }
      // Only set extra channel buffer if it is provided non-interleaved.
      for (size_t i = 0; i < pframe.extra_channels.size(); ++i) {
        if (JXL_ENC_SUCCESS !=
            JxlEncoderSetExtraChannelBuffer(settings, &ppixelformat,
                                            pframe.extra_channels[i].pixels(),
                                            pframe.extra_channels[i].stride *
                                                pframe.extra_channels[i].ysize,
                                            num_interleaved_alpha + i)) {
          fprintf(stderr, "JxlEncoderSetExtraChannelBuffer() failed.\n");
          return false;
        }
      }
    }
  }
  JxlEncoderCloseInput(enc);
  // Reading compressed output
  // Pull encoded bytes, doubling the buffer until the encoder is done.
  compressed->clear();
  compressed->resize(4096);
  uint8_t* next_out = compressed->data();
  size_t avail_out = compressed->size() - (next_out - compressed->data());
  JxlEncoderStatus result = JXL_ENC_NEED_MORE_OUTPUT;
  while (result == JXL_ENC_NEED_MORE_OUTPUT) {
    result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
    if (result == JXL_ENC_NEED_MORE_OUTPUT) {
      size_t offset = next_out - compressed->data();
      compressed->resize(compressed->size() * 2);
      next_out = compressed->data() + offset;
      avail_out = compressed->size() - offset;
    }
  }
  // Trim to the bytes actually produced.
  compressed->resize(next_out - compressed->data());
  if (result != JXL_ENC_SUCCESS) {
    fprintf(stderr, "JxlEncoderProcessOutput failed.\n");
    return false;
  }
  return true;
}
} // namespace extras
} // namespace jxl

View File

@@ -0,0 +1,73 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_ENC_JXL_H_
#define LIB_EXTRAS_ENC_JXL_H_
#include <stdint.h>
#include <vector>
#include "jxl/encode.h"
#include "jxl/parallel_runner.h"
#include "jxl/thread_parallel_runner.h"
#include "jxl/types.h"
#include "lib/extras/packed_image.h"
namespace jxl {
namespace extras {
struct JXLOption {
JXLOption(JxlEncoderFrameSettingId id, int64_t val, size_t frame_index)
: id(id), is_float(false), ival(val), frame_index(frame_index) {}
JXLOption(JxlEncoderFrameSettingId id, float val, size_t frame_index)
: id(id), is_float(true), fval(val), frame_index(frame_index) {}
JxlEncoderFrameSettingId id;
bool is_float;
union {
int64_t ival;
float fval;
};
size_t frame_index;
};
struct JXLCompressParams {
std::vector<JXLOption> options;
// Target butteraugli distance, 0.0 means lossless.
float distance = 1.0f;
// If set to true, forces container mode.
bool use_container = false;
// Whether to enable/disable byte-exact jpeg reconstruction for jpeg inputs.
bool jpeg_store_metadata = true;
// Upper bound on the intensity level present in the image in nits (zero means
// that the library chooses a default).
float intensity_target = 0;
// Overrides for bitdepth, codestream level and alpha premultiply.
size_t override_bitdepth = 0;
int32_t codestream_level = -1;
int32_t premultiply = -1;
// Override input buffer interpretation.
JxlBitDepth input_bitdepth = {JXL_BIT_DEPTH_FROM_PIXEL_FORMAT, 0, 0};
// If runner_opaque is set, the decoder uses this parallel runner.
JxlParallelRunner runner = JxlThreadParallelRunner;
void* runner_opaque = nullptr;
void AddOption(JxlEncoderFrameSettingId id, int64_t val) {
options.emplace_back(JXLOption(id, val, 0));
}
void AddFloatOption(JxlEncoderFrameSettingId id, float val) {
options.emplace_back(JXLOption(id, val, 0));
}
};
bool EncodeImageJXL(const JXLCompressParams& params, const PackedPixelFile& ppf,
const std::vector<uint8_t>* jpeg_bytes,
std::vector<uint8_t>* compressed);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_ENC_JXL_H_

View File

@@ -32,69 +32,6 @@ namespace {
constexpr size_t kMaxHeaderSize = 200;
Status EncodeHeader(const PackedImage& image, size_t bits_per_sample,
bool little_endian, char* header, int* chars_written) {
size_t num_channels = image.format.num_channels;
bool is_gray = num_channels <= 2;
bool has_alpha = num_channels == 2 || num_channels == 4;
if (has_alpha) { // PAM
if (bits_per_sample > 16) return JXL_FAILURE("PNM cannot have > 16 bits");
const uint32_t max_val = (1U << bits_per_sample) - 1;
*chars_written =
snprintf(header, kMaxHeaderSize,
"P7\nWIDTH %" PRIuS "\nHEIGHT %" PRIuS
"\nDEPTH %u\nMAXVAL %u\nTUPLTYPE %s\nENDHDR\n",
image.xsize, image.ysize, is_gray ? 2 : 4, max_val,
is_gray ? "GRAYSCALE_ALPHA" : "RGB_ALPHA");
JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
kMaxHeaderSize);
} else if (bits_per_sample == 32) { // PFM
const char type = is_gray ? 'f' : 'F';
const double scale = little_endian ? -1.0 : 1.0;
*chars_written =
snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%.1f\n",
type, image.xsize, image.ysize, scale);
JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
kMaxHeaderSize);
} else { // PGM/PPM
if (bits_per_sample > 16) return JXL_FAILURE("PNM cannot have > 16 bits");
const uint32_t max_val = (1U << bits_per_sample) - 1;
const char type = is_gray ? '5' : '6';
*chars_written =
snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%u\n",
type, image.xsize, image.ysize, max_val);
JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
kMaxHeaderSize);
}
return true;
}
Status EncodeImagePNM(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) {
if (bits_per_sample <= 16 && image.format.endianness != JXL_BIG_ENDIAN) {
return JXL_FAILURE("PPM/PGM requires big-endian pixel format.");
}
bool is_little_endian =
(image.format.endianness == JXL_LITTLE_ENDIAN ||
(image.format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian()));
char header[kMaxHeaderSize];
int header_size = 0;
JXL_RETURN_IF_ERROR(EncodeHeader(image, bits_per_sample, is_little_endian,
header, &header_size));
bytes->resize(static_cast<size_t>(header_size) + image.pixels_size);
memcpy(bytes->data(), header, static_cast<size_t>(header_size));
const bool flipped_y = bits_per_sample == 32; // PFMs are flipped
const uint8_t* in = reinterpret_cast<const uint8_t*>(image.pixels());
uint8_t* out = bytes->data() + header_size;
for (size_t y = 0; y < image.ysize; ++y) {
size_t y_out = flipped_y ? image.ysize - 1 - y : y;
const uint8_t* row_in = &in[y * image.stride];
uint8_t* row_out = &out[y_out * image.stride];
memcpy(row_out, row_in, image.stride);
}
return true;
}
class PNMEncoder : public Encoder {
public:
Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
@@ -110,8 +47,8 @@ class PNMEncoder : public Encoder {
for (const auto& frame : ppf.frames) {
JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
encoded_image->bitstreams.emplace_back();
JXL_RETURN_IF_ERROR(EncodeImagePNM(frame.color, ppf.info.bits_per_sample,
&encoded_image->bitstreams.back()));
JXL_RETURN_IF_ERROR(
EncodeFrame(ppf, frame, &encoded_image->bitstreams.back()));
}
for (size_t i = 0; i < ppf.extra_channels_info.size(); ++i) {
const auto& ec_info = ppf.extra_channels_info[i].ec_info;
@@ -119,30 +56,61 @@ class PNMEncoder : public Encoder {
auto& ec_bitstreams = encoded_image->extra_channel_bitstreams.back();
for (const auto& frame : ppf.frames) {
ec_bitstreams.emplace_back();
JXL_RETURN_IF_ERROR(EncodeImagePNM(frame.extra_channels[i],
ec_info.bits_per_sample,
&ec_bitstreams.back()));
JXL_RETURN_IF_ERROR(EncodeExtraChannel(frame.extra_channels[i],
ec_info.bits_per_sample,
&ec_bitstreams.back()));
}
}
return true;
}
protected:
virtual Status EncodeFrame(const PackedPixelFile& ppf,
const PackedFrame& frame,
std::vector<uint8_t>* bytes) const = 0;
virtual Status EncodeExtraChannel(const PackedImage& image,
size_t bits_per_sample,
std::vector<uint8_t>* bytes) const = 0;
};
class PPMEncoder : public PNMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats;
for (const uint32_t num_channels : {1, 2, 3, 4}) {
for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
for (JxlEndianness endianness : {JXL_BIG_ENDIAN}) {
formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
/*data_type=*/data_type,
/*endianness=*/endianness,
/*align=*/0});
}
}
}
return formats;
return {JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0},
JxlPixelFormat{3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}};
}
Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
std::vector<uint8_t>* bytes) const override {
return EncodeImage(frame.color, ppf.info.bits_per_sample, bytes);
}
Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) const override {
return EncodeImage(image, bits_per_sample, bytes);
}
private:
Status EncodeImage(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) const {
uint32_t maxval = (1u << bits_per_sample) - 1;
char type = image.format.num_channels == 1 ? '5' : '6';
char header[kMaxHeaderSize];
size_t header_size =
snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%u\n",
type, image.xsize, image.ysize, maxval);
JXL_RETURN_IF_ERROR(header_size < kMaxHeaderSize);
bytes->resize(header_size + image.pixels_size);
memcpy(bytes->data(), header, header_size);
memcpy(bytes->data() + header_size,
reinterpret_cast<uint8_t*>(image.pixels()), image.pixels_size);
return true;
}
};
class PGMEncoder : public PPMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
return {JxlPixelFormat{1, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0},
JxlPixelFormat{1, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}};
}
};
@@ -151,54 +119,168 @@ class PFMEncoder : public PNMEncoder {
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats;
for (const uint32_t num_channels : {1, 3}) {
for (const JxlDataType data_type : {JXL_TYPE_FLOAT16, JXL_TYPE_FLOAT}) {
for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
/*data_type=*/data_type,
/*endianness=*/endianness,
/*align=*/0});
for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
/*data_type=*/JXL_TYPE_FLOAT,
/*endianness=*/endianness,
/*align=*/0});
}
}
return formats;
}
Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
std::vector<uint8_t>* bytes) const override {
return EncodeImage(frame.color, bytes);
}
Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) const override {
return EncodeImage(image, bytes);
}
private:
Status EncodeImage(const PackedImage& image,
std::vector<uint8_t>* bytes) const {
char type = image.format.num_channels == 1 ? 'f' : 'F';
double scale = image.format.endianness == JXL_LITTLE_ENDIAN ? -1.0 : 1.0;
char header[kMaxHeaderSize];
size_t header_size =
snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%.1f\n",
type, image.xsize, image.ysize, scale);
JXL_RETURN_IF_ERROR(header_size < kMaxHeaderSize);
bytes->resize(header_size + image.pixels_size);
memcpy(bytes->data(), header, header_size);
const uint8_t* in = reinterpret_cast<const uint8_t*>(image.pixels());
uint8_t* out = bytes->data() + header_size;
for (size_t y = 0; y < image.ysize; ++y) {
size_t y_out = image.ysize - 1 - y;
const uint8_t* row_in = &in[y * image.stride];
uint8_t* row_out = &out[y_out * image.stride];
memcpy(row_out, row_in, image.stride);
}
return true;
}
};
class PAMEncoder : public PNMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats;
for (const uint32_t num_channels : {1, 2, 3, 4}) {
for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
/*data_type=*/data_type,
/*endianness=*/JXL_BIG_ENDIAN,
/*align=*/0});
}
}
return formats;
}
Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
std::vector<uint8_t>* bytes) const override {
const PackedImage& color = frame.color;
const auto& ec_info = ppf.extra_channels_info;
JXL_RETURN_IF_ERROR(frame.extra_channels.size() == ec_info.size());
for (const auto& ec : frame.extra_channels) {
if (ec.xsize != color.xsize || ec.ysize != color.ysize) {
return JXL_FAILURE("Extra channel and color size mismatch.");
}
if (ec.format.data_type != color.format.data_type ||
ec.format.endianness != color.format.endianness) {
return JXL_FAILURE("Extra channel and color format mismatch.");
}
}
if (ppf.info.bits_per_sample != ppf.info.alpha_bits) {
return JXL_FAILURE("Alpha bit depth does not match image bit depth");
}
for (const auto& it : ec_info) {
if (it.ec_info.bits_per_sample != ppf.info.bits_per_sample) {
return JXL_FAILURE(
"Extra channel bit depth does not match image bit depth");
}
}
const char* kColorTypes[4] = {"GRAYSCALE", "GRAYSCALE_ALPHA", "RGB",
"RGB_ALPHA"};
uint32_t maxval = (1u << ppf.info.bits_per_sample) - 1;
uint32_t depth = color.format.num_channels + ec_info.size();
char header[kMaxHeaderSize];
size_t pos = 0;
pos += snprintf(header + pos, kMaxHeaderSize - pos,
"P7\nWIDTH %" PRIuS "\nHEIGHT %" PRIuS
"\nDEPTH %u\n"
"MAXVAL %u\nTUPLTYPE %s\n",
color.xsize, color.ysize, depth, maxval,
kColorTypes[color.format.num_channels - 1]);
JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
for (const auto& info : ec_info) {
pos += snprintf(header + pos, kMaxHeaderSize - pos, "TUPLTYPE %s\n",
ExtraChannelTypeName(info.ec_info.type).c_str());
JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
}
pos += snprintf(header + pos, kMaxHeaderSize - pos, "ENDHDR\n");
JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
size_t total_size = color.pixels_size;
for (const auto& ec : frame.extra_channels) {
total_size += ec.pixels_size;
}
bytes->resize(pos + total_size);
memcpy(bytes->data(), header, pos);
// If we have no extra channels, just copy color pixel data over.
if (frame.extra_channels.empty()) {
memcpy(bytes->data() + pos, reinterpret_cast<uint8_t*>(color.pixels()),
color.pixels_size);
return true;
}
// Interleave color and extra channels.
const uint8_t* in = reinterpret_cast<const uint8_t*>(color.pixels());
std::vector<const uint8_t*> ec_in(frame.extra_channels.size());
for (size_t i = 0; i < frame.extra_channels.size(); ++i) {
ec_in[i] =
reinterpret_cast<const uint8_t*>(frame.extra_channels[i].pixels());
}
uint8_t* out = bytes->data() + pos;
size_t pwidth = PackedImage::BitsPerChannel(color.format.data_type) / 8;
for (size_t y = 0; y < color.ysize; ++y) {
for (size_t x = 0; x < color.xsize; ++x) {
memcpy(out, in, color.pixel_stride());
out += color.pixel_stride();
in += color.pixel_stride();
for (auto& p : ec_in) {
memcpy(out, p, pwidth);
out += pwidth;
p += pwidth;
}
}
}
return formats;
return true;
}
Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) const override {
return true;
}
};
class PGMEncoder : public PPMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats = PPMEncoder::AcceptedFormats();
for (auto it = formats.begin(); it != formats.end();) {
if (it->num_channels > 2) {
it = formats.erase(it);
} else {
++it;
}
private:
static std::string ExtraChannelTypeName(JxlExtraChannelType type) {
switch (type) {
case JXL_CHANNEL_ALPHA:
return std::string("Alpha");
case JXL_CHANNEL_DEPTH:
return std::string("Depth");
case JXL_CHANNEL_SPOT_COLOR:
return std::string("SpotColor");
case JXL_CHANNEL_SELECTION_MASK:
return std::string("SelectionMask");
case JXL_CHANNEL_BLACK:
return std::string("Black");
case JXL_CHANNEL_CFA:
return std::string("CFA");
case JXL_CHANNEL_THERMAL:
return std::string("Thermal");
default:
return std::string("UNKNOWN");
}
return formats;
}
};
class PAMEncoder : public PPMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats = PPMEncoder::AcceptedFormats();
for (auto it = formats.begin(); it != formats.end();) {
if (it->num_channels != 2 && it->num_channels != 4) {
it = formats.erase(it);
} else {
++it;
}
}
return formats;
}
};
Span<const uint8_t> MakeSpan(const char* str) {
return Span<const uint8_t>(reinterpret_cast<const uint8_t*>(str),
strlen(str));
}
} // namespace
std::unique_ptr<Encoder> GetPPMEncoder() {

View File

@@ -31,27 +31,42 @@ namespace HWY_NAMESPACE {
void ComputeDCTCoefficients(const Image3F& opsin, const ImageF& qf,
const FrameDimensions& frame_dim, const float* qm,
std::vector<jpeg::JPEGComponent>* components) {
int max_samp_factor = 1;
for (const auto& c : *components) {
JXL_DASSERT(c.h_samp_factor == c.v_samp_factor);
max_samp_factor = std::max(c.h_samp_factor, max_samp_factor);
}
float qfmin, qfmax;
ImageMinMax(qf, &qfmin, &qfmax);
HWY_ALIGN float scratch_space[2 * kDCTBlockSize];
ImageF tmp;
for (size_t c = 0; c < 3; c++) {
std::vector<jpeg::coeff_t>& coeffs = (*components)[c].coeffs;
size_t num_blocks = frame_dim.xsize_blocks * frame_dim.ysize_blocks;
coeffs.resize(num_blocks * kDCTBlockSize);
auto& comp = (*components)[c];
const size_t xsize_blocks = comp.width_in_blocks;
const size_t ysize_blocks = comp.height_in_blocks;
JXL_DASSERT(max_samp_factor % comp.h_samp_factor == 0);
const int factor = max_samp_factor / comp.h_samp_factor;
const ImageF* plane = &opsin.Plane(c);
if (factor > 1) {
tmp = CopyImage(*plane);
DownsampleImage(&tmp, factor);
plane = &tmp;
}
std::vector<jpeg::coeff_t>& coeffs = comp.coeffs;
coeffs.resize(xsize_blocks * ysize_blocks * kDCTBlockSize);
const float* qmc = &qm[c * kDCTBlockSize];
for (size_t by = 0, bix = 0; by < frame_dim.ysize_blocks; by++) {
for (size_t bx = 0; bx < frame_dim.xsize_blocks; bx++, bix++) {
HWY_ALIGN float dct[kDCTBlockSize];
TransformFromPixels(AcStrategy::Type::DCT,
opsin.PlaneRow(c, 8 * by) + 8 * bx,
opsin.PixelsPerRow(), dct, scratch_space);
for (size_t by = 0, bix = 0; by < ysize_blocks; by++) {
for (size_t bx = 0; bx < xsize_blocks; bx++, bix++) {
jpeg::coeff_t* block = &coeffs[bix * kDCTBlockSize];
HWY_ALIGN float dct[kDCTBlockSize];
TransformFromPixels(AcStrategy::Type::DCT, plane->Row(8 * by) + 8 * bx,
plane->PixelsPerRow(), dct, scratch_space);
for (size_t iy = 0, i = 0; iy < 8; iy++) {
for (size_t ix = 0; ix < 8; ix++, i++) {
float coeff = 2040 * dct[i] * qmc[i];
// Create more zeros in areas where jpeg xl would have used a lower
// quantization multiplier.
float zero_bias = 0.5f * qfmax / qf.Row(by)[bx];
float zero_bias = 0.5f * qfmax / qf.Row(by * factor)[bx * factor];
int cc = std::abs(coeff) < zero_bias ? 0 : std::round(coeff);
// If the relative value of the adaptive quantization field is less
// than 0.5, we drop the least significant bit.
@@ -102,7 +117,7 @@ std::vector<uint8_t> CreateXybICCAppMarker() {
return icc_marker;
}
void AddJpegQuantMatrices(const ImageF& qf, float dc_quant,
void AddJpegQuantMatrices(const ImageF& qf, float dc_quant, float global_scale,
std::vector<jpeg::JPEGQuantTable>* quant_tables,
float* qm) {
// Create a custom JPEG XL dequant matrix. The quantization weight parameters
@@ -127,7 +142,6 @@ void AddJpegQuantMatrices(const ImageF& qf, float dc_quant,
// Scale the quant matrix based on the scaled XYB scales and the quant field.
float qfmin, qfmax;
ImageMinMax(qf, &qfmin, &qfmax);
const float global_scale = 0.66f;
for (size_t c = 0; c < 3; c++) {
const float scale = kScaledXYBScale[c] * global_scale;
qm[c * kDCTBlockSize] *= scale;
@@ -237,7 +251,9 @@ void AddJpegHuffmanCodes(std::vector<Histogram>& histograms,
}
void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
float global_scale, const bool subsample_blue,
const FrameDimensions& frame_dim, jpeg::JPEGData* out) {
*out = jpeg::JPEGData();
// ICC
out->marker_order.push_back(0xe2);
out->app_data.push_back(CreateXybICCAppMarker());
@@ -245,7 +261,7 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
// DQT
out->marker_order.emplace_back(0xdb);
float qm[3 * kDCTBlockSize];
AddJpegQuantMatrices(qf, dc_quant, &out->quant, qm);
AddJpegQuantMatrices(qf, dc_quant, global_scale, &out->quant, qm);
// SOF
out->marker_order.emplace_back(0xc2);
@@ -255,11 +271,15 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
out->components[0].id = 'R';
out->components[1].id = 'G';
out->components[2].id = 'B';
size_t max_samp_factor = subsample_blue ? 2 : 1;
for (size_t c = 0; c < 3; ++c) {
out->components[c].h_samp_factor = 1;
out->components[c].v_samp_factor = 1;
out->components[c].width_in_blocks = frame_dim.xsize_blocks;
out->components[c].height_in_blocks = frame_dim.ysize_blocks;
const size_t factor = (subsample_blue && c == 2) ? 2 : 1;
out->components[c].h_samp_factor = max_samp_factor / factor;
out->components[c].v_samp_factor = max_samp_factor / factor;
JXL_ASSERT(frame_dim.xsize_blocks % factor == 0);
JXL_ASSERT(frame_dim.ysize_blocks % factor == 0);
out->components[c].width_in_blocks = frame_dim.xsize_blocks / factor;
out->components[c].height_in_blocks = frame_dim.ysize_blocks / factor;
out->components[c].quant_idx = c;
}
HWY_DYNAMIC_DISPATCH(ComputeDCTCoefficients)
@@ -271,7 +291,7 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
// SOS
std::vector<ProgressiveScan> progressive_mode = {
// DC
{0, 0, 0, 0, true},
{0, 0, 0, 0, !subsample_blue},
// AC 1 - highest bits
{1, 63, 0, 1, false},
// AC 2 - lowest bit
@@ -315,18 +335,31 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
}
}
size_t JpegSize(const jpeg::JPEGData& jpeg_data) {
size_t total_size = 0;
auto countsize = [&total_size](const uint8_t* buf, size_t len) {
total_size += len;
return len;
};
JXL_CHECK(jpeg::WriteJpeg(jpeg_data, countsize));
return total_size;
}
} // namespace
Status EncodeJpeg(const ImageBundle& input, float distance, ThreadPool* pool,
std::vector<uint8_t>* compressed) {
Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
ThreadPool* pool, std::vector<uint8_t>* compressed) {
const bool subsample_blue = true;
const size_t max_shift = subsample_blue ? 1 : 0;
FrameDimensions frame_dim;
frame_dim.Set(input.xsize(), input.ysize(), 1, 0, 0, false, 1);
frame_dim.Set(input.xsize(), input.ysize(), 1, max_shift, max_shift, false,
1);
// Convert input to XYB colorspace.
Image3F opsin(frame_dim.xsize_padded, frame_dim.ysize_padded);
opsin.ShrinkTo(frame_dim.xsize, frame_dim.ysize);
ToXYB(input, pool, &opsin, GetJxlCms());
PadImageToBlockMultipleInPlace(&opsin);
PadImageToBlockMultipleInPlace(&opsin, 8 << max_shift);
// Compute adaptive quant field.
ImageF mask;
@@ -335,7 +368,39 @@ Status EncodeJpeg(const ImageBundle& input, float distance, ThreadPool* pool,
// Create jpeg data and optimize Huffman codes.
jpeg::JPEGData jpeg_data;
FillJPEGData(opsin, qf, InitialQuantDC(distance), frame_dim, &jpeg_data);
float global_scale = 0.66f;
float dc_quant = InitialQuantDC(distance);
FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
&jpeg_data);
if (target_size != 0) {
// Tweak the jpeg data so that the resulting compressed file is
// approximately target_size long.
size_t prev_size = 0;
float best_error = 100.0f;
float best_global_scale = global_scale;
size_t iter = 0;
for (;;) {
size_t size = JpegSize(jpeg_data);
float error = size * 1.0f / target_size - 1.0f;
if (std::abs(error) < std::abs(best_error)) {
best_error = error;
best_global_scale = global_scale;
}
if (size == prev_size || std::abs(error) < 0.001f || iter >= 10) {
break;
}
global_scale *= 1.0f + error;
FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
&jpeg_data);
prev_size = size;
++iter;
}
if (best_global_scale != global_scale) {
FillJPEGData(opsin, qf, dc_quant, best_global_scale, subsample_blue,
frame_dim, &jpeg_data);
}
}
// Write jpeg data to compressed stream.
auto write = [&compressed](const uint8_t* buf, size_t len) {

View File

@@ -16,8 +16,8 @@
namespace jxl {
namespace extras {
Status EncodeJpeg(const ImageBundle& input, float distance, ThreadPool* pool,
std::vector<uint8_t>* compressed);
Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
ThreadPool* pool, std::vector<uint8_t>* compressed);
} // namespace extras
} // namespace jxl

View File

@@ -33,6 +33,13 @@ class PackedImage {
PackedImage(size_t xsize, size_t ysize, const JxlPixelFormat& format)
: PackedImage(xsize, ysize, format, CalcStride(format, xsize)) {}
PackedImage Copy() const {
PackedImage copy(xsize, ysize, format);
memcpy(reinterpret_cast<uint8_t*>(copy.pixels()),
reinterpret_cast<const uint8_t*>(pixels()), pixels_size);
return copy;
}
// The interleaved pixels as defined in the storage format.
void* pixels() const { return pixels_.get(); }
@@ -98,6 +105,18 @@ class PackedFrame {
template <typename... Args>
explicit PackedFrame(Args&&... args) : color(std::forward<Args>(args)...) {}
PackedFrame Copy() const {
PackedFrame copy(color.xsize, color.ysize, color.format);
copy.frame_info = frame_info;
copy.name = name;
copy.color = color.Copy();
for (size_t i = 0; i < extra_channels.size(); ++i) {
PackedImage ec = extra_channels[i].Copy();
copy.extra_channels.emplace_back(std::move(ec));
}
return copy;
}
// The Frame metadata.
JxlFrameHeader frame_info = {};
std::string name;
@@ -117,17 +136,18 @@ class PackedMetadata {
std::vector<uint8_t> xmp;
};
// The extra channel metadata information.
struct PackedExtraChannel {
JxlExtraChannelInfo ec_info;
size_t index;
std::string name;
};
// Helper class representing a JXL image file as decoded to pixels from the API.
class PackedPixelFile {
public:
JxlBasicInfo info = {};
// The extra channel metadata information.
struct PackedExtraChannel {
JxlExtraChannelInfo ec_info;
size_t index;
std::string name;
};
std::vector<PackedExtraChannel> extra_channels_info;
// Color information of the decoded pixels.

View File

@@ -58,10 +58,8 @@ Status ConvertPackedFrameToImageBundle(const JxlBasicInfo& info,
JXL_RETURN_IF_ERROR(ConvertFromExternal(
span, frame.color.xsize, frame.color.ysize, io.metadata.m.color_encoding,
frame.color.format.num_channels,
/*alpha_is_premultiplied=*/info.alpha_premultiplied,
frame_bits_per_sample, frame.color.format.endianness, pool, bundle,
/*float_in=*/float_in, /*align=*/0));
frame_bits_per_sample, frame.color.format, pool, bundle));
bundle->extra_channels().resize(io.metadata.m.extra_channel_info.size());
for (size_t i = 0; i < frame.extra_channels.size(); i++) {
@@ -140,8 +138,7 @@ Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
io->blobs.xmp = ppf.metadata.xmp;
// Append all other extra channels.
for (const PackedPixelFile::PackedExtraChannel& info :
ppf.extra_channels_info) {
for (const auto& info : ppf.extra_channels_info) {
ExtraChannelInfo out;
out.type = static_cast<jxl::ExtraChannel>(info.ec_info.type);
out.bit_depth.bits_per_sample = info.ec_info.bits_per_sample;

View File

@@ -22,6 +22,7 @@
#include "jxl/memory_manager.h"
#include "jxl/parallel_runner.h"
#include "jxl/types.h"
#include "jxl/version.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
@@ -742,14 +743,26 @@ typedef enum {
* represented, the ICC profile may be a close approximation. It is also not
* always feasible to deduce from an ICC profile which named color space it
* exactly represents, if any, as it can represent any arbitrary space.
* HDR color spaces such as those using PQ and HLG are also potentially
* problematic, in that: while ICC profiles can encode a transfer function
* that happens to approximate those of PQ and HLG (HLG for only one given
* system gamma at a time, and necessitating a 3D LUT if gamma is to be
* different from 1), they cannot (before ICCv4.4) semantically signal that
* this is the color space that they represent. Therefore, they will
* typically not actually be interpreted as representing an HDR color space.
* This is especially detrimental to PQ which will then be interpreted as if
* the maximum signal value represented SDR white instead of 10000 cd/m^2,
* meaning that the image will be displayed two orders of magnitude (5-7 EV)
* too dim.
* - The JPEG XL image has an encoded structured color profile, and it
* indicates an unknown or xyb color space. In that case, @ref
* JxlDecoderGetColorAsICCProfile is not available.
*
* When rendering an image on a system that supports ICC profiles, @ref
* JxlDecoderGetColorAsICCProfile should be used first. When rendering
* for a specific color space, possibly indicated in the JPEG XL
* image, @ref JxlDecoderGetColorAsEncodedProfile should be used first.
* When rendering an image on a system where ICC-based color management is used,
* @ref JxlDecoderGetColorAsICCProfile should generally be used first as it will
* return a ready-to-use profile (with the aforementioned caveat about HDR).
* When knowledge about the nominal color space is desired if available, @ref
* JxlDecoderGetColorAsEncodedProfile should be used first.
*
* @param dec decoder object
* @param unused_format deprecated, can be NULL
@@ -1437,6 +1450,21 @@ JXL_EXPORT size_t JxlDecoderGetIntendedDownsamplingRatio(JxlDecoder* dec);
*/
JXL_EXPORT JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec);
/**
* Sets the bit depth of the output buffer or callback.
*
* Can be called after @ref JxlDecoderSetImageOutBuffer or @ref
* JxlDecoderSetImageOutCallback. For float pixel data types, only the default
* @ref JXL_BIT_DEPTH_FROM_PIXEL_FORMAT setting is supported.
*
* @param dec decoder object
* @param bit_depth the bit depth setting of the pixel output
* @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
* incompatible custom bit depth and pixel data type.
*/
JXL_EXPORT JxlDecoderStatus
JxlDecoderSetImageOutBitDepth(JxlDecoder* dec, const JxlBitDepth* bit_depth);
#if defined(__cplusplus) || defined(c_plusplus)
}
#endif

View File

@@ -18,6 +18,7 @@
#include "jxl/jxl_export.h"
#include "jxl/memory_manager.h"
#include "jxl/parallel_runner.h"
#include "jxl/version.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
@@ -514,6 +515,22 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBlendInfo(
JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameName(
JxlEncoderFrameSettings* frame_settings, const char* frame_name);
/**
* Sets the bit depth of the input buffer.
*
* For float pixel formats, only the default JXL_BIT_DEPTH_FROM_PIXEL_FORMAT
* setting is allowed, while for unsigned pixel formats,
* JXL_BIT_DEPTH_FROM_CODESTREAM setting is also allowed. See the comment on
* @ref JxlEncoderAddImageFrame for the effects of the bit depth setting.
* @param frame_settings set of options and metadata for this frame. Also
* includes reference to the encoder object.
* @param bit_depth the bit depth setting of the pixel input
* @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
*/
JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameBitDepth(
JxlEncoderFrameSettings* frame_settings, const JxlBitDepth* bit_depth);
/**
* Sets the buffer to read JPEG encoded bytes from for the next frame to encode.
*
@@ -555,15 +572,22 @@ JxlEncoderAddJPEGFrame(const JxlEncoderFrameSettings* frame_settings,
* - JXL_TYPE_FLOAT, with nominal range 0..1
*
* Note: the sample data type in pixel_format is allowed to be different from
* what is described in the JxlBasicInfo. The type in pixel_format describes the
* format of the uncompressed pixel buffer. The bits_per_sample and
* exponent_bits_per_sample in the JxlBasicInfo describes what will actually be
* encoded in the JPEG XL codestream. For example, to encode a 12-bit image, you
* would set bits_per_sample to 12, and you could use e.g. JXL_TYPE_UINT16
* (where the values are rescaled to 16-bit, i.e. multiplied by 65535/4095) or
* JXL_TYPE_FLOAT (where the values are rescaled to 0..1, i.e. multiplied
* by 1.f/4095.f). While it is allowed, it is obviously not recommended to use a
* pixel_format with lower precision than what is specified in the JxlBasicInfo.
* what is described in the JxlBasicInfo. The type in pixel_format, together
* with an optional @ref JxlBitDepth parameter set by @ref
* JxlEncoderSetFrameBitDepth describes the format of the uncompressed pixel
* buffer. The bits_per_sample and exponent_bits_per_sample in the JxlBasicInfo
* describes what will actually be encoded in the JPEG XL codestream.
* For example, to encode a 12-bit image, you would set bits_per_sample to 12,
* while the input frame buffer can be in the following formats:
* - if pixel format is in JXL_TYPE_UINT16 with default bit depth setting
* (i.e. JXL_BIT_DEPTH_FROM_PIXEL_FORMAT), input sample values are rescaled
* to 16-bit, i.e. multiplied by 65535/4095;
* - if pixel format is in JXL_TYPE_UINT16 with JXL_BIT_DEPTH_FROM_CODESTREAM
* bit depth setting, input sample values are provided unscaled;
* - if pixel format is in JXL_TYPE_FLOAT, input sample values are rescaled
* to 0..1, i.e. multiplied by 1.f/4095.f.
* While it is allowed, it is obviously not recommended to use a pixel_format
* with lower precision than what is specified in the JxlBasicInfo.
*
* We support interleaved channels as described by the JxlPixelFormat:
* - single-channel data, e.g. grayscale

View File

@@ -111,6 +111,43 @@ typedef struct {
size_t align;
} JxlPixelFormat;
/** Settings for the interpretation of the input and output buffers.
*/
typedef enum {
/** This is the default setting, where the encoder expects the input pixels
* to use the full range of the pixel format data type (e.g. for UINT16, the
* input range is 0 .. 65535 and the value 65535 is mapped to 1.0 when
* converting to float), and the decoder uses the full range to output
* pixels. If the bit depth in the basic info is different from this, the
* encoder expects the values to be rescaled accordingly (e.g multiplied by
* 65535/4095 for a 12-bit image using UINT16 input data type). */
JXL_BIT_DEPTH_FROM_PIXEL_FORMAT = 0,
/** If this setting is selected, the encoder expects the input pixels to be
* in the range defined by the bits_per_sample value of the basic info (e.g.
* for 12-bit images using UINT16 input data types, the allowed range is
* 0 .. 4095 and the value 4095 is mapped to 1.0 when converting to float),
* and the decoder outputs pixels in this range. */
JXL_BIT_DEPTH_FROM_CODESTREAM = 1,
/** This setting can only be used in the decoder to select a custom range for
* pixel output */
JXL_BIT_DEPTH_CUSTOM = 2,
} JxlBitDepthType;
/** Data type for describing the interpretation of the input and output buffers
* in terms of the range of allowed input and output pixel values. */
typedef struct {
/** Bit depth setting, see comment on @ref JxlBitDepthType */
JxlBitDepthType type;
/** Custom bits per sample */
uint32_t bits_per_sample;
/** Custom exponent bits per sample */
uint32_t exponent_bits_per_sample;
} JxlBitDepth;
/** Data type holding the 4-character type name of an ISOBMFF box.
*/
typedef char JxlBoxType[4];

View File

@@ -447,6 +447,9 @@ else ()
)
endif ()
# Generate version.h
configure_file("jxl/version.h.in" "include/jxl/version.h")
# Headers for exporting/importing public headers
include(GenerateExportHeader)
set_target_properties(jxl_dec-obj PROPERTIES

View File

@@ -1,202 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/jxl/codec_y4m_testonly.h"
#include <stddef.h>
namespace jxl {
namespace test {
struct HeaderY4M {
size_t xsize;
size_t ysize;
size_t bits_per_sample;
int is_yuv; // Y4M: where 1 = 444, 2 = 422, 3 = 420
};
// Decode Y4M images.
class Y4MParser {
public:
explicit Y4MParser(const Span<const uint8_t> bytes)
: pos_(bytes.data()), end_(pos_ + bytes.size()) {}
// TODO(jon): support multi-frame y4m
Status ParseHeader(HeaderY4M* header, const uint8_t** pos) {
JXL_RETURN_IF_ERROR(ExpectString("YUV4MPEG2", 9));
header->is_yuv = 3;
// TODO(jon): check if 4:2:0 is indeed the default
header->bits_per_sample = 8;
// TODO(jon): check if there's a y4m convention for higher bit depths
while (pos_ < end_) {
char next = 0;
JXL_RETURN_IF_ERROR(ReadChar(&next));
if (next == 0x0A) break;
if (next != ' ') continue;
char field = 0;
JXL_RETURN_IF_ERROR(ReadChar(&field));
switch (field) {
case 'W':
JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
break;
case 'H':
JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
break;
case 'I':
JXL_RETURN_IF_ERROR(ReadChar(&next));
if (next != 'p') {
return JXL_FAILURE(
"Y4M: only progressive (no frame interlacing) allowed");
}
break;
case 'C': {
char c1 = 0;
JXL_RETURN_IF_ERROR(ReadChar(&c1));
char c2 = 0;
JXL_RETURN_IF_ERROR(ReadChar(&c2));
char c3 = 0;
JXL_RETURN_IF_ERROR(ReadChar(&c3));
if (c1 != '4') return JXL_FAILURE("Y4M: invalid C param");
if (c2 == '4') {
if (c3 != '4') return JXL_FAILURE("Y4M: invalid C param");
header->is_yuv = 1; // 444
} else if (c2 == '2') {
if (c3 == '2') {
header->is_yuv = 2; // 422
} else if (c3 == '0') {
header->is_yuv = 3; // 420
} else {
return JXL_FAILURE("Y4M: invalid C param");
}
} else {
return JXL_FAILURE("Y4M: invalid C param");
}
}
[[fallthrough]];
// no break: fallthrough because this field can have values like
// "C420jpeg" (we are ignoring the chroma sample location and treat
// everything like C420jpeg)
case 'F': // Framerate in fps as numerator:denominator
// TODO(jon): actually read this and set corresponding jxl
// metadata
case 'A': // Pixel aspect ratio (ignoring it, could perhaps adjust
// intrinsic dimensions based on this?)
case 'X': // Comment, ignore
// ignore the field value and go to next one
while (pos_ < end_) {
if (pos_[0] == ' ' || pos_[0] == 0x0A) break;
pos_++;
}
break;
default:
return JXL_FAILURE("Y4M: parse error");
}
}
JXL_RETURN_IF_ERROR(ExpectString("FRAME", 5));
while (true) {
char next = 0;
JXL_RETURN_IF_ERROR(ReadChar(&next));
if (next == 0x0A) {
*pos = pos_;
return true;
}
}
}
private:
Status ExpectString(const char* str, size_t len) {
// Unlikely to happen.
if (pos_ + len < pos_) return JXL_FAILURE("Y4M: overflow");
if (pos_ + len > end_ || strncmp(str, (const char*)pos_, len) != 0) {
return JXL_FAILURE("Y4M: expected %s", str);
}
pos_ += len;
return true;
}
Status ReadChar(char* out) {
// Unlikely to happen.
if (pos_ + 1 < pos_) return JXL_FAILURE("Y4M: overflow");
if (pos_ >= end_) {
return JXL_FAILURE("Y4M: unexpected end of input");
}
*out = *pos_;
pos_++;
return true;
}
static bool IsDigit(const uint8_t c) { return '0' <= c && c <= '9'; }
Status ParseUnsigned(size_t* number) {
if (pos_ == end_) return JXL_FAILURE("PNM: reached end before number");
if (!IsDigit(*pos_)) return JXL_FAILURE("PNM: expected unsigned number");
*number = 0;
while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
*number *= 10;
*number += *pos_ - '0';
++pos_;
}
return true;
}
const uint8_t* pos_;
const uint8_t* const end_;
};
Status DecodeImageY4M(const Span<const uint8_t> bytes, CodecInOut* io) {
Y4MParser parser(bytes);
HeaderY4M header = {};
const uint8_t* pos = nullptr;
JXL_RETURN_IF_ERROR(parser.ParseHeader(&header, &pos));
Image3F yuvdata(header.xsize, header.ysize);
ImageBundle bundle(&io->metadata.m);
const int hshift[3][3] = {{0, 0, 0}, {0, 1, 1}, {0, 1, 1}};
const int vshift[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 1, 1}};
for (size_t c = 0; c < 3; c++) {
for (size_t y = 0; y < header.ysize >> vshift[header.is_yuv - 1][c]; ++y) {
float* const JXL_RESTRICT row = yuvdata.PlaneRow((c == 2 ? 2 : 1 - c), y);
if (pos + (header.xsize >> hshift[header.is_yuv - 1][c]) >
bytes.data() + bytes.size())
return JXL_FAILURE("Not enough image data");
for (size_t x = 0; x < header.xsize >> hshift[header.is_yuv - 1][c];
++x) {
row[x] = (1.f / 255.f) * ((*pos++) - 128.f);
}
}
}
bundle.SetFromImage(std::move(yuvdata), io->metadata.m.color_encoding);
bundle.color_transform = ColorTransform::kYCbCr;
YCbCrChromaSubsampling subsampling;
uint8_t cssh[3] = {
2, static_cast<uint8_t>(hshift[header.is_yuv - 1][1] ? 1 : 2),
static_cast<uint8_t>(hshift[header.is_yuv - 1][2] ? 1 : 2)};
uint8_t cssv[3] = {
2, static_cast<uint8_t>(vshift[header.is_yuv - 1][1] ? 1 : 2),
static_cast<uint8_t>(vshift[header.is_yuv - 1][2] ? 1 : 2)};
JXL_RETURN_IF_ERROR(subsampling.Set(cssh, cssv));
bundle.chroma_subsampling = subsampling;
io->Main() = std::move(bundle);
JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetSRGB(ColorSpace::kRGB));
io->metadata.m.SetUintSamples(header.bits_per_sample);
io->metadata.m.SetAlphaBits(0);
io->dec_pixels = header.xsize * header.ysize;
io->metadata.m.bit_depth.bits_per_sample = io->Main().DetectRealBitdepth();
io->SetSize(header.xsize, header.ysize);
SetIntensityTarget(io);
return true;
}
} // namespace test
} // namespace jxl

View File

@@ -1,18 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdint.h>
#include "lib/jxl/base/padded_bytes.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/codec_in_out.h"
namespace jxl {
namespace test {
Status DecodeImageY4M(const Span<const uint8_t> bytes, CodecInOut* io);
} // namespace test
} // namespace jxl

View File

@@ -249,7 +249,7 @@ Status CreateICCHeader(const ColorEncoding& c,
WriteICCUint32(0, 0, header); // size, correct value filled in at end
WriteICCTag(kCmm, 4, header);
WriteICCUint32(0x04300000u, 8, header);
WriteICCUint32(0x04400000u, 8, header);
const char* profile_type =
c.GetColorSpace() == ColorSpace::kXYB ? "scnr" : "mntr";
WriteICCTag(profile_type, 12, header);
@@ -339,6 +339,44 @@ Status CreateICCChadTag(float chad[9], PaddedBytes* JXL_RESTRICT tags) {
return true;
}
void MaybeCreateICCCICPTag(const ColorEncoding& c,
PaddedBytes* JXL_RESTRICT tags, size_t* offset,
size_t* size, PaddedBytes* JXL_RESTRICT tagtable,
std::vector<size_t>* offsets) {
if (c.GetColorSpace() != ColorSpace::kRGB) {
return;
}
uint8_t primaries = 0;
if (c.primaries == Primaries::kP3) {
if (c.white_point == WhitePoint::kD65) {
primaries = 12;
} else if (c.white_point == WhitePoint::kDCI) {
primaries = 11;
} else {
return;
}
} else if (c.primaries != Primaries::kCustom &&
c.white_point == WhitePoint::kD65) {
primaries = static_cast<uint8_t>(c.primaries);
} else {
return;
}
if (c.tf.IsUnknown() || c.tf.IsGamma()) {
return;
}
WriteICCTag("cicp", tags->size(), tags);
WriteICCUint32(0, tags->size(), tags);
WriteICCUint8(primaries, tags->size(), tags);
WriteICCUint8(static_cast<uint8_t>(c.tf.GetTransferFunction()), tags->size(),
tags);
// Matrix
WriteICCUint8(0, tags->size(), tags);
// Full range
WriteICCUint8(1, tags->size(), tags);
FinalizeICCTag(tags, offset, size);
AddToICCTagTable("cicp", *offset, *size, tagtable, offsets);
}
void CreateICCCurvCurvTag(const std::vector<uint16_t>& curve,
PaddedBytes* JXL_RESTRICT tags) {
size_t pos = tags->size();
@@ -351,6 +389,7 @@ void CreateICCCurvCurvTag(const std::vector<uint16_t>& curve,
}
}
// Writes 12 + 4*params.size() bytes
Status CreateICCCurvParaTag(std::vector<float> params, size_t curve_type,
PaddedBytes* JXL_RESTRICT tags) {
WriteICCTag("para", tags->size(), tags);
@@ -365,30 +404,50 @@ Status CreateICCCurvParaTag(std::vector<float> params, size_t curve_type,
Status CreateICCLutAtoBTagForXYB(PaddedBytes* JXL_RESTRICT tags) {
WriteICCTag("mAB ", tags->size(), tags);
// 4 reserved bytes set to 0
WriteICCUint32(0, tags->size(), tags);
// number of input channels
WriteICCUint8(3, tags->size(), tags);
// number of output channels
WriteICCUint8(3, tags->size(), tags);
// 2 reserved bytes for padding
WriteICCUint16(0, tags->size(), tags);
WriteICCUint32(316, tags->size(), tags);
WriteICCUint32(268, tags->size(), tags);
WriteICCUint32(148, tags->size(), tags);
WriteICCUint32(80, tags->size(), tags);
// offset to first B curve
WriteICCUint32(32, tags->size(), tags);
// offset to matrix
WriteICCUint32(244, tags->size(), tags);
// offset to first M curve
WriteICCUint32(148, tags->size(), tags);
// offset to CLUT
WriteICCUint32(80, tags->size(), tags);
// offset to first A curve
// (reuse linear B curves)
WriteICCUint32(32, tags->size(), tags);
// offset = 32
// no-op curves
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
// offset = 80
// number of grid points for each input channel
for (int i = 0; i < 16; ++i) {
WriteICCUint8(i < 3 ? 2 : 0, tags->size(), tags);
}
// precision = 2
WriteICCUint8(2, tags->size(), tags);
WriteICCUint8(2, tags->size(), tags);
WriteICCUint8(2, tags->size(), tags);
WriteICCUint8(0, tags->size(), tags);
WriteICCUint32(0, tags->size(), tags);
WriteICCUint32(0, tags->size(), tags);
WriteICCUint32(0, tags->size(), tags);
WriteICCUint8(2, tags->size(), tags);
// 3 bytes of padding
WriteICCUint8(0, tags->size(), tags);
WriteICCUint16(0, tags->size(), tags);
const float kOffsets[3] = {0.015387, 0.028101, 0.277706};
const float kScaling[3] = {1.125, 1.125, 1. / 1.511027};
const float kOffsets[3] = {
kScaledXYBOffset[0] + kScaledXYBOffset[1],
kScaledXYBOffset[1] - kScaledXYBOffset[0] + 1.0f / kScaledXYBScale[0],
kScaledXYBOffset[1] + kScaledXYBOffset[2]};
const float kScaling[3] = {
1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]),
1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]),
1.0f / (1.0f / kScaledXYBScale[1] + 1.0f / kScaledXYBScale[2])};
// 2*2*2*3 entries of 2 bytes each = 48 bytes
for (size_t ix = 0; ix < 2; ++ix) {
for (size_t iy = 0; iy < 2; ++iy) {
for (size_t ib = 0; ib < 2; ++ib) {
@@ -414,6 +473,8 @@ Status CreateICCLutAtoBTagForXYB(PaddedBytes* JXL_RESTRICT tags) {
}
}
}
// offset = 148
// 3 curves with 5 parameters = 3 * (12 + 5 * 4) = 96 bytes
for (size_t i = 0; i < 3; ++i) {
const float b =
-kOffsets[i] - std::cbrt(jxl::kNegOpsinAbsorbanceBiasRGB[i]);
@@ -423,23 +484,24 @@ Status CreateICCLutAtoBTagForXYB(PaddedBytes* JXL_RESTRICT tags) {
b,
0, // unused
std::max(0.f, -b * kScaling[i]), // make skcms happy
jxl::kNegOpsinAbsorbanceBiasRGB[i],
0, // unused
};
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(params, 4, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(params, 3, tags));
}
// offset = 244
const double matrix[] = {1.5170095, -1.1065225, 0.071623,
-0.050022, 0.5683655, -0.018344,
-1.387676, 1.1145555, 0.6857255};
// 12 * 4 = 48 bytes
for (size_t i = 0; i < 9; ++i) {
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(matrix[i], tags->size(), tags));
}
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(0.0f, tags->size(), tags));
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(0.0f, tags->size(), tags));
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(0.0f, tags->size(), tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
for (size_t i = 0; i < 3; ++i) {
float intercept = 0;
for (size_t j = 0; j < 3; ++j) {
intercept += matrix[i * 3 + j] * jxl::kNegOpsinAbsorbanceBiasRGB[j];
}
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(intercept, tags->size(), tags));
}
return true;
}
} // namespace
@@ -481,9 +543,7 @@ Status MaybeCreateProfile(const ColorEncoding& c,
FinalizeICCTag(&tags, &tag_offset, &tag_size);
AddToICCTagTable("desc", tag_offset, tag_size, &tagtable, &offsets);
const std::string copyright =
"Copyright 2019 Google LLC, CC-BY-SA 3.0 Unported "
"license(https://creativecommons.org/licenses/by-sa/3.0/legalcode)";
const std::string copyright = "CC0";
CreateICCMlucTag(copyright, &tags);
FinalizeICCTag(&tags, &tag_offset, &tag_size);
AddToICCTagTable("cprt", tag_offset, tag_size, &tagtable, &offsets);
@@ -511,6 +571,9 @@ Status MaybeCreateProfile(const ColorEncoding& c,
}
if (c.GetColorSpace() == ColorSpace::kRGB) {
MaybeCreateICCCICPTag(c, &tags, &tag_offset, &tag_size, &tagtable,
&offsets);
const PrimariesCIExy primaries = c.GetPrimaries();
float m[9];
JXL_RETURN_IF_ERROR(CreateICCRGBMatrix(primaries.r, primaries.g,

View File

@@ -392,7 +392,7 @@ TEST_F(ColorManagementTest, XYBProfile) {
}
}
}
static float kMaxError[3] = {8.5e-4, 4e-4, 5e-4};
static float kMaxError[3] = {9e-4, 4e-4, 5e-4};
printf("Maximum errors:\n");
for (size_t c = 0; c < 3; ++c) {
debug_print_color(max_err_i[c]);

View File

@@ -144,10 +144,11 @@ Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
frame_header.save_before_color_transform);
JXL_ASSERT(!options.render_spotcolors ||
!decoded->metadata()->Find(ExtraChannel::kSpotColor));
bool is_rgba = (format.num_channels == 4);
uint8_t* rgb_output = reinterpret_cast<uint8_t*>(image_buffer);
builder.AddStage(GetFastXYBTosRGB8Stage(rgb_output, stride, width, height,
is_rgba, has_alpha, alpha_c));
bool is_rgba = (main_output.format.num_channels == 4);
uint8_t* rgb_output = reinterpret_cast<uint8_t*>(main_output.buffer);
builder.AddStage(GetFastXYBTosRGB8Stage(rgb_output, main_output.stride,
width, height, is_rgba, has_alpha,
alpha_c));
} else {
bool linear = false;
if (frame_header.color_transform == ColorTransform::kYCbCr) {
@@ -212,10 +213,10 @@ Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
linear = false;
}
if (pixel_callback.IsPresent() || image_buffer) {
builder.AddStage(GetWriteToOutputStage(
pixel_callback, image_buffer, width, height, stride, format,
has_alpha, unpremul_alpha, alpha_c, undo_orientation));
if (main_output.callback.IsPresent() || main_output.buffer) {
builder.AddStage(GetWriteToOutputStage(main_output, width, height,
has_alpha, unpremul_alpha, alpha_c,
undo_orientation, extra_output));
} else {
builder.AddStage(GetWriteToImageBundleStage(
decoded, output_encoding_info.color_encoding));

View File

@@ -56,6 +56,20 @@ struct PixelCallback {
void* init_opaque = nullptr;
};
struct ImageOutput {
// Pixel format of the output pixels, used for buffer and callback output.
JxlPixelFormat format;
// Output bit depth for unsigned data types, used for float to int conversion.
size_t bits_per_sample;
// Callback for line-by-line output.
PixelCallback callback;
// Pixel buffer for image output.
void* buffer;
size_t buffer_size;
// Length of a row of image_buffer in bytes (based on oriented width).
size_t stride;
};
// Per-frame decoder state. All the images here should be accessed through a
// group rect (either with block units or pixel units).
struct PassesDecoderState {
@@ -77,17 +91,11 @@ struct PassesDecoderState {
// Sigma values for EPF.
ImageF sigma;
// Pixel buffer for image output.
void* image_buffer;
// Image dimensions before applying undo_orientation.
size_t width;
size_t height;
// Length of a row of image_buffer in bytes (based on oriented width).
size_t stride;
// Callback for line-by-line output.
PixelCallback pixel_callback;
// Pixel format of the output pixels, used for buffer and callback output.
JxlPixelFormat format;
ImageOutput main_output;
std::vector<ImageOutput> extra_output;
// Whether to use int16 float-XYB-to-uint8-srgb conversion.
bool fast_xyb_srgb8_conversion;
@@ -134,8 +142,9 @@ struct PassesDecoderState {
b_dm_multiplier =
std::pow(1 / (1.25f), shared->frame_header.b_qm_scale - 2.0f);
pixel_callback = PixelCallback();
image_buffer = nullptr;
main_output.callback = PixelCallback();
main_output.buffer = nullptr;
extra_output.clear();
fast_xyb_srgb8_conversion = false;
unpremul_alpha = false;

View File

@@ -173,28 +173,22 @@ class FrameDecoder {
}
// Sets the pixel callback or image buffer where the pixels will be decoded.
// This is not supported for all images. If it succeeds, HasRGBBuffer() will
// return true.
// If it does not succeed, the image is decoded to the ImageBundle passed to
// InitFrame instead.
//
// @param undo_orientation: if true, indicates the frame decoder should apply
// the exif orientation to bring the image to the intended display
// orientation.
void SetImageOutput(const PixelCallback& pixel_callback, void* image_buffer,
size_t xsize, size_t ysize, JxlPixelFormat format,
size_t image_buffer_size, size_t xsize, size_t ysize,
JxlPixelFormat format, size_t bits_per_sample,
bool unpremul_alpha, bool undo_orientation) const {
dec_state_->pixel_callback = pixel_callback;
dec_state_->image_buffer = image_buffer;
dec_state_->width = xsize;
dec_state_->height = ysize;
dec_state_->format = format;
dec_state_->stride =
(xsize * BytesPerChannel(format.data_type) * format.num_channels);
if (format.align > 1) {
dec_state_->stride =
(jxl::DivCeil(dec_state_->stride, format.align) * format.align);
}
dec_state_->main_output.format = format;
dec_state_->main_output.bits_per_sample = bits_per_sample;
dec_state_->main_output.callback = pixel_callback;
dec_state_->main_output.buffer = image_buffer;
dec_state_->main_output.buffer_size = image_buffer_size;
dec_state_->main_output.stride = GetStride(xsize, format);
const jxl::ExtraChannelInfo* alpha =
decoded_->metadata()->Find(jxl::ExtraChannel::kAlpha);
if (alpha && alpha->alpha_associated && unpremul_alpha) {
@@ -206,9 +200,11 @@ class FrameDecoder {
std::swap(dec_state_->width, dec_state_->height);
}
}
dec_state_->extra_output.clear();
#if !JXL_HIGH_PRECISION
if (dec_state_->image_buffer && (format.data_type == JXL_TYPE_UINT8) &&
(format.num_channels >= 3) && !dec_state_->unpremul_alpha &&
if (dec_state_->main_output.buffer &&
(format.data_type == JXL_TYPE_UINT8) && (format.num_channels >= 3) &&
!dec_state_->unpremul_alpha &&
(dec_state_->undo_orientation == Orientation::kIdentity) &&
decoded_->metadata()->xyb_encoded &&
dec_state_->output_encoding_info.color_encoding.IsSRGB() &&
@@ -221,12 +217,15 @@ class FrameDecoder {
#endif
}
// Returns true if the rgb output buffer passed by MaybeSetRGB8OutputBuffer
// has been/will be populated by Flush() / FinalizeFrame(), or if a pixel
// callback has been used.
bool HasRGBBuffer() const {
return dec_state_->image_buffer != nullptr ||
dec_state_->pixel_callback.IsPresent();
void AddExtraChannelOutput(void* buffer, size_t buffer_size, size_t xsize,
JxlPixelFormat format, size_t bits_per_sample) {
ImageOutput out;
out.format = format;
out.bits_per_sample = bits_per_sample;
out.buffer = buffer;
out.buffer_size = buffer_size;
out.stride = GetStride(xsize, format);
dec_state_->extra_output.push_back(out);
}
private:
@@ -273,6 +272,15 @@ class FrameDecoder {
: 2u);
}
static size_t GetStride(const size_t xsize, JxlPixelFormat format) {
size_t stride =
(xsize * BytesPerChannel(format.data_type) * format.num_channels);
if (format.align > 1) {
stride = (jxl::DivCeil(stride, format.align) * format.align);
}
return stride;
}
PassesDecoderState* dec_state_;
ThreadPool* pool_;
std::vector<TocEntry> toc_;

View File

@@ -65,12 +65,13 @@ class Rec2408ToneMapper {
Mul(Set(df_, 10000), TF_PQ().DisplayFromEncoded(df_, e4))));
const V ratio = Div(new_luminance, luminance);
const V inv_target_peak = Set(df_, inv_target_peak_);
const V normalizer = Set(df_, normalizer_);
const V multiplier = Mul(ratio, normalizer);
for (V* const val : {red, green, blue}) {
*val = Mul(IfThenElse(Le(luminance, Set(df_, 1e-6f)), new_luminance,
Mul(*val, ratio)),
normalizer);
*val = IfThenElse(Le(luminance, Set(df_, 1e-6f)),
Mul(new_luminance, inv_target_peak),
Mul(*val, multiplier));
}
}
@@ -98,8 +99,8 @@ class Rec2408ToneMapper {
ks,
MulAdd(Add(t_b_3, MulAdd(Set(df_, -2), t_b_2, t_b)),
Sub(Set(df_, 1), ks),
MulAdd(Set(df_, -2), t_b_3,
Mul(Mul(Set(df_, 3), t_b_2), max_lum))));
Mul(MulAdd(Set(df_, -2), t_b_3, Mul(Set(df_, 3), t_b_2)),
max_lum)));
}
D df_;
@@ -125,6 +126,7 @@ class Rec2408ToneMapper {
const float inv_one_minus_ks_ = 1.0f / std::max(1e-6f, 1.0f - ks_);
const float normalizer_ = source_range_.second / target_range_.second;
const float inv_target_peak_ = 1.f / target_range_.second;
};
class HlgOOTF {

View File

@@ -144,6 +144,20 @@ size_t BitsPerChannel(JxlDataType data_type) {
}
}
template <typename T>
uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata,
JxlPixelFormat format) {
if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
return BitsPerChannel(format.data_type);
} else if (bit_depth.type == JXL_BIT_DEPTH_FROM_CODESTREAM) {
return metadata.bit_depth.bits_per_sample;
} else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
return bit_depth.bits_per_sample;
} else {
return 0;
}
}
enum class DecoderStage : uint32_t {
kInited, // Decoder created, no JxlDecoderProcessInput called yet
kStarted, // Running JxlDecoderProcessInput calls
@@ -415,6 +429,7 @@ struct JxlDecoderStruct {
size_t image_out_size;
JxlPixelFormat image_out_format;
JxlBitDepth image_out_bit_depth;
// For extra channels. Empty if no extra channels are requested, and they are
// reset each frame
@@ -701,6 +716,7 @@ void JxlDecoderRewindDecodingState(JxlDecoder* dec) {
dec->image_out_destroy_callback = nullptr;
dec->image_out_init_opaque = nullptr;
dec->image_out_size = 0;
dec->image_out_bit_depth.type = JXL_BIT_DEPTH_FROM_PIXEL_FORMAT;
dec->extra_channel_output.clear();
dec->dec_pixels = 0;
dec->next_in = 0;
@@ -1072,93 +1088,6 @@ JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec) {
return JXL_DEC_SUCCESS;
}
static size_t GetStride(const JxlDecoder* dec, const JxlPixelFormat& format) {
size_t xsize, ysize;
GetCurrentDimensions(dec, xsize, ysize);
size_t stride = xsize * (BitsPerChannel(format.data_type) *
format.num_channels / jxl::kBitsPerByte);
if (format.align > 1) {
stride = jxl::DivCeil(stride, format.align) * format.align;
}
return stride;
}
// Internal wrapper around jxl::ConvertToExternal which converts the stride,
// format and orientation and allows to choose whether to get all RGB(A)
// channels or alternatively get a single extra channel.
// If want_extra_channel, a valid index to a single extra channel must be
// given, the output must be single-channel, and format.num_channels is ignored
// and treated as if it is 1.
static JxlDecoderStatus ConvertImageInternal(
const JxlDecoder* dec, const jxl::ImageBundle& frame,
const JxlPixelFormat& format, bool want_extra_channel,
size_t extra_channel_index, void* out_image, size_t out_size,
const PixelCallback& out_callback) {
// TODO(lode): handle mismatch of RGB/grayscale color profiles and pixel data
// color/grayscale format
const size_t stride = GetStride(dec, format);
bool float_format = format.data_type == JXL_TYPE_FLOAT ||
format.data_type == JXL_TYPE_FLOAT16;
jxl::Orientation undo_orientation = dec->keep_orientation
? jxl::Orientation::kIdentity
: dec->metadata.m.GetOrientation();
jxl::Status status(true);
if (want_extra_channel) {
JXL_ASSERT(extra_channel_index < frame.extra_channels().size());
status = jxl::ConvertToExternal(frame.extra_channels()[extra_channel_index],
BitsPerChannel(format.data_type),
float_format, format.endianness, stride,
dec->thread_pool.get(), out_image, out_size,
out_callback, undo_orientation);
} else {
status = jxl::ConvertToExternal(
frame, BitsPerChannel(format.data_type), float_format,
format.num_channels, format.endianness, stride, dec->thread_pool.get(),
out_image, out_size, out_callback, undo_orientation,
dec->unpremul_alpha);
}
return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR;
}
// Outputs the preview or full image (including extra channels) in the internal
// image bundle to the image buffers and/or image callback provided through the
// API.
// TODO(szabadka) Handle all these cases in the low-memory code-path and remove
// this function.
JxlDecoderStatus JxlDecoderOutputImage(JxlDecoder* dec) {
if (!dec->frame_dec->HasRGBBuffer()) {
JxlDecoderStatus status = ConvertImageInternal(
dec, *dec->ib, dec->image_out_format,
/*want_extra_channel=*/false,
/*extra_channel_index=*/0, dec->image_out_buffer, dec->image_out_size,
PixelCallback{dec->image_out_init_callback, dec->image_out_run_callback,
dec->image_out_destroy_callback,
dec->image_out_init_opaque});
if (status != JXL_DEC_SUCCESS) return status;
}
bool has_ec = !dec->ib->extra_channels().empty();
for (size_t i = 0; i < dec->extra_channel_output.size(); ++i) {
void* buffer = dec->extra_channel_output[i].buffer;
// buffer nullptr indicates this extra channel is not requested
if (!buffer) continue;
if (!has_ec) {
JXL_WARNING("Extra channels are not supported when callback is used");
return JXL_DEC_ERROR;
}
const JxlPixelFormat* format = &dec->extra_channel_output[i].format;
JxlDecoderStatus status = ConvertImageInternal(
dec, *dec->ib, *format,
/*want_extra_channel=*/true, /*extra_channel_index=*/i, buffer,
dec->extra_channel_output[i].buffer_size, /*out_callback=*/{});
if (status != JXL_DEC_SUCCESS) return status;
}
return JXL_DEC_SUCCESS;
}
JxlDecoderStatus JxlDecoderProcessSections(JxlDecoder* dec) {
Span<const uint8_t> span;
JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
@@ -1463,15 +1392,27 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
}
}
if (dec->image_out_buffer_set && dec->extra_channel_output.empty()) {
if (dec->image_out_buffer_set) {
size_t xsize, ysize;
GetCurrentDimensions(dec, xsize, ysize);
size_t bits_per_sample = GetBitDepth(
dec->image_out_bit_depth, dec->metadata.m, dec->image_out_format);
dec->frame_dec->SetImageOutput(
PixelCallback{
dec->image_out_init_callback, dec->image_out_run_callback,
dec->image_out_destroy_callback, dec->image_out_init_opaque},
reinterpret_cast<uint8_t*>(dec->image_out_buffer), xsize, ysize,
dec->image_out_format, dec->unpremul_alpha, !dec->keep_orientation);
reinterpret_cast<uint8_t*>(dec->image_out_buffer),
dec->image_out_size, xsize, ysize, dec->image_out_format,
bits_per_sample, dec->unpremul_alpha, !dec->keep_orientation);
for (size_t i = 0; i < dec->extra_channel_output.size(); ++i) {
const auto& extra = dec->extra_channel_output[i];
size_t ec_bits_per_sample =
GetBitDepth(dec->image_out_bit_depth,
dec->metadata.m.extra_channel_info[i], extra.format);
dec->frame_dec->AddExtraChannelOutput(extra.buffer, extra.buffer_size,
xsize, extra.format,
ec_bits_per_sample);
}
}
size_t next_num_passes_to_pause = dec->frame_dec->NextNumPassesToPause();
@@ -1527,9 +1468,6 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
}
if (dec->preview_frame || dec->is_last_of_still) {
if (dec->image_out_buffer_set) {
JXL_API_RETURN_IF_ERROR(JxlDecoderOutputImage(dec));
}
dec->image_out_buffer_set = false;
dec->extra_channel_output.clear();
}
@@ -2347,11 +2285,7 @@ JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
return JXL_DEC_ERROR;
}
if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) {
return JXL_DEC_SUCCESS;
}
return jxl::JxlDecoderOutputImage(dec);
return JXL_DEC_SUCCESS;
}
JXL_EXPORT JxlDecoderStatus JxlDecoderPreviewOutBufferSize(
@@ -2809,3 +2743,41 @@ JxlDecoderStatus JxlDecoderSetProgressiveDetail(JxlDecoder* dec,
dec->prog_detail = detail;
return JXL_DEC_SUCCESS;
}
namespace {
template <typename T>
JxlDecoderStatus VerifyOutputBitDepth(JxlBitDepth bit_depth, const T& metadata,
JxlPixelFormat format) {
if ((format.data_type == JXL_TYPE_FLOAT ||
format.data_type == JXL_TYPE_FLOAT16) &&
bit_depth.type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
return JXL_API_ERROR(
"Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT is implemented "
"for float types.");
}
uint32_t bits_per_sample = GetBitDepth(bit_depth, metadata, format);
if (format.data_type == JXL_TYPE_UINT8 &&
(bits_per_sample == 0 || bits_per_sample > 8)) {
return JXL_API_ERROR("Inavlid bit depth %u for uint8 output",
bits_per_sample);
} else if (format.data_type == JXL_TYPE_UINT16 &&
(bits_per_sample == 0 || bits_per_sample > 16)) {
return JXL_API_ERROR("Inavlid bit depth %u for uint16 output",
bits_per_sample);
}
return JXL_DEC_SUCCESS;
}
} // namespace
JxlDecoderStatus JxlDecoderSetImageOutBitDepth(JxlDecoder* dec,
const JxlBitDepth* bit_depth) {
if (!dec->image_out_buffer_set) {
return JXL_API_ERROR("No image out buffer was set.");
}
JXL_API_RETURN_IF_ERROR(
VerifyOutputBitDepth(*bit_depth, dec->metadata.m, dec->image_out_format));
dec->image_out_bit_depth = *bit_depth;
return JXL_DEC_SUCCESS;
}

View File

@@ -259,13 +259,15 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
if (params.intensity_target != 0) {
io.metadata.m.SetIntensityTarget(params.intensity_target);
}
JxlPixelFormat format = {static_cast<uint32_t>(num_channels), JXL_TYPE_UINT16,
JXL_BIG_ENDIAN, 0};
// Make the grayscale-ness of the io metadata color_encoding and the packed
// image match.
io.metadata.m.color_encoding = color_encoding;
EXPECT_TRUE(ConvertFromExternal(
pixels, xsize, ysize, color_encoding, num_channels,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
&pool, &io.Main(), /*float_in=*/false, /*align=*/0));
EXPECT_TRUE(ConvertFromExternal(pixels, xsize, ysize, color_encoding,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format, &pool,
&io.Main()));
jxl::PaddedBytes jpeg_data;
if (params.jpeg_codestream != nullptr) {
#if JPEGXL_ENABLE_JPEG
@@ -1334,11 +1336,9 @@ TEST_P(DecodeTestParam, PixelTest) {
io.SetSize(config.xsize, config.ysize);
EXPECT_TRUE(ConvertFromExternal(bytes, config.xsize, config.ysize,
color_encoding, orig_channels,
color_encoding,
/*alpha_is_premultiplied=*/false, 16,
JXL_BIG_ENDIAN, nullptr, &io.Main(),
/*float_in=*/false,
/*align=*/0));
format_orig, nullptr, &io.Main()));
for (size_t i = 0; i < pixels.size(); i++) pixels[i] = 0;
EXPECT_TRUE(ConvertToExternal(
@@ -1448,8 +1448,6 @@ std::vector<PixelTestConfig> GeneratePixelTests() {
// Test previews.
for (int preview_mode = 0; preview_mode < jxl::kNumPreviewModes;
preview_mode++) {
// if (preview_mode == jxl::kBigPreview &&
// ch_info[0].output_channels != 3) continue;
make_test(ch_info[0], 77, 33, (jxl::PreviewMode)preview_mode,
/*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
JXL_ORIENT_IDENTITY,
@@ -1664,12 +1662,10 @@ TEST(DecodeTest, PixelTestWithICCProfileLossy) {
jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
jxl::CodecInOut io0;
io0.SetSize(xsize, ysize);
EXPECT_TRUE(
ConvertFromExternal(span0, xsize, ysize, color_encoding0, /*channels=*/3,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format_orig.endianness,
/*pool=*/nullptr, &io0.Main(), /*float_in=*/false,
/*align=*/0));
EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format_orig,
/*pool=*/nullptr, &io0.Main()));
jxl::ColorEncoding color_encoding1;
EXPECT_TRUE(color_encoding1.SetICC(std::move(icc)));
@@ -1677,15 +1673,14 @@ TEST(DecodeTest, PixelTestWithICCProfileLossy) {
jxl::CodecInOut io1;
io1.SetSize(xsize, ysize);
EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
channels, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/32, format.endianness,
/*pool=*/nullptr, &io1.Main(),
/*float_in=*/true, /*align=*/0));
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/32, format,
/*pool=*/nullptr, &io1.Main()));
jxl::ButteraugliParams ba;
EXPECT_THAT(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
/*distmap=*/nullptr, nullptr),
IsSlightlyBelow(0.785f));
IsSlightlyBelow(0.85f));
JxlDecoderDestroy(dec);
}
@@ -1722,21 +1717,25 @@ double ButteraugliDistance(size_t xsize, size_t ysize,
jxl::CodecInOut in;
in.metadata.m.color_encoding = color_in;
in.metadata.m.SetIntensityTarget(intensity_in);
JxlPixelFormat format_in = {static_cast<uint32_t>(color_in.Channels()),
JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(pixels_in.data(), pixels_in.size()), xsize,
ysize, color_in, color_in.Channels(),
ysize, color_in,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, JXL_BIG_ENDIAN,
/*pool=*/nullptr, &in.Main(), /*float_in=*/false, /*align=*/0));
/*bits_per_sample=*/16, format_in,
/*pool=*/nullptr, &in.Main()));
jxl::CodecInOut out;
out.metadata.m.color_encoding = color_out;
out.metadata.m.SetIntensityTarget(intensity_out);
JxlPixelFormat format_out = {static_cast<uint32_t>(color_out.Channels()),
JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(pixels_out.data(), pixels_out.size()), xsize,
ysize, color_out, color_out.Channels(),
ysize, color_out,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, JXL_BIG_ENDIAN,
/*pool=*/nullptr, &out.Main(), /*float_in=*/false, /*align=*/0));
/*bits_per_sample=*/16, format_out,
/*pool=*/nullptr, &out.Main()));
return ButteraugliDistance(in, out, jxl::ButteraugliParams(),
jxl::GetJxlCms(), nullptr, nullptr);
}
@@ -1926,22 +1925,18 @@ TEST(DecodeTest, PixelTestOpaqueSrgbLossy) {
jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
jxl::CodecInOut io0;
io0.SetSize(xsize, ysize);
EXPECT_TRUE(ConvertFromExternal(
span0, xsize, ysize, color_encoding0, /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
format_orig.endianness,
/*pool=*/nullptr, &io0.Main(), /*float_in=*/false,
/*align=*/0));
EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format_orig,
/*pool=*/nullptr, &io0.Main()));
jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
jxl::CodecInOut io1;
EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
channels, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format.endianness,
/*pool=*/nullptr, &io1.Main(),
/*float_in=*/false,
/*align=*/0));
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format,
/*pool=*/nullptr, &io1.Main()));
jxl::ButteraugliParams ba;
EXPECT_THAT(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
@@ -1982,22 +1977,18 @@ TEST(DecodeTest, PixelTestOpaqueSrgbLossyNoise) {
jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
jxl::CodecInOut io0;
io0.SetSize(xsize, ysize);
EXPECT_TRUE(ConvertFromExternal(
span0, xsize, ysize, color_encoding0, /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
format_orig.endianness,
/*pool=*/nullptr, &io0.Main(), /*float_in=*/false,
/*align=*/0));
EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format_orig,
/*pool=*/nullptr, &io0.Main()));
jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
jxl::CodecInOut io1;
EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
channels, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format.endianness,
/*pool=*/nullptr, &io1.Main(),
/*float_in=*/false,
/*align=*/0));
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format,
/*pool=*/nullptr, &io1.Main()));
jxl::ButteraugliParams ba;
EXPECT_THAT(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
@@ -2362,7 +2353,7 @@ TEST(DecodeTest, DCNotGettableTest) {
TEST(DecodeTest, PreviewTest) {
size_t xsize = 77, ysize = 120;
std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
for (jxl::PreviewMode mode : {jxl::kSmallPreview, jxl::kBigPreview}) {
jxl::TestCodestreamParams params;
params.preview_mode = mode;
@@ -2393,9 +2384,8 @@ TEST(DecodeTest, PreviewTest) {
jxl::CodecInOut io0;
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
c_srgb, 3, /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &io0.Main(),
/*float_in=*/false, /*align=*/0));
c_srgb, /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
format_orig, /*pool=*/nullptr, &io0.Main()));
GeneratePreview(params.preview_mode, &io0.Main());
size_t xsize_preview = io0.Main().xsize();
@@ -2416,9 +2406,9 @@ TEST(DecodeTest, PreviewTest) {
jxl::CodecInOut io1;
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(preview.data(), preview.size()), xsize_preview,
ysize_preview, c_srgb, 3, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, JXL_LITTLE_ENDIAN,
/*pool=*/nullptr, &io1.Main(), /*float_in=*/false, /*align=*/0));
ysize_preview, c_srgb, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format,
/*pool=*/nullptr, &io1.Main()));
jxl::ButteraugliParams ba;
// TODO(lode): this ButteraugliDistance silently returns 0 (dangerous for
@@ -2492,10 +2482,9 @@ TEST(DecodeTest, AnimationTest) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
io.frames.push_back(std::move(bundle));
}
@@ -2596,10 +2585,9 @@ TEST(DecodeTest, AnimationTestStreaming) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
io.frames.push_back(std::move(bundle));
}
@@ -2815,10 +2803,9 @@ TEST(DecodeTest, SkipCurrentFrameTest) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
io.frames.push_back(std::move(bundle));
}
@@ -2930,10 +2917,9 @@ TEST(DecodeTest, SkipFrameTest) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
io.frames.push_back(std::move(bundle));
}
@@ -3067,10 +3053,8 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
jxl::Span<const uint8_t>(frame_internal.data(),
frame_internal.size()),
xsize, ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle_internal,
/*float_in=*/false, /*align=*/0));
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle_internal));
bundle_internal.duration = 0;
bundle_internal.use_for_next_frame = true;
io.frames.push_back(std::move(bundle_internal));
@@ -3083,10 +3067,9 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
jxl::ImageBundle bundle(&io.metadata.m);
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frame.data(), frame.size()), xsize, ysize,
jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
// Create some variation in which frames depend on which.
if (i != 3 && i != 9 && i != 10) {
@@ -3294,10 +3277,8 @@ TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
jxl::Span<const uint8_t>(frame_internal.data(),
frame_internal.size()),
xsize / 2, ysize / 2, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle_internal,
/*float_in=*/false, /*align=*/0));
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle_internal));
bundle_internal.duration = 0;
bundle_internal.use_for_next_frame = true;
bundle_internal.origin = {13, 17};
@@ -3315,10 +3296,9 @@ TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
jxl::ImageBundle bundle(&io.metadata.m);
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = 5 + i;
frame_durations_nc.push_back(5 + i);
frame_durations_c.push_back(5 + i);
@@ -3579,10 +3559,8 @@ TEST(DecodeTest, OrientedCroppedFrameTest) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.origin = {cropx0, cropy0};
bundle.use_for_next_frame = true;
io.frames.push_back(std::move(bundle));
@@ -4659,14 +4637,15 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
}
std::vector<uint8_t> pixels =
jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
jxl::ColorEncoding color_encoding = jxl::ColorEncoding::SRGB(false);
jxl::CodecInOut io;
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
color_encoding, num_channels,
color_encoding,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, JXL_BIG_ENDIAN,
/*pool=*/nullptr, &io.Main(), /*float_in=*/false, /*align=*/0));
/*bits_per_sample=*/16, format,
/*pool=*/nullptr, &io.Main()));
jxl::TestCodestreamParams params;
if (lossless) {
params.cparams.SetLossless();
@@ -4681,7 +4660,6 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
num_channels, params);
JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
for (size_t increment : {(size_t)1, data.size()}) {
printf(
@@ -4782,11 +4760,9 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
jxl::CodecInOut io1;
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(passes[p].data(), passes[p].size()), xsize,
ysize, color_encoding, num_channels,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN,
/*pool=*/nullptr, &io1.Main(), /*float_in=*/false,
/*align=*/0));
ysize, color_encoding,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &io1.Main()));
distances[p] = ButteraugliDistance(io, io1, ba, jxl::GetJxlCms(),
nullptr, nullptr);
if (p == kNumPasses) break;
@@ -4800,7 +4776,7 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
// Verify that the returned pass image is actually not the
// same as the next pass image, by checking that it has a bit
// worse butteraugli score.
EXPECT_LT(distances[next_p] * 1.2f, distances[p]);
EXPECT_LT(distances[next_p] * 1.1f, distances[p]);
p = next_p;
}
}

View File

@@ -733,8 +733,8 @@ ImageF TileDistMap(const ImageF& distmap, int tile_size, int margin,
return tile_distmap;
}
constexpr float kDcQuantPow = 0.57f;
static const float kDcQuant = 1.12f;
constexpr float kDcQuantPow = 0.66f;
static const float kDcQuant = 1.0f;
static const float kAcQuant = 0.8294f;
void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin,
@@ -1037,7 +1037,7 @@ void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
}
float InitialQuantDC(float butteraugli_target) {
const float kDcMul = 2.9; // Butteraugli target where non-linearity kicks in.
const float kDcMul = 1.5; // Butteraugli target where non-linearity kicks in.
const float butteraugli_target_dc = std::max<float>(
0.5f * butteraugli_target,
std::min<float>(butteraugli_target,

View File

@@ -179,9 +179,7 @@ Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
} else {
auto compute_dc_coeffs = [&](int group_index, int /* thread */) {
modular_frame_encoder->AddVarDCTDC(
dc, group_index,
enc_state->cparams.butteraugli_distance >= 2.0f &&
enc_state->cparams.speed_tier < SpeedTier::kFalcon,
dc, group_index, enc_state->cparams.speed_tier < SpeedTier::kFalcon,
enc_state, /*jpeg_transcode=*/false);
};
JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, shared.frame_dim.num_dc_groups,

View File

@@ -84,42 +84,54 @@ void JXL_INLINE LoadFloatRow(float* JXL_RESTRICT row_out, const uint8_t* in,
uint32_t JXL_INLINE Load8(const uint8_t* p) { return *p; }
Status PixelFormatToExternal(const JxlPixelFormat& pixel_format,
size_t* bitdepth, bool* float_in) {
if (pixel_format.data_type == JXL_TYPE_FLOAT) {
*bitdepth = 32;
*float_in = true;
} else if (pixel_format.data_type == JXL_TYPE_FLOAT16) {
*bitdepth = 16;
*float_in = true;
} else if (pixel_format.data_type == JXL_TYPE_UINT8) {
*bitdepth = 8;
*float_in = false;
} else if (pixel_format.data_type == JXL_TYPE_UINT16) {
*bitdepth = 16;
*float_in = false;
} else {
return JXL_FAILURE("unsupported pixel format data type");
size_t JxlDataTypeBytes(JxlDataType data_type) {
switch (data_type) {
case JXL_TYPE_UINT8:
return 1;
case JXL_TYPE_UINT16:
return 2;
case JXL_TYPE_FLOAT16:
return 2;
case JXL_TYPE_FLOAT:
return 4;
default:
return 0;
}
return true;
}
} // namespace
Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
size_t ysize, size_t bits_per_sample,
JxlEndianness endianness, ThreadPool* pool,
ImageF* channel, bool float_in, size_t align) {
// TODO(firsching): Avoid code duplication with the function below.
JXL_CHECK(float_in ? bits_per_sample == 16 || bits_per_sample == 32
: bits_per_sample > 0 && bits_per_sample <= 16);
const size_t bytes_per_pixel = DivCeil(bits_per_sample, jxl::kBitsPerByte);
JxlPixelFormat format, size_t c, ThreadPool* pool,
ImageF* channel) {
if (format.data_type == JXL_TYPE_UINT8) {
JXL_RETURN_IF_ERROR(bits_per_sample > 0 && bits_per_sample <= 8);
} else if (format.data_type == JXL_TYPE_UINT16) {
JXL_RETURN_IF_ERROR(bits_per_sample > 8 && bits_per_sample <= 16);
} else if (format.data_type == JXL_TYPE_FLOAT16) {
JXL_RETURN_IF_ERROR(bits_per_sample == 16);
} else if (format.data_type == JXL_TYPE_FLOAT) {
JXL_RETURN_IF_ERROR(bits_per_sample == 32);
} else {
JXL_FAILURE("unsupported pixel format data type %d", format.data_type);
}
size_t bytes_per_channel = JxlDataTypeBytes(format.data_type);
size_t bytes_per_pixel = format.num_channels * bytes_per_channel;
size_t pixel_offset = c * bytes_per_channel;
const size_t last_row_size = xsize * bytes_per_pixel;
const size_t align = format.align;
const size_t row_size =
(align > 1 ? jxl::DivCeil(last_row_size, align) * align : last_row_size);
const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
if (xsize == 0 || ysize == 0) return JXL_FAILURE("Empty image");
if (bytes.size() < bytes_to_read) {
return JXL_FAILURE("Buffer size is too small");
return JXL_FAILURE("Buffer size is too small, expected: %" PRIuS
" got: %" PRIuS " (Image: %" PRIuS "x%" PRIuS
"x%u, bytes_per_channel: %" PRIuS ")",
bytes_to_read, bytes.size(), xsize, ysize,
format.num_channels, bytes_per_channel);
}
JXL_ASSERT(channel->xsize() == xsize);
JXL_ASSERT(channel->ysize() == ysize);
@@ -130,18 +142,19 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
}
const bool little_endian =
endianness == JXL_LITTLE_ENDIAN ||
(endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
format.endianness == JXL_LITTLE_ENDIAN ||
(format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
const uint8_t* const in = bytes.data();
if (float_in) {
if (format.data_type == JXL_TYPE_FLOAT ||
format.data_type == JXL_TYPE_FLOAT16) {
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task;
size_t i = row_size * task + pixel_offset;
float* JXL_RESTRICT row_out = channel->Row(y);
if (bits_per_sample == 16) {
if (format.data_type == JXL_TYPE_FLOAT16) {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat16(in + i);
@@ -174,9 +187,9 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task;
size_t i = row_size * task + pixel_offset;
float* JXL_RESTRICT row_out = channel->Row(y);
if (bits_per_sample <= 8) {
if (format.data_type == JXL_TYPE_UINT8) {
LoadFloatRow<Load8>(row_out, in + i, mul, xsize, bytes_per_pixel);
} else {
if (little_endian) {
@@ -195,187 +208,36 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
}
Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
size_t ysize, const ColorEncoding& c_current,
size_t channels, bool alpha_is_premultiplied,
size_t bits_per_sample, JxlEndianness endianness,
ThreadPool* pool, ImageBundle* ib, bool float_in,
size_t align) {
JXL_CHECK(float_in ? bits_per_sample == 16 || bits_per_sample == 32
: bits_per_sample > 0 && bits_per_sample <= 16);
bool alpha_is_premultiplied, size_t bits_per_sample,
JxlPixelFormat format, ThreadPool* pool,
ImageBundle* ib) {
const size_t color_channels = c_current.Channels();
bool has_alpha = channels == 2 || channels == 4;
if (channels < color_channels) {
bool has_alpha = format.num_channels == 2 || format.num_channels == 4;
if (format.num_channels < color_channels) {
return JXL_FAILURE("Expected %" PRIuS
" color channels, received only %" PRIuS " channels",
color_channels, channels);
" color channels, received only %u channels",
color_channels, format.num_channels);
}
const size_t bytes_per_channel = DivCeil(bits_per_sample, jxl::kBitsPerByte);
const size_t bytes_per_pixel = channels * bytes_per_channel;
if (bits_per_sample > 16 && bits_per_sample < 32) {
return JXL_FAILURE("not supported, try bits_per_sample=32");
}
const size_t last_row_size = xsize * bytes_per_pixel;
const size_t row_size =
(align > 1 ? jxl::DivCeil(last_row_size, align) * align : last_row_size);
const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
if (xsize == 0 || ysize == 0) return JXL_FAILURE("Empty image");
if (bytes.size() < bytes_to_read) {
return JXL_FAILURE(
"Buffer size is too small: expected at least %" PRIuS
" bytes (= %" PRIuS " * %" PRIuS " * %" PRIuS "), got %" PRIuS " bytes",
bytes_to_read, xsize, ysize, bytes_per_pixel, bytes.size());
}
// Too large buffer is likely an application bug, so also fail for that.
// Do allow padding to stride in last row though.
if (bytes.size() > row_size * ysize) {
return JXL_FAILURE(
"Buffer size is too large: expected at most %" PRIuS " bytes (= %" PRIuS
" * %" PRIuS " * %" PRIuS "), got %" PRIuS " bytes",
row_size * ysize, xsize, ysize, bytes_per_pixel, bytes.size());
}
const bool little_endian =
endianness == JXL_LITTLE_ENDIAN ||
(endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
const uint8_t* const in = bytes.data();
Image3F color(xsize, ysize);
if (float_in) {
for (size_t c = 0; c < color_channels; ++c) {
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i =
row_size * task + (c * bits_per_sample / jxl::kBitsPerByte);
float* JXL_RESTRICT row_out = color.PlaneRow(c, y);
if (bits_per_sample == 16) {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat16(in + i);
i += bytes_per_pixel;
}
} else {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadBEFloat16(in + i);
i += bytes_per_pixel;
}
}
} else {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat(in + i);
i += bytes_per_pixel;
}
} else {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadBEFloat(in + i);
i += bytes_per_pixel;
}
}
}
},
"ConvertRGBFloat"));
}
} else {
// Multiplier to convert from the integer range to floating point 0-1 range.
float mul = 1. / ((1ull << bits_per_sample) - 1);
for (size_t c = 0; c < color_channels; ++c) {
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task + c * bytes_per_channel;
float* JXL_RESTRICT row_out = color.PlaneRow(c, y);
if (bits_per_sample <= 8) {
LoadFloatRow<Load8>(row_out, in + i, mul, xsize, bytes_per_pixel);
} else {
if (little_endian) {
LoadFloatRow<LoadLE16>(row_out, in + i, mul, xsize,
bytes_per_pixel);
} else {
LoadFloatRow<LoadBE16>(row_out, in + i, mul, xsize,
bytes_per_pixel);
}
}
},
"ConvertRGBUint"));
}
for (size_t c = 0; c < color_channels; ++c) {
JXL_RETURN_IF_ERROR(ConvertFromExternal(bytes, xsize, ysize,
bits_per_sample, format, c, pool,
&color.Plane(c)));
}
if (color_channels == 1) {
CopyImageTo(color.Plane(0), &color.Plane(1));
CopyImageTo(color.Plane(0), &color.Plane(2));
}
ib->SetFromImage(std::move(color), c_current);
// Passing an interleaved image with an alpha channel to an image that doesn't
// have alpha channel just discards the passed alpha channel.
if (has_alpha && ib->HasAlpha()) {
ImageF alpha(xsize, ysize);
if (float_in) {
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task +
((channels - 1) * bits_per_sample / jxl::kBitsPerByte);
float* JXL_RESTRICT row_out = alpha.Row(y);
if (bits_per_sample == 16) {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat16(in + i);
i += bytes_per_pixel;
}
} else {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadBEFloat16(in + i);
i += bytes_per_pixel;
}
}
} else {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat(in + i);
i += bytes_per_pixel;
}
} else {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadBEFloat(in + i);
i += bytes_per_pixel;
}
}
}
},
"ConvertAlphaFloat"));
} else {
float mul = 1. / ((1ull << bits_per_sample) - 1);
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task + (channels - 1) * bytes_per_channel;
float* JXL_RESTRICT row_out = alpha.Row(y);
if (bits_per_sample <= 8) {
LoadFloatRow<Load8>(row_out, in + i, mul, xsize, bytes_per_pixel);
} else {
if (little_endian) {
LoadFloatRow<LoadLE16>(row_out, in + i, mul, xsize,
bytes_per_pixel);
} else {
LoadFloatRow<LoadBE16>(row_out, in + i, mul, xsize,
bytes_per_pixel);
}
}
},
"ConvertAlphaUint"));
}
JXL_RETURN_IF_ERROR(
ConvertFromExternal(bytes, xsize, ysize, bits_per_sample, format,
format.num_channels - 1, pool, &alpha));
ib->SetAlpha(std::move(alpha), alpha_is_premultiplied);
} else if (!has_alpha && ib->HasAlpha()) {
// if alpha is not passed, but it is expected, then assume
@@ -391,18 +253,10 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
size_t ysize, const void* buffer, size_t size,
ThreadPool* pool, ImageF* channel) {
size_t bitdepth;
bool float_in;
JXL_RETURN_IF_ERROR(
PixelFormatToExternal(pixel_format, &bitdepth, &float_in));
JXL_RETURN_IF_ERROR(ConvertFromExternal(
size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
return ConvertFromExternal(
jxl::Span<const uint8_t>(static_cast<const uint8_t*>(buffer), size),
xsize, ysize, bitdepth, pixel_format.endianness, pool, channel, float_in,
pixel_format.align));
return true;
xsize, ysize, bitdepth, pixel_format, 0, pool, channel);
}
Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
@@ -410,16 +264,11 @@ Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
jxl::ThreadPool* pool,
const jxl::ColorEncoding& c_current,
jxl::ImageBundle* ib) {
size_t bitdepth;
bool float_in;
JXL_RETURN_IF_ERROR(
PixelFormatToExternal(pixel_format, &bitdepth, &float_in));
size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
JXL_RETURN_IF_ERROR(ConvertFromExternal(
jxl::Span<const uint8_t>(static_cast<const uint8_t*>(buffer), size),
xsize, ysize, c_current, pixel_format.num_channels,
/*alpha_is_premultiplied=*/false, bitdepth, pixel_format.endianness, pool,
ib, float_in, pixel_format.align));
xsize, ysize, c_current,
/*alpha_is_premultiplied=*/false, bitdepth, pixel_format, pool, ib));
ib->VerifyMetadata();
return true;

View File

@@ -23,17 +23,16 @@
namespace jxl {
Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
size_t ysize, size_t bits_per_sample,
JxlEndianness endianness, ThreadPool* pool,
ImageF* channel, bool float_in, size_t align);
JxlPixelFormat format, size_t c, ThreadPool* pool,
ImageF* channel);
// Convert an interleaved pixel buffer to the internal ImageBundle
// representation. This is the opposite of ConvertToExternal().
Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
size_t ysize, const ColorEncoding& c_current,
size_t channels, bool alpha_is_premultiplied,
size_t bits_per_sample, JxlEndianness endianness,
ThreadPool* pool, ImageBundle* ib, bool float_in,
size_t align);
bool alpha_is_premultiplied, size_t bits_per_sample,
JxlPixelFormat format, ThreadPool* pool,
ImageBundle* ib);
Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
size_t ysize, const void* buffer, size_t size,
ThreadPool* pool, ImageF* channel);

View File

@@ -21,17 +21,16 @@ void BM_EncExternalImage_ConvertImageRGBA(benchmark::State& state) {
ImageBundle ib(&im);
std::vector<uint8_t> interleaved(xsize * ysize * 4);
JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
for (auto _ : state) {
for (size_t i = 0; i < kNumIter; ++i) {
JXL_CHECK(ConvertFromExternal(
Span<const uint8_t>(interleaved.data(), interleaved.size()), xsize,
ysize,
/*c_current=*/ColorEncoding::SRGB(),
/*channels=*/4,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, JXL_NATIVE_ENDIAN,
/*pool=*/nullptr, &ib, /*float_in=*/false, /*align=*/0));
/*bits_per_sample=*/8, format,
/*pool=*/nullptr, &ib));
}
}

View File

@@ -25,23 +25,23 @@ TEST(ExternalImageTest, InvalidSize) {
im.SetAlphaBits(8);
ImageBundle ib(&im);
JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
const uint8_t buf[10 * 100 * 8] = {};
EXPECT_FALSE(ConvertFromExternal(
Span<const uint8_t>(buf, 10), /*xsize=*/10, /*ysize=*/100,
/*c_current=*/ColorEncoding::SRGB(), /*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
nullptr, &ib, /*float_in=*/false, /*align=*/0));
/*c_current=*/ColorEncoding::SRGB(),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format, nullptr,
&ib));
EXPECT_FALSE(ConvertFromExternal(
Span<const uint8_t>(buf, sizeof(buf) - 1), /*xsize=*/10, /*ysize=*/100,
/*c_current=*/ColorEncoding::SRGB(), /*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
nullptr, &ib, /*float_in=*/false, /*align=*/0));
/*c_current=*/ColorEncoding::SRGB(),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format, nullptr,
&ib));
EXPECT_TRUE(
ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), /*xsize=*/10,
/*ysize=*/100, /*c_current=*/ColorEncoding::SRGB(),
/*channels=*/4, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, JXL_BIG_ENDIAN, nullptr, &ib,
/*float_in=*/false, /*align=*/0));
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format, nullptr, &ib));
}
#endif
@@ -54,14 +54,14 @@ TEST(ExternalImageTest, AlphaMissing) {
const size_t ysize = 20;
const uint8_t buf[xsize * ysize * 4] = {};
JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
// has_alpha is true but the ImageBundle has no alpha. Alpha channel should
// be ignored.
EXPECT_TRUE(
ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), xsize, ysize,
/*c_current=*/ColorEncoding::SRGB(),
/*channels=*/4, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, JXL_BIG_ENDIAN, nullptr, &ib,
/*float_in=*/false, /*align=*/0));
EXPECT_TRUE(ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), xsize,
ysize,
/*c_current=*/ColorEncoding::SRGB(),
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format, nullptr, &ib));
EXPECT_FALSE(ib.HasAlpha());
}

Some files were not shown because too many files have changed in this diff Show More