Bug 1793238 - Update libjxl r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D158771
This commit is contained in:
Kagami Sascha Rosylight
2022-10-06 17:31:29 +00:00
parent 9cf3d82c8c
commit db107725b2
123 changed files with 4471 additions and 3040 deletions

View File

@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 7f2e26854086fba4255220fd6c77e9141f1f87cc
release: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 7f2e26854086fba4255220fd6c77e9141f1f87cc
revision: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View File

@@ -0,0 +1,14 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Stub of libjxl's generated <jxl/version.h>. All three version components
 * are pinned to 0 — presumably because the in-tree build does not run
 * libjxl's CMake version generation; confirm against the vendoring setup
 * (the same values are mirrored as DEFINES in moz.build). */
#ifndef JXL_VERSION_H_
#define JXL_VERSION_H_
#define JPEGXL_MAJOR_VERSION 0
#define JPEGXL_MINOR_VERSION 0
#define JPEGXL_PATCH_VERSION 0
#endif /* JXL_VERSION_H_ */

View File

@@ -103,13 +103,10 @@ SOURCES += [
"/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc",
]
DEFINES["JPEGXL_MAJOR_VERSION"] = "0"
DEFINES["JPEGXL_MINOR_VERSION"] = "0"
DEFINES["JPEGXL_PATCH_VERSION"] = "0"
EXPORTS.jxl += [
"./include/jxl/jxl_export.h",
"./include/jxl/jxl_threads_export.h",
"./include/jxl/version.h",
"/third_party/jpeg-xl/lib/include/jxl/butteraugli.h",
"/third_party/jpeg-xl/lib/include/jxl/butteraugli_cxx.h",
"/third_party/jpeg-xl/lib/include/jxl/cms_interface.h",

View File

@@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl
release: 3e0b08d4ee53a08f9b58739e088c5bdecebae74d (2022-09-09T11:59:45Z).
release: 19e36b964cd966e2408bad87182faa38b7de3e9e
revision: 3e0b08d4ee53a08f9b58739e088c5bdecebae74d
revision: 19e36b964cd966e2408bad87182faa38b7de3e9e
license: Apache-2.0

View File

@@ -19,7 +19,7 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
project(hwy VERSION 1.0.0) # Keep in sync with highway.h version
project(hwy VERSION 1.0.1) # Keep in sync with highway.h version
# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
@@ -89,6 +89,9 @@ list(APPEND HWY_CONTRIB_SOURCES
hwy/contrib/sort/vqsort-inl.h
hwy/contrib/sort/vqsort.cc
hwy/contrib/sort/vqsort.h
hwy/contrib/algo/copy-inl.h
hwy/contrib/algo/find-inl.h
hwy/contrib/algo/transform-inl.h
)
endif() # HWY_ENABLE_CONTRIB

View File

@@ -1,3 +1,15 @@
highway (1.0.1-1) UNRELEASED; urgency=medium
* Add Eq128, i64 Mul, unsigned->float ConvertTo
* Faster sort for few unique keys, more robust pivot selection
* Fix: floating-point generator for sort tests, Min/MaxOfLanes for i16
* Fix: avoid always_inline in debug, link atomic
* GCC warnings: string.h, maybe-uninitialized, ignored-attributes
* GCC warnings: preprocessor int overflow, spurious use-after-free/overflow
* Doc: <=HWY_AVX3, Full32/64/128, how to use generic-inl
-- Jan Wassenberg <janwas@google.com> Tue, 23 Aug 2022 10:00:00 +0200
highway (1.0.0-1) UNRELEASED; urgency=medium
* ABI change: 64-bit target values, more room for expansion

View File

@@ -24,6 +24,9 @@
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
#if HWY_COMPILER_MSVC
#include <string.h> // memcpy
#endif
#if HWY_ARCH_X86
#include <atomic>
#endif
@@ -131,6 +134,19 @@
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#endif
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
@@ -863,10 +879,18 @@ HWY_API void CopyBytes(const From* from, To* to) {
#if HWY_COMPILER_MSVC
memcpy(to, from, kBytes);
#else
__builtin_memcpy(to, from, kBytes);
__builtin_memcpy(
static_cast<void*>(to), static_cast<const void*>(from), kBytes);
#endif
}
// Same as CopyBytes, but for same-sized objects; avoids a size argument.
// The static_assert rejects mismatched sizes at compile time, making this a
// safe bitwise copy between two equally-sized objects — e.g. the type punning
// between uint32_t and float in F32FromBF16/BF16FromF32 below.
template <typename From, typename To>
HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
static_assert(sizeof(From) == sizeof(To), "");
CopyBytes<sizeof(From)>(from, to);
}
template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if HWY_COMPILER_MSVC
@@ -880,13 +904,13 @@ HWY_API float F32FromBF16(bfloat16_t bf) {
uint32_t bits = bf.bits;
bits <<= 16;
float f;
CopyBytes<4>(&bits, &f);
CopySameSize(&bits, &f);
return f;
}
HWY_API bfloat16_t BF16FromF32(float f) {
uint32_t bits;
CopyBytes<4>(&f, &bits);
CopySameSize(&f, &bits);
bfloat16_t bf;
bf.bits = static_cast<uint16_t>(bits >> 16);
return bf;

View File

@@ -22,8 +22,6 @@
#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#endif
#include <string.h> // memcpy
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();

View File

@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include <string.h> // memcpy
#include "hwy/aligned_allocator.h"

View File

@@ -18,7 +18,6 @@
// SIMD/multicore-friendly planar image representation with row accessors.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
@@ -104,7 +103,7 @@ struct HWY_CONTRIB_DLLEXPORT ImageBase {
HWY_INLINE void* VoidRow(const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
if (y >= ysize_) {
HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_);
HWY_ABORT("Row(%d) >= %u\n", static_cast<int>(y), ysize_);
}
#endif
@@ -223,14 +222,11 @@ class Image3 {
Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
HWY_ABORT("Not same size: %" PRIu64 " x %" PRIu64 ", %" PRIu64
" x %" PRIu64 ", %" PRIu64 " x %" PRIu64 "\n",
static_cast<uint64_t>(plane0.xsize()),
static_cast<uint64_t>(plane0.ysize()),
static_cast<uint64_t>(plane1.xsize()),
static_cast<uint64_t>(plane1.ysize()),
static_cast<uint64_t>(plane2.xsize()),
static_cast<uint64_t>(plane2.ysize()));
HWY_ABORT(
"Not same size: %d x %d, %d x %d, %d x %d\n",
static_cast<int>(plane0.xsize()), static_cast<int>(plane0.ysize()),
static_cast<int>(plane1.xsize()), static_cast<int>(plane1.ysize()),
static_cast<int>(plane2.xsize()), static_cast<int>(plane2.ysize()));
}
planes_[0] = std::move(plane0);
planes_[1] = std::move(plane1);
@@ -294,9 +290,8 @@ class Image3 {
HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
if (c >= kNumPlanes || y >= ysize()) {
HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n",
static_cast<uint64_t>(c), static_cast<uint64_t>(y),
static_cast<uint64_t>(ysize()));
HWY_ABORT("PlaneRow(%d, %d) >= %d\n", static_cast<int>(c),
static_cast<int>(y), static_cast<int>(ysize()));
}
#endif
// Use the first plane's stride because the compiler might not realize they

View File

@@ -13,6 +13,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stdio.h>
#include <cfloat> // FLT_MAX

View File

@@ -99,6 +99,7 @@ cc_library(
"traits-inl.h",
"traits128-inl.h",
"vqsort-inl.h",
# Placeholder for internal instrumentation. Do not remove.
],
deps = [
# Only if VQSORT_SECURE_RNG is set.

View File

@@ -124,7 +124,7 @@ class InputStats {
// bit representations as the checksum.
uint64_t bits = 0;
static_assert(sizeof(T) <= 8, "Expected a built-in type");
CopyBytes<sizeof(T)>(&value, &bits);
CopyBytes<sizeof(T)>(&value, &bits); // not same size
sum_ += bits;
count_ += 1;
}

View File

@@ -15,7 +15,6 @@
#include <stdint.h>
#include <stdio.h>
#include <string.h> // memcpy
#include <vector>
@@ -51,6 +50,7 @@ using detail::SharedTraits;
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::Traits128;
template <class Traits>
@@ -81,8 +81,9 @@ HWY_NOINLINE void BenchPartition() {
// The pivot value can influence performance. Do exactly what vqsort will
// do so that the performance (influenced by prefetching and branch
// prediction) is likely to predict the actual performance inside vqsort.
const auto pivot = detail::ChoosePivot(d, st, aligned.get(), 0, num_lanes,
buf.get(), rng);
detail::PivotResult result;
const auto pivot = detail::ChoosePivot(d, st, aligned.get(), num_lanes,
buf.get(), rng, result);
const Timestamp t0;
detail::Partition(d, st, aligned.get(), 0, num_lanes - 1, pivot,
@@ -110,7 +111,7 @@ HWY_NOINLINE void BenchAllPartition() {
BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
BenchPartition<Traits128<OrderAscending128>>();
// BenchPartition<Traits128<OrderDescending128>>();
// BenchPartition<Traits128<OrderAscendingKV128>>();
BenchPartition<Traits128<OrderAscendingKV128>>();
}
template <class Traits>
@@ -258,12 +259,9 @@ HWY_NOINLINE void BenchSort(size_t num_keys) {
HWY_NOINLINE void BenchAllSort() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
HWY_TARGET == HWY_EMU128) {
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
return;
}
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
constexpr size_t K = 1000;
constexpr size_t M = K * K;
@@ -287,7 +285,7 @@ HWY_NOINLINE void BenchAllSort() {
#if !HAVE_VXSORT && VQSORT_ENABLED
BenchSort<Traits128<OrderAscending128>>(num_keys);
// BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
#endif
}
}

View File

@@ -13,6 +13,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h> // memcpy
@@ -218,9 +222,6 @@ HWY_NOINLINE void TestAllBaseCase() {
#if defined(_MSC_VER)
return;
#endif
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
TestBaseCase<Traits128<OrderAscending128> >();
@@ -356,9 +357,6 @@ static HWY_NOINLINE void TestPartition() {
}
HWY_NOINLINE void TestAllPartition() {
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderDescending<int32_t> > >();
TestPartition<TraitsLane<OrderAscending<int64_t> > >();
@@ -490,9 +488,6 @@ void TestSort(size_t num_lanes) {
#if defined(_MSC_VER)
return;
#endif
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
using Order = typename Traits::Order;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;

View File

@@ -41,7 +41,7 @@ namespace detail {
// independent of the order.
template <typename T>
struct KeyLane {
constexpr bool Is128() const { return false; }
static constexpr bool Is128() { return false; }
constexpr size_t LanesPerKey() const { return 1; }
// What type bench_sort should allocate for generating inputs.
@@ -130,7 +130,7 @@ struct KeyLane {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
const RepartitionToWide<RebindToUnsigned<D>> dw;
const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
}
@@ -146,7 +146,7 @@ struct KeyLane {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
const RepartitionToWide<RebindToUnsigned<D>> dw;
const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
}
@@ -160,7 +160,7 @@ struct KeyLane {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
const RepartitionToWide<RebindToUnsigned<D>> dw;
const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
}
@@ -181,9 +181,7 @@ template <typename T>
struct OrderAscending : public KeyLane<T> {
using Order = SortAscending;
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *a < *b;
}
HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
@@ -222,15 +220,18 @@ struct OrderAscending : public KeyLane<T> {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<T>());
}
// Returns the key(s) immediately preceding `v` in ascending sort order,
// i.e. v - 1 per lane. NOTE(review): no underflow/minimum handling is
// visible here; the caller presumably only applies this when `v` is not
// the first value in sort order — confirm.
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Sub(v, Set(d, 1));
}
};
template <typename T>
struct OrderDescending : public KeyLane<T> {
using Order = SortDescending;
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *b < *a;
}
HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
@@ -268,6 +269,11 @@ struct OrderDescending : public KeyLane<T> {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<T>());
}
// Returns the key(s) immediately preceding `v` in descending sort order:
// larger values sort first here, so the predecessor is v + 1 per lane.
// NOTE(review): no overflow/maximum handling is visible here — callers
// presumably guarantee `v` is not the first value in sort order; confirm.
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Add(v, Set(d, 1));
}
};
// Shared code that depends on Order.

View File

@@ -39,7 +39,7 @@ namespace detail {
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct KeyAny128 {
constexpr bool Is128() const { return true; }
static constexpr bool Is128() { return true; }
constexpr size_t LanesPerKey() const { return 2; }
// What type bench_sort should allocate for generating inputs.
@@ -130,8 +130,8 @@ struct Key128 : public KeyAny128 {
std::string KeyString() const { return "U128"; }
template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq128(a, b);
HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Eq128(d, a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
@@ -184,6 +184,12 @@ struct OrderAscending128 : public Key128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
// Returns the 128-bit key immediately preceding `v` in ascending order.
// `k1` places a 1 only in alternating lanes (presumably the low u64 lane of
// each 128-bit key — confirm OddEven lane semantics against Highway docs),
// so the Sub decrements the low half of each key. NOTE(review): no borrow
// into the high lane is computed; assumes the low lane is nonzero — confirm.
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Sub(v, k1);
}
};
struct OrderDescending128 : public Key128 {
@@ -224,6 +230,12 @@ struct OrderDescending128 : public Key128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
// Returns the 128-bit key immediately preceding `v` in descending order:
// larger keys sort first, so the predecessor increments the key. `k1` holds
// a 1 only in alternating lanes (presumably the low u64 lane of each 128-bit
// key — confirm OddEven lane semantics). NOTE(review): no carry into the
// high lane is computed; assumes the low lane does not wrap — confirm.
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Add(v, k1);
}
};
// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
@@ -234,8 +246,8 @@ struct KeyValue128 : public KeyAny128 {
std::string KeyString() const { return "KV128"; }
template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq128Upper(a, b);
HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Eq128Upper(d, a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
@@ -281,6 +293,12 @@ struct OrderAscendingKV128 : public KeyValue128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Sub(v, k1);
}
};
struct OrderDescendingKV128 : public KeyValue128 {
@@ -321,6 +339,12 @@ struct OrderDescendingKV128 : public KeyValue128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Add(v, k1);
}
};
// Shared code that depends on Order.

View File

@@ -61,6 +61,7 @@
#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"
// Placeholder for internal instrumentation. Do not remove.
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
@@ -573,14 +574,44 @@ HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
}
#endif // VQSORT_PRINT
// Folds the bitwise difference between x1 and x2 into the accumulator o:
// returns o | (x1 ^ x2). Used to build "sticky" difference bits because
// vector Xor/Or may be cheaper than comparisons, especially for 128-bit keys.
// TODO(janwas): ternlog?
template <class V>
V OrXor(const V o, const V x1, const V x2) {
  const V diff = Xor(x1, x2);
  return Or(o, diff);
}
// Returns a lower bound on the index of the first mismatch, or `num` if all
// are equal. `num` is const to ensure we don't change it, which would lead to
// bugs because the caller will check whether we return the original value.
template <class D, class Traits, typename T>
HWY_INLINE bool ScanEqual(D d, Traits st, const T* HWY_RESTRICT keys,
size_t num) {
HWY_NOINLINE size_t LowerBoundOfMismatch(D d, Traits st,
const T* HWY_RESTRICT keys,
const size_t num) {
using V = Vec<decltype(d)>;
const size_t N = Lanes(d);
HWY_DASSERT(num >= N); // See HandleSpecialCases
const V reference = st.SetKey(d, keys);
const V zero = Zero(d);
size_t i = 0;
// Vector-align keys + i.
const size_t misalign =
(reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (N - 1);
if (HWY_LIKELY(misalign != 0)) {
HWY_DASSERT(misalign % st.LanesPerKey() == 0);
const size_t consume = N - misalign;
const auto mask = FirstN(d, consume);
const V v0 = LoadU(d, keys);
// Only check masked lanes; consider others to be equal to the reference.
if (!AllTrue(d, Or(Not(mask), Eq(v0, reference)))) {
return 0; // not equal
}
i = consume;
}
HWY_DASSERT(((reinterpret_cast<uintptr_t>(keys + i) / sizeof(T)) & (N - 1)) ==
0);
// Sticky bits registering any difference between `keys` and the first key.
// We use vector XOR because it may be cheaper than comparisons, especially
// for 128-bit. 2x unrolled for more ILP.
@@ -592,81 +623,112 @@ HWY_INLINE bool ScanEqual(D d, Traits st, const T* HWY_RESTRICT keys,
// after a 'group', which consists of kLoops times two vectors.
constexpr size_t kLoops = 4;
const size_t lanes_per_group = kLoops * 2 * N;
size_t i = 0;
for (; i + lanes_per_group <= num; i += lanes_per_group) {
HWY_DEFAULT_UNROLL
for (size_t loop = 0; loop < kLoops; ++loop) {
const V v0 = LoadU(d, keys + i + loop * 2 * N);
const V v1 = LoadU(d, keys + i + loop * 2 * N + N);
// TODO(janwas): ternlog
diff0 = Or(diff0, Xor(v0, reference));
diff1 = Or(diff1, Xor(v1, reference));
const V v0 = Load(d, keys + i + loop * 2 * N);
const V v1 = Load(d, keys + i + loop * 2 * N + N);
diff0 = OrXor(diff0, v0, reference);
diff1 = OrXor(diff1, v1, reference);
}
diff0 = Or(diff0, diff1);
if (!AllTrue(d, Eq(diff0, zero))) {
return false;
return i; // not equal
}
}
// Whole vectors, no unrolling
// Whole vectors, no unrolling, compare directly
for (; i + N <= num; i += N) {
const V v0 = LoadU(d, keys + i);
// TODO(janwas): ternlog
diff0 = Or(diff0, Xor(v0, reference));
if (!AllTrue(d, Eq(diff0, zero))) {
return false;
const V v0 = Load(d, keys + i);
if (!AllTrue(d, Eq(v0, reference))) {
return i; // not equal
}
}
// If there are remainders, re-check the last whole vector.
if (HWY_LIKELY(i != num)) {
const V v0 = LoadU(d, keys + num - N);
// TODO(janwas): ternlog
diff0 = Or(diff0, Xor(v0, reference));
if (!AllTrue(d, Eq(diff0, zero))) {
return false;
if (!AllTrue(d, Eq(v0, reference))) {
return i; // not equal
}
}
return true;
return num; // all equal
}
// Returns key prior to reference in sort order.
// Outcome of pivot selection (set by ChoosePivot via CheckFirstLast).
// Tells Recurse whether to partition at all and which side(s) still need
// sorting: kIsFirst skips the left recursion, kWasLast the right one.
enum class PivotResult {
kAllEqual, // stop without partitioning
kNormal, // partition and recurse left and right
kIsFirst, // partition but skip left recursion
kWasLast, // partition but skip right recursion
};
// Classifies (and possibly modifies) `pivot` by scanning for the first/last
// key from index `idx_diff`, which is less than `num`.
template <class D, class Traits, typename T>
HWY_INLINE Vec<D> ScanForPrev(D d, Traits st, const T* HWY_RESTRICT keys,
size_t num, Vec<D> reference,
T* HWY_RESTRICT buf) {
HWY_NOINLINE PivotResult CheckFirstLast(D d, Traits st,
const T* HWY_RESTRICT keys, size_t num,
size_t idx_diff,
Vec<D>* HWY_RESTRICT pivot,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
HWY_DASSERT(num >= N); // See HandleSpecialCases
HWY_DASSERT(idx_diff < num);
Vec<D> prev = st.FirstValue(d);
Mask<D> any_found = st.Compare(d, prev, prev); // false
Vec<D> first = st.LastValue(d);
Vec<D> last = st.FirstValue(d);
// Early out for mostly-0 arrays, where pivot is often FirstValue.
if (AllTrue(d, st.EqualKeys(d, *pivot, last))) {
return PivotResult::kIsFirst;
}
size_t i = 0;
// We know keys[0, idx_diff) are equal, but they might be the first/last, so
// start scanning one vector before.
size_t i = static_cast<size_t>(
HWY_MAX(static_cast<intptr_t>(idx_diff) - static_cast<intptr_t>(N), 0));
constexpr size_t kLoops = 4;
const size_t lanes_per_group = kLoops * N;
// Whole group, unrolled
for (; i + lanes_per_group <= num; i += lanes_per_group) {
HWY_DEFAULT_UNROLL
for (size_t loop = 0; loop < kLoops; ++loop) {
const Vec<D> curr = LoadU(d, keys + i + loop * N);
first = st.First(d, first, curr);
last = st.Last(d, last, curr);
}
}
// Whole vectors, no unrolling
for (; i + N <= num; i += N) {
const Vec<D> curr = LoadU(d, keys + i);
const auto is_before = st.Compare(d, curr, reference);
any_found = Or(any_found, is_before);
prev = IfThenElse(is_before, st.Last(d, prev, curr), prev);
first = st.First(d, first, curr);
last = st.Last(d, last, curr);
}
// If there are remainders, re-check the last whole vector.
if (HWY_LIKELY(i != num)) {
const Vec<D> curr = LoadU(d, keys + num - N);
const auto is_before = st.Compare(d, curr, reference);
any_found = Or(any_found, is_before);
prev = IfThenElse(is_before, st.Last(d, prev, curr), prev);
first = st.First(d, first, curr);
last = st.Last(d, last, curr);
}
const Vec<D> candidate = st.LastOfLanes(d, prev, buf);
// If we didn't find any key less than reference, we're still stuck with
// FirstValue; replace that with reference. (We cannot compare directly to
// FirstValue because that might be the desired value of prev.)
return IfThenElse(any_found, candidate, reference);
first = st.FirstOfLanes(d, first, buf);
last = st.LastOfLanes(d, last, buf);
if (AllTrue(d, st.EqualKeys(d, first, *pivot))) {
return PivotResult::kIsFirst;
}
// Fixup required because keys equal to the pivot go to the left partition,
// and the pivot is the last, so Partition would not change anything.
// Instead use the previous value in sort order, which is not necessarily an
// actual key.
if (AllTrue(d, st.EqualKeys(d, last, *pivot))) {
*pivot = st.PrevValue(d, *pivot);
return PivotResult::kWasLast;
}
return PivotResult::kNormal;
}
enum class PivotResult {
kNormal, // use partition
kAllEqual, // already done
};
// Writes samples from `keys[0, num)` into `buf`.
template <class D, class Traits, typename T>
HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf, Generator& rng) {
@@ -732,27 +794,25 @@ HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
}
}
// Returns pivot, which is never the largest key (thus the right partition will
// never be empty).
// Returns pivot chosen from `keys[0, num)`. It will never be the largest key
// (thus the right partition will never be empty).
template <class D, class Traits, typename T>
HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
const size_t begin, const size_t end,
T* HWY_RESTRICT buf, Generator& rng,
PivotResult& result) {
const size_t num, T* HWY_RESTRICT buf,
Generator& rng, PivotResult& result) {
using V = decltype(Zero(d));
const size_t N = Lanes(d);
constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
constexpr size_t N1 = st.LanesPerKey();
const size_t num = end - begin;
#if VQSORT_PRINT
fprintf(stderr, "\nChoosePivot num %zu:\n", num);
#endif
DrawSamples(d, st, keys + begin, num, buf, rng);
DrawSamples(d, st, keys, num, buf, rng);
SortSamples(st, buf);
#if VQSORT_PRINT
const size_t N = Lanes(d);
for (size_t i = 0; i < kSampleLanes; i += N) {
Print(d, "", Load(d, buf + i), 0, N);
}
@@ -760,27 +820,22 @@ HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
// All samples are equal.
if (st.Equal1(buf, buf + kSampleLanes - N1)) {
const bool all_eq = ScanEqual(d, st, keys + begin, num);
const size_t idx_diff = LowerBoundOfMismatch(d, st, keys, num);
const bool all_eq = idx_diff == num;
#if VQSORT_PRINT
fprintf(stderr, "Pivot num=%zu all eq samples, keys also: %d\n", num,
all_eq);
fprintf(stderr, "Pivot num=%zu samplesEq, idxDiff %zu keysEq: %d\n", num,
idx_diff, all_eq);
#endif
if (all_eq) {
result = PivotResult::kAllEqual;
return Zero(d);
}
// If the sample is indeed the most common key and it is the largest, then
// the right partition will be empty. Prevent this by replacing the pivot
// with the previous key in sort order. By contrast, selecting the first key
// in sort order would guarantee (minimal) progress. We instead do a full
// scan to maximize load balance in case there are numerous keys that
// precede the most common key.
result = PivotResult::kNormal;
const V reference = st.SetKey(d, buf);
const V pivot = ScanForPrev(d, st, keys + begin, num, reference, buf);
V pivot = st.SetKey(d, buf); // the single unique sample
result = CheckFirstLast(d, st, keys, num, idx_diff, &pivot, buf);
#if VQSORT_PRINT
Print(d, "PREV pivot", pivot, 0, st.LanesPerKey());
fprintf(stderr, "PivotResult %d\n", static_cast<int>(result));
Print(d, "Adjusted pivot", pivot, 0, st.LanesPerKey());
#endif
return pivot;
}
@@ -796,19 +851,32 @@ HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
}
template <class D, class Traits, typename T>
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, T* HWY_RESTRICT keys_end,
const size_t begin, const size_t end, const Vec<D> pivot,
T* HWY_RESTRICT buf, Generator& rng, size_t remaining_levels) {
HWY_DASSERT(begin + 1 < end);
const size_t num = end - begin; // >= 2
HWY_NOINLINE void Recurse(D d, Traits st, T* HWY_RESTRICT keys,
T* HWY_RESTRICT keys_end, const size_t begin,
const size_t end, T* HWY_RESTRICT buf, Generator& rng,
size_t remaining_levels) {
const size_t num = end - begin; // >= 1
#if VQSORT_PRINT
fprintf(stderr, "- Recurse remaining %zu [%zu %zu) len %zu\n",
remaining_levels, begin, end, num);
Vec<D> first, last;
ScanMinMax(d, st, keys + begin, num, buf, first, last);
if (num >= Lanes(d)) {
ScanMinMax(d, st, keys + begin, num, buf, first, last);
}
Print(d, "first", first, 0, st.LanesPerKey());
Print(d, "last", last, 0, st.LanesPerKey());
#endif
HWY_DASSERT(begin < end);
if (HWY_UNLIKELY(num <= Constants::BaseCaseNum(Lanes(d)))) {
BaseCase(d, st, keys + begin, keys_end, num, buf);
return;
}
PivotResult result;
Vec<D> pivot = ChoosePivot(d, st, keys + begin, num, buf, rng, result);
if (HWY_UNLIKELY(result == PivotResult::kAllEqual)) {
return;
}
// Too many recursions. This is unlikely to happen because we select pivots
// from large (though still O(1)) samples.
@@ -820,47 +888,24 @@ void Recurse(D d, Traits st, T* HWY_RESTRICT keys, T* HWY_RESTRICT keys_end,
return;
}
const ptrdiff_t base_case_num =
static_cast<ptrdiff_t>(Constants::BaseCaseNum(Lanes(d)));
const size_t bound = Partition(d, st, keys, begin, end, pivot, buf);
const ptrdiff_t num_left =
static_cast<ptrdiff_t>(bound) - static_cast<ptrdiff_t>(begin);
const ptrdiff_t num_right =
static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound);
// ChoosePivot ensures pivot != largest key, so this should never happen.
HWY_ASSERT(num_right != 0);
if (HWY_UNLIKELY(num_left <= base_case_num)) {
BaseCase(d, st, keys + begin, keys_end, static_cast<size_t>(num_left), buf);
} else {
PivotResult result;
const Vec<D> next_pivot =
ChoosePivot(d, st, keys, begin, bound, buf, rng, result);
if (result != PivotResult::kAllEqual) {
Recurse(d, st, keys, keys_end, begin, bound, next_pivot, buf, rng,
remaining_levels - 1);
}
// ChoosePivot ensures pivot != last key, so the right partition is never
// empty. Nor is the left, because the pivot is either one of the keys, or
// the value prior to the last (which is not the only value).
HWY_ASSERT(begin != bound && bound != end);
if (HWY_LIKELY(result != PivotResult::kIsFirst)) {
Recurse(d, st, keys, keys_end, begin, bound, buf, rng,
remaining_levels - 1);
}
if (HWY_UNLIKELY(num_right <= base_case_num)) {
BaseCase(d, st, keys + bound, keys_end, static_cast<size_t>(num_right),
buf);
} else {
PivotResult result;
const Vec<D> next_pivot =
ChoosePivot(d, st, keys, bound, end, buf, rng, result);
if (result != PivotResult::kAllEqual) {
Recurse(d, st, keys, keys_end, bound, end, next_pivot, buf, rng,
remaining_levels - 1);
}
if (HWY_LIKELY(result != PivotResult::kWasLast)) {
Recurse(d, st, keys, keys_end, bound, end, buf, rng, remaining_levels - 1);
}
}
// Returns true if sorting is finished.
template <class D, class Traits, typename T>
bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf) {
HWY_INLINE bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys,
size_t num) {
const size_t N = Lanes(d);
const size_t base_case_num = Constants::BaseCaseNum(N);
@@ -876,16 +921,15 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
if (partial_128 || huge_vec) {
// PERFORMANCE WARNING: falling back to HeapSort.
#if VQSORT_PRINT
fprintf(stderr, "WARNING: using slow HeapSort: partial %d huge %d\n",
partial_128, huge_vec);
#endif
HeapSort(st, keys, num);
return true;
}
// Small arrays: use sorting network, no need for other checks.
if (HWY_UNLIKELY(num <= base_case_num)) {
BaseCase(d, st, keys, keys + num, num, buf);
return true;
}
// Small arrays are already handled by Recurse.
// We could also check for already sorted/reverse/equal, but that's probably
// counterproductive if vqsort is used as a base case.
@@ -925,31 +969,26 @@ void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
buf = storage;
#endif // !HWY_HAVE_SCALABLE
if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;
if (detail::HandleSpecialCases(d, st, keys, num)) return;
#if HWY_MAX_BYTES > 64
// sorting_networks-inl and traits assume no more than 512 bit vectors.
if (Lanes(d) > 64 / sizeof(T)) {
if (HWY_UNLIKELY(Lanes(d) > 64 / sizeof(T))) {
return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
}
#endif // HWY_MAX_BYTES > 64
// Pulled out of the recursion so we can special-case degenerate partitions.
detail::Generator rng(keys, num);
detail::PivotResult result;
const Vec<D> pivot =
detail::ChoosePivot(d, st, keys, 0, num, buf, rng, result);
if (result != detail::PivotResult::kAllEqual) {
// Introspection: switch to worst-case N*logN heapsort after this many.
const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
detail::Recurse(d, st, keys, keys + num, 0, num, pivot, buf, rng,
max_levels);
}
// Introspection: switch to worst-case N*logN heapsort after this many.
const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
detail::Recurse(d, st, keys, keys + num, 0, num, buf, rng, max_levels);
#else
(void)d;
(void)buf;
// PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
#if VQSORT_PRINT
fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n");
#endif
return detail::HeapSort(st, keys, num);
#endif // VQSORT_ENABLED
}

View File

@@ -50,6 +50,12 @@
#define HWY_COMPILER_ICC 0
#endif
#ifdef __INTEL_LLVM_COMPILER
#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
#else
#define HWY_COMPILER_ICX 0
#endif
// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
// compiler extensions (eg. Clang, Intel...)
#ifdef __GNUC__

View File

@@ -13,6 +13,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

View File

@@ -19,7 +19,9 @@
// splitting code into different files while still inlining instead of requiring
// calling through function pointers.
// Include guard (still compiled once per target)
// Per-target include guard. This is only required when using dynamic dispatch,
// i.e. including foreach_target.h. For static dispatch, a normal include
// guard would be fine because the header is only compiled once.
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
@@ -36,7 +38,8 @@ HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
using namespace hwy::HWY_NAMESPACE;
// Highway ops reside here; ADL does not find templates nor builtins.
namespace hn = hwy::HWY_NAMESPACE;
// Example of a type-agnostic (caller-specified lane type) and width-agnostic
// (uses best available instruction set) function in a header.
@@ -46,12 +49,12 @@ template <class D, typename T>
HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
const T* HWY_RESTRICT add_array,
const size_t size, T* HWY_RESTRICT x_array) {
for (size_t i = 0; i < size; i += Lanes(d)) {
const auto mul = Load(d, mul_array + i);
const auto add = Load(d, add_array + i);
auto x = Load(d, x_array + i);
x = MulAdd(mul, x, add);
Store(x, d, x_array + i);
for (size_t i = 0; i < size; i += hn::Lanes(d)) {
const auto mul = hn::Load(d, mul_array + i);
const auto add = hn::Load(d, add_array + i);
auto x = hn::Load(d, x_array + i);
x = hn::MulAdd(mul, x, add);
hn::Store(x, d, x_array + i);
}
}

View File

@@ -17,22 +17,28 @@
#include <stdio.h>
// >>>> for dynamic dispatch only, skip if you want static dispatch
// First undef to prevent error when re-included.
#undef HWY_TARGET_INCLUDE
// For runtime dispatch, specify the name of the current file (unfortunately
// For dynamic dispatch, specify the name of the current file (unfortunately
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h" // IWYU pragma: keep
// <<<< end of dynamic dispatch
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
// Optional, can instead add HWY_ATTR to all functions.
HWY_BEFORE_NAMESPACE();
namespace skeleton {
// This namespace name is unique per target, which allows code for multiple
// targets to co-exist in the same translation unit.
// targets to co-exist in the same translation unit. Required when using dynamic
// dispatch, otherwise optional.
namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
@@ -104,6 +110,7 @@ HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
uint8_t* HWY_RESTRICT out) {
// This must reside outside of HWY_NAMESPACE because it references (calls the
// appropriate one from) the per-target implementations there.
// For static dispatch, use HWY_STATIC_DISPATCH.
return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}

View File

@@ -62,7 +62,7 @@ struct TestFloorLog2 {
};
HWY_NOINLINE void TestAllFloorLog2() {
ForPartialVectors<TestFloorLog2>()(float());
hn::ForPartialVectors<TestFloorLog2>()(float());
}
// Calls function defined in skeleton-inl.h.
@@ -91,7 +91,7 @@ struct TestSumMulAdd {
};
HWY_NOINLINE void TestAllSumMulAdd() {
ForFloatTypes(ForPartialVectors<TestSumMulAdd>());
hn::ForFloatTypes(hn::ForPartialVectors<TestSumMulAdd>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)

View File

@@ -29,7 +29,7 @@ namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 0
#define HWY_PATCH 1
//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.

View File

@@ -15,7 +15,6 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <bitset>
@@ -224,7 +223,7 @@ HWY_INLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
// avoid truncating doubles.
uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
const T lane = GetLane(v);
memcpy(bytes, &lane, sizeof(T));
CopyBytes<sizeof(T)>(&lane, bytes);
Abort(file, line,
"Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
"%02x)",

View File

@@ -15,11 +15,13 @@
#include "hwy/nanobenchmark.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h> // abort
#include <string.h> // memcpy
#include <stdlib.h>
#include <time.h> // clock_gettime
#include <algorithm> // sort
@@ -414,7 +416,7 @@ std::string BrandString() {
for (size_t i = 0; i < 3; ++i) {
Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd.data());
memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
CopyBytes<sizeof(abcd)>(&abcd[0], brand_string + i * 16); // not same size
}
brand_string[48] = 0;
return brand_string;

View File

@@ -15,6 +15,9 @@
#include "hwy/nanobenchmark.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

View File

@@ -1030,6 +1030,9 @@ template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif
typename detail::Raw128<T, N>::type a;
return Vec128<T, N>(a);
HWY_DIAGNOSTICS(pop)
@@ -3285,6 +3288,16 @@ HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
return Vec128<float, N>(vcvt_f32_s32(v.raw));
}
HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
const Vec128<uint32_t> v) {
return Vec128<float>(vcvtq_f32_u32(v.raw));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
const Vec128<uint32_t, N> v) {
return Vec128<float, N>(vcvt_f32_u32(v.raw));
}
// Truncates (rounds toward zero).
HWY_API Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */,
const Vec128<float> v) {
@@ -3307,6 +3320,15 @@ HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
return Vec64<double>(vcvt_f64_s64(v.raw));
}
HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
const Vec128<uint64_t> v) {
return Vec128<double>(vcvtq_f64_u64(v.raw));
}
HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
const Vec64<uint64_t> v) {
return Vec64<double>(vcvt_f64_u64(v.raw));
}
// Truncates (rounds toward zero).
HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
const Vec128<double> v) {
@@ -4979,24 +5001,52 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
// u16/i16
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const Repartition<int32_t, Simd<T, N, 0>> d32;
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const Repartition<int32_t, Simd<T, N, 0>> d32;
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail
@@ -6356,64 +6406,6 @@ HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
return IfThenElse(Lt128Upper(d, b, a), a, b);
}
// ================================================== Operator wrapper
// These apply to all x86_*-inl.h because there are no restrictions on V.
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
namespace detail { // for code folding
#if HWY_ARCH_ARM_V7
#undef vuzp1_s8

View File

@@ -629,6 +629,13 @@ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, Mul, mul)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)
// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif
// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
namespace detail {
@@ -1497,11 +1504,18 @@ HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
// ------------------------------ ConvertTo F
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
/* signed integers */ \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(BASE, BITS) \
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
} \
/* unsigned integers */ \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(BASE, BITS) \
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
} \
/* Truncates (rounds toward zero). */ \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(int, BITS) \
@@ -2248,9 +2262,9 @@ HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) {
#endif
#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
uint64_t bits = 0; // predicate reg is 32-bit
CopyBytes<4>(&mask, &bits);
CopyBytes<4>(&mask, &bits); // not same size - 64-bit more efficient
// Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx.
const size_t offset = ((bits & 1) ? 4 : 0) + ((bits & 0x10000) ? 8 : 0);
const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u);
// See CompressIsPartition. Manually generated; flip halves if mask = [0, 1].
alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1,
0, 1, 2, 3, 0, 1, 2, 3};
@@ -2680,7 +2694,7 @@ HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
// Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
// The "at least 8 byte" guarantee in quick_reference ensures this is safe.
uint32_t mask_bits;
CopyBytes<4>(bits, &mask_bits);
CopyBytes<4>(bits, &mask_bits); // copy from bytes
const auto vbits = Set(du, mask_bits);
// 2 ^ {0,1, .., 31}, will not have more lanes than that.

View File

@@ -101,9 +101,7 @@ using TFromV = TFromD<DFromV<V>>;
template <typename T, size_t N, typename FromT, size_t FromN>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> /* tag */, Vec128<FromT, FromN> v) {
Vec128<T, N> to;
static_assert(sizeof(T) * N == sizeof(FromT) * FromN,
"Casting does not change size");
CopyBytes<sizeof(T) * N>(v.raw, to.raw);
CopySameSize(&v, &to);
return to;
}
@@ -285,8 +283,7 @@ template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
Mask128<TFrom, N> mask) {
Mask128<TTo, N> to;
static_assert(sizeof(TTo) * N == sizeof(TFrom) * N, "Must have same size");
CopyBytes<sizeof(TTo) * N>(mask.bits, to.bits);
CopySameSize(&mask, &to);
return to;
}
@@ -294,15 +291,14 @@ HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
Mask128<T, N> mask;
static_assert(sizeof(v) == sizeof(mask), "Must have same size");
CopyBytes<sizeof(T) * N>(v.raw, mask.bits);
CopySameSize(&v, &mask);
return mask;
}
template <typename T, size_t N>
Vec128<T, N> VecFromMask(const Mask128<T, N> mask) {
Vec128<T, N> v;
CopyBytes<sizeof(T) * N>(mask.bits, v.raw);
CopySameSize(&mask, &v);
return v;
}
@@ -926,10 +922,10 @@ HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
for (size_t i = 0; i < N; ++i) {
const float half = v.raw[i] * 0.5f;
uint32_t bits;
CopyBytes<4>(&v.raw[i], &bits);
CopySameSize(&v.raw[i], &bits);
// Initial guess based on log2(f)
bits = 0x5F3759DF - (bits >> 1);
CopyBytes<4>(&bits, &v.raw[i]);
CopySameSize(&bits, &v.raw[i]);
// One Newton-Raphson iteration
v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
}
@@ -1039,7 +1035,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
const bool positive = v.raw[i] > Float(0.0);
Bits bits;
CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);
CopySameSize(&v.raw[i], &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1059,7 +1055,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
if (positive) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
CopySameSize(&bits, &v.raw[i]);
}
return v;
}
@@ -1077,7 +1073,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
const bool negative = v.raw[i] < Float(0.0);
Bits bits;
CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);
CopySameSize(&v.raw[i], &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1097,7 +1093,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
if (negative) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
CopySameSize(&bits, &v.raw[i]);
}
return v;
}
@@ -1110,7 +1106,7 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
for (size_t i = 0; i < N; ++i) {
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
MakeUnsigned<T> bits;
memcpy(&bits, &v.raw[i], sizeof(T));
CopySameSize(&v.raw[i], &bits);
bits += bits;
bits >>= 1; // clear sign bit
// NaN if all exponent bits are set and the mantissa is not zero.
@@ -1278,7 +1274,7 @@ template <typename T, size_t N>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */,
const T* HWY_RESTRICT aligned) {
Vec128<T, N> v;
CopyBytes<sizeof(T) * N>(aligned, v.raw);
CopyBytes<sizeof(T) * N>(aligned, v.raw); // copy from array
return v;
}
@@ -1305,7 +1301,7 @@ HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
template <typename T, size_t N>
HWY_API void Store(const Vec128<T, N> v, Simd<T, N, 0> /* tag */,
T* HWY_RESTRICT aligned) {
CopyBytes<sizeof(T) * N>(v.raw, aligned);
CopyBytes<sizeof(T) * N>(v.raw, aligned); // copy to array
}
template <typename T, size_t N>
@@ -1434,7 +1430,7 @@ HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* base,
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
for (size_t i = 0; i < N; ++i) {
uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw[i];
CopyBytes<sizeof(T)>(&v.raw[i], base8);
CopyBytes<sizeof(T)>(&v.raw[i], base8); // copy to bytes
}
}
@@ -1457,7 +1453,7 @@ HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> /* tag */, const T* base,
for (size_t i = 0; i < N; ++i) {
const uint8_t* base8 =
reinterpret_cast<const uint8_t*>(base) + offset.raw[i];
CopyBytes<sizeof(T)>(base8, &v.raw[i]);
CopyBytes<sizeof(T)>(base8, &v.raw[i]); // copy from bytes
}
return v;
}
@@ -1556,12 +1552,12 @@ namespace detail {
HWY_INLINE void StoreU16ToF16(const uint16_t val,
hwy::float16_t* HWY_RESTRICT to) {
CopyBytes<2>(&val, to);
CopySameSize(&val, to);
}
HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) {
uint16_t bits16;
CopyBytes<2>(from, &bits16);
CopySameSize(from, &bits16);
return bits16;
}
@@ -1590,7 +1586,7 @@ HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
const uint32_t biased_exp32 = biased_exp + (127 - 15);
const uint32_t mantissa32 = mantissa << (23 - 10);
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
CopyBytes<4>(&bits32, &ret.raw[i]);
CopySameSize(&bits32, &ret.raw[i]);
}
return ret;
}
@@ -1611,7 +1607,7 @@ HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
Vec128<float16_t, N> ret;
for (size_t i = 0; i < N; ++i) {
uint32_t bits32;
CopyBytes<4>(&v.raw[i], &bits32);
CopySameSize(&v.raw[i], &bits32);
const uint32_t sign = bits32 >> 31;
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
const uint32_t mantissa32 = bits32 & 0x7FFFFF;
@@ -2446,62 +2442,6 @@ HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
return Load(Full128<uint64_t>(), mul);
}
// ================================================== Operator wrapper
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

View File

@@ -1209,7 +1209,8 @@ HWY_API V PopulationCount(V v) {
// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1), HWY_IF_LT128_D(D)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
HWY_IF_LT128_D(D)>
HWY_API V PopulationCount(V v) {
static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
const D d;
@@ -1251,6 +1252,105 @@ HWY_API V PopulationCount(V v) {
#endif // HWY_NATIVE_POPCNT
template <class V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8),
HWY_IF_LT128_D(D)>
HWY_API V operator*(V x, V y) {
return Set(D(), GetLane(x) * GetLane(y));
}
// "Include guard": skip if native 64-bit mul instructions are available.
#if (defined(HWY_NATIVE_I64MULLO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif
template <class V, class D64 = DFromV<V>, typename T = LaneType<V>,
HWY_IF_LANE_SIZE(T, 8), HWY_IF_UNSIGNED(T), HWY_IF_GE128_D(D64)>
HWY_API V operator*(V x, V y) {
RepartitionToNarrow<D64> d32;
auto x32 = BitCast(d32, x);
auto y32 = BitCast(d32, y);
auto lolo = BitCast(d32, MulEven(x32, y32));
auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
return BitCast(D64{}, lolo + hi);
}
template <class V, class DI64 = DFromV<V>, typename T = LaneType<V>,
HWY_IF_LANE_SIZE(T, 8), HWY_IF_SIGNED(T), HWY_IF_GE128_D(DI64)>
HWY_API V operator*(V x, V y) {
RebindToUnsigned<DI64> du64;
return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
}
#endif // HWY_NATIVE_I64MULLO
// ================================================== Operator wrapper
// These targets currently cannot define operators and have already defined
// (only) the corresponding functions such as Add.
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \
HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
HWY_TARGET != HWY_SVE2_128
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
#endif // HWY_TARGET for operators
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

View File

@@ -949,16 +949,16 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)
// ------------------------------ Mul
// Only for internal use (Highway only promises Mul for 16/32-bit inputs).
// Used by MulLower.
namespace detail {
HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
} // namespace detail
HWY_RVV_FOREACH_UI16(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_UI32(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_UI163264(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif
// ------------------------------ MulHigh
// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
@@ -2019,6 +2019,11 @@ HWY_API VFromD<Simd<uint16_t, N, kPow2>> DemoteTo(
HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \
return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \
} \
template <size_t N> \
HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) {\
return vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \
} \
/* Truncates (rounds toward zero). */ \
template <size_t N> \
HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
@@ -3069,14 +3074,14 @@ HWY_API VFromD<DW> MulEven(const V a, const V b) {
// There is no 64x64 vwmul.
template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
HWY_INLINE V MulEven(const V a, const V b) {
const auto lo = detail::Mul(a, b);
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
return OddEven(detail::Slide1Up(hi), lo);
}
template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
HWY_INLINE V MulOdd(const V a, const V b) {
const auto lo = detail::Mul(a, b);
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
return OddEven(hi, detail::Slide1Down(lo));
}

View File

@@ -102,7 +102,7 @@ template <typename T, typename FromT>
HWY_API Vec1<T> BitCast(Sisd<T> /* tag */, Vec1<FromT> v) {
static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
T to;
CopyBytes<sizeof(FromT)>(&v.raw, &to);
CopyBytes<sizeof(FromT)>(&v.raw, &to); // not same size - ok to shrink
return Vec1<T>(to);
}
@@ -260,21 +260,21 @@ HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
template <typename T>
HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
Mask1<T> mask;
CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
CopySameSize(&v, &mask);
return mask;
}
template <typename T>
Vec1<T> VecFromMask(const Mask1<T> mask) {
Vec1<T> v;
CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
CopySameSize(&mask, &v);
return v;
}
template <typename T>
Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
Vec1<T> v;
CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
CopySameSize(&mask, &v);
return v;
}
@@ -697,10 +697,10 @@ HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
float f = v.raw;
const float half = f * 0.5f;
uint32_t bits;
CopyBytes<4>(&f, &bits);
CopySameSize(&f, &bits);
// Initial guess based on log2(f)
bits = 0x5F3759DF - (bits >> 1);
CopyBytes<4>(&bits, &f);
CopySameSize(&bits, &f);
// One Newton-Raphson iteration
return Vec1<float>(f * (1.5f - (half * f * f)));
}
@@ -778,7 +778,7 @@ V Ceiling(const V v) {
const bool positive = f > Float(0.0);
Bits bits;
CopyBytes<sizeof(Bits)>(&v, &bits);
CopySameSize(&v, &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -795,7 +795,7 @@ V Ceiling(const V v) {
if (positive) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &f);
CopySameSize(&bits, &f);
return V(f);
}
@@ -810,7 +810,7 @@ V Floor(const V v) {
const bool negative = f < Float(0.0);
Bits bits;
CopyBytes<sizeof(Bits)>(&v, &bits);
CopySameSize(&v, &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -827,7 +827,7 @@ V Floor(const V v) {
if (negative) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
CopyBytes<sizeof(Bits)>(&bits, &f);
CopySameSize(&bits, &f);
return V(f);
}
@@ -889,7 +889,7 @@ template <typename T>
HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
MakeUnsigned<T> bits;
memcpy(&bits, &v, sizeof(v));
CopySameSize(&v, &bits);
bits += bits;
bits >>= 1; // clear sign bit
// NaN if all exponent bits are set and the mantissa is not zero.
@@ -929,7 +929,7 @@ HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
template <typename T>
HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
T t;
CopyBytes<sizeof(T)>(aligned, &t);
CopySameSize(aligned, &t);
return Vec1<T>(t);
}
@@ -955,7 +955,7 @@ HWY_API Vec1<T> LoadDup128(Sisd<T> d, const T* HWY_RESTRICT aligned) {
template <typename T>
HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
T* HWY_RESTRICT aligned) {
CopyBytes<sizeof(T)>(&v.raw, aligned);
CopySameSize(&v.raw, aligned);
}
template <typename T>
@@ -1119,7 +1119,7 @@ HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
uint16_t bits16;
CopyBytes<2>(&v.raw, &bits16);
CopySameSize(&v.raw, &bits16);
const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
const uint32_t mantissa = bits16 & 0x3FF;
@@ -1136,7 +1136,7 @@ HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
const uint32_t mantissa32 = mantissa << (23 - 10);
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
float out;
CopyBytes<4>(&bits32, &out);
CopySameSize(&bits32, &out);
return Vec1<float>(out);
}
@@ -1147,7 +1147,7 @@ HWY_API Vec1<float> PromoteTo(Sisd<float> d, const Vec1<bfloat16_t> v) {
HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
const Vec1<float> v) {
uint32_t bits32;
CopyBytes<4>(&v.raw, &bits32);
CopySameSize(&v.raw, &bits32);
const uint32_t sign = bits32 >> 31;
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
const uint32_t mantissa32 = bits32 & 0x7FFFFF;
@@ -1158,7 +1158,7 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
Vec1<float16_t> out;
if (exp < -24) {
const uint16_t zero = 0;
CopyBytes<2>(&zero, &out.raw);
CopySameSize(&zero, &out.raw);
return out;
}
@@ -1182,7 +1182,7 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
HWY_DASSERT(bits16 < 0x10000);
const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
CopyBytes<2>(&narrowed, &out.raw);
CopySameSize(&narrowed, &out.raw);
return out;
}
@@ -1379,7 +1379,7 @@ HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) {
uint8_t in_bytes[sizeof(T)];
uint8_t idx_bytes[sizeof(T)];
uint8_t out_bytes[sizeof(T)];
CopyBytes<sizeof(T)>(&in, &in_bytes);
CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
CopyBytes<sizeof(T)>(&indices, &idx_bytes);
for (size_t i = 0; i < sizeof(T); ++i) {
out_bytes[i] = in_bytes[idx_bytes[i]];
@@ -1394,7 +1394,7 @@ HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) {
uint8_t in_bytes[sizeof(T)];
uint8_t idx_bytes[sizeof(T)];
uint8_t out_bytes[sizeof(T)];
CopyBytes<sizeof(T)>(&in, &in_bytes);
CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
CopyBytes<sizeof(T)>(&indices, &idx_bytes);
for (size_t i = 0; i < sizeof(T); ++i) {
out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
@@ -1546,62 +1546,6 @@ HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
// ================================================== Operator wrapper
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

View File

@@ -3367,6 +3367,11 @@ HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
const Vec128<uint32_t, N> v) {
return Vec128<float, N>{wasm_f32x4_convert_u32x4(v.raw)};
}
// Truncates (rounds toward zero).
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
@@ -4348,26 +4353,52 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
// u16/i16
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const Repartition<int32_t, decltype(d)> d32;
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const Repartition<int32_t, decltype(d)> d32;
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail
@@ -4463,62 +4494,6 @@ HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
return IfThenElse(Lt128Upper(d, b, a), a, b);
}
// ================================================== Operator wrapper
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

View File

@@ -592,7 +592,7 @@ HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
}
HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t>, Vec256<int16_t>) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
@@ -1043,7 +1043,7 @@ HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
template <typename T>
HWY_API Vec256 <
T IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
template <typename T, HWY_IF_FLOAT(T)>
@@ -1333,13 +1333,13 @@ HWY_API Vec256<T> GatherIndex(const Full256<T> d, const T* HWY_RESTRICT base,
// ------------------------------ ExtractLane
template <typename T, size_t N>
HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ InsertLane
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(const Vec128<T, N> v, size_t i, T t) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ GetLane
@@ -1846,21 +1846,21 @@ HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
template <typename T>
HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ Reverse4
template <typename T>
HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ Reverse8
template <typename T>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ InterleaveLower
@@ -2065,13 +2065,13 @@ HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
// ------------------------------ DupEven
template <typename T>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ DupOdd
template <typename T>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ OddEven
@@ -2354,6 +2354,10 @@ HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
const Vec256<int32_t> v) {
return Vec256<float>{wasm_f32x4_convert_i32x4(v.raw)};
}
HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
const Vec256<uint32_t> v) {
return Vec256<float>{wasm_f32x4_convert_u32x4(v.raw)};
}
// Truncates (rounds toward zero).
HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> /* tag */,
const Vec256<float> v) {
@@ -2811,7 +2815,7 @@ HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
// ------------------------------ CompressBlocksNot
HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
Mask256<uint64_t> mask) {
HWY_ASSERT(0);
HWY_ASSERT(0); // Not implemented
}
// ------------------------------ CompressBits
@@ -2968,22 +2972,12 @@ HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
// u16/i16
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
const Repartition<int32_t, Full256<T>> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> /*v*/) {
HWY_ASSERT(0); // Not implemented
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
const Repartition<int32_t, Full256<T>> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> /*v*/) {
HWY_ASSERT(0); // Not implemented
}
} // namespace detail

View File

@@ -17,6 +17,17 @@
// operations when compiling for those targets.
// External include guard in highway.h - see comment there.
// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
#include "hwy/base.h"
// Avoid uninitialized warnings in GCC's emmintrin.h - see
// https://github.com/google/highway/issues/710 and pull/902)
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
#endif
#include <emmintrin.h>
#include <stdio.h>
#if HWY_TARGET == HWY_SSSE3
@@ -27,8 +38,8 @@
#endif
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcpy
#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#if HWY_IS_MSAN
@@ -1910,7 +1921,7 @@ template <typename T>
HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128i v = _mm_setzero_si128();
CopyBytes<8>(p, &v);
CopyBytes<8>(p, &v); // not same size
return Vec64<T>{v};
#else
return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
@@ -1921,7 +1932,7 @@ HWY_API Vec128<float, 2> Load(Full64<float> /* tag */,
const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
CopyBytes<8>(p, &v);
CopyBytes<8>(p, &v); // not same size
return Vec128<float, 2>{v};
#else
const __m128 hi = _mm_setzero_ps();
@@ -1933,7 +1944,7 @@ HWY_API Vec64<double> Load(Full64<double> /* tag */,
const double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128d v = _mm_setzero_pd();
CopyBytes<8>(p, &v);
CopyBytes<8>(p, &v); // not same size
return Vec64<double>{v};
#else
return Vec64<double>{_mm_load_sd(p)};
@@ -1944,7 +1955,7 @@ HWY_API Vec128<float, 1> Load(Full32<float> /* tag */,
const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
CopyBytes<4>(p, &v);
CopyBytes<4>(p, &v); // not same size
return Vec128<float, 1>{v};
#else
return Vec128<float, 1>{_mm_load_ss(p)};
@@ -1957,11 +1968,11 @@ HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
constexpr size_t kSize = sizeof(T) * N;
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
CopyBytes<kSize>(p, &v);
CopyBytes<kSize>(p, &v); // not same size
return Vec128<T, N>{v};
#else
int32_t bits = 0;
CopyBytes<kSize>(p, &bits);
CopyBytes<kSize>(p, &bits); // not same size
return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
#endif
}
@@ -2111,7 +2122,7 @@ HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
template <typename T>
HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
CopyBytes<8>(&v, p);
CopyBytes<8>(&v, p); // not same size
#else
_mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
#endif
@@ -2119,7 +2130,7 @@ HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
CopyBytes<8>(&v, p);
CopyBytes<8>(&v, p); // not same size
#else
_mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
#endif
@@ -2127,7 +2138,7 @@ HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
CopyBytes<8>(&v, p);
CopyBytes<8>(&v, p); // not same size
#else
_mm_storel_pd(p, v.raw);
#endif
@@ -2136,12 +2147,12 @@ HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
// Any <= 32 bit except <float, 1>
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
CopyBytes<sizeof(T) * N>(&v, p);
CopyBytes<sizeof(T) * N>(&v, p); // not same size
}
HWY_API void Store(const Vec128<float, 1> v, Full32<float> /* tag */,
float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
CopyBytes<4>(&v, p);
CopyBytes<4>(&v, p); // not same size
#else
_mm_store_ss(p, v.raw);
#endif
@@ -2172,7 +2183,7 @@ HWY_API void ScalarMaskedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
Store(BitCast(di, VecFromMask(d, m)), di, mask);
for (size_t i = 0; i < N; ++i) {
if (mask[i]) {
CopyBytes<sizeof(T)>(buf + i, p + i);
CopySameSize(buf + i, p + i);
}
}
}
@@ -3635,9 +3646,9 @@ HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
return lanes[kLane];
#else
// Bug in the intrinsic, returns int but should be float.
const int bits = _mm_extract_ps(v.raw, kLane);
const int32_t bits = _mm_extract_ps(v.raw, kLane);
float ret;
CopyBytes<4>(&bits, &ret);
CopySameSize(&bits, &ret);
return ret;
#endif
}
@@ -3814,7 +3825,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
return Load(d, lanes);
#else
MakeSigned<T> ti;
CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
CopySameSize(&t, &ti); // don't just cast because T might be float.
return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
#endif
}
@@ -3830,7 +3841,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
return Load(d, lanes);
#else
MakeSigned<T> ti;
CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
CopySameSize(&t, &ti); // don't just cast because T might be float.
return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
#endif
}
@@ -5582,6 +5593,26 @@ HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> ConvertTo(HWY_MAYBE_UNUSED Simd<float, N, 0> df,
const Vec128<uint32_t, N> v) {
#if HWY_TARGET <= HWY_AVX3
return Vec128<float, N>{_mm_cvtepu32_ps(v.raw)};
#else
// Based on wim's approach (https://stackoverflow.com/questions/34066228/)
const RebindToUnsigned<decltype(df)> du32;
const RebindToSigned<decltype(df)> d32;
const auto msk_lo = Set(du32, 0xFFFF);
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
// Extract the 16 lowest/highest significant bits of v and cast to signed int
const auto v_lo = BitCast(d32, And(v, msk_lo));
const auto v_hi = BitCast(d32, ShiftRight<16>(v));
return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
#endif
}
template <size_t N>
HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
const Vec128<int64_t, N> v) {
@@ -5606,6 +5637,33 @@ HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
#endif
}
template <size_t N>
HWY_API Vec128<double, N> ConvertTo(HWY_MAYBE_UNUSED Simd<double, N, 0> dd,
const Vec128<uint64_t, N> v) {
#if HWY_TARGET <= HWY_AVX3
return Vec128<double, N>{_mm_cvtepu64_pd(v.raw)};
#else
// Based on wim's approach (https://stackoverflow.com/questions/41144668/)
const RebindToUnsigned<decltype(dd)> d64;
using VU = VFromD<decltype(d64)>;
const VU msk_lo = Set(d64, 0xFFFFFFFF);
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
// Extract the 32 lowest/highest significant bits of v
const VU v_lo = And(v, msk_lo);
const VU v_hi = ShiftRight<32>(v);
auto uint64_to_double128_fast = [&dd](VU w) HWY_ATTR {
w = Or(w, VU{detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
return BitCast(dd, w) - Set(dd, 0x0010000000000000);
};
const auto v_lo_dbl = uint64_to_double128_fast(v_lo);
return MulAdd(cnst2_32_dbl, uint64_to_double128_fast(v_hi), v_lo_dbl);
#endif
}
// Truncates (rounds toward zero).
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N, 0> di,
@@ -5959,8 +6017,8 @@ HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
// Non-full byte, need to clear the undefined upper bits.
if (N < 8) {
const int mask = (1 << N) - 1;
bits[0] = static_cast<uint8_t>(bits[0] & mask);
const int mask_bits = (1 << N) - 1;
bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
}
return kNumBytes;
@@ -7103,24 +7161,52 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
// u16/i16
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const Repartition<int32_t, Simd<T, N, 0>> d32;
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const Repartition<int32_t, Simd<T, N, 0>> d32;
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail
@@ -7237,65 +7323,11 @@ HWY_API V Max128Upper(D d, const V a, const V b) {
return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}
// ================================================== Operator wrapper
// These apply to all x86_*-inl.h because there are no restrictions on V.
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
V Shl(V a, V b) {
return a << b;
}
template <class V>
V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
// the warning seems to be issued at the call site of intrinsics, i.e. our code.
HWY_DIAGNOSTICS(pop)

View File

@@ -49,6 +49,7 @@ HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcpy
#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
@@ -2368,7 +2369,7 @@ HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
Store(BitCast(du, VecFromMask(d, m)), du, mask);
for (size_t i = 0; i < 32 / sizeof(T); ++i) {
if (mask[i]) {
CopyBytes<sizeof(T)>(buf + i, p + i);
CopySameSize(buf + i, p + i);
}
}
}
@@ -4207,6 +4208,53 @@ HWY_API Vec256<double> ConvertTo(Full256<double> dd, const Vec256<int64_t> v) {
#endif
}
HWY_API Vec256<float> ConvertTo(HWY_MAYBE_UNUSED Full256<float> df,
const Vec256<uint32_t> v) {
#if HWY_TARGET <= HWY_AVX3
return Vec256<float>{_mm256_cvtepu32_ps(v.raw)};
#else
// Based on wim's approach (https://stackoverflow.com/questions/34066228/)
const RebindToUnsigned<decltype(df)> du32;
const RebindToSigned<decltype(df)> d32;
const auto msk_lo = Set(du32, 0xFFFF);
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
// Extract the 16 lowest/highest significant bits of v and cast to signed int
const auto v_lo = BitCast(d32, And(v, msk_lo));
const auto v_hi = BitCast(d32, ShiftRight<16>(v));
return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
#endif
}
HWY_API Vec256<double> ConvertTo(HWY_MAYBE_UNUSED Full256<double> dd,
const Vec256<uint64_t> v) {
#if HWY_TARGET <= HWY_AVX3
return Vec256<double>{_mm256_cvtepu64_pd(v.raw)};
#else
// Based on wim's approach (https://stackoverflow.com/questions/41144668/)
const RebindToUnsigned<decltype(dd)> d64;
using VU = VFromD<decltype(d64)>;
const VU msk_lo = Set(d64, 0xFFFFFFFFULL);
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
// Extract the 32 lowest significant bits of v
const VU v_lo = And(v, msk_lo);
const VU v_hi = ShiftRight<32>(v);
auto uint64_to_double256_fast = [&dd](Vec256<uint64_t> w) HWY_ATTR {
w = Or(w, Vec256<uint64_t>{
detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
return BitCast(dd, w) - Set(dd, 0x0010000000000000);
};
const auto v_lo_dbl = uint64_to_double256_fast(v_lo);
return MulAdd(cnst2_32_dbl, uint64_to_double256_fast(v_hi), v_lo_dbl);
#endif
}
// Truncates (rounds toward zero).
HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> d, const Vec256<float> v) {
return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));
@@ -4396,8 +4444,8 @@ HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
// Non-full byte, need to clear the undefined upper bits.
if (N < 8) {
const int mask = static_cast<int>((1ull << N) - 1);
bits[0] = static_cast<uint8_t>(bits[0] & mask);
const int mask_bits = static_cast<int>((1ull << N) - 1);
bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
}
return kNumBytes;
}
@@ -5381,24 +5429,48 @@ HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
// u16/i16
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
const Repartition<int32_t, Full256<T>> d32;
HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<uint16_t> v) {
const Full256<uint16_t> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
const Repartition<int32_t, Full256<T>> d32;
HWY_API Vec256<int16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<int16_t> v) {
const Full256<int16_t> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec256<uint16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<uint16_t> v) {
const Full256<uint16_t> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<int16_t> v) {
const Full256<int16_t> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail

View File

@@ -1164,6 +1164,22 @@ HWY_API Vec512<uint16_t> operator*(Vec512<uint16_t> a, Vec512<uint16_t> b) {
HWY_API Vec512<uint32_t> operator*(Vec512<uint32_t> a, Vec512<uint32_t> b) {
return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
}
HWY_API Vec512<uint64_t> operator*(Vec512<uint64_t> a, Vec512<uint64_t> b) {
return Vec512<uint64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
}
HWY_API Vec128<uint64_t> operator*(Vec128<uint64_t> a, Vec128<uint64_t> b) {
return Vec128<uint64_t>{_mm_mullo_epi64(a.raw, b.raw)};
}
// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#else
#define HWY_NATIVE_I64MULLO
#endif
// Signed
HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) {
@@ -1172,7 +1188,15 @@ HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) {
HWY_API Vec512<int32_t> operator*(Vec512<int32_t> a, Vec512<int32_t> b) {
return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
}
HWY_API Vec512<int64_t> operator*(Vec512<int64_t> a, Vec512<int64_t> b) {
return Vec512<int64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
}
HWY_API Vec128<int64_t> operator*(Vec128<int64_t> a, Vec128<int64_t> b) {
return Vec128<int64_t>{_mm_mullo_epi64(a.raw, b.raw)};
}
// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec512<uint16_t> MulHigh(Vec512<uint16_t> a, Vec512<uint16_t> b) {
return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
@@ -3399,6 +3423,16 @@ HWY_API Vec512<double> ConvertTo(Full512<double> /* tag */,
return Vec512<double>{_mm512_cvtepi64_pd(v.raw)};
}
HWY_API Vec512<float> ConvertTo(Full512<float> /* tag*/,
const Vec512<uint32_t> v) {
return Vec512<float>{_mm512_cvtepu32_ps(v.raw)};
}
HWY_API Vec512<double> ConvertTo(Full512<double> /* tag*/,
const Vec512<uint64_t> v) {
return Vec512<double>{_mm512_cvtepu64_pd(v.raw)};
}
// Truncates (rounds toward zero).
HWY_API Vec512<int32_t> ConvertTo(Full512<int32_t> d, const Vec512<float> v) {
return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw));
@@ -4231,14 +4265,22 @@ HWY_API Vec512<float> MinOfLanes(Full512<float> d, Vec512<float> v) {
HWY_API Vec512<double> MinOfLanes(Full512<double> d, Vec512<double> v) {
return Set(d, _mm512_reduce_min_pd(v.raw));
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> MinOfLanes(Full512<T> d, Vec512<T> v) {
const Repartition<int32_t, decltype(d)> d32;
HWY_API Vec512<uint16_t> MinOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec512<int16_t> MinOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
// Returns the maximum in each lane.
@@ -4260,14 +4302,22 @@ HWY_API Vec512<float> MaxOfLanes(Full512<float> d, Vec512<float> v) {
HWY_API Vec512<double> MaxOfLanes(Full512<double> d, Vec512<double> v) {
return Set(d, _mm512_reduce_max_pd(v.raw));
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) {
const Repartition<int32_t, decltype(d)> d32;
HWY_API Vec512<uint16_t> MaxOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec512<int16_t> MaxOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)

View File

@@ -15,7 +15,6 @@
// Print() function
#include <inttypes.h>
#include <stdint.h>
#include "hwy/aligned_allocator.h"

View File

@@ -15,6 +15,9 @@
#include "hwy/print.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>

View File

@@ -15,6 +15,9 @@
#include "hwy/targets.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h> // PRIx64
#include <stdarg.h>
#include <stddef.h>
@@ -23,7 +26,7 @@
#include <atomic>
#include "hwy/per_target.h"
#include "hwy/per_target.h" // VectorBytes
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

View File

@@ -15,7 +15,7 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <string.h> // memcpy
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
@@ -63,17 +63,17 @@ struct TestShiftBytes {
auto expected = AllocateAligned<T>(N);
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
const size_t kBlockSize = HWY_MIN(N8, 16);
for (size_t block = 0; block < N8; block += kBlockSize) {
const size_t block_size = HWY_MIN(N8, 16);
for (size_t block = 0; block < N8; block += block_size) {
expected_bytes[block] = 0;
memcpy(expected_bytes + block + 1, in_bytes + block, kBlockSize - 1);
memcpy(expected_bytes + block + 1, in_bytes + block, block_size - 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));
for (size_t block = 0; block < N8; block += kBlockSize) {
memcpy(expected_bytes + block, in_bytes + block + 1, kBlockSize - 1);
expected_bytes[block + kBlockSize - 1] = 0;
for (size_t block = 0; block < N8; block += block_size) {
memcpy(expected_bytes + block, in_bytes + block + 1, block_size - 1);
expected_bytes[block + block_size - 1] = 0;
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
#else
@@ -152,7 +152,7 @@ template <int kBytes>
struct TestCombineShiftRightBytes {
template <class T, class D>
HWY_NOINLINE void operator()(T, D d) {
const size_t kBlockSize = 16;
constexpr size_t kBlockSize = 16;
static_assert(kBytes < kBlockSize, "Shift count is per block");
const Repartition<uint8_t, D> d8;
const size_t N8 = Lanes(d8);
@@ -170,6 +170,7 @@ struct TestCombineShiftRightBytes {
lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
}
for (size_t i = 0; i < N8; i += kBlockSize) {
// Arguments are not the same size.
CopyBytes<kBlockSize>(&lo_bytes[i], combined);
CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
@@ -194,7 +195,7 @@ struct TestCombineShiftRightLanes {
auto hi_bytes = AllocateAligned<uint8_t>(N8);
auto lo_bytes = AllocateAligned<uint8_t>(N8);
auto expected_bytes = AllocateAligned<uint8_t>(N8);
const size_t kBlockSize = 16;
constexpr size_t kBlockSize = 16;
uint8_t combined[2 * kBlockSize];
// Random inputs in each lane
@@ -205,6 +206,7 @@ struct TestCombineShiftRightLanes {
lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
}
for (size_t i = 0; i < N8; i += kBlockSize) {
// Arguments are not the same size.
CopyBytes<kBlockSize>(&lo_bytes[i], combined);
CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),

View File

@@ -15,6 +15,7 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcpy
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/combine_test.cc"

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h> // PRIu64
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memset
@@ -44,19 +43,16 @@ void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
int line) {
if (expected_pos != actual_pos) {
hwy::Abort(
__FILE__, line,
"Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
TypeName(T(), Lanes(d)).c_str(), static_cast<uint64_t>(expected_pos),
static_cast<uint64_t>(actual_pos));
hwy::Abort(__FILE__, line, "Size mismatch for %s: expected %d, actual %d\n",
TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
static_cast<int>(actual_pos));
}
// Modified from AssertVecEqual - we may not be checking all lanes.
for (size_t i = 0; i < num_to_check; ++i) {
if (!IsEqual(expected[i], actual_u[i])) {
const size_t N = Lanes(d);
fprintf(stderr, "Mismatch at i=%" PRIu64 " of %" PRIu64 ", line %d:\n\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(num_to_check),
line);
fprintf(stderr, "Mismatch at i=%d of %d, line %d:\n\n",
static_cast<int>(i), static_cast<int>(num_to_check), line);
Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
Print(d, "in", Load(d, in.get()), 0, N);
Print(d, "expect", Load(d, expected.get()), 0, N);
@@ -97,7 +93,7 @@ struct TestCompress {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = Random32(&rng);
in_lanes[i] = T(); // cannot initialize float16_t directly.
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
if (mask_lanes[i] > 0) {
expected[expected_pos++] = in_lanes[i];
@@ -203,8 +199,8 @@ struct TestCompressBlocks {
for (size_t i = 0; i < N; i += 2) {
const uint64_t bits = Random32(&rng);
in_lanes[i + 1] = in_lanes[i] = T(); // cannot set float16_t directly.
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]);
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]); // not same size
mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0};
if (mask_lanes[i] > 0) {
expected[expected_pos++] = in_lanes[i];
@@ -598,8 +594,7 @@ void PrintCompress32x4Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%" PRIu64 ",",
static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}
@@ -630,8 +625,7 @@ void PrintCompressNot32x4Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%" PRIu64 ",",
static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}
@@ -662,8 +656,7 @@ void PrintCompress64x2Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%" PRIu64 ",",
static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}
@@ -694,8 +687,7 @@ void PrintCompressNot64x2Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%" PRIu64 ",",
static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}

View File

@@ -16,6 +16,9 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <cmath> // std::isfinite
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
@@ -155,7 +158,7 @@ struct TestPromoteTo {
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
expected[i] = from[i];
}
@@ -235,13 +238,19 @@ AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
-2.00390625f, -3.99609375f,
// No infinity/NaN - implementation-defined due to ARM.
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
HWY_ASSERT(N != 0);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
std::copy(test_cases, test_cases + kNumTestCases, in.get());
std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
size_t i = 0;
for (; i < kNumTestCases; ++i) {
in[i] = test_cases[i];
}
for (; i < padded; ++i) {
in[i] = 0.0f;
}
return in;
}
@@ -250,10 +259,11 @@ struct TestF16 {
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_HAVE_FLOAT16
size_t padded;
const size_t N = Lanes(d32); // same count for f16
HWY_ASSERT(N != 0);
auto in = F16TestCases(d32, padded);
using TF16 = float16_t;
const Rebind<TF16, DF32> d16;
const size_t N = Lanes(d32); // same count for f16
auto temp16 = AllocateAligned<TF16>(N);
for (size_t i = 0; i < padded; i += N) {
@@ -289,13 +299,19 @@ AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
// negative +/- delta
-2.015625f, -3.984375f,
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
HWY_ASSERT(N != 0);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
std::copy(test_cases, test_cases + kNumTestCases, in.get());
std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
size_t i = 0;
for (; i < kNumTestCases; ++i) {
in[i] = test_cases[i];
}
for (; i < padded; ++i) {
in[i] = 0.0f;
}
return in;
}
@@ -387,10 +403,13 @@ HWY_NOINLINE void TestAllTruncate() {
struct TestIntFromFloatHuge {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
// Still does not work, although ARMv7 manual says that float->int
// saturates, i.e. chooses the nearest representable value. Also causes
// out-of-memory for MSVC.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC
// The ARMv7 manual says that float->int saturates, i.e. chooses the
// nearest representable value. This works correctly on armhf with GCC, but
// not with clang. For reasons unknown, MSVC also runs into an out-of-memory
// error here.
#if HWY_COMPILER_CLANG || HWY_COMPILER_MSVC
(void)df;
#else
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
@@ -406,8 +425,6 @@ struct TestIntFromFloatHuge {
// Huge negative
Store(Set(di, LimitsMin<TI>()), di, expected.get());
HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20))));
#else
(void)df;
#endif
}
};
@@ -451,7 +468,7 @@ class TestIntFromFloat {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(TF));
CopyBytes<sizeof(TF)>(&bits, &from[i]); // not same size
} while (!std::isfinite(from[i]));
if (from[i] >= max) {
expected[i] = LimitsMax<TI>();
@@ -532,6 +549,34 @@ HWY_NOINLINE void TestAllFloatFromInt() {
ForFloatTypes(ForPartialVectors<TestFloatFromInt>());
}
struct TestFloatFromUint {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
using TU = MakeUnsigned<TF>;
const RebindToUnsigned<DF> du;
// Integer positive
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(du, TU(4))));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(65535.0)),
ConvertTo(df, Iota(du, 65535))); // 2^16-1
if (sizeof(TF) > 4) {
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4294967295.0)),
ConvertTo(df, Iota(du, 4294967295ULL))); // 2^32-1
}
// Max positive
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TU>())),
ConvertTo(df, Set(du, LimitsMax<TU>())));
// Zero
HWY_ASSERT_VEC_EQ(df, Zero(df), ConvertTo(df, Zero(du)));
}
};
HWY_NOINLINE void TestAllFloatFromUint() {
ForFloatTypes(ForPartialVectors<TestFloatFromUint>());
}
struct TestI32F64 {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
@@ -591,6 +636,7 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllTruncate);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
} // namespace hwy

View File

@@ -492,8 +492,8 @@ struct TestCLMul {
const size_t padded = RoundUpTo(kCLMulNum, N);
auto expected_lower = AllocateAligned<T>(padded);
auto expected_upper = AllocateAligned<T>(padded);
memcpy(expected_lower.get(), kCLMulLower, kCLMulNum * sizeof(T));
memcpy(expected_upper.get(), kCLMulUpper, kCLMulNum * sizeof(T));
CopyBytes<kCLMulNum * sizeof(T)>(kCLMulLower, expected_lower.get());
CopyBytes<kCLMulNum * sizeof(T)>(kCLMulUpper, expected_upper.get());
const size_t padding_size = (padded - kCLMulNum) * sizeof(T);
memset(expected_lower.get() + kCLMulNum, 0, padding_size);
memset(expected_upper.get() + kCLMulNum, 0, padding_size);

View File

@@ -15,7 +15,6 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
@@ -66,7 +65,7 @@ struct TestDemoteTo {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
} while (!value_ok(from[i]));
expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
}
@@ -116,7 +115,7 @@ struct TestDemoteToFloat {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
} while (!IsFiniteT(from[i]));
const T magn = std::abs(from[i]);
const T max_abs = HighestValue<ToT>();

View File

@@ -15,7 +15,6 @@
// Tests some ops specific to floating-point types (Div, Round etc.)
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -113,9 +112,8 @@ struct TestReciprocalSquareRoot {
float err = lanes[i] - 0.090166f;
if (err < 0.0f) err = -err;
if (err >= 4E-4f) {
HWY_ABORT("Lane %" PRIu64 "(%" PRIu64 "): actual %f err %f\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(N), lanes[i],
err);
HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast<int>(i),
static_cast<int>(N), lanes[i], err);
}
}
}

View File

@@ -13,6 +13,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS // before inttypes.h
#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcmp

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -58,7 +57,7 @@ struct TestUnsignedMul {
HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
const size_t bits = sizeof(T) * 8;
const uint64_t mask = (1ull << bits) - 1;
const uint64_t mask = bits==64 ? (~uint64_t{0}) : (1ull << bits) - 1;
const T max2 = (static_cast<uint64_t>(max) * max) & mask;
HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
}
@@ -97,13 +96,13 @@ HWY_NOINLINE void TestAllMul() {
// No u8.
test_unsigned(uint16_t());
test_unsigned(uint32_t());
// No u64.
test_unsigned(uint64_t());
const ForPartialVectors<TestSignedMul> test_signed;
// No i8.
test_signed(int16_t());
test_signed(int32_t());
// No i64.
test_signed(int64_t());
}
struct TestMulHigh {

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -80,6 +79,35 @@ struct TestMinOfLanes {
min = HWY_MIN(min, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
// Bug #910: also check negative values
min = HighestValue<T>();
const T input_copy[] = {static_cast<T>(-1),
static_cast<T>(-2),
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14};
size_t i = 0;
for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
in_lanes[i] = input_copy[i];
min = HWY_MIN(min, input_copy[i]);
}
// Pad with neutral element to full vector (so we can load)
for (; i < N; ++i) {
in_lanes[i] = min;
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
}
};
@@ -105,6 +133,35 @@ struct TestMaxOfLanes {
max = HWY_MAX(max, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
// Bug #910: also check negative values
max = LowestValue<T>();
const T input_copy[] = {static_cast<T>(-1),
static_cast<T>(-2),
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14};
size_t i = 0;
for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
in_lanes[i] = input_copy[i];
max = HWY_MAX(max, in_lanes[i]);
}
// Pad with neutral element to full vector (so we can load)
for (; i < N; ++i) {
in_lanes[i] = max;
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
}
};

View File

@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -243,7 +242,7 @@ T RightShiftNegative(T val) {
// seen divisions replaced with shifts, so resort to bit operations.
using TU = hwy::MakeUnsigned<T>;
TU bits;
CopyBytes<sizeof(T)>(&val, &bits);
CopySameSize(&val, &bits);
const TU shifted = TU(bits >> kAmount);
@@ -252,7 +251,7 @@ T RightShiftNegative(T val) {
const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
bits = shifted | sign_extended;
CopyBytes<sizeof(T)>(&bits, &val);
CopySameSize(&bits, &val);
return val;
}
@@ -356,7 +355,7 @@ struct TestVariableSignedRightShifts {
for (size_t i = 0; i < N; ++i) {
const size_t amount = i & kMaxShift;
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
CopyBytes<sizeof(T)>(&shifted, &expected[i]);
CopySameSize(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));
@@ -364,7 +363,7 @@ struct TestVariableSignedRightShifts {
for (size_t i = 0; i < N; ++i) {
const size_t amount = kMaxShift - (i & kMaxShift);
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
CopyBytes<sizeof(T)>(&shifted, &expected[i]);
CopySameSize(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
}

View File

@@ -15,7 +15,6 @@
// Target-specific helper functions for use by *_test.cc.
#include <inttypes.h>
#include <stdint.h>
#include "hwy/base.h"
@@ -97,8 +96,8 @@ HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
// First check whole bytes (if that many elements are still valid)
for (; i < N / 8; ++i) {
if (bits_a[i] != bits_b[i]) {
fprintf(stderr, "Mismatch in byte %" PRIu64 ": %d != %d\n",
static_cast<uint64_t>(i), bits_a[i], bits_b[i]);
fprintf(stderr, "Mismatch in byte %d: %d != %d\n", static_cast<int>(i),
bits_a[i], bits_b[i]);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");
@@ -111,8 +110,8 @@ HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
const int valid_a = bits_a[i] & mask;
const int valid_b = bits_b[i] & mask;
if (valid_a != valid_b) {
fprintf(stderr, "Mismatch in last byte %" PRIu64 ": %d != %d\n",
static_cast<uint64_t>(i), valid_a, valid_b);
fprintf(stderr, "Mismatch in last byte %d: %d != %d\n",
static_cast<int>(i), valid_a, valid_b);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");

View File

@@ -15,7 +15,6 @@
#include "hwy/tests/test_util.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
@@ -71,8 +70,7 @@ HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
CopyBytes<8>(actual_ptr, &actual);
return ComputeUlpDelta(expected, actual) <= 1;
} else {
HWY_ABORT("Unexpected float size %" PRIu64 "\n",
static_cast<uint64_t>(info.sizeof_t));
HWY_ABORT("Unexpected float size %d\n", static_cast<int>(info.sizeof_t));
return false;
}
}
@@ -88,10 +86,9 @@ HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
char actual_str[100];
ToString(info, actual_ptr, actual_str);
Abort(filename, line,
"%s, %sx%" PRIu64 " lane %" PRIu64
" mismatch: expected '%s', got '%s'.\n",
target_name, type_name, static_cast<uint64_t>(num_lanes),
static_cast<uint64_t>(lane), expected_str, actual_str);
"%s, %sx%d lane %d mismatch: expected '%s', got '%s'.\n", target_name,
type_name, static_cast<int>(num_lanes), static_cast<int>(lane),
expected_str, actual_str);
}
HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,

View File

@@ -105,8 +105,8 @@ TU ComputeUlpDelta(const T expected, const T actual) {
// Compute the difference in units of last place. We do not need to check for
// differing signs; they will result in large differences, which is fine.
TU ux, uy;
CopyBytes<sizeof(T)>(&expected, &ux);
CopyBytes<sizeof(T)>(&actual, &uy);
CopySameSize(&expected, &ux);
CopySameSize(&actual, &uy);
// Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy);

View File

@@ -32,7 +32,7 @@ jobs:
with:
repository: libjxl/conformance
# TODO(eustas): move ref to a global variable / file?
ref: a6a44bbbd69830e1dc862174599ce5738a0a414f
ref: 43d8135b50e53167ee6fee0b842b9eb15cfc4aa2
path: conformance
- name: Cache
uses: actions/cache@v2
@@ -161,7 +161,7 @@ jobs:
uses: actions/checkout@v2
with:
repository: libjxl/conformance
ref: a6a44bbbd69830e1dc862174599ce5738a0a414f
ref: 43d8135b50e53167ee6fee0b842b9eb15cfc4aa2
path: conformance
- name: Cache
uses: actions/cache@v2

View File

@@ -48,6 +48,7 @@ roland-rollo
Samuel Leong <wvvwvvvvwvvw@gmail.com>
Sandro <sandro.jaeckel@gmail.com>
Stephan T. Lavavej <stl@nuwen.net>
Thomas Bonfort <thomas.bonfort@airbus.com>
Vincent Torri <vincent.torri@gmail.com>
xiota
Yonatan Nebenzhal <yonatan.nebenzhl@gmail.com>

View File

@@ -7,9 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased
### Added
- encoder API: new function `JxlEncoderSetFrameBitDepth` to set the bit depth
of the input buffer.
- decoder API: new function `JxlDecoderSetImageBitDepth` to set the bit depth
of the output buffer.
## [0.7] - 2022-07-21
### Added
- Export version information in headers.
- decoder API: Ability to decode the content of metadata boxes:
`JXL_DEC_BOX`, `JXL_DEC_BOX_NEED_MORE_OUTPUT`, `JxlDecoderSetBoxBuffer`,
`JxlDecoderGetBoxType`, `JxlDecoderGetBoxSizeRaw` and

View File

@@ -1394,10 +1394,8 @@ cmd_bump_version() {
fi
fi
newver="${major}.${minor}"
if [[ "${patch}" != "0" ]]; then
newver="${newver}.${patch}"
fi
newver="${major}.${minor}.${patch}"
echo "Bumping version to ${newver} (${major}.${minor}.${patch})"
sed -E \
-e "s/(set\\(JPEGXL_MAJOR_VERSION) [0-9]+\\)/\\1 ${major})/" \

View File

@@ -51,9 +51,9 @@ std::string ExtensionFromCodec(Codec codec, const bool is_gray,
case Codec::kPNG:
return ".png";
case Codec::kPNM:
if (bits_per_sample == 32) return ".pfm";
if (has_alpha) return ".pam";
if (is_gray) return ".pgm";
return (bits_per_sample == 32) ? ".pfm" : ".ppm";
return is_gray ? ".pgm" : ".ppm";
case Codec::kGIF:
return ".gif";
case Codec::kEXR:
@@ -173,10 +173,11 @@ struct TestImageParams {
bool is_gray;
bool add_alpha;
bool big_endian;
bool add_extra_channels;
bool ShouldTestRoundtrip() const {
if (codec == Codec::kPNG) {
return true;
return bits_per_sample <= 16;
} else if (codec == Codec::kPNM) {
// TODO(szabadka) Make PNM encoder endianness-aware.
return ((bits_per_sample <= 16 && big_endian) ||
@@ -213,7 +214,7 @@ struct TestImageParams {
std::string DebugString() const {
std::ostringstream os;
os << "bps:" << bits_per_sample << " gr:" << is_gray << " al:" << add_alpha
<< " be: " << big_endian;
<< " be: " << big_endian << " ec: " << add_extra_channels;
return os.str();
}
};
@@ -233,6 +234,19 @@ void CreateTestImage(const TestImageParams& params, PackedPixelFile* ppf) {
PackedFrame frame(params.xsize, params.ysize, params.PixelFormat());
FillPackedImage(params.bits_per_sample, &frame.color);
if (params.add_extra_channels) {
for (size_t i = 0; i < 7; ++i) {
JxlPixelFormat ec_format = params.PixelFormat();
ec_format.num_channels = 1;
PackedImage ec(params.xsize, params.ysize, ec_format);
FillPackedImage(params.bits_per_sample, &ec);
frame.extra_channels.emplace_back(std::move(ec));
PackedExtraChannel pec;
pec.ec_info.bits_per_sample = params.bits_per_sample;
pec.ec_info.type = static_cast<JxlExtraChannelType>(i);
ppf->extra_channels_info.emplace_back(std::move(pec));
}
}
ppf->frames.emplace_back(std::move(frame));
}
@@ -254,8 +268,13 @@ void TestRoundTrip(const TestImageParams& params, ThreadPool* pool) {
ASSERT_EQ(encoded.bitstreams.size(), 1);
PackedPixelFile ppf_out;
ColorHints color_hints;
if (params.codec == Codec::kPNM || params.codec == Codec::kPGX) {
color_hints.Add("color_space",
params.is_gray ? "Gra_D65_Rel_SRG" : "RGB_D65_SRG_Rel_SRG");
}
ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
ColorHints(), SizeConstraints(), &ppf_out));
color_hints, SizeConstraints(), &ppf_out));
if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
params.codec != Codec::kEXR) {
@@ -263,9 +282,21 @@ void TestRoundTrip(const TestImageParams& params, ThreadPool* pool) {
}
ASSERT_EQ(ppf_out.frames.size(), 1);
VerifySameImage(ppf_in.frames[0].color, ppf_in.info.bits_per_sample,
ppf_out.frames[0].color, ppf_out.info.bits_per_sample,
const auto& frame_in = ppf_in.frames[0];
const auto& frame_out = ppf_out.frames[0];
VerifySameImage(frame_in.color, ppf_in.info.bits_per_sample, frame_out.color,
ppf_out.info.bits_per_sample,
/*lossless=*/params.codec != Codec::kJPG);
ASSERT_EQ(frame_in.extra_channels.size(), frame_out.extra_channels.size());
ASSERT_EQ(ppf_out.extra_channels_info.size(),
frame_out.extra_channels.size());
for (size_t i = 0; i < frame_in.extra_channels.size(); ++i) {
VerifySameImage(frame_in.extra_channels[i], ppf_in.info.bits_per_sample,
frame_out.extra_channels[i], ppf_out.info.bits_per_sample,
/*lossless=*/true);
EXPECT_EQ(ppf_out.extra_channels_info[i].ec_info.type,
ppf_in.extra_channels_info[i].ec_info.type);
}
}
TEST(CodecTest, TestRoundTrip) {
@@ -285,7 +316,12 @@ TEST(CodecTest, TestRoundTrip) {
params.is_gray = is_gray;
params.add_alpha = add_alpha;
params.big_endian = big_endian;
params.add_extra_channels = false;
TestRoundTrip(params, &pool);
if (codec == Codec::kPNM && add_alpha) {
params.add_extra_channels = true;
TestRoundTrip(params, &pool);
}
}
}
}

View File

@@ -68,6 +68,39 @@ struct BoxProcessor {
}
};
void SetBitDepthFromDataType(JxlDataType data_type, uint32_t* bits_per_sample,
uint32_t* exponent_bits_per_sample) {
switch (data_type) {
case JXL_TYPE_UINT8:
*bits_per_sample = 8;
*exponent_bits_per_sample = 0;
break;
case JXL_TYPE_UINT16:
*bits_per_sample = 16;
*exponent_bits_per_sample = 0;
break;
case JXL_TYPE_FLOAT16:
*bits_per_sample = 16;
*exponent_bits_per_sample = 5;
break;
case JXL_TYPE_FLOAT:
*bits_per_sample = 32;
*exponent_bits_per_sample = 8;
break;
}
}
template <typename T>
void UpdateBitDepth(JxlBitDepth bit_depth, JxlDataType data_type, T* info) {
if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
SetBitDepthFromDataType(data_type, &info->bits_per_sample,
&info->exponent_bits_per_sample);
} else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
info->bits_per_sample = bit_depth.bits_per_sample;
info->exponent_bits_per_sample = bit_depth.exponent_bits_per_sample;
}
}
} // namespace
bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
@@ -185,8 +218,12 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
}
break;
}
size_t released_size = JxlDecoderReleaseInput(dec);
fprintf(stderr,
"Input file is truncated and allow_partial_input was disabled.");
"Input file is truncated (total bytes: %" PRIuS
", processed bytes: %" PRIuS
") and allow_partial_input was disabled.",
bytes_size, bytes_size - released_size);
return false;
} else if (status == JXL_DEC_BOX) {
boxes.FinalizeOutput();
@@ -254,9 +291,11 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
if (!have_alpha) {
// Mark in the basic info that alpha channel was dropped.
ppf->info.alpha_bits = 0;
} else if (dparams.unpremultiply_alpha) {
// Mark in the basic info that alpha was unpremultiplied.
ppf->info.alpha_premultiplied = false;
} else {
if (dparams.unpremultiply_alpha) {
// Mark in the basic info that alpha was unpremultiplied.
ppf->info.alpha_premultiplied = false;
}
}
bool alpha_found = false;
for (uint32_t i = 0; i < ppf->info.num_extra_channels; ++i) {
@@ -421,9 +460,21 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
return false;
}
}
if (JXL_DEC_SUCCESS !=
JxlDecoderSetImageOutBitDepth(dec, &dparams.output_bitdepth)) {
fprintf(stderr, "JxlDecoderSetImageOutBitDepth failed\n");
return false;
}
UpdateBitDepth(dparams.output_bitdepth, format.data_type, &ppf->info);
bool have_alpha = (format.num_channels == 2 || format.num_channels == 4);
if (have_alpha) {
// Interleaved alpha channels has the same bit depth as color channels.
ppf->info.alpha_bits = ppf->info.bits_per_sample;
ppf->info.alpha_exponent_bits = ppf->info.exponent_bits_per_sample;
}
JxlPixelFormat ec_format = format;
ec_format.num_channels = 1;
for (const auto& eci : ppf->extra_channels_info) {
for (auto& eci : ppf->extra_channels_info) {
frame.extra_channels.emplace_back(jxl::extras::PackedImage(
ppf->info.xsize, ppf->info.ysize, ec_format));
auto& ec = frame.extra_channels.back();
@@ -446,6 +497,8 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
fprintf(stderr, "JxlDecoderSetExtraChannelBuffer failed\n");
return false;
}
UpdateBitDepth(dparams.output_bitdepth, ec_format.data_type,
&eci.ec_info);
}
} else if (status == JXL_DEC_SUCCESS) {
// Decoding finished successfully.

View File

@@ -53,6 +53,9 @@ struct JXLDecompressParams {
bool use_image_callback = true;
// Whether to unpremultiply colors for associated alpha channels.
bool unpremultiply_alpha = false;
// Controls the effective bit depth of the output pixels.
JxlBitDepth output_bitdepth = {JXL_BIT_DEPTH_FROM_PIXEL_FORMAT, 0, 0};
};
bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,

View File

@@ -24,6 +24,7 @@ struct HeaderPNM {
size_t bits_per_sample;
bool floating_point;
bool big_endian;
std::vector<JxlExtraChannelType> ec_types; // PAM
};
class Parser {
@@ -183,16 +184,20 @@ class Parser {
Status ParseHeaderPAM(HeaderPNM* header, const uint8_t** pos) {
size_t depth = 3;
size_t max_val = 255;
JXL_RETURN_IF_ERROR(SkipWhitespace());
while (!MatchString("ENDHDR", /*skipws=*/false)) {
JXL_RETURN_IF_ERROR(SkipWhitespace());
if (MatchString("WIDTH")) {
JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
JXL_RETURN_IF_ERROR(SkipWhitespace());
} else if (MatchString("HEIGHT")) {
JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
JXL_RETURN_IF_ERROR(SkipWhitespace());
} else if (MatchString("DEPTH")) {
JXL_RETURN_IF_ERROR(ParseUnsigned(&depth));
JXL_RETURN_IF_ERROR(SkipWhitespace());
} else if (MatchString("MAXVAL")) {
JXL_RETURN_IF_ERROR(ParseUnsigned(&max_val));
JXL_RETURN_IF_ERROR(SkipWhitespace());
} else if (MatchString("TUPLTYPE")) {
if (MatchString("RGB_ALPHA")) {
header->has_alpha = true;
@@ -209,6 +214,20 @@ class Parser {
} else if (MatchString("BLACKANDWHITE")) {
header->is_gray = true;
max_val = 1;
} else if (MatchString("Alpha")) {
header->ec_types.push_back(JXL_CHANNEL_ALPHA);
} else if (MatchString("Depth")) {
header->ec_types.push_back(JXL_CHANNEL_DEPTH);
} else if (MatchString("SpotColor")) {
header->ec_types.push_back(JXL_CHANNEL_SPOT_COLOR);
} else if (MatchString("SelectionMask")) {
header->ec_types.push_back(JXL_CHANNEL_SELECTION_MASK);
} else if (MatchString("Black")) {
header->ec_types.push_back(JXL_CHANNEL_BLACK);
} else if (MatchString("CFA")) {
header->ec_types.push_back(JXL_CHANNEL_CFA);
} else if (MatchString("Thermal")) {
header->ec_types.push_back(JXL_CHANNEL_THERMAL);
} else {
return JXL_FAILURE("PAM: unknown TUPLTYPE");
}
@@ -223,7 +242,7 @@ class Parser {
}
size_t num_channels = header->is_gray ? 1 : 3;
if (header->has_alpha) num_channels++;
if (num_channels != depth) {
if (num_channels + header->ec_types.size() != depth) {
return JXL_FAILURE("PAM: bad DEPTH");
}
if (max_val == 0 || max_val >= 65536) {
@@ -341,7 +360,17 @@ Status DecodeImagePNM(const Span<const uint8_t> bytes,
ppf->info.alpha_bits = (header.has_alpha ? ppf->info.bits_per_sample : 0);
ppf->info.alpha_exponent_bits = 0;
ppf->info.num_color_channels = (header.is_gray ? 1 : 3);
ppf->info.num_extra_channels = (header.has_alpha ? 1 : 0);
uint32_t num_alpha_channels = (header.has_alpha ? 1 : 0);
uint32_t num_interleaved_channels =
ppf->info.num_color_channels + num_alpha_channels;
ppf->info.num_extra_channels = num_alpha_channels + header.ec_types.size();
for (auto type : header.ec_types) {
PackedExtraChannel pec;
pec.ec_info.bits_per_sample = ppf->info.bits_per_sample;
pec.ec_info.type = type;
ppf->extra_channels_info.emplace_back(std::move(pec));
}
JxlDataType data_type;
if (header.floating_point) {
@@ -356,27 +385,50 @@ Status DecodeImagePNM(const Span<const uint8_t> bytes,
}
const JxlPixelFormat format{
/*num_channels=*/ppf->info.num_color_channels +
ppf->info.num_extra_channels,
/*num_channels=*/num_interleaved_channels,
/*data_type=*/data_type,
/*endianness=*/header.big_endian ? JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN,
/*align=*/0,
};
const JxlPixelFormat ec_format{1, format.data_type, format.endianness, 0};
ppf->frames.clear();
ppf->frames.emplace_back(header.xsize, header.ysize, format);
auto* frame = &ppf->frames.back();
for (size_t i = 0; i < header.ec_types.size(); ++i) {
frame->extra_channels.emplace_back(header.xsize, header.ysize, ec_format);
}
size_t pnm_remaining_size = bytes.data() + bytes.size() - pos;
if (pnm_remaining_size < frame->color.pixels_size) {
return JXL_FAILURE("PNM file too small");
}
const bool flipped_y = header.bits_per_sample == 32; // PFMs are flipped
uint8_t* out = reinterpret_cast<uint8_t*>(frame->color.pixels());
for (size_t y = 0; y < header.ysize; ++y) {
size_t y_in = flipped_y ? header.ysize - 1 - y : y;
const uint8_t* row_in = &pos[y_in * frame->color.stride];
uint8_t* row_out = &out[y * frame->color.stride];
memcpy(row_out, row_in, frame->color.stride);
std::vector<uint8_t*> ec_out(header.ec_types.size());
for (size_t i = 0; i < ec_out.size(); ++i) {
ec_out[i] = reinterpret_cast<uint8_t*>(frame->extra_channels[i].pixels());
}
if (ec_out.empty()) {
const bool flipped_y = header.bits_per_sample == 32; // PFMs are flipped
for (size_t y = 0; y < header.ysize; ++y) {
size_t y_in = flipped_y ? header.ysize - 1 - y : y;
const uint8_t* row_in = &pos[y_in * frame->color.stride];
uint8_t* row_out = &out[y * frame->color.stride];
memcpy(row_out, row_in, frame->color.stride);
}
} else {
size_t pwidth = PackedImage::BitsPerChannel(data_type) / 8;
for (size_t y = 0; y < header.ysize; ++y) {
for (size_t x = 0; x < header.xsize; ++x) {
memcpy(out, pos, frame->color.pixel_stride());
out += frame->color.pixel_stride();
pos += frame->color.pixel_stride();
for (auto& p : ec_out) {
memcpy(p, pos, pwidth);
pos += pwidth;
p += pwidth;
}
}
}
}
return true;
}

View File

@@ -0,0 +1,159 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/dec_group_jpeg.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <memory>
#include <utility>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/extras/dec_group_jpeg.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>
#include "lib/jxl/base/status.h"
#include "lib/jxl/dct_scales.h"
#include "lib/jxl/dec_transforms-inl.h"
#include "lib/jxl/render_pipeline/render_pipeline.h"
HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::And;
using hwy::HWY_NAMESPACE::AndNot;
using hwy::HWY_NAMESPACE::ApproximateReciprocal;
using hwy::HWY_NAMESPACE::Gt;
using hwy::HWY_NAMESPACE::IfThenElse;
using hwy::HWY_NAMESPACE::IfThenElseZero;
using hwy::HWY_NAMESPACE::Lt;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::Vec;
using hwy::HWY_NAMESPACE::Xor;
using D = HWY_FULL(float);
using DI = HWY_FULL(int32_t);
constexpr D d;
constexpr DI di;
// Converts a vector of quantized integer coefficients to float and applies
// the decoder-side quantization bias for channel `c`: a zero coefficient
// stays zero, a coefficient of magnitude 1 becomes +/-biases[c], and larger
// magnitudes are pulled towards zero by biases[3] / quant (see the inline
// pseudo-code below). All branches are computed with SIMD masks/blends.
template <class DI>
HWY_INLINE HWY_MAYBE_UNUSED Vec<Rebind<float, DI>> AdjustQuantBias(
    DI di, const size_t c, const Vec<DI> quant_i,
    const float* HWY_RESTRICT biases) {
  const Rebind<float, DI> df;
  const auto quant = ConvertTo(df, quant_i);
  // Compare |quant|, keep sign bit for negating result.
  const auto kSign = BitCast(df, Set(di, INT32_MIN));
  const auto sign = And(quant, kSign);  // TODO(janwas): = abs ^ orig
  const auto abs_quant = AndNot(kSign, quant);
  // If |x| is 1, kZeroBias creates a different bias for each channel.
  // We're implementing the following:
  // if (quant == 0) return 0;
  // if (quant == 1) return biases[c];
  // if (quant == -1) return -biases[c];
  // return quant - biases[3] / quant;
  // Integer comparison is not helpful because Clang incurs bypass penalties
  // from unnecessarily mixing integer and float.
  // 1.125 separates |quant| in {0, 1} from |quant| >= 2 without an exact
  // integer compare.
  const auto is_01 = Lt(abs_quant, Set(df, 1.125f));
  const auto not_0 = Gt(abs_quant, Zero(df));
  // Bitwise logic is faster than quant * biases[c].
  const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign));
  // About 2E-5 worse than ReciprocalNR or division.
  const auto bias =
      NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant);
  return IfThenElse(is_01, one_bias, bias);
}
// Dequantizes one 8x8 coefficient block of channel `c`: promotes the int16
// quantized values to int32, applies AdjustQuantBias, then multiplies by the
// per-channel dequantization matrix, writing kDCTBlockSize floats to `block`.
void DequantBlock(const int16_t* JXL_RESTRICT qblock, size_t c,
                  const float* JXL_RESTRICT dequant_matrices,
                  const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
  for (size_t k = 0; k < kDCTBlockSize; k += Lanes(d)) {
    const auto mul = Load(d, dequant_matrices + c * kDCTBlockSize + k);
    Rebind<int16_t, DI> di16;
    Vec<DI> quantized = PromoteTo(di, Load(di16, qblock + k));
    const auto dequant = Mul(AdjustQuantBias(di, c, quantized, biases), mul);
    Store(dequant, d, block + k);
  }
}
// Dequantizes and inverse-DCTs every 8x8 block of AC group `group_idx` and
// writes the resulting samples into the render pipeline's input buffers.
// `group_dec_cache` supplies 2 * kDCTBlockSize floats of per-thread scratch
// (one dequantized block plus IDCT scratch). Note `thread` is not used in
// this function body; it is part of the shared dispatch signature.
Status DecodeGroupJpeg(const Image3S& coeffs, size_t group_idx,
                       const Rect block_rect, const YCbCrChromaSubsampling& cs,
                       const float* dequant_matrices,
                       float* JXL_RESTRICT group_dec_cache, size_t thread,
                       RenderPipelineInput& render_pipeline_input) {
  HWY_ALIGN float* const block = group_dec_cache;
  HWY_ALIGN float* const scratch_space = block + kDCTBlockSize;
  // Per-channel chroma subsampling shifts (log2 of the subsampling factor).
  size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)};
  size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)};
  // Per-channel DC/AC quantization biases; the fourth entry is the shared
  // shrink factor applied to |quant| >= 2 (see AdjustQuantBias).
  static constexpr float kDefaultQuantBias[4] = {
      1.0f - 0.05465007330715401f,
      1.0f - 0.07005449891748593f,
      1.0f - 0.049935103337343655f,
      0.145f,
  };
  for (size_t c = 0; c < 3; ++c) {
    ImageF* rpbuffer = render_pipeline_input.GetBuffer(c).first;
    Rect rect = render_pipeline_input.GetBuffer(c).second;
    // Number of 8x8 blocks this group covers in the (possibly subsampled)
    // channel plane.
    size_t xsize_blocks = DivCeil(block_rect.xsize(), 1 << hshift[c]);
    size_t ysize_blocks = DivCeil(block_rect.ysize(), 1 << vshift[c]);
    // Blocks are stored consecutively in the group's coefficient row.
    size_t offset = 0;
    for (size_t by = 0; by < ysize_blocks; ++by) {
      float* JXL_RESTRICT idct_row = rect.Row(rpbuffer, by * kBlockDim);
      size_t idct_stride = rpbuffer->PixelsPerRow();
      for (size_t bx = 0; bx < xsize_blocks; ++bx) {
        const int16_t* qblock = &coeffs.PlaneRow(c, group_idx)[offset];
        offset += kDCTBlockSize;
        DequantBlock(qblock, c, dequant_matrices, kDefaultQuantBias, block);
        // IDCT
        float* JXL_RESTRICT idct_pos = idct_row + bx * kBlockDim;
        // JPEG XL transposes the DCT, JPEG doesn't.
        Transpose<8, 8>::Run(DCTFrom(block, 8), DCTTo(scratch_space, 8));
        TransformToPixels(AcStrategy::DCT, scratch_space, idct_pos, idct_stride,
                          block);
      }
    }
  }
  return true;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace jxl
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace jxl {
namespace {
HWY_EXPORT(DecodeGroupJpeg);
} // namespace
namespace extras {
// Thin dispatcher: forwards to the SIMD implementation that Highway selected
// for the running CPU (see HWY_EXPORT above).
Status DecodeGroupJpeg(const Image3S& coeffs, size_t group_idx,
                       const Rect block_rect, const YCbCrChromaSubsampling& cs,
                       const float* dequant_matrices,
                       float* JXL_RESTRICT group_dec_cache, size_t thread,
                       RenderPipelineInput& render_pipeline_input) {
  return HWY_DYNAMIC_DISPATCH(DecodeGroupJpeg)(
      coeffs, group_idx, block_rect, cs, dequant_matrices, group_dec_cache,
      thread, render_pipeline_input);
}
} // namespace extras
} // namespace jxl
#endif // HWY_ONCE

View File

@@ -0,0 +1,31 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DEC_GROUP_JPEG_H_
#define LIB_EXTRAS_DEC_GROUP_JPEG_H_
#include <stddef.h>
#include <stdint.h>
#include <vector>
#include "lib/jxl/base/status.h"
#include "lib/jxl/frame_header.h"
#include "lib/jxl/image.h"
#include "lib/jxl/render_pipeline/render_pipeline.h"
namespace jxl {
namespace extras {
// Dequantizes and inverse-DCTs all 8x8 blocks of AC group `group_idx` from
// `coeffs` and feeds the resulting samples into `render_pipeline_input`.
// `group_dec_cache` must point to at least 2 * kDCTBlockSize floats of
// per-thread scratch space.
Status DecodeGroupJpeg(const Image3S& coeffs, size_t group_idx,
                       const Rect block_rect, const YCbCrChromaSubsampling& cs,
                       const float* dequant_matrices,
                       float* JXL_RESTRICT group_dec_cache, size_t thread,
                       RenderPipelineInput& render_pipeline_input);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DEC_GROUP_JPEG_H_

View File

@@ -0,0 +1,274 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/decode_jpeg.h"
#include "lib/extras/dec_group_jpeg.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/color_encoding_internal.h"
#include "lib/jxl/common.h"
#include "lib/jxl/frame_header.h"
#include "lib/jxl/image.h"
#include "lib/jxl/jpeg/enc_jpeg_data.h"
#include "lib/jxl/jpeg/enc_jpeg_data_reader.h"
#include "lib/jxl/render_pipeline/render_pipeline.h"
#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h"
#include "lib/jxl/render_pipeline/stage_write.h"
#include "lib/jxl/render_pipeline/stage_ycbcr.h"
namespace jxl {
namespace extras {
namespace {
// Returns the rectangle of 8x8-block coordinates covered by AC group
// `group_index`, clipped to the frame's block dimensions.
Rect BlockGroupRect(const FrameDimensions& frame_dim, size_t group_index) {
  // A group spans group_dim pixels, i.e. group_dim / 8 blocks per side.
  const size_t group_dim_blocks = frame_dim.group_dim >> 3;
  const size_t gx = group_index % frame_dim.xsize_groups;
  const size_t gy = group_index / frame_dim.xsize_groups;
  return Rect(gx * group_dim_blocks, gy * group_dim_blocks, group_dim_blocks,
              group_dim_blocks, frame_dim.xsize_blocks,
              frame_dim.ysize_blocks);
}
// Returns the rectangle of block coordinates covered by DC group
// `group_index`, clipped to the frame's block dimensions.
Rect DCGroupRect(const FrameDimensions& frame_dim, size_t group_index) {
  const size_t gx = group_index % frame_dim.xsize_dc_groups;
  const size_t gy = group_index / frame_dim.xsize_dc_groups;
  return Rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim,
              frame_dim.group_dim, frame_dim.group_dim,
              frame_dim.xsize_blocks, frame_dim.ysize_blocks);
}
// Derives the chroma subsampling configuration from the JPEG components'
// sampling factors. For a 3-component JPEG each component's factors are used
// directly; a 1-component (grayscale) JPEG replicates component 0's factors
// across all three channels. Other component counts leave `cs` untouched.
Status SetChromaSubsamplingFromJpegData(const jpeg::JPEGData& jpeg_data,
                                        YCbCrChromaSubsampling* cs) {
  const size_t num_components = jpeg_data.components.size();
  if (num_components != 1 && num_components != 3) {
    return true;
  }
  uint8_t hsample[3];
  uint8_t vsample[3];
  for (size_t i = 0; i < 3; i++) {
    const auto& comp =
        jpeg_data.components[num_components == 1 ? 0 : i];
    hsample[i] = comp.h_samp_factor;
    vsample[i] = comp.v_samp_factor;
  }
  return cs->Set(hsample, vsample);
}
// Heuristically decides whether the JPEG carries YCbCr (as opposed to RGB)
// samples: a JFIF APP0 marker implies YCbCr; otherwise an Adobe APP14
// marker's transform byte decides; failing both, component IDs 'R','G','B'
// indicate RGB. A 1-component (grayscale) JPEG always yields true.
bool IsYCbCrJpeg(const jpeg::JPEGData& jpeg_data) {
  size_t nbcomp = jpeg_data.components.size();
  bool is_rgb = false;
  const auto& markers = jpeg_data.marker_order;
  // If there is a JFIF marker, this is YCbCr. Otherwise...
  if (std::find(markers.begin(), markers.end(), 0xE0) == markers.end()) {
    // Try to find an 'Adobe' marker.
    size_t app_markers = 0;
    size_t i = 0;
    for (; i < markers.size(); i++) {
      // This is an APP marker.
      if ((markers[i] & 0xF0) == 0xE0) {
        JXL_CHECK(app_markers < jpeg_data.app_data.size());
        // APP14 marker
        if (markers[i] == 0xEE) {
          const auto& data = jpeg_data.app_data[app_markers];
          if (data.size() == 15 && data[3] == 'A' && data[4] == 'd' &&
              data[5] == 'o' && data[6] == 'b' && data[7] == 'e') {
            // 'Adobe' marker.
            // Transform byte 0 means RGB; nonzero means a YCbCr/YCCK
            // transform was applied.
            is_rgb = data[14] == 0;
            break;
          }
        }
        app_markers++;
      }
    }
    if (i == markers.size()) {
      // No 'Adobe' marker, guess from component IDs.
      is_rgb = nbcomp == 3 && jpeg_data.components[0].id == 'R' &&
               jpeg_data.components[1].id == 'G' &&
               jpeg_data.components[2].id == 'B';
    }
  }
  return (!is_rgb || nbcomp == 1);
}
// Maps the internal channel order to JPEG component indices. Grayscale uses
// component 0 for all three planes; a YCbCr JPEG stores Y first, so the
// internal luma slot (index 1) maps to component 0.
inline std::array<int, 3> JpegOrder(bool is_ycbcr, bool is_gray) {
  if (is_gray) return {{0, 0, 0}};
  return is_ycbcr ? std::array<int, 3>{{1, 0, 2}}
                  : std::array<int, 3>{{0, 1, 2}};
}
// Builds the 3 * kDCTBlockSize float dequantization matrix from the JPEG
// quantization tables, mapping internal channel order to JPEG components via
// JpegOrder. kDequantScale = 1 / (8 * 255) — presumably the DCT scaling
// factor combined with the 8-bit sample range; TODO confirm against the
// IDCT normalization used in dec_group_jpeg.cc.
void SetDequantWeightsFromJpegData(const jpeg::JPEGData& jpeg_data,
                                   const bool is_ycbcr, float* dequant) {
  auto jpeg_c_map = JpegOrder(is_ycbcr, jpeg_data.components.size() == 1);
  const float kDequantScale = 1.0f / (8 * 255);
  for (size_t c = 0; c < 3; c++) {
    size_t jpeg_c = jpeg_c_map[c];
    const int32_t* quant =
        jpeg_data.quant[jpeg_data.components[jpeg_c].quant_idx].values.data();
    for (size_t k = 0; k < kDCTBlockSize; ++k) {
      dequant[c * kDCTBlockSize + k] = quant[k] * kDequantScale;
    }
  }
}
// Repacks the JPEG's per-component coefficient planes into `coeffs`, one row
// per AC group, with the group's 8x8 blocks stored consecutively (the layout
// DecodeGroupJpeg reads back). Grayscale inputs fill only plane 1 and
// zero-fill the others. For non-YCbCr (RGB) inputs the DC coefficient of
// each block is shifted by dc_level = 1024 / dcquant — presumably
// compensating a DC level offset before dequantization; TODO confirm.
void SetCoefficientsFromJpegData(const jpeg::JPEGData& jpeg_data,
                                 const FrameDimensions& frame_dim,
                                 const YCbCrChromaSubsampling& cs,
                                 const bool is_ycbcr, Image3S* coeffs) {
  auto jpeg_c_map = JpegOrder(is_ycbcr, jpeg_data.components.size() == 1);
  *coeffs = Image3S(kGroupDim * kGroupDim, frame_dim.num_groups);
  for (size_t c = 0; c < 3; ++c) {
    if (jpeg_data.components.size() == 1 && c != 1) {
      // Grayscale: only the luma plane carries data.
      ZeroFillImage(&coeffs->Plane(c));
      continue;
    }
    const auto& comp = jpeg_data.components[jpeg_c_map[c]];
    size_t hshift = cs.HShift(c);
    size_t vshift = cs.VShift(c);
    int dcquant = jpeg_data.quant[comp.quant_idx].values.data()[0];
    int16_t dc_level = 1024 / dcquant;
    // Stride (in coefficients) of one block row in the JPEG's own layout.
    size_t jpeg_stride = comp.width_in_blocks * kDCTBlockSize;
    for (size_t group_index = 0; group_index < frame_dim.num_groups;
         group_index++) {
      Rect block_rect = BlockGroupRect(frame_dim, group_index);
      // Block extents of this group in the (possibly subsampled) component.
      size_t xsize_blocks = DivCeil(block_rect.xsize(), 1 << hshift);
      size_t ysize_blocks = DivCeil(block_rect.ysize(), 1 << vshift);
      size_t group_xsize = xsize_blocks * kDCTBlockSize;
      size_t bx0 = block_rect.x0() >> hshift;
      size_t by0 = block_rect.y0() >> vshift;
      size_t jpeg_offset = by0 * jpeg_stride + bx0 * kDCTBlockSize;
      const int16_t* JXL_RESTRICT jpeg_coeffs =
          comp.coeffs.data() + jpeg_offset;
      int16_t* JXL_RESTRICT coeff_row = coeffs->PlaneRow(c, group_index);
      // Copy one block-row of this group at a time.
      for (size_t by = 0; by < ysize_blocks; ++by) {
        memcpy(&coeff_row[by * group_xsize], &jpeg_coeffs[by * jpeg_stride],
               group_xsize * sizeof(coeff_row[0]));
      }
      if (!is_ycbcr) {
        // Stepping by kDCTBlockSize touches only each block's DC coefficient.
        for (size_t offset = 0; offset < coeffs->xsize();
             offset += kDCTBlockSize) {
          coeff_row[offset] += dc_level;
        }
      }
    }
  }
}
// Builds the render pipeline that turns decoded samples into the packed
// output image: optional per-channel chroma upsampling, optional YCbCr->RGB
// conversion, then a write-to-buffer stage targeting `output`'s pixel
// storage. No alpha handling — the pipeline is configured alpha-free.
std::unique_ptr<RenderPipeline> PreparePipeline(
    const YCbCrChromaSubsampling& cs, const bool is_ycbcr,
    const FrameDimensions& frame_dim, PackedImage* output) {
  RenderPipeline::Builder builder(3);
  if (!cs.Is444()) {
    // Upsample each subsampled chroma channel back to full resolution.
    for (size_t c = 0; c < 3; c++) {
      if (cs.HShift(c) != 0) {
        builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/true));
      }
      if (cs.VShift(c) != 0) {
        builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/false));
      }
    }
  }
  if (is_ycbcr) {
    builder.AddStage(GetYCbCrStage());
  }
  // Final stage writes directly into the caller-owned PackedImage buffer.
  ImageOutput main_output;
  main_output.format = output->format;
  main_output.bits_per_sample =
      PackedImage::BitsPerChannel(output->format.data_type);
  main_output.buffer = reinterpret_cast<uint8_t*>(output->pixels());
  main_output.buffer_size = output->pixels_size;
  main_output.stride = output->stride;
  std::vector<ImageOutput> extra_output;
  builder.AddStage(GetWriteToOutputStage(
      main_output, output->xsize, output->ysize,
      /*has_alpha=*/false,
      /*unpremul_alpha=*/false,
      /*alpha_c=*/0, Orientation::kIdentity, extra_output));
  return std::move(builder).Finalize(frame_dim);
}
} // namespace
// Decodes a complete JPEG bitstream into `ppf`: parses the stream, fills the
// basic info and ICC profile, repacks coefficients into per-group planes,
// then dequantizes/IDCTs all groups (optionally in parallel on `pool`)
// through a render pipeline writing into a single output frame.
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
                  JxlDataType output_data_type, ThreadPool* pool,
                  PackedPixelFile* ppf) {
  jpeg::JPEGData jpeg_data;
  JXL_RETURN_IF_ERROR(jpeg::ReadJpeg(compressed.data(), compressed.size(),
                                     jpeg::JpegReadMode::kReadAll, &jpeg_data));
  const size_t xsize = jpeg_data.width;
  const size_t ysize = jpeg_data.height;
  const uint32_t nbcomp = jpeg_data.components.size();
  const bool is_ycbcr = IsYCbCrJpeg(jpeg_data);
  ppf->info.xsize = xsize;
  ppf->info.ysize = ysize;
  ppf->info.num_color_channels = nbcomp;
  // Output bit depth follows the requested pixel data type.
  ppf->info.bits_per_sample = PackedImage::BitsPerChannel(output_data_type);
  ColorEncoding color_encoding;
  JXL_RETURN_IF_ERROR(SetColorEncodingFromJpegData(jpeg_data, &color_encoding));
  PaddedBytes icc = color_encoding.ICC();
  ppf->icc.assign(icc.data(), icc.data() + icc.size());
  ConvertInternalToExternalColorEncoding(color_encoding, &ppf->color_encoding);
  YCbCrChromaSubsampling cs;
  JXL_RETURN_IF_ERROR(SetChromaSubsamplingFromJpegData(jpeg_data, &cs));
  FrameDimensions frame_dim;
  frame_dim.Set(xsize, ysize, /*group_size_shift=*/1, cs.MaxHShift(),
                cs.MaxVShift(),
                /*modular_mode=*/false, /*upsampling=*/1);
  std::vector<float> dequant(3 * kDCTBlockSize);
  SetDequantWeightsFromJpegData(jpeg_data, is_ycbcr, &dequant[0]);
  Image3S coeffs;
  SetCoefficientsFromJpegData(jpeg_data, frame_dim, cs, is_ycbcr, &coeffs);
  JxlPixelFormat format = {nbcomp, output_data_type, JXL_LITTLE_ENDIAN, 0};
  ppf->frames.emplace_back(xsize, ysize, format);
  auto& frame = ppf->frames.back();
  std::unique_ptr<RenderPipeline> render_pipeline =
      PreparePipeline(cs, is_ycbcr, frame_dim, &frame.color);
  JXL_RETURN_IF_ERROR(render_pipeline->IsInitialized());
  hwy::AlignedFreeUniquePtr<float[]> float_memory;
  // Allocates per-thread scratch (2 * kDCTBlockSize floats each) once the
  // pool reports how many workers it will use.
  const auto allocate_storage = [&](const size_t num_threads) -> Status {
    JXL_RETURN_IF_ERROR(
        render_pipeline->PrepareForThreads(num_threads,
                                           /*use_group_ids=*/false));
    float_memory = hwy::AllocateAligned<float>(kDCTBlockSize * 2 * num_threads);
    return true;
  };
  // Decodes one AC group into the pipeline; groups are independent, so they
  // can run on any thread.
  const auto process_group = [&](const uint32_t group_index,
                                 const size_t thread) {
    RenderPipelineInput input =
        render_pipeline->GetInputBuffers(group_index, thread);
    float* group_dec_cache = float_memory.get() + thread * kDCTBlockSize * 2;
    const Rect block_rect = BlockGroupRect(frame_dim, group_index);
    JXL_CHECK(DecodeGroupJpeg(coeffs, group_index, block_rect, cs, &dequant[0],
                              group_dec_cache, thread, input));
    input.Done();
  };
  JXL_CHECK(RunOnPool(pool, 0, frame_dim.num_groups, allocate_storage,
                      process_group, "Decode Groups"));
  return true;
}
} // namespace extras
} // namespace jxl

View File

@@ -0,0 +1,26 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DECODE_JPEG_H_
#define LIB_EXTRAS_DECODE_JPEG_H_
#include <stdint.h>
#include <vector>
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/data_parallel.h"
namespace jxl {
namespace extras {
// Decodes the JPEG bitstream in `compressed` into pixels of
// `output_data_type`, filling `ppf` (basic info, ICC profile and one frame).
// `pool` may be null for single-threaded decoding — TODO confirm against
// RunOnPool's contract.
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
                  JxlDataType output_data_type, ThreadPool* pool,
                  PackedPixelFile* ppf);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DECODE_JPEG_H_

View File

@@ -58,8 +58,10 @@ class APNGEncoder : public Encoder {
std::vector<JxlPixelFormat> formats;
for (const uint32_t num_channels : {1, 2, 3, 4}) {
for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
formats.push_back(JxlPixelFormat{num_channels, data_type,
JXL_BIG_ENDIAN, /*align=*/0});
for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
formats.push_back(
JxlPixelFormat{num_channels, data_type, endianness, /*align=*/0});
}
}
}
return formats;
@@ -233,21 +235,7 @@ Status APNGEncoder::EncodePackedPixelFileToAPNG(
} else {
memcpy(&out[0], in, out_size);
}
} else if (format.data_type == JXL_TYPE_FLOAT) {
float mul = 65535.0;
const uint8_t* p_in = in;
uint8_t* p_out = out.data();
for (size_t i = 0; i < num_samples; ++i, p_in += 4, p_out += 2) {
uint32_t val = (format.endianness == JXL_BIG_ENDIAN ? LoadBE32(p_in)
: LoadLE32(p_in));
float fval;
memcpy(&fval, &val, 4);
StoreBE16(static_cast<uint32_t>(fval * mul + 0.5), p_out);
}
} else {
return JXL_FAILURE("Unsupported pixel data type");
}
png_structp png_ptr;
png_infop info_ptr;

View File

@@ -40,6 +40,34 @@ Status Encoder::VerifyBasicInfo(const JxlBasicInfo& info) const {
return true;
}
Status Encoder::VerifyFormat(const JxlPixelFormat& format) const {
for (auto f : AcceptedFormats()) {
if (f.num_channels != format.num_channels) continue;
if (f.data_type != format.data_type) continue;
if (f.data_type == JXL_TYPE_UINT8 || f.endianness == format.endianness) {
return true;
}
}
return JXL_FAILURE("Format is not in the list of accepted formats.");
}
Status Encoder::VerifyBitDepth(JxlDataType data_type, uint32_t bits_per_sample,
uint32_t exponent_bits) const {
if ((data_type == JXL_TYPE_UINT8 &&
(bits_per_sample == 0 || bits_per_sample > 8 || exponent_bits != 0)) ||
(data_type == JXL_TYPE_UINT16 &&
(bits_per_sample <= 8 || bits_per_sample > 16 || exponent_bits != 0)) ||
(data_type == JXL_TYPE_FLOAT16 &&
(bits_per_sample != 16 || exponent_bits != 5)) ||
(data_type == JXL_TYPE_FLOAT &&
(bits_per_sample != 32 || exponent_bits != 8))) {
return JXL_FAILURE(
"Incompatible data_type %d and bit depth %u with exponent bits %u",
(int)data_type, bits_per_sample, exponent_bits);
}
return true;
}
Status Encoder::VerifyPackedImage(const PackedImage& image,
const JxlBasicInfo& info) const {
if (image.pixels() == nullptr) {
@@ -57,10 +85,10 @@ Status Encoder::VerifyPackedImage(const PackedImage& image,
image.format.num_channels != info_num_channels) {
return JXL_FAILURE("Frame size does not match image size");
}
if (info.bits_per_sample >
PackedImage::BitsPerChannel(image.format.data_type)) {
return JXL_FAILURE("Bit depth does not fit pixel data type");
}
JXL_RETURN_IF_ERROR(VerifyFormat(image.format));
JXL_RETURN_IF_ERROR(VerifyBitDepth(image.format.data_type,
info.bits_per_sample,
info.exponent_bits_per_sample));
return true;
}

View File

@@ -60,6 +60,11 @@ class Encoder {
Status VerifyBasicInfo(const JxlBasicInfo& info) const;
Status VerifyFormat(const JxlPixelFormat& format) const;
Status VerifyBitDepth(JxlDataType data_type, uint32_t bits_per_sample,
uint32_t exponent_bits) const;
Status VerifyPackedImage(const PackedImage& image,
const JxlBasicInfo& info) const;

View File

@@ -111,7 +111,7 @@ Status EncodeWithLibJpeg(const PackedImage& image, const JxlBasicInfo& info,
const std::vector<uint8_t>& icc,
std::vector<uint8_t> exif, size_t quality,
const std::string& chroma_subsampling,
std::vector<uint8_t>* bytes) {
bool progressive, std::vector<uint8_t>* bytes) {
if (BITS_IN_JSAMPLE != 8 || sizeof(JSAMPLE) != 1) {
return JXL_FAILURE("Only 8 bit JSAMPLE is supported.");
}
@@ -139,6 +139,9 @@ Status EncodeWithLibJpeg(const PackedImage& image, const JxlBasicInfo& info,
jpeg_set_colorspace(&cinfo, JCS_RGB);
}
jpeg_set_quality(&cinfo, quality, TRUE);
if (progressive) {
jpeg_simple_progression(&cinfo);
}
jpeg_start_compress(&cinfo, TRUE);
if (!icc.empty()) {
WriteICCProfile(&cinfo, icc);
@@ -209,7 +212,8 @@ Status EncodeImageJPG(const PackedImage& image, const JxlBasicInfo& info,
const std::vector<uint8_t>& icc,
std::vector<uint8_t> exif, JpegEncoder encoder,
size_t quality, const std::string& chroma_subsampling,
ThreadPool* pool, std::vector<uint8_t>* bytes) {
bool progressive, ThreadPool* pool,
std::vector<uint8_t>* bytes) {
if (image.format.data_type != JXL_TYPE_UINT8) {
return JXL_FAILURE("Unsupported pixel data type");
}
@@ -222,9 +226,9 @@ Status EncodeImageJPG(const PackedImage& image, const JxlBasicInfo& info,
switch (encoder) {
case JpegEncoder::kLibJpeg:
JXL_RETURN_IF_ERROR(EncodeWithLibJpeg(image, info, color_encoding, icc,
std::move(exif), quality,
chroma_subsampling, bytes));
JXL_RETURN_IF_ERROR(
EncodeWithLibJpeg(image, info, color_encoding, icc, std::move(exif),
quality, chroma_subsampling, progressive, bytes));
break;
case JpegEncoder::kSJpeg:
JXL_RETURN_IF_ERROR(EncodeWithSJpeg(image, info, icc, std::move(exif),
@@ -253,28 +257,26 @@ class JPEGEncoder : public Encoder {
Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
ThreadPool* pool = nullptr) const override {
JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
const auto& options = this->options();
int quality = 100;
auto it_quality = options.find("q");
if (it_quality != options.end()) {
std::istringstream is(it_quality->second);
JXL_RETURN_IF_ERROR(static_cast<bool>(is >> quality));
}
std::string chroma_subsampling = "444";
auto it_chroma_subsampling = options.find("chroma_subsampling");
if (it_chroma_subsampling != options.end()) {
chroma_subsampling = it_chroma_subsampling->second;
}
JpegEncoder jpeg_encoder = JpegEncoder::kLibJpeg;
auto it_encoder = options.find("jpeg_encoder");
if (it_encoder != options.end()) {
if (it_encoder->second == "libjpeg") {
jpeg_encoder = JpegEncoder::kLibJpeg;
} else if (it_encoder->second == "sjpeg") {
jpeg_encoder = JpegEncoder::kSJpeg;
} else {
return JXL_FAILURE("unknown jpeg encoder \"%s\"",
it_encoder->second.c_str());
bool progressive = false;
for (const auto& it : options()) {
if (it.first == "q") {
std::istringstream is(it.second);
JXL_RETURN_IF_ERROR(static_cast<bool>(is >> quality));
} else if (it.first == "chroma_subsampling") {
chroma_subsampling = it.second;
} else if (it.first == "jpeg_encoder") {
if (it.second == "libjpeg") {
jpeg_encoder = JpegEncoder::kLibJpeg;
} else if (it.second == "sjpeg") {
jpeg_encoder = JpegEncoder::kSJpeg;
} else {
return JXL_FAILURE("unknown jpeg encoder \"%s\"", it.second.c_str());
}
} else if (it.first == "progressive") {
progressive = true;
}
}
std::vector<uint8_t> icc;
@@ -288,7 +290,7 @@ class JPEGEncoder : public Encoder {
encoded_image->bitstreams.emplace_back();
JXL_RETURN_IF_ERROR(EncodeImageJPG(
frame.color, ppf.info, ppf.color_encoding, icc, ppf.metadata.exif,
jpeg_encoder, quality, chroma_subsampling, pool,
jpeg_encoder, quality, chroma_subsampling, progressive, pool,
&encoded_image->bitstreams.back()));
}
return true;

View File

@@ -0,0 +1,231 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/enc/jxl.h"
#include "jxl/encode_cxx.h"
namespace jxl {
namespace extras {
// Applies a single encoder frame option, dispatching on whether the stored
// value is a float or an integer.
JxlEncoderStatus SetOption(const JXLOption& opt,
                           JxlEncoderFrameSettings* settings) {
  if (opt.is_float) {
    return JxlEncoderFrameSettingsSetFloatOption(settings, opt.id, opt.fval);
  }
  return JxlEncoderFrameSettingsSetOption(settings, opt.id, opt.ival);
}
bool SetFrameOptions(const std::vector<JXLOption>& options, size_t frame_index,
size_t* option_idx, JxlEncoderFrameSettings* settings) {
while (*option_idx < options.size()) {
const auto& opt = options[*option_idx];
if (opt.frame_index > frame_index) {
break;
}
if (JXL_ENC_SUCCESS != SetOption(opt, settings)) {
fprintf(stderr, "Setting option id %d failed.\n", opt.id);
return false;
}
(*option_idx)++;
}
return true;
}
// Encodes `ppf` (or, when `jpeg_bytes` is non-null, losslessly recompresses
// the given JPEG bitstream) into a JPEG XL codestream written to
// `compressed`. Errors are reported to stderr and return false.
bool EncodeImageJXL(const JXLCompressParams& params, const PackedPixelFile& ppf,
                    const std::vector<uint8_t>* jpeg_bytes,
                    std::vector<uint8_t>* compressed) {
  auto encoder = JxlEncoderMake(/*memory_manager=*/nullptr);
  JxlEncoder* enc = encoder.get();
  if (params.runner_opaque != nullptr &&
      JXL_ENC_SUCCESS != JxlEncoderSetParallelRunner(enc, params.runner,
                                                     params.runner_opaque)) {
    fprintf(stderr, "JxlEncoderSetParallelRunner failed\n");
    return false;
  }
  auto settings = JxlEncoderFrameSettingsCreate(enc, nullptr);
  // Apply options targeting frame 0 before any frame is added.
  size_t option_idx = 0;
  if (!SetFrameOptions(params.options, 0, &option_idx, settings)) {
    return false;
  }
  if (JXL_ENC_SUCCESS !=
      JxlEncoderSetFrameDistance(settings, params.distance)) {
    fprintf(stderr, "Setting frame distance failed.\n");
    return false;
  }
  // Metadata boxes and stored JPEG reconstruction data require the container
  // format even if the caller did not ask for it.
  bool use_container = params.use_container;
  if (!ppf.metadata.exif.empty() || !ppf.metadata.xmp.empty() ||
      !ppf.metadata.jumbf.empty() || !ppf.metadata.iptc.empty() ||
      (jpeg_bytes && params.jpeg_store_metadata)) {
    use_container = true;
  }
  if (JXL_ENC_SUCCESS !=
      JxlEncoderUseContainer(enc, static_cast<int>(use_container))) {
    fprintf(stderr, "JxlEncoderUseContainer failed.\n");
    return false;
  }
  if (jpeg_bytes) {
    // JPEG transcoding path: hand the original bitstream to the encoder.
    if (params.jpeg_store_metadata &&
        JXL_ENC_SUCCESS != JxlEncoderStoreJPEGMetadata(enc, JXL_TRUE)) {
      fprintf(stderr, "Storing JPEG metadata failed.\n");
      return false;
    }
    if (JXL_ENC_SUCCESS != JxlEncoderAddJPEGFrame(settings, jpeg_bytes->data(),
                                                  jpeg_bytes->size())) {
      fprintf(stderr, "JxlEncoderAddJPEGFrame() failed.\n");
      return false;
    }
  } else {
    // Pixel path: configure basic info, color encoding and extra channels,
    // then add every frame of the PackedPixelFile.
    size_t num_alpha_channels = 0;  // Adjusted below.
    JxlBasicInfo basic_info = ppf.info;
    if (basic_info.alpha_bits > 0) num_alpha_channels = 1;
    if (params.intensity_target > 0) {
      basic_info.intensity_target = params.intensity_target;
    }
    basic_info.num_extra_channels =
        std::max<uint32_t>(num_alpha_channels, ppf.info.num_extra_channels);
    basic_info.num_color_channels = ppf.info.num_color_channels;
    const bool lossless = params.distance == 0;
    basic_info.uses_original_profile = lossless;
    if (params.override_bitdepth != 0) {
      basic_info.bits_per_sample = params.override_bitdepth;
      // 32-bit override implies single-precision float (8 exponent bits).
      basic_info.exponent_bits_per_sample =
          params.override_bitdepth == 32 ? 8 : 0;
    }
    if (JXL_ENC_SUCCESS !=
        JxlEncoderSetCodestreamLevel(enc, params.codestream_level)) {
      fprintf(stderr, "Setting --codestream_level failed.\n");
      return false;
    }
    if (JXL_ENC_SUCCESS != JxlEncoderSetBasicInfo(enc, &basic_info)) {
      fprintf(stderr, "JxlEncoderSetBasicInfo() failed.\n");
      return false;
    }
    if (JXL_ENC_SUCCESS !=
        JxlEncoderSetFrameBitDepth(settings, &params.input_bitdepth)) {
      fprintf(stderr, "JxlEncoderSetFrameBitDepth() failed.\n");
      return false;
    }
    if (lossless &&
        JXL_ENC_SUCCESS != JxlEncoderSetFrameLossless(settings, JXL_TRUE)) {
      fprintf(stderr, "JxlEncoderSetFrameLossless() failed.\n");
      return false;
    }
    // Prefer an explicit ICC profile over the enum-based color encoding.
    if (!ppf.icc.empty()) {
      if (JXL_ENC_SUCCESS !=
          JxlEncoderSetICCProfile(enc, ppf.icc.data(), ppf.icc.size())) {
        fprintf(stderr, "JxlEncoderSetICCProfile() failed.\n");
        return false;
      }
    } else {
      if (JXL_ENC_SUCCESS !=
          JxlEncoderSetColorEncoding(enc, &ppf.color_encoding)) {
        fprintf(stderr, "JxlEncoderSetColorEncoding() failed.\n");
        return false;
      }
    }
    for (size_t num_frame = 0; num_frame < ppf.frames.size(); ++num_frame) {
      const jxl::extras::PackedFrame& pframe = ppf.frames[num_frame];
      const jxl::extras::PackedImage& pimage = pframe.color;
      JxlPixelFormat ppixelformat = pimage.format;
      if (JXL_ENC_SUCCESS !=
          JxlEncoderSetFrameHeader(settings, &pframe.frame_info)) {
        fprintf(stderr, "JxlEncoderSetFrameHeader() failed.\n");
        return false;
      }
      // Apply any options targeted at this specific frame.
      if (!SetFrameOptions(params.options, num_frame, &option_idx, settings)) {
        return false;
      }
      if (num_alpha_channels > 0) {
        JxlExtraChannelInfo extra_channel_info;
        JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &extra_channel_info);
        extra_channel_info.bits_per_sample = ppf.info.alpha_bits;
        extra_channel_info.exponent_bits_per_sample =
            ppf.info.alpha_exponent_bits;
        if (params.premultiply != -1) {
          if (params.premultiply != 0 && params.premultiply != 1) {
            fprintf(stderr, "premultiply must be one of: -1, 0, 1.\n");
            return false;
          }
          extra_channel_info.alpha_premultiplied = params.premultiply;
        }
        if (JXL_ENC_SUCCESS !=
            JxlEncoderSetExtraChannelInfo(enc, 0, &extra_channel_info)) {
          fprintf(stderr, "JxlEncoderSetExtraChannelInfo() failed.\n");
          return false;
        }
        // We take the extra channel blend info frame_info, but don't do
        // clamping.
        JxlBlendInfo extra_channel_blend_info =
            pframe.frame_info.layer_info.blend_info;
        extra_channel_blend_info.clamp = JXL_FALSE;
        JxlEncoderSetExtraChannelBlendInfo(settings, 0,
                                           &extra_channel_blend_info);
      }
      // Channels interleaved with the color data (i.e. alpha) come before
      // the separately-stored extra channels in the channel index space.
      size_t num_interleaved_alpha =
          (ppixelformat.num_channels - ppf.info.num_color_channels);
      // Add extra channel info for the rest of the extra channels.
      for (size_t i = 0; i < ppf.info.num_extra_channels; ++i) {
        if (i < ppf.extra_channels_info.size()) {
          const auto& ec_info = ppf.extra_channels_info[i].ec_info;
          if (JXL_ENC_SUCCESS !=
              JxlEncoderSetExtraChannelInfo(enc, num_interleaved_alpha + i,
                                            &ec_info)) {
            fprintf(stderr, "JxlEncoderSetExtraChannelInfo() failed.\n");
            return false;
          }
        }
      }
      if (JXL_ENC_SUCCESS != JxlEncoderAddImageFrame(settings, &ppixelformat,
                                                     pimage.pixels(),
                                                     pimage.pixels_size)) {
        fprintf(stderr, "JxlEncoderAddImageFrame() failed.\n");
        return false;
      }
      // Only set extra channel buffer if it is provided non-interleaved.
      for (size_t i = 0; i < pframe.extra_channels.size(); ++i) {
        if (JXL_ENC_SUCCESS !=
            JxlEncoderSetExtraChannelBuffer(settings, &ppixelformat,
                                            pframe.extra_channels[i].pixels(),
                                            pframe.extra_channels[i].stride *
                                                pframe.extra_channels[i].ysize,
                                            num_interleaved_alpha + i)) {
          fprintf(stderr, "JxlEncoderSetExtraChannelBuffer() failed.\n");
          return false;
        }
      }
    }
  }
  JxlEncoderCloseInput(enc);
  // Reading compressed output
  // Pull encoded bytes, doubling the buffer until the encoder is done.
  compressed->clear();
  compressed->resize(4096);
  uint8_t* next_out = compressed->data();
  size_t avail_out = compressed->size() - (next_out - compressed->data());
  JxlEncoderStatus result = JXL_ENC_NEED_MORE_OUTPUT;
  while (result == JXL_ENC_NEED_MORE_OUTPUT) {
    result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
    if (result == JXL_ENC_NEED_MORE_OUTPUT) {
      size_t offset = next_out - compressed->data();
      compressed->resize(compressed->size() * 2);
      next_out = compressed->data() + offset;
      avail_out = compressed->size() - offset;
    }
  }
  // Trim to the bytes actually produced.
  compressed->resize(next_out - compressed->data());
  if (result != JXL_ENC_SUCCESS) {
    fprintf(stderr, "JxlEncoderProcessOutput failed.\n");
    return false;
  }
  return true;
}
} // namespace extras
} // namespace jxl

View File

@@ -0,0 +1,73 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_ENC_JXL_H_
#define LIB_EXTRAS_ENC_JXL_H_
#include <stdint.h>
#include <vector>
#include "jxl/encode.h"
#include "jxl/parallel_runner.h"
#include "jxl/thread_parallel_runner.h"
#include "jxl/types.h"
#include "lib/extras/packed_image.h"
namespace jxl {
namespace extras {
struct JXLOption {
JXLOption(JxlEncoderFrameSettingId id, int64_t val, size_t frame_index)
: id(id), is_float(false), ival(val), frame_index(frame_index) {}
JXLOption(JxlEncoderFrameSettingId id, float val, size_t frame_index)
: id(id), is_float(true), fval(val), frame_index(frame_index) {}
JxlEncoderFrameSettingId id;
bool is_float;
union {
int64_t ival;
float fval;
};
size_t frame_index;
};
struct JXLCompressParams {
std::vector<JXLOption> options;
// Target butteraugli distance, 0.0 means lossless.
float distance = 1.0f;
// If set to true, forces container mode.
bool use_container = false;
// Whether to enable/disable byte-exact jpeg reconstruction for jpeg inputs.
bool jpeg_store_metadata = true;
// Upper bound on the intensity level present in the image in nits (zero means
// that the library chooses a default).
float intensity_target = 0;
// Overrides for bitdepth, codestream level and alpha premultiply.
size_t override_bitdepth = 0;
int32_t codestream_level = -1;
int32_t premultiply = -1;
// Override input buffer interpretation.
JxlBitDepth input_bitdepth = {JXL_BIT_DEPTH_FROM_PIXEL_FORMAT, 0, 0};
// If runner_opaque is set, the decoder uses this parallel runner.
JxlParallelRunner runner = JxlThreadParallelRunner;
void* runner_opaque = nullptr;
void AddOption(JxlEncoderFrameSettingId id, int64_t val) {
options.emplace_back(JXLOption(id, val, 0));
}
void AddFloatOption(JxlEncoderFrameSettingId id, float val) {
options.emplace_back(JXLOption(id, val, 0));
}
};
bool EncodeImageJXL(const JXLCompressParams& params, const PackedPixelFile& ppf,
const std::vector<uint8_t>* jpeg_bytes,
std::vector<uint8_t>* compressed);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_ENC_JXL_H_

View File

@@ -32,69 +32,6 @@ namespace {
constexpr size_t kMaxHeaderSize = 200;
Status EncodeHeader(const PackedImage& image, size_t bits_per_sample,
bool little_endian, char* header, int* chars_written) {
size_t num_channels = image.format.num_channels;
bool is_gray = num_channels <= 2;
bool has_alpha = num_channels == 2 || num_channels == 4;
if (has_alpha) { // PAM
if (bits_per_sample > 16) return JXL_FAILURE("PNM cannot have > 16 bits");
const uint32_t max_val = (1U << bits_per_sample) - 1;
*chars_written =
snprintf(header, kMaxHeaderSize,
"P7\nWIDTH %" PRIuS "\nHEIGHT %" PRIuS
"\nDEPTH %u\nMAXVAL %u\nTUPLTYPE %s\nENDHDR\n",
image.xsize, image.ysize, is_gray ? 2 : 4, max_val,
is_gray ? "GRAYSCALE_ALPHA" : "RGB_ALPHA");
JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
kMaxHeaderSize);
} else if (bits_per_sample == 32) { // PFM
const char type = is_gray ? 'f' : 'F';
const double scale = little_endian ? -1.0 : 1.0;
*chars_written =
snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%.1f\n",
type, image.xsize, image.ysize, scale);
JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
kMaxHeaderSize);
} else { // PGM/PPM
if (bits_per_sample > 16) return JXL_FAILURE("PNM cannot have > 16 bits");
const uint32_t max_val = (1U << bits_per_sample) - 1;
const char type = is_gray ? '5' : '6';
*chars_written =
snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%u\n",
type, image.xsize, image.ysize, max_val);
JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
kMaxHeaderSize);
}
return true;
}
Status EncodeImagePNM(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) {
if (bits_per_sample <= 16 && image.format.endianness != JXL_BIG_ENDIAN) {
return JXL_FAILURE("PPM/PGM requires big-endian pixel format.");
}
bool is_little_endian =
(image.format.endianness == JXL_LITTLE_ENDIAN ||
(image.format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian()));
char header[kMaxHeaderSize];
int header_size = 0;
JXL_RETURN_IF_ERROR(EncodeHeader(image, bits_per_sample, is_little_endian,
header, &header_size));
bytes->resize(static_cast<size_t>(header_size) + image.pixels_size);
memcpy(bytes->data(), header, static_cast<size_t>(header_size));
const bool flipped_y = bits_per_sample == 32; // PFMs are flipped
const uint8_t* in = reinterpret_cast<const uint8_t*>(image.pixels());
uint8_t* out = bytes->data() + header_size;
for (size_t y = 0; y < image.ysize; ++y) {
size_t y_out = flipped_y ? image.ysize - 1 - y : y;
const uint8_t* row_in = &in[y * image.stride];
uint8_t* row_out = &out[y_out * image.stride];
memcpy(row_out, row_in, image.stride);
}
return true;
}
class PNMEncoder : public Encoder {
public:
Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
@@ -110,8 +47,8 @@ class PNMEncoder : public Encoder {
for (const auto& frame : ppf.frames) {
JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
encoded_image->bitstreams.emplace_back();
JXL_RETURN_IF_ERROR(EncodeImagePNM(frame.color, ppf.info.bits_per_sample,
&encoded_image->bitstreams.back()));
JXL_RETURN_IF_ERROR(
EncodeFrame(ppf, frame, &encoded_image->bitstreams.back()));
}
for (size_t i = 0; i < ppf.extra_channels_info.size(); ++i) {
const auto& ec_info = ppf.extra_channels_info[i].ec_info;
@@ -119,30 +56,61 @@ class PNMEncoder : public Encoder {
auto& ec_bitstreams = encoded_image->extra_channel_bitstreams.back();
for (const auto& frame : ppf.frames) {
ec_bitstreams.emplace_back();
JXL_RETURN_IF_ERROR(EncodeImagePNM(frame.extra_channels[i],
ec_info.bits_per_sample,
&ec_bitstreams.back()));
JXL_RETURN_IF_ERROR(EncodeExtraChannel(frame.extra_channels[i],
ec_info.bits_per_sample,
&ec_bitstreams.back()));
}
}
return true;
}
protected:
virtual Status EncodeFrame(const PackedPixelFile& ppf,
const PackedFrame& frame,
std::vector<uint8_t>* bytes) const = 0;
virtual Status EncodeExtraChannel(const PackedImage& image,
size_t bits_per_sample,
std::vector<uint8_t>* bytes) const = 0;
};
class PPMEncoder : public PNMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats;
for (const uint32_t num_channels : {1, 2, 3, 4}) {
for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
for (JxlEndianness endianness : {JXL_BIG_ENDIAN}) {
formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
/*data_type=*/data_type,
/*endianness=*/endianness,
/*align=*/0});
}
}
}
return formats;
return {JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0},
JxlPixelFormat{3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}};
}
Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
std::vector<uint8_t>* bytes) const override {
return EncodeImage(frame.color, ppf.info.bits_per_sample, bytes);
}
Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) const override {
return EncodeImage(image, bits_per_sample, bytes);
}
private:
Status EncodeImage(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) const {
uint32_t maxval = (1u << bits_per_sample) - 1;
char type = image.format.num_channels == 1 ? '5' : '6';
char header[kMaxHeaderSize];
size_t header_size =
snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%u\n",
type, image.xsize, image.ysize, maxval);
JXL_RETURN_IF_ERROR(header_size < kMaxHeaderSize);
bytes->resize(header_size + image.pixels_size);
memcpy(bytes->data(), header, header_size);
memcpy(bytes->data() + header_size,
reinterpret_cast<uint8_t*>(image.pixels()), image.pixels_size);
return true;
}
};
class PGMEncoder : public PPMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
return {JxlPixelFormat{1, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0},
JxlPixelFormat{1, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}};
}
};
@@ -151,54 +119,168 @@ class PFMEncoder : public PNMEncoder {
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats;
for (const uint32_t num_channels : {1, 3}) {
for (const JxlDataType data_type : {JXL_TYPE_FLOAT16, JXL_TYPE_FLOAT}) {
for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
/*data_type=*/data_type,
/*endianness=*/endianness,
/*align=*/0});
for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
/*data_type=*/JXL_TYPE_FLOAT,
/*endianness=*/endianness,
/*align=*/0});
}
}
return formats;
}
Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
std::vector<uint8_t>* bytes) const override {
return EncodeImage(frame.color, bytes);
}
Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) const override {
return EncodeImage(image, bytes);
}
private:
Status EncodeImage(const PackedImage& image,
std::vector<uint8_t>* bytes) const {
char type = image.format.num_channels == 1 ? 'f' : 'F';
double scale = image.format.endianness == JXL_LITTLE_ENDIAN ? -1.0 : 1.0;
char header[kMaxHeaderSize];
size_t header_size =
snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%.1f\n",
type, image.xsize, image.ysize, scale);
JXL_RETURN_IF_ERROR(header_size < kMaxHeaderSize);
bytes->resize(header_size + image.pixels_size);
memcpy(bytes->data(), header, header_size);
const uint8_t* in = reinterpret_cast<const uint8_t*>(image.pixels());
uint8_t* out = bytes->data() + header_size;
for (size_t y = 0; y < image.ysize; ++y) {
size_t y_out = image.ysize - 1 - y;
const uint8_t* row_in = &in[y * image.stride];
uint8_t* row_out = &out[y_out * image.stride];
memcpy(row_out, row_in, image.stride);
}
return true;
}
};
class PAMEncoder : public PNMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats;
for (const uint32_t num_channels : {1, 2, 3, 4}) {
for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
/*data_type=*/data_type,
/*endianness=*/JXL_BIG_ENDIAN,
/*align=*/0});
}
}
return formats;
}
Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
std::vector<uint8_t>* bytes) const override {
const PackedImage& color = frame.color;
const auto& ec_info = ppf.extra_channels_info;
JXL_RETURN_IF_ERROR(frame.extra_channels.size() == ec_info.size());
for (const auto& ec : frame.extra_channels) {
if (ec.xsize != color.xsize || ec.ysize != color.ysize) {
return JXL_FAILURE("Extra channel and color size mismatch.");
}
if (ec.format.data_type != color.format.data_type ||
ec.format.endianness != color.format.endianness) {
return JXL_FAILURE("Extra channel and color format mismatch.");
}
}
if (ppf.info.bits_per_sample != ppf.info.alpha_bits) {
return JXL_FAILURE("Alpha bit depth does not match image bit depth");
}
for (const auto& it : ec_info) {
if (it.ec_info.bits_per_sample != ppf.info.bits_per_sample) {
return JXL_FAILURE(
"Extra channel bit depth does not match image bit depth");
}
}
const char* kColorTypes[4] = {"GRAYSCALE", "GRAYSCALE_ALPHA", "RGB",
"RGB_ALPHA"};
uint32_t maxval = (1u << ppf.info.bits_per_sample) - 1;
uint32_t depth = color.format.num_channels + ec_info.size();
char header[kMaxHeaderSize];
size_t pos = 0;
pos += snprintf(header + pos, kMaxHeaderSize - pos,
"P7\nWIDTH %" PRIuS "\nHEIGHT %" PRIuS
"\nDEPTH %u\n"
"MAXVAL %u\nTUPLTYPE %s\n",
color.xsize, color.ysize, depth, maxval,
kColorTypes[color.format.num_channels - 1]);
JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
for (const auto& info : ec_info) {
pos += snprintf(header + pos, kMaxHeaderSize - pos, "TUPLTYPE %s\n",
ExtraChannelTypeName(info.ec_info.type).c_str());
JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
}
pos += snprintf(header + pos, kMaxHeaderSize - pos, "ENDHDR\n");
JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
size_t total_size = color.pixels_size;
for (const auto& ec : frame.extra_channels) {
total_size += ec.pixels_size;
}
bytes->resize(pos + total_size);
memcpy(bytes->data(), header, pos);
// If we have no extra channels, just copy color pixel data over.
if (frame.extra_channels.empty()) {
memcpy(bytes->data() + pos, reinterpret_cast<uint8_t*>(color.pixels()),
color.pixels_size);
return true;
}
// Interleave color and extra channels.
const uint8_t* in = reinterpret_cast<const uint8_t*>(color.pixels());
std::vector<const uint8_t*> ec_in(frame.extra_channels.size());
for (size_t i = 0; i < frame.extra_channels.size(); ++i) {
ec_in[i] =
reinterpret_cast<const uint8_t*>(frame.extra_channels[i].pixels());
}
uint8_t* out = bytes->data() + pos;
size_t pwidth = PackedImage::BitsPerChannel(color.format.data_type) / 8;
for (size_t y = 0; y < color.ysize; ++y) {
for (size_t x = 0; x < color.xsize; ++x) {
memcpy(out, in, color.pixel_stride());
out += color.pixel_stride();
in += color.pixel_stride();
for (auto& p : ec_in) {
memcpy(out, p, pwidth);
out += pwidth;
p += pwidth;
}
}
}
return formats;
return true;
}
Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
std::vector<uint8_t>* bytes) const override {
return true;
}
};
class PGMEncoder : public PPMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats = PPMEncoder::AcceptedFormats();
for (auto it = formats.begin(); it != formats.end();) {
if (it->num_channels > 2) {
it = formats.erase(it);
} else {
++it;
}
private:
static std::string ExtraChannelTypeName(JxlExtraChannelType type) {
switch (type) {
case JXL_CHANNEL_ALPHA:
return std::string("Alpha");
case JXL_CHANNEL_DEPTH:
return std::string("Depth");
case JXL_CHANNEL_SPOT_COLOR:
return std::string("SpotColor");
case JXL_CHANNEL_SELECTION_MASK:
return std::string("SelectionMask");
case JXL_CHANNEL_BLACK:
return std::string("Black");
case JXL_CHANNEL_CFA:
return std::string("CFA");
case JXL_CHANNEL_THERMAL:
return std::string("Thermal");
default:
return std::string("UNKNOWN");
}
return formats;
}
};
class PAMEncoder : public PPMEncoder {
public:
std::vector<JxlPixelFormat> AcceptedFormats() const override {
std::vector<JxlPixelFormat> formats = PPMEncoder::AcceptedFormats();
for (auto it = formats.begin(); it != formats.end();) {
if (it->num_channels != 2 && it->num_channels != 4) {
it = formats.erase(it);
} else {
++it;
}
}
return formats;
}
};
Span<const uint8_t> MakeSpan(const char* str) {
return Span<const uint8_t>(reinterpret_cast<const uint8_t*>(str),
strlen(str));
}
} // namespace
std::unique_ptr<Encoder> GetPPMEncoder() {

View File

@@ -31,27 +31,42 @@ namespace HWY_NAMESPACE {
void ComputeDCTCoefficients(const Image3F& opsin, const ImageF& qf,
const FrameDimensions& frame_dim, const float* qm,
std::vector<jpeg::JPEGComponent>* components) {
int max_samp_factor = 1;
for (const auto& c : *components) {
JXL_DASSERT(c.h_samp_factor == c.v_samp_factor);
max_samp_factor = std::max(c.h_samp_factor, max_samp_factor);
}
float qfmin, qfmax;
ImageMinMax(qf, &qfmin, &qfmax);
HWY_ALIGN float scratch_space[2 * kDCTBlockSize];
ImageF tmp;
for (size_t c = 0; c < 3; c++) {
std::vector<jpeg::coeff_t>& coeffs = (*components)[c].coeffs;
size_t num_blocks = frame_dim.xsize_blocks * frame_dim.ysize_blocks;
coeffs.resize(num_blocks * kDCTBlockSize);
auto& comp = (*components)[c];
const size_t xsize_blocks = comp.width_in_blocks;
const size_t ysize_blocks = comp.height_in_blocks;
JXL_DASSERT(max_samp_factor % comp.h_samp_factor == 0);
const int factor = max_samp_factor / comp.h_samp_factor;
const ImageF* plane = &opsin.Plane(c);
if (factor > 1) {
tmp = CopyImage(*plane);
DownsampleImage(&tmp, factor);
plane = &tmp;
}
std::vector<jpeg::coeff_t>& coeffs = comp.coeffs;
coeffs.resize(xsize_blocks * ysize_blocks * kDCTBlockSize);
const float* qmc = &qm[c * kDCTBlockSize];
for (size_t by = 0, bix = 0; by < frame_dim.ysize_blocks; by++) {
for (size_t bx = 0; bx < frame_dim.xsize_blocks; bx++, bix++) {
HWY_ALIGN float dct[kDCTBlockSize];
TransformFromPixels(AcStrategy::Type::DCT,
opsin.PlaneRow(c, 8 * by) + 8 * bx,
opsin.PixelsPerRow(), dct, scratch_space);
for (size_t by = 0, bix = 0; by < ysize_blocks; by++) {
for (size_t bx = 0; bx < xsize_blocks; bx++, bix++) {
jpeg::coeff_t* block = &coeffs[bix * kDCTBlockSize];
HWY_ALIGN float dct[kDCTBlockSize];
TransformFromPixels(AcStrategy::Type::DCT, plane->Row(8 * by) + 8 * bx,
plane->PixelsPerRow(), dct, scratch_space);
for (size_t iy = 0, i = 0; iy < 8; iy++) {
for (size_t ix = 0; ix < 8; ix++, i++) {
float coeff = 2040 * dct[i] * qmc[i];
// Create more zeros in areas where jpeg xl would have used a lower
// quantization multiplier.
float zero_bias = 0.5f * qfmax / qf.Row(by)[bx];
float zero_bias = 0.5f * qfmax / qf.Row(by * factor)[bx * factor];
int cc = std::abs(coeff) < zero_bias ? 0 : std::round(coeff);
// If the relative value of the adaptive quantization field is less
// than 0.5, we drop the least significant bit.
@@ -102,7 +117,7 @@ std::vector<uint8_t> CreateXybICCAppMarker() {
return icc_marker;
}
void AddJpegQuantMatrices(const ImageF& qf, float dc_quant,
void AddJpegQuantMatrices(const ImageF& qf, float dc_quant, float global_scale,
std::vector<jpeg::JPEGQuantTable>* quant_tables,
float* qm) {
// Create a custom JPEG XL dequant matrix. The quantization weight parameters
@@ -127,7 +142,6 @@ void AddJpegQuantMatrices(const ImageF& qf, float dc_quant,
// Scale the quant matrix based on the scaled XYB scales and the quant field.
float qfmin, qfmax;
ImageMinMax(qf, &qfmin, &qfmax);
const float global_scale = 0.66f;
for (size_t c = 0; c < 3; c++) {
const float scale = kScaledXYBScale[c] * global_scale;
qm[c * kDCTBlockSize] *= scale;
@@ -237,7 +251,9 @@ void AddJpegHuffmanCodes(std::vector<Histogram>& histograms,
}
void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
float global_scale, const bool subsample_blue,
const FrameDimensions& frame_dim, jpeg::JPEGData* out) {
*out = jpeg::JPEGData();
// ICC
out->marker_order.push_back(0xe2);
out->app_data.push_back(CreateXybICCAppMarker());
@@ -245,7 +261,7 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
// DQT
out->marker_order.emplace_back(0xdb);
float qm[3 * kDCTBlockSize];
AddJpegQuantMatrices(qf, dc_quant, &out->quant, qm);
AddJpegQuantMatrices(qf, dc_quant, global_scale, &out->quant, qm);
// SOF
out->marker_order.emplace_back(0xc2);
@@ -255,11 +271,15 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
out->components[0].id = 'R';
out->components[1].id = 'G';
out->components[2].id = 'B';
size_t max_samp_factor = subsample_blue ? 2 : 1;
for (size_t c = 0; c < 3; ++c) {
out->components[c].h_samp_factor = 1;
out->components[c].v_samp_factor = 1;
out->components[c].width_in_blocks = frame_dim.xsize_blocks;
out->components[c].height_in_blocks = frame_dim.ysize_blocks;
const size_t factor = (subsample_blue && c == 2) ? 2 : 1;
out->components[c].h_samp_factor = max_samp_factor / factor;
out->components[c].v_samp_factor = max_samp_factor / factor;
JXL_ASSERT(frame_dim.xsize_blocks % factor == 0);
JXL_ASSERT(frame_dim.ysize_blocks % factor == 0);
out->components[c].width_in_blocks = frame_dim.xsize_blocks / factor;
out->components[c].height_in_blocks = frame_dim.ysize_blocks / factor;
out->components[c].quant_idx = c;
}
HWY_DYNAMIC_DISPATCH(ComputeDCTCoefficients)
@@ -271,7 +291,7 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
// SOS
std::vector<ProgressiveScan> progressive_mode = {
// DC
{0, 0, 0, 0, true},
{0, 0, 0, 0, !subsample_blue},
// AC 1 - highest bits
{1, 63, 0, 1, false},
// AC 2 - lowest bit
@@ -315,18 +335,31 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
}
}
size_t JpegSize(const jpeg::JPEGData& jpeg_data) {
size_t total_size = 0;
auto countsize = [&total_size](const uint8_t* buf, size_t len) {
total_size += len;
return len;
};
JXL_CHECK(jpeg::WriteJpeg(jpeg_data, countsize));
return total_size;
}
} // namespace
Status EncodeJpeg(const ImageBundle& input, float distance, ThreadPool* pool,
std::vector<uint8_t>* compressed) {
Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
ThreadPool* pool, std::vector<uint8_t>* compressed) {
const bool subsample_blue = true;
const size_t max_shift = subsample_blue ? 1 : 0;
FrameDimensions frame_dim;
frame_dim.Set(input.xsize(), input.ysize(), 1, 0, 0, false, 1);
frame_dim.Set(input.xsize(), input.ysize(), 1, max_shift, max_shift, false,
1);
// Convert input to XYB colorspace.
Image3F opsin(frame_dim.xsize_padded, frame_dim.ysize_padded);
opsin.ShrinkTo(frame_dim.xsize, frame_dim.ysize);
ToXYB(input, pool, &opsin, GetJxlCms());
PadImageToBlockMultipleInPlace(&opsin);
PadImageToBlockMultipleInPlace(&opsin, 8 << max_shift);
// Compute adaptive quant field.
ImageF mask;
@@ -335,7 +368,39 @@ Status EncodeJpeg(const ImageBundle& input, float distance, ThreadPool* pool,
// Create jpeg data and optimize Huffman codes.
jpeg::JPEGData jpeg_data;
FillJPEGData(opsin, qf, InitialQuantDC(distance), frame_dim, &jpeg_data);
float global_scale = 0.66f;
float dc_quant = InitialQuantDC(distance);
FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
&jpeg_data);
if (target_size != 0) {
// Tweak the jpeg data so that the resulting compressed file is
// approximately target_size long.
size_t prev_size = 0;
float best_error = 100.0f;
float best_global_scale = global_scale;
size_t iter = 0;
for (;;) {
size_t size = JpegSize(jpeg_data);
float error = size * 1.0f / target_size - 1.0f;
if (std::abs(error) < std::abs(best_error)) {
best_error = error;
best_global_scale = global_scale;
}
if (size == prev_size || std::abs(error) < 0.001f || iter >= 10) {
break;
}
global_scale *= 1.0f + error;
FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
&jpeg_data);
prev_size = size;
++iter;
}
if (best_global_scale != global_scale) {
FillJPEGData(opsin, qf, dc_quant, best_global_scale, subsample_blue,
frame_dim, &jpeg_data);
}
}
// Write jpeg data to compressed stream.
auto write = [&compressed](const uint8_t* buf, size_t len) {

View File

@@ -16,8 +16,8 @@
namespace jxl {
namespace extras {
Status EncodeJpeg(const ImageBundle& input, float distance, ThreadPool* pool,
std::vector<uint8_t>* compressed);
Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
ThreadPool* pool, std::vector<uint8_t>* compressed);
} // namespace extras
} // namespace jxl

View File

@@ -33,6 +33,13 @@ class PackedImage {
PackedImage(size_t xsize, size_t ysize, const JxlPixelFormat& format)
: PackedImage(xsize, ysize, format, CalcStride(format, xsize)) {}
PackedImage Copy() const {
PackedImage copy(xsize, ysize, format);
memcpy(reinterpret_cast<uint8_t*>(copy.pixels()),
reinterpret_cast<const uint8_t*>(pixels()), pixels_size);
return copy;
}
// The interleaved pixels as defined in the storage format.
void* pixels() const { return pixels_.get(); }
@@ -98,6 +105,18 @@ class PackedFrame {
template <typename... Args>
explicit PackedFrame(Args&&... args) : color(std::forward<Args>(args)...) {}
PackedFrame Copy() const {
PackedFrame copy(color.xsize, color.ysize, color.format);
copy.frame_info = frame_info;
copy.name = name;
copy.color = color.Copy();
for (size_t i = 0; i < extra_channels.size(); ++i) {
PackedImage ec = extra_channels[i].Copy();
copy.extra_channels.emplace_back(std::move(ec));
}
return copy;
}
// The Frame metadata.
JxlFrameHeader frame_info = {};
std::string name;
@@ -117,17 +136,18 @@ class PackedMetadata {
std::vector<uint8_t> xmp;
};
// The extra channel metadata information.
struct PackedExtraChannel {
JxlExtraChannelInfo ec_info;
size_t index;
std::string name;
};
// Helper class representing a JXL image file as decoded to pixels from the API.
class PackedPixelFile {
public:
JxlBasicInfo info = {};
// The extra channel metadata information.
struct PackedExtraChannel {
JxlExtraChannelInfo ec_info;
size_t index;
std::string name;
};
std::vector<PackedExtraChannel> extra_channels_info;
// Color information of the decoded pixels.

View File

@@ -58,10 +58,8 @@ Status ConvertPackedFrameToImageBundle(const JxlBasicInfo& info,
JXL_RETURN_IF_ERROR(ConvertFromExternal(
span, frame.color.xsize, frame.color.ysize, io.metadata.m.color_encoding,
frame.color.format.num_channels,
/*alpha_is_premultiplied=*/info.alpha_premultiplied,
frame_bits_per_sample, frame.color.format.endianness, pool, bundle,
/*float_in=*/float_in, /*align=*/0));
frame_bits_per_sample, frame.color.format, pool, bundle));
bundle->extra_channels().resize(io.metadata.m.extra_channel_info.size());
for (size_t i = 0; i < frame.extra_channels.size(); i++) {
@@ -140,8 +138,7 @@ Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
io->blobs.xmp = ppf.metadata.xmp;
// Append all other extra channels.
for (const PackedPixelFile::PackedExtraChannel& info :
ppf.extra_channels_info) {
for (const auto& info : ppf.extra_channels_info) {
ExtraChannelInfo out;
out.type = static_cast<jxl::ExtraChannel>(info.ec_info.type);
out.bit_depth.bits_per_sample = info.ec_info.bits_per_sample;

View File

@@ -22,6 +22,7 @@
#include "jxl/memory_manager.h"
#include "jxl/parallel_runner.h"
#include "jxl/types.h"
#include "jxl/version.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
@@ -742,14 +743,26 @@ typedef enum {
* represented, the ICC profile may be a close approximation. It is also not
* always feasible to deduce from an ICC profile which named color space it
* exactly represents, if any, as it can represent any arbitrary space.
* HDR color spaces such as those using PQ and HLG are also potentially
* problematic, in that: while ICC profiles can encode a transfer function
* that happens to approximate those of PQ and HLG (HLG for only one given
* system gamma at a time, and necessitating a 3D LUT if gamma is to be
* different from 1), they cannot (before ICCv4.4) semantically signal that
* this is the color space that they represent. Therefore, they will
* typically not actually be interpreted as representing an HDR color space.
* This is especially detrimental to PQ which will then be interpreted as if
* the maximum signal value represented SDR white instead of 10000 cd/m^2,
* meaning that the image will be displayed two orders of magnitude (5-7 EV)
* too dim.
* - The JPEG XL image has an encoded structured color profile, and it
* indicates an unknown or xyb color space. In that case, @ref
* JxlDecoderGetColorAsICCProfile is not available.
*
* When rendering an image on a system that supports ICC profiles, @ref
* JxlDecoderGetColorAsICCProfile should be used first. When rendering
* for a specific color space, possibly indicated in the JPEG XL
* image, @ref JxlDecoderGetColorAsEncodedProfile should be used first.
* When rendering an image on a system where ICC-based color management is used,
* @ref JxlDecoderGetColorAsICCProfile should generally be used first as it will
* return a ready-to-use profile (with the aforementioned caveat about HDR).
* When knowledge about the nominal color space is desired if available, @ref
* JxlDecoderGetColorAsEncodedProfile should be used first.
*
* @param dec decoder object
* @param unused_format deprecated, can be NULL
@@ -1437,6 +1450,21 @@ JXL_EXPORT size_t JxlDecoderGetIntendedDownsamplingRatio(JxlDecoder* dec);
*/
JXL_EXPORT JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec);
/**
* Sets the bit depth of the output buffer or callback.
*
* Can be called after @ref JxlDecoderSetImageOutBuffer or @ref
* JxlDecoderSetImageOutCallback. For float pixel data types, only the default
* @ref JXL_BIT_DEPTH_FROM_PIXEL_FORMAT setting is supported.
*
* @param dec decoder object
* @param bit_depth the bit depth setting of the pixel output
* @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
* incompatible custom bit depth and pixel data type.
*/
JXL_EXPORT JxlDecoderStatus
JxlDecoderSetImageOutBitDepth(JxlDecoder* dec, const JxlBitDepth* bit_depth);
#if defined(__cplusplus) || defined(c_plusplus)
}
#endif

View File

@@ -18,6 +18,7 @@
#include "jxl/jxl_export.h"
#include "jxl/memory_manager.h"
#include "jxl/parallel_runner.h"
#include "jxl/version.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
@@ -514,6 +515,22 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBlendInfo(
JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameName(
JxlEncoderFrameSettings* frame_settings, const char* frame_name);
/**
* Sets the bit depth of the input buffer.
*
* For float pixel formats, only the default JXL_BIT_DEPTH_FROM_PIXEL_FORMAT
* setting is allowed, while for unsigned pixel formats,
* JXL_BIT_DEPTH_FROM_CODESTREAM setting is also allowed. See the comment on
* @ref JxlEncoderAddImageFrame for the effects of the bit depth setting.
* @param frame_settings set of options and metadata for this frame. Also
* includes reference to the encoder object.
* @param bit_depth the bit depth setting of the pixel input
* @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
*/
JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameBitDepth(
JxlEncoderFrameSettings* frame_settings, const JxlBitDepth* bit_depth);
/**
* Sets the buffer to read JPEG encoded bytes from for the next frame to encode.
*
@@ -555,15 +572,22 @@ JxlEncoderAddJPEGFrame(const JxlEncoderFrameSettings* frame_settings,
* - JXL_TYPE_FLOAT, with nominal range 0..1
*
* Note: the sample data type in pixel_format is allowed to be different from
* what is described in the JxlBasicInfo. The type in pixel_format describes the
* format of the uncompressed pixel buffer. The bits_per_sample and
* exponent_bits_per_sample in the JxlBasicInfo describes what will actually be
* encoded in the JPEG XL codestream. For example, to encode a 12-bit image, you
* would set bits_per_sample to 12, and you could use e.g. JXL_TYPE_UINT16
* (where the values are rescaled to 16-bit, i.e. multiplied by 65535/4095) or
* JXL_TYPE_FLOAT (where the values are rescaled to 0..1, i.e. multiplied
* by 1.f/4095.f). While it is allowed, it is obviously not recommended to use a
* pixel_format with lower precision than what is specified in the JxlBasicInfo.
* what is described in the JxlBasicInfo. The type in pixel_format, together
* with an optional @ref JxlBitDepth parameter set by @ref
* JxlEncoderSetFrameBitDepth describes the format of the uncompressed pixel
* buffer. The bits_per_sample and exponent_bits_per_sample in the JxlBasicInfo
* describes what will actually be encoded in the JPEG XL codestream.
* For example, to encode a 12-bit image, you would set bits_per_sample to 12,
* while the input frame buffer can be in the following formats:
* - if pixel format is in JXL_TYPE_UINT16 with default bit depth setting
* (i.e. JXL_BIT_DEPTH_FROM_PIXEL_FORMAT), input sample values are rescaled
* to 16-bit, i.e. multiplied by 65535/4095;
* - if pixel format is in JXL_TYPE_UINT16 with JXL_BIT_DEPTH_FROM_CODESTREAM
* bit depth setting, input sample values are provided unscaled;
* - if pixel format is in JXL_TYPE_FLOAT, input sample values are rescaled
* to 0..1, i.e. multiplied by 1.f/4095.f.
* While it is allowed, it is obviously not recommended to use a pixel_format
* with lower precision than what is specified in the JxlBasicInfo.
*
* We support interleaved channels as described by the JxlPixelFormat:
* - single-channel data, e.g. grayscale

View File

@@ -111,6 +111,43 @@ typedef struct {
size_t align;
} JxlPixelFormat;
/** Settings for the interpretation of the input and output buffers.
*/
typedef enum {
/** This is the default setting, where the encoder expects the input pixels
* to use the full range of the pixel format data type (e.g. for UINT16, the
* input range is 0 .. 65535 and the value 65535 is mapped to 1.0 when
* converting to float), and the decoder uses the full range to output
* pixels. If the bit depth in the basic info is different from this, the
* encoder expects the values to be rescaled accordingly (e.g multiplied by
* 65535/4095 for a 12-bit image using UINT16 input data type). */
JXL_BIT_DEPTH_FROM_PIXEL_FORMAT = 0,
/** If this setting is selected, the encoder expects the input pixels to be
* in the range defined by the bits_per_sample value of the basic info (e.g.
* for 12-bit images using UINT16 input data types, the allowed range is
* 0 .. 4095 and the value 4095 is mapped to 1.0 when converting to float),
* and the decoder outputs pixels in this range. */
JXL_BIT_DEPTH_FROM_CODESTREAM = 1,
/** This setting can only be used in the decoder to select a custom range for
* pixel output */
JXL_BIT_DEPTH_CUSTOM = 2,
} JxlBitDepthType;
/** Data type for describing the interpretation of the input and output buffers
* in terms of the range of allowed input and output pixel values. */
typedef struct {
/** Bit depth setting, see comment on @ref JxlBitDepthType */
JxlBitDepthType type;
/** Custom bits per sample */
uint32_t bits_per_sample;
/** Custom exponent bits per sample */
uint32_t exponent_bits_per_sample;
} JxlBitDepth;
/** Data type holding the 4-character type name of an ISOBMFF box.
*/
typedef char JxlBoxType[4];

View File

@@ -447,6 +447,9 @@ else ()
)
endif ()
# Generate version.h
configure_file("jxl/version.h.in" "include/jxl/version.h")
# Headers for exporting/importing public headers
include(GenerateExportHeader)
set_target_properties(jxl_dec-obj PROPERTIES

View File

@@ -1,202 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/jxl/codec_y4m_testonly.h"
#include <stddef.h>
namespace jxl {
namespace test {
struct HeaderY4M {
size_t xsize;
size_t ysize;
size_t bits_per_sample;
int is_yuv; // Y4M: where 1 = 444, 2 = 422, 3 = 420
};
// Decode Y4M images.
class Y4MParser {
public:
explicit Y4MParser(const Span<const uint8_t> bytes)
: pos_(bytes.data()), end_(pos_ + bytes.size()) {}
// TODO(jon): support multi-frame y4m
Status ParseHeader(HeaderY4M* header, const uint8_t** pos) {
JXL_RETURN_IF_ERROR(ExpectString("YUV4MPEG2", 9));
header->is_yuv = 3;
// TODO(jon): check if 4:2:0 is indeed the default
header->bits_per_sample = 8;
// TODO(jon): check if there's a y4m convention for higher bit depths
while (pos_ < end_) {
char next = 0;
JXL_RETURN_IF_ERROR(ReadChar(&next));
if (next == 0x0A) break;
if (next != ' ') continue;
char field = 0;
JXL_RETURN_IF_ERROR(ReadChar(&field));
switch (field) {
case 'W':
JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
break;
case 'H':
JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
break;
case 'I':
JXL_RETURN_IF_ERROR(ReadChar(&next));
if (next != 'p') {
return JXL_FAILURE(
"Y4M: only progressive (no frame interlacing) allowed");
}
break;
case 'C': {
char c1 = 0;
JXL_RETURN_IF_ERROR(ReadChar(&c1));
char c2 = 0;
JXL_RETURN_IF_ERROR(ReadChar(&c2));
char c3 = 0;
JXL_RETURN_IF_ERROR(ReadChar(&c3));
if (c1 != '4') return JXL_FAILURE("Y4M: invalid C param");
if (c2 == '4') {
if (c3 != '4') return JXL_FAILURE("Y4M: invalid C param");
header->is_yuv = 1; // 444
} else if (c2 == '2') {
if (c3 == '2') {
header->is_yuv = 2; // 422
} else if (c3 == '0') {
header->is_yuv = 3; // 420
} else {
return JXL_FAILURE("Y4M: invalid C param");
}
} else {
return JXL_FAILURE("Y4M: invalid C param");
}
}
[[fallthrough]];
// no break: fallthrough because this field can have values like
// "C420jpeg" (we are ignoring the chroma sample location and treat
// everything like C420jpeg)
case 'F': // Framerate in fps as numerator:denominator
// TODO(jon): actually read this and set corresponding jxl
// metadata
case 'A': // Pixel aspect ratio (ignoring it, could perhaps adjust
// intrinsic dimensions based on this?)
case 'X': // Comment, ignore
// ignore the field value and go to next one
while (pos_ < end_) {
if (pos_[0] == ' ' || pos_[0] == 0x0A) break;
pos_++;
}
break;
default:
return JXL_FAILURE("Y4M: parse error");
}
}
JXL_RETURN_IF_ERROR(ExpectString("FRAME", 5));
while (true) {
char next = 0;
JXL_RETURN_IF_ERROR(ReadChar(&next));
if (next == 0x0A) {
*pos = pos_;
return true;
}
}
}
private:
Status ExpectString(const char* str, size_t len) {
// Unlikely to happen.
if (pos_ + len < pos_) return JXL_FAILURE("Y4M: overflow");
if (pos_ + len > end_ || strncmp(str, (const char*)pos_, len) != 0) {
return JXL_FAILURE("Y4M: expected %s", str);
}
pos_ += len;
return true;
}
Status ReadChar(char* out) {
// Unlikely to happen.
if (pos_ + 1 < pos_) return JXL_FAILURE("Y4M: overflow");
if (pos_ >= end_) {
return JXL_FAILURE("Y4M: unexpected end of input");
}
*out = *pos_;
pos_++;
return true;
}
static bool IsDigit(const uint8_t c) { return '0' <= c && c <= '9'; }
Status ParseUnsigned(size_t* number) {
if (pos_ == end_) return JXL_FAILURE("PNM: reached end before number");
if (!IsDigit(*pos_)) return JXL_FAILURE("PNM: expected unsigned number");
*number = 0;
while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
*number *= 10;
*number += *pos_ - '0';
++pos_;
}
return true;
}
const uint8_t* pos_;
const uint8_t* const end_;
};
Status DecodeImageY4M(const Span<const uint8_t> bytes, CodecInOut* io) {
Y4MParser parser(bytes);
HeaderY4M header = {};
const uint8_t* pos = nullptr;
JXL_RETURN_IF_ERROR(parser.ParseHeader(&header, &pos));
Image3F yuvdata(header.xsize, header.ysize);
ImageBundle bundle(&io->metadata.m);
const int hshift[3][3] = {{0, 0, 0}, {0, 1, 1}, {0, 1, 1}};
const int vshift[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 1, 1}};
for (size_t c = 0; c < 3; c++) {
for (size_t y = 0; y < header.ysize >> vshift[header.is_yuv - 1][c]; ++y) {
float* const JXL_RESTRICT row = yuvdata.PlaneRow((c == 2 ? 2 : 1 - c), y);
if (pos + (header.xsize >> hshift[header.is_yuv - 1][c]) >
bytes.data() + bytes.size())
return JXL_FAILURE("Not enough image data");
for (size_t x = 0; x < header.xsize >> hshift[header.is_yuv - 1][c];
++x) {
row[x] = (1.f / 255.f) * ((*pos++) - 128.f);
}
}
}
bundle.SetFromImage(std::move(yuvdata), io->metadata.m.color_encoding);
bundle.color_transform = ColorTransform::kYCbCr;
YCbCrChromaSubsampling subsampling;
uint8_t cssh[3] = {
2, static_cast<uint8_t>(hshift[header.is_yuv - 1][1] ? 1 : 2),
static_cast<uint8_t>(hshift[header.is_yuv - 1][2] ? 1 : 2)};
uint8_t cssv[3] = {
2, static_cast<uint8_t>(vshift[header.is_yuv - 1][1] ? 1 : 2),
static_cast<uint8_t>(vshift[header.is_yuv - 1][2] ? 1 : 2)};
JXL_RETURN_IF_ERROR(subsampling.Set(cssh, cssv));
bundle.chroma_subsampling = subsampling;
io->Main() = std::move(bundle);
JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetSRGB(ColorSpace::kRGB));
io->metadata.m.SetUintSamples(header.bits_per_sample);
io->metadata.m.SetAlphaBits(0);
io->dec_pixels = header.xsize * header.ysize;
io->metadata.m.bit_depth.bits_per_sample = io->Main().DetectRealBitdepth();
io->SetSize(header.xsize, header.ysize);
SetIntensityTarget(io);
return true;
}
} // namespace test
} // namespace jxl

View File

@@ -1,18 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdint.h>
#include "lib/jxl/base/padded_bytes.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/codec_in_out.h"
namespace jxl {
namespace test {
Status DecodeImageY4M(const Span<const uint8_t> bytes, CodecInOut* io);
} // namespace test
} // namespace jxl

View File

@@ -249,7 +249,7 @@ Status CreateICCHeader(const ColorEncoding& c,
WriteICCUint32(0, 0, header); // size, correct value filled in at end
WriteICCTag(kCmm, 4, header);
WriteICCUint32(0x04300000u, 8, header);
WriteICCUint32(0x04400000u, 8, header);
const char* profile_type =
c.GetColorSpace() == ColorSpace::kXYB ? "scnr" : "mntr";
WriteICCTag(profile_type, 12, header);
@@ -339,6 +339,44 @@ Status CreateICCChadTag(float chad[9], PaddedBytes* JXL_RESTRICT tags) {
return true;
}
void MaybeCreateICCCICPTag(const ColorEncoding& c,
PaddedBytes* JXL_RESTRICT tags, size_t* offset,
size_t* size, PaddedBytes* JXL_RESTRICT tagtable,
std::vector<size_t>* offsets) {
if (c.GetColorSpace() != ColorSpace::kRGB) {
return;
}
uint8_t primaries = 0;
if (c.primaries == Primaries::kP3) {
if (c.white_point == WhitePoint::kD65) {
primaries = 12;
} else if (c.white_point == WhitePoint::kDCI) {
primaries = 11;
} else {
return;
}
} else if (c.primaries != Primaries::kCustom &&
c.white_point == WhitePoint::kD65) {
primaries = static_cast<uint8_t>(c.primaries);
} else {
return;
}
if (c.tf.IsUnknown() || c.tf.IsGamma()) {
return;
}
WriteICCTag("cicp", tags->size(), tags);
WriteICCUint32(0, tags->size(), tags);
WriteICCUint8(primaries, tags->size(), tags);
WriteICCUint8(static_cast<uint8_t>(c.tf.GetTransferFunction()), tags->size(),
tags);
// Matrix
WriteICCUint8(0, tags->size(), tags);
// Full range
WriteICCUint8(1, tags->size(), tags);
FinalizeICCTag(tags, offset, size);
AddToICCTagTable("cicp", *offset, *size, tagtable, offsets);
}
void CreateICCCurvCurvTag(const std::vector<uint16_t>& curve,
PaddedBytes* JXL_RESTRICT tags) {
size_t pos = tags->size();
@@ -351,6 +389,7 @@ void CreateICCCurvCurvTag(const std::vector<uint16_t>& curve,
}
}
// Writes 12 + 4*params.size() bytes
Status CreateICCCurvParaTag(std::vector<float> params, size_t curve_type,
PaddedBytes* JXL_RESTRICT tags) {
WriteICCTag("para", tags->size(), tags);
@@ -365,30 +404,50 @@ Status CreateICCCurvParaTag(std::vector<float> params, size_t curve_type,
Status CreateICCLutAtoBTagForXYB(PaddedBytes* JXL_RESTRICT tags) {
WriteICCTag("mAB ", tags->size(), tags);
// 4 reserved bytes set to 0
WriteICCUint32(0, tags->size(), tags);
// number of input channels
WriteICCUint8(3, tags->size(), tags);
// number of output channels
WriteICCUint8(3, tags->size(), tags);
// 2 reserved bytes for padding
WriteICCUint16(0, tags->size(), tags);
WriteICCUint32(316, tags->size(), tags);
WriteICCUint32(268, tags->size(), tags);
WriteICCUint32(148, tags->size(), tags);
WriteICCUint32(80, tags->size(), tags);
// offset to first B curve
WriteICCUint32(32, tags->size(), tags);
// offset to matrix
WriteICCUint32(244, tags->size(), tags);
// offset to first M curve
WriteICCUint32(148, tags->size(), tags);
// offset to CLUT
WriteICCUint32(80, tags->size(), tags);
// offset to first A curve
// (reuse linear B curves)
WriteICCUint32(32, tags->size(), tags);
// offset = 32
// no-op curves
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
// offset = 80
// number of grid points for each input channel
for (int i = 0; i < 16; ++i) {
WriteICCUint8(i < 3 ? 2 : 0, tags->size(), tags);
}
// precision = 2
WriteICCUint8(2, tags->size(), tags);
WriteICCUint8(2, tags->size(), tags);
WriteICCUint8(2, tags->size(), tags);
WriteICCUint8(0, tags->size(), tags);
WriteICCUint32(0, tags->size(), tags);
WriteICCUint32(0, tags->size(), tags);
WriteICCUint32(0, tags->size(), tags);
WriteICCUint8(2, tags->size(), tags);
// 3 bytes of padding
WriteICCUint8(0, tags->size(), tags);
WriteICCUint16(0, tags->size(), tags);
const float kOffsets[3] = {0.015387, 0.028101, 0.277706};
const float kScaling[3] = {1.125, 1.125, 1. / 1.511027};
const float kOffsets[3] = {
kScaledXYBOffset[0] + kScaledXYBOffset[1],
kScaledXYBOffset[1] - kScaledXYBOffset[0] + 1.0f / kScaledXYBScale[0],
kScaledXYBOffset[1] + kScaledXYBOffset[2]};
const float kScaling[3] = {
1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]),
1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]),
1.0f / (1.0f / kScaledXYBScale[1] + 1.0f / kScaledXYBScale[2])};
// 2*2*2*3 entries of 2 bytes each = 48 bytes
for (size_t ix = 0; ix < 2; ++ix) {
for (size_t iy = 0; iy < 2; ++iy) {
for (size_t ib = 0; ib < 2; ++ib) {
@@ -414,6 +473,8 @@ Status CreateICCLutAtoBTagForXYB(PaddedBytes* JXL_RESTRICT tags) {
}
}
}
// offset = 148
// 3 curves with 5 parameters = 3 * (12 + 5 * 4) = 96 bytes
for (size_t i = 0; i < 3; ++i) {
const float b =
-kOffsets[i] - std::cbrt(jxl::kNegOpsinAbsorbanceBiasRGB[i]);
@@ -423,23 +484,24 @@ Status CreateICCLutAtoBTagForXYB(PaddedBytes* JXL_RESTRICT tags) {
b,
0, // unused
std::max(0.f, -b * kScaling[i]), // make skcms happy
jxl::kNegOpsinAbsorbanceBiasRGB[i],
0, // unused
};
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(params, 4, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(params, 3, tags));
}
// offset = 244
const double matrix[] = {1.5170095, -1.1065225, 0.071623,
-0.050022, 0.5683655, -0.018344,
-1.387676, 1.1145555, 0.6857255};
// 12 * 4 = 48 bytes
for (size_t i = 0; i < 9; ++i) {
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(matrix[i], tags->size(), tags));
}
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(0.0f, tags->size(), tags));
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(0.0f, tags->size(), tags));
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(0.0f, tags->size(), tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
for (size_t i = 0; i < 3; ++i) {
float intercept = 0;
for (size_t j = 0; j < 3; ++j) {
intercept += matrix[i * 3 + j] * jxl::kNegOpsinAbsorbanceBiasRGB[j];
}
JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(intercept, tags->size(), tags));
}
return true;
}
} // namespace
@@ -481,9 +543,7 @@ Status MaybeCreateProfile(const ColorEncoding& c,
FinalizeICCTag(&tags, &tag_offset, &tag_size);
AddToICCTagTable("desc", tag_offset, tag_size, &tagtable, &offsets);
const std::string copyright =
"Copyright 2019 Google LLC, CC-BY-SA 3.0 Unported "
"license(https://creativecommons.org/licenses/by-sa/3.0/legalcode)";
const std::string copyright = "CC0";
CreateICCMlucTag(copyright, &tags);
FinalizeICCTag(&tags, &tag_offset, &tag_size);
AddToICCTagTable("cprt", tag_offset, tag_size, &tagtable, &offsets);
@@ -511,6 +571,9 @@ Status MaybeCreateProfile(const ColorEncoding& c,
}
if (c.GetColorSpace() == ColorSpace::kRGB) {
MaybeCreateICCCICPTag(c, &tags, &tag_offset, &tag_size, &tagtable,
&offsets);
const PrimariesCIExy primaries = c.GetPrimaries();
float m[9];
JXL_RETURN_IF_ERROR(CreateICCRGBMatrix(primaries.r, primaries.g,

View File

@@ -392,7 +392,7 @@ TEST_F(ColorManagementTest, XYBProfile) {
}
}
}
static float kMaxError[3] = {8.5e-4, 4e-4, 5e-4};
static float kMaxError[3] = {9e-4, 4e-4, 5e-4};
printf("Maximum errors:\n");
for (size_t c = 0; c < 3; ++c) {
debug_print_color(max_err_i[c]);

View File

@@ -144,10 +144,11 @@ Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
frame_header.save_before_color_transform);
JXL_ASSERT(!options.render_spotcolors ||
!decoded->metadata()->Find(ExtraChannel::kSpotColor));
bool is_rgba = (format.num_channels == 4);
uint8_t* rgb_output = reinterpret_cast<uint8_t*>(image_buffer);
builder.AddStage(GetFastXYBTosRGB8Stage(rgb_output, stride, width, height,
is_rgba, has_alpha, alpha_c));
bool is_rgba = (main_output.format.num_channels == 4);
uint8_t* rgb_output = reinterpret_cast<uint8_t*>(main_output.buffer);
builder.AddStage(GetFastXYBTosRGB8Stage(rgb_output, main_output.stride,
width, height, is_rgba, has_alpha,
alpha_c));
} else {
bool linear = false;
if (frame_header.color_transform == ColorTransform::kYCbCr) {
@@ -212,10 +213,10 @@ Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
linear = false;
}
if (pixel_callback.IsPresent() || image_buffer) {
builder.AddStage(GetWriteToOutputStage(
pixel_callback, image_buffer, width, height, stride, format,
has_alpha, unpremul_alpha, alpha_c, undo_orientation));
if (main_output.callback.IsPresent() || main_output.buffer) {
builder.AddStage(GetWriteToOutputStage(main_output, width, height,
has_alpha, unpremul_alpha, alpha_c,
undo_orientation, extra_output));
} else {
builder.AddStage(GetWriteToImageBundleStage(
decoded, output_encoding_info.color_encoding));

View File

@@ -56,6 +56,20 @@ struct PixelCallback {
void* init_opaque = nullptr;
};
struct ImageOutput {
// Pixel format of the output pixels, used for buffer and callback output.
JxlPixelFormat format;
// Output bit depth for unsigned data types, used for float to int conversion.
size_t bits_per_sample;
// Callback for line-by-line output.
PixelCallback callback;
// Pixel buffer for image output.
void* buffer;
size_t buffer_size;
// Length of a row of image_buffer in bytes (based on oriented width).
size_t stride;
};
// Per-frame decoder state. All the images here should be accessed through a
// group rect (either with block units or pixel units).
struct PassesDecoderState {
@@ -77,17 +91,11 @@ struct PassesDecoderState {
// Sigma values for EPF.
ImageF sigma;
// Pixel buffer for image output.
void* image_buffer;
// Image dimensions before applying undo_orientation.
size_t width;
size_t height;
// Length of a row of image_buffer in bytes (based on oriented width).
size_t stride;
// Callback for line-by-line output.
PixelCallback pixel_callback;
// Pixel format of the output pixels, used for buffer and callback output.
JxlPixelFormat format;
ImageOutput main_output;
std::vector<ImageOutput> extra_output;
// Whether to use int16 float-XYB-to-uint8-srgb conversion.
bool fast_xyb_srgb8_conversion;
@@ -134,8 +142,9 @@ struct PassesDecoderState {
b_dm_multiplier =
std::pow(1 / (1.25f), shared->frame_header.b_qm_scale - 2.0f);
pixel_callback = PixelCallback();
image_buffer = nullptr;
main_output.callback = PixelCallback();
main_output.buffer = nullptr;
extra_output.clear();
fast_xyb_srgb8_conversion = false;
unpremul_alpha = false;

View File

@@ -173,28 +173,22 @@ class FrameDecoder {
}
// Sets the pixel callback or image buffer where the pixels will be decoded.
// This is not supported for all images. If it succeeds, HasRGBBuffer() will
// return true.
// If it does not succeed, the image is decoded to the ImageBundle passed to
// InitFrame instead.
//
// @param undo_orientation: if true, indicates the frame decoder should apply
// the exif orientation to bring the image to the intended display
// orientation.
void SetImageOutput(const PixelCallback& pixel_callback, void* image_buffer,
size_t xsize, size_t ysize, JxlPixelFormat format,
size_t image_buffer_size, size_t xsize, size_t ysize,
JxlPixelFormat format, size_t bits_per_sample,
bool unpremul_alpha, bool undo_orientation) const {
dec_state_->pixel_callback = pixel_callback;
dec_state_->image_buffer = image_buffer;
dec_state_->width = xsize;
dec_state_->height = ysize;
dec_state_->format = format;
dec_state_->stride =
(xsize * BytesPerChannel(format.data_type) * format.num_channels);
if (format.align > 1) {
dec_state_->stride =
(jxl::DivCeil(dec_state_->stride, format.align) * format.align);
}
dec_state_->main_output.format = format;
dec_state_->main_output.bits_per_sample = bits_per_sample;
dec_state_->main_output.callback = pixel_callback;
dec_state_->main_output.buffer = image_buffer;
dec_state_->main_output.buffer_size = image_buffer_size;
dec_state_->main_output.stride = GetStride(xsize, format);
const jxl::ExtraChannelInfo* alpha =
decoded_->metadata()->Find(jxl::ExtraChannel::kAlpha);
if (alpha && alpha->alpha_associated && unpremul_alpha) {
@@ -206,9 +200,11 @@ class FrameDecoder {
std::swap(dec_state_->width, dec_state_->height);
}
}
dec_state_->extra_output.clear();
#if !JXL_HIGH_PRECISION
if (dec_state_->image_buffer && (format.data_type == JXL_TYPE_UINT8) &&
(format.num_channels >= 3) && !dec_state_->unpremul_alpha &&
if (dec_state_->main_output.buffer &&
(format.data_type == JXL_TYPE_UINT8) && (format.num_channels >= 3) &&
!dec_state_->unpremul_alpha &&
(dec_state_->undo_orientation == Orientation::kIdentity) &&
decoded_->metadata()->xyb_encoded &&
dec_state_->output_encoding_info.color_encoding.IsSRGB() &&
@@ -221,12 +217,15 @@ class FrameDecoder {
#endif
}
// Returns true if the rgb output buffer passed by MaybeSetRGB8OutputBuffer
// has been/will be populated by Flush() / FinalizeFrame(), or if a pixel
// callback has been used.
bool HasRGBBuffer() const {
return dec_state_->image_buffer != nullptr ||
dec_state_->pixel_callback.IsPresent();
void AddExtraChannelOutput(void* buffer, size_t buffer_size, size_t xsize,
JxlPixelFormat format, size_t bits_per_sample) {
ImageOutput out;
out.format = format;
out.bits_per_sample = bits_per_sample;
out.buffer = buffer;
out.buffer_size = buffer_size;
out.stride = GetStride(xsize, format);
dec_state_->extra_output.push_back(out);
}
private:
@@ -273,6 +272,15 @@ class FrameDecoder {
: 2u);
}
static size_t GetStride(const size_t xsize, JxlPixelFormat format) {
size_t stride =
(xsize * BytesPerChannel(format.data_type) * format.num_channels);
if (format.align > 1) {
stride = (jxl::DivCeil(stride, format.align) * format.align);
}
return stride;
}
PassesDecoderState* dec_state_;
ThreadPool* pool_;
std::vector<TocEntry> toc_;

View File

@@ -65,12 +65,13 @@ class Rec2408ToneMapper {
Mul(Set(df_, 10000), TF_PQ().DisplayFromEncoded(df_, e4))));
const V ratio = Div(new_luminance, luminance);
const V inv_target_peak = Set(df_, inv_target_peak_);
const V normalizer = Set(df_, normalizer_);
const V multiplier = Mul(ratio, normalizer);
for (V* const val : {red, green, blue}) {
*val = Mul(IfThenElse(Le(luminance, Set(df_, 1e-6f)), new_luminance,
Mul(*val, ratio)),
normalizer);
*val = IfThenElse(Le(luminance, Set(df_, 1e-6f)),
Mul(new_luminance, inv_target_peak),
Mul(*val, multiplier));
}
}
@@ -98,8 +99,8 @@ class Rec2408ToneMapper {
ks,
MulAdd(Add(t_b_3, MulAdd(Set(df_, -2), t_b_2, t_b)),
Sub(Set(df_, 1), ks),
MulAdd(Set(df_, -2), t_b_3,
Mul(Mul(Set(df_, 3), t_b_2), max_lum))));
Mul(MulAdd(Set(df_, -2), t_b_3, Mul(Set(df_, 3), t_b_2)),
max_lum)));
}
D df_;
@@ -125,6 +126,7 @@ class Rec2408ToneMapper {
const float inv_one_minus_ks_ = 1.0f / std::max(1e-6f, 1.0f - ks_);
const float normalizer_ = source_range_.second / target_range_.second;
const float inv_target_peak_ = 1.f / target_range_.second;
};
class HlgOOTF {

View File

@@ -144,6 +144,20 @@ size_t BitsPerChannel(JxlDataType data_type) {
}
}
template <typename T>
uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata,
JxlPixelFormat format) {
if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
return BitsPerChannel(format.data_type);
} else if (bit_depth.type == JXL_BIT_DEPTH_FROM_CODESTREAM) {
return metadata.bit_depth.bits_per_sample;
} else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
return bit_depth.bits_per_sample;
} else {
return 0;
}
}
enum class DecoderStage : uint32_t {
kInited, // Decoder created, no JxlDecoderProcessInput called yet
kStarted, // Running JxlDecoderProcessInput calls
@@ -415,6 +429,7 @@ struct JxlDecoderStruct {
size_t image_out_size;
JxlPixelFormat image_out_format;
JxlBitDepth image_out_bit_depth;
// For extra channels. Empty if no extra channels are requested, and they are
// reset each frame
@@ -701,6 +716,7 @@ void JxlDecoderRewindDecodingState(JxlDecoder* dec) {
dec->image_out_destroy_callback = nullptr;
dec->image_out_init_opaque = nullptr;
dec->image_out_size = 0;
dec->image_out_bit_depth.type = JXL_BIT_DEPTH_FROM_PIXEL_FORMAT;
dec->extra_channel_output.clear();
dec->dec_pixels = 0;
dec->next_in = 0;
@@ -1072,93 +1088,6 @@ JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec) {
return JXL_DEC_SUCCESS;
}
static size_t GetStride(const JxlDecoder* dec, const JxlPixelFormat& format) {
size_t xsize, ysize;
GetCurrentDimensions(dec, xsize, ysize);
size_t stride = xsize * (BitsPerChannel(format.data_type) *
format.num_channels / jxl::kBitsPerByte);
if (format.align > 1) {
stride = jxl::DivCeil(stride, format.align) * format.align;
}
return stride;
}
// Internal wrapper around jxl::ConvertToExternal which converts the stride,
// format and orientation and allows to choose whether to get all RGB(A)
// channels or alternatively get a single extra channel.
// If want_extra_channel, a valid index to a single extra channel must be
// given, the output must be single-channel, and format.num_channels is ignored
// and treated as if it is 1.
static JxlDecoderStatus ConvertImageInternal(
const JxlDecoder* dec, const jxl::ImageBundle& frame,
const JxlPixelFormat& format, bool want_extra_channel,
size_t extra_channel_index, void* out_image, size_t out_size,
const PixelCallback& out_callback) {
// TODO(lode): handle mismatch of RGB/grayscale color profiles and pixel data
// color/grayscale format
const size_t stride = GetStride(dec, format);
bool float_format = format.data_type == JXL_TYPE_FLOAT ||
format.data_type == JXL_TYPE_FLOAT16;
jxl::Orientation undo_orientation = dec->keep_orientation
? jxl::Orientation::kIdentity
: dec->metadata.m.GetOrientation();
jxl::Status status(true);
if (want_extra_channel) {
JXL_ASSERT(extra_channel_index < frame.extra_channels().size());
status = jxl::ConvertToExternal(frame.extra_channels()[extra_channel_index],
BitsPerChannel(format.data_type),
float_format, format.endianness, stride,
dec->thread_pool.get(), out_image, out_size,
out_callback, undo_orientation);
} else {
status = jxl::ConvertToExternal(
frame, BitsPerChannel(format.data_type), float_format,
format.num_channels, format.endianness, stride, dec->thread_pool.get(),
out_image, out_size, out_callback, undo_orientation,
dec->unpremul_alpha);
}
return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR;
}
// Outputs the preview or full image (including extra channels) in the internal
// image bundle to the image buffers and/or image callback provided through the
// API.
// TODO(szabadka) Handle all these cases in the low-memory code-path and remove
// this function.
JxlDecoderStatus JxlDecoderOutputImage(JxlDecoder* dec) {
if (!dec->frame_dec->HasRGBBuffer()) {
JxlDecoderStatus status = ConvertImageInternal(
dec, *dec->ib, dec->image_out_format,
/*want_extra_channel=*/false,
/*extra_channel_index=*/0, dec->image_out_buffer, dec->image_out_size,
PixelCallback{dec->image_out_init_callback, dec->image_out_run_callback,
dec->image_out_destroy_callback,
dec->image_out_init_opaque});
if (status != JXL_DEC_SUCCESS) return status;
}
bool has_ec = !dec->ib->extra_channels().empty();
for (size_t i = 0; i < dec->extra_channel_output.size(); ++i) {
void* buffer = dec->extra_channel_output[i].buffer;
// buffer nullptr indicates this extra channel is not requested
if (!buffer) continue;
if (!has_ec) {
JXL_WARNING("Extra channels are not supported when callback is used");
return JXL_DEC_ERROR;
}
const JxlPixelFormat* format = &dec->extra_channel_output[i].format;
JxlDecoderStatus status = ConvertImageInternal(
dec, *dec->ib, *format,
/*want_extra_channel=*/true, /*extra_channel_index=*/i, buffer,
dec->extra_channel_output[i].buffer_size, /*out_callback=*/{});
if (status != JXL_DEC_SUCCESS) return status;
}
return JXL_DEC_SUCCESS;
}
JxlDecoderStatus JxlDecoderProcessSections(JxlDecoder* dec) {
Span<const uint8_t> span;
JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
@@ -1463,15 +1392,27 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
}
}
if (dec->image_out_buffer_set && dec->extra_channel_output.empty()) {
if (dec->image_out_buffer_set) {
size_t xsize, ysize;
GetCurrentDimensions(dec, xsize, ysize);
size_t bits_per_sample = GetBitDepth(
dec->image_out_bit_depth, dec->metadata.m, dec->image_out_format);
dec->frame_dec->SetImageOutput(
PixelCallback{
dec->image_out_init_callback, dec->image_out_run_callback,
dec->image_out_destroy_callback, dec->image_out_init_opaque},
reinterpret_cast<uint8_t*>(dec->image_out_buffer), xsize, ysize,
dec->image_out_format, dec->unpremul_alpha, !dec->keep_orientation);
reinterpret_cast<uint8_t*>(dec->image_out_buffer),
dec->image_out_size, xsize, ysize, dec->image_out_format,
bits_per_sample, dec->unpremul_alpha, !dec->keep_orientation);
for (size_t i = 0; i < dec->extra_channel_output.size(); ++i) {
const auto& extra = dec->extra_channel_output[i];
size_t ec_bits_per_sample =
GetBitDepth(dec->image_out_bit_depth,
dec->metadata.m.extra_channel_info[i], extra.format);
dec->frame_dec->AddExtraChannelOutput(extra.buffer, extra.buffer_size,
xsize, extra.format,
ec_bits_per_sample);
}
}
size_t next_num_passes_to_pause = dec->frame_dec->NextNumPassesToPause();
@@ -1527,9 +1468,6 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
}
if (dec->preview_frame || dec->is_last_of_still) {
if (dec->image_out_buffer_set) {
JXL_API_RETURN_IF_ERROR(JxlDecoderOutputImage(dec));
}
dec->image_out_buffer_set = false;
dec->extra_channel_output.clear();
}
@@ -2347,11 +2285,7 @@ JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
return JXL_DEC_ERROR;
}
if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) {
return JXL_DEC_SUCCESS;
}
return jxl::JxlDecoderOutputImage(dec);
return JXL_DEC_SUCCESS;
}
JXL_EXPORT JxlDecoderStatus JxlDecoderPreviewOutBufferSize(
@@ -2809,3 +2743,41 @@ JxlDecoderStatus JxlDecoderSetProgressiveDetail(JxlDecoder* dec,
dec->prog_detail = detail;
return JXL_DEC_SUCCESS;
}
namespace {
template <typename T>
JxlDecoderStatus VerifyOutputBitDepth(JxlBitDepth bit_depth, const T& metadata,
JxlPixelFormat format) {
if ((format.data_type == JXL_TYPE_FLOAT ||
format.data_type == JXL_TYPE_FLOAT16) &&
bit_depth.type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
return JXL_API_ERROR(
"Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT is implemented "
"for float types.");
}
uint32_t bits_per_sample = GetBitDepth(bit_depth, metadata, format);
if (format.data_type == JXL_TYPE_UINT8 &&
(bits_per_sample == 0 || bits_per_sample > 8)) {
return JXL_API_ERROR("Inavlid bit depth %u for uint8 output",
bits_per_sample);
} else if (format.data_type == JXL_TYPE_UINT16 &&
(bits_per_sample == 0 || bits_per_sample > 16)) {
return JXL_API_ERROR("Inavlid bit depth %u for uint16 output",
bits_per_sample);
}
return JXL_DEC_SUCCESS;
}
} // namespace
JxlDecoderStatus JxlDecoderSetImageOutBitDepth(JxlDecoder* dec,
const JxlBitDepth* bit_depth) {
if (!dec->image_out_buffer_set) {
return JXL_API_ERROR("No image out buffer was set.");
}
JXL_API_RETURN_IF_ERROR(
VerifyOutputBitDepth(*bit_depth, dec->metadata.m, dec->image_out_format));
dec->image_out_bit_depth = *bit_depth;
return JXL_DEC_SUCCESS;
}

View File

@@ -259,13 +259,15 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
if (params.intensity_target != 0) {
io.metadata.m.SetIntensityTarget(params.intensity_target);
}
JxlPixelFormat format = {static_cast<uint32_t>(num_channels), JXL_TYPE_UINT16,
JXL_BIG_ENDIAN, 0};
// Make the grayscale-ness of the io metadata color_encoding and the packed
// image match.
io.metadata.m.color_encoding = color_encoding;
EXPECT_TRUE(ConvertFromExternal(
pixels, xsize, ysize, color_encoding, num_channels,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
&pool, &io.Main(), /*float_in=*/false, /*align=*/0));
EXPECT_TRUE(ConvertFromExternal(pixels, xsize, ysize, color_encoding,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format, &pool,
&io.Main()));
jxl::PaddedBytes jpeg_data;
if (params.jpeg_codestream != nullptr) {
#if JPEGXL_ENABLE_JPEG
@@ -1334,11 +1336,9 @@ TEST_P(DecodeTestParam, PixelTest) {
io.SetSize(config.xsize, config.ysize);
EXPECT_TRUE(ConvertFromExternal(bytes, config.xsize, config.ysize,
color_encoding, orig_channels,
color_encoding,
/*alpha_is_premultiplied=*/false, 16,
JXL_BIG_ENDIAN, nullptr, &io.Main(),
/*float_in=*/false,
/*align=*/0));
format_orig, nullptr, &io.Main()));
for (size_t i = 0; i < pixels.size(); i++) pixels[i] = 0;
EXPECT_TRUE(ConvertToExternal(
@@ -1448,8 +1448,6 @@ std::vector<PixelTestConfig> GeneratePixelTests() {
// Test previews.
for (int preview_mode = 0; preview_mode < jxl::kNumPreviewModes;
preview_mode++) {
// if (preview_mode == jxl::kBigPreview &&
// ch_info[0].output_channels != 3) continue;
make_test(ch_info[0], 77, 33, (jxl::PreviewMode)preview_mode,
/*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
JXL_ORIENT_IDENTITY,
@@ -1664,12 +1662,10 @@ TEST(DecodeTest, PixelTestWithICCProfileLossy) {
jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
jxl::CodecInOut io0;
io0.SetSize(xsize, ysize);
EXPECT_TRUE(
ConvertFromExternal(span0, xsize, ysize, color_encoding0, /*channels=*/3,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format_orig.endianness,
/*pool=*/nullptr, &io0.Main(), /*float_in=*/false,
/*align=*/0));
EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format_orig,
/*pool=*/nullptr, &io0.Main()));
jxl::ColorEncoding color_encoding1;
EXPECT_TRUE(color_encoding1.SetICC(std::move(icc)));
@@ -1677,15 +1673,14 @@ TEST(DecodeTest, PixelTestWithICCProfileLossy) {
jxl::CodecInOut io1;
io1.SetSize(xsize, ysize);
EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
channels, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/32, format.endianness,
/*pool=*/nullptr, &io1.Main(),
/*float_in=*/true, /*align=*/0));
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/32, format,
/*pool=*/nullptr, &io1.Main()));
jxl::ButteraugliParams ba;
EXPECT_THAT(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
/*distmap=*/nullptr, nullptr),
IsSlightlyBelow(0.785f));
IsSlightlyBelow(0.85f));
JxlDecoderDestroy(dec);
}
@@ -1722,21 +1717,25 @@ double ButteraugliDistance(size_t xsize, size_t ysize,
jxl::CodecInOut in;
in.metadata.m.color_encoding = color_in;
in.metadata.m.SetIntensityTarget(intensity_in);
JxlPixelFormat format_in = {static_cast<uint32_t>(color_in.Channels()),
JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(pixels_in.data(), pixels_in.size()), xsize,
ysize, color_in, color_in.Channels(),
ysize, color_in,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, JXL_BIG_ENDIAN,
/*pool=*/nullptr, &in.Main(), /*float_in=*/false, /*align=*/0));
/*bits_per_sample=*/16, format_in,
/*pool=*/nullptr, &in.Main()));
jxl::CodecInOut out;
out.metadata.m.color_encoding = color_out;
out.metadata.m.SetIntensityTarget(intensity_out);
JxlPixelFormat format_out = {static_cast<uint32_t>(color_out.Channels()),
JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(pixels_out.data(), pixels_out.size()), xsize,
ysize, color_out, color_out.Channels(),
ysize, color_out,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, JXL_BIG_ENDIAN,
/*pool=*/nullptr, &out.Main(), /*float_in=*/false, /*align=*/0));
/*bits_per_sample=*/16, format_out,
/*pool=*/nullptr, &out.Main()));
return ButteraugliDistance(in, out, jxl::ButteraugliParams(),
jxl::GetJxlCms(), nullptr, nullptr);
}
@@ -1926,22 +1925,18 @@ TEST(DecodeTest, PixelTestOpaqueSrgbLossy) {
jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
jxl::CodecInOut io0;
io0.SetSize(xsize, ysize);
EXPECT_TRUE(ConvertFromExternal(
span0, xsize, ysize, color_encoding0, /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
format_orig.endianness,
/*pool=*/nullptr, &io0.Main(), /*float_in=*/false,
/*align=*/0));
EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format_orig,
/*pool=*/nullptr, &io0.Main()));
jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
jxl::CodecInOut io1;
EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
channels, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format.endianness,
/*pool=*/nullptr, &io1.Main(),
/*float_in=*/false,
/*align=*/0));
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format,
/*pool=*/nullptr, &io1.Main()));
jxl::ButteraugliParams ba;
EXPECT_THAT(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
@@ -1982,22 +1977,18 @@ TEST(DecodeTest, PixelTestOpaqueSrgbLossyNoise) {
jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
jxl::CodecInOut io0;
io0.SetSize(xsize, ysize);
EXPECT_TRUE(ConvertFromExternal(
span0, xsize, ysize, color_encoding0, /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
format_orig.endianness,
/*pool=*/nullptr, &io0.Main(), /*float_in=*/false,
/*align=*/0));
EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format_orig,
/*pool=*/nullptr, &io0.Main()));
jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
jxl::CodecInOut io1;
EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
channels, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format.endianness,
/*pool=*/nullptr, &io1.Main(),
/*float_in=*/false,
/*align=*/0));
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format,
/*pool=*/nullptr, &io1.Main()));
jxl::ButteraugliParams ba;
EXPECT_THAT(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
@@ -2362,7 +2353,7 @@ TEST(DecodeTest, DCNotGettableTest) {
TEST(DecodeTest, PreviewTest) {
size_t xsize = 77, ysize = 120;
std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
for (jxl::PreviewMode mode : {jxl::kSmallPreview, jxl::kBigPreview}) {
jxl::TestCodestreamParams params;
params.preview_mode = mode;
@@ -2393,9 +2384,8 @@ TEST(DecodeTest, PreviewTest) {
jxl::CodecInOut io0;
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
c_srgb, 3, /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &io0.Main(),
/*float_in=*/false, /*align=*/0));
c_srgb, /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
format_orig, /*pool=*/nullptr, &io0.Main()));
GeneratePreview(params.preview_mode, &io0.Main());
size_t xsize_preview = io0.Main().xsize();
@@ -2416,9 +2406,9 @@ TEST(DecodeTest, PreviewTest) {
jxl::CodecInOut io1;
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(preview.data(), preview.size()), xsize_preview,
ysize_preview, c_srgb, 3, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, JXL_LITTLE_ENDIAN,
/*pool=*/nullptr, &io1.Main(), /*float_in=*/false, /*align=*/0));
ysize_preview, c_srgb, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format,
/*pool=*/nullptr, &io1.Main()));
jxl::ButteraugliParams ba;
// TODO(lode): this ButteraugliDistance silently returns 0 (dangerous for
@@ -2492,10 +2482,9 @@ TEST(DecodeTest, AnimationTest) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
io.frames.push_back(std::move(bundle));
}
@@ -2596,10 +2585,9 @@ TEST(DecodeTest, AnimationTestStreaming) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
io.frames.push_back(std::move(bundle));
}
@@ -2815,10 +2803,9 @@ TEST(DecodeTest, SkipCurrentFrameTest) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
io.frames.push_back(std::move(bundle));
}
@@ -2930,10 +2917,9 @@ TEST(DecodeTest, SkipFrameTest) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
io.frames.push_back(std::move(bundle));
}
@@ -3067,10 +3053,8 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
jxl::Span<const uint8_t>(frame_internal.data(),
frame_internal.size()),
xsize, ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle_internal,
/*float_in=*/false, /*align=*/0));
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle_internal));
bundle_internal.duration = 0;
bundle_internal.use_for_next_frame = true;
io.frames.push_back(std::move(bundle_internal));
@@ -3083,10 +3067,9 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
jxl::ImageBundle bundle(&io.metadata.m);
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frame.data(), frame.size()), xsize, ysize,
jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = frame_durations[i];
// Create some variation in which frames depend on which.
if (i != 3 && i != 9 && i != 10) {
@@ -3294,10 +3277,8 @@ TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
jxl::Span<const uint8_t>(frame_internal.data(),
frame_internal.size()),
xsize / 2, ysize / 2, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle_internal,
/*float_in=*/false, /*align=*/0));
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle_internal));
bundle_internal.duration = 0;
bundle_internal.use_for_next_frame = true;
bundle_internal.origin = {13, 17};
@@ -3315,10 +3296,9 @@ TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
jxl::ImageBundle bundle(&io.metadata.m);
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.duration = 5 + i;
frame_durations_nc.push_back(5 + i);
frame_durations_c.push_back(5 + i);
@@ -3579,10 +3559,8 @@ TEST(DecodeTest, OrientedCroppedFrameTest) {
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
/*float_in=*/false, /*align=*/0));
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &bundle));
bundle.origin = {cropx0, cropy0};
bundle.use_for_next_frame = true;
io.frames.push_back(std::move(bundle));
@@ -4659,14 +4637,15 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
}
std::vector<uint8_t> pixels =
jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
jxl::ColorEncoding color_encoding = jxl::ColorEncoding::SRGB(false);
jxl::CodecInOut io;
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
color_encoding, num_channels,
color_encoding,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, JXL_BIG_ENDIAN,
/*pool=*/nullptr, &io.Main(), /*float_in=*/false, /*align=*/0));
/*bits_per_sample=*/16, format,
/*pool=*/nullptr, &io.Main()));
jxl::TestCodestreamParams params;
if (lossless) {
params.cparams.SetLossless();
@@ -4681,7 +4660,6 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
num_channels, params);
JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
for (size_t increment : {(size_t)1, data.size()}) {
printf(
@@ -4782,11 +4760,9 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
jxl::CodecInOut io1;
EXPECT_TRUE(jxl::ConvertFromExternal(
jxl::Span<const uint8_t>(passes[p].data(), passes[p].size()), xsize,
ysize, color_encoding, num_channels,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN,
/*pool=*/nullptr, &io1.Main(), /*float_in=*/false,
/*align=*/0));
ysize, color_encoding,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format,
/*pool=*/nullptr, &io1.Main()));
distances[p] = ButteraugliDistance(io, io1, ba, jxl::GetJxlCms(),
nullptr, nullptr);
if (p == kNumPasses) break;
@@ -4800,7 +4776,7 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
// Verify that the returned pass image is actually not the
// same as the next pass image, by checking that it has a bit
// worse butteraugli score.
EXPECT_LT(distances[next_p] * 1.2f, distances[p]);
EXPECT_LT(distances[next_p] * 1.1f, distances[p]);
p = next_p;
}
}

View File

@@ -733,8 +733,8 @@ ImageF TileDistMap(const ImageF& distmap, int tile_size, int margin,
return tile_distmap;
}
constexpr float kDcQuantPow = 0.57f;
static const float kDcQuant = 1.12f;
constexpr float kDcQuantPow = 0.66f;
static const float kDcQuant = 1.0f;
static const float kAcQuant = 0.8294f;
void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin,
@@ -1037,7 +1037,7 @@ void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
}
float InitialQuantDC(float butteraugli_target) {
const float kDcMul = 2.9; // Butteraugli target where non-linearity kicks in.
const float kDcMul = 1.5; // Butteraugli target where non-linearity kicks in.
const float butteraugli_target_dc = std::max<float>(
0.5f * butteraugli_target,
std::min<float>(butteraugli_target,

View File

@@ -179,9 +179,7 @@ Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
} else {
auto compute_dc_coeffs = [&](int group_index, int /* thread */) {
modular_frame_encoder->AddVarDCTDC(
dc, group_index,
enc_state->cparams.butteraugli_distance >= 2.0f &&
enc_state->cparams.speed_tier < SpeedTier::kFalcon,
dc, group_index, enc_state->cparams.speed_tier < SpeedTier::kFalcon,
enc_state, /*jpeg_transcode=*/false);
};
JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, shared.frame_dim.num_dc_groups,

View File

@@ -84,42 +84,54 @@ void JXL_INLINE LoadFloatRow(float* JXL_RESTRICT row_out, const uint8_t* in,
uint32_t JXL_INLINE Load8(const uint8_t* p) { return *p; }
Status PixelFormatToExternal(const JxlPixelFormat& pixel_format,
size_t* bitdepth, bool* float_in) {
if (pixel_format.data_type == JXL_TYPE_FLOAT) {
*bitdepth = 32;
*float_in = true;
} else if (pixel_format.data_type == JXL_TYPE_FLOAT16) {
*bitdepth = 16;
*float_in = true;
} else if (pixel_format.data_type == JXL_TYPE_UINT8) {
*bitdepth = 8;
*float_in = false;
} else if (pixel_format.data_type == JXL_TYPE_UINT16) {
*bitdepth = 16;
*float_in = false;
} else {
return JXL_FAILURE("unsupported pixel format data type");
size_t JxlDataTypeBytes(JxlDataType data_type) {
switch (data_type) {
case JXL_TYPE_UINT8:
return 1;
case JXL_TYPE_UINT16:
return 2;
case JXL_TYPE_FLOAT16:
return 2;
case JXL_TYPE_FLOAT:
return 4;
default:
return 0;
}
return true;
}
} // namespace
Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
size_t ysize, size_t bits_per_sample,
JxlEndianness endianness, ThreadPool* pool,
ImageF* channel, bool float_in, size_t align) {
// TODO(firsching): Avoid code duplication with the function below.
JXL_CHECK(float_in ? bits_per_sample == 16 || bits_per_sample == 32
: bits_per_sample > 0 && bits_per_sample <= 16);
const size_t bytes_per_pixel = DivCeil(bits_per_sample, jxl::kBitsPerByte);
JxlPixelFormat format, size_t c, ThreadPool* pool,
ImageF* channel) {
if (format.data_type == JXL_TYPE_UINT8) {
JXL_RETURN_IF_ERROR(bits_per_sample > 0 && bits_per_sample <= 8);
} else if (format.data_type == JXL_TYPE_UINT16) {
JXL_RETURN_IF_ERROR(bits_per_sample > 8 && bits_per_sample <= 16);
} else if (format.data_type == JXL_TYPE_FLOAT16) {
JXL_RETURN_IF_ERROR(bits_per_sample == 16);
} else if (format.data_type == JXL_TYPE_FLOAT) {
JXL_RETURN_IF_ERROR(bits_per_sample == 32);
} else {
JXL_FAILURE("unsupported pixel format data type %d", format.data_type);
}
size_t bytes_per_channel = JxlDataTypeBytes(format.data_type);
size_t bytes_per_pixel = format.num_channels * bytes_per_channel;
size_t pixel_offset = c * bytes_per_channel;
const size_t last_row_size = xsize * bytes_per_pixel;
const size_t align = format.align;
const size_t row_size =
(align > 1 ? jxl::DivCeil(last_row_size, align) * align : last_row_size);
const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
if (xsize == 0 || ysize == 0) return JXL_FAILURE("Empty image");
if (bytes.size() < bytes_to_read) {
return JXL_FAILURE("Buffer size is too small");
return JXL_FAILURE("Buffer size is too small, expected: %" PRIuS
" got: %" PRIuS " (Image: %" PRIuS "x%" PRIuS
"x%u, bytes_per_channel: %" PRIuS ")",
bytes_to_read, bytes.size(), xsize, ysize,
format.num_channels, bytes_per_channel);
}
JXL_ASSERT(channel->xsize() == xsize);
JXL_ASSERT(channel->ysize() == ysize);
@@ -130,18 +142,19 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
}
const bool little_endian =
endianness == JXL_LITTLE_ENDIAN ||
(endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
format.endianness == JXL_LITTLE_ENDIAN ||
(format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
const uint8_t* const in = bytes.data();
if (float_in) {
if (format.data_type == JXL_TYPE_FLOAT ||
format.data_type == JXL_TYPE_FLOAT16) {
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task;
size_t i = row_size * task + pixel_offset;
float* JXL_RESTRICT row_out = channel->Row(y);
if (bits_per_sample == 16) {
if (format.data_type == JXL_TYPE_FLOAT16) {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat16(in + i);
@@ -174,9 +187,9 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task;
size_t i = row_size * task + pixel_offset;
float* JXL_RESTRICT row_out = channel->Row(y);
if (bits_per_sample <= 8) {
if (format.data_type == JXL_TYPE_UINT8) {
LoadFloatRow<Load8>(row_out, in + i, mul, xsize, bytes_per_pixel);
} else {
if (little_endian) {
@@ -195,187 +208,36 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
}
Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
size_t ysize, const ColorEncoding& c_current,
size_t channels, bool alpha_is_premultiplied,
size_t bits_per_sample, JxlEndianness endianness,
ThreadPool* pool, ImageBundle* ib, bool float_in,
size_t align) {
JXL_CHECK(float_in ? bits_per_sample == 16 || bits_per_sample == 32
: bits_per_sample > 0 && bits_per_sample <= 16);
bool alpha_is_premultiplied, size_t bits_per_sample,
JxlPixelFormat format, ThreadPool* pool,
ImageBundle* ib) {
const size_t color_channels = c_current.Channels();
bool has_alpha = channels == 2 || channels == 4;
if (channels < color_channels) {
bool has_alpha = format.num_channels == 2 || format.num_channels == 4;
if (format.num_channels < color_channels) {
return JXL_FAILURE("Expected %" PRIuS
" color channels, received only %" PRIuS " channels",
color_channels, channels);
" color channels, received only %u channels",
color_channels, format.num_channels);
}
const size_t bytes_per_channel = DivCeil(bits_per_sample, jxl::kBitsPerByte);
const size_t bytes_per_pixel = channels * bytes_per_channel;
if (bits_per_sample > 16 && bits_per_sample < 32) {
return JXL_FAILURE("not supported, try bits_per_sample=32");
}
const size_t last_row_size = xsize * bytes_per_pixel;
const size_t row_size =
(align > 1 ? jxl::DivCeil(last_row_size, align) * align : last_row_size);
const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
if (xsize == 0 || ysize == 0) return JXL_FAILURE("Empty image");
if (bytes.size() < bytes_to_read) {
return JXL_FAILURE(
"Buffer size is too small: expected at least %" PRIuS
" bytes (= %" PRIuS " * %" PRIuS " * %" PRIuS "), got %" PRIuS " bytes",
bytes_to_read, xsize, ysize, bytes_per_pixel, bytes.size());
}
// Too large buffer is likely an application bug, so also fail for that.
// Do allow padding to stride in last row though.
if (bytes.size() > row_size * ysize) {
return JXL_FAILURE(
"Buffer size is too large: expected at most %" PRIuS " bytes (= %" PRIuS
" * %" PRIuS " * %" PRIuS "), got %" PRIuS " bytes",
row_size * ysize, xsize, ysize, bytes_per_pixel, bytes.size());
}
const bool little_endian =
endianness == JXL_LITTLE_ENDIAN ||
(endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
const uint8_t* const in = bytes.data();
Image3F color(xsize, ysize);
if (float_in) {
for (size_t c = 0; c < color_channels; ++c) {
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i =
row_size * task + (c * bits_per_sample / jxl::kBitsPerByte);
float* JXL_RESTRICT row_out = color.PlaneRow(c, y);
if (bits_per_sample == 16) {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat16(in + i);
i += bytes_per_pixel;
}
} else {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadBEFloat16(in + i);
i += bytes_per_pixel;
}
}
} else {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat(in + i);
i += bytes_per_pixel;
}
} else {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadBEFloat(in + i);
i += bytes_per_pixel;
}
}
}
},
"ConvertRGBFloat"));
}
} else {
// Multiplier to convert from the integer range to floating point 0-1 range.
float mul = 1. / ((1ull << bits_per_sample) - 1);
for (size_t c = 0; c < color_channels; ++c) {
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task + c * bytes_per_channel;
float* JXL_RESTRICT row_out = color.PlaneRow(c, y);
if (bits_per_sample <= 8) {
LoadFloatRow<Load8>(row_out, in + i, mul, xsize, bytes_per_pixel);
} else {
if (little_endian) {
LoadFloatRow<LoadLE16>(row_out, in + i, mul, xsize,
bytes_per_pixel);
} else {
LoadFloatRow<LoadBE16>(row_out, in + i, mul, xsize,
bytes_per_pixel);
}
}
},
"ConvertRGBUint"));
}
for (size_t c = 0; c < color_channels; ++c) {
JXL_RETURN_IF_ERROR(ConvertFromExternal(bytes, xsize, ysize,
bits_per_sample, format, c, pool,
&color.Plane(c)));
}
if (color_channels == 1) {
CopyImageTo(color.Plane(0), &color.Plane(1));
CopyImageTo(color.Plane(0), &color.Plane(2));
}
ib->SetFromImage(std::move(color), c_current);
// Passing an interleaved image with an alpha channel to an image that doesn't
// have alpha channel just discards the passed alpha channel.
if (has_alpha && ib->HasAlpha()) {
ImageF alpha(xsize, ysize);
if (float_in) {
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task +
((channels - 1) * bits_per_sample / jxl::kBitsPerByte);
float* JXL_RESTRICT row_out = alpha.Row(y);
if (bits_per_sample == 16) {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat16(in + i);
i += bytes_per_pixel;
}
} else {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadBEFloat16(in + i);
i += bytes_per_pixel;
}
}
} else {
if (little_endian) {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadLEFloat(in + i);
i += bytes_per_pixel;
}
} else {
for (size_t x = 0; x < xsize; ++x) {
row_out[x] = LoadBEFloat(in + i);
i += bytes_per_pixel;
}
}
}
},
"ConvertAlphaFloat"));
} else {
float mul = 1. / ((1ull << bits_per_sample) - 1);
JXL_RETURN_IF_ERROR(RunOnPool(
pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
[&](const uint32_t task, size_t /*thread*/) {
const size_t y = task;
size_t i = row_size * task + (channels - 1) * bytes_per_channel;
float* JXL_RESTRICT row_out = alpha.Row(y);
if (bits_per_sample <= 8) {
LoadFloatRow<Load8>(row_out, in + i, mul, xsize, bytes_per_pixel);
} else {
if (little_endian) {
LoadFloatRow<LoadLE16>(row_out, in + i, mul, xsize,
bytes_per_pixel);
} else {
LoadFloatRow<LoadBE16>(row_out, in + i, mul, xsize,
bytes_per_pixel);
}
}
},
"ConvertAlphaUint"));
}
JXL_RETURN_IF_ERROR(
ConvertFromExternal(bytes, xsize, ysize, bits_per_sample, format,
format.num_channels - 1, pool, &alpha));
ib->SetAlpha(std::move(alpha), alpha_is_premultiplied);
} else if (!has_alpha && ib->HasAlpha()) {
// if alpha is not passed, but it is expected, then assume
@@ -391,18 +253,10 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
size_t ysize, const void* buffer, size_t size,
ThreadPool* pool, ImageF* channel) {
size_t bitdepth;
bool float_in;
JXL_RETURN_IF_ERROR(
PixelFormatToExternal(pixel_format, &bitdepth, &float_in));
JXL_RETURN_IF_ERROR(ConvertFromExternal(
size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
return ConvertFromExternal(
jxl::Span<const uint8_t>(static_cast<const uint8_t*>(buffer), size),
xsize, ysize, bitdepth, pixel_format.endianness, pool, channel, float_in,
pixel_format.align));
return true;
xsize, ysize, bitdepth, pixel_format, 0, pool, channel);
}
Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
@@ -410,16 +264,11 @@ Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
jxl::ThreadPool* pool,
const jxl::ColorEncoding& c_current,
jxl::ImageBundle* ib) {
size_t bitdepth;
bool float_in;
JXL_RETURN_IF_ERROR(
PixelFormatToExternal(pixel_format, &bitdepth, &float_in));
size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
JXL_RETURN_IF_ERROR(ConvertFromExternal(
jxl::Span<const uint8_t>(static_cast<const uint8_t*>(buffer), size),
xsize, ysize, c_current, pixel_format.num_channels,
/*alpha_is_premultiplied=*/false, bitdepth, pixel_format.endianness, pool,
ib, float_in, pixel_format.align));
xsize, ysize, c_current,
/*alpha_is_premultiplied=*/false, bitdepth, pixel_format, pool, ib));
ib->VerifyMetadata();
return true;

View File

@@ -23,17 +23,16 @@
namespace jxl {
Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
size_t ysize, size_t bits_per_sample,
JxlEndianness endianness, ThreadPool* pool,
ImageF* channel, bool float_in, size_t align);
JxlPixelFormat format, size_t c, ThreadPool* pool,
ImageF* channel);
// Convert an interleaved pixel buffer to the internal ImageBundle
// representation. This is the opposite of ConvertToExternal().
Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
size_t ysize, const ColorEncoding& c_current,
size_t channels, bool alpha_is_premultiplied,
size_t bits_per_sample, JxlEndianness endianness,
ThreadPool* pool, ImageBundle* ib, bool float_in,
size_t align);
bool alpha_is_premultiplied, size_t bits_per_sample,
JxlPixelFormat format, ThreadPool* pool,
ImageBundle* ib);
Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
size_t ysize, const void* buffer, size_t size,
ThreadPool* pool, ImageF* channel);

View File

@@ -21,17 +21,16 @@ void BM_EncExternalImage_ConvertImageRGBA(benchmark::State& state) {
ImageBundle ib(&im);
std::vector<uint8_t> interleaved(xsize * ysize * 4);
JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
for (auto _ : state) {
for (size_t i = 0; i < kNumIter; ++i) {
JXL_CHECK(ConvertFromExternal(
Span<const uint8_t>(interleaved.data(), interleaved.size()), xsize,
ysize,
/*c_current=*/ColorEncoding::SRGB(),
/*channels=*/4,
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, JXL_NATIVE_ENDIAN,
/*pool=*/nullptr, &ib, /*float_in=*/false, /*align=*/0));
/*bits_per_sample=*/8, format,
/*pool=*/nullptr, &ib));
}
}

View File

@@ -25,23 +25,23 @@ TEST(ExternalImageTest, InvalidSize) {
im.SetAlphaBits(8);
ImageBundle ib(&im);
JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
const uint8_t buf[10 * 100 * 8] = {};
EXPECT_FALSE(ConvertFromExternal(
Span<const uint8_t>(buf, 10), /*xsize=*/10, /*ysize=*/100,
/*c_current=*/ColorEncoding::SRGB(), /*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
nullptr, &ib, /*float_in=*/false, /*align=*/0));
/*c_current=*/ColorEncoding::SRGB(),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format, nullptr,
&ib));
EXPECT_FALSE(ConvertFromExternal(
Span<const uint8_t>(buf, sizeof(buf) - 1), /*xsize=*/10, /*ysize=*/100,
/*c_current=*/ColorEncoding::SRGB(), /*channels=*/4,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
nullptr, &ib, /*float_in=*/false, /*align=*/0));
/*c_current=*/ColorEncoding::SRGB(),
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, format, nullptr,
&ib));
EXPECT_TRUE(
ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), /*xsize=*/10,
/*ysize=*/100, /*c_current=*/ColorEncoding::SRGB(),
/*channels=*/4, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, JXL_BIG_ENDIAN, nullptr, &ib,
/*float_in=*/false, /*align=*/0));
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/16, format, nullptr, &ib));
}
#endif
@@ -54,14 +54,14 @@ TEST(ExternalImageTest, AlphaMissing) {
const size_t ysize = 20;
const uint8_t buf[xsize * ysize * 4] = {};
JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
// has_alpha is true but the ImageBundle has no alpha. Alpha channel should
// be ignored.
EXPECT_TRUE(
ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), xsize, ysize,
/*c_current=*/ColorEncoding::SRGB(),
/*channels=*/4, /*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, JXL_BIG_ENDIAN, nullptr, &ib,
/*float_in=*/false, /*align=*/0));
EXPECT_TRUE(ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), xsize,
ysize,
/*c_current=*/ColorEncoding::SRGB(),
/*alpha_is_premultiplied=*/false,
/*bits_per_sample=*/8, format, nullptr, &ib));
EXPECT_FALSE(ib.HasAlpha());
}

Some files were not shown because too many files have changed in this diff Show More