Bug 1094275 - Update the in-tree copy of jemalloc3 to commit b4acf73. r=glandium

Guilherme Goncalves
2014-12-17 09:14:48 +09:00
parent 13345e66e6
commit b9796fea26
129 changed files with 11347 additions and 10068 deletions

View File

@@ -9063,7 +9063,7 @@ esac
# Run jemalloc configure script
if test -z "$MOZ_NATIVE_JEMALLOC" -a "$MOZ_MEMORY" && test -n "$MOZ_JEMALLOC3" -o -n "$MOZ_REPLACE_MALLOC"; then
ac_configure_args="--build=$build --host=$target --enable-stats --with-jemalloc-prefix=je_"
ac_configure_args="--build=$build --host=$target --enable-stats --with-jemalloc-prefix=je_ --disable-valgrind"
if test -n "$MOZ_REPLACE_MALLOC"; then
# When using replace_malloc, we always want memalign and valloc exported from jemalloc.
ac_configure_args="$ac_configure_args ac_cv_func_memalign=yes"

View File

@@ -8,6 +8,8 @@ var ignoreIndirectCalls = {
"aMallocSizeOf" : true,
"_malloc_message" : true,
"je_malloc_message" : true,
"chunk_dalloc" : true,
"chunk_alloc" : true,
"__conv" : true,
"__convf" : true,
"prerrortable.c:callback_newtable" : true,

View File

@@ -21,7 +21,7 @@ je_(mallctlnametomib)(const char *name, size_t *mibp, size_t *miblenp);
MOZ_IMPORT_API int
je_(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
MOZ_IMPORT_API int
-je_(nallocm)(size_t *rsize, size_t size, int flags);
+je_(nallocx)(size_t size, int flags);
#else
# include "jemalloc/jemalloc.h"
@@ -50,15 +50,12 @@ je_(nallocm)(size_t *rsize, size_t size, int flags);
MOZ_MEMORY_API size_t
malloc_good_size_impl(size_t size)
{
-size_t ret;
-/* je_nallocm crashes when given a size of 0. As
+/* je_nallocx crashes when given a size of 0. As
* malloc_usable_size(malloc(0)) and malloc_usable_size(malloc(1))
* return the same value, use a size of 1. */
if (size == 0)
size = 1;
if (!je_(nallocm)(&ret, size, 0))
return ret;
return size;
return je_(nallocx)(size, 0);
}
MOZ_JEMALLOC_API void
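
For reference (not part of the patch): nallocm() was jemalloc's experimental size probe, reporting the would-be usable size through an out-parameter and returning zero on success, while the stable nallocx() returns that size directly (0 on error), which is what lets the body above collapse to a single return. A hedged standalone sketch of the two call shapes, with the je_-prefixed names written out:

/* Illustrative sketch only; not the tree's code. */
#include <stddef.h>

int    je_nallocm(size_t *rsize, size_t size, int flags); /* old, experimental */
size_t je_nallocx(size_t size, int flags);                /* new, stable */

static size_t
good_size_old(size_t size)
{
	size_t ret;

	if (!je_nallocm(&ret, size, 0)) /* zero return means success */
		return ret;
	return size;                    /* fall back to the request */
}

static size_t
good_size_new(size_t size)
{
	return je_nallocx(size, 0);     /* usable size, or 0 on error */
}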

View File

@@ -0,0 +1,68 @@
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -6662,28 +6662,6 @@ fi
-if test "x`git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then
- rm -f "${srcroot}VERSION"
- for pattern in '[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \
- '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \
- '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \
- '[0-9][0-9].[0-9][0-9].[0-9]' \
- '[0-9][0-9].[0-9][0-9].[0-9][0-9]'; do
- if test ! -e "${srcroot}VERSION" ; then
- git describe --long --abbrev=40 --match="${pattern}" > "${srcroot}VERSION.tmp" 2>/dev/null
- if test $? -eq 0 ; then
- mv "${srcroot}VERSION.tmp" "${srcroot}VERSION"
- break
- fi
- fi
- done
-fi
-rm -f "${srcroot}VERSION.tmp"
-if test ! -e "${srcroot}VERSION" ; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: Missing VERSION file, and unable to generate it; creating bogus VERSION" >&5
-$as_echo "Missing VERSION file, and unable to generate it; creating bogus VERSION" >&6; }
- echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${srcroot}VERSION"
-fi
jemalloc_version=`cat "${srcroot}VERSION"`
jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print $1}'`
jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print $2}'`
diff --git a/configure.ac b/configure.ac
--- a/configure.ac
+++ b/configure.ac
@@ -1055,32 +1055,6 @@ dnl ============================================================================
dnl jemalloc configuration.
dnl
-dnl Set VERSION if source directory is inside a git repository.
-if test "x`git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then
- dnl Pattern globs aren't powerful enough to match both single- and
- dnl double-digit version numbers, so iterate over patterns to support up to
- dnl version 99.99.99 without any accidental matches.
- rm -f "${srcroot}VERSION"
- for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \
- '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \
- '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \
- '[0-9][0-9].[0-9][0-9].[0-9]' \
- '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do
- if test ! -e "${srcroot}VERSION" ; then
- git describe --long --abbrev=40 --match="${pattern}" > "${srcroot}VERSION.tmp" 2>/dev/null
- if test $? -eq 0 ; then
- mv "${srcroot}VERSION.tmp" "${srcroot}VERSION"
- break
- fi
- fi
- done
-fi
-rm -f "${srcroot}VERSION.tmp"
-if test ! -e "${srcroot}VERSION" ; then
- AC_MSG_RESULT(
- [Missing VERSION file, and unable to generate it; creating bogus VERSION])
- echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${srcroot}VERSION"
-fi
jemalloc_version=`cat "${srcroot}VERSION"`
jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'`
jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'`
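
As a worked example of the version parsing kept by this patch (hypothetical tag data): a VERSION line such as 3.6.0-137-g0123abc becomes "3 6 0 137 0123abc" after tr ".g-" " ", so the awk fields yield major=3 and minor=6, with the later fields carrying the bugfix number, the commit count since the tag, and the bare revision. The bogus 0.0.0-0-g0000... fallback parses the same way.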

View File

@@ -1,340 +0,0 @@
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -1905,83 +1905,16 @@ fi
# interfere with the next link command; also delete a directory that is
# left behind by Apple's compiler. We do this before executing the actions.
rm -rf conftest.dSYM conftest_ipa8_conftest.oo
eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
as_fn_set_status $ac_retval
} # ac_fn_c_try_link
-# ac_fn_c_check_func LINENO FUNC VAR
-# ----------------------------------
-# Tests whether FUNC exists, setting the cache variable VAR accordingly
-ac_fn_c_check_func ()
-{
- as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
-$as_echo_n "checking for $2... " >&6; }
-if eval \${$3+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-/* Define $2 to an innocuous variant, in case <limits.h> declares $2.
- For example, HP-UX 11i <limits.h> declares gettimeofday. */
-#define $2 innocuous_$2
-
-/* System header to define __stub macros and hopefully few prototypes,
- which can conflict with char $2 (); below.
- Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
- <limits.h> exists even on freestanding compilers. */
-
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
-
-#undef $2
-
-/* Override any GCC internal prototype to avoid an error.
- Use char because int might match the return type of a GCC
- builtin and then its argument prototype would still apply. */
-#ifdef __cplusplus
-extern "C"
-#endif
-char $2 ();
-/* The GNU C library defines this for functions which it implements
- to always fail with ENOSYS. Some functions are actually named
- something starting with __ and the normal name is an alias. */
-#if defined __stub_$2 || defined __stub___$2
-choke me
-#endif
-
-int
-main ()
-{
-return $2 ();
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- eval "$3=yes"
-else
- eval "$3=no"
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-fi
-eval ac_res=\$$3
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
- eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-
-} # ac_fn_c_check_func
-
# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES
# -------------------------------------------------------
# Tests whether HEADER exists, giving a warning if it cannot be compiled using
# the include files in INCLUDES and setting the cache variable VAR
# accordingly.
ac_fn_c_check_header_mongrel ()
{
as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
@@ -2059,16 +1992,83 @@ fi
eval ac_res=\$$3
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
$as_echo "$ac_res" >&6; }
fi
eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
} # ac_fn_c_check_header_mongrel
+# ac_fn_c_check_func LINENO FUNC VAR
+# ----------------------------------
+# Tests whether FUNC exists, setting the cache variable VAR accordingly
+ac_fn_c_check_func ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+/* Define $2 to an innocuous variant, in case <limits.h> declares $2.
+ For example, HP-UX 11i <limits.h> declares gettimeofday. */
+#define $2 innocuous_$2
+
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char $2 (); below.
+ Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+ <limits.h> exists even on freestanding compilers. */
+
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+
+#undef $2
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char $2 ();
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined __stub_$2 || defined __stub___$2
+choke me
+#endif
+
+int
+main ()
+{
+return $2 ();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ eval "$3=yes"
+else
+ eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_func
+
# ac_fn_c_check_type LINENO TYPE VAR INCLUDES
# -------------------------------------------
# Tests whether TYPE exists after having included INCLUDES, setting cache
# variable VAR accordingly.
ac_fn_c_check_type ()
{
as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
@@ -4817,17 +4817,16 @@ esac
AR=$ac_ct_AR
fi
else
AR="$ac_cv_prog_AR"
fi
default_munmap="1"
-JEMALLOC_USABLE_SIZE_CONST="const"
case "${host}" in
*-*-darwin*)
CFLAGS="$CFLAGS"
abi="macho"
$as_echo "#define JEMALLOC_PURGE_MADVISE_FREE " >>confdefs.h
RPATH=""
LD_PRELOAD_VAR="DYLD_INSERT_LIBRARIES"
@@ -4850,17 +4849,16 @@ case "${host}" in
CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE"
abi="elf"
$as_echo "#define JEMALLOC_HAS_ALLOCA_H 1" >>confdefs.h
$as_echo "#define JEMALLOC_PURGE_MADVISE_DONTNEED " >>confdefs.h
$as_echo "#define JEMALLOC_THREADED_INIT " >>confdefs.h
- JEMALLOC_USABLE_SIZE_CONST=""
default_munmap="0"
;;
*-*-netbsd*)
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking ABI" >&5
$as_echo_n "checking ABI... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#ifdef __ELF__
@@ -4930,16 +4928,60 @@ rm -f core conftest.err conftest.$ac_obj
PIC_CFLAGS=""
;;
*)
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: Unsupported operating system: ${host}" >&5
$as_echo "Unsupported operating system: ${host}" >&6; }
abi="elf"
;;
esac
+
+JEMALLOC_USABLE_SIZE_CONST=const
+for ac_header in malloc.h
+do :
+ ac_fn_c_check_header_mongrel "$LINENO" "malloc.h" "ac_cv_header_malloc_h" "$ac_includes_default"
+if test "x$ac_cv_header_malloc_h" = xyes; then :
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_MALLOC_H 1
+_ACEOF
+
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether malloc_usable_size definition can use const argument" >&5
+$as_echo_n "checking whether malloc_usable_size definition can use const argument... " >&6; }
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <malloc.h>
+ #include <stddef.h>
+ size_t malloc_usable_size(const void *ptr);
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+
+ JEMALLOC_USABLE_SIZE_CONST=
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+fi
+
+done
+
cat >>confdefs.h <<_ACEOF
#define JEMALLOC_USABLE_SIZE_CONST $JEMALLOC_USABLE_SIZE_CONST
_ACEOF
diff --git a/configure.ac b/configure.ac
--- a/configure.ac
+++ b/configure.ac
@@ -253,17 +253,16 @@ AC_PROG_AR
dnl Platform-specific settings. abi and RPATH can probably be determined
dnl programmatically, but doing so is error-prone, which makes it generally
dnl not worth the trouble.
dnl
dnl Define cpp macros in CPPFLAGS, rather than doing AC_DEFINE(macro), since the
dnl definitions need to be seen before any headers are included, which is a pain
dnl to make happen otherwise.
default_munmap="1"
-JEMALLOC_USABLE_SIZE_CONST="const"
case "${host}" in
*-*-darwin*)
CFLAGS="$CFLAGS"
abi="macho"
AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ])
RPATH=""
LD_PRELOAD_VAR="DYLD_INSERT_LIBRARIES"
so="dylib"
@@ -281,17 +280,16 @@ case "${host}" in
;;
*-*-linux*)
CFLAGS="$CFLAGS"
CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE"
abi="elf"
AC_DEFINE([JEMALLOC_HAS_ALLOCA_H])
AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED], [ ])
AC_DEFINE([JEMALLOC_THREADED_INIT], [ ])
- JEMALLOC_USABLE_SIZE_CONST=""
default_munmap="0"
;;
*-*-netbsd*)
AC_MSG_CHECKING([ABI])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
[[#ifdef __ELF__
/* ELF */
#else
@@ -346,16 +344,32 @@ case "${host}" in
SOREV="${so}"
PIC_CFLAGS=""
;;
*)
AC_MSG_RESULT([Unsupported operating system: ${host}])
abi="elf"
;;
esac
+
+JEMALLOC_USABLE_SIZE_CONST=const
+AC_CHECK_HEADERS([malloc.h], [
+ AC_MSG_CHECKING([whether malloc_usable_size definition can use const argument])
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+ [#include <malloc.h>
+ #include <stddef.h>
+ size_t malloc_usable_size(const void *ptr);
+ ],
+ [])],[
+ AC_MSG_RESULT([yes])
+ ],[
+ JEMALLOC_USABLE_SIZE_CONST=
+ AC_MSG_RESULT([no])
+ ])
+])
AC_DEFINE_UNQUOTED([JEMALLOC_USABLE_SIZE_CONST], [$JEMALLOC_USABLE_SIZE_CONST])
AC_SUBST([abi])
AC_SUBST([RPATH])
AC_SUBST([LD_PRELOAD_VAR])
AC_SUBST([so])
AC_SUBST([importlib])
AC_SUBST([o])
AC_SUBST([a])
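
Both forms of the probe above work by redeclaring malloc_usable_size() with a const-qualified pointer next to <malloc.h>: if the system header declares the function without const, the two declarations conflict, the test program fails to compile, and JEMALLOC_USABLE_SIZE_CONST is left empty. A minimal sketch of how such a detected qualifier is consumed in a prototype (where jemalloc's headers do this is assumed here, not shown in the hunk):

/* Illustrative only; configure substitutes `const` or nothing. */
#include <stddef.h>

#define JEMALLOC_USABLE_SIZE_CONST const

size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr);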

View File

@@ -1,46 +0,0 @@
diff --git a/include/jemalloc/internal/hash.h b/include/jemalloc/internal/hash.h
--- a/include/jemalloc/internal/hash.h
+++ b/include/jemalloc/internal/hash.h
@@ -71,19 +71,19 @@ hash_fmix_32(uint32_t h)
return (h);
}
JEMALLOC_INLINE uint64_t
hash_fmix_64(uint64_t k)
{
k ^= k >> 33;
- k *= QU(0xff51afd7ed558ccdLLU);
+ k *= QU(0xff51afd7ed558ccdULL);
k ^= k >> 33;
- k *= QU(0xc4ceb9fe1a85ec53LLU);
+ k *= QU(0xc4ceb9fe1a85ec53ULL);
k ^= k >> 33;
return (k);
}
JEMALLOC_INLINE uint32_t
hash_x86_32(const void *key, int len, uint32_t seed)
{
@@ -242,18 +242,18 @@ hash_x64_128(const void *key, const int
uint64_t r_out[2])
{
const uint8_t *data = (const uint8_t *) key;
const int nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
- const uint64_t c1 = QU(0x87c37b91114253d5LLU);
- const uint64_t c2 = QU(0x4cf5ad432745937fLLU);
+ const uint64_t c1 = QU(0x87c37b91114253d5ULL);
+ const uint64_t c2 = QU(0x4cf5ad432745937fULL);
/* body */
{
const uint64_t *blocks = (const uint64_t *) (data);
int i;
for (i = 0; i < nblocks; i++) {
uint64_t k1 = hash_get_block_64(blocks, i*2 + 0);

View File

@@ -1,354 +0,0 @@
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -6940,18 +6940,67 @@ else
je_cv_function_ffsl=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $je_cv_function_ffsl" >&5
$as_echo "$je_cv_function_ffsl" >&6; }
-if test "x${je_cv_function_ffsl}" != "xyes" ; then
- as_fn_error $? "Cannot build without ffsl(3)" "$LINENO" 5
+if test "x${je_cv_function_ffsl}" == "xyes" ; then
+ $as_echo "#define JEMALLOC_INTERNAL_FFSL ffsl" >>confdefs.h
+
+ $as_echo "#define JEMALLOC_INTERNAL_FFS ffs" >>confdefs.h
+
+else
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a program using __builtin_ffsl is compilable" >&5
+$as_echo_n "checking whether a program using __builtin_ffsl is compilable... " >&6; }
+if ${je_cv_gcc_builtin_ffsl+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ #include <stdio.h>
+ #include <strings.h>
+ #include <string.h>
+
+int
+main ()
+{
+
+ {
+ int rv = __builtin_ffsl(0x08);
+ printf("%d\n", rv);
+ }
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ je_cv_gcc_builtin_ffsl=yes
+else
+ je_cv_gcc_builtin_ffsl=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $je_cv_gcc_builtin_ffsl" >&5
+$as_echo "$je_cv_gcc_builtin_ffsl" >&6; }
+
+ if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
+ $as_echo "#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl" >>confdefs.h
+
+ $as_echo "#define JEMALLOC_INTERNAL_FFS __builtin_ffs" >>confdefs.h
+
+ else
+ as_fn_error $? "Cannot build without ffsl(3) or __builtin_ffsl()" "$LINENO" 5
+ fi
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether atomic(9) is compilable" >&5
$as_echo_n "checking whether atomic(9) is compilable... " >&6; }
if ${je_cv_atomic9+:} false; then :
$as_echo_n "(cached) " >&6
diff --git a/configure.ac b/configure.ac
--- a/configure.ac
+++ b/configure.ac
@@ -1160,31 +1160,51 @@ fi
AC_SUBST([enable_tls])
if test "x${enable_tls}" = "x1" ; then
AC_DEFINE_UNQUOTED([JEMALLOC_TLS], [ ])
elif test "x${force_tls}" = "x1" ; then
AC_MSG_ERROR([Failed to configure TLS, which is mandatory for correct function])
fi
dnl ============================================================================
-dnl Check for ffsl(3), and fail if not found. This function exists on all
-dnl platforms that jemalloc currently has a chance of functioning on without
-dnl modification.
+dnl Check for ffsl(3), then __builtin_ffsl(), and fail if neither are found.
+dnl One of those two functions should (theoretically) exist on all platforms
+dnl that jemalloc currently has a chance of functioning on without modification.
+dnl We additionally assume ffs() or __builtin_ffs() are defined if
+dnl ffsl() or __builtin_ffsl() are defined, respectively.
JE_COMPILABLE([a program using ffsl], [
#include <stdio.h>
#include <strings.h>
#include <string.h>
], [
{
int rv = ffsl(0x08);
printf("%d\n", rv);
}
], [je_cv_function_ffsl])
-if test "x${je_cv_function_ffsl}" != "xyes" ; then
- AC_MSG_ERROR([Cannot build without ffsl(3)])
+if test "x${je_cv_function_ffsl}" == "xyes" ; then
+ AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl])
+ AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs])
+else
+ JE_COMPILABLE([a program using __builtin_ffsl], [
+ #include <stdio.h>
+ #include <strings.h>
+ #include <string.h>
+ ], [
+ {
+ int rv = __builtin_ffsl(0x08);
+ printf("%d\n", rv);
+ }
+ ], [je_cv_gcc_builtin_ffsl])
+ if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
+ AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl])
+ AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs])
+ else
+ AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()])
+ fi
fi
dnl ============================================================================
dnl Check for atomic(9) operations as provided on FreeBSD.
JE_COMPILABLE([atomic(9)], [
#include <sys/types.h>
#include <machine/atomic.h>
diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
@@ -810,17 +810,17 @@ arena_run_regind(arena_run_t *run, arena
* Avoid doing division with a variable divisor if possible. Using
* actual division here can reduce allocator throughput by over 20%!
*/
diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run -
bin_info->reg0_offset);
/* Rescale (factor powers of 2 out of the numerator and denominator). */
interval = bin_info->reg_interval;
- shift = ffs(interval) - 1;
+ shift = jemalloc_ffs(interval) - 1;
diff >>= shift;
interval >>= shift;
if (interval == 1) {
/* The divisor was a power of 2. */
regind = diff;
} else {
/*
diff --git a/include/jemalloc/internal/bitmap.h b/include/jemalloc/internal/bitmap.h
--- a/include/jemalloc/internal/bitmap.h
+++ b/include/jemalloc/internal/bitmap.h
@@ -125,21 +125,21 @@ bitmap_sfu(bitmap_t *bitmap, const bitma
size_t bit;
bitmap_t g;
unsigned i;
assert(bitmap_full(bitmap, binfo) == false);
i = binfo->nlevels - 1;
g = bitmap[binfo->levels[i].group_offset];
- bit = ffsl(g) - 1;
+ bit = jemalloc_ffsl(g) - 1;
while (i > 0) {
i--;
g = bitmap[binfo->levels[i].group_offset + bit];
- bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1);
+ bit = (bit << LG_BITMAP_GROUP_NBITS) + (jemalloc_ffsl(g) - 1);
}
bitmap_set(bitmap, binfo, bit);
return (bit);
}
JEMALLOC_INLINE void
bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -9,21 +9,23 @@
# define EPERM ERROR_WRITE_FAULT
# define EFAULT ERROR_INVALID_ADDRESS
# define ENOMEM ERROR_NOT_ENOUGH_MEMORY
# undef ERANGE
# define ERANGE ERROR_INVALID_DATA
#else
# include <sys/param.h>
# include <sys/mman.h>
-# include <sys/syscall.h>
-# if !defined(SYS_write) && defined(__NR_write)
-# define SYS_write __NR_write
+# if !defined(__pnacl__) && !defined(__native_client__)
+# include <sys/syscall.h>
+# if !defined(SYS_write) && defined(__NR_write)
+# define SYS_write __NR_write
+# endif
+# include <sys/uio.h>
# endif
-# include <sys/uio.h>
# include <pthread.h>
# include <errno.h>
#endif
#include <sys/types.h>
#include <limits.h>
#ifndef SIZE_T_MAX
# define SIZE_T_MAX SIZE_MAX
@@ -275,16 +277,19 @@ static const bool config_ivsalloc =
# define LG_QUANTUM 4
# endif
# ifdef __SH4__
# define LG_QUANTUM 4
# endif
# ifdef __tile__
# define LG_QUANTUM 4
# endif
+# ifdef __le32__
+# define LG_QUANTUM 4
+# endif
# ifndef LG_QUANTUM
# error "No LG_QUANTUM definition for architecture; specify via CPPFLAGS"
# endif
#endif
#define QUANTUM ((size_t)(1U << LG_QUANTUM))
#define QUANTUM_MASK (QUANTUM - 1)
diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -153,16 +153,23 @@
* memory map holes, much like munmap(2) does.
*/
#undef JEMALLOC_MREMAP
/* TLS is used to map arenas and magazine caches to threads. */
#undef JEMALLOC_TLS
/*
+ * ffs()/ffsl() functions to use for bitmapping. Don't use these directly;
+ * instead, use jemalloc_ffs() or jemalloc_ffsl() from util.h.
+ */
+#undef JEMALLOC_INTERNAL_FFSL
+#undef JEMALLOC_INTERNAL_FFS
+
+/*
* JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside
* within jemalloc-owned chunks before dereferencing them.
*/
#undef JEMALLOC_IVSALLOC
/*
* Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
*/
diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h
--- a/include/jemalloc/internal/util.h
+++ b/include/jemalloc/internal/util.h
@@ -104,22 +104,44 @@ void malloc_cprintf(void (*write)(void *
void malloc_printf(const char *format, ...)
JEMALLOC_ATTR(format(printf, 1, 2));
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
+int jemalloc_ffsl(long bitmap);
+int jemalloc_ffs(int bitmap);
size_t pow2_ceil(size_t x);
void set_errno(int errnum);
int get_errno(void);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_UTIL_C_))
+
+/* Sanity check: */
+#if !defined(JEMALLOC_INTERNAL_FFSL) || !defined(JEMALLOC_INTERNAL_FFS)
+# error Both JEMALLOC_INTERNAL_FFSL && JEMALLOC_INTERNAL_FFS should have been defined by configure
+#endif
+
+JEMALLOC_ALWAYS_INLINE int
+jemalloc_ffsl(long bitmap)
+{
+
+ return (JEMALLOC_INTERNAL_FFSL(bitmap));
+}
+
+JEMALLOC_ALWAYS_INLINE int
+jemalloc_ffs(int bitmap)
+{
+
+ return (JEMALLOC_INTERNAL_FFS(bitmap));
+}
+
/* Compute the smallest power of 2 that is >= x. */
JEMALLOC_INLINE size_t
pow2_ceil(size_t x)
{
x--;
x |= x >> 1;
x |= x >> 2;
diff --git a/src/arena.c b/src/arena.c
--- a/src/arena.c
+++ b/src/arena.c
@@ -2378,17 +2378,17 @@ bin_info_run_size_calc(arena_bin_info_t
/*
* Determine redzone size based on minimum alignment and minimum
* redzone size. Add padding to the end of the run if it is needed to
* align the regions. The padding allows each redzone to be half the
* minimum alignment; without the padding, each redzone would have to
* be twice as large in order to maintain alignment.
*/
if (config_fill && opt_redzone) {
- size_t align_min = ZU(1) << (ffs(bin_info->reg_size) - 1);
+ size_t align_min = ZU(1) << (jemalloc_ffs(bin_info->reg_size) - 1);
if (align_min <= REDZONE_MINSIZE) {
bin_info->redzone_size = REDZONE_MINSIZE;
pad_size = 0;
} else {
bin_info->redzone_size = align_min >> 1;
pad_size = bin_info->redzone_size;
}
} else {
diff --git a/src/rtree.c b/src/rtree.c
--- a/src/rtree.c
+++ b/src/rtree.c
@@ -4,18 +4,18 @@
rtree_t *
rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc)
{
rtree_t *ret;
unsigned bits_per_level, bits_in_leaf, height, i;
assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3));
- bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
- bits_in_leaf = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1;
+ bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
+ bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1;
if (bits > bits_in_leaf) {
height = 1 + (bits - bits_in_leaf) / bits_per_level;
if ((height-1) * bits_per_level + bits_in_leaf != bits)
height++;
} else {
height = 1;
}
assert((height-1) * bits_per_level + bits_in_leaf >= bits);
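
A note on the `- 1` idiom the bitmap and arena hunks preserve: ffsl(3) and __builtin_ffsl() both return the 1-based position of the least significant set bit, or 0 when no bit is set, so callers subtract one to get a 0-based bit index. A minimal standalone check of that convention, assuming a POSIX hosted environment:

#include <assert.h>
#include <strings.h> /* ffsl(3) */

int
main(void)
{
	assert(ffsl(0x08) == 4);     /* bit 3 is set; the result is 1-based */
	assert(ffsl(0x08) - 1 == 3); /* the 0-based index bitmap_sfu() wants */
	assert(ffsl(0) == 0);        /* no bits set */
	return 0;
}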

View File

@@ -1,225 +0,0 @@
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -6904,27 +6904,70 @@ if test "x${enable_tls}" = "x1" ; then
#define JEMALLOC_TLS
_ACEOF
elif test "x${force_tls}" = "x1" ; then
as_fn_error $? "Failed to configure TLS, which is mandatory for correct function" "$LINENO" 5
fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a program using __builtin_ffsl is compilable" >&5
+$as_echo_n "checking whether a program using __builtin_ffsl is compilable... " >&6; }
+if ${je_cv_gcc_builtin_ffsl+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+#include <stdio.h>
+#include <strings.h>
+#include <string.h>
+
+int
+main ()
+{
+
+ {
+ int rv = __builtin_ffsl(0x08);
+ printf("%d\n", rv);
+ }
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ je_cv_gcc_builtin_ffsl=yes
+else
+ je_cv_gcc_builtin_ffsl=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $je_cv_gcc_builtin_ffsl" >&5
+$as_echo "$je_cv_gcc_builtin_ffsl" >&6; }
+
+if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
+ $as_echo "#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl" >>confdefs.h
+
+ $as_echo "#define JEMALLOC_INTERNAL_FFS __builtin_ffs" >>confdefs.h
+
+else
+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a program using ffsl is compilable" >&5
$as_echo_n "checking whether a program using ffsl is compilable... " >&6; }
if ${je_cv_function_ffsl+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
-#include <stdio.h>
-#include <strings.h>
-#include <string.h>
+ #include <stdio.h>
+ #include <strings.h>
+ #include <string.h>
int
main ()
{
{
int rv = ffsl(0x08);
printf("%d\n", rv);
@@ -6940,71 +6983,29 @@ else
je_cv_function_ffsl=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $je_cv_function_ffsl" >&5
$as_echo "$je_cv_function_ffsl" >&6; }
-if test "x${je_cv_function_ffsl}" == "xyes" ; then
- $as_echo "#define JEMALLOC_INTERNAL_FFSL ffsl" >>confdefs.h
-
- $as_echo "#define JEMALLOC_INTERNAL_FFS ffs" >>confdefs.h
-
-else
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether a program using __builtin_ffsl is compilable" >&5
-$as_echo_n "checking whether a program using __builtin_ffsl is compilable... " >&6; }
-if ${je_cv_gcc_builtin_ffsl+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-
- #include <stdio.h>
- #include <strings.h>
- #include <string.h>
-
-int
-main ()
-{
-
- {
- int rv = __builtin_ffsl(0x08);
- printf("%d\n", rv);
- }
-
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- je_cv_gcc_builtin_ffsl=yes
-else
- je_cv_gcc_builtin_ffsl=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $je_cv_gcc_builtin_ffsl" >&5
-$as_echo "$je_cv_gcc_builtin_ffsl" >&6; }
-
- if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
- $as_echo "#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl" >>confdefs.h
-
- $as_echo "#define JEMALLOC_INTERNAL_FFS __builtin_ffs" >>confdefs.h
+ if test "x${je_cv_function_ffsl}" == "xyes" ; then
+ $as_echo "#define JEMALLOC_INTERNAL_FFSL ffsl" >>confdefs.h
+
+ $as_echo "#define JEMALLOC_INTERNAL_FFS ffs" >>confdefs.h
else
as_fn_error $? "Cannot build without ffsl(3) or __builtin_ffsl()" "$LINENO" 5
fi
fi
+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether atomic(9) is compilable" >&5
$as_echo_n "checking whether atomic(9) is compilable... " >&6; }
if ${je_cv_atomic9+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
diff --git a/configure.ac b/configure.ac
--- a/configure.ac
+++ b/configure.ac
@@ -1160,53 +1160,54 @@ fi
AC_SUBST([enable_tls])
if test "x${enable_tls}" = "x1" ; then
AC_DEFINE_UNQUOTED([JEMALLOC_TLS], [ ])
elif test "x${force_tls}" = "x1" ; then
AC_MSG_ERROR([Failed to configure TLS, which is mandatory for correct function])
fi
dnl ============================================================================
-dnl Check for ffsl(3), then __builtin_ffsl(), and fail if neither are found.
+dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found.
dnl One of those two functions should (theoretically) exist on all platforms
dnl that jemalloc currently has a chance of functioning on without modification.
dnl We additionally assume ffs() or __builtin_ffs() are defined if
dnl ffsl() or __builtin_ffsl() are defined, respectively.
-JE_COMPILABLE([a program using ffsl], [
+JE_COMPILABLE([a program using __builtin_ffsl], [
#include <stdio.h>
#include <strings.h>
#include <string.h>
], [
{
- int rv = ffsl(0x08);
+ int rv = __builtin_ffsl(0x08);
printf("%d\n", rv);
}
-], [je_cv_function_ffsl])
-if test "x${je_cv_function_ffsl}" == "xyes" ; then
- AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl])
- AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs])
+], [je_cv_gcc_builtin_ffsl])
+if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
+ AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl])
+ AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs])
else
- JE_COMPILABLE([a program using __builtin_ffsl], [
+ JE_COMPILABLE([a program using ffsl], [
#include <stdio.h>
#include <strings.h>
#include <string.h>
], [
{
- int rv = __builtin_ffsl(0x08);
+ int rv = ffsl(0x08);
printf("%d\n", rv);
}
- ], [je_cv_gcc_builtin_ffsl])
- if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
- AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl])
- AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs])
+ ], [je_cv_function_ffsl])
+ if test "x${je_cv_function_ffsl}" == "xyes" ; then
+ AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl])
+ AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs])
else
AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()])
fi
fi
+
dnl ============================================================================
dnl Check for atomic(9) operations as provided on FreeBSD.
JE_COMPILABLE([atomic(9)], [
#include <sys/types.h>
#include <machine/atomic.h>
#include <inttypes.h>
], [

View File

@@ -1,93 +0,0 @@
diff --git a/src/prof.c b/src/prof.c
--- a/src/prof.c
+++ b/src/prof.c
@@ -1057,25 +1057,25 @@ label_open_close_error:
prof_dump_ctx_cleanup(ctx.p, &ctx_ql);
malloc_mutex_unlock(&prof_dump_mtx);
return (true);
}
#define DUMP_FILENAME_BUFSIZE (PATH_MAX + 1)
#define VSEQ_INVALID UINT64_C(0xffffffffffffffff)
static void
-prof_dump_filename(char *filename, char v, int64_t vseq)
+prof_dump_filename(char *filename, char v, uint64_t vseq)
{
cassert(config_prof);
if (vseq != VSEQ_INVALID) {
/* "<prefix>.<pid>.<seq>.v<vseq>.heap" */
malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE,
- "%s.%d.%"PRIu64".%c%"PRId64".heap",
+ "%s.%d.%"PRIu64".%c%"PRIu64".heap",
opt_prof_prefix, (int)getpid(), prof_dump_seq, v, vseq);
} else {
/* "<prefix>.<pid>.<seq>.<v>.heap" */
malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE,
"%s.%d.%"PRIu64".%c.heap",
opt_prof_prefix, (int)getpid(), prof_dump_seq, v);
}
prof_dump_seq++;
diff --git a/src/util.c b/src/util.c
--- a/src/util.c
+++ b/src/util.c
@@ -95,17 +95,17 @@ buferror(int err, char *buf, size_t bufl
return (strerror_r(err, buf, buflen));
#endif
}
uintmax_t
malloc_strtoumax(const char *restrict nptr, char **restrict endptr, int base)
{
uintmax_t ret, digit;
- int b;
+ unsigned b;
bool neg;
const char *p, *ns;
p = nptr;
if (base < 0 || base == 1 || base > 36) {
ns = p;
set_errno(EINVAL);
ret = UINTMAX_MAX;
@@ -376,17 +376,19 @@ malloc_vsnprintf(char *str, size_t size,
val = va_arg(ap, ssize_t); \
break; \
case 'z' | 0x80: \
val = va_arg(ap, size_t); \
break; \
case 'p': /* Synthetic; used for %p. */ \
val = va_arg(ap, uintptr_t); \
break; \
- default: not_reached(); \
+ default: \
+ not_reached(); \
+ val = 0; \
} \
} while (0)
i = 0;
f = format;
while (true) {
switch (*f) {
case '\0': goto label_out;
@@ -543,17 +545,17 @@ malloc_vsnprintf(char *str, size_t size,
buf[1] = '\0';
APPEND_PADDED_S(buf, 1, width, left_justify);
f++;
break;
} case 's':
assert(len == '?' || len == 'l');
assert_not_implemented(len != 'l');
s = va_arg(ap, char *);
- slen = (prec < 0) ? strlen(s) : prec;
+ slen = (prec < 0) ? strlen(s) : (size_t)prec;
APPEND_PADDED_S(s, slen, width, left_justify);
f++;
break;
case 'p': {
uintmax_t val;
char buf[X2S_BUFSIZE];
GET_ARG_NUMERIC(val, 'p');
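
The val = 0 added to GET_ARG_NUMERIC's default case is presumably there for compilers that cannot prove not_reached() never returns and would otherwise warn that val may be used uninitialized. A reduced, hypothetical illustration of the pattern:

#include <stdio.h>

/* Stand-in for not_reached(); without a noreturn attribute the compiler
 * must assume execution can continue past the call. */
static void
not_reached(void)
{
	fprintf(stderr, "unreachable\n");
}

static int
pick(int tag)
{
	int val;

	switch (tag) {
	case 0:
		val = 1;
		break;
	default:
		not_reached();
		val = 0; /* dead by contract; keeps val initialized on the
		            impossible fall-through */
	}
	return val;
}

int
main(void)
{
	return pick(0) - 1;
}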

View File

@@ -1,59 +0,0 @@
diff --git a/src/zone.c b/src/zone.c
index e0302ef..a722287 100644
--- a/src/zone.c
+++ b/src/zone.c
@@ -176,6 +176,7 @@ register_zone(void)
* register jemalloc's.
*/
malloc_zone_t *default_zone = malloc_default_zone();
+ malloc_zone_t *purgeable_zone = NULL;
if (!default_zone->zone_name ||
strcmp(default_zone->zone_name, "DefaultMallocZone") != 0) {
return;
@@ -237,22 +238,37 @@ register_zone(void)
* run time.
*/
if (malloc_default_purgeable_zone != NULL)
- malloc_default_purgeable_zone();
+ purgeable_zone = malloc_default_purgeable_zone();
/* Register the custom zone. At this point it won't be the default. */
malloc_zone_register(&zone);
- /*
- * Unregister and reregister the default zone. On OSX >= 10.6,
- * unregistering takes the last registered zone and places it at the
- * location of the specified zone. Unregistering the default zone thus
- * makes the last registered one the default. On OSX < 10.6,
- * unregistering shifts all registered zones. The first registered zone
- * then becomes the default.
- */
do {
default_zone = malloc_default_zone();
+ /*
+ * Unregister and reregister the default zone. On OSX >= 10.6,
+ * unregistering takes the last registered zone and places it
+ * at the location of the specified zone. Unregistering the
+ * default zone thus makes the last registered one the default.
+ * On OSX < 10.6, unregistering shifts all registered zones.
+ * The first registered zone then becomes the default.
+ */
malloc_zone_unregister(default_zone);
malloc_zone_register(default_zone);
+ /*
+ * On OSX 10.6, having the default purgeable zone appear before
+ * the default zone makes some things crash because it thinks it
+ * owns the default zone allocated pointers. We thus unregister/
+ * re-register it in order to ensure it's always after the
+ * default zone. On OSX < 10.6, there is no purgeable zone, so
+ * this does nothing. On OSX >= 10.6, unregistering replaces the
+ * purgeable zone with the last registered zone above, i.e the
+ * default zone. Registering it again then puts it at the end,
+ * obviously after the default zone.
+ */
+ if (purgeable_zone) {
+ malloc_zone_unregister(purgeable_zone);
+ malloc_zone_register(purgeable_zone);
+ }
} while (malloc_default_zone() != &zone);
}

View File

@@ -1,22 +0,0 @@
diff --git a/include/msvc_compat/C99/stdbool.h b/include/msvc_compat/C99/stdbool.h
--- a/include/msvc_compat/C99/stdbool.h
+++ b/include/msvc_compat/C99/stdbool.h
@@ -1,16 +1,18 @@
#ifndef stdbool_h
#define stdbool_h
#include <wtypes.h>
/* MSVC doesn't define _Bool or bool in C, but does have BOOL */
/* Note this doesn't pass autoconf's test because (bool) 0.5 != true */
+#ifndef __clang__
typedef BOOL _Bool;
+#endif
#define bool _Bool
#define true 1
#define false 0
#define __bool_true_false_are_defined 1
#endif /* stdbool_h */

View File

@@ -1,225 +0,0 @@
diff --git a/Makefile.in b/Makefile.in
--- a/Makefile.in
+++ b/Makefile.in
@@ -37,19 +37,19 @@ EXE := @exe@
LIBPREFIX := @libprefix@
REV := @rev@
install_suffix := @install_suffix@
ABI := @abi@
XSLTPROC := @XSLTPROC@
AUTOCONF := @AUTOCONF@
_RPATH = @RPATH@
RPATH = $(if $(1),$(call _RPATH,$(1)))
-cfghdrs_in := @cfghdrs_in@
+cfghdrs_in := $(addprefix $(srcroot),@cfghdrs_in@)
cfghdrs_out := @cfghdrs_out@
-cfgoutputs_in := @cfgoutputs_in@
+cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@)
cfgoutputs_out := @cfgoutputs_out@
enable_autogen := @enable_autogen@
enable_code_coverage := @enable_code_coverage@
enable_experimental := @enable_experimental@
enable_zone_allocator := @enable_zone_allocator@
DSO_LDFLAGS = @DSO_LDFLAGS@
SOREV = @SOREV@
PIC_CFLAGS = @PIC_CFLAGS@
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -5654,25 +5654,25 @@ else
fi
install_suffix="$INSTALL_SUFFIX"
je_="je_"
-cfgoutputs_in="${srcroot}Makefile.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/html.xsl.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/manpages.xsl.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/jemalloc.xml.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc_macros.h.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc_protos.h.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal.h.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/test.sh.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/include/test/jemalloc_test.h.in"
+cfgoutputs_in="Makefile.in"
+cfgoutputs_in="${cfgoutputs_in} doc/html.xsl.in"
+cfgoutputs_in="${cfgoutputs_in} doc/manpages.xsl.in"
+cfgoutputs_in="${cfgoutputs_in} doc/jemalloc.xml.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_macros.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_protos.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/internal/jemalloc_internal.h.in"
+cfgoutputs_in="${cfgoutputs_in} test/test.sh.in"
+cfgoutputs_in="${cfgoutputs_in} test/include/test/jemalloc_test.h.in"
cfgoutputs_out="Makefile"
cfgoutputs_out="${cfgoutputs_out} doc/html.xsl"
cfgoutputs_out="${cfgoutputs_out} doc/manpages.xsl"
cfgoutputs_out="${cfgoutputs_out} doc/jemalloc.xml"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_macros.h"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_protos.h"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_internal.h"
@@ -5684,28 +5684,28 @@ cfgoutputs_tup="${cfgoutputs_tup} doc/ht
cfgoutputs_tup="${cfgoutputs_tup} doc/manpages.xsl:doc/manpages.xsl.in"
cfgoutputs_tup="${cfgoutputs_tup} doc/jemalloc.xml:doc/jemalloc.xml.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_macros.h:include/jemalloc/jemalloc_macros.h.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_protos.h:include/jemalloc/jemalloc_protos.h.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h"
cfgoutputs_tup="${cfgoutputs_tup} test/test.sh:test/test.sh.in"
cfgoutputs_tup="${cfgoutputs_tup} test/include/test/jemalloc_test.h:test/include/test/jemalloc_test.h.in"
-cfghdrs_in="${srcroot}include/jemalloc/jemalloc_defs.h.in"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal_defs.h.in"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_namespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_unnamespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_symbols.txt"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/public_namespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/public_unnamespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/size_classes.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc_rename.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc_mangle.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}test/include/test/jemalloc_test_defs.h.in"
+cfghdrs_in="include/jemalloc/jemalloc_defs.h.in"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/jemalloc_internal_defs.h.in"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_namespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_unnamespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_symbols.txt"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_namespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_unnamespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/size_classes.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_rename.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_mangle.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc.sh"
+cfghdrs_in="${cfghdrs_in} test/include/test/jemalloc_test_defs.h.in"
cfghdrs_out="include/jemalloc/jemalloc_defs.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc${install_suffix}.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_namespace.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_unnamespace.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_symbols.txt"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_namespace.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_unnamespace.h"
@@ -5713,18 +5713,18 @@ cfghdrs_out="${cfghdrs_out} include/jema
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_protos_jet.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_rename.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle_jet.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/jemalloc_internal_defs.h"
cfghdrs_out="${cfghdrs_out} test/include/test/jemalloc_test_defs.h"
cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.in"
-cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:${srcroot}include/jemalloc/internal/jemalloc_internal_defs.h.in"
-cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:${srcroot}test/include/test/jemalloc_test_defs.h.in"
+cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:include/jemalloc/internal/jemalloc_internal_defs.h.in"
+cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:test/include/test/jemalloc_test_defs.h.in"
# Check whether --enable-cc-silence was given.
if test "${enable_cc_silence+set}" = set; then :
enableval=$enable_cc_silence; if test "x$enable_cc_silence" = "xno" ; then
enable_cc_silence="0"
else
enable_cc_silence="1"
fi
diff --git a/configure.ac b/configure.ac
--- a/configure.ac
+++ b/configure.ac
@@ -546,25 +546,25 @@ AC_ARG_WITH([install_suffix],
install_suffix="$INSTALL_SUFFIX"
AC_SUBST([install_suffix])
dnl Substitute @je_@ in jemalloc_protos.h.in, primarily to make generation of
dnl jemalloc_protos_jet.h easy.
je_="je_"
AC_SUBST([je_])
-cfgoutputs_in="${srcroot}Makefile.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/html.xsl.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/manpages.xsl.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/jemalloc.xml.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc_macros.h.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc_protos.h.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal.h.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/test.sh.in"
-cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/include/test/jemalloc_test.h.in"
+cfgoutputs_in="Makefile.in"
+cfgoutputs_in="${cfgoutputs_in} doc/html.xsl.in"
+cfgoutputs_in="${cfgoutputs_in} doc/manpages.xsl.in"
+cfgoutputs_in="${cfgoutputs_in} doc/jemalloc.xml.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_macros.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_protos.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/internal/jemalloc_internal.h.in"
+cfgoutputs_in="${cfgoutputs_in} test/test.sh.in"
+cfgoutputs_in="${cfgoutputs_in} test/include/test/jemalloc_test.h.in"
cfgoutputs_out="Makefile"
cfgoutputs_out="${cfgoutputs_out} doc/html.xsl"
cfgoutputs_out="${cfgoutputs_out} doc/manpages.xsl"
cfgoutputs_out="${cfgoutputs_out} doc/jemalloc.xml"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_macros.h"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_protos.h"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_internal.h"
@@ -576,28 +576,28 @@ cfgoutputs_tup="${cfgoutputs_tup} doc/ht
cfgoutputs_tup="${cfgoutputs_tup} doc/manpages.xsl:doc/manpages.xsl.in"
cfgoutputs_tup="${cfgoutputs_tup} doc/jemalloc.xml:doc/jemalloc.xml.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_macros.h:include/jemalloc/jemalloc_macros.h.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_protos.h:include/jemalloc/jemalloc_protos.h.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h"
cfgoutputs_tup="${cfgoutputs_tup} test/test.sh:test/test.sh.in"
cfgoutputs_tup="${cfgoutputs_tup} test/include/test/jemalloc_test.h:test/include/test/jemalloc_test.h.in"
-cfghdrs_in="${srcroot}include/jemalloc/jemalloc_defs.h.in"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal_defs.h.in"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_namespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_unnamespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_symbols.txt"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/public_namespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/public_unnamespace.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/size_classes.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc_rename.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc_mangle.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc.sh"
-cfghdrs_in="${cfghdrs_in} ${srcroot}test/include/test/jemalloc_test_defs.h.in"
+cfghdrs_in="include/jemalloc/jemalloc_defs.h.in"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/jemalloc_internal_defs.h.in"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_namespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_unnamespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_symbols.txt"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_namespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_unnamespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/size_classes.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_rename.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_mangle.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc.sh"
+cfghdrs_in="${cfghdrs_in} test/include/test/jemalloc_test_defs.h.in"
cfghdrs_out="include/jemalloc/jemalloc_defs.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc${install_suffix}.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_namespace.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_unnamespace.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_symbols.txt"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_namespace.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_unnamespace.h"
@@ -605,18 +605,18 @@ cfghdrs_out="${cfghdrs_out} include/jema
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_protos_jet.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_rename.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle_jet.h"
cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/jemalloc_internal_defs.h"
cfghdrs_out="${cfghdrs_out} test/include/test/jemalloc_test_defs.h"
cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.in"
-cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:${srcroot}include/jemalloc/internal/jemalloc_internal_defs.h.in"
-cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:${srcroot}test/include/test/jemalloc_test_defs.h.in"
+cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:include/jemalloc/internal/jemalloc_internal_defs.h.in"
+cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:test/include/test/jemalloc_test_defs.h.in"
dnl Do not silence irrelevant compiler warnings by default, since enabling this
dnl option incurs a performance penalty.
AC_ARG_ENABLE([cc-silence],
[AS_HELP_STRING([--enable-cc-silence],
[Silence irrelevant compiler warnings])],
[if test "x$enable_cc_silence" = "xno" ; then
enable_cc_silence="0"

View File

@@ -27,6 +27,8 @@ SOURCES += [
'src/src/tcache.c',
'src/src/tsd.c',
'src/src/util.c',
+# FIXME do we ever want valgrind.c?
+# 'src/src/valgrind.c',
]
# Only OSX needs the zone allocation implementation,

View File

@@ -1,10 +1,23 @@
-Building and installing jemalloc can be as simple as typing the following while
-in the root directory of the source tree:
+Building and installing a packaged release of jemalloc can be as simple as
+typing the following while in the root directory of the source tree:
./configure
make
make install
+If building from unpackaged developer sources, the simplest command sequence
+that might work is:
+./autogen.sh
+make dist
+make
+make install
+Note that documentation is not built by the default target because doing so
+would create a dependency on xsltproc in packaged releases, hence the
+requirement to either run 'make dist' or avoid installing docs via the various
+install_* targets documented below.
=== Advanced configuration =====================================================
The 'configure' script supports numerous options that allow control of which
@@ -56,7 +69,7 @@ any of the following arguments (not a definitive list) to 'configure':
replace the "malloc", "calloc", etc. symbols.
--without-export
Don't export public APIs. This can be useful when building jemalloc as a
static library, or to avoid exporting public APIs when using the zone
allocator on OSX.
@@ -71,10 +84,10 @@ any of the following arguments (not a definitive list) to 'configure':
versions of jemalloc can coexist in the same installation directory. For
example, libjemalloc.so.0 becomes libjemalloc<suffix>.so.0.
--enable-cc-silence
Enable code that silences non-useful compiler warnings. This is helpful
when trying to tell serious warnings from those due to compiler
limitations, but it potentially incurs a performance penalty.
--disable-cc-silence
Disable code that silences non-useful compiler warnings. This is mainly
useful during development when auditing the set of warnings that are being
silenced.
--enable-debug
Enable assertions and validation code. This incurs a substantial
@@ -96,7 +109,7 @@ any of the following arguments (not a definitive list) to 'configure':
--enable-ivsalloc
Enable validation code, which verifies that pointers reside within
jemalloc-owned chunks before dereferencing them. This incurs a substantial
performance hit.
--disable-stats
@@ -132,12 +145,6 @@ any of the following arguments (not a definitive list) to 'configure':
released in bulk, thus reducing the total number of mutex operations. See
the "opt.tcache" option for usage details.
---enable-mremap
-Enable huge realloc() via mremap(2). mremap() is disabled by default
-because the flavor used is specific to Linux, which has a quirk in its
-virtual memory allocation algorithm that causes semi-permanent VM map holes
-under normal jemalloc operation.
--disable-munmap
Disable virtual memory deallocation via munmap(2); instead keep track of
the virtual memory for later use. munmap() is disabled by default (i.e.
@@ -145,10 +152,6 @@ any of the following arguments (not a definitive list) to 'configure':
memory allocation algorithm that causes semi-permanent VM map holes under
normal jemalloc operation.
---enable-dss
-Enable support for page allocation/deallocation via sbrk(2), in addition to
-mmap(2).
--disable-fill
Disable support for junk/zero filling of memory, quarantine, and redzones.
See the "opt.junk", "opt.zero", "opt.quarantine", and "opt.redzone" option
@@ -157,11 +160,8 @@ any of the following arguments (not a definitive list) to 'configure':
--disable-valgrind
Disable support for Valgrind.
---disable-experimental
-Disable support for the experimental API (*allocm()).
--disable-zone-allocator
Disable zone allocator for Darwin. This means jemalloc won't be hooked as
the default allocator on OSX/iOS.
--enable-utrace
@@ -189,6 +189,93 @@ any of the following arguments (not a definitive list) to 'configure':
Specify where to find DocBook XSL stylesheets when building the
documentation.
--with-lg-page=<lg-page>
Specify the base 2 log of the system page size. This option is only useful
when cross compiling, since the configure script automatically determines
the host's page size by default.
--with-lg-page-sizes=<lg-page-sizes>
Specify the comma-separated base 2 logs of the page sizes to support. This
option may be useful when cross-compiling in combination with
--with-lg-page, but its primary use case is for integration with FreeBSD's
libc, wherein jemalloc is embedded.
--with-lg-size-class-group=<lg-size-class-group>
Specify the base 2 log of how many size classes to use for each doubling in
size. By default jemalloc uses <lg-size-class-group>=2, which results in
e.g. the following size classes:
[...], 64,
80, 96, 112, 128,
160, [...]
<lg-size-class-group>=3 results in e.g. the following size classes:
[...], 64,
72, 80, 88, 96, 104, 112, 120, 128,
144, [...]
The minimal <lg-size-class-group>=0 causes jemalloc to only provide size
classes that are powers of 2:
[...],
64,
128,
256,
[...]
An implementation detail currently limits the total number of small size
classes to 255, and a compilation error will result if the
<lg-size-class-group> you specify cannot be supported. The limit is
roughly <lg-size-class-group>=4, depending on page size.
--with-lg-quantum=<lg-quantum>
Specify the base 2 log of the minimum allocation alignment. jemalloc needs
to know the minimum alignment that meets the following C standard
requirement (quoted from the April 12, 2011 draft of the C11 standard):
The pointer returned if the allocation succeeds is suitably aligned so
that it may be assigned to a pointer to any type of object with a
fundamental alignment requirement and then used to access such an object
or an array of such objects in the space allocated [...]
This setting is architecture-specific, and although jemalloc includes known
safe values for the most commonly used modern architectures, there is a
wrinkle related to GNU libc (glibc) that may impact your choice of
<lg-quantum>. On most modern architectures, this mandates 16-byte alignment
(<lg-quantum>=4), but the glibc developers chose not to meet this
requirement for performance reasons. An old discussion can be found at
https://sourceware.org/bugzilla/show_bug.cgi?id=206 . Unlike glibc,
jemalloc does follow the C standard by default (caveat: jemalloc
technically cheats if --with-lg-tiny-min is smaller than
--with-lg-quantum), but the fact that Linux systems already work around
this allocator noncompliance means that it is generally safe in practice to
let jemalloc's minimum alignment follow glibc's lead. If you specify
--with-lg-quantum=3 during configuration, jemalloc will provide additional
size classes that are not 16-byte-aligned (24, 40, and 56, assuming
--with-lg-size-class-group=2). The sketch after this option list shows how to
observe the resulting size classes and alignment at run time.
--with-lg-tiny-min=<lg-tiny-min>
Specify the base 2 log of the minimum tiny size class to support. Tiny
size classes are powers of 2 less than the quantum, and are only
incorporated if <lg-tiny-min> is less than <lg-quantum> (see
--with-lg-quantum). Tiny size classes technically violate the C standard
requirement for minimum alignment, and crashes could conceivably result if
the compiler were to generate instructions that made alignment assumptions,
both because illegal instruction traps could result, and because accesses
could straddle page boundaries and cause segmentation faults due to
accessing unmapped addresses.
The default of <lg-tiny-min>=3 works well in practice even on architectures
that technically require 16-byte alignment, probably for the same reason
--with-lg-quantum=3 works. Smaller tiny size classes can, and will, cause
crashes (see https://bugzilla.mozilla.org/show_bug.cgi?id=691003 for an
example).
This option is rarely useful, and is mainly provided as documentation of a
subtle implementation detail. If you do use this option, specify a
value in [3, ..., <lg-quantum>].
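The sketch below (illustrative, not part of the patch) makes the size-class and
alignment options above concrete. It assumes a stock, unprefixed jemalloc
install; Mozilla's in-tree build prefixes these entry points with je_.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
    	size_t sizes[] = {1, 8, 24, 65, 100, 129};
    	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
    		/* nallocx() reports the size class a request maps to,
    		 * without performing an allocation. */
    		void *p = malloc(sizes[i]);
    		printf("request %zu -> size class %zu, address mod 16: %zu\n",
    		    sizes[i], nallocx(sizes[i], 0),
    		    (size_t)((uintptr_t)p % 16));
    		free(p);
    	}
    	return (0);
    }

With the defaults (<lg-size-class-group>=2, <lg-quantum>=4, <lg-tiny-min>=3)
the reported classes step through 8, 16, 32, 48, 64, 80, 96, 112, 128, 160,
..., and every allocation of at least one quantum is 16-byte aligned.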
The following environment variables (not a definitive list) impact configure's
behavior:

View File

@@ -48,7 +48,7 @@ cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@)
cfgoutputs_out := @cfgoutputs_out@
enable_autogen := @enable_autogen@
enable_code_coverage := @enable_code_coverage@
enable_experimental := @enable_experimental@
enable_valgrind := @enable_valgrind@
enable_zone_allocator := @enable_zone_allocator@
DSO_LDFLAGS = @DSO_LDFLAGS@
SOREV = @SOREV@
@@ -83,6 +83,9 @@ C_SRCS := $(srcroot)src/jemalloc.c $(srcroot)src/arena.c \
$(srcroot)src/mb.c $(srcroot)src/mutex.c $(srcroot)src/prof.c \
$(srcroot)src/quarantine.c $(srcroot)src/rtree.c $(srcroot)src/stats.c \
$(srcroot)src/tcache.c $(srcroot)src/util.c $(srcroot)src/tsd.c
ifeq ($(enable_valgrind), 1)
C_SRCS += $(srcroot)src/valgrind.c
endif
ifeq ($(enable_zone_allocator), 1)
C_SRCS += $(srcroot)src/zone.c
endif
@@ -98,26 +101,36 @@ DSOS := $(objroot)lib/$(LIBJEMALLOC).$(SOREV)
ifneq ($(SOREV),$(SO))
DSOS += $(objroot)lib/$(LIBJEMALLOC).$(SO)
endif
PC := $(objroot)jemalloc.pc
MAN3 := $(objroot)doc/jemalloc$(install_suffix).3
DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml
DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.html)
DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.3)
DOCS := $(DOCS_HTML) $(DOCS_MAN3)
C_TESTLIB_SRCS := $(srcroot)test/src/math.c $(srcroot)test/src/mtx.c \
$(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \
$(srcroot)test/src/thd.c
C_TESTLIB_SRCS := $(srcroot)test/src/btalloc.c $(srcroot)test/src/btalloc_0.c \
$(srcroot)test/src/btalloc_1.c $(srcroot)test/src/math.c \
$(srcroot)test/src/mtx.c $(srcroot)test/src/SFMT.c \
$(srcroot)test/src/test.c $(srcroot)test/src/thd.c \
$(srcroot)test/src/timer.c
C_UTIL_INTEGRATION_SRCS := $(srcroot)src/util.c
TESTS_UNIT := $(srcroot)test/unit/bitmap.c \
TESTS_UNIT := $(srcroot)test/unit/atomic.c \
$(srcroot)test/unit/bitmap.c \
$(srcroot)test/unit/ckh.c \
$(srcroot)test/unit/hash.c \
$(srcroot)test/unit/junk.c \
$(srcroot)test/unit/junk_alloc.c \
$(srcroot)test/unit/junk_free.c \
$(srcroot)test/unit/lg_chunk.c \
$(srcroot)test/unit/mallctl.c \
$(srcroot)test/unit/math.c \
$(srcroot)test/unit/mq.c \
$(srcroot)test/unit/mtx.c \
$(srcroot)test/unit/prof_accum.c \
$(srcroot)test/unit/prof_active.c \
$(srcroot)test/unit/prof_gdump.c \
$(srcroot)test/unit/prof_idump.c \
$(srcroot)test/unit/prof_reset.c \
$(srcroot)test/unit/prof_thread_name.c \
$(srcroot)test/unit/ql.c \
$(srcroot)test/unit/qr.c \
$(srcroot)test/unit/quarantine.c \
@@ -128,23 +141,18 @@ TESTS_UNIT := $(srcroot)test/unit/bitmap.c \
$(srcroot)test/unit/tsd.c \
$(srcroot)test/unit/util.c \
$(srcroot)test/unit/zero.c
TESTS_UNIT_AUX := $(srcroot)test/unit/prof_accum_a.c \
$(srcroot)test/unit/prof_accum_b.c
TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \
$(srcroot)test/integration/allocated.c \
$(srcroot)test/integration/sdallocx.c \
$(srcroot)test/integration/mallocx.c \
$(srcroot)test/integration/mremap.c \
$(srcroot)test/integration/MALLOCX_ARENA.c \
$(srcroot)test/integration/posix_memalign.c \
$(srcroot)test/integration/rallocx.c \
$(srcroot)test/integration/thread_arena.c \
$(srcroot)test/integration/thread_tcache_enabled.c \
$(srcroot)test/integration/xallocx.c
ifeq ($(enable_experimental), 1)
TESTS_INTEGRATION += $(srcroot)test/integration/allocm.c \
$(srcroot)test/integration/MALLOCX_ARENA.c \
$(srcroot)test/integration/rallocm.c
endif
TESTS_STRESS :=
$(srcroot)test/integration/xallocx.c \
$(srcroot)test/integration/chunk.c
TESTS_STRESS := $(srcroot)test/stress/microbench.c
TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_STRESS)
C_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.$(O))
@@ -157,10 +165,9 @@ C_TESTLIB_STRESS_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.stress.$(O))
C_TESTLIB_OBJS := $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(C_TESTLIB_STRESS_OBJS)
TESTS_UNIT_OBJS := $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_UNIT_AUX_OBJS := $(TESTS_UNIT_AUX:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_INTEGRATION_OBJS := $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_STRESS_OBJS := $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_UNIT_AUX_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_STRESS_OBJS)
TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_STRESS_OBJS)
.PHONY: all dist build_doc_html build_doc_man build_doc
.PHONY: install_bin install_include install_lib
@@ -209,18 +216,12 @@ $(C_TESTLIB_STRESS_OBJS): $(objroot)test/src/%.stress.$(O): $(srcroot)test/src/%
$(C_TESTLIB_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST -DJEMALLOC_STRESS_TESTLIB
$(C_TESTLIB_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
$(TESTS_UNIT_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST
$(TESTS_UNIT_AUX_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST
define make-unit-link-dep
$(1): TESTS_UNIT_LINK_OBJS += $(2)
$(1): $(2)
endef
$(foreach test, $(TESTS_UNIT:$(srcroot)test/unit/%.c=$(objroot)test/unit/%$(EXE)), $(eval $(call make-unit-link-dep,$(test),$(filter $(test:%=%_a.$(O)) $(test:%=%_b.$(O)),$(TESTS_UNIT_AUX_OBJS)))))
$(TESTS_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST
$(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST
$(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c
$(TESTS_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
ifneq ($(IMPORTLIB),$(SO))
$(C_OBJS): CPPFLAGS += -DDLLEXPORT
$(C_OBJS) $(C_JET_OBJS): CPPFLAGS += -DDLLEXPORT
endif
ifndef CC_MM
@@ -229,7 +230,7 @@ HEADER_DIRS = $(srcroot)include/jemalloc/internal \
$(objroot)include/jemalloc $(objroot)include/jemalloc/internal
HEADERS = $(wildcard $(foreach dir,$(HEADER_DIRS),$(dir)/*.h))
$(C_OBJS) $(C_PIC_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): $(HEADERS)
$(TESTS_OBJS): $(objroot)test/unit/jemalloc_test.h
$(TESTS_OBJS): $(objroot)test/include/test/jemalloc_test.h
endif
$(C_OBJS) $(C_PIC_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): %.$(O):
@@ -301,7 +302,14 @@ install_lib_static: $(STATIC_LIBS)
install -m 755 $$l $(LIBDIR); \
done
install_lib: install_lib_shared install_lib_static
install_lib_pc: $(PC)
install -d $(LIBDIR)/pkgconfig
@for l in $(PC); do \
echo "install -m 644 $$l $(LIBDIR)/pkgconfig"; \
install -m 644 $$l $(LIBDIR)/pkgconfig; \
done
install_lib: install_lib_shared install_lib_static install_lib_pc
install_doc_html:
install -d $(DATADIR)/doc/jemalloc$(install_suffix)
@@ -400,7 +408,6 @@ clean:
rm -f $(objroot)*.gcov.*
distclean: clean
rm -rf $(objroot)autom4te.cache
rm -f $(objroot)bin/jemalloc.sh
rm -f $(objroot)config.log
rm -f $(objroot)config.status

View File

@@ -1 +1 @@
3.6.0-0-g46c0af68bd248b04df75e4f92d5fb804c3d75340
3.6.0-204-gb4acf7300a4ca3423ca36fe227e9bc2e23f25b9f

View File

@@ -2,11 +2,11 @@
# Copyright (c) 1998-2007, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
@@ -16,7 +16,7 @@
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -223,6 +223,7 @@ Call-graph Options:
--edgefraction=<f> Hide edges below <f>*total [default=.001]
--maxdegree=<n> Max incoming/outgoing edges per node [default=8]
--focus=<regexp> Focus on nodes matching <regexp>
--thread=<n> Show profile for thread <n>
--ignore=<regexp> Ignore nodes matching <regexp>
--scale=<n> Set GV scaling [default=0]
--heapcheck Make nodes with non-0 object counts
@@ -332,6 +333,7 @@ sub Init() {
$main::opt_edgefraction = 0.001;
$main::opt_maxdegree = 8;
$main::opt_focus = '';
$main::opt_thread = undef;
$main::opt_ignore = '';
$main::opt_scale = 0;
$main::opt_heapcheck = 0;
@@ -402,6 +404,7 @@ sub Init() {
"edgefraction=f" => \$main::opt_edgefraction,
"maxdegree=i" => \$main::opt_maxdegree,
"focus=s" => \$main::opt_focus,
"thread=s" => \$main::opt_thread,
"ignore=s" => \$main::opt_ignore,
"scale=i" => \$main::opt_scale,
"heapcheck" => \$main::opt_heapcheck,
@@ -562,6 +565,86 @@ sub Init() {
}
}
sub FilterAndPrint {
my ($profile, $symbols, $libs, $thread) = @_;
# Get total data in profile
my $total = TotalProfile($profile);
# Remove uninteresting stack items
$profile = RemoveUninterestingFrames($symbols, $profile);
# Focus?
if ($main::opt_focus ne '') {
$profile = FocusProfile($symbols, $profile, $main::opt_focus);
}
# Ignore?
if ($main::opt_ignore ne '') {
$profile = IgnoreProfile($symbols, $profile, $main::opt_ignore);
}
my $calls = ExtractCalls($symbols, $profile);
# Reduce profiles to required output granularity, and also clean
# each stack trace so a given entry exists at most once.
my $reduced = ReduceProfile($symbols, $profile);
# Get derived profiles
my $flat = FlatProfile($reduced);
my $cumulative = CumulativeProfile($reduced);
# Print
if (!$main::opt_interactive) {
if ($main::opt_disasm) {
PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm);
} elsif ($main::opt_list) {
PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0);
} elsif ($main::opt_text) {
# Make sure the output is empty when we have nothing to report
# (only matters when --heapcheck is given but we must be
# compatible with old branches that did not pass --heapcheck always):
if ($total != 0) {
printf("Total%s: %s %s\n",
(defined($thread) ? " (t$thread)" : ""),
Unparse($total), Units());
}
PrintText($symbols, $flat, $cumulative, -1);
} elsif ($main::opt_raw) {
PrintSymbolizedProfile($symbols, $profile, $main::prog);
} elsif ($main::opt_callgrind) {
PrintCallgrind($calls);
} else {
if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
if ($main::opt_gv) {
RunGV(TempName($main::next_tmpfile, "ps"), "");
} elsif ($main::opt_evince) {
RunEvince(TempName($main::next_tmpfile, "pdf"), "");
} elsif ($main::opt_web) {
my $tmp = TempName($main::next_tmpfile, "svg");
RunWeb($tmp);
# The command we run might hand the file name off
# to an already running browser instance and then exit.
# Normally, we'd remove $tmp on exit (right now),
# but fork a child to remove $tmp a little later, so that the
# browser has time to load it first.
delete $main::tempnames{$tmp};
if (fork() == 0) {
sleep 5;
unlink($tmp);
exit(0);
}
}
} else {
cleanup();
exit(1);
}
}
} else {
InteractiveMode($profile, $symbols, $libs, $total);
}
}
sub Main() {
Init();
$main::collected_profile = undef;
@@ -605,9 +688,6 @@ sub Main() {
$symbol_map = MergeSymbols($symbol_map, $base->{symbols});
}
# Get total data in profile
my $total = TotalProfile($profile);
# Collect symbols
my $symbols;
if ($main::use_symbolized_profile) {
@@ -622,75 +702,17 @@ sub Main() {
$symbols = ExtractSymbols($libs, $pcs);
}
# Remove uninteresting stack items
$profile = RemoveUninterestingFrames($symbols, $profile);
# Focus?
if ($main::opt_focus ne '') {
$profile = FocusProfile($symbols, $profile, $main::opt_focus);
if (!defined($main::opt_thread)) {
FilterAndPrint($profile, $symbols, $libs);
}
# Ignore?
if ($main::opt_ignore ne '') {
$profile = IgnoreProfile($symbols, $profile, $main::opt_ignore);
}
my $calls = ExtractCalls($symbols, $profile);
# Reduce profiles to required output granularity, and also clean
# each stack trace so a given entry exists at most once.
my $reduced = ReduceProfile($symbols, $profile);
# Get derived profiles
my $flat = FlatProfile($reduced);
my $cumulative = CumulativeProfile($reduced);
# Print
if (!$main::opt_interactive) {
if ($main::opt_disasm) {
PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm);
} elsif ($main::opt_list) {
PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0);
} elsif ($main::opt_text) {
# Make sure the output is empty when we have nothing to report
# (only matters when --heapcheck is given but we must be
# compatible with old branches that did not pass --heapcheck always):
if ($total != 0) {
printf("Total: %s %s\n", Unparse($total), Units());
}
PrintText($symbols, $flat, $cumulative, -1);
} elsif ($main::opt_raw) {
PrintSymbolizedProfile($symbols, $profile, $main::prog);
} elsif ($main::opt_callgrind) {
PrintCallgrind($calls);
} else {
if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
if ($main::opt_gv) {
RunGV(TempName($main::next_tmpfile, "ps"), "");
} elsif ($main::opt_evince) {
RunEvince(TempName($main::next_tmpfile, "pdf"), "");
} elsif ($main::opt_web) {
my $tmp = TempName($main::next_tmpfile, "svg");
RunWeb($tmp);
# The command we run might hand the file name off
# to an already running browser instance and then exit.
# Normally, we'd remove $tmp on exit (right now),
# but fork a child to remove $tmp a little later, so that the
# browser has time to load it first.
delete $main::tempnames{$tmp};
if (fork() == 0) {
sleep 5;
unlink($tmp);
exit(0);
}
}
} else {
cleanup();
exit(1);
if (defined($data->{threads})) {
foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) {
if (defined($main::opt_thread) &&
($main::opt_thread eq '*' || $main::opt_thread == $thread)) {
my $thread_profile = $data->{threads}{$thread};
FilterAndPrint($thread_profile, $symbols, $libs, $thread);
}
}
} else {
InteractiveMode($profile, $symbols, $libs, $total);
}
cleanup();
@@ -1683,23 +1705,23 @@ sub PrintSource {
HtmlPrintNumber($c2),
UnparseAddress($offset, $e->[0]),
CleanDisassembly($e->[3]));
# Append the most specific source line associated with this instruction
if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) };
$dis = HtmlEscape($dis);
my $f = $e->[5];
my $l = $e->[6];
if ($f ne $last_dis_filename) {
$dis .= sprintf("<span class=disasmloc>%s:%d</span>",
$dis .= sprintf("<span class=disasmloc>%s:%d</span>",
HtmlEscape(CleanFileName($f)), $l);
} elsif ($l ne $last_dis_linenum) {
# De-emphasize the unchanged file name portion
$dis .= sprintf("<span class=unimportant>%s</span>" .
"<span class=disasmloc>:%d</span>",
"<span class=disasmloc>:%d</span>",
HtmlEscape(CleanFileName($f)), $l);
} else {
# De-emphasize the entire location
$dis .= sprintf("<span class=unimportant>%s:%d</span>",
$dis .= sprintf("<span class=unimportant>%s:%d</span>",
HtmlEscape(CleanFileName($f)), $l);
}
$last_dis_filename = $f;
@@ -1788,8 +1810,8 @@ sub PrintSource {
if (defined($dis) && $dis ne '') {
$asm = "<span class=\"asm\">" . $dis . "</span>";
}
my $source_class = (($n1 + $n2 > 0)
? "livesrc"
: (($asm ne "") ? "deadsrc" : "nop"));
printf $output (
"<span class=\"line\">%5d</span> " .
@@ -2811,9 +2833,15 @@ sub RemoveUninterestingFrames {
'free',
'memalign',
'posix_memalign',
'aligned_alloc',
'pvalloc',
'valloc',
'realloc',
'mallocx', # jemalloc
'rallocx', # jemalloc
'xallocx', # jemalloc
'dallocx', # jemalloc
'sdallocx', # jemalloc
'tc_calloc',
'tc_cfree',
'tc_malloc',
@@ -2923,6 +2951,10 @@ sub RemoveUninterestingFrames {
if (exists($symbols->{$a})) {
my $func = $symbols->{$a}->[0];
if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
# Throw away the portion of the backtrace seen so far, under the
# assumption that previous frames were for functions internal to the
# allocator.
@path = ();
next;
}
}
@@ -3680,6 +3712,7 @@ sub IsSymbolizedProfileFile {
# $result->{version} Version number of profile file
# $result->{period} Sampling period (in microseconds)
# $result->{profile} Profile object
# $result->{threads} Map of thread IDs to profile objects
# $result->{map} Memory map info from profile
# $result->{pcs} Hash of all PC values seen, key is hex address
sub ReadProfile {
@@ -3728,6 +3761,9 @@ sub ReadProfile {
} elsif ($header =~ m/^heap profile:/) {
$main::profile_type = 'heap';
$result = ReadHeapProfile($prog, *PROFILE, $header);
} elsif ($header =~ m/^heap/) {
$main::profile_type = 'heap';
$result = ReadThreadedHeapProfile($prog, $fname, $header);
} elsif ($header =~ m/^--- *$contention_marker/o) {
$main::profile_type = 'contention';
$result = ReadSynchProfile($prog, *PROFILE);
@@ -3870,11 +3906,7 @@ sub ReadCPUProfile {
return $r;
}
sub ReadHeapProfile {
my $prog = shift;
local *PROFILE = shift;
my $header = shift;
sub HeapProfileIndex {
my $index = 1;
if ($main::opt_inuse_space) {
$index = 1;
@@ -3885,6 +3917,84 @@ sub ReadHeapProfile {
} elsif ($main::opt_alloc_objects) {
$index = 2;
}
return $index;
}
sub ReadMappedLibraries {
my $fh = shift;
my $map = "";
# Read the /proc/self/maps data
while (<$fh>) {
s/\r//g; # turn windows-looking lines into unix-looking lines
$map .= $_;
}
return $map;
}
sub ReadMemoryMap {
my $fh = shift;
my $map = "";
# Read /proc/self/maps data as formatted by DumpAddressMap()
my $buildvar = "";
while (<PROFILE>) {
s/\r//g; # turn windows-looking lines into unix-looking lines
# Parse "build=<dir>" specification if supplied
if (m/^\s*build=(.*)\n/) {
$buildvar = $1;
}
# Expand "$build" variable if available
$_ =~ s/\$build\b/$buildvar/g;
$map .= $_;
}
return $map;
}
sub AdjustSamples {
my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_;
if ($sample_adjustment) {
if ($sampling_algorithm == 2) {
# Remote-heap version 2
# The sampling frequency is the rate of a Poisson process.
# This means that the probability of sampling an allocation of
# size X with sampling rate Y is 1 - exp(-X/Y)
if ($n1 != 0) {
my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
my $scale_factor = 1/(1 - exp(-$ratio));
$n1 *= $scale_factor;
$s1 *= $scale_factor;
}
if ($n2 != 0) {
my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
my $scale_factor = 1/(1 - exp(-$ratio));
$n2 *= $scale_factor;
$s2 *= $scale_factor;
}
} else {
# Remote-heap version 1
my $ratio;
$ratio = (($s1*1.0)/$n1)/($sample_adjustment);
if ($ratio < 1) {
$n1 /= $ratio;
$s1 /= $ratio;
}
$ratio = (($s2*1.0)/$n2)/($sample_adjustment);
if ($ratio < 1) {
$n2 /= $ratio;
$s2 /= $ratio;
}
}
}
return ($n1, $s1, $n2, $s2);
}
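For reference (editor's note, not part of the patch): the version-2 branch
above inverts the Poisson sampling model, where a stack whose allocations
average S bytes is sampled with probability 1 - exp(-S/R) for sampling rate R,
so scaling by the reciprocal estimates the true totals. A standalone C sketch
of the same arithmetic, with hypothetical numbers:

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
    	double rate = 524288.0;			/* hypothetical sampling rate (bytes) */
    	double count = 10.0, bytes = 81920.0;	/* one sampled stack */
    	/* P(sampled) = 1 - exp(-avg/rate); scale both totals by 1/P. */
    	double avg = bytes / count;
    	double scale = 1.0 / (1.0 - exp(-avg / rate));
    	printf("estimated true count %.1f, true bytes %.1f\n",
    	    count * scale, bytes * scale);
    	return (0);
    }

The version-1 branch instead scales linearly whenever the per-stack average
falls below the sampling rate.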
sub ReadHeapProfile {
my $prog = shift;
local *PROFILE = shift;
my $header = shift;
my $index = HeapProfileIndex();
# Find the type of this profile. The header line looks like:
# heap profile: 1246: 8800744 [ 1246: 8800744] @ <heap-url>/266053
@@ -3974,29 +4084,12 @@ sub ReadHeapProfile {
while (<PROFILE>) {
s/\r//g; # turn windows-looking lines into unix-looking lines
if (/^MAPPED_LIBRARIES:/) {
# Read the /proc/self/maps data
while (<PROFILE>) {
s/\r//g; # turn windows-looking lines into unix-looking lines
$map .= $_;
}
$map .= ReadMappedLibraries(*PROFILE);
last;
}
if (/^--- Memory map:/) {
# Read /proc/self/maps data as formatted by DumpAddressMap()
my $buildvar = "";
while (<PROFILE>) {
s/\r//g; # turn windows-looking lines into unix-looking lines
# Parse "build=<dir>" specification if supplied
if (m/^\s*build=(.*)\n/) {
$buildvar = $1;
}
# Expand "$build" variable if available
$_ =~ s/\$build\b/$buildvar/g;
$map .= $_;
}
$map .= ReadMemoryMap(*PROFILE);
last;
}
@@ -4007,42 +4100,8 @@ sub ReadHeapProfile {
if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) {
my $stack = $5;
my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
if ($sample_adjustment) {
if ($sampling_algorithm == 2) {
# Remote-heap version 2
# The sampling frequency is the rate of a Poisson process.
# This means that the probability of sampling an allocation of
# size X with sampling rate Y is 1 - exp(-X/Y)
if ($n1 != 0) {
my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
my $scale_factor = 1/(1 - exp(-$ratio));
$n1 *= $scale_factor;
$s1 *= $scale_factor;
}
if ($n2 != 0) {
my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
my $scale_factor = 1/(1 - exp(-$ratio));
$n2 *= $scale_factor;
$s2 *= $scale_factor;
}
} else {
# Remote-heap version 1
my $ratio;
$ratio = (($s1*1.0)/$n1)/($sample_adjustment);
if ($ratio < 1) {
$n1 /= $ratio;
$s1 /= $ratio;
}
$ratio = (($s2*1.0)/$n2)/($sample_adjustment);
if ($ratio < 1) {
$n2 /= $ratio;
$s2 /= $ratio;
}
}
}
my @counts = ($n1, $s1, $n2, $s2);
my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
$n1, $s1, $n2, $s2);
AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
}
}
@@ -4056,6 +4115,83 @@ sub ReadHeapProfile {
return $r;
}
sub ReadThreadedHeapProfile {
my ($prog, $fname, $header) = @_;
my $index = HeapProfileIndex();
my $sampling_algorithm = 0;
my $sample_adjustment = 0;
chomp($header);
my $type = "unknown";
# Assuming a very specific type of header for now.
if ($header =~ m"^heap_v2/(\d+)") {
$type = "_v2";
$sampling_algorithm = 2;
$sample_adjustment = int($1);
}
if ($type ne "_v2" || !defined($sample_adjustment)) {
die "Threaded heap profiles require v2 sampling with a sample rate\n";
}
my $profile = {};
my $thread_profiles = {};
my $pcs = {};
my $map = "";
my $stack = "";
while (<PROFILE>) {
s/\r//g;
if (/^MAPPED_LIBRARIES:/) {
$map .= ReadMappedLibraries(*PROFILE);
last;
}
if (/^--- Memory map:/) {
$map .= ReadMemoryMap(*PROFILE);
last;
}
# Read entry of the form:
# @ a1 a2 ... an
# t*: <count1>: <bytes1> [<count2>: <bytes2>]
# t1: <count1>: <bytes1> [<count2>: <bytes2>]
# ...
# tn: <count1>: <bytes1> [<count2>: <bytes2>]
s/^\s*//;
s/\s*$//;
if (m/^@\s+(.*)$/) {
$stack = $1;
} elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) {
if ($stack eq "") {
# Still in the header, so this is just a per-thread summary.
next;
}
my $thread = $2;
my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6);
my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
$n1, $s1, $n2, $s2);
if ($thread eq "*") {
AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
} else {
if (!exists($thread_profiles->{$thread})) {
$thread_profiles->{$thread} = {};
}
AddEntries($thread_profiles->{$thread}, $pcs,
FixCallerAddresses($stack), $counts[$index]);
}
}
}
my $r = {};
$r->{version} = "heap";
$r->{period} = 1;
$r->{profile} = $profile;
$r->{threads} = $thread_profiles;
$r->{libs} = ParseLibraries($prog, $map, $pcs);
$r->{pcs} = $pcs;
return $r;
}
sub ReadSynchProfile {
my $prog = shift;
local *PROFILE = shift;
@@ -4747,7 +4883,7 @@ sub MapToSymbols {
}
}
}
# Prepend to accumulated symbols for pcstr
# (so that caller comes before callee)
my $sym = $symbols->{$pcstr};
@@ -4941,7 +5077,7 @@ sub ConfigureTool {
my $dirname = $`; # this is everything up to and including the last slash
if (-x "$dirname$tool") {
$path = "$dirname$tool";
} else {
$path = $tool;
}
}

View File

@@ -1,8 +1,8 @@
#! /bin/sh
# Attempt to guess a canonical system name.
# Copyright 1992-2013 Free Software Foundation, Inc.
# Copyright 1992-2014 Free Software Foundation, Inc.
timestamp='2013-06-10'
timestamp='2014-03-23'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
@@ -50,7 +50,7 @@ version="\
GNU config.guess ($timestamp)
Originally written by Per Bothner.
Copyright 1992-2013 Free Software Foundation, Inc.
Copyright 1992-2014 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -149,7 +149,7 @@ Linux|GNU|GNU/*)
LIBC=gnu
#endif
EOF
eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
;;
esac
@@ -826,7 +826,7 @@ EOF
*:MINGW*:*)
echo ${UNAME_MACHINE}-pc-mingw32
exit ;;
i*:MSYS*:*)
*:MSYS*:*)
echo ${UNAME_MACHINE}-pc-msys
exit ;;
i*:windows32*:*)
@@ -969,10 +969,10 @@ EOF
eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
;;
or1k:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
openrisc*:Linux:*:*)
echo or1k-unknown-linux-${LIBC}
exit ;;
or32:Linux:*:*)
or32:Linux:*:* | or1k*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
exit ;;
padre:Linux:*:*)
@@ -1260,16 +1260,26 @@ EOF
if test "$UNAME_PROCESSOR" = unknown ; then
UNAME_PROCESSOR=powerpc
fi
if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
grep IS_64BIT_ARCH >/dev/null
then
case $UNAME_PROCESSOR in
i386) UNAME_PROCESSOR=x86_64 ;;
powerpc) UNAME_PROCESSOR=powerpc64 ;;
esac
if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
grep IS_64BIT_ARCH >/dev/null
then
case $UNAME_PROCESSOR in
i386) UNAME_PROCESSOR=x86_64 ;;
powerpc) UNAME_PROCESSOR=powerpc64 ;;
esac
fi
fi
elif test "$UNAME_PROCESSOR" = i386 ; then
# Avoid executing cc on OS X 10.9, as it ships with a stub
# that puts up a graphical alert prompting to install
# developer tools. Any system running Mac OS X 10.7 or
# later (Darwin 11 and later) is required to have a 64-bit
# processor. This is not true of the ARM version of Darwin
# that Apple uses in portable devices.
UNAME_PROCESSOR=x86_64
fi
echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
exit ;;
@@ -1361,154 +1371,6 @@ EOF
exit ;;
esac
eval $set_cc_for_build
cat >$dummy.c <<EOF
#ifdef _SEQUENT_
# include <sys/types.h>
# include <sys/utsname.h>
#endif
main ()
{
#if defined (sony)
#if defined (MIPSEB)
/* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed,
I don't know.... */
printf ("mips-sony-bsd\n"); exit (0);
#else
#include <sys/param.h>
printf ("m68k-sony-newsos%s\n",
#ifdef NEWSOS4
"4"
#else
""
#endif
); exit (0);
#endif
#endif
#if defined (__arm) && defined (__acorn) && defined (__unix)
printf ("arm-acorn-riscix\n"); exit (0);
#endif
#if defined (hp300) && !defined (hpux)
printf ("m68k-hp-bsd\n"); exit (0);
#endif
#if defined (NeXT)
#if !defined (__ARCHITECTURE__)
#define __ARCHITECTURE__ "m68k"
#endif
int version;
version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
if (version < 4)
printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
else
printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
exit (0);
#endif
#if defined (MULTIMAX) || defined (n16)
#if defined (UMAXV)
printf ("ns32k-encore-sysv\n"); exit (0);
#else
#if defined (CMU)
printf ("ns32k-encore-mach\n"); exit (0);
#else
printf ("ns32k-encore-bsd\n"); exit (0);
#endif
#endif
#endif
#if defined (__386BSD__)
printf ("i386-pc-bsd\n"); exit (0);
#endif
#if defined (sequent)
#if defined (i386)
printf ("i386-sequent-dynix\n"); exit (0);
#endif
#if defined (ns32000)
printf ("ns32k-sequent-dynix\n"); exit (0);
#endif
#endif
#if defined (_SEQUENT_)
struct utsname un;
uname(&un);
if (strncmp(un.version, "V2", 2) == 0) {
printf ("i386-sequent-ptx2\n"); exit (0);
}
if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
printf ("i386-sequent-ptx1\n"); exit (0);
}
printf ("i386-sequent-ptx\n"); exit (0);
#endif
#if defined (vax)
# if !defined (ultrix)
# include <sys/param.h>
# if defined (BSD)
# if BSD == 43
printf ("vax-dec-bsd4.3\n"); exit (0);
# else
# if BSD == 199006
printf ("vax-dec-bsd4.3reno\n"); exit (0);
# else
printf ("vax-dec-bsd\n"); exit (0);
# endif
# endif
# else
printf ("vax-dec-bsd\n"); exit (0);
# endif
# else
printf ("vax-dec-ultrix\n"); exit (0);
# endif
#endif
#if defined (alliant) && defined (i860)
printf ("i860-alliant-bsd\n"); exit (0);
#endif
exit (1);
}
EOF
$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
{ echo "$SYSTEM_NAME"; exit; }
# Apollos put the system type in the environment.
test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
# Convex versions that predate uname can use getsysinfo(1)
if [ -x /usr/convex/getsysinfo ]
then
case `getsysinfo -f cpu_type` in
c1*)
echo c1-convex-bsd
exit ;;
c2*)
if getsysinfo -f scalar_acc
then echo c32-convex-bsd
else echo c2-convex-bsd
fi
exit ;;
c34*)
echo c34-convex-bsd
exit ;;
c38*)
echo c38-convex-bsd
exit ;;
c4*)
echo c4-convex-bsd
exit ;;
esac
fi
cat >&2 <<EOF
$0: unable to guess system type

View File

@@ -1,8 +1,8 @@
#! /bin/sh
# Configuration validation subroutine script.
# Copyright 1992-2013 Free Software Foundation, Inc.
# Copyright 1992-2014 Free Software Foundation, Inc.
timestamp='2013-10-01'
timestamp='2014-05-01'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
@@ -68,7 +68,7 @@ Report bugs and patches to <config-patches@gnu.org>."
version="\
GNU config.sub ($timestamp)
Copyright 1992-2013 Free Software Foundation, Inc.
Copyright 1992-2014 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -283,8 +283,10 @@ case $basic_machine in
| mips64vr5900 | mips64vr5900el \
| mipsisa32 | mipsisa32el \
| mipsisa32r2 | mipsisa32r2el \
| mipsisa32r6 | mipsisa32r6el \
| mipsisa64 | mipsisa64el \
| mipsisa64r2 | mipsisa64r2el \
| mipsisa64r6 | mipsisa64r6el \
| mipsisa64sb1 | mipsisa64sb1el \
| mipsisa64sr71k | mipsisa64sr71kel \
| mipsr5900 | mipsr5900el \
@@ -296,8 +298,7 @@ case $basic_machine in
| nds32 | nds32le | nds32be \
| nios | nios2 | nios2eb | nios2el \
| ns16k | ns32k \
| open8 \
| or1k | or32 \
| open8 | or1k | or1knd | or32 \
| pdp10 | pdp11 | pj | pjl \
| powerpc | powerpc64 | powerpc64le | powerpcle \
| pyramid \
@@ -402,8 +403,10 @@ case $basic_machine in
| mips64vr5900-* | mips64vr5900el-* \
| mipsisa32-* | mipsisa32el-* \
| mipsisa32r2-* | mipsisa32r2el-* \
| mipsisa32r6-* | mipsisa32r6el-* \
| mipsisa64-* | mipsisa64el-* \
| mipsisa64r2-* | mipsisa64r2el-* \
| mipsisa64r6-* | mipsisa64r6el-* \
| mipsisa64sb1-* | mipsisa64sb1el-* \
| mipsisa64sr71k-* | mipsisa64sr71kel-* \
| mipsr5900-* | mipsr5900el-* \
@@ -415,6 +418,7 @@ case $basic_machine in
| nios-* | nios2-* | nios2eb-* | nios2el-* \
| none-* | np1-* | ns16k-* | ns32k-* \
| open8-* \
| or1k*-* \
| orion-* \
| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
@@ -1376,7 +1380,7 @@ case $os in
| -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
| -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
| -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
| -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
| -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* | -tirtos*)
# Remember, each alternative MUST END IN *, to match a version number.
;;
-qnx*)
@@ -1400,6 +1404,9 @@ case $os in
-mac*)
os=`echo $os | sed -e 's|mac|macos|'`
;;
# Apple iOS
-ios*)
;;
-linux-dietlibc)
os=-linux-dietlibc
;;
@@ -1594,9 +1601,6 @@ case $basic_machine in
mips*-*)
os=-elf
;;
or1k-*)
os=-elf
;;
or32-*)
os=-coff
;;

File diff suppressed because it is too large

View File

@@ -44,7 +44,7 @@ AC_CACHE_CHECK([whether $1 is compilable],
dnl ============================================================================
dnl Library revision.
rev=1
rev=2
AC_SUBST([rev])
srcroot=$srcdir
@@ -141,7 +141,8 @@ if test "x$CFLAGS" = "x" ; then
JE_CFLAGS_APPEND([-Zi])
JE_CFLAGS_APPEND([-MT])
JE_CFLAGS_APPEND([-W3])
CPPFLAGS="$CPPFLAGS -I${srcroot}/include/msvc_compat"
JE_CFLAGS_APPEND([-FS])
CPPFLAGS="$CPPFLAGS -I${srcdir}/include/msvc_compat"
fi
fi
dnl Append EXTRA_CFLAGS to CFLAGS, if defined.
@@ -156,7 +157,7 @@ if test "x${ac_cv_big_endian}" = "x1" ; then
fi
if test "x${je_cv_msvc}" = "xyes" -a "x${ac_cv_header_inttypes_h}" = "xno"; then
CPPFLAGS="$CPPFLAGS -I${srcroot}/include/msvc_compat/C99"
CPPFLAGS="$CPPFLAGS -I${srcdir}/include/msvc_compat/C99"
fi
AC_CHECK_SIZEOF([void *])
@@ -205,23 +206,14 @@ AC_CANONICAL_HOST
dnl CPU-specific settings.
CPU_SPINWAIT=""
case "${host_cpu}" in
i[[345]]86)
;;
i686|x86_64)
JE_COMPILABLE([pause instruction], [],
[[__asm__ volatile("pause"); return 0;]],
[je_cv_pause])
AC_CACHE_VAL([je_cv_pause],
[JE_COMPILABLE([pause instruction], [],
[[__asm__ volatile("pause"); return 0;]],
[je_cv_pause])])
if test "x${je_cv_pause}" = "xyes" ; then
CPU_SPINWAIT='__asm__ volatile("pause")'
fi
dnl emmintrin.h fails to compile unless MMX, SSE, and SSE2 are
dnl supported.
JE_COMPILABLE([SSE2 intrinsics], [
#include <emmintrin.h>
], [], [je_cv_sse2])
if test "x${je_cv_sse2}" = "xyes" ; then
AC_DEFINE_UNQUOTED([HAVE_SSE2], [ ])
fi
;;
powerpc)
AC_DEFINE_UNQUOTED([HAVE_ALTIVEC], [ ])
@@ -263,7 +255,7 @@ dnl definitions need to be seen before any headers are included, which is a pain
dnl to make happen otherwise.
default_munmap="1"
case "${host}" in
*-*-darwin*)
*-*-darwin* | *-*-ios*)
CFLAGS="$CFLAGS"
abi="macho"
AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ])
@@ -282,6 +274,11 @@ case "${host}" in
AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ])
force_lazy_lock="1"
;;
*-*-dragonfly*)
CFLAGS="$CFLAGS"
abi="elf"
AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ])
;;
*-*-linux*)
CFLAGS="$CFLAGS"
CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE"
@@ -324,7 +321,7 @@ case "${host}" in
fi
abi="xcoff"
;;
*-*-mingw*)
*-*-mingw* | *-*-cygwin*)
abi="pecoff"
force_tls="0"
RPATH=""
@@ -405,7 +402,7 @@ SAVED_CFLAGS="${CFLAGS}"
JE_CFLAGS_APPEND([-Werror])
JE_COMPILABLE([tls_model attribute], [],
[static __thread int
__attribute__((tls_model("initial-exec"))) foo;
__attribute__((tls_model("initial-exec"), unused)) foo;
foo = 0;],
[je_cv_tls_model])
CFLAGS="${SAVED_CFLAGS}"
@@ -446,7 +443,7 @@ AC_PROG_RANLIB
AC_PATH_PROG([LD], [ld], [false], [$PATH])
AC_PATH_PROG([AUTOCONF], [autoconf], [false], [$PATH])
public_syms="malloc_conf malloc_message malloc calloc posix_memalign aligned_alloc realloc free mallocx rallocx xallocx sallocx dallocx nallocx mallctl mallctlnametomib mallctlbymib malloc_stats_print malloc_usable_size"
public_syms="malloc_conf malloc_message malloc calloc posix_memalign aligned_alloc realloc free mallocx rallocx xallocx sallocx dallocx sdallocx nallocx mallctl mallctlnametomib mallctlbymib malloc_stats_print malloc_usable_size"
dnl Check for allocator-related functions that should be wrapped.
AC_CHECK_FUNC([memalign],
@@ -456,24 +453,6 @@ AC_CHECK_FUNC([valloc],
[AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC], [ ])
public_syms="${public_syms} valloc"])
dnl Support the experimental API by default.
AC_ARG_ENABLE([experimental],
[AS_HELP_STRING([--disable-experimental],
[Disable support for the experimental API])],
[if test "x$enable_experimental" = "xno" ; then
enable_experimental="0"
else
enable_experimental="1"
fi
],
[enable_experimental="1"]
)
if test "x$enable_experimental" = "x1" ; then
AC_DEFINE([JEMALLOC_EXPERIMENTAL], [ ])
public_syms="${public_syms} allocm dallocm nallocm rallocm sallocm"
fi
AC_SUBST([enable_experimental])
dnl Do not compute test code coverage by default.
GCOV_FLAGS=
AC_ARG_ENABLE([code-coverage],
@@ -552,31 +531,37 @@ je_="je_"
AC_SUBST([je_])
cfgoutputs_in="Makefile.in"
cfgoutputs_in="${cfgoutputs_in} jemalloc.pc.in"
cfgoutputs_in="${cfgoutputs_in} doc/html.xsl.in"
cfgoutputs_in="${cfgoutputs_in} doc/manpages.xsl.in"
cfgoutputs_in="${cfgoutputs_in} doc/jemalloc.xml.in"
cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_macros.h.in"
cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_protos.h.in"
cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_typedefs.h.in"
cfgoutputs_in="${cfgoutputs_in} include/jemalloc/internal/jemalloc_internal.h.in"
cfgoutputs_in="${cfgoutputs_in} test/test.sh.in"
cfgoutputs_in="${cfgoutputs_in} test/include/test/jemalloc_test.h.in"
cfgoutputs_out="Makefile"
cfgoutputs_out="${cfgoutputs_out} jemalloc.pc"
cfgoutputs_out="${cfgoutputs_out} doc/html.xsl"
cfgoutputs_out="${cfgoutputs_out} doc/manpages.xsl"
cfgoutputs_out="${cfgoutputs_out} doc/jemalloc.xml"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_macros.h"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_protos.h"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_typedefs.h"
cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_internal.h"
cfgoutputs_out="${cfgoutputs_out} test/test.sh"
cfgoutputs_out="${cfgoutputs_out} test/include/test/jemalloc_test.h"
cfgoutputs_tup="Makefile"
cfgoutputs_tup="${cfgoutputs_tup} jemalloc.pc:jemalloc.pc.in"
cfgoutputs_tup="${cfgoutputs_tup} doc/html.xsl:doc/html.xsl.in"
cfgoutputs_tup="${cfgoutputs_tup} doc/manpages.xsl:doc/manpages.xsl.in"
cfgoutputs_tup="${cfgoutputs_tup} doc/jemalloc.xml:doc/jemalloc.xml.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_macros.h:include/jemalloc/jemalloc_macros.h.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_protos.h:include/jemalloc/jemalloc_protos.h.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_typedefs.h:include/jemalloc/jemalloc_typedefs.h.in"
cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h"
cfgoutputs_tup="${cfgoutputs_tup} test/test.sh:test/test.sh.in"
cfgoutputs_tup="${cfgoutputs_tup} test/include/test/jemalloc_test.h:test/include/test/jemalloc_test.h.in"
@@ -613,18 +598,17 @@ cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.i
cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:include/jemalloc/internal/jemalloc_internal_defs.h.in"
cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:test/include/test/jemalloc_test_defs.h.in"
dnl Do not silence irrelevant compiler warnings by default, since enabling this
dnl option incurs a performance penalty.
dnl Silence irrelevant compiler warnings by default.
AC_ARG_ENABLE([cc-silence],
[AS_HELP_STRING([--enable-cc-silence],
[Silence irrelevant compiler warnings])],
[AS_HELP_STRING([--disable-cc-silence],
[Do not silence irrelevant compiler warnings])],
[if test "x$enable_cc_silence" = "xno" ; then
enable_cc_silence="0"
else
enable_cc_silence="1"
fi
],
[enable_cc_silence="0"]
[enable_cc_silence="1"]
)
if test "x$enable_cc_silence" = "x1" ; then
AC_DEFINE([JEMALLOC_CC_SILENCE], [ ])
@@ -739,7 +723,7 @@ fi,
if test "x$backtrace_method" = "x" -a "x$enable_prof_libunwind" = "x1" ; then
AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"])
if test "x$LUNWIND" = "x-lunwind" ; then
AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"],
AC_CHECK_LIB([unwind], [unw_backtrace], [LIBS="$LIBS $LUNWIND"],
[enable_prof_libunwind="0"])
else
LIBS="$LIBS $LUNWIND"
@@ -800,11 +784,6 @@ fi
AC_MSG_CHECKING([configured backtracing method])
AC_MSG_RESULT([$backtrace_method])
if test "x$enable_prof" = "x1" ; then
if test "x${force_tls}" = "x0" ; then
AC_MSG_ERROR([Heap profiling requires TLS]);
fi
force_tls="1"
if test "x$abi" != "xpecoff"; then
dnl Heap profiling uses the log(3) function.
LIBS="$LIBS -lm"
@@ -830,33 +809,6 @@ if test "x$enable_tcache" = "x1" ; then
fi
AC_SUBST([enable_tcache])
dnl Disable mremap() for huge realloc() by default.
AC_ARG_ENABLE([mremap],
[AS_HELP_STRING([--enable-mremap], [Enable mremap(2) for huge realloc()])],
[if test "x$enable_mremap" = "xno" ; then
enable_mremap="0"
else
enable_mremap="1"
fi
],
[enable_mremap="0"]
)
if test "x$enable_mremap" = "x1" ; then
JE_COMPILABLE([mremap(...MREMAP_FIXED...)], [
#define _GNU_SOURCE
#include <sys/mman.h>
], [
void *p = mremap((void *)0, 0, 0, MREMAP_MAYMOVE|MREMAP_FIXED, (void *)0);
], [je_cv_mremap_fixed])
if test "x${je_cv_mremap_fixed}" = "xno" ; then
enable_mremap="0"
fi
fi
if test "x$enable_mremap" = "x1" ; then
AC_DEFINE([JEMALLOC_MREMAP], [ ])
fi
AC_SUBST([enable_mremap])
dnl Enable VM deallocation via munmap() by default.
AC_ARG_ENABLE([munmap],
[AS_HELP_STRING([--disable-munmap], [Disable VM deallocation via munmap(2)])],
@@ -873,34 +825,22 @@ if test "x$enable_munmap" = "x1" ; then
fi
AC_SUBST([enable_munmap])
dnl Do not enable allocation from DSS by default.
AC_ARG_ENABLE([dss],
[AS_HELP_STRING([--enable-dss], [Enable allocation from DSS])],
[if test "x$enable_dss" = "xno" ; then
enable_dss="0"
else
enable_dss="1"
fi
],
[enable_dss="0"]
)
dnl Enable allocation from DSS if supported by the OS.
have_dss="1"
dnl Check whether the BSD/SUSv1 sbrk() exists. If not, disable DSS support.
AC_CHECK_FUNC([sbrk], [have_sbrk="1"], [have_sbrk="0"])
if test "x$have_sbrk" = "x1" ; then
if test "x$sbrk_deprecated" == "x1" ; then
if test "x$sbrk_deprecated" = "x1" ; then
AC_MSG_RESULT([Disabling dss allocation because sbrk is deprecated])
enable_dss="0"
else
AC_DEFINE([JEMALLOC_HAVE_SBRK], [ ])
have_dss="0"
fi
else
enable_dss="0"
have_dss="0"
fi
if test "x$enable_dss" = "x1" ; then
if test "x$have_dss" = "x1" ; then
AC_DEFINE([JEMALLOC_DSS], [ ])
fi
AC_SUBST([enable_dss])
dnl Support the junk/zero filling option by default.
AC_ARG_ENABLE([fill],
@@ -992,8 +932,66 @@ if test "x$enable_xmalloc" = "x1" ; then
fi
AC_SUBST([enable_xmalloc])
AC_CACHE_CHECK([STATIC_PAGE_SHIFT],
[je_cv_static_page_shift],
dnl ============================================================================
dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found.
dnl One of those two functions should (theoretically) exist on all platforms
dnl that jemalloc currently has a chance of functioning on without modification.
dnl We additionally assume ffs() or __builtin_ffs() are defined if
dnl ffsl() or __builtin_ffsl() are defined, respectively.
JE_COMPILABLE([a program using __builtin_ffsl], [
#include <stdio.h>
#include <strings.h>
#include <string.h>
], [
{
int rv = __builtin_ffsl(0x08);
printf("%d\n", rv);
}
], [je_cv_gcc_builtin_ffsl])
if test "x${je_cv_gcc_builtin_ffsl}" = "xyes" ; then
AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl])
AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs])
else
JE_COMPILABLE([a program using ffsl], [
#include <stdio.h>
#include <strings.h>
#include <string.h>
], [
{
int rv = ffsl(0x08);
printf("%d\n", rv);
}
], [je_cv_function_ffsl])
if test "x${je_cv_function_ffsl}" = "xyes" ; then
AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl])
AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs])
else
AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()])
fi
fi
AC_ARG_WITH([lg_tiny_min],
[AS_HELP_STRING([--with-lg-tiny-min=<lg-tiny-min>],
[Base 2 log of minimum tiny size class to support])],
[LG_TINY_MIN="$with_lg_tiny_min"],
[LG_TINY_MIN="3"])
AC_DEFINE_UNQUOTED([LG_TINY_MIN], [$LG_TINY_MIN])
AC_ARG_WITH([lg_quantum],
[AS_HELP_STRING([--with-lg-quantum=<lg-quantum>],
[Base 2 log of minimum allocation alignment])],
[LG_QUANTA="$with_lg_quantum"],
[LG_QUANTA="3 4"])
if test "x$with_lg_quantum" != "x" ; then
AC_DEFINE_UNQUOTED([LG_QUANTUM], [$with_lg_quantum])
fi
AC_ARG_WITH([lg_page],
[AS_HELP_STRING([--with-lg-page=<lg-page>], [Base 2 log of system page size])],
[LG_PAGE="$with_lg_page"], [LG_PAGE="detect"])
if test "x$LG_PAGE" == "xdetect"; then
AC_CACHE_CHECK([LG_PAGE],
[je_cv_lg_page],
AC_RUN_IFELSE([AC_LANG_PROGRAM(
[[
#include <strings.h>
@@ -1018,7 +1016,7 @@ AC_CACHE_CHECK([STATIC_PAGE_SHIFT],
if (result == -1) {
return 1;
}
result = ffsl(result) - 1;
result = JEMALLOC_INTERNAL_FFSL(result) - 1;
f = fopen("conftest.out", "w");
if (f == NULL) {
@@ -1029,24 +1027,35 @@ AC_CACHE_CHECK([STATIC_PAGE_SHIFT],
return 0;
]])],
[je_cv_static_page_shift=`cat conftest.out`],
[je_cv_static_page_shift=undefined]))
if test "x$je_cv_static_page_shift" != "xundefined"; then
AC_DEFINE_UNQUOTED([STATIC_PAGE_SHIFT], [$je_cv_static_page_shift])
else
AC_MSG_ERROR([cannot determine value for STATIC_PAGE_SHIFT])
[je_cv_lg_page=`cat conftest.out`],
[je_cv_lg_page=undefined],
[je_cv_lg_page=12]))
fi
if test "x${je_cv_lg_page}" != "x" ; then
LG_PAGE="${je_cv_lg_page}"
fi
if test "x${LG_PAGE}" != "xundefined" ; then
AC_DEFINE_UNQUOTED([LG_PAGE], [$LG_PAGE])
else
AC_MSG_ERROR([cannot determine value for LG_PAGE])
fi
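The detection above amounts to taking the base 2 log of the system page size.
A minimal standalone equivalent of the probe (editor's sketch; assumes a POSIX
sysconf() and glibc, where ffsl() wants _GNU_SOURCE):

    #define _GNU_SOURCE	/* for ffsl() on glibc */
    #include <stdio.h>
    #include <strings.h>
    #include <unistd.h>

    int
    main(void)
    {
    	long result = sysconf(_SC_PAGESIZE);
    	if (result == -1)
    		return (1);
    	/* Page sizes are powers of two, so ffsl() returns the 1-based
    	 * index of the lone set bit; subtracting 1 yields LG_PAGE. */
    	printf("LG_PAGE=%d\n", ffsl(result) - 1);
    	return (0);
    }

A 4 KiB page prints LG_PAGE=12, matching the cross-compiling fallback above.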
AC_ARG_WITH([lg_page_sizes],
[AS_HELP_STRING([--with-lg-page-sizes=<lg-page-sizes>],
[Base 2 logs of system page sizes to support])],
[LG_PAGE_SIZES="$with_lg_page_sizes"], [LG_PAGE_SIZES="$LG_PAGE"])
AC_ARG_WITH([lg_size_class_group],
[AS_HELP_STRING([--with-lg-size-class-group=<lg-size-class-group>],
[Base 2 log of size classes per doubling])],
[LG_SIZE_CLASS_GROUP="$with_lg_size_class_group"],
[LG_SIZE_CLASS_GROUP="2"])
dnl ============================================================================
dnl jemalloc configuration.
dnl
dnl Set VERSION if source directory has an embedded git repository.
if test -d "${srcroot}.git" ; then
git describe --long --abbrev=40 > ${srcroot}VERSION
fi
jemalloc_version=`cat ${srcroot}VERSION`
jemalloc_version=`cat "${srcroot}VERSION"`
jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'`
jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'`
jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'`
@@ -1073,6 +1082,24 @@ fi
CPPFLAGS="$CPPFLAGS -D_REENTRANT"
dnl Check if the GNU-specific secure_getenv function exists.
AC_CHECK_FUNC([secure_getenv],
[have_secure_getenv="1"],
[have_secure_getenv="0"]
)
if test "x$have_secure_getenv" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_SECURE_GETENV], [ ])
fi
dnl Check if the Solaris/BSD issetugid function exists.
AC_CHECK_FUNC([issetugid],
[have_issetugid="1"],
[have_issetugid="0"]
)
if test "x$have_issetugid" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_ISSETUGID], [ ])
fi
dnl Check whether the BSD-specific _malloc_thread_cleanup() exists. If so, use
dnl it rather than pthreads TSD cleanup functions to support cleanup during
dnl thread exit, in order to avoid pthreads library recursion during
@@ -1165,43 +1192,25 @@ elif test "x${force_tls}" = "x1" ; then
fi
dnl ============================================================================
dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found.
dnl One of those two functions should (theoretically) exist on all platforms
dnl that jemalloc currently has a chance of functioning on without modification.
dnl We additionally assume ffs() or __builtin_ffs() are defined if
dnl ffsl() or __builtin_ffsl() are defined, respectively.
JE_COMPILABLE([a program using __builtin_ffsl], [
#include <stdio.h>
#include <strings.h>
#include <string.h>
], [
{
int rv = __builtin_ffsl(0x08);
printf("%d\n", rv);
}
], [je_cv_gcc_builtin_ffsl])
if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then
AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl])
AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs])
else
JE_COMPILABLE([a program using ffsl], [
#include <stdio.h>
#include <strings.h>
#include <string.h>
], [
{
int rv = ffsl(0x08);
printf("%d\n", rv);
}
], [je_cv_function_ffsl])
if test "x${je_cv_function_ffsl}" == "xyes" ; then
AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl])
AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs])
else
AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()])
fi
fi
dnl Check for C11 atomics.
JE_COMPILABLE([C11 atomics], [
#include <stdint.h>
#if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
#include <stdatomic.h>
#else
#error Atomics not available
#endif
], [
uint64_t *p = (uint64_t *)0;
uint64_t x = 1;
volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
uint64_t r = atomic_fetch_add(a, x) + x;
return (r == 0);
], [je_cv_c11atomics])
if test "x${je_cv_c11atomics}" = "xyes" ; then
AC_DEFINE([JEMALLOC_C11ATOMICS])
fi
dnl ============================================================================
dnl Check for atomic(9) operations as provided on FreeBSD.
@@ -1248,6 +1257,20 @@ if test "x${je_cv_osatomic}" = "xyes" ; then
AC_DEFINE([JEMALLOC_OSATOMIC], [ ])
fi
dnl ============================================================================
dnl Check for madvise(2).
JE_COMPILABLE([madvise(2)], [
#include <sys/mman.h>
], [
{
madvise((void *)0, 0, 0);
}
], [je_cv_madvise])
if test "x${je_cv_madvise}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_MADVISE], [ ])
fi
dnl ============================================================================
dnl Check whether __sync_{add,sub}_and_fetch() are available despite
dnl __GCC_HAVE_SYNC_COMPARE_AND_SWAP_n macros being undefined.
@@ -1282,6 +1305,29 @@ if test "x${je_cv_atomic9}" != "xyes" -a "x${je_cv_osatomic}" != "xyes" ; then
JE_SYNC_COMPARE_AND_SWAP_CHECK(64, 8)
fi
dnl ============================================================================
dnl Check for __builtin_clz() and __builtin_clzl().
AC_CACHE_CHECK([for __builtin_clz],
[je_cv_builtin_clz],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
[
{
unsigned x = 0;
int y = __builtin_clz(x);
}
{
unsigned long x = 0;
int y = __builtin_clzl(x);
}
])],
[je_cv_builtin_clz=yes],
[je_cv_builtin_clz=no])])
if test "x${je_cv_builtin_clz}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_BUILTIN_CLZ], [ ])
fi
dnl ============================================================================
dnl Check for spinlock(3) operations as provided on Darwin.
@@ -1330,7 +1376,7 @@ if test "x${enable_zone_allocator}" = "x1" ; then
AC_DEFUN([JE_ZONE_PROGRAM],
[AC_LANG_PROGRAM(
[#include <malloc/malloc.h>],
[static foo[[sizeof($1) $2 sizeof(void *) * $3 ? 1 : -1]]]
[static int foo[[sizeof($1) $2 sizeof(void *) * $3 ? 1 : -1]]]
)])
AC_COMPILE_IFELSE([JE_ZONE_PROGRAM(malloc_zone_t,==,14)],[JEMALLOC_ZONE_VERSION=3],[
@@ -1355,6 +1401,49 @@ if test "x${enable_zone_allocator}" = "x1" ; then
AC_DEFINE_UNQUOTED(JEMALLOC_ZONE_VERSION, [$JEMALLOC_ZONE_VERSION])
fi
dnl ============================================================================
dnl Check for glibc malloc hooks
JE_COMPILABLE([glibc malloc hook], [
#include <stddef.h>
extern void (* __free_hook)(void *ptr);
extern void *(* __malloc_hook)(size_t size);
extern void *(* __realloc_hook)(void *ptr, size_t size);
], [
void *ptr = 0L;
if (__malloc_hook) ptr = __malloc_hook(1);
if (__realloc_hook) ptr = __realloc_hook(ptr, 2);
if (__free_hook && ptr) __free_hook(ptr);
], [je_cv_glibc_malloc_hook])
if test "x${je_cv_glibc_malloc_hook}" = "xyes" ; then
AC_DEFINE([JEMALLOC_GLIBC_MALLOC_HOOK], [ ])
fi
JE_COMPILABLE([glibc memalign hook], [
#include <stddef.h>
extern void *(* __memalign_hook)(size_t alignment, size_t size);
], [
void *ptr = 0L;
if (__memalign_hook) ptr = __memalign_hook(16, 7);
], [je_cv_glibc_memalign_hook])
if test "x${je_cv_glibc_memalign_hook}" = "xyes" ; then
AC_DEFINE([JEMALLOC_GLIBC_MEMALIGN_HOOK], [ ])
fi
JE_COMPILABLE([pthreads adaptive mutexes], [
#include <pthread.h>
], [
pthread_mutexattr_t attr;
pthread_mutexattr_init(&attr);
pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
pthread_mutexattr_destroy(&attr);
], [je_cv_pthread_mutex_adaptive_np])
if test "x${je_cv_pthread_mutex_adaptive_np}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP], [ ])
fi
dnl ============================================================================
dnl Check for typedefs, structures, and compiler characteristics.
AC_HEADER_STDBOOL
@@ -1415,10 +1504,14 @@ AC_CONFIG_COMMANDS([include/jemalloc/internal/public_unnamespace.h], [
])
AC_CONFIG_COMMANDS([include/jemalloc/internal/size_classes.h], [
mkdir -p "${objroot}include/jemalloc/internal"
"${srcdir}/include/jemalloc/internal/size_classes.sh" > "${objroot}include/jemalloc/internal/size_classes.h"
"${srcdir}/include/jemalloc/internal/size_classes.sh" "${LG_QUANTA}" ${LG_TINY_MIN} "${LG_PAGE_SIZES}" ${LG_SIZE_CLASS_GROUP} > "${objroot}include/jemalloc/internal/size_classes.h"
], [
srcdir="${srcdir}"
objroot="${objroot}"
LG_QUANTA="${LG_QUANTA}"
LG_TINY_MIN=${LG_TINY_MIN}
LG_PAGE_SIZES="${LG_PAGE_SIZES}"
LG_SIZE_CLASS_GROUP=${LG_SIZE_CLASS_GROUP}
])
AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_protos_jet.h], [
mkdir -p "${objroot}include/jemalloc"
@@ -1504,7 +1597,6 @@ AC_MSG_RESULT([JEMALLOC_PRIVATE_NAMESPACE])
AC_MSG_RESULT([ : ${JEMALLOC_PRIVATE_NAMESPACE}])
AC_MSG_RESULT([install_suffix : ${install_suffix}])
AC_MSG_RESULT([autogen : ${enable_autogen}])
AC_MSG_RESULT([experimental : ${enable_experimental}])
AC_MSG_RESULT([cc-silence : ${enable_cc_silence}])
AC_MSG_RESULT([debug : ${enable_debug}])
AC_MSG_RESULT([code-coverage : ${enable_code_coverage}])
@@ -1518,9 +1610,7 @@ AC_MSG_RESULT([fill : ${enable_fill}])
AC_MSG_RESULT([utrace : ${enable_utrace}])
AC_MSG_RESULT([valgrind : ${enable_valgrind}])
AC_MSG_RESULT([xmalloc : ${enable_xmalloc}])
AC_MSG_RESULT([mremap : ${enable_mremap}])
AC_MSG_RESULT([munmap : ${enable_munmap}])
AC_MSG_RESULT([dss : ${enable_dss}])
AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}])
AC_MSG_RESULT([tls : ${enable_tls}])
AC_MSG_RESULT([===============================================================================])

File diff suppressed because it is too large

View File

@@ -1,30 +1,10 @@
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES
/*
* RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized
* as small as possible such that this setting is still honored, without
* violating other constraints. The goal is to make runs as small as possible
* without exceeding a per run external fragmentation threshold.
*
* We use binary fixed point math for overhead computations, where the binary
* point is implicitly RUN_BFP bits to the left.
*
* Note that it is possible to set RUN_MAX_OVRHD low enough that it cannot be
* honored for some/all object sizes, since when heap profiling is enabled
* there is one pointer of header overhead per object (plus a constant). This
* constraint is relaxed (ignored) for runs that are so small that the
* per-region overhead is greater than:
*
* (RUN_MAX_OVRHD / (reg_interval << (3+RUN_BFP)))
*/
#define RUN_BFP 12
/* \/ Implicit binary fixed point. */
#define RUN_MAX_OVRHD 0x0000003dU
#define RUN_MAX_OVRHD_RELAX 0x00001800U
#define LARGE_MINCLASS (ZU(1) << LG_LARGE_MINCLASS)
/* Maximum number of regions in one run. */
#define LG_RUN_MAXREGS 11
#define LG_RUN_MAXREGS (LG_PAGE - LG_TINY_MIN)
#define RUN_MAXREGS (1U << LG_RUN_MAXREGS)
/*
@@ -43,9 +23,10 @@
*/
#define LG_DIRTY_MULT_DEFAULT 3
typedef struct arena_chunk_map_s arena_chunk_map_t;
typedef struct arena_chunk_s arena_chunk_t;
typedef struct arena_run_s arena_run_t;
typedef struct arena_chunk_map_bits_s arena_chunk_map_bits_t;
typedef struct arena_chunk_map_misc_s arena_chunk_map_misc_t;
typedef struct arena_chunk_s arena_chunk_t;
typedef struct arena_bin_info_s arena_bin_info_t;
typedef struct arena_bin_s arena_bin_t;
typedef struct arena_s arena_t;
@@ -54,41 +35,19 @@ typedef struct arena_s arena_t;
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
struct arena_run_s {
/* Index of bin this run is associated with. */
index_t binind;
/* Number of free regions in run. */
unsigned nfree;
/* Per region allocated/deallocated bitmap. */
bitmap_t bitmap[BITMAP_GROUPS_MAX];
};
/* Each element of the chunk map corresponds to one page within the chunk. */
struct arena_chunk_map_s {
#ifndef JEMALLOC_PROF
/*
* Overlay prof_ctx in order to allow it to be referenced by dead code.
* Such antics aren't warranted for per arena data structures, but
* chunk map overhead accounts for a percentage of memory, rather than
* being just a fixed cost.
*/
union {
#endif
union {
/*
* Linkage for run trees. There are two disjoint uses:
*
* 1) arena_t's runs_avail tree.
* 2) arena_run_t conceptually uses this linkage for in-use
* non-full runs, rather than directly embedding linkage.
*/
rb_node(arena_chunk_map_t) rb_link;
/*
* List of runs currently in purgatory. arena_chunk_purge()
* temporarily allocates runs that contain dirty pages while
* purging, so that other threads cannot use the runs while the
* purging thread is operating without the arena lock held.
*/
ql_elm(arena_chunk_map_t) ql_link;
} u;
/* Profile counters, used for large object runs. */
prof_ctx_t *prof_ctx;
#ifndef JEMALLOC_PROF
}; /* union { ... }; */
#endif
struct arena_chunk_map_bits_s {
/*
* Run address (or size) and various flags are stored together. The bit
* layout looks like (assuming 32-bit system):
@@ -110,7 +69,6 @@ struct arena_chunk_map_s {
* p : run page offset
* s : run size
* n : binind for size class; large objects set these to BININD_INVALID
* except for promoted allocations (see prof_promote)
* x : don't care
* - : 0
* + : 1
@@ -137,11 +95,15 @@ struct arena_chunk_map_s {
* xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
* -------- -------- ----++++ ++++D-LA
*
* Large (sampled, size <= PAGE):
* Large (sampled, size <= LARGE_MINCLASS):
* ssssssss ssssssss ssssnnnn nnnnD-LA
* xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
* -------- -------- ----++++ ++++D-LA
*
* Large (not sampled, size == PAGE):
* Large (not sampled, size == LARGE_MINCLASS):
* ssssssss ssssssss ssss++++ ++++D-LA
* xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
* -------- -------- ----++++ ++++D-LA
*/
size_t bits;
#define CHUNK_MAP_BININD_SHIFT 4
@@ -156,52 +118,49 @@ struct arena_chunk_map_s {
#define CHUNK_MAP_ALLOCATED ((size_t)0x1U)
#define CHUNK_MAP_KEY CHUNK_MAP_ALLOCATED
};
typedef rb_tree(arena_chunk_map_t) arena_avail_tree_t;
typedef rb_tree(arena_chunk_map_t) arena_run_tree_t;
typedef ql_head(arena_chunk_map_t) arena_chunk_mapelms_t;
/*
* Each arena_chunk_map_misc_t corresponds to one page within the chunk, just
* like arena_chunk_map_bits_t. Two separate arrays are stored within each
* chunk header in order to improve cache locality.
*/
struct arena_chunk_map_misc_s {
/*
* Linkage for run trees. There are two disjoint uses:
*
* 1) arena_t's runs_avail tree.
* 2) arena_run_t conceptually uses this linkage for in-use non-full
* runs, rather than directly embedding linkage.
*/
rb_node(arena_chunk_map_misc_t) rb_link;
union {
/* Linkage for list of dirty runs. */
ql_elm(arena_chunk_map_misc_t) dr_link;
/* Profile counters, used for large object runs. */
prof_tctx_t *prof_tctx;
/* Small region run metadata. */
arena_run_t run;
};
};
typedef rb_tree(arena_chunk_map_misc_t) arena_avail_tree_t;
typedef rb_tree(arena_chunk_map_misc_t) arena_run_tree_t;
typedef ql_head(arena_chunk_map_misc_t) arena_chunk_miscelms_t;
/* Arena chunk header. */
struct arena_chunk_s {
/* Arena that owns the chunk. */
arena_t *arena;
/* Linkage for tree of arena chunks that contain dirty runs. */
rb_node(arena_chunk_t) dirty_link;
/* Number of dirty pages. */
size_t ndirty;
/* Number of available runs. */
size_t nruns_avail;
/*
* Number of available run adjacencies that purging could coalesce.
* Clean and dirty available runs are not coalesced, which causes
* virtual memory fragmentation. The ratio of
* (nruns_avail-nruns_adjac):nruns_adjac is used for tracking this
* fragmentation.
*/
size_t nruns_adjac;
/*
* Map of pages within chunk that keeps track of free/large/small. The
* first map_bias entries are omitted, since the chunk header does not
* need to be tracked in the map. This omission saves a header page
* for common chunk sizes (e.g. 4 MiB).
*/
arena_chunk_map_t map[1]; /* Dynamically sized. */
};
typedef rb_tree(arena_chunk_t) arena_chunk_tree_t;
struct arena_run_s {
/* Bin this run is associated with. */
arena_bin_t *bin;
/* Index of next region that has never been allocated, or nregs. */
uint32_t nextind;
/* Number of free regions in run. */
unsigned nfree;
arena_chunk_map_bits_t map_bits[1]; /* Dynamically sized. */
};
/*
@@ -212,12 +171,7 @@ struct arena_run_s {
* Each run has the following layout:
*
* /--------------------\
* | arena_run_t header |
* | ... |
* bitmap_offset | bitmap |
* | ... |
* ctx0_offset | ctx map |
* | ... |
* | pad? |
* |--------------------|
* | redzone |
* reg0_offset | region 0 |
@@ -258,24 +212,12 @@ struct arena_bin_info_s {
/* Total number of regions in a run for this bin's size class. */
uint32_t nregs;
/*
* Offset of first bitmap_t element in a run header for this bin's size
* class.
*/
uint32_t bitmap_offset;
/*
* Metadata used to manipulate bitmaps for runs associated with this
* bin.
*/
bitmap_info_t bitmap_info;
/*
* Offset of first (prof_ctx_t *) in a run header for this bin's size
* class, or 0 if (config_prof == false || opt_prof == false).
*/
uint32_t ctx0_offset;
/* Offset of first region in a run for this bin's size class. */
uint32_t reg0_offset;
};
@@ -321,8 +263,7 @@ struct arena_s {
/*
* There are three classes of arena operations from a locking
* perspective:
* 1) Thread asssignment (modifies nthreads) is protected by
* arenas_lock.
* 1) Thread assignment (modifies nthreads) is protected by arenas_lock.
* 2) Bin-related operations are protected by bin locks.
* 3) Chunk- and run-related operations are protected by this mutex.
*/
@@ -339,9 +280,6 @@ struct arena_s {
dss_prec_t dss_prec;
/* Tree of dirty-page-containing chunks this arena manages. */
arena_chunk_tree_t chunks_dirty;
/*
* In order to avoid rapid chunk allocation/deallocation when an arena
* oscillates right on the cusp of needing a new chunk, cache the most
@@ -354,7 +292,7 @@ struct arena_s {
*/
arena_chunk_t *spare;
/* Number of pages in active runs. */
/* Number of pages in active runs and huge regions. */
size_t nactive;
/*
@@ -365,20 +303,21 @@ struct arena_s {
*/
size_t ndirty;
/*
* Approximate number of pages being purged. It is possible for
* multiple threads to purge dirty pages concurrently, and they use
* npurgatory to indicate the total number of pages all threads are
* attempting to purge.
*/
size_t npurgatory;
/*
* Size/address-ordered trees of this arena's available runs. The trees
* are used for first-best-fit run allocation.
*/
arena_avail_tree_t runs_avail;
/* List of dirty runs this arena manages. */
arena_chunk_miscelms_t runs_dirty;
/*
* User-configurable chunk allocation and deallocation functions.
*/
chunk_alloc_t *chunk_alloc;
chunk_dalloc_t *chunk_dalloc;
/* bins is used to store trees of free regions. */
arena_bin_t bins[NBINS];
};
@@ -388,22 +327,28 @@ struct arena_s {
#ifdef JEMALLOC_H_EXTERNS
extern ssize_t opt_lg_dirty_mult;
/*
* small_size2bin is a compact lookup table that rounds request sizes up to
* size classes. In order to reduce cache footprint, the table is compressed,
* and all accesses are via the SMALL_SIZE2BIN macro.
*/
extern uint8_t const small_size2bin[];
#define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN])
extern arena_bin_info_t arena_bin_info[NBINS];
/* Number of large size classes. */
#define nlclasses (chunk_npages - map_bias)
extern size_t map_bias; /* Number of arena chunk header pages. */
extern size_t map_misc_offset;
extern size_t arena_maxrun; /* Max run size for arenas. */
extern size_t arena_maxclass; /* Max size class for arenas. */
extern unsigned nlclasses; /* Number of large size classes. */
extern unsigned nhclasses; /* Number of huge size classes. */
void *arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment,
bool *zero);
void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize);
void arena_chunk_ralloc_huge_similar(arena_t *arena, void *chunk,
size_t oldsize, size_t usize);
void arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk,
size_t oldsize, size_t usize);
bool arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk,
size_t oldsize, size_t usize, bool *zero);
void arena_purge_all(arena_t *arena);
void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin,
size_t binind, uint64_t prof_accumbytes);
index_t binind, uint64_t prof_accumbytes);
void arena_alloc_junk_small(void *ptr, arena_bin_info_t *bin_info,
bool zero);
#ifdef JEMALLOC_JET
@@ -420,17 +365,19 @@ void *arena_malloc_small(arena_t *arena, size_t size, bool zero);
void *arena_malloc_large(arena_t *arena, size_t size, bool zero);
void *arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero);
void arena_prof_promoted(const void *ptr, size_t size);
void arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr,
arena_chunk_map_t *mapelm);
void arena_dalloc_bin_junked_locked(arena_t *arena, arena_chunk_t *chunk,
void *ptr, arena_chunk_map_bits_t *bitselm);
void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
size_t pageind, arena_chunk_map_t *mapelm);
size_t pageind, arena_chunk_map_bits_t *bitselm);
void arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
size_t pageind);
#ifdef JEMALLOC_JET
typedef void (arena_dalloc_junk_large_t)(void *, size_t);
extern arena_dalloc_junk_large_t *arena_dalloc_junk_large;
#else
void arena_dalloc_junk_large(void *ptr, size_t usize);
#endif
void arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk,
void arena_dalloc_large_junked_locked(arena_t *arena, arena_chunk_t *chunk,
void *ptr);
void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr);
#ifdef JEMALLOC_JET
@@ -439,15 +386,15 @@ extern arena_ralloc_junk_large_t *arena_ralloc_junk_large;
#endif
bool arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
size_t extra, bool zero);
void *arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size,
size_t extra, size_t alignment, bool zero, bool try_tcache_alloc,
bool try_tcache_dalloc);
void *arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize,
size_t size, size_t extra, size_t alignment, bool zero,
bool try_tcache_alloc, bool try_tcache_dalloc);
dss_prec_t arena_dss_prec_get(arena_t *arena);
void arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive,
size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats,
malloc_large_stats_t *lstats);
bool arena_new(arena_t *arena, unsigned ind);
malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats);
arena_t *arena_new(unsigned ind);
void arena_boot(void);
void arena_prefork(arena_t *arena);
void arena_postfork_parent(arena_t *arena);
@@ -458,7 +405,13 @@ void arena_postfork_child(arena_t *arena);
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
arena_chunk_map_t *arena_mapp_get(arena_chunk_t *chunk, size_t pageind);
arena_chunk_map_bits_t *arena_bitselm_get(arena_chunk_t *chunk,
size_t pageind);
arena_chunk_map_misc_t *arena_miscelm_get(arena_chunk_t *chunk,
size_t pageind);
size_t arena_miscelm_to_pageind(arena_chunk_map_misc_t *miscelm);
void *arena_miscelm_to_rpages(arena_chunk_map_misc_t *miscelm);
arena_chunk_map_misc_t *arena_run_to_miscelm(arena_run_t *run);
size_t *arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbitsp_read(size_t *mapbitsp);
size_t arena_mapbits_get(arena_chunk_t *chunk, size_t pageind);
@@ -466,7 +419,7 @@ size_t arena_mapbits_unallocated_size_get(arena_chunk_t *chunk,
size_t pageind);
size_t arena_mapbits_large_size_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbits_small_runind_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind);
index_t arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbits_dirty_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbits_unzeroed_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbits_large_get(arena_chunk_t *chunk, size_t pageind);
@@ -479,43 +432,91 @@ void arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind,
void arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind,
size_t size, size_t flags);
void arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind,
size_t binind);
index_t binind);
void arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind,
size_t runind, size_t binind, size_t flags);
size_t runind, index_t binind, size_t flags);
void arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind,
size_t unzeroed);
bool arena_prof_accum_impl(arena_t *arena, uint64_t accumbytes);
bool arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes);
bool arena_prof_accum(arena_t *arena, uint64_t accumbytes);
size_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits);
size_t arena_bin_index(arena_t *arena, arena_bin_t *bin);
index_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits);
index_t arena_bin_index(arena_t *arena, arena_bin_t *bin);
unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info,
const void *ptr);
prof_ctx_t *arena_prof_ctx_get(const void *ptr);
void arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache);
prof_tctx_t *arena_prof_tctx_get(const void *ptr);
void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx);
void *arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero,
bool try_tcache);
size_t arena_salloc(const void *ptr, bool demote);
void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr,
void arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr,
bool try_tcache);
void arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size,
bool try_tcache);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_))
# ifdef JEMALLOC_ARENA_INLINE_A
JEMALLOC_ALWAYS_INLINE arena_chunk_map_t *
arena_mapp_get(arena_chunk_t *chunk, size_t pageind)
JEMALLOC_ALWAYS_INLINE arena_chunk_map_bits_t *
arena_bitselm_get(arena_chunk_t *chunk, size_t pageind)
{
assert(pageind >= map_bias);
assert(pageind < chunk_npages);
return (&chunk->map[pageind-map_bias]);
return (&chunk->map_bits[pageind-map_bias]);
}
JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t *
arena_miscelm_get(arena_chunk_t *chunk, size_t pageind)
{
assert(pageind >= map_bias);
assert(pageind < chunk_npages);
return ((arena_chunk_map_misc_t *)((uintptr_t)chunk +
(uintptr_t)map_misc_offset) + pageind-map_bias);
}
JEMALLOC_ALWAYS_INLINE size_t
arena_miscelm_to_pageind(arena_chunk_map_misc_t *miscelm)
{
arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm);
size_t pageind = ((uintptr_t)miscelm - ((uintptr_t)chunk +
map_misc_offset)) / sizeof(arena_chunk_map_misc_t) + map_bias;
assert(pageind >= map_bias);
assert(pageind < chunk_npages);
return (pageind);
}
JEMALLOC_ALWAYS_INLINE void *
arena_miscelm_to_rpages(arena_chunk_map_misc_t *miscelm)
{
arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm);
size_t pageind = arena_miscelm_to_pageind(miscelm);
return ((void *)((uintptr_t)chunk + (pageind << LG_PAGE)));
}
JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t *
arena_run_to_miscelm(arena_run_t *run)
{
arena_chunk_map_misc_t *miscelm = (arena_chunk_map_misc_t
*)((uintptr_t)run - offsetof(arena_chunk_map_misc_t, run));
assert(arena_miscelm_to_pageind(miscelm) >= map_bias);
assert(arena_miscelm_to_pageind(miscelm) < chunk_npages);
return (miscelm);
}
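arena_run_to_miscelm above is an instance of the container-of idiom: the run is embedded in arena_chunk_map_misc_t, so subtracting offsetof(..., run) from the run pointer recovers the enclosing element. Generic sketch (hypothetical macro, not part of this patch):
/* Recover the struct that embeds `member` from a pointer to the member. */
#define container_of(ptr, type, member) \
	((type *)((uintptr_t)(ptr) - offsetof(type, member)))
/* arena_run_to_miscelm(run) behaves like
 * container_of(run, arena_chunk_map_misc_t, run). */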
JEMALLOC_ALWAYS_INLINE size_t *
arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind)
{
return (&arena_mapp_get(chunk, pageind)->bits);
return (&arena_bitselm_get(chunk, pageind)->bits);
}
JEMALLOC_ALWAYS_INLINE size_t
@@ -564,11 +565,11 @@ arena_mapbits_small_runind_get(arena_chunk_t *chunk, size_t pageind)
return (mapbits >> LG_PAGE);
}
JEMALLOC_ALWAYS_INLINE size_t
JEMALLOC_ALWAYS_INLINE index_t
arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind)
{
size_t mapbits;
size_t binind;
index_t binind;
mapbits = arena_mapbits_get(chunk, pageind);
binind = (mapbits & CHUNK_MAP_BININD_MASK) >> CHUNK_MAP_BININD_SHIFT;
@@ -660,20 +661,20 @@ arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size,
JEMALLOC_ALWAYS_INLINE void
arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind,
size_t binind)
index_t binind)
{
size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
size_t mapbits = arena_mapbitsp_read(mapbitsp);
assert(binind <= BININD_INVALID);
assert(arena_mapbits_large_size_get(chunk, pageind) == PAGE);
assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS);
arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_BININD_MASK) |
(binind << CHUNK_MAP_BININD_SHIFT));
}
JEMALLOC_ALWAYS_INLINE void
arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, size_t runind,
size_t binind, size_t flags)
index_t binind, size_t flags)
{
size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
size_t mapbits = arena_mapbitsp_read(mapbitsp);
@@ -719,7 +720,7 @@ arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes)
cassert(config_prof);
if (prof_interval == 0)
if (likely(prof_interval == 0))
return (false);
return (arena_prof_accum_impl(arena, accumbytes));
}
@@ -730,7 +731,7 @@ arena_prof_accum(arena_t *arena, uint64_t accumbytes)
cassert(config_prof);
if (prof_interval == 0)
if (likely(prof_interval == 0))
return (false);
{
@@ -743,10 +744,10 @@ arena_prof_accum(arena_t *arena, uint64_t accumbytes)
}
}
JEMALLOC_ALWAYS_INLINE size_t
JEMALLOC_ALWAYS_INLINE index_t
arena_ptr_small_binind_get(const void *ptr, size_t mapbits)
{
size_t binind;
index_t binind;
binind = (mapbits & CHUNK_MAP_BININD_MASK) >> CHUNK_MAP_BININD_SHIFT;
@@ -755,10 +756,13 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits)
arena_t *arena;
size_t pageind;
size_t actual_mapbits;
size_t rpages_ind;
arena_run_t *run;
arena_bin_t *bin;
size_t actual_binind;
index_t run_binind, actual_binind;
arena_bin_info_t *bin_info;
arena_chunk_map_misc_t *miscelm;
void *rpages;
assert(binind != BININD_INVALID);
assert(binind < NBINS);
@@ -769,13 +773,17 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits)
assert(mapbits == actual_mapbits);
assert(arena_mapbits_large_get(chunk, pageind) == 0);
assert(arena_mapbits_allocated_get(chunk, pageind) != 0);
run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
(actual_mapbits >> LG_PAGE)) << LG_PAGE));
bin = run->bin;
rpages_ind = pageind - arena_mapbits_small_runind_get(chunk,
pageind);
miscelm = arena_miscelm_get(chunk, rpages_ind);
run = &miscelm->run;
run_binind = run->binind;
bin = &arena->bins[run_binind];
actual_binind = bin - arena->bins;
assert(binind == actual_binind);
assert(run_binind == actual_binind);
bin_info = &arena_bin_info[actual_binind];
assert(((uintptr_t)ptr - ((uintptr_t)run +
rpages = arena_miscelm_to_rpages(miscelm);
assert(((uintptr_t)ptr - ((uintptr_t)rpages +
(uintptr_t)bin_info->reg0_offset)) % bin_info->reg_interval
== 0);
}
@@ -785,10 +793,10 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits)
# endif /* JEMALLOC_ARENA_INLINE_A */
# ifdef JEMALLOC_ARENA_INLINE_B
JEMALLOC_INLINE size_t
JEMALLOC_INLINE index_t
arena_bin_index(arena_t *arena, arena_bin_t *bin)
{
size_t binind = bin - arena->bins;
index_t binind = bin - arena->bins;
assert(binind < NBINS);
return (binind);
}
@@ -798,19 +806,21 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr)
{
unsigned shift, diff, regind;
size_t interval;
arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run);
void *rpages = arena_miscelm_to_rpages(miscelm);
/*
* Freeing a pointer lower than region zero can cause assertion
* failure.
*/
assert((uintptr_t)ptr >= (uintptr_t)run +
assert((uintptr_t)ptr >= (uintptr_t)rpages +
(uintptr_t)bin_info->reg0_offset);
/*
* Avoid doing division with a variable divisor if possible. Using
* actual division here can reduce allocator throughput by over 20%!
*/
diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run -
diff = (unsigned)((uintptr_t)ptr - (uintptr_t)rpages -
bin_info->reg0_offset);
/* Rescale (factor powers of 2 out of the numerator and denominator). */
@@ -850,8 +860,8 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr)
SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31)
};
if (interval <= ((sizeof(interval_invs) / sizeof(unsigned)) +
2)) {
if (likely(interval <= ((sizeof(interval_invs) /
sizeof(unsigned)) + 2))) {
regind = (diff * interval_invs[interval - 3]) >>
SIZE_INV_SHIFT;
} else
@@ -865,10 +875,10 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr)
return (regind);
}
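The interval_invs table above turns division by a small, fixed interval into a multiply and shift: precompute inv = floor(2^k / d) + 1 once per divisor d, and for the bounded diff values that occur within a run, diff / d == (diff * inv) >> k. Standalone sketch (the shift width of 21 is an assumption for illustration):
#define INV_SHIFT 21
#define INV(d) (((1U << INV_SHIFT) / (d)) + 1)
/* For suitably bounded diff, this equals diff / d without a divide. */
unsigned q = (diff * INV(d)) >> INV_SHIFT;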
JEMALLOC_INLINE prof_ctx_t *
arena_prof_ctx_get(const void *ptr)
JEMALLOC_INLINE prof_tctx_t *
arena_prof_tctx_get(const void *ptr)
{
prof_ctx_t *ret;
prof_tctx_t *ret;
arena_chunk_t *chunk;
size_t pageind, mapbits;
@@ -880,31 +890,16 @@ arena_prof_ctx_get(const void *ptr)
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE;
mapbits = arena_mapbits_get(chunk, pageind);
assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
if ((mapbits & CHUNK_MAP_LARGE) == 0) {
if (prof_promote)
ret = (prof_ctx_t *)(uintptr_t)1U;
else {
arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
(uintptr_t)((pageind - (mapbits >> LG_PAGE)) <<
LG_PAGE));
size_t binind = arena_ptr_small_binind_get(ptr,
mapbits);
arena_bin_info_t *bin_info = &arena_bin_info[binind];
unsigned regind;
regind = arena_run_regind(run, bin_info, ptr);
ret = *(prof_ctx_t **)((uintptr_t)run +
bin_info->ctx0_offset + (regind *
sizeof(prof_ctx_t *)));
}
} else
ret = arena_mapp_get(chunk, pageind)->prof_ctx;
if (likely((mapbits & CHUNK_MAP_LARGE) == 0))
ret = (prof_tctx_t *)(uintptr_t)1U;
else
ret = arena_miscelm_get(chunk, pageind)->prof_tctx;
return (ret);
}
JEMALLOC_INLINE void
arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx)
{
arena_chunk_t *chunk;
size_t pageind;
@@ -917,59 +912,42 @@ arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE;
assert(arena_mapbits_allocated_get(chunk, pageind) != 0);
if (usize > SMALL_MAXCLASS || (prof_promote &&
((uintptr_t)ctx != (uintptr_t)1U || arena_mapbits_large_get(chunk,
pageind) != 0))) {
assert(arena_mapbits_large_get(chunk, pageind) != 0);
arena_mapp_get(chunk, pageind)->prof_ctx = ctx;
} else {
assert(arena_mapbits_large_get(chunk, pageind) == 0);
if (prof_promote == false) {
size_t mapbits = arena_mapbits_get(chunk, pageind);
arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
(uintptr_t)((pageind - (mapbits >> LG_PAGE)) <<
LG_PAGE));
size_t binind;
arena_bin_info_t *bin_info;
unsigned regind;
binind = arena_ptr_small_binind_get(ptr, mapbits);
bin_info = &arena_bin_info[binind];
regind = arena_run_regind(run, bin_info, ptr);
*((prof_ctx_t **)((uintptr_t)run +
bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t
*)))) = ctx;
}
}
if (unlikely(arena_mapbits_large_get(chunk, pageind) != 0))
arena_miscelm_get(chunk, pageind)->prof_tctx = tctx;
}
JEMALLOC_ALWAYS_INLINE void *
arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache)
arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero,
bool try_tcache)
{
tcache_t *tcache;
assert(size != 0);
assert(size <= arena_maxclass);
if (size <= SMALL_MAXCLASS) {
if (try_tcache && (tcache = tcache_get(true)) != NULL)
if (likely(size <= SMALL_MAXCLASS)) {
if (likely(try_tcache) && likely((tcache = tcache_get(tsd,
true)) != NULL))
return (tcache_alloc_small(tcache, size, zero));
else {
return (arena_malloc_small(choose_arena(arena), size,
zero));
arena = arena_choose(tsd, arena);
if (unlikely(arena == NULL))
return (NULL);
return (arena_malloc_small(arena, size, zero));
}
} else {
/*
* Initialize tcache after checking size in order to avoid
* infinite recursion during tcache initialization.
*/
if (try_tcache && size <= tcache_maxclass && (tcache =
tcache_get(true)) != NULL)
if (try_tcache && size <= tcache_maxclass && likely((tcache =
tcache_get(tsd, true)) != NULL))
return (tcache_alloc_large(tcache, size, zero));
else {
return (arena_malloc_large(choose_arena(arena), size,
zero));
arena = arena_choose(tsd, arena);
if (unlikely(arena == NULL))
return (NULL);
return (arena_malloc_large(arena, size, zero));
}
}
}
@@ -980,7 +958,8 @@ arena_salloc(const void *ptr, bool demote)
{
size_t ret;
arena_chunk_t *chunk;
size_t pageind, binind;
size_t pageind;
index_t binind;
assert(ptr != NULL);
assert(CHUNK_ADDR2BASE(ptr) != ptr);
@@ -989,71 +968,103 @@ arena_salloc(const void *ptr, bool demote)
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE;
assert(arena_mapbits_allocated_get(chunk, pageind) != 0);
binind = arena_mapbits_binind_get(chunk, pageind);
if (binind == BININD_INVALID || (config_prof && demote == false &&
prof_promote && arena_mapbits_large_get(chunk, pageind) != 0)) {
if (unlikely(binind == BININD_INVALID || (config_prof && !demote &&
arena_mapbits_large_get(chunk, pageind) != 0))) {
/*
* Large allocation. In the common case (demote == true), and
* as this is an inline function, most callers will only end up
* looking at binind to determine that ptr is a small
* allocation.
* Large allocation. In the common case (demote), and as this
* is an inline function, most callers will only end up looking
* at binind to determine that ptr is a small allocation.
*/
assert(((uintptr_t)ptr & PAGE_MASK) == 0);
ret = arena_mapbits_large_size_get(chunk, pageind);
assert(ret != 0);
assert(pageind + (ret>>LG_PAGE) <= chunk_npages);
assert(ret == PAGE || arena_mapbits_large_size_get(chunk,
pageind+(ret>>LG_PAGE)-1) == 0);
assert(binind == arena_mapbits_binind_get(chunk,
pageind+(ret>>LG_PAGE)-1));
assert(arena_mapbits_dirty_get(chunk, pageind) ==
arena_mapbits_dirty_get(chunk, pageind+(ret>>LG_PAGE)-1));
} else {
/*
* Small allocation (possibly promoted to a large object due to
* prof_promote).
*/
/* Small allocation (possibly promoted to a large object). */
assert(arena_mapbits_large_get(chunk, pageind) != 0 ||
arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk,
pageind)) == binind);
ret = arena_bin_info[binind].reg_size;
ret = index2size(binind);
}
return (ret);
}
JEMALLOC_ALWAYS_INLINE void
arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr, bool try_tcache)
arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, bool try_tcache)
{
size_t pageind, mapbits;
tcache_t *tcache;
assert(arena != NULL);
assert(chunk->arena == arena);
assert(ptr != NULL);
assert(CHUNK_ADDR2BASE(ptr) != ptr);
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE;
mapbits = arena_mapbits_get(chunk, pageind);
assert(arena_mapbits_allocated_get(chunk, pageind) != 0);
if ((mapbits & CHUNK_MAP_LARGE) == 0) {
if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) {
/* Small allocation. */
if (try_tcache && (tcache = tcache_get(false)) != NULL) {
size_t binind;
binind = arena_ptr_small_binind_get(ptr, mapbits);
if (likely(try_tcache) && likely((tcache = tcache_get(tsd,
false)) != NULL)) {
index_t binind = arena_ptr_small_binind_get(ptr,
mapbits);
tcache_dalloc_small(tcache, ptr, binind);
} else
arena_dalloc_small(arena, chunk, ptr, pageind);
arena_dalloc_small(chunk->arena, chunk, ptr, pageind);
} else {
size_t size = arena_mapbits_large_size_get(chunk, pageind);
assert(((uintptr_t)ptr & PAGE_MASK) == 0);
if (try_tcache && size <= tcache_maxclass && (tcache =
tcache_get(false)) != NULL) {
if (try_tcache && size <= tcache_maxclass && likely((tcache =
tcache_get(tsd, false)) != NULL))
tcache_dalloc_large(tcache, ptr, size);
} else
arena_dalloc_large(arena, chunk, ptr);
else
arena_dalloc_large(chunk->arena, chunk, ptr);
}
}
JEMALLOC_ALWAYS_INLINE void
arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size,
bool try_tcache)
{
tcache_t *tcache;
assert(ptr != NULL);
assert(CHUNK_ADDR2BASE(ptr) != ptr);
if (config_prof && opt_prof) {
size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE;
assert(arena_mapbits_allocated_get(chunk, pageind) != 0);
if (arena_mapbits_large_get(chunk, pageind) != 0) {
/* Make sure to use promoted size, not request size. */
assert(((uintptr_t)ptr & PAGE_MASK) == 0);
size = arena_mapbits_large_size_get(chunk, pageind);
}
}
assert(s2u(size) == s2u(arena_salloc(ptr, false)));
if (likely(size <= SMALL_MAXCLASS)) {
/* Small allocation. */
if (likely(try_tcache) && likely((tcache = tcache_get(tsd,
false)) != NULL)) {
index_t binind = size2index(size);
tcache_dalloc_small(tcache, ptr, binind);
} else {
size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >>
LG_PAGE;
arena_dalloc_small(chunk->arena, chunk, ptr, pageind);
}
} else {
assert(((uintptr_t)ptr & PAGE_MASK) == 0);
if (try_tcache && size <= tcache_maxclass && (tcache =
tcache_get(tsd, false)) != NULL)
tcache_dalloc_large(tcache, ptr, size);
else
arena_dalloc_large(chunk->arena, chunk, ptr);
}
}
# endif /* JEMALLOC_ARENA_INLINE_B */
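Throughout the inlines above, the patch annotates hot paths with likely()/unlikely(). These are conventionally thin wrappers over the compiler's branch-prediction hint; a sketch of the usual GCC-style definition (jemalloc's actual macros live elsewhere in the tree):
#ifdef __GNUC__
#  define likely(x)	__builtin_expect(!!(x), 1)
#  define unlikely(x)	__builtin_expect(!!(x), 0)
#else
#  define likely(x)	(x)
#  define unlikely(x)	(x)
#endif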

View File

@@ -18,6 +18,17 @@
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
/*
* All functions return the arithmetic result of the atomic operation. Some
* atomic operation APIs return the value prior to mutation, in which case the
* following functions must redundantly compute the result so that it can be
* returned. These functions are normally inlined, so the extra operations can
* be optimized away if the return values aren't used by the callers.
*
* <t> atomic_add_<t>(<t> *p, <t> x) { return (*p + x); }
* <t> atomic_sub_<t>(<t> *p, <t> x) { return (*p - x); }
*/
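This contract explains the fixes in the hunks below: InterlockedExchangeAdd*, x86 xadd, and C11 atomic_fetch_add/atomic_fetch_sub all return the value prior to mutation, so each wrapper must apply x once more to return the arithmetic result. Usage sketch:
uint64_t counter = 0;
uint64_t a = atomic_add_uint64(&counter, 5);	/* a == 5, counter == 5 */
uint64_t b = atomic_sub_uint64(&counter, 2);	/* b == 3, counter == 3 */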
#ifndef JEMALLOC_ENABLE_INLINE
uint64_t atomic_add_uint64(uint64_t *p, uint64_t x);
uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x);
@@ -47,21 +58,35 @@ atomic_sub_uint64(uint64_t *p, uint64_t x)
return (__sync_sub_and_fetch(p, x));
}
#elif (defined(_MSC_VER))
# elif (defined(_MSC_VER))
JEMALLOC_INLINE uint64_t
atomic_add_uint64(uint64_t *p, uint64_t x)
{
return (InterlockedExchangeAdd64(p, x));
return (InterlockedExchangeAdd64(p, x) + x);
}
JEMALLOC_INLINE uint64_t
atomic_sub_uint64(uint64_t *p, uint64_t x)
{
return (InterlockedExchangeAdd64(p, -((int64_t)x)));
return (InterlockedExchangeAdd64(p, -((int64_t)x)) - x);
}
#elif (defined(JEMALLOC_OSATOMIC))
# elif (defined(JEMALLOC_C11ATOMICS))
JEMALLOC_INLINE uint64_t
atomic_add_uint64(uint64_t *p, uint64_t x)
{
volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
return (atomic_fetch_add(a, x) + x);
}
JEMALLOC_INLINE uint64_t
atomic_sub_uint64(uint64_t *p, uint64_t x)
{
volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
return (atomic_fetch_sub(a, x) - x);
}
# elif (defined(JEMALLOC_OSATOMIC))
JEMALLOC_INLINE uint64_t
atomic_add_uint64(uint64_t *p, uint64_t x)
{
@@ -79,28 +104,31 @@ atomic_sub_uint64(uint64_t *p, uint64_t x)
JEMALLOC_INLINE uint64_t
atomic_add_uint64(uint64_t *p, uint64_t x)
{
uint64_t t = x;
asm volatile (
"lock; xaddq %0, %1;"
: "+r" (x), "=m" (*p) /* Outputs. */
: "+r" (t), "=m" (*p) /* Outputs. */
: "m" (*p) /* Inputs. */
);
return (x);
return (t + x);
}
JEMALLOC_INLINE uint64_t
atomic_sub_uint64(uint64_t *p, uint64_t x)
{
uint64_t t;
x = (uint64_t)(-(int64_t)x);
t = x;
asm volatile (
"lock; xaddq %0, %1;"
: "+r" (x), "=m" (*p) /* Outputs. */
: "+r" (t), "=m" (*p) /* Outputs. */
: "m" (*p) /* Inputs. */
);
return (x);
return (t + x);
}
# elif (defined(JEMALLOC_ATOMIC9))
JEMALLOC_INLINE uint64_t
@@ -164,14 +192,28 @@ JEMALLOC_INLINE uint32_t
atomic_add_uint32(uint32_t *p, uint32_t x)
{
return (InterlockedExchangeAdd(p, x));
return (InterlockedExchangeAdd(p, x) + x);
}
JEMALLOC_INLINE uint32_t
atomic_sub_uint32(uint32_t *p, uint32_t x)
{
return (InterlockedExchangeAdd(p, -((int32_t)x)));
return (InterlockedExchangeAdd(p, -((int32_t)x)) - x);
}
# elif (defined(JEMALLOC_C11ATOMICS))
JEMALLOC_INLINE uint32_t
atomic_add_uint32(uint32_t *p, uint32_t x)
{
volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
return (atomic_fetch_add(a, x) + x);
}
JEMALLOC_INLINE uint32_t
atomic_sub_uint32(uint32_t *p, uint32_t x)
{
volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
return (atomic_fetch_sub(a, x) - x);
}
#elif (defined(JEMALLOC_OSATOMIC))
JEMALLOC_INLINE uint32_t
@@ -191,28 +233,31 @@ atomic_sub_uint32(uint32_t *p, uint32_t x)
JEMALLOC_INLINE uint32_t
atomic_add_uint32(uint32_t *p, uint32_t x)
{
uint32_t t = x;
asm volatile (
"lock; xaddl %0, %1;"
: "+r" (x), "=m" (*p) /* Outputs. */
: "+r" (t), "=m" (*p) /* Outputs. */
: "m" (*p) /* Inputs. */
);
return (x);
return (t + x);
}
JEMALLOC_INLINE uint32_t
atomic_sub_uint32(uint32_t *p, uint32_t x)
{
uint32_t t;
x = (uint32_t)(-(int32_t)x);
t = x;
asm volatile (
"lock; xaddl %0, %1;"
: "+r" (x), "=m" (*p) /* Outputs. */
: "+r" (t), "=m" (*p) /* Outputs. */
: "m" (*p) /* Inputs. */
);
return (x);
return (t + x);
}
#elif (defined(JEMALLOC_ATOMIC9))
JEMALLOC_INLINE uint32_t

View File

@@ -12,7 +12,7 @@
void *base_alloc(size_t size);
void *base_calloc(size_t number, size_t size);
extent_node_t *base_node_alloc(void);
void base_node_dealloc(extent_node_t *node);
void base_node_dalloc(extent_node_t *node);
bool base_boot(void);
void base_prefork(void);
void base_postfork_parent(void);

View File

@@ -3,6 +3,7 @@
/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
#define LG_BITMAP_MAXBITS LG_RUN_MAXREGS
#define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS)
typedef struct bitmap_level_s bitmap_level_t;
typedef struct bitmap_info_s bitmap_info_t;
@@ -14,6 +15,51 @@ typedef unsigned long bitmap_t;
#define BITMAP_GROUP_NBITS (ZU(1) << LG_BITMAP_GROUP_NBITS)
#define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS-1)
/* Number of groups required to store a given number of bits. */
#define BITMAP_BITS2GROUPS(nbits) \
((nbits + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS)
/*
* Number of groups required at a particular level for a given number of bits.
*/
#define BITMAP_GROUPS_L0(nbits) \
BITMAP_BITS2GROUPS(nbits)
#define BITMAP_GROUPS_L1(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits))
#define BITMAP_GROUPS_L2(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))
#define BITMAP_GROUPS_L3(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \
BITMAP_BITS2GROUPS((nbits)))))
/*
* Assuming the number of levels, number of groups required for a given number
* of bits.
*/
#define BITMAP_GROUPS_1_LEVEL(nbits) \
BITMAP_GROUPS_L0(nbits)
#define BITMAP_GROUPS_2_LEVEL(nbits) \
(BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits))
#define BITMAP_GROUPS_3_LEVEL(nbits) \
(BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits))
#define BITMAP_GROUPS_4_LEVEL(nbits) \
(BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits))
/*
* Maximum number of groups required to support LG_BITMAP_MAXBITS.
*/
#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS)
#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS)
#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS)
#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS)
#else
# error "Unsupported bitmap size"
#endif
/* Maximum number of levels possible. */
#define BITMAP_MAX_LEVELS \
(LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \
@@ -93,7 +139,7 @@ bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
bitmap_t g;
assert(bit < binfo->nbits);
assert(bitmap_get(bitmap, binfo, bit) == false);
assert(!bitmap_get(bitmap, binfo, bit));
goff = bit >> LG_BITMAP_GROUP_NBITS;
gp = &bitmap[goff];
g = *gp;
@@ -126,7 +172,7 @@ bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo)
bitmap_t g;
unsigned i;
assert(bitmap_full(bitmap, binfo) == false);
assert(!bitmap_full(bitmap, binfo));
i = binfo->nlevels - 1;
g = bitmap[binfo->levels[i].group_offset];
@@ -158,7 +204,7 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) == 0);
g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
*gp = g;
assert(bitmap_get(bitmap, binfo, bit) == false);
assert(!bitmap_get(bitmap, binfo, bit));
/* Propagate group state transitions up the tree. */
if (propagate) {
unsigned i;
@@ -172,7 +218,7 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
== 0);
g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
*gp = g;
if (propagate == false)
if (!propagate)
break;
}
}
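Worked example of the group-count macros added above, assuming 64-bit groups (LG_BITMAP_GROUP_NBITS == 6): for nbits == 2048, the leaf level needs BITMAP_BITS2GROUPS(2048) == 32 groups and the next level BITMAP_BITS2GROUPS(32) == 1, and since 2^6 < 2048 <= 2^12, BITMAP_GROUPS_2_LEVEL is the applicable case.
/* The same arithmetic, spelled out: */
size_t l0 = (2048 + 63) >> 6;	/* 32 leaf groups */
size_t l1 = (l0 + 63) >> 6;	/* 1 summary group */
size_t total = l0 + l1;		/* 33 == BITMAP_GROUPS_2_LEVEL(2048) */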

View File

@@ -40,13 +40,15 @@ extern rtree_t *chunks_rtree;
extern size_t chunksize;
extern size_t chunksize_mask; /* (chunksize - 1). */
extern size_t chunk_npages;
extern size_t map_bias; /* Number of arena chunk header pages. */
extern size_t arena_maxclass; /* Max size class for arenas. */
void *chunk_alloc(size_t size, size_t alignment, bool base, bool *zero,
dss_prec_t dss_prec);
void *chunk_alloc_base(size_t size);
void *chunk_alloc_arena(chunk_alloc_t *chunk_alloc,
chunk_dalloc_t *chunk_dalloc, unsigned arena_ind, void *new_addr,
size_t size, size_t alignment, bool *zero);
void *chunk_alloc_default(void *new_addr, size_t size, size_t alignment,
bool *zero, unsigned arena_ind);
void chunk_unmap(void *chunk, size_t size);
void chunk_dealloc(void *chunk, size_t size, bool unmap);
bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind);
bool chunk_boot(void);
void chunk_prefork(void);
void chunk_postfork_parent(void);
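chunk_alloc_default and chunk_dalloc_default above also fix the shape of user-installable hooks (the chunk_alloc_t/chunk_dalloc_t fields the arena now carries). A minimal sketch of a custom pair backed directly by mmap, assuming the hook typedefs match these default signatures; it ignores the new_addr/alignment handling a production hook needs:
#include <sys/mman.h>
static void *
my_chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero,
    unsigned arena_ind)
{
	void *ret = mmap(new_addr, size, PROT_READ|PROT_WRITE,
	    MAP_PRIVATE|MAP_ANON, -1, 0);
	if (ret == MAP_FAILED)
		return (NULL);
	*zero = true;	/* fresh anonymous pages are zero-filled */
	return (ret);
}
static bool
my_chunk_dalloc(void *chunk, size_t size, unsigned arena_ind)
{
	/* Returning true signals failure to deallocate. */
	return (munmap(chunk, size) != 0);
}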

View File

@@ -23,7 +23,8 @@ extern const char *dss_prec_names[];
dss_prec_t chunk_dss_prec_get(void);
bool chunk_dss_prec_set(dss_prec_t dss_prec);
void *chunk_alloc_dss(size_t size, size_t alignment, bool *zero);
void *chunk_alloc_dss(void *new_addr, size_t size, size_t alignment,
bool *zero);
bool chunk_in_dss(void *chunk);
bool chunk_dss_boot(void);
void chunk_dss_prefork(void);

View File

@@ -12,7 +12,7 @@
bool pages_purge(void *addr, size_t length);
void *chunk_alloc_mmap(size_t size, size_t alignment, bool *zero);
bool chunk_dealloc_mmap(void *chunk, size_t size);
bool chunk_dalloc_mmap(void *chunk, size_t size);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/

View File

@@ -66,13 +66,13 @@ struct ckh_s {
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
bool ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
bool ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
ckh_keycomp_t *keycomp);
void ckh_delete(ckh_t *ckh);
void ckh_delete(tsd_t *tsd, ckh_t *ckh);
size_t ckh_count(ckh_t *ckh);
bool ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data);
bool ckh_insert(ckh_t *ckh, const void *key, const void *data);
bool ckh_remove(ckh_t *ckh, const void *searchkey, void **key,
bool ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data);
bool ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key,
void **data);
bool ckh_search(ckh_t *ckh, const void *seachkey, void **key, void **data);
void ckh_string_hash(const void *key, size_t r_hash[2]);

View File

@@ -46,6 +46,7 @@ struct ctl_arena_stats_s {
malloc_bin_stats_t bstats[NBINS];
malloc_large_stats_t *lstats; /* nlclasses elements. */
malloc_huge_stats_t *hstats; /* nhclasses elements. */
};
struct ctl_stats_s {
@@ -57,11 +58,6 @@ struct ctl_stats_s {
uint64_t total; /* stats_chunks.nchunks */
size_t high; /* stats_chunks.highchunks */
} chunks;
struct {
size_t allocated; /* huge_allocated */
uint64_t nmalloc; /* huge_nmalloc */
uint64_t ndalloc; /* huge_ndalloc */
} huge;
unsigned narenas;
ctl_arena_stats_t *arenas; /* (narenas + 1) elements. */
};

View File

@@ -16,7 +16,7 @@ struct extent_node_s {
rb_node(extent_node_t) link_ad;
/* Profile counters, used for huge objects. */
prof_ctx_t *prof_ctx;
prof_tctx_t *prof_tctx;
/* Pointer to the extent that this tree node is responsible for. */
void *addr;
@@ -24,6 +24,9 @@ struct extent_node_s {
/* Total region size. */
size_t size;
/* Arena from which this extent came, if any. */
arena_t *arena;
/* True if zero-filled; used by chunk recycling code. */
bool zeroed;
};

View File

@@ -35,13 +35,14 @@ JEMALLOC_INLINE uint32_t
hash_rotl_32(uint32_t x, int8_t r)
{
return (x << r) | (x >> (32 - r));
return ((x << r) | (x >> (32 - r)));
}
JEMALLOC_INLINE uint64_t
hash_rotl_64(uint64_t x, int8_t r)
{
return (x << r) | (x >> (64 - r));
return ((x << r) | (x >> (64 - r)));
}
JEMALLOC_INLINE uint32_t
@@ -76,9 +77,9 @@ hash_fmix_64(uint64_t k)
{
k ^= k >> 33;
k *= QU(0xff51afd7ed558ccdULL);
k *= KQU(0xff51afd7ed558ccd);
k ^= k >> 33;
k *= QU(0xc4ceb9fe1a85ec53ULL);
k *= KQU(0xc4ceb9fe1a85ec53);
k ^= k >> 33;
return (k);
@@ -247,8 +248,8 @@ hash_x64_128(const void *key, const int len, const uint32_t seed,
uint64_t h1 = seed;
uint64_t h2 = seed;
const uint64_t c1 = QU(0x87c37b91114253d5ULL);
const uint64_t c2 = QU(0x4cf5ad432745937fULL);
const uint64_t c1 = KQU(0x87c37b91114253d5);
const uint64_t c2 = KQU(0x4cf5ad432745937f);
/* body */
{

View File

@@ -9,30 +9,23 @@
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
/* Huge allocation statistics. */
extern uint64_t huge_nmalloc;
extern uint64_t huge_ndalloc;
extern size_t huge_allocated;
/* Protects chunk-related data structures. */
extern malloc_mutex_t huge_mtx;
void *huge_malloc(size_t size, bool zero, dss_prec_t dss_prec);
void *huge_palloc(size_t size, size_t alignment, bool zero,
dss_prec_t dss_prec);
void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero,
bool try_tcache);
void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
bool zero, bool try_tcache);
bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
size_t extra);
void *huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
size_t alignment, bool zero, bool try_tcache_dalloc, dss_prec_t dss_prec);
size_t extra, bool zero);
void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize,
size_t size, size_t extra, size_t alignment, bool zero,
bool try_tcache_alloc, bool try_tcache_dalloc);
#ifdef JEMALLOC_JET
typedef void (huge_dalloc_junk_t)(void *, size_t);
extern huge_dalloc_junk_t *huge_dalloc_junk;
#endif
void huge_dalloc(void *ptr, bool unmap);
void huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache);
size_t huge_salloc(const void *ptr);
dss_prec_t huge_dss_prec_get(arena_t *arena);
prof_ctx_t *huge_prof_ctx_get(const void *ptr);
void huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
prof_tctx_t *huge_prof_tctx_get(const void *ptr);
void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx);
bool huge_boot(void);
void huge_prefork(void);
void huge_postfork_parent(void);

View File

@@ -0,0 +1,60 @@
#ifndef JEMALLOC_INTERNAL_DECLS_H
#define JEMALLOC_INTERNAL_DECLS_H
#include <math.h>
#ifdef _WIN32
# include <windows.h>
# define ENOENT ERROR_PATH_NOT_FOUND
# define EINVAL ERROR_BAD_ARGUMENTS
# define EAGAIN ERROR_OUTOFMEMORY
# define EPERM ERROR_WRITE_FAULT
# define EFAULT ERROR_INVALID_ADDRESS
# define ENOMEM ERROR_NOT_ENOUGH_MEMORY
# undef ERANGE
# define ERANGE ERROR_INVALID_DATA
#else
# include <sys/param.h>
# include <sys/mman.h>
# if !defined(__pnacl__) && !defined(__native_client__)
# include <sys/syscall.h>
# if !defined(SYS_write) && defined(__NR_write)
# define SYS_write __NR_write
# endif
# include <sys/uio.h>
# endif
# include <pthread.h>
# include <errno.h>
#endif
#include <sys/types.h>
#include <limits.h>
#ifndef SIZE_T_MAX
# define SIZE_T_MAX SIZE_MAX
#endif
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#ifndef offsetof
# define offsetof(type, member) ((size_t)&(((type *)NULL)->member))
#endif
#include <inttypes.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#ifdef _MSC_VER
# include <io.h>
typedef intptr_t ssize_t;
# define PATH_MAX 1024
# define STDERR_FILENO 2
# define __func__ __FUNCTION__
/* Disable warnings about deprecated system functions. */
# pragma warning(disable: 4996)
#else
# include <unistd.h>
#endif
#include <fcntl.h>
#endif /* JEMALLOC_INTERNAL_DECLS_H */

View File

@@ -22,6 +22,9 @@
*/
#undef CPU_SPINWAIT
/* Defined if C11 atomics are available. */
#undef JEMALLOC_C11ATOMICS
/* Defined if the equivalent of FreeBSD's atomic(9) functions are available. */
#undef JEMALLOC_ATOMIC9
@@ -35,7 +38,7 @@
* Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and
* __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite
* __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the
* functions are defined in libgcc instead of being inlines)
* functions are defined in libgcc instead of being inlines).
*/
#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4
@@ -43,16 +46,36 @@
* Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and
* __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite
* __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the
* functions are defined in libgcc instead of being inlines)
* functions are defined in libgcc instead of being inlines).
*/
#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8
/*
* Defined if __builtin_clz() and __builtin_clzl() are available.
*/
#undef JEMALLOC_HAVE_BUILTIN_CLZ
/*
* Defined if madvise(2) is available.
*/
#undef JEMALLOC_HAVE_MADVISE
/*
* Defined if OSSpin*() functions are available, as provided by Darwin, and
* documented in the spinlock(3) manual page.
*/
#undef JEMALLOC_OSSPIN
/*
* Defined if secure_getenv(3) is available.
*/
#undef JEMALLOC_HAVE_SECURE_GETENV
/*
* Defined if issetugid(2) is available.
*/
#undef JEMALLOC_HAVE_ISSETUGID
/*
* Defined if _malloc_thread_cleanup() exists. At least in the case of
* FreeBSD, pthread_key_create() allocates, which if used during malloc
@@ -76,9 +99,6 @@
*/
#undef JEMALLOC_MUTEX_INIT_CB
/* Defined if sbrk() is supported. */
#undef JEMALLOC_HAVE_SBRK
/* Non-empty if the tls_model attribute is supported. */
#undef JEMALLOC_TLS_MODEL
@@ -137,8 +157,17 @@
/* Support lazy locking (avoid locking unless a second thread is launched). */
#undef JEMALLOC_LAZY_LOCK
/* One page is 2^STATIC_PAGE_SHIFT bytes. */
#undef STATIC_PAGE_SHIFT
/* Minimum size class to support is 2^LG_TINY_MIN bytes. */
#undef LG_TINY_MIN
/*
* Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
* classes).
*/
#undef LG_QUANTUM
/* One page is 2^LG_PAGE bytes. */
#undef LG_PAGE
/*
* If defined, use munmap() to unmap freed chunks, rather than storing them for
@@ -147,13 +176,6 @@
*/
#undef JEMALLOC_MUNMAP
/*
* If defined, use mremap(...MREMAP_FIXED...) for huge realloc(). This is
* disabled by default because it is Linux-specific and it will cause virtual
* memory map holes, much like munmap(2) does.
*/
#undef JEMALLOC_MREMAP
/* TLS is used to map arenas and magazine caches to threads. */
#undef JEMALLOC_TLS
@@ -189,9 +211,7 @@
#undef JEMALLOC_PURGE_MADVISE_DONTNEED
#undef JEMALLOC_PURGE_MADVISE_FREE
/*
* Define if operating system has alloca.h header.
*/
/* Define if operating system has alloca.h header. */
#undef JEMALLOC_HAS_ALLOCA_H
/* C99 restrict keyword supported. */
@@ -209,4 +229,13 @@
/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */
#undef LG_SIZEOF_INTMAX_T
/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */
#undef JEMALLOC_GLIBC_MALLOC_HOOK
/* glibc memalign hook. */
#undef JEMALLOC_GLIBC_MEMALIGN_HOOK
/* Adaptive mutex support in pthreads. */
#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP
#endif /* JEMALLOC_INTERNAL_DEFS_H_ */

View File

@@ -39,9 +39,15 @@
#endif
#define ZU(z) ((size_t)z)
#define ZI(z) ((ssize_t)z)
#define QU(q) ((uint64_t)q)
#define QI(q) ((int64_t)q)
#define KZU(z) ZU(z##ULL)
#define KZI(z) ZI(z##LL)
#define KQU(q) QU(q##ULL)
#define KQI(q) QI(q##LL)
#ifndef __DECONST
# define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
#endif
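The new K* variants paste the integer-literal suffix onto the constant before casting, keeping large 64-bit constants portable. For example, KQU(0xff51afd7ed558ccd), as used in the hash.h hunk above, expands in two steps:
KQU(0xff51afd7ed558ccd)
	/* -> QU(0xff51afd7ed558ccdULL)
	 * -> ((uint64_t)0xff51afd7ed558ccdULL) */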

View File

@@ -10,7 +10,7 @@ typedef struct malloc_mutex_s malloc_mutex_t;
#elif (defined(JEMALLOC_MUTEX_INIT_CB))
# define MALLOC_MUTEX_INITIALIZER {PTHREAD_MUTEX_INITIALIZER, NULL}
#else
# if (defined(PTHREAD_MUTEX_ADAPTIVE_NP) && \
# if (defined(JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP) && \
defined(PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP))
# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_ADAPTIVE_NP
# define MALLOC_MUTEX_INITIALIZER {PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP}

View File

@@ -1,20 +1,33 @@
a0calloc
a0free
a0get
a0malloc
arena_get
arena_get_hard
arena_alloc_junk_small
arena_bin_index
arena_bin_info
arena_bitselm_get
arena_boot
arena_choose
arena_choose_hard
arena_chunk_alloc_huge
arena_chunk_dalloc_huge
arena_chunk_ralloc_huge_expand
arena_chunk_ralloc_huge_shrink
arena_chunk_ralloc_huge_similar
arena_cleanup
arena_dalloc
arena_dalloc_bin
arena_dalloc_bin_locked
arena_dalloc_bin_junked_locked
arena_dalloc_junk_large
arena_dalloc_junk_small
arena_dalloc_large
arena_dalloc_large_locked
arena_dalloc_large_junked_locked
arena_dalloc_small
arena_dss_prec_get
arena_dss_prec_set
arena_init
arena_malloc
arena_malloc_large
arena_malloc_small
@@ -36,8 +49,13 @@ arena_mapbits_unzeroed_set
arena_mapbitsp_get
arena_mapbitsp_read
arena_mapbitsp_write
arena_mapp_get
arena_maxclass
arena_maxrun
arena_migrate
arena_miscelm_get
arena_miscelm_to_pageind
arena_miscelm_to_rpages
arena_nbound
arena_new
arena_palloc
arena_postfork_child
@@ -46,9 +64,9 @@ arena_prefork
arena_prof_accum
arena_prof_accum_impl
arena_prof_accum_locked
arena_prof_ctx_get
arena_prof_ctx_set
arena_prof_promoted
arena_prof_tctx_get
arena_prof_tctx_set
arena_ptr_small_binind_get
arena_purge_all
arena_quarantine_junk_small
@@ -57,23 +75,13 @@ arena_ralloc_junk_large
arena_ralloc_no_move
arena_redzone_corruption
arena_run_regind
arena_run_to_miscelm
arena_salloc
arena_sdalloc
arena_stats_merge
arena_tcache_fill_small
arenas
arenas_booted
arenas_cleanup
arenas_extend
arenas_initialized
arenas_lock
arenas_tls
arenas_tsd
arenas_tsd_boot
arenas_tsd_cleanup_wrapper
arenas_tsd_get
arenas_tsd_get_wrapper
arenas_tsd_init_head
arenas_tsd_set
arenas_cache_bypass_cleanup
arenas_cache_cleanup
atomic_add_u
atomic_add_uint32
atomic_add_uint64
@@ -86,7 +94,7 @@ base_alloc
base_boot
base_calloc
base_node_alloc
base_node_dealloc
base_node_dalloc
base_postfork_child
base_postfork_parent
base_prefork
@@ -101,14 +109,14 @@ bitmap_size
bitmap_unset
bt_init
buferror
choose_arena
choose_arena_hard
chunk_alloc
chunk_alloc_arena
chunk_alloc_base
chunk_alloc_default
chunk_alloc_dss
chunk_alloc_mmap
chunk_boot
chunk_dealloc
chunk_dealloc_mmap
chunk_dalloc_default
chunk_dalloc_mmap
chunk_dss_boot
chunk_dss_postfork_child
chunk_dss_postfork_parent
@@ -197,41 +205,45 @@ huge_allocated
huge_boot
huge_dalloc
huge_dalloc_junk
huge_dss_prec_get
huge_malloc
huge_mtx
huge_ndalloc
huge_nmalloc
huge_palloc
huge_postfork_child
huge_postfork_parent
huge_prefork
huge_prof_ctx_get
huge_prof_ctx_set
huge_prof_tctx_get
huge_prof_tctx_set
huge_ralloc
huge_ralloc_no_move
huge_salloc
iallocm
icalloc
icalloct
idalloc
idalloct
imalloc
imalloct
in_valgrind
index2size
index2size_compute
index2size_lookup
index2size_tab
ipalloc
ipalloct
iqalloc
iqalloct
iralloc
iralloct
iralloct_realign
isalloc
isdalloct
isthreaded
isqalloc
ivsalloc
ixalloc
jemalloc_postfork_child
jemalloc_postfork_parent
jemalloc_prefork
lg_floor
malloc_cprintf
malloc_mutex_init
malloc_mutex_lock
@@ -242,7 +254,8 @@ malloc_mutex_unlock
malloc_printf
malloc_snprintf
malloc_strtoumax
malloc_tsd_boot
malloc_tsd_boot0
malloc_tsd_boot1
malloc_tsd_cleanup_register
malloc_tsd_dalloc
malloc_tsd_malloc
@@ -251,16 +264,18 @@ malloc_vcprintf
malloc_vsnprintf
malloc_write
map_bias
map_misc_offset
mb_write
mutex_boot
narenas_auto
narenas_total
narenas_cache_cleanup
narenas_total_get
ncpus
nhbins
opt_abort
opt_dss
opt_junk
opt_junk_alloc
opt_junk_free
opt_lg_chunk
opt_lg_dirty_mult
opt_lg_prof_interval
@@ -279,61 +294,54 @@ opt_redzone
opt_stats_print
opt_tcache
opt_utrace
opt_valgrind
opt_xmalloc
opt_zero
p2rz
pages_purge
pow2_ceil
prof_active_get
prof_active_get_unlocked
prof_active_set
prof_alloc_prep
prof_alloc_rollback
prof_backtrace
prof_boot0
prof_boot1
prof_boot2
prof_bt_count
prof_ctx_get
prof_ctx_set
prof_dump_header
prof_dump_open
prof_free
prof_free_sampled_object
prof_gdump
prof_idump
prof_interval
prof_lookup
prof_malloc
prof_malloc_sample_object
prof_mdump
prof_postfork_child
prof_postfork_parent
prof_prefork
prof_promote
prof_realloc
prof_reset
prof_sample_accum_update
prof_sample_threshold_update
prof_tdata_booted
prof_tctx_get
prof_tctx_set
prof_tdata_cleanup
prof_tdata_get
prof_tdata_init
prof_tdata_initialized
prof_tdata_tls
prof_tdata_tsd
prof_tdata_tsd_boot
prof_tdata_tsd_cleanup_wrapper
prof_tdata_tsd_get
prof_tdata_tsd_get_wrapper
prof_tdata_tsd_init_head
prof_tdata_tsd_set
prof_thread_active_get
prof_thread_active_init_get
prof_thread_active_init_set
prof_thread_active_set
prof_thread_name_get
prof_thread_name_set
quarantine
quarantine_alloc_hook
quarantine_boot
quarantine_booted
quarantine_alloc_hook_work
quarantine_cleanup
quarantine_init
quarantine_tls
quarantine_tsd
quarantine_tsd_boot
quarantine_tsd_cleanup_wrapper
quarantine_tsd_get
quarantine_tsd_get_wrapper
quarantine_tsd_init_head
quarantine_tsd_set
register_zone
rtree_delete
rtree_get
@@ -344,9 +352,14 @@ rtree_postfork_parent
rtree_prefork
rtree_set
s2u
s2u_compute
s2u_lookup
sa2u
set_errno
small_size2bin
size2index
size2index_compute
size2index_lookup
size2index_tab
stats_cactive
stats_cactive_add
stats_cactive_get
@@ -359,55 +372,62 @@ tcache_alloc_small
tcache_alloc_small_hard
tcache_arena_associate
tcache_arena_dissociate
tcache_arena_reassociate
tcache_bin_flush_large
tcache_bin_flush_small
tcache_bin_info
tcache_boot0
tcache_boot1
tcache_booted
tcache_boot
tcache_cleanup
tcache_create
tcache_dalloc_large
tcache_dalloc_small
tcache_destroy
tcache_enabled_booted
tcache_enabled_cleanup
tcache_enabled_get
tcache_enabled_initialized
tcache_enabled_set
tcache_enabled_tls
tcache_enabled_tsd
tcache_enabled_tsd_boot
tcache_enabled_tsd_cleanup_wrapper
tcache_enabled_tsd_get
tcache_enabled_tsd_get_wrapper
tcache_enabled_tsd_init_head
tcache_enabled_tsd_set
tcache_event
tcache_event_hard
tcache_flush
tcache_get
tcache_initialized
tcache_get_hard
tcache_maxclass
tcache_salloc
tcache_stats_merge
tcache_thread_cleanup
tcache_tls
tcache_tsd
tcache_tsd_boot
tcache_tsd_cleanup_wrapper
tcache_tsd_get
tcache_tsd_get_wrapper
tcache_tsd_init_head
tcache_tsd_set
thread_allocated_booted
thread_allocated_initialized
thread_allocated_tls
thread_allocated_tsd
thread_allocated_tsd_boot
thread_allocated_tsd_cleanup_wrapper
thread_allocated_tsd_get
thread_allocated_tsd_get_wrapper
thread_allocated_tsd_init_head
thread_allocated_tsd_set
thread_allocated_cleanup
thread_deallocated_cleanup
tsd_booted
tsd_arena_get
tsd_arena_set
tsd_boot
tsd_boot0
tsd_boot1
tsd_cleanup
tsd_cleanup_wrapper
tsd_fetch
tsd_get
tsd_wrapper_get
tsd_wrapper_set
tsd_initialized
tsd_init_check_recursion
tsd_init_finish
tsd_init_head
tsd_nominal
tsd_quarantine_get
tsd_quarantine_set
tsd_set
tsd_tcache_enabled_get
tsd_tcache_enabled_set
tsd_tcache_get
tsd_tcache_set
tsd_tls
tsd_tsd
tsd_prof_tdata_get
tsd_prof_tdata_set
tsd_thread_allocated_get
tsd_thread_allocated_set
tsd_thread_deallocated_get
tsd_thread_deallocated_set
u2rz
valgrind_freelike_block
valgrind_make_mem_defined
valgrind_make_mem_noaccess
valgrind_make_mem_undefined

View File

@@ -15,7 +15,7 @@
* See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints.
*
* This choice of m has the disadvantage that the quality of the bits is
* proportional to bit position. For example. the lowest bit has a cycle of 2,
* proportional to bit position. For example, the lowest bit has a cycle of 2,
* the next has a cycle of 4, etc. For this reason, we prefer to use the upper
* bits.
*
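A minimal sketch of drawing from the high bits, per the advice above (the multiplier and increment are placeholder MMIX-style LCG constants, not jemalloc's):
uint64_t state;
/* Advance: state = a*state + c (mod 2^64). */
state = state * 6364136223846793005ULL + 1442695040888963407ULL;
/* Upper bits have the longest cycles, so take the result from the top. */
uint64_t r = state >> (64 - nbits);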

View File

@@ -3,8 +3,8 @@
typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tctx_s prof_tctx_t;
typedef struct prof_gctx_s prof_gctx_t;
typedef struct prof_tdata_s prof_tdata_t;
/* Option defaults. */
@@ -23,9 +23,6 @@ typedef struct prof_tdata_s prof_tdata_t;
*/
#define PROF_BT_MAX 128
/* Maximum number of backtraces to store in each per thread LRU cache. */
#define PROF_TCMAX 1024
/* Initial hash table size. */
#define PROF_CKH_MINITEMS 64
@@ -36,11 +33,17 @@ typedef struct prof_tdata_s prof_tdata_t;
#define PROF_PRINTF_BUFSIZE 128
/*
* Number of mutexes shared among all ctx's. No space is allocated for these
* Number of mutexes shared among all gctx's. No space is allocated for these
* unless profiling is enabled, so it's okay to over-provision.
*/
#define PROF_NCTX_LOCKS 1024
/*
* Number of mutexes shared among all tdata's. No space is allocated for these
* unless profiling is enabled, so it's okay to over-provision.
*/
#define PROF_NTDATA_LOCKS 256
/*
* prof_tdata pointers close to NULL are used to encode state information that
* is used for cleaning up during thread shutdown.
@@ -63,141 +66,163 @@ struct prof_bt_s {
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
prof_bt_t *bt;
unsigned nignore;
unsigned max;
} prof_unwind_data_t;
#endif
struct prof_cnt_s {
/*
* Profiling counters. An allocation/deallocation pair can operate on
* different prof_thr_cnt_t objects that are linked into the same
* prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
* negative. In principle it is possible for the *bytes counters to
* overflow/underflow, but a general solution would require something
* like 128-bit counters; this implementation doesn't bother to solve
* that problem.
*/
int64_t curobjs;
int64_t curbytes;
/* Profiling counters. */
uint64_t curobjs;
uint64_t curbytes;
uint64_t accumobjs;
uint64_t accumbytes;
};
struct prof_thr_cnt_s {
/* Linkage into prof_ctx_t's cnts_ql. */
ql_elm(prof_thr_cnt_t) cnts_link;
typedef enum {
prof_tctx_state_initializing,
prof_tctx_state_nominal,
prof_tctx_state_dumping,
prof_tctx_state_purgatory /* Dumper must finish destroying. */
} prof_tctx_state_t;
/* Linkage into thread's LRU. */
ql_elm(prof_thr_cnt_t) lru_link;
struct prof_tctx_s {
/* Thread data for thread that performed the allocation. */
prof_tdata_t *tdata;
/*
* Associated context. If a thread frees an object that it did not
* allocate, it is possible that the context is not cached in the
* thread's hash table, in which case it must be able to look up the
* context, insert a new prof_thr_cnt_t into the thread's hash table,
* and link it into the prof_ctx_t's cnts_ql.
* Copy of tdata->thr_uid, necessary because tdata may be defunct during
* teardown.
*/
prof_ctx_t *ctx;
uint64_t thr_uid;
/*
* Threads use memory barriers to update the counters. Since there is
* only ever one writer, the only challenge is for the reader to get a
* consistent read of the counters.
*
* The writer uses this series of operations:
*
* 1) Increment epoch to an odd number.
* 2) Update counters.
* 3) Increment epoch to an even number.
*
* The reader must assure 1) that the epoch is even while it reads the
* counters, and 2) that the epoch doesn't change between the time it
* starts and finishes reading the counters.
*/
unsigned epoch;
/* Profiling counters. */
/* Profiling counters, protected by tdata->lock. */
prof_cnt_t cnts;
/* Associated global context. */
prof_gctx_t *gctx;
/* Linkage into gctx's tctxs. */
rb_node(prof_tctx_t) tctx_link;
/*
* True during prof_alloc_prep()..prof_malloc_sample_object(), prevents
* sample vs destroy race.
*/
bool prepared;
/* Current dump-related state, protected by gctx->lock. */
prof_tctx_state_t state;
/*
* Copy of cnts snapshotted during early dump phase, protected by
* dump_mtx.
*/
prof_cnt_t dump_cnts;
};
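
The epoch discipline described in the removed comment above is a seqlock-style protocol; the following is a self-contained sketch with illustrative names, the mb_write() barriers of the original reduced to comments:

#include <stdint.h>

typedef struct {
	unsigned epoch;		/* Even: counters stable; odd: write in progress. */
	int64_t curobjs;
	int64_t curbytes;
} cnt_sketch_t;

static void
cnt_write(cnt_sketch_t *c, int64_t dobjs, int64_t dbytes)
{
	c->epoch++;			/* 1) Epoch becomes odd. */
	/* mb_write() in the original. */
	c->curobjs += dobjs;		/* 2) Update counters. */
	c->curbytes += dbytes;
	/* mb_write() */
	c->epoch++;			/* 3) Epoch becomes even again. */
}

static void
cnt_read(const cnt_sketch_t *c, int64_t *objs, int64_t *bytes)
{
	unsigned e0;

	do {
		while ((e0 = c->epoch) & 1)
			;		/* Wait until no write is in progress. */
		*objs = c->curobjs;
		*bytes = c->curbytes;
	} while (c->epoch != e0);	/* Retry if a write intervened. */
}
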
typedef rb_tree(prof_tctx_t) prof_tctx_tree_t;
struct prof_ctx_s {
/* Associated backtrace. */
prof_bt_t *bt;
/* Protects nlimbo, cnt_merged, and cnts_ql. */
struct prof_gctx_s {
/* Protects nlimbo, cnt_summed, and tctxs. */
malloc_mutex_t *lock;
/*
* Number of threads that currently cause this ctx to be in a state of
* Number of threads that currently cause this gctx to be in a state of
* limbo due to one of:
* - Initializing per thread counters associated with this ctx.
* - Preparing to destroy this ctx.
* - Dumping a heap profile that includes this ctx.
* - Initializing this gctx.
* - Initializing per thread counters associated with this gctx.
* - Preparing to destroy this gctx.
* - Dumping a heap profile that includes this gctx.
* nlimbo must be 1 (single destroyer) in order to safely destroy the
* ctx.
* gctx.
*/
unsigned nlimbo;
/*
* Tree of profile counters, one for each thread that has allocated in
* this context.
*/
prof_tctx_tree_t tctxs;
/* Linkage for tree of contexts to be dumped. */
rb_node(prof_gctx_t) dump_link;
/* Temporary storage for summation during dump. */
prof_cnt_t cnt_summed;
/* When threads exit, they merge their stats into cnt_merged. */
prof_cnt_t cnt_merged;
/* Associated backtrace. */
prof_bt_t bt;
/*
* List of profile counters, one for each thread that has allocated in
* this context.
*/
ql_head(prof_thr_cnt_t) cnts_ql;
/* Linkage for list of contexts to be dumped. */
ql_elm(prof_ctx_t) dump_link;
/* Backtrace vector, variable size, referred to by bt. */
void *vec[1];
};
typedef ql_head(prof_ctx_t) prof_ctx_list_t;
typedef rb_tree(prof_gctx_t) prof_gctx_tree_t;
struct prof_tdata_s {
malloc_mutex_t *lock;
/* Monotonically increasing unique thread identifier. */
uint64_t thr_uid;
/*
* Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a
* cache of backtraces, with associated thread-specific prof_thr_cnt_t
* objects. Other threads may read the prof_thr_cnt_t contents, but no
* others will ever write them.
*
* Upon thread exit, the thread must merge all the prof_thr_cnt_t
* counter data into the associated prof_ctx_t objects, and unlink/free
* the prof_thr_cnt_t objects.
* Monotonically increasing discriminator among tdata structures
* associated with the same thr_uid.
*/
ckh_t bt2cnt;
uint64_t thr_discrim;
/* LRU for contents of bt2cnt. */
ql_head(prof_thr_cnt_t) lru_ql;
/* Included in heap profile dumps if non-NULL. */
char *thread_name;
/* Backtrace vector, used for calls to prof_backtrace(). */
void **vec;
bool attached;
bool expired;
rb_node(prof_tdata_t) tdata_link;
/*
* Hash of (prof_bt_t *)-->(prof_tctx_t *). Each thread tracks
* backtraces for which it has non-zero allocation/deallocation counters
* associated with thread-specific prof_tctx_t objects. Other threads
* may write to prof_tctx_t contents when freeing associated objects.
*/
ckh_t bt2tctx;
/* Sampling state. */
uint64_t prng_state;
uint64_t threshold;
uint64_t accum;
uint64_t bytes_until_sample;
/* State used to avoid dumping while operating on prof internals. */
bool enq;
bool enq_idump;
bool enq_gdump;
/*
* Set to true during an early dump phase for tdata's which are
* currently being dumped. New threads' tdata's have this initialized
* to false so that they aren't accidentally included in later dump
* phases.
*/
bool dumping;
/*
* True if profiling is active for this tdata's thread
* (thread.prof.active mallctl).
*/
bool active;
/* Temporary storage for summation during dump. */
prof_cnt_t cnt_summed;
/* Backtrace vector, used for calls to prof_backtrace(). */
void *vec[PROF_BT_MAX];
};
typedef rb_tree(prof_tdata_t) prof_tdata_tree_t;
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
extern bool opt_prof;
/*
* Even if opt_prof is true, sampling can be temporarily disabled by setting
* opt_prof_active to false. No locking is used when updating opt_prof_active,
* so there are no guarantees regarding how long it will take for all threads
* to notice state changes.
*/
extern bool opt_prof_active;
extern bool opt_prof_thread_active_init;
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_gdump; /* High-water memory dumping. */
@@ -211,6 +236,9 @@ extern char opt_prof_prefix[
#endif
1];
/* Accessed via prof_active_[gs]et{_unlocked,}(). */
extern bool prof_active;
/*
* Profile dump interval, measured in bytes allocated. Each arena triggers a
* profile dump when it reaches this threshold. The effect is that the
@@ -221,191 +249,128 @@ extern char opt_prof_prefix[
extern uint64_t prof_interval;
/*
* If true, promote small sampled objects to large objects, since small run
* headers do not have embedded profile context pointers.
* Initialized as opt_lg_prof_sample, and potentially modified during profiling
* resets.
*/
extern bool prof_promote;
extern size_t lg_prof_sample;
void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated);
void prof_malloc_sample_object(const void *ptr, size_t usize,
prof_tctx_t *tctx);
void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx);
void bt_init(prof_bt_t *bt, void **vec);
void prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
void prof_backtrace(prof_bt_t *bt);
prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt);
#ifdef JEMALLOC_JET
size_t prof_tdata_count(void);
size_t prof_bt_count(void);
const prof_cnt_t *prof_cnt_all(void);
typedef int (prof_dump_open_t)(bool, const char *);
extern prof_dump_open_t *prof_dump_open;
typedef bool (prof_dump_header_t)(bool, const prof_cnt_t *);
extern prof_dump_header_t *prof_dump_header;
#endif
void prof_idump(void);
bool prof_mdump(const char *filename);
void prof_gdump(void);
prof_tdata_t *prof_tdata_init(void);
void prof_tdata_cleanup(void *arg);
prof_tdata_t *prof_tdata_init(tsd_t *tsd);
prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata);
void prof_reset(tsd_t *tsd, size_t lg_sample);
void prof_tdata_cleanup(tsd_t *tsd);
const char *prof_thread_name_get(void);
bool prof_active_get(void);
bool prof_active_set(bool active);
int prof_thread_name_set(tsd_t *tsd, const char *thread_name);
bool prof_thread_active_get(void);
bool prof_thread_active_set(bool active);
bool prof_thread_active_init_get(void);
bool prof_thread_active_init_set(bool active_init);
void prof_boot0(void);
void prof_boot1(void);
bool prof_boot2(void);
void prof_prefork(void);
void prof_postfork_parent(void);
void prof_postfork_child(void);
void prof_sample_threshold_update(prof_tdata_t *tdata);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#define PROF_ALLOC_PREP(nignore, size, ret) do { \
prof_tdata_t *prof_tdata; \
prof_bt_t bt; \
\
assert(size == s2u(size)); \
\
prof_tdata = prof_tdata_get(true); \
if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { \
if (prof_tdata != NULL) \
ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
else \
ret = NULL; \
break; \
} \
\
if (opt_prof_active == false) { \
/* Sampling is currently inactive, so avoid sampling. */\
ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
} else if (opt_lg_prof_sample == 0) { \
/* Don't bother with sampling logic, since sampling */\
/* interval is 1. */\
bt_init(&bt, prof_tdata->vec); \
prof_backtrace(&bt, nignore); \
ret = prof_lookup(&bt); \
} else { \
if (prof_tdata->threshold == 0) { \
/* Initialize. Seed the prng differently for */\
/* each thread. */\
prof_tdata->prng_state = \
(uint64_t)(uintptr_t)&size; \
prof_sample_threshold_update(prof_tdata); \
} \
\
/* Determine whether to capture a backtrace based on */\
/* whether size is enough for prof_accum to reach */\
/* prof_tdata->threshold. However, delay updating */\
/* these variables until prof_{m,re}alloc(), because */\
/* we don't know for sure that the allocation will */\
/* succeed. */\
/* */\
/* Use subtraction rather than addition to avoid */\
/* potential integer overflow. */\
if (size >= prof_tdata->threshold - \
prof_tdata->accum) { \
bt_init(&bt, prof_tdata->vec); \
prof_backtrace(&bt, nignore); \
ret = prof_lookup(&bt); \
} else \
ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
} \
} while (0)
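
The "subtraction rather than addition" remark in the macro above matters for unsigned arithmetic; a small sketch with illustrative values:

#include <stdbool.h>
#include <stdint.h>

static bool
would_cross(uint64_t accum, uint64_t threshold, uint64_t size)
{
	/* Safe: threshold - accum cannot wrap while accum < threshold. */
	return (size >= threshold - accum);
}

/*
 * With accum = UINT64_MAX - 8, threshold = UINT64_MAX and size = 16, the
 * naive test (accum + size >= threshold) wraps to 7 and misses the
 * crossing, whereas threshold - accum == 8 and the form above fires.
 */
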
#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)
prof_tdata_t *prof_tdata_get(bool create);
void prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
bool prof_sample_accum_update(size_t size);
void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
size_t old_usize, prof_ctx_t *old_ctx);
void prof_free(const void *ptr, size_t size);
bool prof_active_get_unlocked(void);
prof_tdata_t *prof_tdata_get(tsd_t *tsd, bool create);
bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool commit,
prof_tdata_t **tdata_out);
prof_tctx_t *prof_alloc_prep(tsd_t *tsd, size_t usize, bool update);
prof_tctx_t *prof_tctx_get(const void *ptr);
void prof_tctx_set(const void *ptr, prof_tctx_t *tctx);
void prof_malloc_sample_object(const void *ptr, size_t usize,
prof_tctx_t *tctx);
void prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx);
void prof_realloc(tsd_t *tsd, const void *ptr, size_t usize,
prof_tctx_t *tctx, bool updated, size_t old_usize, prof_tctx_t *old_tctx);
void prof_free(tsd_t *tsd, const void *ptr, size_t usize);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
prof_tdata_cleanup)
JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
JEMALLOC_ALWAYS_INLINE bool
prof_active_get_unlocked(void)
{
prof_tdata_t *prof_tdata;
/*
* Even if opt_prof is true, sampling can be temporarily disabled by
* setting prof_active to false. No locking is used when reading
* prof_active in the fast path, so there are no guarantees regarding
* how long it will take for all threads to notice state changes.
*/
return (prof_active);
}
JEMALLOC_ALWAYS_INLINE prof_tdata_t *
prof_tdata_get(tsd_t *tsd, bool create)
{
prof_tdata_t *tdata;
cassert(config_prof);
prof_tdata = *prof_tdata_tsd_get();
if (create && prof_tdata == NULL)
prof_tdata = prof_tdata_init();
tdata = tsd_prof_tdata_get(tsd);
if (create) {
if (unlikely(tdata == NULL)) {
if (tsd_nominal(tsd)) {
tdata = prof_tdata_init(tsd);
tsd_prof_tdata_set(tsd, tdata);
}
} else if (unlikely(tdata->expired)) {
tdata = prof_tdata_reinit(tsd, tdata);
tsd_prof_tdata_set(tsd, tdata);
}
assert(tdata == NULL || tdata->attached);
}
return (prof_tdata);
return (tdata);
}
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
JEMALLOC_ALWAYS_INLINE prof_tctx_t *
prof_tctx_get(const void *ptr)
{
/*
* The body of this function is compiled out unless heap profiling is
* enabled, so that it is possible to compile jemalloc with floating
* point support completely disabled. Avoiding floating point code is
* important on memory-constrained systems, but it also enables a
* workaround for versions of glibc that don't properly save/restore
* floating point registers during dynamic lazy symbol loading (which
* internally calls into whatever malloc implementation happens to be
* integrated into the application). Note that some compilers (e.g.
* gcc 4.8) may use floating point registers for fast memory moves, so
* jemalloc must be compiled with such optimizations disabled (e.g.
* -mno-sse) in order for the workaround to be complete.
*/
#ifdef JEMALLOC_PROF
uint64_t r;
double u;
cassert(config_prof);
/*
* Compute sample threshold as a geometrically distributed random
* variable with mean (2^opt_lg_prof_sample).
*
*                          __          __
*                         |   log(u)    |                      1
* prof_tdata->threshold = |  --------   |, where p = -------------------
*                         |  log(1-p)   |              opt_lg_prof_sample
*                                                     2
*
* For more information on the math, see:
*
* Non-Uniform Random Variate Generation
* Luc Devroye
* Springer-Verlag, New York, 1986
* pp 500
* (http://luc.devroye.org/rnbookindex.html)
*/
prng64(r, 53, prof_tdata->prng_state,
UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
u = (double)r * (1.0/9007199254740992.0L);
prof_tdata->threshold = (uint64_t)(log(u) /
log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
+ (uint64_t)1U;
#endif
}
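
A numeric sketch of the computation above (prng handling omitted; r stands for 53 fresh random bits, assumed nonzero so that u > 0):

#include <math.h>
#include <stdint.h>

static uint64_t
sample_threshold(uint64_t r, unsigned lg_prof_sample)
{
	double u = (double)r * (1.0 / 9007199254740992.0);	/* r / 2^53 */
	double p = 1.0 / (double)((uint64_t)1U << lg_prof_sample);

	/* Geometric variate with mean 2^lg_prof_sample (Devroye, p. 500). */
	return ((uint64_t)(log(u) / log(1.0 - p)) + 1);
}
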
JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
prof_ctx_t *ret;
prof_tctx_t *ret;
arena_chunk_t *chunk;
cassert(config_prof);
assert(ptr != NULL);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
if (chunk != ptr) {
/* Region. */
ret = arena_prof_ctx_get(ptr);
} else
ret = huge_prof_ctx_get(ptr);
if (likely(chunk != ptr))
ret = arena_prof_tctx_get(ptr);
else
ret = huge_prof_tctx_get(ptr);
return (ret);
}
JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
JEMALLOC_ALWAYS_INLINE void
prof_tctx_set(const void *ptr, prof_tctx_t *tctx)
{
arena_chunk_t *chunk;
@@ -413,199 +378,117 @@ prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
assert(ptr != NULL);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
if (chunk != ptr) {
/* Region. */
arena_prof_ctx_set(ptr, usize, ctx);
} else
huge_prof_ctx_set(ptr, ctx);
if (likely(chunk != ptr))
arena_prof_tctx_set(ptr, tctx);
else
huge_prof_tctx_set(ptr, tctx);
}
JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
JEMALLOC_ALWAYS_INLINE bool
prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
prof_tdata_t **tdata_out)
{
prof_tdata_t *prof_tdata;
prof_tdata_t *tdata;
cassert(config_prof);
/* Sampling logic is unnecessary if the interval is 1. */
assert(opt_lg_prof_sample != 0);
prof_tdata = prof_tdata_get(false);
if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
tdata = prof_tdata_get(tsd, true);
if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
tdata = NULL;
if (tdata_out != NULL)
*tdata_out = tdata;
if (tdata == NULL)
return (true);
/* Take care to avoid integer overflow. */
if (size >= prof_tdata->threshold - prof_tdata->accum) {
prof_tdata->accum -= (prof_tdata->threshold - size);
/* Compute new sample threshold. */
prof_sample_threshold_update(prof_tdata);
while (prof_tdata->accum >= prof_tdata->threshold) {
prof_tdata->accum -= prof_tdata->threshold;
prof_sample_threshold_update(prof_tdata);
}
return (false);
if (tdata->bytes_until_sample >= usize) {
if (update)
tdata->bytes_until_sample -= usize;
return (true);
} else {
prof_tdata->accum += size;
return (true);
/* Compute new sample threshold. */
if (update)
prof_sample_threshold_update(tdata);
return (!tdata->active);
}
}
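
Distilled from the interleaved old/new lines above, the replacement scheme is a plain byte countdown; a sketch with illustrative names:

#include <stdbool.h>
#include <stdint.h>

static bool
should_sample(uint64_t *bytes_until_sample, uint64_t usize)
{
	if (*bytes_until_sample >= usize) {
		*bytes_until_sample -= usize;	/* Draw down the budget. */
		return (false);
	}
	/* Budget exhausted: caller redraws it from the geometric variate. */
	return (true);
}
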
JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
JEMALLOC_ALWAYS_INLINE prof_tctx_t *
prof_alloc_prep(tsd_t *tsd, size_t usize, bool update)
{
prof_tctx_t *ret;
prof_tdata_t *tdata;
prof_bt_t bt;
assert(usize == s2u(usize));
if (!prof_active_get_unlocked() || likely(prof_sample_accum_update(tsd,
usize, update, &tdata)))
ret = (prof_tctx_t *)(uintptr_t)1U;
else {
bt_init(&bt, tdata->vec);
prof_backtrace(&bt);
ret = prof_lookup(tsd, &bt);
}
return (ret);
}
JEMALLOC_ALWAYS_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx)
{
cassert(config_prof);
assert(ptr != NULL);
assert(usize == isalloc(ptr, true));
if (opt_lg_prof_sample != 0) {
if (prof_sample_accum_update(usize)) {
/*
* Don't sample. For malloc()-like allocation, it is
* always possible to tell in advance how large an
* object's usable size will be, so there should never
* be a difference between the usize passed to
* PROF_ALLOC_PREP() and prof_malloc().
*/
assert((uintptr_t)cnt == (uintptr_t)1U);
}
}
if ((uintptr_t)cnt > (uintptr_t)1U) {
prof_ctx_set(ptr, usize, cnt->ctx);
cnt->epoch++;
/*********/
mb_write();
/*********/
cnt->cnts.curobjs++;
cnt->cnts.curbytes += usize;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += usize;
}
/*********/
mb_write();
/*********/
cnt->epoch++;
/*********/
mb_write();
/*********/
} else
prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
if (unlikely((uintptr_t)tctx > (uintptr_t)1U))
prof_malloc_sample_object(ptr, usize, tctx);
else
prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U);
}
JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
size_t old_usize, prof_ctx_t *old_ctx)
JEMALLOC_ALWAYS_INLINE void
prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx,
bool updated, size_t old_usize, prof_tctx_t *old_tctx)
{
prof_thr_cnt_t *told_cnt;
cassert(config_prof);
assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U);
if (ptr != NULL) {
if (!updated && ptr != NULL) {
assert(usize == isalloc(ptr, true));
if (opt_lg_prof_sample != 0) {
if (prof_sample_accum_update(usize)) {
/*
* Don't sample. The usize passed to
* PROF_ALLOC_PREP() was larger than what
* actually got allocated, so a backtrace was
* captured for this allocation, even though
* its actual usize was insufficient to cross
* the sample threshold.
*/
cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
}
}
}
if ((uintptr_t)old_ctx > (uintptr_t)1U) {
told_cnt = prof_lookup(old_ctx->bt);
if (told_cnt == NULL) {
if (prof_sample_accum_update(tsd, usize, true, NULL)) {
/*
* It's too late to propagate OOM for this realloc(),
* so operate directly on old_cnt->ctx->cnt_merged.
* Don't sample. The usize passed to PROF_ALLOC_PREP()
* was larger than what actually got allocated, so a
* backtrace was captured for this allocation, even
* though its actual usize was insufficient to cross the
* sample threshold.
*/
malloc_mutex_lock(old_ctx->lock);
old_ctx->cnt_merged.curobjs--;
old_ctx->cnt_merged.curbytes -= old_usize;
malloc_mutex_unlock(old_ctx->lock);
told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
tctx = (prof_tctx_t *)(uintptr_t)1U;
}
} else
told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
}
if ((uintptr_t)told_cnt > (uintptr_t)1U)
told_cnt->epoch++;
if ((uintptr_t)cnt > (uintptr_t)1U) {
prof_ctx_set(ptr, usize, cnt->ctx);
cnt->epoch++;
} else if (ptr != NULL)
prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
/*********/
mb_write();
/*********/
if ((uintptr_t)told_cnt > (uintptr_t)1U) {
told_cnt->cnts.curobjs--;
told_cnt->cnts.curbytes -= old_usize;
}
if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->cnts.curobjs++;
cnt->cnts.curbytes += usize;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
cnt->cnts.accumbytes += usize;
}
}
/*********/
mb_write();
/*********/
if ((uintptr_t)told_cnt > (uintptr_t)1U)
told_cnt->epoch++;
if ((uintptr_t)cnt > (uintptr_t)1U)
cnt->epoch++;
/*********/
mb_write(); /* Not strictly necessary. */
if (unlikely((uintptr_t)old_tctx > (uintptr_t)1U))
prof_free_sampled_object(tsd, old_usize, old_tctx);
if (unlikely((uintptr_t)tctx > (uintptr_t)1U))
prof_malloc_sample_object(ptr, usize, tctx);
else
prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U);
}
JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
JEMALLOC_ALWAYS_INLINE void
prof_free(tsd_t *tsd, const void *ptr, size_t usize)
{
prof_ctx_t *ctx = prof_ctx_get(ptr);
prof_tctx_t *tctx = prof_tctx_get(ptr);
cassert(config_prof);
assert(usize == isalloc(ptr, true));
if ((uintptr_t)ctx > (uintptr_t)1) {
prof_thr_cnt_t *tcnt;
assert(size == isalloc(ptr, true));
tcnt = prof_lookup(ctx->bt);
if (tcnt != NULL) {
tcnt->epoch++;
/*********/
mb_write();
/*********/
tcnt->cnts.curobjs--;
tcnt->cnts.curbytes -= size;
/*********/
mb_write();
/*********/
tcnt->epoch++;
/*********/
mb_write();
/*********/
} else {
/*
* OOM during free() cannot be propagated, so operate
* directly on cnt->ctx->cnt_merged.
*/
malloc_mutex_lock(ctx->lock);
ctx->cnt_merged.curobjs--;
ctx->cnt_merged.curbytes -= size;
malloc_mutex_unlock(ctx->lock);
}
}
if (unlikely((uintptr_t)tctx > (uintptr_t)1U))
prof_free_sampled_object(tsd, usize, tctx);
}
#endif

View File

@@ -1,6 +1,4 @@
/*
* List definitions.
*/
/* List definitions. */
#define ql_head(a_type) \
struct { \
a_type *qlh_first; \

View File

@@ -40,8 +40,10 @@ struct { \
(a_qr_b)->a_field.qre_prev = t; \
} while (0)
/* qr_meld() and qr_split() are functionally equivalent, so there's no need to
* have two copies of the code. */
/*
* qr_meld() and qr_split() are functionally equivalent, so there's no need to
* have two copies of the code.
*/
#define qr_split(a_qr_a, a_qr_b, a_field) \
qr_meld((a_qr_a), (a_qr_b), a_field)

View File

@@ -29,36 +29,29 @@ struct quarantine_s {
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
quarantine_t *quarantine_init(size_t lg_maxobjs);
void quarantine(void *ptr);
void quarantine_cleanup(void *arg);
bool quarantine_boot(void);
void quarantine_alloc_hook_work(tsd_t *tsd);
void quarantine(tsd_t *tsd, void *ptr);
void quarantine_cleanup(tsd_t *tsd);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), quarantine, quarantine_t *)
void quarantine_alloc_hook(void);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_QUARANTINE_C_))
malloc_tsd_externs(quarantine, quarantine_t *)
malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, quarantine, quarantine_t *, NULL,
quarantine_cleanup)
JEMALLOC_ALWAYS_INLINE void
quarantine_alloc_hook(void)
{
quarantine_t *quarantine;
tsd_t *tsd;
assert(config_fill && opt_quarantine);
quarantine = *quarantine_tsd_get();
if (quarantine == NULL)
quarantine_init(LG_MAXOBJS_INIT);
tsd = tsd_fetch();
if (tsd_quarantine_get(tsd) == NULL)
quarantine_alloc_hook_work(tsd);
}
#endif

View File

@@ -158,6 +158,8 @@ struct { \
#define rb_proto(a_attr, a_prefix, a_rbt_type, a_type) \
a_attr void \
a_prefix##new(a_rbt_type *rbtree); \
a_attr bool \
a_prefix##empty(a_rbt_type *rbtree); \
a_attr a_type * \
a_prefix##first(a_rbt_type *rbtree); \
a_attr a_type * \
@@ -198,7 +200,7 @@ a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \
* int (a_cmp *)(a_type *a_node, a_type *a_other);
* ^^^^^^
* or a_key
* Interpretation of comparision function return values:
* Interpretation of comparison function return values:
* -1 : a_node < a_other
* 0 : a_node == a_other
* 1 : a_node > a_other
@@ -224,6 +226,13 @@ a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \
* Args:
* tree: Pointer to an uninitialized red-black tree object.
*
* static bool
* ex_empty(ex_t *tree);
* Description: Determine whether tree is empty.
* Args:
* tree: Pointer to an initialized red-black tree object.
* Ret: True if tree is empty, false otherwise.
*
* static ex_node_t *
* ex_first(ex_t *tree);
* static ex_node_t *
@@ -309,6 +318,10 @@ a_attr void \
a_prefix##new(a_rbt_type *rbtree) { \
rb_new(a_type, a_field, rbtree); \
} \
a_attr bool \
a_prefix##empty(a_rbt_type *rbtree) { \
return (rbtree->rbt_root == &rbtree->rbt_nil); \
} \
a_attr a_type * \
a_prefix##first(a_rbt_type *rbtree) { \
a_type *ret; \
@@ -580,7 +593,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \
if (left != &rbtree->rbt_nil) { \
/* node has no successor, but it has a left child. */\
/* Splice node out, without losing the left child. */\
assert(rbtn_red_get(a_type, a_field, node) == false); \
assert(!rbtn_red_get(a_type, a_field, node)); \
assert(rbtn_red_get(a_type, a_field, left)); \
rbtn_black_set(a_type, a_field, left); \
if (pathp == path) { \
@@ -616,8 +629,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \
if (pathp->cmp < 0) { \
rbtn_left_set(a_type, a_field, pathp->node, \
pathp[1].node); \
assert(rbtn_red_get(a_type, a_field, pathp[1].node) \
== false); \
assert(!rbtn_red_get(a_type, a_field, pathp[1].node)); \
if (rbtn_red_get(a_type, a_field, pathp->node)) { \
a_type *right = rbtn_right_get(a_type, a_field, \
pathp->node); \
@@ -681,7 +693,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \
rbtn_rotate_left(a_type, a_field, pathp->node, \
tnode); \
/* Balance restored, but rotation modified */\
/* subree root, which may actually be the tree */\
/* subtree root, which may actually be the tree */\
/* root. */\
if (pathp == path) { \
/* Set root. */ \
@@ -849,7 +861,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \
} \
/* Set root. */ \
rbtree->rbt_root = path->node; \
assert(rbtn_red_get(a_type, a_field, rbtree->rbt_root) == false); \
assert(!rbtn_red_get(a_type, a_field, rbtree->rbt_root)); \
} \
a_attr a_type * \
a_prefix##iter_recurse(a_rbt_type *rbtree, a_type *node, \

View File

@@ -1,17 +1,26 @@
#!/bin/sh
#
# Usage: size_classes.sh <lg_qarr> <lg_tmin> <lg_parr> <lg_g>
# The following limits are chosen such that they cover all supported platforms.
# Range of quanta.
lg_qmin=3
lg_qmax=4
# Pointer sizes.
lg_zarr="2 3"
# Quanta.
lg_qarr=$1
# The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)].
lg_tmin=3
lg_tmin=$2
# Range of page sizes.
lg_pmin=12
lg_pmax=16
# Maximum lookup size.
lg_kmax=12
# Page sizes.
lg_parr=`echo $3 | tr ',' ' '`
# Size class group size (number of size classes for each size doubling).
lg_g=$4
pow2() {
e=$1
@@ -22,68 +31,218 @@ pow2() {
done
}
lg() {
x=$1
lg_result=0
while [ ${x} -gt 1 ] ; do
lg_result=$((${lg_result} + 1))
x=$((${x} / 2))
done
}
size_class() {
index=$1
lg_grp=$2
lg_delta=$3
ndelta=$4
lg_p=$5
lg_kmax=$6
lg ${ndelta}; lg_ndelta=${lg_result}; pow2 ${lg_ndelta}
if [ ${pow2_result} -lt ${ndelta} ] ; then
rem="yes"
else
rem="no"
fi
lg_size=${lg_grp}
if [ $((${lg_delta} + ${lg_ndelta})) -eq ${lg_grp} ] ; then
lg_size=$((${lg_grp} + 1))
else
lg_size=${lg_grp}
rem="yes"
fi
if [ ${lg_size} -lt $((${lg_p} + ${lg_g})) ] ; then
bin="yes"
else
bin="no"
fi
if [ ${lg_size} -lt ${lg_kmax} \
-o ${lg_size} -eq ${lg_kmax} -a ${rem} = "no" ] ; then
lg_delta_lookup=${lg_delta}
else
lg_delta_lookup="no"
fi
printf ' SC(%3d, %6d, %8d, %6d, %3s, %2s) \\\n' ${index} ${lg_grp} ${lg_delta} ${ndelta} ${bin} ${lg_delta_lookup}
# Defined upon return:
# - lg_delta_lookup (${lg_delta} or "no")
# - bin ("yes" or "no")
}
sep_line() {
echo " \\"
}
size_classes() {
lg_z=$1
lg_q=$2
lg_t=$3
lg_p=$4
lg_g=$5
pow2 $((${lg_z} + 3)); ptr_bits=${pow2_result}
pow2 ${lg_g}; g=${pow2_result}
echo "#define SIZE_CLASSES \\"
echo " /* index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup */ \\"
ntbins=0
nlbins=0
lg_tiny_maxclass='"NA"'
nbins=0
# Tiny size classes.
ndelta=0
index=0
lg_grp=${lg_t}
lg_delta=${lg_grp}
while [ ${lg_grp} -lt ${lg_q} ] ; do
size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
if [ ${lg_delta_lookup} != "no" ] ; then
nlbins=$((${index} + 1))
fi
if [ ${bin} != "no" ] ; then
nbins=$((${index} + 1))
fi
ntbins=$((${ntbins} + 1))
lg_tiny_maxclass=${lg_grp} # Final written value is correct.
index=$((${index} + 1))
lg_delta=${lg_grp}
lg_grp=$((${lg_grp} + 1))
done
# First non-tiny group.
if [ ${ntbins} -gt 0 ] ; then
sep_line
# The first size class has an unusual encoding, because the size has to be
# split between grp and delta*ndelta.
lg_grp=$((${lg_grp} - 1))
ndelta=1
size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
index=$((${index} + 1))
lg_grp=$((${lg_grp} + 1))
lg_delta=$((${lg_delta} + 1))
fi
while [ ${ndelta} -lt ${g} ] ; do
size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
index=$((${index} + 1))
ndelta=$((${ndelta} + 1))
done
# All remaining groups.
lg_grp=$((${lg_grp} + ${lg_g}))
while [ ${lg_grp} -lt ${ptr_bits} ] ; do
sep_line
ndelta=1
if [ ${lg_grp} -eq $((${ptr_bits} - 1)) ] ; then
ndelta_limit=$((${g} - 1))
else
ndelta_limit=${g}
fi
while [ ${ndelta} -le ${ndelta_limit} ] ; do
size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
if [ ${lg_delta_lookup} != "no" ] ; then
nlbins=$((${index} + 1))
# Final written value is correct:
lookup_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
fi
if [ ${bin} != "no" ] ; then
nbins=$((${index} + 1))
# Final written value is correct:
small_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
if [ ${lg_g} -gt 0 ] ; then
lg_large_minclass=$((${lg_grp} + 1))
else
lg_large_minclass=$((${lg_grp} + 2))
fi
fi
index=$((${index} + 1))
ndelta=$((${ndelta} + 1))
done
lg_grp=$((${lg_grp} + 1))
lg_delta=$((${lg_delta} + 1))
done
echo
nsizes=${index}
# Defined upon completion:
# - ntbins
# - nlbins
# - nbins
# - nsizes
# - lg_tiny_maxclass
# - lookup_maxclass
# - small_maxclass
# - lg_large_minclass
}
cat <<EOF
/* This file was automatically generated by size_classes.sh. */
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES
/*
* This header requires LG_SIZEOF_PTR, LG_TINY_MIN, LG_QUANTUM, and LG_PAGE to
* be defined prior to inclusion, and it in turn defines:
*
* LG_SIZE_CLASS_GROUP: Lg of size class count for each size doubling.
* SIZE_CLASSES: Complete table of
* SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) tuples.
* index: Size class index.
* lg_grp: Lg group base size (no deltas added).
* lg_delta: Lg delta to previous size class.
* ndelta: Delta multiplier. size == 1<<lg_grp + ndelta<<lg_delta
* bin: 'yes' if a small bin size class, 'no' otherwise.
* lg_delta_lookup: Same as lg_delta if a lookup table size class, 'no'
* otherwise.
* NTBINS: Number of tiny bins.
* NLBINS: Number of bins supported by the lookup table.
* NBINS: Number of small size class bins.
* NSIZES: Number of size classes.
* LG_TINY_MAXCLASS: Lg of maximum tiny size class.
* LOOKUP_MAXCLASS: Maximum size class included in lookup table.
* SMALL_MAXCLASS: Maximum small size class.
* LG_LARGE_MINCLASS: Lg of minimum large size class.
*/
#define LG_SIZE_CLASS_GROUP ${lg_g}
EOF
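# A hypothetical worked example (not emitted by this script) of the tuple
# encoding documented above: with LG_SIZE_CLASS_GROUP == 2 there are 4 size
# classes per doubling, so lg_grp == 13, lg_delta == 11, ndelta == 3 encodes
#   size == (1 << 13) + (3 << 11) == 8192 + 6144 == 14336.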
lg_q=${lg_qmin}
while [ ${lg_q} -le ${lg_qmax} ] ; do
lg_t=${lg_tmin}
while [ ${lg_t} -le ${lg_q} ] ; do
lg_p=${lg_pmin}
while [ ${lg_p} -le ${lg_pmax} ] ; do
echo "#if (LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && LG_PAGE == ${lg_p})"
echo "#define SIZE_CLASSES_DEFINED"
pow2 ${lg_q}; q=${pow2_result}
pow2 ${lg_t}; t=${pow2_result}
pow2 ${lg_p}; p=${pow2_result}
bin=0
psz=0
sz=${t}
delta=$((${sz} - ${psz}))
echo "/* SIZE_CLASS(bin, delta, sz) */"
echo "#define SIZE_CLASSES \\"
# Tiny size classes.
while [ ${sz} -lt ${q} ] ; do
echo " SIZE_CLASS(${bin}, ${delta}, ${sz}) \\"
bin=$((${bin} + 1))
psz=${sz}
sz=$((${sz} + ${sz}))
delta=$((${sz} - ${psz}))
for lg_z in ${lg_zarr} ; do
for lg_q in ${lg_qarr} ; do
lg_t=${lg_tmin}
while [ ${lg_t} -le ${lg_q} ] ; do
# Iterate through page sizes and compute how many bins there are.
for lg_p in ${lg_parr} ; do
echo "#if (LG_SIZEOF_PTR == ${lg_z} && LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && LG_PAGE == ${lg_p})"
size_classes ${lg_z} ${lg_q} ${lg_t} ${lg_p} ${lg_g}
echo "#define SIZE_CLASSES_DEFINED"
echo "#define NTBINS ${ntbins}"
echo "#define NLBINS ${nlbins}"
echo "#define NBINS ${nbins}"
echo "#define NSIZES ${nsizes}"
echo "#define LG_TINY_MAXCLASS ${lg_tiny_maxclass}"
echo "#define LOOKUP_MAXCLASS ${lookup_maxclass}"
echo "#define SMALL_MAXCLASS ${small_maxclass}"
echo "#define LG_LARGE_MINCLASS ${lg_large_minclass}"
echo "#endif"
echo
done
# Quantum-multiple size classes. For each doubling of sz, as many as 4
# size classes exist. Their spacing is the greater of:
# - q
# - sz/4, where sz is a power of 2
while [ ${sz} -lt ${p} ] ; do
if [ ${sz} -ge $((${q} * 4)) ] ; then
i=$((${sz} / 4))
else
i=${q}
fi
next_2pow=$((${sz} * 2))
while [ ${sz} -lt $next_2pow ] ; do
echo " SIZE_CLASS(${bin}, ${delta}, ${sz}) \\"
bin=$((${bin} + 1))
psz=${sz}
sz=$((${sz} + ${i}))
delta=$((${sz} - ${psz}))
done
done
echo
echo "#define NBINS ${bin}"
echo "#define SMALL_MAXCLASS ${psz}"
echo "#endif"
echo
lg_p=$((${lg_p} + 1))
lg_t=$((${lg_t} + 1))
done
lg_t=$((${lg_t} + 1))
done
lg_q=$((${lg_q} + 1))
done
cat <<EOF
@@ -92,11 +251,10 @@ cat <<EOF
#endif
#undef SIZE_CLASSES_DEFINED
/*
* The small_size2bin lookup table uses uint8_t to encode each bin index, so we
* The size2index_tab lookup table uses uint8_t to encode each bin index, so we
* cannot support more than 256 small size classes. Further constrain NBINS to
* 255 to support prof_promote, since all small size classes, plus a "not
* small" size class must be stored in 8 bits of arena_chunk_map_t's bits
* field.
* 255 since all small size classes, plus a "not small" size class must be
* stored in 8 bits of arena_chunk_map_bits_t's bits field.
*/
#if (NBINS > 255)
# error "Too many small size classes"

View File

@@ -4,6 +4,7 @@
typedef struct tcache_bin_stats_s tcache_bin_stats_t;
typedef struct malloc_bin_stats_s malloc_bin_stats_t;
typedef struct malloc_large_stats_s malloc_large_stats_t;
typedef struct malloc_huge_stats_s malloc_huge_stats_t;
typedef struct arena_stats_s arena_stats_t;
typedef struct chunk_stats_s chunk_stats_t;
@@ -20,12 +21,6 @@ struct tcache_bin_stats_s {
};
struct malloc_bin_stats_s {
/*
* Current number of bytes allocated, including objects currently
* cached by tcache.
*/
size_t allocated;
/*
* Total number of allocation/deallocation requests served directly by
* the bin. Note that tcache may allocate an object, then recycle it
@@ -42,6 +37,12 @@ struct malloc_bin_stats_s {
*/
uint64_t nrequests;
/*
* Current number of regions of this size class, including regions
* currently cached by tcache.
*/
size_t curregs;
/* Number of tcache fills from this bin. */
uint64_t nfills;
@@ -78,10 +79,25 @@ struct malloc_large_stats_s {
*/
uint64_t nrequests;
/* Current number of runs of this size class. */
/*
* Current number of runs of this size class, including runs currently
* cached by tcache.
*/
size_t curruns;
};
struct malloc_huge_stats_s {
/*
* Total number of allocation/deallocation requests served directly by
* the arena.
*/
uint64_t nmalloc;
uint64_t ndalloc;
/* Current number of (multi-)chunk allocations of this size class. */
size_t curhchunks;
};
struct arena_stats_s {
/* Number of bytes currently mapped. */
size_t mapped;
@@ -101,13 +117,15 @@ struct arena_stats_s {
uint64_t ndalloc_large;
uint64_t nrequests_large;
/*
* One element for each possible size class, including sizes that
* overlap with bin size classes. This is necessary because ipalloc()
* sometimes has to use such large objects in order to assure proper
* alignment.
*/
size_t allocated_huge;
uint64_t nmalloc_huge;
uint64_t ndalloc_huge;
/* One element for each large size class. */
malloc_large_stats_t *lstats;
/* One element for each huge size class. */
malloc_huge_stats_t *hstats;
};
struct chunk_stats_s {

View File

@@ -69,10 +69,10 @@ struct tcache_bin_s {
struct tcache_s {
ql_elm(tcache_t) link; /* Used for aggregating stats. */
uint64_t prof_accumbytes;/* Cleared after arena_prof_accum() */
uint64_t prof_accumbytes;/* Cleared after arena_prof_accum(). */
arena_t *arena; /* This thread's arena. */
unsigned ev_cnt; /* Event count since incremental GC. */
unsigned next_gc_bin; /* Next bin to GC. */
index_t next_gc_bin; /* Next bin to GC. */
tcache_bin_t tbins[1]; /* Dynamically sized. */
/*
* The pointer stacks associated with tbins follow as a contiguous
@@ -103,76 +103,63 @@ extern size_t tcache_maxclass;
size_t tcache_salloc(const void *ptr);
void tcache_event_hard(tcache_t *tcache);
void *tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin,
size_t binind);
void tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
index_t binind);
void tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem,
tcache_t *tcache);
void tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
void tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem,
tcache_t *tcache);
void tcache_arena_associate(tcache_t *tcache, arena_t *arena);
void tcache_arena_reassociate(tcache_t *tcache, arena_t *arena);
void tcache_arena_dissociate(tcache_t *tcache);
tcache_t *tcache_create(arena_t *arena);
void tcache_destroy(tcache_t *tcache);
void tcache_thread_cleanup(void *arg);
tcache_t *tcache_get_hard(tsd_t *tsd);
tcache_t *tcache_create(tsd_t *tsd, arena_t *arena);
void tcache_cleanup(tsd_t *tsd);
void tcache_enabled_cleanup(tsd_t *tsd);
void tcache_stats_merge(tcache_t *tcache, arena_t *arena);
bool tcache_boot0(void);
bool tcache_boot1(void);
bool tcache_boot(void);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), tcache, tcache_t *)
malloc_tsd_protos(JEMALLOC_ATTR(unused), tcache_enabled, tcache_enabled_t)
void tcache_event(tcache_t *tcache);
void tcache_flush(void);
bool tcache_enabled_get(void);
tcache_t *tcache_get(bool create);
tcache_t *tcache_get(tsd_t *tsd, bool create);
void tcache_enabled_set(bool enabled);
void *tcache_alloc_easy(tcache_bin_t *tbin);
void *tcache_alloc_small(tcache_t *tcache, size_t size, bool zero);
void *tcache_alloc_large(tcache_t *tcache, size_t size, bool zero);
void tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind);
void tcache_dalloc_small(tcache_t *tcache, void *ptr, index_t binind);
void tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TCACHE_C_))
/* Map of thread-specific caches. */
malloc_tsd_externs(tcache, tcache_t *)
malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, tcache, tcache_t *, NULL,
tcache_thread_cleanup)
/* Per thread flag that allows thread caches to be disabled. */
malloc_tsd_externs(tcache_enabled, tcache_enabled_t)
malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, tcache_enabled, tcache_enabled_t,
tcache_enabled_default, malloc_tsd_no_cleanup)
JEMALLOC_INLINE void
tcache_flush(void)
{
tcache_t *tcache;
tsd_t *tsd;
cassert(config_tcache);
tcache = *tcache_tsd_get();
if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX)
return;
tcache_destroy(tcache);
tcache = NULL;
tcache_tsd_set(&tcache);
tsd = tsd_fetch();
tcache_cleanup(tsd);
}
JEMALLOC_INLINE bool
tcache_enabled_get(void)
{
tsd_t *tsd;
tcache_enabled_t tcache_enabled;
cassert(config_tcache);
tcache_enabled = *tcache_enabled_tsd_get();
tsd = tsd_fetch();
tcache_enabled = tsd_tcache_enabled_get(tsd);
if (tcache_enabled == tcache_enabled_default) {
tcache_enabled = (tcache_enabled_t)opt_tcache;
tcache_enabled_tsd_set(&tcache_enabled);
tsd_tcache_enabled_set(tsd, tcache_enabled);
}
return ((bool)tcache_enabled);
@@ -181,78 +168,34 @@ tcache_enabled_get(void)
JEMALLOC_INLINE void
tcache_enabled_set(bool enabled)
{
tsd_t *tsd;
tcache_enabled_t tcache_enabled;
tcache_t *tcache;
cassert(config_tcache);
tsd = tsd_fetch();
tcache_enabled = (tcache_enabled_t)enabled;
tcache_enabled_tsd_set(&tcache_enabled);
tcache = *tcache_tsd_get();
if (enabled) {
if (tcache == TCACHE_STATE_DISABLED) {
tcache = NULL;
tcache_tsd_set(&tcache);
}
} else /* disabled */ {
if (tcache > TCACHE_STATE_MAX) {
tcache_destroy(tcache);
tcache = NULL;
}
if (tcache == NULL) {
tcache = TCACHE_STATE_DISABLED;
tcache_tsd_set(&tcache);
}
}
tsd_tcache_enabled_set(tsd, tcache_enabled);
if (!enabled)
tcache_cleanup(tsd);
}
JEMALLOC_ALWAYS_INLINE tcache_t *
tcache_get(bool create)
tcache_get(tsd_t *tsd, bool create)
{
tcache_t *tcache;
if (config_tcache == false)
return (NULL);
if (config_lazy_lock && isthreaded == false)
if (!config_tcache)
return (NULL);
tcache = *tcache_tsd_get();
if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX) {
if (tcache == TCACHE_STATE_DISABLED)
return (NULL);
if (tcache == NULL) {
if (create == false) {
/*
* Creating a tcache here would cause
* allocation as a side effect of free().
* Ordinarily that would be okay since
* tcache_create() failure is a soft failure
* that doesn't propagate. However, if TLS
* data are freed via free() as in glibc,
* subtle corruption could result from setting
* a TLS variable after its backing memory is
* freed.
*/
return (NULL);
}
if (tcache_enabled_get() == false) {
tcache_enabled_set(false); /* Memoize. */
return (NULL);
}
return (tcache_create(choose_arena(NULL)));
}
if (tcache == TCACHE_STATE_PURGATORY) {
/*
* Make a note that an allocator function was called
* after tcache_thread_cleanup() was called.
*/
tcache = TCACHE_STATE_REINCARNATED;
tcache_tsd_set(&tcache);
return (NULL);
}
if (tcache == TCACHE_STATE_REINCARNATED)
return (NULL);
not_reached();
tcache = tsd_tcache_get(tsd);
if (!create)
return (tcache);
if (unlikely(tcache == NULL) && tsd_nominal(tsd)) {
tcache = tcache_get_hard(tsd);
tsd_tcache_set(tsd, tcache);
}
return (tcache);
@@ -267,7 +210,7 @@ tcache_event(tcache_t *tcache)
tcache->ev_cnt++;
assert(tcache->ev_cnt <= TCACHE_GC_INCR);
if (tcache->ev_cnt == TCACHE_GC_INCR)
if (unlikely(tcache->ev_cnt == TCACHE_GC_INCR))
tcache_event_hard(tcache);
}
@@ -276,12 +219,12 @@ tcache_alloc_easy(tcache_bin_t *tbin)
{
void *ret;
if (tbin->ncached == 0) {
if (unlikely(tbin->ncached == 0)) {
tbin->low_water = -1;
return (NULL);
}
tbin->ncached--;
if ((int)tbin->ncached < tbin->low_water)
if (unlikely((int)tbin->ncached < tbin->low_water))
tbin->low_water = tbin->ncached;
ret = tbin->avail[tbin->ncached];
return (ret);
@@ -291,43 +234,42 @@ JEMALLOC_ALWAYS_INLINE void *
tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
{
void *ret;
size_t binind;
index_t binind;
size_t usize;
tcache_bin_t *tbin;
binind = SMALL_SIZE2BIN(size);
binind = size2index(size);
assert(binind < NBINS);
tbin = &tcache->tbins[binind];
size = arena_bin_info[binind].reg_size;
usize = index2size(binind);
ret = tcache_alloc_easy(tbin);
if (ret == NULL) {
if (unlikely(ret == NULL)) {
ret = tcache_alloc_small_hard(tcache, tbin, binind);
if (ret == NULL)
return (NULL);
}
assert(tcache_salloc(ret) == arena_bin_info[binind].reg_size);
assert(tcache_salloc(ret) == usize);
if (zero == false) {
if (likely(!zero)) {
if (config_fill) {
if (opt_junk) {
if (unlikely(opt_junk_alloc)) {
arena_alloc_junk_small(ret,
&arena_bin_info[binind], false);
} else if (opt_zero)
memset(ret, 0, size);
} else if (unlikely(opt_zero))
memset(ret, 0, usize);
}
VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
} else {
if (config_fill && opt_junk) {
if (config_fill && unlikely(opt_junk_alloc)) {
arena_alloc_junk_small(ret, &arena_bin_info[binind],
true);
}
VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
memset(ret, 0, size);
memset(ret, 0, usize);
}
if (config_stats)
tbin->tstats.nrequests++;
if (config_prof)
tcache->prof_accumbytes += arena_bin_info[binind].reg_size;
tcache->prof_accumbytes += usize;
tcache_event(tcache);
return (ret);
}
@@ -336,25 +278,26 @@ JEMALLOC_ALWAYS_INLINE void *
tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
{
void *ret;
size_t binind;
index_t binind;
size_t usize;
tcache_bin_t *tbin;
size = PAGE_CEILING(size);
assert(size <= tcache_maxclass);
binind = NBINS + (size >> LG_PAGE) - 1;
binind = size2index(size);
usize = index2size(binind);
assert(usize <= tcache_maxclass);
assert(binind < nhbins);
tbin = &tcache->tbins[binind];
ret = tcache_alloc_easy(tbin);
if (ret == NULL) {
if (unlikely(ret == NULL)) {
/*
* Only allocate one large object at a time, because it's quite
* expensive to create one and not use it.
*/
ret = arena_malloc_large(tcache->arena, size, zero);
ret = arena_malloc_large(tcache->arena, usize, zero);
if (ret == NULL)
return (NULL);
} else {
if (config_prof && prof_promote && size == PAGE) {
if (config_prof && usize == LARGE_MINCLASS) {
arena_chunk_t *chunk =
(arena_chunk_t *)CHUNK_ADDR2BASE(ret);
size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >>
@@ -362,23 +305,20 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
arena_mapbits_large_binind_set(chunk, pageind,
BININD_INVALID);
}
if (zero == false) {
if (likely(!zero)) {
if (config_fill) {
if (opt_junk)
memset(ret, 0xa5, size);
else if (opt_zero)
memset(ret, 0, size);
if (unlikely(opt_junk_alloc))
memset(ret, 0xa5, usize);
else if (unlikely(opt_zero))
memset(ret, 0, usize);
}
VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
} else {
VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
memset(ret, 0, size);
}
} else
memset(ret, 0, usize);
if (config_stats)
tbin->tstats.nrequests++;
if (config_prof)
tcache->prof_accumbytes += size;
tcache->prof_accumbytes += usize;
}
tcache_event(tcache);
@@ -386,19 +326,19 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
}
JEMALLOC_ALWAYS_INLINE void
tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind)
tcache_dalloc_small(tcache_t *tcache, void *ptr, index_t binind)
{
tcache_bin_t *tbin;
tcache_bin_info_t *tbin_info;
assert(tcache_salloc(ptr) <= SMALL_MAXCLASS);
if (config_fill && opt_junk)
if (config_fill && unlikely(opt_junk_free))
arena_dalloc_junk_small(ptr, &arena_bin_info[binind]);
tbin = &tcache->tbins[binind];
tbin_info = &tcache_bin_info[binind];
if (tbin->ncached == tbin_info->ncached_max) {
if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >>
1), tcache);
}
@@ -412,7 +352,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind)
JEMALLOC_ALWAYS_INLINE void
tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
{
size_t binind;
index_t binind;
tcache_bin_t *tbin;
tcache_bin_info_t *tbin_info;
@@ -420,14 +360,14 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
assert(tcache_salloc(ptr) > SMALL_MAXCLASS);
assert(tcache_salloc(ptr) <= tcache_maxclass);
binind = NBINS + (size >> LG_PAGE) - 1;
binind = size2index(size);
if (config_fill && opt_junk)
memset(ptr, 0x5a, size);
if (config_fill && unlikely(opt_junk_free))
arena_dalloc_junk_large(ptr, size);
tbin = &tcache->tbins[binind];
tbin_info = &tcache_bin_info[binind];
if (tbin->ncached == tbin_info->ncached_max) {
if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >>
1), tcache);
}

View File

@@ -2,7 +2,7 @@
#ifdef JEMALLOC_H_TYPES
/* Maximum number of malloc_tsd users with cleanup functions. */
#define MALLOC_TSD_CLEANUPS_MAX 8
#define MALLOC_TSD_CLEANUPS_MAX 2
typedef bool (*malloc_tsd_cleanup_t)(void);
@@ -12,9 +12,18 @@ typedef struct tsd_init_block_s tsd_init_block_t;
typedef struct tsd_init_head_s tsd_init_head_t;
#endif
typedef struct tsd_s tsd_t;
typedef enum {
tsd_state_uninitialized,
tsd_state_nominal,
tsd_state_purgatory,
tsd_state_reincarnated
} tsd_state_t;
/*
* TLS/TSD-agnostic macro-based implementation of thread-specific data. There
* are four macros that support (at least) three use cases: file-private,
* are five macros that support (at least) three use cases: file-private,
* library-private, and library-private inlined. Following is an example
* library-private tsd variable:
*
@@ -24,34 +33,36 @@ typedef struct tsd_init_head_s tsd_init_head_t;
* int y;
* } example_t;
* #define EX_INITIALIZER JEMALLOC_CONCAT({0, 0})
* malloc_tsd_protos(, example, example_t *)
* malloc_tsd_externs(example, example_t *)
* malloc_tsd_types(example_, example_t)
* malloc_tsd_protos(, example_, example_t)
* malloc_tsd_externs(example_, example_t)
* In example.c:
* malloc_tsd_data(, example, example_t *, EX_INITIALIZER)
* malloc_tsd_funcs(, example, example_t *, EX_INITIALIZER,
* malloc_tsd_data(, example_, example_t, EX_INITIALIZER)
* malloc_tsd_funcs(, example_, example_t, EX_INITIALIZER,
* example_tsd_cleanup)
*
* The result is a set of generated functions, e.g.:
*
* bool example_tsd_boot(void) {...}
* example_t **example_tsd_get() {...}
* void example_tsd_set(example_t **val) {...}
* example_t *example_tsd_get() {...}
* void example_tsd_set(example_t *val) {...}
*
* Note that all of the functions deal in terms of (a_type *) rather than
* (a_type) so that it is possible to support non-pointer types (unlike
* pthreads TSD). example_tsd_cleanup() is passed an (a_type *) pointer that is
* cast to (void *). This means that the cleanup function needs to cast *and*
* dereference the function argument, e.g.:
* cast to (void *). This means that the cleanup function needs to cast the
* function argument to (a_type *), then dereference the resulting pointer to
* access fields, e.g.
*
* void
* example_tsd_cleanup(void *arg)
* {
* example_t *example = *(example_t **)arg;
* example_t *example = (example_t *)arg;
*
* example->x = 42;
* [...]
* if ([want the cleanup function to be called again]) {
* example_tsd_set(&example);
* }
* if ([want the cleanup function to be called again])
* example_tsd_set(example);
* }
*
* If example_tsd_set() is called within example_tsd_cleanup(), it will be
@@ -60,63 +71,96 @@ typedef struct tsd_init_head_s tsd_init_head_t;
* non-NULL.
*/
/* malloc_tsd_types(). */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#define malloc_tsd_types(a_name, a_type)
#elif (defined(JEMALLOC_TLS))
#define malloc_tsd_types(a_name, a_type)
#elif (defined(_WIN32))
#define malloc_tsd_types(a_name, a_type) \
typedef struct { \
bool initialized; \
a_type val; \
} a_name##tsd_wrapper_t;
#else
#define malloc_tsd_types(a_name, a_type) \
typedef struct { \
bool initialized; \
a_type val; \
} a_name##tsd_wrapper_t;
#endif
/* malloc_tsd_protos(). */
#define malloc_tsd_protos(a_attr, a_name, a_type) \
a_attr bool \
a_name##_tsd_boot(void); \
a_attr a_type * \
a_name##_tsd_get(void); \
a_name##tsd_boot0(void); \
a_attr void \
a_name##_tsd_set(a_type *val);
a_name##tsd_boot1(void); \
a_attr bool \
a_name##tsd_boot(void); \
a_attr a_type * \
a_name##tsd_get(void); \
a_attr void \
a_name##tsd_set(a_type *val);
/* malloc_tsd_externs(). */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#define malloc_tsd_externs(a_name, a_type) \
extern __thread a_type a_name##_tls; \
extern __thread bool a_name##_initialized; \
extern bool a_name##_booted;
extern __thread a_type a_name##tsd_tls; \
extern __thread bool a_name##tsd_initialized; \
extern bool a_name##tsd_booted;
#elif (defined(JEMALLOC_TLS))
#define malloc_tsd_externs(a_name, a_type) \
extern __thread a_type a_name##_tls; \
extern pthread_key_t a_name##_tsd; \
extern bool a_name##_booted;
extern __thread a_type a_name##tsd_tls; \
extern pthread_key_t a_name##tsd_tsd; \
extern bool a_name##tsd_booted;
#elif (defined(_WIN32))
#define malloc_tsd_externs(a_name, a_type) \
extern DWORD a_name##_tsd; \
extern bool a_name##_booted;
extern DWORD a_name##tsd_tsd; \
extern a_name##tsd_wrapper_t a_name##tsd_boot_wrapper; \
extern bool a_name##tsd_booted;
#else
#define malloc_tsd_externs(a_name, a_type) \
extern pthread_key_t a_name##_tsd; \
extern tsd_init_head_t a_name##_tsd_init_head; \
extern bool a_name##_booted;
extern pthread_key_t a_name##tsd_tsd; \
extern tsd_init_head_t a_name##tsd_init_head; \
extern a_name##tsd_wrapper_t a_name##tsd_boot_wrapper; \
extern bool a_name##tsd_booted;
#endif
/* malloc_tsd_data(). */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \
a_attr __thread a_type JEMALLOC_TLS_MODEL \
a_name##_tls = a_initializer; \
a_name##tsd_tls = a_initializer; \
a_attr __thread bool JEMALLOC_TLS_MODEL \
a_name##_initialized = false; \
a_attr bool a_name##_booted = false;
a_name##tsd_initialized = false; \
a_attr bool a_name##tsd_booted = false;
#elif (defined(JEMALLOC_TLS))
#define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \
a_attr __thread a_type JEMALLOC_TLS_MODEL \
a_name##_tls = a_initializer; \
a_attr pthread_key_t a_name##_tsd; \
a_attr bool a_name##_booted = false;
a_name##tsd_tls = a_initializer; \
a_attr pthread_key_t a_name##tsd_tsd; \
a_attr bool a_name##tsd_booted = false;
#elif (defined(_WIN32))
#define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \
a_attr DWORD a_name##_tsd; \
a_attr bool a_name##_booted = false;
a_attr DWORD a_name##tsd_tsd; \
a_attr a_name##tsd_wrapper_t a_name##tsd_boot_wrapper = { \
false, \
a_initializer \
}; \
a_attr bool a_name##tsd_booted = false;
#else
#define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \
a_attr pthread_key_t a_name##_tsd; \
a_attr tsd_init_head_t a_name##_tsd_init_head = { \
a_attr pthread_key_t a_name##tsd_tsd; \
a_attr tsd_init_head_t a_name##tsd_init_head = { \
ql_head_initializer(blocks), \
MALLOC_MUTEX_INITIALIZER \
}; \
a_attr bool a_name##_booted = false;
a_attr a_name##tsd_wrapper_t a_name##tsd_boot_wrapper = { \
false, \
a_initializer \
}; \
a_attr bool a_name##tsd_booted = false;
#endif
/* malloc_tsd_funcs(). */
@@ -125,75 +169,100 @@ a_attr bool a_name##_booted = false;
a_cleanup) \
/* Initialization/cleanup. */ \
a_attr bool \
a_name##_tsd_cleanup_wrapper(void) \
a_name##tsd_cleanup_wrapper(void) \
{ \
\
if (a_name##_initialized) { \
a_name##_initialized = false; \
a_cleanup(&a_name##_tls); \
if (a_name##tsd_initialized) { \
a_name##tsd_initialized = false; \
a_cleanup(&a_name##tsd_tls); \
} \
return (a_name##_initialized); \
return (a_name##tsd_initialized); \
} \
a_attr bool \
a_name##_tsd_boot(void) \
a_name##tsd_boot0(void) \
{ \
\
if (a_cleanup != malloc_tsd_no_cleanup) { \
malloc_tsd_cleanup_register( \
&a_name##_tsd_cleanup_wrapper); \
&a_name##tsd_cleanup_wrapper); \
} \
a_name##_booted = true; \
a_name##tsd_booted = true; \
return (false); \
} \
a_attr void \
a_name##tsd_boot1() \
{ \
\
/* Do nothing. */ \
} \
a_attr bool \
a_name##tsd_boot(void) \
{ \
\
return (a_name##tsd_boot0()); \
} \
/* Get/set. */ \
a_attr a_type * \
a_name##_tsd_get(void) \
a_name##tsd_get(void) \
{ \
\
assert(a_name##_booted); \
return (&a_name##_tls); \
assert(a_name##tsd_booted); \
return (&a_name##tsd_tls); \
} \
a_attr void \
a_name##_tsd_set(a_type *val) \
a_name##tsd_set(a_type *val) \
{ \
\
assert(a_name##_booted); \
a_name##_tls = (*val); \
assert(a_name##tsd_booted); \
a_name##tsd_tls = (*val); \
if (a_cleanup != malloc_tsd_no_cleanup) \
a_name##_initialized = true; \
a_name##tsd_initialized = true; \
}
#elif (defined(JEMALLOC_TLS))
#define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \
a_cleanup) \
/* Initialization/cleanup. */ \
a_attr bool \
a_name##_tsd_boot(void) \
a_name##tsd_boot0(void) \
{ \
\
if (a_cleanup != malloc_tsd_no_cleanup) { \
if (pthread_key_create(&a_name##_tsd, a_cleanup) != 0) \
if (pthread_key_create(&a_name##tsd_tsd, a_cleanup) != \
0) \
return (true); \
} \
a_name##_booted = true; \
a_name##tsd_booted = true; \
return (false); \
} \
a_attr void \
a_name##tsd_boot1() \
{ \
\
/* Do nothing. */ \
} \
a_attr bool \
a_name##tsd_boot(void) \
{ \
\
return (a_name##tsd_boot0()); \
} \
/* Get/set. */ \
a_attr a_type * \
a_name##_tsd_get(void) \
a_name##tsd_get(void) \
{ \
\
assert(a_name##_booted); \
return (&a_name##_tls); \
assert(a_name##tsd_booted); \
return (&a_name##tsd_tls); \
} \
a_attr void \
a_name##_tsd_set(a_type *val) \
a_name##tsd_set(a_type *val) \
{ \
\
assert(a_name##_booted); \
a_name##_tls = (*val); \
assert(a_name##tsd_booted); \
a_name##tsd_tls = (*val); \
if (a_cleanup != malloc_tsd_no_cleanup) { \
if (pthread_setspecific(a_name##_tsd, \
(void *)(&a_name##_tls))) { \
if (pthread_setspecific(a_name##tsd_tsd, \
(void *)(&a_name##tsd_tls))) { \
malloc_write("<jemalloc>: Error" \
" setting TSD for "#a_name"\n"); \
if (opt_abort) \
@@ -204,27 +273,19 @@ a_name##_tsd_set(a_type *val) \
#elif (defined(_WIN32))
#define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \
a_cleanup) \
/* Data structure. */ \
typedef struct { \
bool initialized; \
a_type val; \
} a_name##_tsd_wrapper_t; \
/* Initialization/cleanup. */ \
a_attr bool \
a_name##_tsd_cleanup_wrapper(void) \
a_name##tsd_cleanup_wrapper(void) \
{ \
a_name##_tsd_wrapper_t *wrapper; \
a_name##tsd_wrapper_t *wrapper; \
\
wrapper = (a_name##_tsd_wrapper_t *) TlsGetValue(a_name##_tsd); \
wrapper = (a_name##tsd_wrapper_t *)TlsGetValue(a_name##tsd_tsd);\
if (wrapper == NULL) \
return (false); \
if (a_cleanup != malloc_tsd_no_cleanup && \
wrapper->initialized) { \
a_type val = wrapper->val; \
a_type tsd_static_data = a_initializer; \
wrapper->initialized = false; \
wrapper->val = tsd_static_data; \
a_cleanup(&val); \
a_cleanup(&wrapper->val); \
if (wrapper->initialized) { \
/* Trigger another cleanup round. */ \
return (true); \
@@ -233,63 +294,93 @@ a_name##_tsd_cleanup_wrapper(void) \
malloc_tsd_dalloc(wrapper); \
return (false); \
} \
a_attr bool \
a_name##_tsd_boot(void) \
a_attr void \
a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper) \
{ \
\
a_name##_tsd = TlsAlloc(); \
if (a_name##_tsd == TLS_OUT_OF_INDEXES) \
return (true); \
if (a_cleanup != malloc_tsd_no_cleanup) { \
malloc_tsd_cleanup_register( \
&a_name##_tsd_cleanup_wrapper); \
if (!TlsSetValue(a_name##tsd_tsd, (void *)wrapper)) { \
malloc_write("<jemalloc>: Error setting" \
" TSD for "#a_name"\n"); \
abort(); \
} \
a_name##_booted = true; \
return (false); \
} \
/* Get/set. */ \
a_attr a_name##_tsd_wrapper_t * \
a_name##_tsd_get_wrapper(void) \
a_attr a_name##tsd_wrapper_t * \
a_name##tsd_wrapper_get(void) \
{ \
a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *) \
TlsGetValue(a_name##_tsd); \
a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \
TlsGetValue(a_name##tsd_tsd); \
\
if (wrapper == NULL) { \
wrapper = (a_name##_tsd_wrapper_t *) \
malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t)); \
if (unlikely(wrapper == NULL)) { \
wrapper = (a_name##tsd_wrapper_t *) \
malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \
if (wrapper == NULL) { \
malloc_write("<jemalloc>: Error allocating" \
" TSD for "#a_name"\n"); \
abort(); \
} else { \
static a_type tsd_static_data = a_initializer; \
wrapper->initialized = false; \
wrapper->val = tsd_static_data; \
} \
if (!TlsSetValue(a_name##_tsd, (void *)wrapper)) { \
malloc_write("<jemalloc>: Error setting" \
" TSD for "#a_name"\n"); \
abort(); \
wrapper->val = a_initializer; \
} \
a_name##tsd_wrapper_set(wrapper); \
} \
return (wrapper); \
} \
a_attr a_type * \
a_name##_tsd_get(void) \
a_attr bool \
a_name##tsd_boot0(void) \
{ \
a_name##_tsd_wrapper_t *wrapper; \
\
assert(a_name##_booted); \
wrapper = a_name##_tsd_get_wrapper(); \
a_name##tsd_tsd = TlsAlloc(); \
if (a_name##tsd_tsd == TLS_OUT_OF_INDEXES) \
return (true); \
if (a_cleanup != malloc_tsd_no_cleanup) { \
malloc_tsd_cleanup_register( \
&a_name##tsd_cleanup_wrapper); \
} \
a_name##tsd_wrapper_set(&a_name##tsd_boot_wrapper); \
a_name##tsd_booted = true; \
return (false); \
} \
a_attr void \
a_name##tsd_boot1() \
{ \
a_name##tsd_wrapper_t *wrapper; \
wrapper = (a_name##tsd_wrapper_t *) \
malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \
if (wrapper == NULL) { \
malloc_write("<jemalloc>: Error allocating" \
" TSD for "#a_name"\n"); \
abort(); \
} \
memcpy(wrapper, &a_name##tsd_boot_wrapper, \
sizeof(a_name##tsd_wrapper_t)); \
a_name##tsd_wrapper_set(wrapper); \
} \
a_attr bool \
a_name##tsd_boot(void) \
{ \
\
if (a_name##tsd_boot0()) \
return (true); \
a_name##tsd_boot1(); \
return (false); \
} \
/* Get/set. */ \
a_attr a_type * \
a_name##tsd_get(void) \
{ \
a_name##tsd_wrapper_t *wrapper; \
\
assert(a_name##tsd_booted); \
wrapper = a_name##tsd_wrapper_get(); \
return (&wrapper->val); \
} \
a_attr void \
a_name##_tsd_set(a_type *val) \
a_name##tsd_set(a_type *val) \
{ \
a_name##_tsd_wrapper_t *wrapper; \
a_name##tsd_wrapper_t *wrapper; \
\
assert(a_name##_booted); \
wrapper = a_name##_tsd_get_wrapper(); \
assert(a_name##tsd_booted); \
wrapper = a_name##tsd_wrapper_get(); \
wrapper->val = *(val); \
if (a_cleanup != malloc_tsd_no_cleanup) \
wrapper->initialized = true; \
@@ -297,16 +388,11 @@ a_name##_tsd_set(a_type *val) \
#else
#define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \
a_cleanup) \
/* Data structure. */ \
typedef struct { \
bool initialized; \
a_type val; \
} a_name##_tsd_wrapper_t; \
/* Initialization/cleanup. */ \
a_attr void \
a_name##_tsd_cleanup_wrapper(void *arg) \
a_name##tsd_cleanup_wrapper(void *arg) \
{ \
a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *)arg;\
a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *)arg; \
\
if (a_cleanup != malloc_tsd_no_cleanup && \
wrapper->initialized) { \
@@ -314,7 +400,7 @@ a_name##_tsd_cleanup_wrapper(void *arg) \
a_cleanup(&wrapper->val); \
if (wrapper->initialized) { \
/* Trigger another cleanup round. */ \
if (pthread_setspecific(a_name##_tsd, \
if (pthread_setspecific(a_name##tsd_tsd, \
(void *)wrapper)) { \
malloc_write("<jemalloc>: Error" \
" setting TSD for "#a_name"\n"); \
@@ -326,67 +412,97 @@ a_name##_tsd_cleanup_wrapper(void *arg) \
} \
malloc_tsd_dalloc(wrapper); \
} \
a_attr bool \
a_name##_tsd_boot(void) \
a_attr void \
a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper) \
{ \
\
if (pthread_key_create(&a_name##_tsd, \
a_name##_tsd_cleanup_wrapper) != 0) \
return (true); \
a_name##_booted = true; \
return (false); \
if (pthread_setspecific(a_name##tsd_tsd, \
(void *)wrapper)) { \
malloc_write("<jemalloc>: Error setting" \
" TSD for "#a_name"\n"); \
abort(); \
} \
} \
/* Get/set. */ \
a_attr a_name##_tsd_wrapper_t * \
a_name##_tsd_get_wrapper(void) \
a_attr a_name##tsd_wrapper_t * \
a_name##tsd_wrapper_get(void) \
{ \
a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *) \
pthread_getspecific(a_name##_tsd); \
a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \
pthread_getspecific(a_name##tsd_tsd); \
\
if (wrapper == NULL) { \
if (unlikely(wrapper == NULL)) { \
tsd_init_block_t block; \
wrapper = tsd_init_check_recursion( \
&a_name##_tsd_init_head, &block); \
&a_name##tsd_init_head, &block); \
if (wrapper) \
return (wrapper); \
wrapper = (a_name##_tsd_wrapper_t *) \
malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t)); \
wrapper = (a_name##tsd_wrapper_t *) \
malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \
block.data = wrapper; \
if (wrapper == NULL) { \
malloc_write("<jemalloc>: Error allocating" \
" TSD for "#a_name"\n"); \
abort(); \
} else { \
static a_type tsd_static_data = a_initializer; \
wrapper->initialized = false; \
wrapper->val = tsd_static_data; \
wrapper->val = a_initializer; \
} \
if (pthread_setspecific(a_name##_tsd, \
(void *)wrapper)) { \
malloc_write("<jemalloc>: Error setting" \
" TSD for "#a_name"\n"); \
abort(); \
} \
tsd_init_finish(&a_name##_tsd_init_head, &block); \
a_name##tsd_wrapper_set(wrapper); \
tsd_init_finish(&a_name##tsd_init_head, &block); \
} \
return (wrapper); \
} \
a_attr a_type * \
a_name##_tsd_get(void) \
a_attr bool \
a_name##tsd_boot0(void) \
{ \
a_name##_tsd_wrapper_t *wrapper; \
\
assert(a_name##_booted); \
wrapper = a_name##_tsd_get_wrapper(); \
if (pthread_key_create(&a_name##tsd_tsd, \
a_name##tsd_cleanup_wrapper) != 0) \
return (true); \
a_name##tsd_wrapper_set(&a_name##tsd_boot_wrapper); \
a_name##tsd_booted = true; \
return (false); \
} \
a_attr void \
a_name##tsd_boot1() \
{ \
a_name##tsd_wrapper_t *wrapper; \
wrapper = (a_name##tsd_wrapper_t *) \
malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \
if (wrapper == NULL) { \
malloc_write("<jemalloc>: Error allocating" \
" TSD for "#a_name"\n"); \
abort(); \
} \
memcpy(wrapper, &a_name##tsd_boot_wrapper, \
sizeof(a_name##tsd_wrapper_t)); \
a_name##tsd_wrapper_set(wrapper); \
} \
a_attr bool \
a_name##tsd_boot(void) \
{ \
\
if (a_name##tsd_boot0()) \
return (true); \
a_name##tsd_boot1(); \
return (false); \
} \
/* Get/set. */ \
a_attr a_type * \
a_name##tsd_get(void) \
{ \
a_name##tsd_wrapper_t *wrapper; \
\
assert(a_name##tsd_booted); \
wrapper = a_name##tsd_wrapper_get(); \
return (&wrapper->val); \
} \
a_attr void \
a_name##_tsd_set(a_type *val) \
a_name##tsd_set(a_type *val) \
{ \
a_name##_tsd_wrapper_t *wrapper; \
a_name##tsd_wrapper_t *wrapper; \
\
assert(a_name##_booted); \
wrapper = a_name##_tsd_get_wrapper(); \
assert(a_name##tsd_booted); \
wrapper = a_name##tsd_wrapper_get(); \
wrapper->val = *(val); \
if (a_cleanup != malloc_tsd_no_cleanup) \
wrapper->initialized = true; \
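
The non-TLS pthread variant above boils down to a typed wrapper around pthread_getspecific()/pthread_setspecific() with lazy slot allocation. A minimal, self-contained distillation of that pattern (all names invented for illustration; this is not the generated code):

/*
 * Sketch of the pattern malloc_tsd_data()/malloc_tsd_funcs() generate
 * for the pthread case: demo_key plays the role of a_name##tsd_tsd,
 * demo_booted of a_name##tsd_booted.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { long count; } demo_t;		/* stand-in for a_type */

static pthread_key_t demo_key;
static bool demo_booted = false;

static bool
demo_boot(void)					/* cf. a_name##tsd_boot0() */
{
	if (pthread_key_create(&demo_key, free) != 0)
		return (true);
	demo_booted = true;
	return (false);
}

static demo_t *
demo_get(void)					/* cf. a_name##tsd_get() */
{
	demo_t *v = (demo_t *)pthread_getspecific(demo_key);

	if (v == NULL) {			/* lazy wrapper allocation */
		v = (demo_t *)calloc(1, sizeof(*v));
		if (v == NULL || pthread_setspecific(demo_key, v) != 0)
			abort();
	}
	return (v);
}

int
main(void)
{
	if (demo_boot())
		return (1);
	demo_get()->count++;
	printf("count=%ld\n", demo_get()->count);
	return (0);
}
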
@@ -410,25 +526,136 @@ struct tsd_init_head_s {
};
#endif
#define MALLOC_TSD \
/* O(name, type) */ \
O(tcache, tcache_t *) \
O(thread_allocated, uint64_t) \
O(thread_deallocated, uint64_t) \
O(prof_tdata, prof_tdata_t *) \
O(arena, arena_t *) \
O(arenas_cache, arena_t **) \
O(narenas_cache, unsigned) \
O(arenas_cache_bypass, bool) \
O(tcache_enabled, tcache_enabled_t) \
O(quarantine, quarantine_t *) \
#define TSD_INITIALIZER { \
tsd_state_uninitialized, \
NULL, \
0, \
0, \
NULL, \
NULL, \
NULL, \
0, \
false, \
tcache_enabled_default, \
NULL \
}
struct tsd_s {
tsd_state_t state;
#define O(n, t) \
t n;
MALLOC_TSD
#undef O
};
static const tsd_t tsd_initializer = TSD_INITIALIZER;
malloc_tsd_types(, tsd_t)
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
void malloc_tsd_no_cleanup(void *);
void malloc_tsd_no_cleanup(void *arg);
void malloc_tsd_cleanup_register(bool (*f)(void));
void malloc_tsd_boot(void);
bool malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
#if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \
!defined(_WIN32))
void *tsd_init_check_recursion(tsd_init_head_t *head,
tsd_init_block_t *block);
void tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block);
#endif
void tsd_cleanup(void *arg);
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), , tsd_t)
tsd_t *tsd_fetch(void);
bool tsd_nominal(tsd_t *tsd);
#define O(n, t) \
t *tsd_##n##p_get(tsd_t *tsd); \
t tsd_##n##_get(tsd_t *tsd); \
void tsd_##n##_set(tsd_t *tsd, t n);
MALLOC_TSD
#undef O
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TSD_C_))
malloc_tsd_externs(, tsd_t)
malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, , tsd_t, tsd_initializer, tsd_cleanup)
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch(void)
{
tsd_t *tsd = tsd_get();
if (unlikely(tsd->state != tsd_state_nominal)) {
if (tsd->state == tsd_state_uninitialized) {
tsd->state = tsd_state_nominal;
/* Trigger cleanup handler registration. */
tsd_set(tsd);
} else if (tsd->state == tsd_state_purgatory) {
tsd->state = tsd_state_reincarnated;
tsd_set(tsd);
} else
assert(tsd->state == tsd_state_reincarnated);
}
return (tsd);
}
JEMALLOC_INLINE bool
tsd_nominal(tsd_t *tsd)
{
return (tsd->state == tsd_state_nominal);
}
#define O(n, t) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get(tsd_t *tsd) \
{ \
\
return (&tsd->n); \
} \
\
JEMALLOC_ALWAYS_INLINE t \
tsd_##n##_get(tsd_t *tsd) \
{ \
\
return (*tsd_##n##p_get(tsd)); \
} \
\
JEMALLOC_ALWAYS_INLINE void \
tsd_##n##_set(tsd_t *tsd, t n) \
{ \
\
assert(tsd->state == tsd_state_nominal); \
tsd->n = n; \
}
MALLOC_TSD
#undef O
#endif
#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
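
The MALLOC_TSD list above is an x-macro: a single (name, type) list expands once into struct fields and again into typed get/set accessors. A self-contained sketch of the same technique, with invented field names:

/* X-macro demo: one field list, two expansions. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_FIELDS \
    O(nallocs, uint64_t) \
    O(label, const char *)

typedef struct {
#define O(n, t)	t n;
	DEMO_FIELDS
#undef O
} demo_tsd_t;

#define O(n, t) \
static t demo_##n##_get(demo_tsd_t *d) { return (d->n); } \
static void demo_##n##_set(demo_tsd_t *d, t v) { d->n = v; }
DEMO_FIELDS
#undef O

int
main(void)
{
	demo_tsd_t d = {0, NULL};

	demo_label_set(&d, "tsd");
	demo_nallocs_set(&d, demo_nallocs_get(&d) + 1);
	printf("%s: %llu\n", demo_label_get(&d),
	    (unsigned long long)demo_nallocs_get(&d));
	return (0);
}
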


@@ -27,13 +27,21 @@
# define JEMALLOC_CC_SILENCE_INIT(v)
#endif
#ifdef __GNUC__
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define likely(x) !!(x)
#define unlikely(x) !!(x)
#endif
/*
* Define a custom assert() in order to reduce the chances of deadlock during
* assertion failure.
*/
#ifndef assert
#define assert(e) do { \
if (config_debug && !(e)) { \
if (unlikely(config_debug && !(e))) { \
malloc_printf( \
"<jemalloc>: %s:%d: Failed assertion: \"%s\"\n", \
__FILE__, __LINE__, #e); \
@@ -65,14 +73,14 @@
#ifndef assert_not_implemented
#define assert_not_implemented(e) do { \
if (config_debug && !(e)) \
if (unlikely(config_debug && !(e))) \
not_implemented(); \
} while (0)
#endif
/* Use to assert a particular configuration, e.g., cassert(config_debug). */
#define cassert(c) do { \
if ((c) == false) \
if (unlikely(!(c))) \
not_reached(); \
} while (0)
@@ -112,13 +120,14 @@ void malloc_printf(const char *format, ...)
int jemalloc_ffsl(long bitmap);
int jemalloc_ffs(int bitmap);
size_t pow2_ceil(size_t x);
size_t lg_floor(size_t x);
void set_errno(int errnum);
int get_errno(void);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_UTIL_C_))
/* Sanity check: */
/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSL) || !defined(JEMALLOC_INTERNAL_FFS)
# error Both JEMALLOC_INTERNAL_FFSL && JEMALLOC_INTERNAL_FFS should have been defined by configure
#endif
@@ -155,7 +164,74 @@ pow2_ceil(size_t x)
return (x);
}
/* Sets error code */
#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
JEMALLOC_INLINE size_t
lg_floor(size_t x)
{
size_t ret;
asm ("bsr %1, %0"
: "=r"(ret) // Outputs.
: "r"(x) // Inputs.
);
return (ret);
}
#elif (defined(_MSC_VER))
JEMALLOC_INLINE size_t
lg_floor(size_t x)
{
unsigned long ret;
#if (LG_SIZEOF_PTR == 3)
_BitScanReverse64(&ret, x);
#elif (LG_SIZEOF_PTR == 2)
_BitScanReverse(&ret, x);
#else
# error "Unsupported type sizes for lg_floor()"
#endif
return (ret);
}
#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
JEMALLOC_INLINE size_t
lg_floor(size_t x)
{
#if (LG_SIZEOF_PTR == LG_SIZEOF_INT)
return (((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x));
#elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG)
return (((8 << LG_SIZEOF_PTR) - 1) - __builtin_clzl(x));
#else
# error "Unsupported type sizes for lg_floor()"
#endif
}
#else
JEMALLOC_INLINE size_t
lg_floor(size_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
#if (LG_SIZEOF_PTR == 3 && LG_SIZEOF_PTR == LG_SIZEOF_LONG)
x |= (x >> 32);
if (x == KZU(0xffffffffffffffff))
return (63);
x++;
return (jemalloc_ffsl(x) - 2);
#elif (LG_SIZEOF_PTR == 2)
if (x == KZU(0xffffffff))
return (31);
x++;
return (jemalloc_ffs(x) - 2);
#else
# error "Unsupported type sizes for lg_floor()"
#endif
}
#endif
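
All four lg_floor() variants compute floor(log2(x)) for x > 0; only the instruction used differs. A worked, self-contained version of the __builtin_clz branch (assumes GCC/Clang and that size_t fits in unsigned long):

#include <assert.h>
#include <limits.h>
#include <stdio.h>

static size_t
demo_lg_floor(size_t x)
{
	assert(x > 0);
	/* Bit index of the highest set bit == floor(log2(x)). */
	return ((sizeof(unsigned long) * CHAR_BIT - 1) -
	    (size_t)__builtin_clzl((unsigned long)x));
}

int
main(void)
{
	/* 1 -> 0, 3 -> 1, 4 -> 2, 2^20 -> 20. */
	printf("%zu %zu %zu %zu\n", demo_lg_floor(1), demo_lg_floor(3),
	    demo_lg_floor(4), demo_lg_floor((size_t)1 << 20));
	return (0);
}
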
/* Set error code. */
JEMALLOC_INLINE void
set_errno(int errnum)
{
@@ -167,7 +243,7 @@ set_errno(int errnum)
#endif
}
/* Get last error code */
/* Get last error code. */
JEMALLOC_INLINE int
get_errno(void)
{


@@ -0,0 +1,112 @@
/******************************************************************************/
#ifdef JEMALLOC_H_TYPES
#ifdef JEMALLOC_VALGRIND
#include <valgrind/valgrind.h>
/*
* The size that is reported to Valgrind must be consistent through a chain of
* malloc..realloc..realloc calls. Request size isn't recorded anywhere in
* jemalloc, so it is critical that all callers of these macros provide usize
* rather than request size. As a result, buffer overflow detection is
* technically weakened for the standard API, though it is generally accepted
* practice to consider any extra bytes reported by malloc_usable_size() as
* usable space.
*/
#define JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(ptr, usize) do { \
if (unlikely(in_valgrind)) \
valgrind_make_mem_noaccess(ptr, usize); \
} while (0)
#define JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize) do { \
if (unlikely(in_valgrind)) \
valgrind_make_mem_undefined(ptr, usize); \
} while (0)
#define JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ptr, usize) do { \
if (unlikely(in_valgrind)) \
valgrind_make_mem_defined(ptr, usize); \
} while (0)
/*
* The VALGRIND_MALLOCLIKE_BLOCK() and VALGRIND_RESIZEINPLACE_BLOCK() macro
* calls must be embedded in macros rather than in functions so that when
* Valgrind reports errors, there are no extra stack frames in the backtraces.
*/
#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do { \
if (unlikely(in_valgrind && cond)) \
VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, p2rz(ptr), zero); \
} while (0)
#define JEMALLOC_VALGRIND_REALLOC(maybe_moved, ptr, usize, \
ptr_maybe_null, old_ptr, old_usize, old_rzsize, old_ptr_maybe_null, \
zero) do { \
if (unlikely(in_valgrind)) { \
size_t rzsize = p2rz(ptr); \
\
if (!maybe_moved || ptr == old_ptr) { \
VALGRIND_RESIZEINPLACE_BLOCK(ptr, old_usize, \
usize, rzsize); \
if (zero && old_usize < usize) { \
valgrind_make_mem_defined( \
(void *)((uintptr_t)ptr + \
old_usize), usize - old_usize); \
} \
} else { \
if (!old_ptr_maybe_null || old_ptr != NULL) { \
valgrind_freelike_block(old_ptr, \
old_rzsize); \
} \
if (!ptr_maybe_null || ptr != NULL) { \
size_t copy_size = (old_usize < usize) \
? old_usize : usize; \
size_t tail_size = usize - copy_size; \
VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, \
rzsize, false); \
if (copy_size > 0) { \
valgrind_make_mem_defined(ptr, \
copy_size); \
} \
if (zero && tail_size > 0) { \
valgrind_make_mem_defined( \
(void *)((uintptr_t)ptr + \
copy_size), tail_size); \
} \
} \
} \
} \
} while (0)
#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do { \
if (unlikely(in_valgrind)) \
valgrind_freelike_block(ptr, rzsize); \
} while (0)
#else
#define RUNNING_ON_VALGRIND ((unsigned)0)
#define JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(ptr, usize) do {} while (0)
#define JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize) do {} while (0)
#define JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ptr, usize) do {} while (0)
#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do {} while (0)
#define JEMALLOC_VALGRIND_REALLOC(maybe_moved, ptr, usize, \
ptr_maybe_null, old_ptr, old_usize, old_rzsize, old_ptr_maybe_null, \
zero) do {} while (0)
#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do {} while (0)
#endif
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
#ifdef JEMALLOC_VALGRIND
void valgrind_make_mem_noaccess(void *ptr, size_t usize);
void valgrind_make_mem_undefined(void *ptr, size_t usize);
void valgrind_make_mem_defined(void *ptr, size_t usize);
void valgrind_freelike_block(void *ptr, size_t usize);
#endif
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES
#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
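
The caller-side contract behind these annotations is that every report uses usize, never the request size, so a malloc..realloc chain stays consistent. A compilable sketch with stub macros standing in for the real ones (demo_usize() is an invented stand-in for jemalloc's s2u()):

#include <stdio.h>
#include <stdlib.h>

/* Stubs so the example compiles anywhere; the real macros call
 * VALGRIND_MALLOCLIKE_BLOCK() and friends. */
#define DEMO_VALGRIND_MALLOC(ptr, usize) \
    printf("MALLOCLIKE %p usize=%zu\n", (void *)(ptr), (size_t)(usize))
#define DEMO_VALGRIND_FREE(ptr) \
    printf("FREELIKE %p\n", (void *)(ptr))

/* Hypothetical size-class rounding (16-byte classes). */
static size_t demo_usize(size_t request) { return ((request + 15) & ~(size_t)15); }

int
main(void)
{
	size_t request = 10;
	size_t usize = demo_usize(request);	/* 16, not 10 */
	void *p = malloc(usize);

	DEMO_VALGRIND_MALLOC(p, usize);		/* always report usize */
	DEMO_VALGRIND_FREE(p);
	free(p);
	return (0);
}
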


@@ -12,7 +12,7 @@ extern "C" {
EOF
for hdr in jemalloc_defs.h jemalloc_rename.h jemalloc_macros.h \
jemalloc_protos.h jemalloc_mangle.h ; do
jemalloc_protos.h jemalloc_typedefs.h jemalloc_mangle.h ; do
cat "${objroot}include/jemalloc/${hdr}" \
| grep -v 'Generated from .* by configure\.' \
| sed -e 's/^#define /#define /g' \


@@ -1,9 +1,6 @@
/* Defined if __attribute__((...)) syntax is supported. */
#undef JEMALLOC_HAVE_ATTR
/* Support the experimental API. */
#undef JEMALLOC_EXPERIMENTAL
/*
* Define overrides for non-standard allocator-related functions if they are
* present on the system.


@@ -1,3 +1,6 @@
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>
#include <strings.h>
@@ -19,23 +22,6 @@
/* Bias arena index bits so that 0 encodes "MALLOCX_ARENA() unspecified". */
# define MALLOCX_ARENA(a) ((int)(((a)+1) << 8))
#ifdef JEMALLOC_EXPERIMENTAL
# define ALLOCM_LG_ALIGN(la) (la)
# if LG_SIZEOF_PTR == 2
# define ALLOCM_ALIGN(a) (ffs(a)-1)
# else
# define ALLOCM_ALIGN(a) \
((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31)
# endif
# define ALLOCM_ZERO ((int)0x40)
# define ALLOCM_NO_MOVE ((int)0x80)
/* Bias arena index bits so that 0 encodes "ALLOCM_ARENA() unspecified". */
# define ALLOCM_ARENA(a) ((int)(((a)+1) << 8))
# define ALLOCM_SUCCESS 0
# define ALLOCM_ERR_OOM 1
# define ALLOCM_ERR_NOT_MOVED 2
#endif
#ifdef JEMALLOC_HAVE_ATTR
# define JEMALLOC_ATTR(s) __attribute__((s))
# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default"))


@@ -17,13 +17,17 @@ JEMALLOC_EXPORT void *@je_@aligned_alloc(size_t alignment, size_t size)
JEMALLOC_EXPORT void *@je_@realloc(void *ptr, size_t size);
JEMALLOC_EXPORT void @je_@free(void *ptr);
JEMALLOC_EXPORT void *@je_@mallocx(size_t size, int flags);
JEMALLOC_EXPORT void *@je_@mallocx(size_t size, int flags)
JEMALLOC_ATTR(malloc);
JEMALLOC_EXPORT void *@je_@rallocx(void *ptr, size_t size, int flags);
JEMALLOC_EXPORT size_t @je_@xallocx(void *ptr, size_t size, size_t extra,
int flags);
JEMALLOC_EXPORT size_t @je_@sallocx(const void *ptr, int flags);
JEMALLOC_EXPORT size_t @je_@sallocx(const void *ptr, int flags)
JEMALLOC_ATTR(pure);
JEMALLOC_EXPORT void @je_@dallocx(void *ptr, int flags);
JEMALLOC_EXPORT size_t @je_@nallocx(size_t size, int flags);
JEMALLOC_EXPORT void @je_@sdallocx(void *ptr, size_t size, int flags);
JEMALLOC_EXPORT size_t @je_@nallocx(size_t size, int flags)
JEMALLOC_ATTR(pure);
JEMALLOC_EXPORT int @je_@mallctl(const char *name, void *oldp,
size_t *oldlenp, void *newp, size_t newlen);
@@ -44,15 +48,3 @@ JEMALLOC_EXPORT void * @je_@memalign(size_t alignment, size_t size)
#ifdef JEMALLOC_OVERRIDE_VALLOC
JEMALLOC_EXPORT void * @je_@valloc(size_t size) JEMALLOC_ATTR(malloc);
#endif
#ifdef JEMALLOC_EXPERIMENTAL
JEMALLOC_EXPORT int @je_@allocm(void **ptr, size_t *rsize, size_t size,
int flags) JEMALLOC_ATTR(nonnull(1));
JEMALLOC_EXPORT int @je_@rallocm(void **ptr, size_t *rsize, size_t size,
size_t extra, int flags) JEMALLOC_ATTR(nonnull(1));
JEMALLOC_EXPORT int @je_@sallocm(const void *ptr, size_t *rsize, int flags)
JEMALLOC_ATTR(nonnull(1));
JEMALLOC_EXPORT int @je_@dallocm(void *ptr, int flags)
JEMALLOC_ATTR(nonnull(1));
JEMALLOC_EXPORT int @je_@nallocm(size_t *rsize, size_t size, int flags);
#endif
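
A usage sketch for the introspection entry points declared above: nallocx() predicts the usable size of a would-be allocation and sallocx() reports it for a live pointer (shown against an unprefixed build; this tree mangles the names with a je_ prefix):

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	size_t predicted = nallocx(100, 0);	/* rounded to a size class */
	void *p = mallocx(100, 0);
	size_t actual = sallocx(p, 0);

	/* Same size class, so predicted == actual. */
	printf("request=100 predicted=%zu actual=%zu\n", predicted, actual);
	dallocx(p, 0);
	return (0);
}
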


@@ -0,0 +1,2 @@
typedef void *(chunk_alloc_t)(void *, size_t, size_t, bool *, unsigned);
typedef bool (chunk_dalloc_t)(void *, size_t, unsigned);
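
These two typedefs define the signatures user-supplied chunk hooks must match. A hedged sketch of a conforming pair backed by mmap()/munmap() (how the hooks get installed is outside this header and not shown; a production hook would also have to enforce the alignment argument, which this sketch ignores):

#include <stdbool.h>
#include <stddef.h>
#include <sys/mman.h>

static void *
my_chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero,
    unsigned arena_ind)
{
	void *p;

	(void)alignment;	/* real hooks must over-map and trim */
	(void)arena_ind;
	p = mmap(new_addr, size, PROT_READ|PROT_WRITE,
	    MAP_PRIVATE|MAP_ANON, -1, 0);
	if (p == MAP_FAILED)
		return (NULL);
	/* Honor new_addr requests only if granted exactly. */
	if (new_addr != NULL && p != new_addr) {
		munmap(p, size);
		return (NULL);
	}
	*zero = true;		/* fresh anonymous mappings are zeroed */
	return (p);
}

static bool
my_chunk_dalloc(void *chunk, size_t size, unsigned arena_ind)
{
	(void)arena_ind;
	/* Return false on success, as chunk_dalloc_default() does. */
	return (munmap(chunk, size) != 0);
}
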


@@ -5,6 +5,8 @@
/* MSVC doesn't define _Bool or bool in C, but does have BOOL */
/* Note this doesn't pass autoconf's test because (bool) 0.5 != true */
/* Clang-cl uses MSVC headers, so needs msvc_compat, but has _Bool as
* a built-in type. */
#ifndef __clang__
typedef BOOL _Bool;
#endif


@@ -0,0 +1,11 @@
prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@
Name: jemalloc
Description: A general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support.
URL: http://www.canonware.com/jemalloc
Version: @jemalloc_version@
Cflags: -I${includedir}
Libs: -L${libdir} -ljemalloc

File diff suppressed because it is too large


@@ -16,24 +16,16 @@ static void *base_next_addr;
static void *base_past_addr; /* Addr immediately past base_pages. */
static extent_node_t *base_nodes;
/******************************************************************************/
/* Function prototypes for non-inline static functions. */
static bool base_pages_alloc(size_t minsize);
/******************************************************************************/
static bool
base_pages_alloc(size_t minsize)
{
size_t csize;
bool zero;
assert(minsize != 0);
csize = CHUNK_CEILING(minsize);
zero = false;
base_pages = chunk_alloc(csize, chunksize, true, &zero,
chunk_dss_prec_get());
base_pages = chunk_alloc_base(csize);
if (base_pages == NULL)
return (true);
base_next_addr = base_pages;
@@ -63,7 +55,7 @@ base_alloc(size_t size)
ret = base_next_addr;
base_next_addr = (void *)((uintptr_t)base_next_addr + csize);
malloc_mutex_unlock(&base_mtx);
VALGRIND_MAKE_MEM_UNDEFINED(ret, csize);
JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, csize);
return (ret);
}
@@ -89,7 +81,8 @@ base_node_alloc(void)
ret = base_nodes;
base_nodes = *(extent_node_t **)ret;
malloc_mutex_unlock(&base_mtx);
VALGRIND_MAKE_MEM_UNDEFINED(ret, sizeof(extent_node_t));
JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret,
sizeof(extent_node_t));
} else {
malloc_mutex_unlock(&base_mtx);
ret = (extent_node_t *)base_alloc(sizeof(extent_node_t));
@@ -99,10 +92,10 @@ base_node_alloc(void)
}
void
base_node_dealloc(extent_node_t *node)
base_node_dalloc(extent_node_t *node)
{
VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t));
JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t));
malloc_mutex_lock(&base_mtx);
*(extent_node_t **)node = base_nodes;
base_nodes = node;


@@ -2,19 +2,6 @@
#include "jemalloc/internal/jemalloc_internal.h"
/******************************************************************************/
/* Function prototypes for non-inline static functions. */
static size_t bits2groups(size_t nbits);
/******************************************************************************/
static size_t
bits2groups(size_t nbits)
{
return ((nbits >> LG_BITMAP_GROUP_NBITS) +
!!(nbits & BITMAP_GROUP_NBITS_MASK));
}
void
bitmap_info_init(bitmap_info_t *binfo, size_t nbits)
@@ -31,15 +18,16 @@ bitmap_info_init(bitmap_info_t *binfo, size_t nbits)
* that requires only one group.
*/
binfo->levels[0].group_offset = 0;
group_count = bits2groups(nbits);
group_count = BITMAP_BITS2GROUPS(nbits);
for (i = 1; group_count > 1; i++) {
assert(i < BITMAP_MAX_LEVELS);
binfo->levels[i].group_offset = binfo->levels[i-1].group_offset
+ group_count;
group_count = bits2groups(group_count);
group_count = BITMAP_BITS2GROUPS(group_count);
}
binfo->levels[i].group_offset = binfo->levels[i-1].group_offset
+ group_count;
assert(binfo->levels[i].group_offset <= BITMAP_GROUPS_MAX);
binfo->nlevels = i;
binfo->nbits = nbits;
}
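
BITMAP_BITS2GROUPS() is a ceiling division of nbits by the group width, replacing the bits2groups() helper removed above. A worked example, assuming 64-bit groups (LG_BITMAP_GROUP_NBITS == 6):

#include <stdio.h>

#define LG_BITMAP_GROUP_NBITS	6	/* assumption: 64-bit groups */
#define BITMAP_GROUP_NBITS	(1U << LG_BITMAP_GROUP_NBITS)
#define BITMAP_GROUP_NBITS_MASK	(BITMAP_GROUP_NBITS - 1)
#define BITMAP_BITS2GROUPS(nbits) \
    (((nbits) >> LG_BITMAP_GROUP_NBITS) + \
    !!((nbits) & BITMAP_GROUP_NBITS_MASK))

int
main(void)
{
	/* 64 bits -> 1 group; 65 -> 2; 128 -> 2; 129 -> 3. */
	printf("%u %u %u %u\n", BITMAP_BITS2GROUPS(64u),
	    BITMAP_BITS2GROUPS(65u), BITMAP_BITS2GROUPS(128u),
	    BITMAP_BITS2GROUPS(129u));
	return (0);
}
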


@@ -27,23 +27,20 @@ rtree_t *chunks_rtree;
size_t chunksize;
size_t chunksize_mask; /* (chunksize - 1). */
size_t chunk_npages;
size_t map_bias;
size_t arena_maxclass; /* Max size class for arenas. */
/******************************************************************************/
/* Function prototypes for non-inline static functions. */
/*
* Function prototypes for static functions that are referenced prior to
* definition.
*/
static void *chunk_recycle(extent_tree_t *chunks_szad,
extent_tree_t *chunks_ad, size_t size, size_t alignment, bool base,
bool *zero);
static void chunk_record(extent_tree_t *chunks_szad,
extent_tree_t *chunks_ad, void *chunk, size_t size);
static void chunk_dalloc_core(void *chunk, size_t size);
/******************************************************************************/
static void *
chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size,
size_t alignment, bool base, bool *zero)
chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad,
void *new_addr, size_t size, size_t alignment, bool base, bool *zero)
{
void *ret;
extent_node_t *node;
@@ -65,11 +62,11 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size,
/* Beware size_t wrap-around. */
if (alloc_size < size)
return (NULL);
key.addr = NULL;
key.addr = new_addr;
key.size = alloc_size;
malloc_mutex_lock(&chunks_mtx);
node = extent_tree_szad_nsearch(chunks_szad, &key);
if (node == NULL) {
if (node == NULL || (new_addr && node->addr != new_addr)) {
malloc_mutex_unlock(&chunks_mtx);
return (NULL);
}
@@ -104,7 +101,7 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size,
malloc_mutex_unlock(&chunks_mtx);
node = base_node_alloc();
if (node == NULL) {
chunk_dealloc(ret, size, true);
chunk_dalloc_core(ret, size);
return (NULL);
}
malloc_mutex_lock(&chunks_mtx);
@@ -119,15 +116,15 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size,
malloc_mutex_unlock(&chunks_mtx);
if (node != NULL)
base_node_dealloc(node);
base_node_dalloc(node);
if (*zero) {
if (zeroed == false)
if (!zeroed)
memset(ret, 0, size);
else if (config_debug) {
size_t i;
size_t *p = (size_t *)(uintptr_t)ret;
VALGRIND_MAKE_MEM_DEFINED(ret, size);
JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ret, size);
for (i = 0; i < size / sizeof(size_t); i++)
assert(p[i] == 0);
}
@@ -136,14 +133,14 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size,
}
/*
* If the caller specifies (*zero == false), it is still possible to receive
* zeroed memory, in which case *zero is toggled to true. arena_chunk_alloc()
* takes advantage of this to avoid demanding zeroed chunks, but taking
* advantage of them if they are returned.
* If the caller specifies (!*zero), it is still possible to receive zeroed
* memory, in which case *zero is toggled to true. arena_chunk_alloc() takes
* advantage of this to avoid demanding zeroed chunks, but taking advantage of
* them if they are returned.
*/
void *
chunk_alloc(size_t size, size_t alignment, bool base, bool *zero,
dss_prec_t dss_prec)
static void *
chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base,
bool *zero, dss_prec_t dss_prec)
{
void *ret;
@@ -153,62 +150,121 @@ chunk_alloc(size_t size, size_t alignment, bool base, bool *zero,
assert((alignment & chunksize_mask) == 0);
/* "primary" dss. */
if (config_dss && dss_prec == dss_prec_primary) {
if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size,
alignment, base, zero)) != NULL)
goto label_return;
if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL)
goto label_return;
if (have_dss && dss_prec == dss_prec_primary) {
if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss,
new_addr, size, alignment, base, zero)) != NULL)
return (ret);
if ((ret = chunk_alloc_dss(new_addr, size, alignment, zero))
!= NULL)
return (ret);
}
/* mmap. */
if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, size,
alignment, base, zero)) != NULL)
goto label_return;
if ((ret = chunk_alloc_mmap(size, alignment, zero)) != NULL)
goto label_return;
if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, new_addr,
size, alignment, base, zero)) != NULL)
return (ret);
/* Requesting an address not implemented for chunk_alloc_mmap(). */
if (new_addr == NULL &&
(ret = chunk_alloc_mmap(size, alignment, zero)) != NULL)
return (ret);
/* "secondary" dss. */
if (config_dss && dss_prec == dss_prec_secondary) {
if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size,
alignment, base, zero)) != NULL)
goto label_return;
if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL)
goto label_return;
if (have_dss && dss_prec == dss_prec_secondary) {
if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss,
new_addr, size, alignment, base, zero)) != NULL)
return (ret);
if ((ret = chunk_alloc_dss(new_addr, size, alignment, zero))
!= NULL)
return (ret);
}
/* All strategies for allocation failed. */
ret = NULL;
label_return:
if (ret != NULL) {
if (config_ivsalloc && base == false) {
if (rtree_set(chunks_rtree, (uintptr_t)ret, 1)) {
chunk_dealloc(ret, size, true);
return (NULL);
}
}
if (config_stats || config_prof) {
bool gdump;
malloc_mutex_lock(&chunks_mtx);
if (config_stats)
stats_chunks.nchunks += (size / chunksize);
stats_chunks.curchunks += (size / chunksize);
if (stats_chunks.curchunks > stats_chunks.highchunks) {
stats_chunks.highchunks =
stats_chunks.curchunks;
if (config_prof)
gdump = true;
} else if (config_prof)
gdump = false;
malloc_mutex_unlock(&chunks_mtx);
if (config_prof && opt_prof && opt_prof_gdump && gdump)
prof_gdump();
}
if (config_valgrind)
VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
return (NULL);
}
static bool
chunk_register(void *chunk, size_t size, bool base)
{
assert(chunk != NULL);
assert(CHUNK_ADDR2BASE(chunk) == chunk);
if (config_ivsalloc && !base) {
if (rtree_set(chunks_rtree, (uintptr_t)chunk, 1))
return (true);
}
if (config_stats || config_prof) {
bool gdump;
malloc_mutex_lock(&chunks_mtx);
if (config_stats)
stats_chunks.nchunks += (size / chunksize);
stats_chunks.curchunks += (size / chunksize);
if (stats_chunks.curchunks > stats_chunks.highchunks) {
stats_chunks.highchunks =
stats_chunks.curchunks;
if (config_prof)
gdump = true;
} else if (config_prof)
gdump = false;
malloc_mutex_unlock(&chunks_mtx);
if (config_prof && opt_prof && opt_prof_gdump && gdump)
prof_gdump();
}
if (config_valgrind)
JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(chunk, size);
return (false);
}
void *
chunk_alloc_base(size_t size)
{
void *ret;
bool zero;
zero = false;
ret = chunk_alloc_core(NULL, size, chunksize, true, &zero,
chunk_dss_prec_get());
if (ret == NULL)
return (NULL);
if (chunk_register(ret, size, true)) {
chunk_dalloc_core(ret, size);
return (NULL);
}
assert(CHUNK_ADDR2BASE(ret) == ret);
return (ret);
}
void *
chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc,
unsigned arena_ind, void *new_addr, size_t size, size_t alignment,
bool *zero)
{
void *ret;
ret = chunk_alloc(new_addr, size, alignment, zero, arena_ind);
if (ret != NULL && chunk_register(ret, size, false)) {
chunk_dalloc(ret, size, arena_ind);
ret = NULL;
}
return (ret);
}
/* Default arena chunk allocation routine in the absence of user override. */
void *
chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero,
unsigned arena_ind)
{
arena_t *arena;
arena = arena_get(tsd_fetch(), arena_ind, false, true);
/*
* The arena we're allocating on behalf of must have been initialized
* already.
*/
assert(arena != NULL);
return (chunk_alloc_core(new_addr, size, alignment, false, zero,
arena->dss_prec));
}
static void
chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
size_t size)
@@ -217,7 +273,7 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
extent_node_t *xnode, *node, *prev, *xprev, key;
unzeroed = pages_purge(chunk, size);
VALGRIND_MAKE_MEM_NOACCESS(chunk, size);
JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size);
/*
* Allocate a node before acquiring chunks_mtx even though it might not
@@ -242,7 +298,7 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
extent_tree_szad_remove(chunks_szad, node);
node->addr = chunk;
node->size += size;
node->zeroed = (node->zeroed && (unzeroed == false));
node->zeroed = (node->zeroed && !unzeroed);
extent_tree_szad_insert(chunks_szad, node);
} else {
/* Coalescing forward failed, so insert a new node. */
@@ -259,7 +315,7 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
xnode = NULL; /* Prevent deallocation below. */
node->addr = chunk;
node->size = size;
node->zeroed = (unzeroed == false);
node->zeroed = !unzeroed;
extent_tree_ad_insert(chunks_ad, node);
extent_tree_szad_insert(chunks_szad, node);
}
@@ -292,9 +348,9 @@ label_return:
* avoid potential deadlock.
*/
if (xnode != NULL)
base_node_dealloc(xnode);
base_node_dalloc(xnode);
if (xprev != NULL)
base_node_dealloc(xprev);
base_node_dalloc(xprev);
}
void
@@ -305,14 +361,14 @@ chunk_unmap(void *chunk, size_t size)
assert(size != 0);
assert((size & chunksize_mask) == 0);
if (config_dss && chunk_in_dss(chunk))
if (have_dss && chunk_in_dss(chunk))
chunk_record(&chunks_szad_dss, &chunks_ad_dss, chunk, size);
else if (chunk_dealloc_mmap(chunk, size))
else if (chunk_dalloc_mmap(chunk, size))
chunk_record(&chunks_szad_mmap, &chunks_ad_mmap, chunk, size);
}
void
chunk_dealloc(void *chunk, size_t size, bool unmap)
static void
chunk_dalloc_core(void *chunk, size_t size)
{
assert(chunk != NULL);
@@ -329,8 +385,16 @@ chunk_dealloc(void *chunk, size_t size, bool unmap)
malloc_mutex_unlock(&chunks_mtx);
}
if (unmap)
chunk_unmap(chunk, size);
chunk_unmap(chunk, size);
}
/* Default arena chunk deallocation routine in the absence of user override. */
bool
chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind)
{
chunk_dalloc_core(chunk, size);
return (false);
}
bool
@@ -343,12 +407,11 @@ chunk_boot(void)
chunksize_mask = chunksize - 1;
chunk_npages = (chunksize >> LG_PAGE);
if (config_stats || config_prof) {
if (malloc_mutex_init(&chunks_mtx))
return (true);
if (malloc_mutex_init(&chunks_mtx))
return (true);
if (config_stats || config_prof)
memset(&stats_chunks, 0, sizeof(chunk_stats_t));
}
if (config_dss && chunk_dss_boot())
if (have_dss && chunk_dss_boot())
return (true);
extent_tree_szad_new(&chunks_szad_mmap);
extent_tree_ad_new(&chunks_ad_mmap);


@@ -32,7 +32,7 @@ static void *
chunk_dss_sbrk(intptr_t increment)
{
#ifdef JEMALLOC_HAVE_SBRK
#ifdef JEMALLOC_DSS
return (sbrk(increment));
#else
not_implemented();
@@ -45,7 +45,7 @@ chunk_dss_prec_get(void)
{
dss_prec_t ret;
if (config_dss == false)
if (!have_dss)
return (dss_prec_disabled);
malloc_mutex_lock(&dss_mtx);
ret = dss_prec_default;
@@ -57,8 +57,8 @@ bool
chunk_dss_prec_set(dss_prec_t dss_prec)
{
if (config_dss == false)
return (true);
if (!have_dss)
return (dss_prec != dss_prec_disabled);
malloc_mutex_lock(&dss_mtx);
dss_prec_default = dss_prec;
malloc_mutex_unlock(&dss_mtx);
@@ -66,11 +66,11 @@ chunk_dss_prec_set(dss_prec_t dss_prec)
}
void *
chunk_alloc_dss(size_t size, size_t alignment, bool *zero)
chunk_alloc_dss(void *new_addr, size_t size, size_t alignment, bool *zero)
{
void *ret;
cassert(config_dss);
cassert(have_dss);
assert(size > 0 && (size & chunksize_mask) == 0);
assert(alignment > 0 && (alignment & chunksize_mask) == 0);
@@ -93,8 +93,17 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero)
* malloc.
*/
do {
/* Avoid an unnecessary system call. */
if (new_addr != NULL && dss_max != new_addr)
break;
/* Get the current end of the DSS. */
dss_max = chunk_dss_sbrk(0);
/* Make sure the earlier condition still holds. */
if (new_addr != NULL && dss_max != new_addr)
break;
/*
* Calculate how much padding is necessary to
* chunk-align the end of the DSS.
@@ -126,7 +135,8 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero)
if (cpad_size != 0)
chunk_unmap(cpad, cpad_size);
if (*zero) {
VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(
ret, size);
memset(ret, 0, size);
}
return (ret);
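
The padding computed in that loop (cpad) is the gap between the current DSS end and the next chunk boundary; it is carved off and recycled rather than leaked. A worked example with illustrative numbers (4 MiB chunks, made-up addresses):

#include <stdint.h>
#include <stdio.h>

#define DEMO_CHUNK_CEILING(s, chunksize) \
    (((s) + ((chunksize) - 1)) & ~((uintptr_t)(chunksize) - 1))

int
main(void)
{
	uintptr_t chunksize = (uintptr_t)4 << 20;	/* 4 MiB */
	uintptr_t dss_max = 0x0804a000;			/* current sbrk(0) */
	uintptr_t ret = DEMO_CHUNK_CEILING(dss_max, chunksize);
	uintptr_t cpad_size = ret - dss_max;

	/* sbrk() must grow by the padding plus the request itself. */
	printf("pad=%#lx first_chunk=%#lx\n", (unsigned long)cpad_size,
	    (unsigned long)ret);
	return (0);
}
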
@@ -143,7 +153,7 @@ chunk_in_dss(void *chunk)
{
bool ret;
cassert(config_dss);
cassert(have_dss);
malloc_mutex_lock(&dss_mtx);
if ((uintptr_t)chunk >= (uintptr_t)dss_base
@@ -160,7 +170,7 @@ bool
chunk_dss_boot(void)
{
cassert(config_dss);
cassert(have_dss);
if (malloc_mutex_init(&dss_mtx))
return (true);
@@ -175,7 +185,7 @@ void
chunk_dss_prefork(void)
{
if (config_dss)
if (have_dss)
malloc_mutex_prefork(&dss_mtx);
}
@@ -183,7 +193,7 @@ void
chunk_dss_postfork_parent(void)
{
if (config_dss)
if (have_dss)
malloc_mutex_postfork_parent(&dss_mtx);
}
@@ -191,7 +201,7 @@ void
chunk_dss_postfork_child(void)
{
if (config_dss)
if (have_dss)
malloc_mutex_postfork_child(&dss_mtx);
}


@@ -121,7 +121,7 @@ pages_purge(void *addr, size_t length)
#ifdef _WIN32
VirtualAlloc(addr, length, MEM_RESET, PAGE_READWRITE);
unzeroed = true;
#else
#elif defined(JEMALLOC_HAVE_MADVISE)
# ifdef JEMALLOC_PURGE_MADVISE_DONTNEED
# define JEMALLOC_MADV_PURGE MADV_DONTNEED
# define JEMALLOC_MADV_ZEROS true
@@ -129,12 +129,15 @@ pages_purge(void *addr, size_t length)
# define JEMALLOC_MADV_PURGE MADV_FREE
# define JEMALLOC_MADV_ZEROS false
# else
# error "No method defined for purging unused dirty pages."
# error "No madvise(2) flag defined for purging unused dirty pages."
# endif
int err = madvise(addr, length, JEMALLOC_MADV_PURGE);
unzeroed = (JEMALLOC_MADV_ZEROS == false || err != 0);
unzeroed = (!JEMALLOC_MADV_ZEROS || err != 0);
# undef JEMALLOC_MADV_PURGE
# undef JEMALLOC_MADV_ZEROS
#else
/* Last resort no-op. */
unzeroed = true;
#endif
return (unzeroed);
}
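
The distinction pages_purge() tracks is that MADV_DONTNEED on Linux guarantees the purged range reads back as zeros, while MADV_FREE (BSD) leaves contents undefined; the unzeroed return value records which case applied. A self-contained sketch of the same dispatch:

#include <stdbool.h>
#include <stddef.h>
#include <sys/mman.h>

static bool
demo_pages_purge(void *addr, size_t length)
{
	bool unzeroed;
#if defined(MADV_FREE)
	unzeroed = true;	/* contents merely undefined after purge */
	(void)madvise(addr, length, MADV_FREE);
#elif defined(MADV_DONTNEED)
	/* Linux semantics assumed: zeroed on success. */
	unzeroed = (madvise(addr, length, MADV_DONTNEED) != 0);
#else
	unzeroed = true;	/* last-resort no-op */
#endif
	return (unzeroed);
}

int
main(void)
{
	size_t len = 1 << 16;
	void *p = mmap(NULL, len, PROT_READ|PROT_WRITE,
	    MAP_PRIVATE|MAP_ANON, -1, 0);

	return ((p == MAP_FAILED) ? 1 : (int)demo_pages_purge(p, len));
}
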
@@ -200,11 +203,11 @@ chunk_alloc_mmap(size_t size, size_t alignment, bool *zero)
}
bool
chunk_dealloc_mmap(void *chunk, size_t size)
chunk_dalloc_mmap(void *chunk, size_t size)
{
if (config_munmap)
pages_unmap(chunk, size);
return (config_munmap == false);
return (!config_munmap);
}


@@ -40,8 +40,8 @@
/******************************************************************************/
/* Function prototypes for non-inline static functions. */
static bool ckh_grow(ckh_t *ckh);
static void ckh_shrink(ckh_t *ckh);
static bool ckh_grow(tsd_t *tsd, ckh_t *ckh);
static void ckh_shrink(tsd_t *tsd, ckh_t *ckh);
/******************************************************************************/
@@ -185,7 +185,7 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey,
}
bucket = tbucket;
if (ckh_try_bucket_insert(ckh, bucket, key, data) == false)
if (!ckh_try_bucket_insert(ckh, bucket, key, data))
return (false);
}
}
@@ -201,12 +201,12 @@ ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata)
/* Try to insert in primary bucket. */
bucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1);
if (ckh_try_bucket_insert(ckh, bucket, key, data) == false)
if (!ckh_try_bucket_insert(ckh, bucket, key, data))
return (false);
/* Try to insert in secondary bucket. */
bucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1);
if (ckh_try_bucket_insert(ckh, bucket, key, data) == false)
if (!ckh_try_bucket_insert(ckh, bucket, key, data))
return (false);
/*
@@ -243,7 +243,7 @@ ckh_rebuild(ckh_t *ckh, ckhc_t *aTab)
}
static bool
ckh_grow(ckh_t *ckh)
ckh_grow(tsd_t *tsd, ckh_t *ckh)
{
bool ret;
ckhc_t *tab, *ttab;
@@ -270,7 +270,7 @@ ckh_grow(ckh_t *ckh)
ret = true;
goto label_return;
}
tab = (ckhc_t *)ipalloc(usize, CACHELINE, true);
tab = (ckhc_t *)ipalloc(tsd, usize, CACHELINE, true);
if (tab == NULL) {
ret = true;
goto label_return;
@@ -281,13 +281,13 @@ ckh_grow(ckh_t *ckh)
tab = ttab;
ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS;
if (ckh_rebuild(ckh, tab) == false) {
idalloc(tab);
if (!ckh_rebuild(ckh, tab)) {
idalloc(tsd, tab);
break;
}
/* Rebuilding failed, so back out partially rebuilt table. */
idalloc(ckh->tab);
idalloc(tsd, ckh->tab);
ckh->tab = tab;
ckh->lg_curbuckets = lg_prevbuckets;
}
@@ -298,7 +298,7 @@ label_return:
}
static void
ckh_shrink(ckh_t *ckh)
ckh_shrink(tsd_t *tsd, ckh_t *ckh)
{
ckhc_t *tab, *ttab;
size_t lg_curcells, usize;
@@ -313,7 +313,7 @@ ckh_shrink(ckh_t *ckh)
usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
if (usize == 0)
return;
tab = (ckhc_t *)ipalloc(usize, CACHELINE, true);
tab = (ckhc_t *)ipalloc(tsd, usize, CACHELINE, true);
if (tab == NULL) {
/*
* An OOM error isn't worth propagating, since it doesn't
@@ -327,8 +327,8 @@ ckh_shrink(ckh_t *ckh)
tab = ttab;
ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS;
if (ckh_rebuild(ckh, tab) == false) {
idalloc(tab);
if (!ckh_rebuild(ckh, tab)) {
idalloc(tsd, tab);
#ifdef CKH_COUNT
ckh->nshrinks++;
#endif
@@ -336,7 +336,7 @@ ckh_shrink(ckh_t *ckh)
}
/* Rebuilding failed, so back out partially rebuilt table. */
idalloc(ckh->tab);
idalloc(tsd, ckh->tab);
ckh->tab = tab;
ckh->lg_curbuckets = lg_prevbuckets;
#ifdef CKH_COUNT
@@ -345,7 +345,8 @@ ckh_shrink(ckh_t *ckh)
}
bool
ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp)
ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
ckh_keycomp_t *keycomp)
{
bool ret;
size_t mincells, usize;
@@ -366,10 +367,10 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp)
ckh->count = 0;
/*
* Find the minimum power of 2 that is large enough to fit aBaseCount
* Find the minimum power of 2 that is large enough to fit minitems
* entries. We are using (2+,2) cuckoo hashing, which has an expected
* maximum load factor of at least ~0.86, so 0.75 is a conservative load
* factor that will typically allow 2^aLgMinItems to fit without ever
* factor that will typically allow mincells items to fit without ever
* growing the table.
*/
assert(LG_CKH_BUCKET_CELLS > 0);
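
The load-factor comment above translates into simple arithmetic: a table for minitems entries needs at least minitems/0.75 cells, rounded up to a power of two. An illustrative calculation (jemalloc's exact formula in ckh_new() may differ slightly):

#include <stdio.h>

int
main(void)
{
	size_t minitems = 3000;
	size_t mincells = (minitems * 4 + 2) / 3;	/* ceil(n / 0.75) */
	size_t lg = 0;

	while (((size_t)1 << lg) < mincells)
		lg++;
	/* 3000 items -> >= 4000 cells -> 2^12 = 4096 cells. */
	printf("minitems=%zu mincells=%zu table=2^%zu\n", minitems,
	    mincells, lg);
	return (0);
}
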
@@ -388,7 +389,7 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp)
ret = true;
goto label_return;
}
ckh->tab = (ckhc_t *)ipalloc(usize, CACHELINE, true);
ckh->tab = (ckhc_t *)ipalloc(tsd, usize, CACHELINE, true);
if (ckh->tab == NULL) {
ret = true;
goto label_return;
@@ -400,7 +401,7 @@ label_return:
}
void
ckh_delete(ckh_t *ckh)
ckh_delete(tsd_t *tsd, ckh_t *ckh)
{
assert(ckh != NULL);
@@ -417,7 +418,7 @@ ckh_delete(ckh_t *ckh)
(unsigned long long)ckh->nrelocs);
#endif
idalloc(ckh->tab);
idalloc(tsd, ckh->tab);
if (config_debug)
memset(ckh, 0x5a, sizeof(ckh_t));
}
@@ -452,7 +453,7 @@ ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data)
}
bool
ckh_insert(ckh_t *ckh, const void *key, const void *data)
ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data)
{
bool ret;
@@ -464,7 +465,7 @@ ckh_insert(ckh_t *ckh, const void *key, const void *data)
#endif
while (ckh_try_insert(ckh, &key, &data)) {
if (ckh_grow(ckh)) {
if (ckh_grow(tsd, ckh)) {
ret = true;
goto label_return;
}
@@ -476,7 +477,8 @@ label_return:
}
bool
ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data)
ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key,
void **data)
{
size_t cell;
@@ -497,7 +499,7 @@ ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data)
+ LG_CKH_BUCKET_CELLS - 2)) && ckh->lg_curbuckets
> ckh->lg_minbuckets) {
/* Ignore error due to OOM. */
ckh_shrink(ckh);
ckh_shrink(tsd, ckh);
}
return (false);

File diff suppressed because it is too large


@@ -3,7 +3,7 @@
/******************************************************************************/
static inline int
JEMALLOC_INLINE_C int
extent_szad_comp(extent_node_t *a, extent_node_t *b)
{
int ret;
@@ -25,7 +25,7 @@ extent_szad_comp(extent_node_t *a, extent_node_t *b)
rb_gen(, extent_tree_szad_, extent_tree_t, extent_node_t, link_szad,
extent_szad_comp)
static inline int
JEMALLOC_INLINE_C int
extent_ad_comp(extent_node_t *a, extent_node_t *b)
{
uintptr_t a_addr = (uintptr_t)a->addr;


@@ -4,11 +4,8 @@
/******************************************************************************/
/* Data. */
uint64_t huge_nmalloc;
uint64_t huge_ndalloc;
size_t huge_allocated;
malloc_mutex_t huge_mtx;
/* Protects chunk-related data structures. */
static malloc_mutex_t huge_mtx;
/******************************************************************************/
@@ -16,30 +13,32 @@ malloc_mutex_t huge_mtx;
static extent_tree_t huge;
void *
huge_malloc(size_t size, bool zero, dss_prec_t dss_prec)
huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, bool try_tcache)
{
size_t usize;
return (huge_palloc(size, chunksize, zero, dss_prec));
usize = s2u(size);
if (usize == 0) {
/* size_t overflow. */
return (NULL);
}
return (huge_palloc(tsd, arena, usize, chunksize, zero, try_tcache));
}
void *
huge_palloc(size_t size, size_t alignment, bool zero, dss_prec_t dss_prec)
huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
bool zero, bool try_tcache)
{
void *ret;
size_t csize;
extent_node_t *node;
bool is_zeroed;
/* Allocate one or more contiguous chunks for this request. */
csize = CHUNK_CEILING(size);
if (csize == 0) {
/* size is large enough to cause size_t wrap-around. */
return (NULL);
}
/* Allocate an extent node with which to track the chunk. */
node = base_node_alloc();
node = ipalloct(tsd, CACHELINE_CEILING(sizeof(extent_node_t)),
CACHELINE, false, try_tcache, NULL);
if (node == NULL)
return (NULL);
@@ -48,148 +47,32 @@ huge_palloc(size_t size, size_t alignment, bool zero, dss_prec_t dss_prec)
* it is possible to make correct junk/zero fill decisions below.
*/
is_zeroed = zero;
ret = chunk_alloc(csize, alignment, false, &is_zeroed, dss_prec);
if (ret == NULL) {
base_node_dealloc(node);
arena = arena_choose(tsd, arena);
if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(arena,
usize, alignment, &is_zeroed)) == NULL) {
idalloct(tsd, node, try_tcache);
return (NULL);
}
/* Insert node into huge. */
node->addr = ret;
node->size = csize;
node->size = usize;
node->zeroed = is_zeroed;
node->arena = arena;
malloc_mutex_lock(&huge_mtx);
extent_tree_ad_insert(&huge, node);
if (config_stats) {
stats_cactive_add(csize);
huge_nmalloc++;
huge_allocated += csize;
}
malloc_mutex_unlock(&huge_mtx);
if (config_fill && zero == false) {
if (opt_junk)
memset(ret, 0xa5, csize);
else if (opt_zero && is_zeroed == false)
memset(ret, 0, csize);
}
if (zero || (config_fill && unlikely(opt_zero))) {
if (!is_zeroed)
memset(ret, 0, usize);
} else if (config_fill && unlikely(opt_junk_alloc))
memset(ret, 0xa5, usize);
return (ret);
}
bool
huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra)
{
/*
* Avoid moving the allocation if the size class can be left the same.
*/
if (oldsize > arena_maxclass
&& CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size)
&& CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) {
assert(CHUNK_CEILING(oldsize) == oldsize);
return (false);
}
/* Reallocation would require a move. */
return (true);
}
void *
huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
size_t alignment, bool zero, bool try_tcache_dalloc, dss_prec_t dss_prec)
{
void *ret;
size_t copysize;
/* Try to avoid moving the allocation. */
if (huge_ralloc_no_move(ptr, oldsize, size, extra) == false)
return (ptr);
/*
* size and oldsize are different enough that we need to use a
* different size class. In that case, fall back to allocating new
* space and copying.
*/
if (alignment > chunksize)
ret = huge_palloc(size + extra, alignment, zero, dss_prec);
else
ret = huge_malloc(size + extra, zero, dss_prec);
if (ret == NULL) {
if (extra == 0)
return (NULL);
/* Try again, this time without extra. */
if (alignment > chunksize)
ret = huge_palloc(size, alignment, zero, dss_prec);
else
ret = huge_malloc(size, zero, dss_prec);
if (ret == NULL)
return (NULL);
}
/*
* Copy at most size bytes (not size+extra), since the caller has no
* expectation that the extra bytes will be reliably preserved.
*/
copysize = (size < oldsize) ? size : oldsize;
#ifdef JEMALLOC_MREMAP
/*
* Use mremap(2) if this is a huge-->huge reallocation, and neither the
* source nor the destination are in dss.
*/
if (oldsize >= chunksize && (config_dss == false || (chunk_in_dss(ptr)
== false && chunk_in_dss(ret) == false))) {
size_t newsize = huge_salloc(ret);
/*
* Remove ptr from the tree of huge allocations before
* performing the remap operation, in order to avoid the
* possibility of another thread acquiring that mapping before
* this one removes it from the tree.
*/
huge_dalloc(ptr, false);
if (mremap(ptr, oldsize, newsize, MREMAP_MAYMOVE|MREMAP_FIXED,
ret) == MAP_FAILED) {
/*
* Assuming no chunk management bugs in the allocator,
* the only documented way an error can occur here is
* if the application changed the map type for a
* portion of the old allocation. This is firmly in
* undefined behavior territory, so write a diagnostic
* message, and optionally abort.
*/
char buf[BUFERROR_BUF];
buferror(get_errno(), buf, sizeof(buf));
malloc_printf("<jemalloc>: Error in mremap(): %s\n",
buf);
if (opt_abort)
abort();
memcpy(ret, ptr, copysize);
chunk_dealloc_mmap(ptr, oldsize);
} else if (config_fill && zero == false && opt_junk && oldsize
< newsize) {
/*
* mremap(2) clobbers the original mapping, so
* junk/zero filling is not preserved. There is no
* need to zero fill here, since any trailing
* uninitialized memory is demand-zeroed by the
* kernel, but junk filling must be redone.
*/
memset(ret + oldsize, 0xa5, newsize - oldsize);
}
} else
#endif
{
memcpy(ret, ptr, copysize);
iqalloct(ptr, try_tcache_dalloc);
}
return (ret);
}
#ifdef JEMALLOC_JET
#undef huge_dalloc_junk
#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl)
@@ -198,12 +81,12 @@ static void
huge_dalloc_junk(void *ptr, size_t usize)
{
if (config_fill && config_dss && opt_junk) {
if (config_fill && have_dss && unlikely(opt_junk_free)) {
/*
* Only bother junk filling if the chunk isn't about to be
* unmapped.
*/
if (config_munmap == false || (config_dss && chunk_in_dss(ptr)))
if (!config_munmap || (have_dss && chunk_in_dss(ptr)))
memset(ptr, 0x5a, usize);
}
}
@@ -213,34 +96,266 @@ huge_dalloc_junk(void *ptr, size_t usize)
huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl);
#endif
static void
huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize,
size_t size, size_t extra, bool zero)
{
size_t usize_next;
bool zeroed;
extent_node_t *node, key;
arena_t *arena;
/* Increase usize to incorporate extra. */
while (usize < s2u(size+extra) && (usize_next = s2u(usize+1)) < oldsize)
usize = usize_next;
if (oldsize == usize)
return;
/* Fill if necessary (shrinking). */
if (oldsize > usize) {
size_t sdiff = CHUNK_CEILING(usize) - usize;
zeroed = (sdiff != 0) ? !pages_purge((void *)((uintptr_t)ptr +
usize), sdiff) : true;
if (config_fill && unlikely(opt_junk_free)) {
memset((void *)((uintptr_t)ptr + usize), 0x5a, oldsize -
usize);
zeroed = false;
}
} else
zeroed = true;
malloc_mutex_lock(&huge_mtx);
key.addr = ptr;
node = extent_tree_ad_search(&huge, &key);
assert(node != NULL);
assert(node->addr == ptr);
arena = node->arena;
/* Update the size of the huge allocation. */
assert(node->size != usize);
node->size = usize;
/* Clear node->zeroed if zeroing failed above. */
node->zeroed = (node->zeroed && zeroed);
malloc_mutex_unlock(&huge_mtx);
arena_chunk_ralloc_huge_similar(arena, ptr, oldsize, usize);
/* Fill if necessary (growing). */
if (oldsize < usize) {
if (zero || (config_fill && unlikely(opt_zero))) {
if (!zeroed) {
memset((void *)((uintptr_t)ptr + oldsize), 0,
usize - oldsize);
}
} else if (config_fill && unlikely(opt_junk_alloc)) {
memset((void *)((uintptr_t)ptr + oldsize), 0xa5, usize -
oldsize);
}
}
}
static void
huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize)
{
size_t sdiff;
bool zeroed;
extent_node_t *node, key;
arena_t *arena;
sdiff = CHUNK_CEILING(usize) - usize;
zeroed = (sdiff != 0) ? !pages_purge((void *)((uintptr_t)ptr + usize),
sdiff) : true;
if (config_fill && unlikely(opt_junk_free)) {
huge_dalloc_junk((void *)((uintptr_t)ptr + usize), oldsize -
usize);
zeroed = false;
}
malloc_mutex_lock(&huge_mtx);
key.addr = ptr;
node = extent_tree_ad_search(&huge, &key);
assert(node != NULL);
assert(node->addr == ptr);
arena = node->arena;
/* Update the size of the huge allocation. */
node->size = usize;
/* Clear node->zeroed if zeroing failed above. */
node->zeroed = (node->zeroed && zeroed);
malloc_mutex_unlock(&huge_mtx);
/* Zap the excess chunks. */
arena_chunk_ralloc_huge_shrink(arena, ptr, oldsize, usize);
}
static bool
huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) {
size_t usize;
extent_node_t *node, key;
arena_t *arena;
bool is_zeroed_subchunk, is_zeroed_chunk;
usize = s2u(size);
if (usize == 0) {
/* size_t overflow. */
return (true);
}
malloc_mutex_lock(&huge_mtx);
key.addr = ptr;
node = extent_tree_ad_search(&huge, &key);
assert(node != NULL);
assert(node->addr == ptr);
arena = node->arena;
is_zeroed_subchunk = node->zeroed;
malloc_mutex_unlock(&huge_mtx);
/*
* Copy zero into is_zeroed_chunk and pass the copy to chunk_alloc(), so
* that it is possible to make correct junk/zero fill decisions below.
*/
is_zeroed_chunk = zero;
if (arena_chunk_ralloc_huge_expand(arena, ptr, oldsize, usize,
&is_zeroed_chunk))
return (true);
malloc_mutex_lock(&huge_mtx);
/* Update the size of the huge allocation. */
node->size = usize;
malloc_mutex_unlock(&huge_mtx);
if (zero || (config_fill && unlikely(opt_zero))) {
if (!is_zeroed_subchunk) {
memset((void *)((uintptr_t)ptr + oldsize), 0,
CHUNK_CEILING(oldsize) - oldsize);
}
if (!is_zeroed_chunk) {
memset((void *)((uintptr_t)ptr +
CHUNK_CEILING(oldsize)), 0, usize -
CHUNK_CEILING(oldsize));
}
} else if (config_fill && unlikely(opt_junk_alloc)) {
memset((void *)((uintptr_t)ptr + oldsize), 0xa5, usize -
oldsize);
}
return (false);
}
bool
huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra,
bool zero)
{
size_t usize;
/* Both allocations must be huge to avoid a move. */
if (oldsize < chunksize)
return (true);
assert(s2u(oldsize) == oldsize);
usize = s2u(size);
if (usize == 0) {
/* size_t overflow. */
return (true);
}
/*
* Avoid moving the allocation if the existing chunk size accommodates
* the new size.
*/
if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize)
&& CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) {
huge_ralloc_no_move_similar(ptr, oldsize, usize, size, extra,
zero);
return (false);
}
/* Shrink the allocation in-place. */
if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize)) {
huge_ralloc_no_move_shrink(ptr, oldsize, usize);
return (false);
}
/* Attempt to expand the allocation in-place. */
if (huge_ralloc_no_move_expand(ptr, oldsize, size + extra,
zero)) {
if (extra == 0)
return (true);
/* Try again, this time without extra. */
return (huge_ralloc_no_move_expand(ptr, oldsize, size, zero));
}
return (false);
}
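
huge_ralloc_no_move() thus picks one of three in-place strategies by comparing chunk-granular sizes before falling back to a copying move. A self-contained restatement of the decision tree (CHUNKSIZE is illustrative):

#include <stdio.h>

#define DEMO_CHUNKSIZE		((size_t)4 << 20)	/* illustrative */
#define DEMO_CHUNK_CEILING(s) \
    (((s) + DEMO_CHUNKSIZE - 1) & ~(DEMO_CHUNKSIZE - 1))

static const char *
demo_ralloc_strategy(size_t oldsize, size_t usize, size_t size, size_t extra)
{
	if (DEMO_CHUNK_CEILING(oldsize) >= DEMO_CHUNK_CEILING(usize) &&
	    DEMO_CHUNK_CEILING(oldsize) <= DEMO_CHUNK_CEILING(size + extra))
		return ("same chunks: adjust usize in place");
	if (DEMO_CHUNK_CEILING(oldsize) >= DEMO_CHUNK_CEILING(usize))
		return ("fewer chunks: shrink in place");
	return ("more chunks: try to expand in place, else move");
}

int
main(void)
{
	/* Growing 4 MiB -> 6 MiB needs another chunk; shrinking does not. */
	printf("%s\n", demo_ralloc_strategy((size_t)4 << 20,
	    (size_t)6 << 20, (size_t)6 << 20, 0));
	printf("%s\n", demo_ralloc_strategy((size_t)6 << 20,
	    (size_t)4 << 20, (size_t)4 << 20, 0));
	return (0);
}
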
void *
huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size,
size_t extra, size_t alignment, bool zero, bool try_tcache_alloc,
bool try_tcache_dalloc)
{
void *ret;
size_t copysize;
/* Try to avoid moving the allocation. */
if (!huge_ralloc_no_move(ptr, oldsize, size, extra, zero))
return (ptr);
/*
* size and oldsize are different enough that we need to use a
* different size class. In that case, fall back to allocating new
* space and copying.
*/
if (alignment > chunksize) {
ret = huge_palloc(tsd, arena, size + extra, alignment, zero,
try_tcache_alloc);
} else {
ret = huge_malloc(tsd, arena, size + extra, zero,
try_tcache_alloc);
}
if (ret == NULL) {
if (extra == 0)
return (NULL);
/* Try again, this time without extra. */
if (alignment > chunksize) {
ret = huge_palloc(tsd, arena, size, alignment, zero,
try_tcache_alloc);
} else {
ret = huge_malloc(tsd, arena, size, zero,
try_tcache_alloc);
}
if (ret == NULL)
return (NULL);
}
/*
* Copy at most size bytes (not size+extra), since the caller has no
* expectation that the extra bytes will be reliably preserved.
*/
copysize = (size < oldsize) ? size : oldsize;
memcpy(ret, ptr, copysize);
isqalloc(tsd, ptr, oldsize, try_tcache_dalloc);
return (ret);
}
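The move path follows the classic realloc fallback: allocate size+extra, retry without extra on failure, copy min(size, oldsize), and free the old block. A generic sketch using plain malloc()/free() in place of the tsd/arena/tcache-threaded jemalloc internals (assumes size + extra does not overflow):

#include <stdlib.h>
#include <string.h>

static void *
my_ralloc_move(void *ptr, size_t oldsize, size_t size, size_t extra)
{
    void *ret = malloc(size + extra);

    if (ret == NULL) {
        if (extra == 0)
            return (NULL);
        ret = malloc(size); /* Retry without the extra bytes. */
        if (ret == NULL)
            return (NULL);
    }
    /* The caller may not rely on the extra bytes being preserved. */
    memcpy(ret, ptr, (size < oldsize) ? size : oldsize);
    free(ptr);
    return (ret);
}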
void
huge_dalloc(void *ptr, bool unmap)
huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache)
{
extent_node_t *node, key;
malloc_mutex_lock(&huge_mtx);
/* Extract from tree of huge allocations. */
key.addr = ptr;
node = extent_tree_ad_search(&huge, &key);
assert(node != NULL);
assert(node->addr == ptr);
extent_tree_ad_remove(&huge, node);
if (config_stats) {
stats_cactive_sub(node->size);
huge_ndalloc++;
huge_allocated -= node->size;
}
malloc_mutex_unlock(&huge_mtx);
if (unmap)
huge_dalloc_junk(node->addr, node->size);
chunk_dealloc(node->addr, node->size, unmap);
base_node_dealloc(node);
huge_dalloc_junk(node->addr, node->size);
arena_chunk_dalloc_huge(node->arena, node->addr, node->size);
idalloct(tsd, node, try_tcache);
}
size_t
@@ -263,17 +378,10 @@ huge_salloc(const void *ptr)
return (ret);
}
dss_prec_t
huge_dss_prec_get(arena_t *arena)
prof_tctx_t *
huge_prof_tctx_get(const void *ptr)
{
return (arena_dss_prec_get(choose_arena(arena)));
}
prof_ctx_t *
huge_prof_ctx_get(const void *ptr)
{
prof_ctx_t *ret;
prof_tctx_t *ret;
extent_node_t *node, key;
malloc_mutex_lock(&huge_mtx);
@@ -283,7 +391,7 @@ huge_prof_ctx_get(const void *ptr)
node = extent_tree_ad_search(&huge, &key);
assert(node != NULL);
ret = node->prof_ctx;
ret = node->prof_tctx;
malloc_mutex_unlock(&huge_mtx);
@@ -291,7 +399,7 @@ huge_prof_ctx_get(const void *ptr)
}
void
huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx)
{
extent_node_t *node, key;
@@ -302,7 +410,7 @@ huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
node = extent_tree_ad_search(&huge, &key);
assert(node != NULL);
node->prof_ctx = ctx;
node->prof_tctx = tctx;
malloc_mutex_unlock(&huge_mtx);
}
@@ -316,12 +424,6 @@ huge_boot(void)
return (true);
extent_tree_ad_new(&huge);
if (config_stats) {
huge_nmalloc = 0;
huge_ndalloc = 0;
huge_allocated = 0;
}
return (false);
}

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -2,33 +2,31 @@
#include "jemalloc/internal/jemalloc_internal.h"
/*
* quarantine pointers close to NULL are used to encode state information that
* Quarantine pointers close to NULL are used to encode state information that
* is used for cleaning up during thread shutdown.
*/
#define QUARANTINE_STATE_REINCARNATED ((quarantine_t *)(uintptr_t)1)
#define QUARANTINE_STATE_PURGATORY ((quarantine_t *)(uintptr_t)2)
#define QUARANTINE_STATE_MAX QUARANTINE_STATE_PURGATORY
/******************************************************************************/
/* Data. */
malloc_tsd_data(, quarantine, quarantine_t *, NULL)
/******************************************************************************/
/* Function prototypes for non-inline static functions. */
static quarantine_t *quarantine_grow(quarantine_t *quarantine);
static void quarantine_drain_one(quarantine_t *quarantine);
static void quarantine_drain(quarantine_t *quarantine, size_t upper_bound);
static quarantine_t *quarantine_grow(tsd_t *tsd, quarantine_t *quarantine);
static void quarantine_drain_one(tsd_t *tsd, quarantine_t *quarantine);
static void quarantine_drain(tsd_t *tsd, quarantine_t *quarantine,
size_t upper_bound);
/******************************************************************************/
quarantine_t *
quarantine_init(size_t lg_maxobjs)
static quarantine_t *
quarantine_init(tsd_t *tsd, size_t lg_maxobjs)
{
quarantine_t *quarantine;
quarantine = (quarantine_t *)imalloc(offsetof(quarantine_t, objs) +
assert(tsd_nominal(tsd));
quarantine = (quarantine_t *)imalloc(tsd, offsetof(quarantine_t, objs) +
((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t)));
if (quarantine == NULL)
return (NULL);
@@ -37,19 +35,36 @@ quarantine_init(size_t lg_maxobjs)
quarantine->first = 0;
quarantine->lg_maxobjs = lg_maxobjs;
quarantine_tsd_set(&quarantine);
return (quarantine);
}
void
quarantine_alloc_hook_work(tsd_t *tsd)
{
quarantine_t *quarantine;
if (!tsd_nominal(tsd))
return;
quarantine = quarantine_init(tsd, LG_MAXOBJS_INIT);
/*
* Check again whether quarantine has been initialized, because
* quarantine_init() may have triggered recursive initialization.
*/
if (tsd_quarantine_get(tsd) == NULL)
tsd_quarantine_set(tsd, quarantine);
else
idalloc(tsd, quarantine);
}
static quarantine_t *
quarantine_grow(quarantine_t *quarantine)
quarantine_grow(tsd_t *tsd, quarantine_t *quarantine)
{
quarantine_t *ret;
ret = quarantine_init(quarantine->lg_maxobjs + 1);
ret = quarantine_init(tsd, quarantine->lg_maxobjs + 1);
if (ret == NULL) {
quarantine_drain_one(quarantine);
quarantine_drain_one(tsd, quarantine);
return (quarantine);
}
@@ -71,17 +86,18 @@ quarantine_grow(quarantine_t *quarantine)
memcpy(&ret->objs[ncopy_a], quarantine->objs, ncopy_b *
sizeof(quarantine_obj_t));
}
idalloc(quarantine);
idalloc(tsd, quarantine);
tsd_quarantine_set(tsd, ret);
return (ret);
}
static void
quarantine_drain_one(quarantine_t *quarantine)
quarantine_drain_one(tsd_t *tsd, quarantine_t *quarantine)
{
quarantine_obj_t *obj = &quarantine->objs[quarantine->first];
assert(obj->usize == isalloc(obj->ptr, config_prof));
idalloc(obj->ptr);
idalloc(tsd, obj->ptr);
quarantine->curbytes -= obj->usize;
quarantine->curobjs--;
quarantine->first = (quarantine->first + 1) & ((ZU(1) <<
@@ -89,15 +105,15 @@ quarantine_drain_one(quarantine_t *quarantine)
}
static void
quarantine_drain(quarantine_t *quarantine, size_t upper_bound)
quarantine_drain(tsd_t *tsd, quarantine_t *quarantine, size_t upper_bound)
{
while (quarantine->curbytes > upper_bound && quarantine->curobjs > 0)
quarantine_drain_one(quarantine);
quarantine_drain_one(tsd, quarantine);
}
void
quarantine(void *ptr)
quarantine(tsd_t *tsd, void *ptr)
{
quarantine_t *quarantine;
size_t usize = isalloc(ptr, config_prof);
@@ -105,17 +121,8 @@ quarantine(void *ptr)
cassert(config_fill);
assert(opt_quarantine);
quarantine = *quarantine_tsd_get();
if ((uintptr_t)quarantine <= (uintptr_t)QUARANTINE_STATE_MAX) {
if (quarantine == QUARANTINE_STATE_PURGATORY) {
/*
* Make a note that quarantine() was called after
* quarantine_cleanup() was called.
*/
quarantine = QUARANTINE_STATE_REINCARNATED;
quarantine_tsd_set(&quarantine);
}
idalloc(ptr);
if ((quarantine = tsd_quarantine_get(tsd)) == NULL) {
idalloc(tsd, ptr);
return;
}
/*
@@ -125,11 +132,11 @@ quarantine(void *ptr)
if (quarantine->curbytes + usize > opt_quarantine) {
size_t upper_bound = (opt_quarantine >= usize) ? opt_quarantine
- usize : 0;
quarantine_drain(quarantine, upper_bound);
quarantine_drain(tsd, quarantine, upper_bound);
}
/* Grow the quarantine ring buffer if it's full. */
if (quarantine->curobjs == (ZU(1) << quarantine->lg_maxobjs))
quarantine = quarantine_grow(quarantine);
quarantine = quarantine_grow(tsd, quarantine);
/* quarantine_grow() must free a slot if it fails to grow. */
assert(quarantine->curobjs < (ZU(1) << quarantine->lg_maxobjs));
/* Append ptr if its size doesn't exceed the quarantine size. */
@@ -141,12 +148,12 @@ quarantine(void *ptr)
obj->usize = usize;
quarantine->curbytes += usize;
quarantine->curobjs++;
if (config_fill && opt_junk) {
if (config_fill && unlikely(opt_junk_free)) {
/*
* Only do redzone validation if Valgrind isn't in
* operation.
*/
if ((config_valgrind == false || opt_valgrind == false)
if ((!config_valgrind || likely(!in_valgrind))
&& usize <= SMALL_MAXCLASS)
arena_quarantine_junk_small(ptr, usize);
else
@@ -154,46 +161,22 @@ quarantine(void *ptr)
}
} else {
assert(quarantine->curbytes == 0);
idalloc(ptr);
idalloc(tsd, ptr);
}
}
void
quarantine_cleanup(void *arg)
quarantine_cleanup(tsd_t *tsd)
{
quarantine_t *quarantine = *(quarantine_t **)arg;
quarantine_t *quarantine;
if (quarantine == QUARANTINE_STATE_REINCARNATED) {
/*
* Another destructor deallocated memory after this destructor
* was called. Reset quarantine to QUARANTINE_STATE_PURGATORY
* in order to receive another callback.
*/
quarantine = QUARANTINE_STATE_PURGATORY;
quarantine_tsd_set(&quarantine);
} else if (quarantine == QUARANTINE_STATE_PURGATORY) {
/*
* The previous time this destructor was called, we set the key
* to QUARANTINE_STATE_PURGATORY so that other destructors
* wouldn't cause re-creation of the quarantine. This time, do
* nothing, so that the destructor will not be called again.
*/
} else if (quarantine != NULL) {
quarantine_drain(quarantine, 0);
idalloc(quarantine);
quarantine = QUARANTINE_STATE_PURGATORY;
quarantine_tsd_set(&quarantine);
if (!config_fill)
return;
quarantine = tsd_quarantine_get(tsd);
if (quarantine != NULL) {
quarantine_drain(tsd, quarantine, 0);
idalloc(tsd, quarantine);
tsd_quarantine_set(tsd, NULL);
}
}
bool
quarantine_boot(void)
{
cassert(config_fill);
if (quarantine_tsd_boot())
return (true);
return (false);
}
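For reference, the quarantine managed above is a FIFO ring buffer: objects enter at (first + curobjs) mod capacity and drain from first until the byte budget holds. A minimal sketch with an illustrative struct (jemalloc's version also tracks per-object usize against a curbytes budget and grows the ring instead of evicting):

#include <stddef.h>
#include <stdlib.h>

typedef struct {
    void   **objs;
    size_t capacity; /* ZU(1) << lg_maxobjs in jemalloc. */
    size_t first;    /* Index of the oldest object. */
    size_t curobjs;
} my_quarantine_t;

static void
my_quarantine_drain_one(my_quarantine_t *q)
{
    free(q->objs[q->first]);
    q->curobjs--;
    q->first = (q->first + 1) % q->capacity;
}

static void
my_quarantine_push(my_quarantine_t *q, void *ptr)
{
    if (q->curobjs == q->capacity)
        my_quarantine_drain_one(q); /* Evict oldest instead of growing. */
    q->objs[(q->first + q->curobjs) % q->capacity] = ptr;
    q->curobjs++;
}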


@@ -9,8 +9,10 @@ rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc)
assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3));
bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1;
bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void
*)))) - 1;
bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE /
sizeof(uint8_t)))) - 1;
if (bits > bits_in_leaf) {
height = 1 + (bits - bits_in_leaf) / bits_per_level;
if ((height-1) * bits_per_level + bits_in_leaf != bits)
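The ffs(pow2_ceil(...)) - 1 idiom above computes log2 of the number of children per node, i.e. how many key bits one tree level consumes. A standalone sketch with an assumed node size (my_ names are illustrative):

#include <stddef.h>
#include <strings.h> /* ffs() */

#define MY_RTREE_NODESIZE ((size_t)1 << 16) /* Assumed node size. */

/* Round up to the next power of two, as pow2_ceil() does. */
static size_t
my_pow2_ceil(size_t x)
{
    size_t r = 1;

    while (r < x)
        r <<= 1;
    return (r);
}

/* ffs() of a power of two, minus one, is its log2. */
static unsigned
my_bits_per_level(void)
{
    return ((unsigned)ffs((int)my_pow2_ceil(MY_RTREE_NODESIZE /
        sizeof(void *))) - 1);
}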


@@ -48,8 +48,10 @@ static void stats_arena_bins_print(void (*write_cb)(void *, const char *),
void *cbopaque, unsigned i);
static void stats_arena_lruns_print(void (*write_cb)(void *, const char *),
void *cbopaque, unsigned i);
static void stats_arena_hchunks_print(
void (*write_cb)(void *, const char *), void *cbopaque, unsigned i);
static void stats_arena_print(void (*write_cb)(void *, const char *),
void *cbopaque, unsigned i, bool bins, bool large);
void *cbopaque, unsigned i, bool bins, bool large, bool huge);
/******************************************************************************/
@@ -58,62 +60,56 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque,
unsigned i)
{
size_t page;
bool config_tcache;
unsigned nbins, j, gap_start;
bool config_tcache, in_gap;
unsigned nbins, j;
CTL_GET("arenas.page", &page, size_t);
CTL_GET("config.tcache", &config_tcache, bool);
if (config_tcache) {
malloc_cprintf(write_cb, cbopaque,
"bins: bin size regs pgs allocated nmalloc"
" ndalloc nrequests nfills nflushes"
" newruns reruns curruns\n");
"bins: size ind allocated nmalloc"
" ndalloc nrequests curregs curruns regs"
" pgs util nfills nflushes newruns"
" reruns\n");
} else {
malloc_cprintf(write_cb, cbopaque,
"bins: bin size regs pgs allocated nmalloc"
" ndalloc newruns reruns curruns\n");
"bins: size ind allocated nmalloc"
" ndalloc nrequests curregs curruns regs"
" pgs util newruns reruns\n");
}
CTL_GET("arenas.nbins", &nbins, unsigned);
for (j = 0, gap_start = UINT_MAX; j < nbins; j++) {
for (j = 0, in_gap = false; j < nbins; j++) {
uint64_t nruns;
CTL_IJ_GET("stats.arenas.0.bins.0.nruns", &nruns, uint64_t);
if (nruns == 0) {
if (gap_start == UINT_MAX)
gap_start = j;
} else {
size_t reg_size, run_size, allocated;
if (nruns == 0)
in_gap = true;
else {
size_t reg_size, run_size, curregs, availregs, milli;
size_t curruns;
uint32_t nregs;
uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
uint64_t reruns;
size_t curruns;
char util[6]; /* "x.yyy". */
if (gap_start != UINT_MAX) {
if (j > gap_start + 1) {
/* Gap of more than one size class. */
malloc_cprintf(write_cb, cbopaque,
"[%u..%u]\n", gap_start,
j - 1);
} else {
/* Gap of one size class. */
malloc_cprintf(write_cb, cbopaque,
"[%u]\n", gap_start);
}
gap_start = UINT_MAX;
if (in_gap) {
malloc_cprintf(write_cb, cbopaque,
" ---\n");
in_gap = false;
}
CTL_J_GET("arenas.bin.0.size", &reg_size, size_t);
CTL_J_GET("arenas.bin.0.nregs", &nregs, uint32_t);
CTL_J_GET("arenas.bin.0.run_size", &run_size, size_t);
CTL_IJ_GET("stats.arenas.0.bins.0.allocated",
&allocated, size_t);
CTL_IJ_GET("stats.arenas.0.bins.0.nmalloc",
&nmalloc, uint64_t);
CTL_IJ_GET("stats.arenas.0.bins.0.ndalloc",
&ndalloc, uint64_t);
CTL_IJ_GET("stats.arenas.0.bins.0.curregs",
&curregs, size_t);
CTL_IJ_GET("stats.arenas.0.bins.0.nrequests",
&nrequests, uint64_t);
if (config_tcache) {
CTL_IJ_GET("stats.arenas.0.bins.0.nrequests",
&nrequests, uint64_t);
CTL_IJ_GET("stats.arenas.0.bins.0.nfills",
&nfills, uint64_t);
CTL_IJ_GET("stats.arenas.0.bins.0.nflushes",
@@ -123,35 +119,47 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque,
uint64_t);
CTL_IJ_GET("stats.arenas.0.bins.0.curruns", &curruns,
size_t);
availregs = nregs * curruns;
milli = (availregs != 0) ? (1000 * curregs) / availregs
: 1000;
assert(milli <= 1000);
if (milli < 10) {
malloc_snprintf(util, sizeof(util), "0.00%zu",
milli);
} else if (milli < 100) {
malloc_snprintf(util, sizeof(util), "0.0%zu",
milli);
} else if (milli < 1000) {
malloc_snprintf(util, sizeof(util), "0.%zu",
milli);
} else
malloc_snprintf(util, sizeof(util), "1");
if (config_tcache) {
malloc_cprintf(write_cb, cbopaque,
"%13u %5zu %4u %3zu %12zu %12"PRIu64
"%20zu %3u %12zu %12"PRIu64" %12"PRIu64
" %12"PRIu64" %12zu %12zu %4u %3zu %-5s"
" %12"PRIu64" %12"PRIu64" %12"PRIu64
" %12"PRIu64" %12"PRIu64" %12"PRIu64
" %12zu\n",
j, reg_size, nregs, run_size / page,
allocated, nmalloc, ndalloc, nrequests,
nfills, nflushes, nruns, reruns, curruns);
" %12"PRIu64"\n",
reg_size, j, curregs * reg_size, nmalloc,
ndalloc, nrequests, curregs, curruns, nregs,
run_size / page, util, nfills, nflushes,
nruns, reruns);
} else {
malloc_cprintf(write_cb, cbopaque,
"%13u %5zu %4u %3zu %12zu %12"PRIu64
" %12"PRIu64" %12"PRIu64" %12"PRIu64
" %12zu\n",
j, reg_size, nregs, run_size / page,
allocated, nmalloc, ndalloc, nruns, reruns,
curruns);
"%20zu %3u %12zu %12"PRIu64" %12"PRIu64
" %12"PRIu64" %12zu %12zu %4u %3zu %-5s"
" %12"PRIu64" %12"PRIu64"\n",
reg_size, j, curregs * reg_size, nmalloc,
ndalloc, nrequests, curregs, curruns, nregs,
run_size / page, util, nruns, reruns);
}
}
}
if (gap_start != UINT_MAX) {
if (j > gap_start + 1) {
/* Gap of more than one size class. */
malloc_cprintf(write_cb, cbopaque, "[%u..%u]\n",
gap_start, j - 1);
} else {
/* Gap of one size class. */
malloc_cprintf(write_cb, cbopaque, "[%u]\n", gap_start);
}
if (in_gap) {
malloc_cprintf(write_cb, cbopaque,
" ---\n");
}
}
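The new util column is computed in thousandths so it can be printed as "x.yyy" without floating point. A self-contained version of that formatting, with snprintf() standing in for malloc_snprintf():

#include <stddef.h>
#include <stdio.h>

static void
my_format_util(size_t curregs, size_t availregs, char util[6])
{
    /* Utilization in thousandths; an empty bin counts as full. */
    size_t milli = (availregs != 0) ? (1000 * curregs) / availregs : 1000;

    if (milli < 10)
        snprintf(util, 6, "0.00%zu", milli);
    else if (milli < 100)
        snprintf(util, 6, "0.0%zu", milli);
    else if (milli < 1000)
        snprintf(util, 6, "0.%zu", milli);
    else
        snprintf(util, 6, "1");
}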
@@ -159,16 +167,15 @@ static void
stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque,
unsigned i)
{
size_t page, nlruns, j;
ssize_t gap_start;
CTL_GET("arenas.page", &page, size_t);
unsigned nbins, nlruns, j;
bool in_gap;
malloc_cprintf(write_cb, cbopaque,
"large: size pages nmalloc ndalloc nrequests"
" curruns\n");
CTL_GET("arenas.nlruns", &nlruns, size_t);
for (j = 0, gap_start = -1; j < nlruns; j++) {
"large: size ind allocated nmalloc ndalloc"
" nrequests curruns\n");
CTL_GET("arenas.nbins", &nbins, unsigned);
CTL_GET("arenas.nlruns", &nlruns, unsigned);
for (j = 0, in_gap = false; j < nlruns; j++) {
uint64_t nmalloc, ndalloc, nrequests;
size_t run_size, curruns;
@@ -178,32 +185,82 @@ stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque,
uint64_t);
CTL_IJ_GET("stats.arenas.0.lruns.0.nrequests", &nrequests,
uint64_t);
if (nrequests == 0) {
if (gap_start == -1)
gap_start = j;
} else {
if (nrequests == 0)
in_gap = true;
else {
CTL_J_GET("arenas.lrun.0.size", &run_size, size_t);
CTL_IJ_GET("stats.arenas.0.lruns.0.curruns", &curruns,
size_t);
if (gap_start != -1) {
malloc_cprintf(write_cb, cbopaque, "[%zu]\n",
j - gap_start);
gap_start = -1;
if (in_gap) {
malloc_cprintf(write_cb, cbopaque,
" ---\n");
in_gap = false;
}
malloc_cprintf(write_cb, cbopaque,
"%13zu %5zu %12"PRIu64" %12"PRIu64" %12"PRIu64
"%20zu %3u %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64
" %12zu\n",
run_size, run_size / page, nmalloc, ndalloc,
nrequests, curruns);
run_size, nbins + j, curruns * run_size, nmalloc,
ndalloc, nrequests, curruns);
}
}
if (gap_start != -1)
malloc_cprintf(write_cb, cbopaque, "[%zu]\n", j - gap_start);
if (in_gap) {
malloc_cprintf(write_cb, cbopaque,
" ---\n");
}
}
static void
stats_arena_hchunks_print(void (*write_cb)(void *, const char *),
void *cbopaque, unsigned i)
{
unsigned nbins, nlruns, nhchunks, j;
bool in_gap;
malloc_cprintf(write_cb, cbopaque,
"huge: size ind allocated nmalloc ndalloc"
" nrequests curhchunks\n");
CTL_GET("arenas.nbins", &nbins, unsigned);
CTL_GET("arenas.nlruns", &nlruns, unsigned);
CTL_GET("arenas.nhchunks", &nhchunks, unsigned);
for (j = 0, in_gap = false; j < nhchunks; j++) {
uint64_t nmalloc, ndalloc, nrequests;
size_t hchunk_size, curhchunks;
CTL_IJ_GET("stats.arenas.0.hchunks.0.nmalloc", &nmalloc,
uint64_t);
CTL_IJ_GET("stats.arenas.0.hchunks.0.ndalloc", &ndalloc,
uint64_t);
CTL_IJ_GET("stats.arenas.0.hchunks.0.nrequests", &nrequests,
uint64_t);
if (nrequests == 0)
in_gap = true;
else {
CTL_J_GET("arenas.hchunk.0.size", &hchunk_size,
size_t);
CTL_IJ_GET("stats.arenas.0.hchunks.0.curhchunks",
&curhchunks, size_t);
if (in_gap) {
malloc_cprintf(write_cb, cbopaque,
" ---\n");
in_gap = false;
}
malloc_cprintf(write_cb, cbopaque,
"%20zu %3u %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64
" %12zu\n",
hchunk_size, nbins + nlruns + j,
curhchunks * hchunk_size, nmalloc, ndalloc,
nrequests, curhchunks);
}
}
if (in_gap) {
malloc_cprintf(write_cb, cbopaque,
" ---\n");
}
}
static void
stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
unsigned i, bool bins, bool large)
unsigned i, bool bins, bool large, bool huge)
{
unsigned nthreads;
const char *dss;
@@ -213,6 +270,8 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
uint64_t small_nmalloc, small_ndalloc, small_nrequests;
size_t large_allocated;
uint64_t large_nmalloc, large_ndalloc, large_nrequests;
size_t huge_allocated;
uint64_t huge_nmalloc, huge_ndalloc, huge_nrequests;
CTL_GET("arenas.page", &page, size_t);
@@ -234,35 +293,51 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
nmadvise, nmadvise == 1 ? "" : "s", purged);
malloc_cprintf(write_cb, cbopaque,
" allocated nmalloc ndalloc nrequests\n");
" allocated nmalloc ndalloc"
" nrequests\n");
CTL_I_GET("stats.arenas.0.small.allocated", &small_allocated, size_t);
CTL_I_GET("stats.arenas.0.small.nmalloc", &small_nmalloc, uint64_t);
CTL_I_GET("stats.arenas.0.small.ndalloc", &small_ndalloc, uint64_t);
CTL_I_GET("stats.arenas.0.small.nrequests", &small_nrequests, uint64_t);
malloc_cprintf(write_cb, cbopaque,
"small: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n",
"small: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64
"\n",
small_allocated, small_nmalloc, small_ndalloc, small_nrequests);
CTL_I_GET("stats.arenas.0.large.allocated", &large_allocated, size_t);
CTL_I_GET("stats.arenas.0.large.nmalloc", &large_nmalloc, uint64_t);
CTL_I_GET("stats.arenas.0.large.ndalloc", &large_ndalloc, uint64_t);
CTL_I_GET("stats.arenas.0.large.nrequests", &large_nrequests, uint64_t);
malloc_cprintf(write_cb, cbopaque,
"large: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n",
"large: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64
"\n",
large_allocated, large_nmalloc, large_ndalloc, large_nrequests);
CTL_I_GET("stats.arenas.0.huge.allocated", &huge_allocated, size_t);
CTL_I_GET("stats.arenas.0.huge.nmalloc", &huge_nmalloc, uint64_t);
CTL_I_GET("stats.arenas.0.huge.ndalloc", &huge_ndalloc, uint64_t);
CTL_I_GET("stats.arenas.0.huge.nrequests", &huge_nrequests, uint64_t);
malloc_cprintf(write_cb, cbopaque,
"total: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n",
small_allocated + large_allocated,
small_nmalloc + large_nmalloc,
small_ndalloc + large_ndalloc,
small_nrequests + large_nrequests);
malloc_cprintf(write_cb, cbopaque, "active: %12zu\n", pactive * page);
"huge: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64
"\n",
huge_allocated, huge_nmalloc, huge_ndalloc, huge_nrequests);
malloc_cprintf(write_cb, cbopaque,
"total: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64
"\n",
small_allocated + large_allocated + huge_allocated,
small_nmalloc + large_nmalloc + huge_nmalloc,
small_ndalloc + large_ndalloc + huge_ndalloc,
small_nrequests + large_nrequests + huge_nrequests);
malloc_cprintf(write_cb, cbopaque, "active: %12zu\n",
pactive * page);
CTL_I_GET("stats.arenas.0.mapped", &mapped, size_t);
malloc_cprintf(write_cb, cbopaque, "mapped: %12zu\n", mapped);
malloc_cprintf(write_cb, cbopaque, "mapped: %12zu\n",
mapped);
if (bins)
stats_arena_bins_print(write_cb, cbopaque, i);
if (large)
stats_arena_lruns_print(write_cb, cbopaque, i);
if (huge)
stats_arena_hchunks_print(write_cb, cbopaque, i);
}
void
@@ -277,6 +352,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
bool unmerged = true;
bool bins = true;
bool large = true;
bool huge = true;
/*
* Refresh stats, in case mallctl() was called by the application.
@@ -319,6 +395,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
case 'l':
large = false;
break;
case 'h':
huge = false;
break;
default:;
}
}
@@ -327,7 +406,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
malloc_cprintf(write_cb, cbopaque,
"___ Begin jemalloc statistics ___\n");
if (general) {
int err;
const char *cpv;
bool bv;
unsigned uv;
@@ -346,26 +424,31 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
bv ? "enabled" : "disabled");
#define OPT_WRITE_BOOL(n) \
if ((err = je_mallctl("opt."#n, &bv, &bsz, NULL, 0)) \
== 0) { \
if (je_mallctl("opt."#n, &bv, &bsz, NULL, 0) == 0) { \
malloc_cprintf(write_cb, cbopaque, \
" opt."#n": %s\n", bv ? "true" : "false"); \
}
#define OPT_WRITE_BOOL_MUTABLE(n, m) { \
bool bv2; \
if (je_mallctl("opt."#n, &bv, &bsz, NULL, 0) == 0 && \
je_mallctl(#m, &bv2, &bsz, NULL, 0) == 0) { \
malloc_cprintf(write_cb, cbopaque, \
" opt."#n": %s ("#m": %s)\n", bv ? "true" \
: "false", bv2 ? "true" : "false"); \
} \
}
#define OPT_WRITE_SIZE_T(n) \
if ((err = je_mallctl("opt."#n, &sv, &ssz, NULL, 0)) \
== 0) { \
if (je_mallctl("opt."#n, &sv, &ssz, NULL, 0) == 0) { \
malloc_cprintf(write_cb, cbopaque, \
" opt."#n": %zu\n", sv); \
}
#define OPT_WRITE_SSIZE_T(n) \
if ((err = je_mallctl("opt."#n, &ssv, &sssz, NULL, 0)) \
== 0) { \
if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0) { \
malloc_cprintf(write_cb, cbopaque, \
" opt."#n": %zd\n", ssv); \
}
#define OPT_WRITE_CHAR_P(n) \
if ((err = je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0)) \
== 0) { \
if (je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0) == 0) { \
malloc_cprintf(write_cb, cbopaque, \
" opt."#n": \"%s\"\n", cpv); \
}
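The OPT_WRITE_* macros are thin wrappers over mallctl() reads. Reading entries by hand looks like the sketch below; it assumes a je_-prefixed build (as configured in this tree) and the standard installed header path. "opt.tcache" and "arenas.page" are real mallctl names, but consult the manual for the jemalloc version in use:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

static void
print_some_options(void)
{
    bool bv;
    size_t sv;
    size_t bsz = sizeof(bv), ssz = sizeof(sv);

    /* oldp/oldlenp read the value; newp == NULL means no write. */
    if (je_mallctl("opt.tcache", &bv, &bsz, NULL, 0) == 0)
        printf("  opt.tcache: %s\n", bv ? "true" : "false");
    if (je_mallctl("arenas.page", &sv, &ssz, NULL, 0) == 0)
        printf("  arenas.page: %zu\n", sv);
}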
@@ -389,7 +472,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
OPT_WRITE_SSIZE_T(lg_tcache_max)
OPT_WRITE_BOOL(prof)
OPT_WRITE_CHAR_P(prof_prefix)
OPT_WRITE_BOOL(prof_active)
OPT_WRITE_BOOL_MUTABLE(prof_active, prof.active)
OPT_WRITE_BOOL_MUTABLE(prof_thread_active_init,
prof.thread_active_init)
OPT_WRITE_SSIZE_T(lg_prof_sample)
OPT_WRITE_BOOL(prof_accum)
OPT_WRITE_SSIZE_T(lg_prof_interval)
@@ -398,6 +483,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
OPT_WRITE_BOOL(prof_leak)
#undef OPT_WRITE_BOOL
#undef OPT_WRITE_BOOL_MUTABLE
#undef OPT_WRITE_SIZE_T
#undef OPT_WRITE_SSIZE_T
#undef OPT_WRITE_CHAR_P
@@ -425,14 +511,12 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
malloc_cprintf(write_cb, cbopaque,
"Min active:dirty page ratio per arena: N/A\n");
}
if ((err = je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0))
== 0) {
if (je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0) == 0) {
malloc_cprintf(write_cb, cbopaque,
"Maximum thread-cached size class: %zu\n", sv);
}
if ((err = je_mallctl("opt.prof", &bv, &bsz, NULL, 0)) == 0 &&
bv) {
CTL_GET("opt.lg_prof_sample", &sv, size_t);
if (je_mallctl("opt.prof", &bv, &bsz, NULL, 0) == 0 && bv) {
CTL_GET("prof.lg_sample", &sv, size_t);
malloc_cprintf(write_cb, cbopaque,
"Average profile sample interval: %"PRIu64
" (2^%zu)\n", (((uint64_t)1U) << sv), sv);
@@ -458,8 +542,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
size_t allocated, active, mapped;
size_t chunks_current, chunks_high;
uint64_t chunks_total;
size_t huge_allocated;
uint64_t huge_nmalloc, huge_ndalloc;
CTL_GET("stats.cactive", &cactive, size_t *);
CTL_GET("stats.allocated", &allocated, size_t);
@@ -481,16 +563,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
" %13"PRIu64" %12zu %12zu\n",
chunks_total, chunks_high, chunks_current);
/* Print huge stats. */
CTL_GET("stats.huge.nmalloc", &huge_nmalloc, uint64_t);
CTL_GET("stats.huge.ndalloc", &huge_ndalloc, uint64_t);
CTL_GET("stats.huge.allocated", &huge_allocated, size_t);
malloc_cprintf(write_cb, cbopaque,
"huge: nmalloc ndalloc allocated\n");
malloc_cprintf(write_cb, cbopaque,
" %12"PRIu64" %12"PRIu64" %12zu\n",
huge_nmalloc, huge_ndalloc, huge_allocated);
if (merged) {
unsigned narenas;
@@ -508,12 +580,12 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
ninitialized++;
}
if (ninitialized > 1 || unmerged == false) {
if (ninitialized > 1 || !unmerged) {
/* Print merged arena stats. */
malloc_cprintf(write_cb, cbopaque,
"\nMerged arenas stats:\n");
stats_arena_print(write_cb, cbopaque,
narenas, bins, large);
narenas, bins, large, huge);
}
}
}
@@ -539,7 +611,8 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
cbopaque,
"\narenas[%u]:\n", i);
stats_arena_print(write_cb,
cbopaque, i, bins, large);
cbopaque, i, bins, large,
huge);
}
}
}


@@ -4,9 +4,6 @@
/******************************************************************************/
/* Data. */
malloc_tsd_data(, tcache, tcache_t *, NULL)
malloc_tsd_data(, tcache_enabled, tcache_enabled_t, tcache_enabled_default)
bool opt_tcache = true;
ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
@@ -27,7 +24,7 @@ size_t tcache_salloc(const void *ptr)
void
tcache_event_hard(tcache_t *tcache)
{
size_t binind = tcache->next_gc_bin;
index_t binind = tcache->next_gc_bin;
tcache_bin_t *tbin = &tcache->tbins[binind];
tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];
@@ -65,7 +62,7 @@ tcache_event_hard(tcache_t *tcache)
}
void *
tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind)
tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, index_t binind)
{
void *ret;
@@ -79,7 +76,7 @@ tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind)
}
void
tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem,
tcache_t *tcache)
{
void *ptr;
@@ -104,7 +101,7 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
malloc_mutex_lock(&bin->lock);
if (config_stats && arena == tcache->arena) {
assert(merged_stats == false);
assert(!merged_stats);
merged_stats = true;
bin->stats.nflushes++;
bin->stats.nrequests += tbin->tstats.nrequests;
@@ -118,14 +115,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
if (chunk->arena == arena) {
size_t pageind = ((uintptr_t)ptr -
(uintptr_t)chunk) >> LG_PAGE;
arena_chunk_map_t *mapelm =
arena_mapp_get(chunk, pageind);
if (config_fill && opt_junk) {
arena_alloc_junk_small(ptr,
&arena_bin_info[binind], true);
}
arena_dalloc_bin_locked(arena, chunk, ptr,
mapelm);
arena_chunk_map_bits_t *bitselm =
arena_bitselm_get(chunk, pageind);
arena_dalloc_bin_junked_locked(arena, chunk,
ptr, bitselm);
} else {
/*
* This object was allocated via a different
@@ -139,7 +132,7 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
}
malloc_mutex_unlock(&bin->lock);
}
if (config_stats && merged_stats == false) {
if (config_stats && !merged_stats) {
/*
* The flush loop didn't happen to flush to this thread's
* arena, so the stats didn't get merged. Manually do so now.
@@ -160,7 +153,7 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem,
}
void
tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem,
tcache_t *tcache)
{
void *ptr;
@@ -200,9 +193,10 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
ptr = tbin->avail[i];
assert(ptr != NULL);
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
if (chunk->arena == arena)
arena_dalloc_large_locked(arena, chunk, ptr);
else {
if (chunk->arena == arena) {
arena_dalloc_large_junked_locked(arena, chunk,
ptr);
} else {
/*
* This object was allocated via a different
* arena than the one that is currently locked.
@@ -217,7 +211,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem,
if (config_prof && idump)
prof_idump();
}
if (config_stats && merged_stats == false) {
if (config_stats && !merged_stats) {
/*
* The flush loop didn't happen to flush to this thread's
* arena, so the stats didn't get merged. Manually do so now.
@@ -252,6 +246,14 @@ tcache_arena_associate(tcache_t *tcache, arena_t *arena)
tcache->arena = arena;
}
void
tcache_arena_reassociate(tcache_t *tcache, arena_t *arena)
{
tcache_arena_dissociate(tcache);
tcache_arena_associate(tcache, arena);
}
void
tcache_arena_dissociate(tcache_t *tcache)
{
@@ -266,7 +268,23 @@ tcache_arena_dissociate(tcache_t *tcache)
}
tcache_t *
tcache_create(arena_t *arena)
tcache_get_hard(tsd_t *tsd)
{
arena_t *arena;
if (!tcache_enabled_get()) {
if (tsd_nominal(tsd))
tcache_enabled_set(false); /* Memoize. */
return (NULL);
}
arena = arena_choose(tsd, NULL);
if (unlikely(arena == NULL))
return (NULL);
return (tcache_create(tsd, arena));
}
tcache_t *
tcache_create(tsd_t *tsd, arena_t *arena)
{
tcache_t *tcache;
size_t size, stack_offset;
@@ -277,23 +295,10 @@ tcache_create(arena_t *arena)
size = PTR_CEILING(size);
stack_offset = size;
size += stack_nelms * sizeof(void *);
/*
* Round up to the nearest multiple of the cacheline size, in order to
* avoid the possibility of false cacheline sharing.
*
* That this works relies on the same logic as in ipalloc(), but we
* cannot directly call ipalloc() here due to tcache bootstrapping
* issues.
*/
size = (size + CACHELINE_MASK) & (-CACHELINE);
if (size <= SMALL_MAXCLASS)
tcache = (tcache_t *)arena_malloc_small(arena, size, true);
else if (size <= tcache_maxclass)
tcache = (tcache_t *)arena_malloc_large(arena, size, true);
else
tcache = (tcache_t *)icalloct(size, false, arena);
/* Avoid false cacheline sharing. */
size = sa2u(size, CACHELINE);
tcache = ipalloct(tsd, size, CACHELINE, true, false, arena);
if (tcache == NULL)
return (NULL);
@@ -307,16 +312,13 @@ tcache_create(arena_t *arena)
stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
}
tcache_tsd_set(&tcache);
return (tcache);
}
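The deleted rounding above and the new sa2u(size, CACHELINE) path share one goal: pad the tcache to a cacheline boundary so adjacent caches don't share a line. The manual rounding, for reference (64-byte cacheline assumed; note that x & -CACHELINE equals x & ~CACHELINE_MASK when CACHELINE is a power of two):

#include <stddef.h>

#define MY_CACHELINE      ((size_t)64) /* Assumed cacheline size. */
#define MY_CACHELINE_MASK (MY_CACHELINE - 1)

/* Round size up to the next multiple of the cacheline size. */
static size_t
my_cacheline_ceiling(size_t size)
{
    return ((size + MY_CACHELINE_MASK) & ~MY_CACHELINE_MASK);
}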
void
tcache_destroy(tcache_t *tcache)
static void
tcache_destroy(tsd_t *tsd, tcache_t *tcache)
{
unsigned i;
size_t tcache_size;
tcache_arena_dissociate(tcache);
@@ -351,54 +353,30 @@ tcache_destroy(tcache_t *tcache)
arena_prof_accum(tcache->arena, tcache->prof_accumbytes))
prof_idump();
tcache_size = arena_salloc(tcache, false);
if (tcache_size <= SMALL_MAXCLASS) {
arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
arena_t *arena = chunk->arena;
size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >>
LG_PAGE;
arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind);
arena_dalloc_bin(arena, chunk, tcache, pageind, mapelm);
} else if (tcache_size <= tcache_maxclass) {
arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
arena_t *arena = chunk->arena;
arena_dalloc_large(arena, chunk, tcache);
} else
idalloct(tcache, false);
idalloct(tsd, tcache, false);
}
void
tcache_thread_cleanup(void *arg)
tcache_cleanup(tsd_t *tsd)
{
tcache_t *tcache = *(tcache_t **)arg;
tcache_t *tcache;
if (tcache == TCACHE_STATE_DISABLED) {
/* Do nothing. */
} else if (tcache == TCACHE_STATE_REINCARNATED) {
/*
* Another destructor called an allocator function after this
* destructor was called. Reset tcache to
* TCACHE_STATE_PURGATORY in order to receive another callback.
*/
tcache = TCACHE_STATE_PURGATORY;
tcache_tsd_set(&tcache);
} else if (tcache == TCACHE_STATE_PURGATORY) {
/*
* The previous time this destructor was called, we set the key
* to TCACHE_STATE_PURGATORY so that other destructors wouldn't
* cause re-creation of the tcache. This time, do nothing, so
* that the destructor will not be called again.
*/
} else if (tcache != NULL) {
assert(tcache != TCACHE_STATE_PURGATORY);
tcache_destroy(tcache);
tcache = TCACHE_STATE_PURGATORY;
tcache_tsd_set(&tcache);
if (!config_tcache)
return;
if ((tcache = tsd_tcache_get(tsd)) != NULL) {
tcache_destroy(tsd, tcache);
tsd_tcache_set(tsd, NULL);
}
}
void
tcache_enabled_cleanup(tsd_t *tsd)
{
/* Do nothing. */
}
/* Caller must own arena->lock. */
void
tcache_stats_merge(tcache_t *tcache, arena_t *arena)
@@ -427,7 +405,7 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena)
}
bool
tcache_boot0(void)
tcache_boot(void)
{
unsigned i;
@@ -467,13 +445,3 @@ tcache_boot0(void)
return (false);
}
bool
tcache_boot1(void)
{
if (tcache_tsd_boot() || tcache_enabled_tsd_boot())
return (true);
return (false);
}


@@ -7,21 +7,22 @@
static unsigned ncleanups;
static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX];
malloc_tsd_data(, , tsd_t, TSD_INITIALIZER)
/******************************************************************************/
void *
malloc_tsd_malloc(size_t size)
{
/* Avoid choose_arena() in order to dodge bootstrapping issues. */
return (arena_malloc(arenas[0], size, false, false));
return (a0malloc(CACHELINE_CEILING(size)));
}
void
malloc_tsd_dalloc(void *wrapper)
{
idalloct(wrapper, false);
a0free(wrapper);
}
void
@@ -67,10 +68,58 @@ malloc_tsd_cleanup_register(bool (*f)(void))
}
void
malloc_tsd_boot(void)
tsd_cleanup(void *arg)
{
tsd_t *tsd = (tsd_t *)arg;
switch (tsd->state) {
case tsd_state_nominal:
#define O(n, t) \
n##_cleanup(tsd);
MALLOC_TSD
#undef O
tsd->state = tsd_state_purgatory;
tsd_set(tsd);
break;
case tsd_state_purgatory:
/*
* The previous time this destructor was called, we set the
* state to tsd_state_purgatory so that other destructors
* wouldn't cause re-creation of the tsd. This time, do
* nothing, and do not request another callback.
*/
break;
case tsd_state_reincarnated:
/*
* Another destructor deallocated memory after this destructor
* was called. Reset state to tsd_state_purgatory and request
* another callback.
*/
tsd->state = tsd_state_purgatory;
tsd_set(tsd);
break;
default:
not_reached();
}
}
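The purgatory state exists because POSIX may re-run a TSD destructor if the slot is repopulated during thread teardown. A hedged pthreads sketch of the same trick, re-arming the slot once and then going quiet; register it with pthread_key_create(&my_key, my_destructor):

#include <pthread.h>
#include <stdint.h>

static pthread_key_t my_key;

enum { MY_NOMINAL = 1, MY_PURGATORY = 2 };

static void
my_destructor(void *arg)
{
    switch ((uintptr_t)arg) {
    case MY_NOMINAL:
        /* Real cleanup happens here; then request one more pass. */
        pthread_setspecific(my_key, (void *)(uintptr_t)MY_PURGATORY);
        break;
    case MY_PURGATORY:
        /* Second pass: do nothing, so no further callbacks occur. */
        break;
    default:
        break;
    }
}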
bool
malloc_tsd_boot0(void)
{
ncleanups = 0;
if (tsd_boot0())
return (true);
*tsd_arenas_cache_bypassp_get(tsd_fetch()) = true;
return (false);
}
void
malloc_tsd_boot1(void)
{
tsd_boot1();
*tsd_arenas_cache_bypassp_get(tsd_fetch()) = false;
}
#ifdef _WIN32


@@ -266,7 +266,7 @@ d2s(intmax_t x, char sign, char *s, size_t *slen_p)
sign = '-';
switch (sign) {
case '-':
if (neg == false)
if (!neg)
break;
/* Fall through. */
case ' ':
@@ -329,7 +329,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
/* Left padding. */ \
size_t pad_len = (width == -1) ? 0 : ((slen < (size_t)width) ? \
(size_t)width - slen : 0); \
if (left_justify == false && pad_len != 0) { \
if (!left_justify && pad_len != 0) { \
size_t j; \
for (j = 0; j < pad_len; j++) \
APPEND_C(' '); \
@@ -381,7 +381,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
case 'p': /* Synthetic; used for %p. */ \
val = va_arg(ap, uintptr_t); \
break; \
default: \
default: \
not_reached(); \
val = 0; \
} \
@@ -406,19 +406,19 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
while (true) {
switch (*f) {
case '#':
assert(alt_form == false);
assert(!alt_form);
alt_form = true;
break;
case '-':
assert(left_justify == false);
assert(!left_justify);
left_justify = true;
break;
case ' ':
assert(plus_space == false);
assert(!plus_space);
plus_space = true;
break;
case '+':
assert(plus_plus == false);
assert(!plus_plus);
plus_plus = true;
break;
default: goto label_width;


@@ -0,0 +1,34 @@
#include "jemalloc/internal/jemalloc_internal.h"
#ifndef JEMALLOC_VALGRIND
# error "This source file is for Valgrind integration."
#endif
#include <valgrind/memcheck.h>
void
valgrind_make_mem_noaccess(void *ptr, size_t usize)
{
VALGRIND_MAKE_MEM_NOACCESS(ptr, usize);
}
void
valgrind_make_mem_undefined(void *ptr, size_t usize)
{
VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize);
}
void
valgrind_make_mem_defined(void *ptr, size_t usize)
{
VALGRIND_MAKE_MEM_DEFINED(ptr, usize);
}
void
valgrind_freelike_block(void *ptr, size_t usize)
{
VALGRIND_FREELIKE_BLOCK(ptr, usize);
}
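Example use of the wrapped client requests: mark a buffer's payload undefined and its redzone inaccessible, so Memcheck reports any touch of the redzone. The layout is illustrative, not jemalloc's exact redzone scheme:

#include <stddef.h>
#include <valgrind/memcheck.h>

static void
my_protect_redzone(char *buf, size_t usable, size_t redzone)
{
    /* Payload: addressable but its contents are uninitialized. */
    VALGRIND_MAKE_MEM_UNDEFINED(buf, usable);
    /* Redzone: any read or write is an error under Memcheck. */
    VALGRIND_MAKE_MEM_NOACCESS(buf + usable, redzone);
}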


@@ -258,13 +258,13 @@ register_zone(void)
/*
* On OSX 10.6, having the default purgeable zone appear before
* the default zone makes some things crash because it thinks it
* owns the default zone allocated pointers. We thus unregister/
* re-register it in order to ensure it's always after the
* default zone. On OSX < 10.6, there is no purgeable zone, so
* this does nothing. On OSX >= 10.6, unregistering replaces the
* purgeable zone with the last registered zone above, i.e the
* default zone. Registering it again then puts it at the end,
* obviously after the default zone.
* owns the default zone allocated pointers. We thus
* unregister/re-register it in order to ensure it's always
* after the default zone. On OSX < 10.6, there is no purgeable
* zone, so this does nothing. On OSX >= 10.6, unregistering
* replaces the purgeable zone with the last registered zone
* above, i.e. the default zone. Registering it again then puts
* it at the end, obviously after the default zone.
*/
if (purgeable_zone) {
malloc_zone_unregister(purgeable_zone);


@@ -0,0 +1,31 @@
/* btalloc() provides a mechanism for allocating via permuted backtraces. */
void *btalloc(size_t size, unsigned bits);
#define btalloc_n_proto(n) \
void *btalloc_##n(size_t size, unsigned bits);
btalloc_n_proto(0)
btalloc_n_proto(1)
#define btalloc_n_gen(n) \
void * \
btalloc_##n(size_t size, unsigned bits) \
{ \
void *p; \
\
if (bits == 0) \
p = mallocx(size, 0); \
else { \
switch (bits & 0x1U) { \
case 0: \
p = (btalloc_0(size, bits >> 1)); \
break; \
case 1: \
p = (btalloc_1(size, bits >> 1)); \
break; \
default: not_reached(); \
} \
} \
/* Intentionally sabotage tail call optimization. */ \
assert_ptr_not_null(p, "Unexpected mallocx() failure"); \
return (p); \
}
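Instantiating the generator twice yields two mutually recursive allocators, so each bit pattern passed to btalloc() produces a distinct backtrace. A sketch of how the test sources plausibly wire it up, rooted at btalloc_0() (unverified against the actual in-tree files; mallocx and assert_ptr_not_null come from the test harness header):

#include "test/jemalloc_test.h"

/* One translation unit per flavor in the real tree; both shown here. */
btalloc_n_gen(0)
btalloc_n_gen(1)

void *
btalloc(size_t size, unsigned bits)
{
    return (btalloc_0(size, bits));
}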


@@ -5,6 +5,7 @@
#include <inttypes.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#ifdef _WIN32
# include <windows.h>
@@ -132,10 +133,12 @@
/*
* Common test utilities.
*/
#include "test/btalloc.h"
#include "test/math.h"
#include "test/mtx.h"
#include "test/mq.h"
#include "test/test.h"
#include "test/timer.h"
#include "test/thd.h"
#define MEXP 19937
#include "test/SFMT.h"


@@ -1,5 +1,9 @@
#include "jemalloc/internal/jemalloc_internal_defs.h"
#include "jemalloc/internal/jemalloc_internal_decls.h"
/* For use by SFMT. */
/*
* For use by SFMT. configure.ac doesn't actually define HAVE_SSE2 because its
* dependencies are notoriously unportable in practice.
*/
#undef HAVE_SSE2
#undef HAVE_ALTIVEC


@@ -299,7 +299,7 @@ pt_chi2(double p, double df, double ln_gamma_df_2)
/*
* Given a value p in [0..1] and Gamma distribution shape and scale parameters,
* compute the upper limit on the definite integeral from [0..z] that satisfies
* compute the upper limit on the definite integral from [0..z] that satisfies
* p.
*/
JEMALLOC_INLINE double


@@ -1,6 +1,6 @@
#define ASSERT_BUFSIZE 256
#define assert_cmp(t, a, b, cmp, neg_cmp, pri, fmt...) do { \
#define assert_cmp(t, a, b, cmp, neg_cmp, pri, ...) do { \
t a_ = (a); \
t b_ = (b); \
if (!(a_ cmp b_)) { \
@@ -12,205 +12,205 @@
"%"pri" "#neg_cmp" %"pri": ", \
__func__, __FILE__, __LINE__, \
#a, #b, a_, b_); \
malloc_snprintf(message, sizeof(message), fmt); \
malloc_snprintf(message, sizeof(message), __VA_ARGS__); \
p_test_fail(prefix, message); \
} \
} while (0)
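This hunk, and the long run of macro rewrites that follows, swaps the GNU-only named-varargs spelling for C99 __VA_ARGS__, which MSVC also accepts. The portable shape, for reference:

#include <stdio.h>

/* Portable C99 variadic macro, as used after this change. */
#define MY_LOG(...) fprintf(stderr, __VA_ARGS__)

/* The old code used the GNU-only form: #define MY_LOG(fmt...) fprintf(stderr, fmt) */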
#define assert_ptr_eq(a, b, fmt...) assert_cmp(void *, a, b, ==, \
!=, "p", fmt)
#define assert_ptr_ne(a, b, fmt...) assert_cmp(void *, a, b, !=, \
==, "p", fmt)
#define assert_ptr_null(a, fmt...) assert_cmp(void *, a, NULL, ==, \
!=, "p", fmt)
#define assert_ptr_not_null(a, fmt...) assert_cmp(void *, a, NULL, !=, \
==, "p", fmt)
#define assert_ptr_eq(a, b, ...) assert_cmp(void *, a, b, ==, \
!=, "p", __VA_ARGS__)
#define assert_ptr_ne(a, b, ...) assert_cmp(void *, a, b, !=, \
==, "p", __VA_ARGS__)
#define assert_ptr_null(a, ...) assert_cmp(void *, a, NULL, ==, \
!=, "p", __VA_ARGS__)
#define assert_ptr_not_null(a, ...) assert_cmp(void *, a, NULL, !=, \
==, "p", __VA_ARGS__)
#define assert_c_eq(a, b, fmt...) assert_cmp(char, a, b, ==, !=, "c", fmt)
#define assert_c_ne(a, b, fmt...) assert_cmp(char, a, b, !=, ==, "c", fmt)
#define assert_c_lt(a, b, fmt...) assert_cmp(char, a, b, <, >=, "c", fmt)
#define assert_c_le(a, b, fmt...) assert_cmp(char, a, b, <=, >, "c", fmt)
#define assert_c_ge(a, b, fmt...) assert_cmp(char, a, b, >=, <, "c", fmt)
#define assert_c_gt(a, b, fmt...) assert_cmp(char, a, b, >, <=, "c", fmt)
#define assert_c_eq(a, b, ...) assert_cmp(char, a, b, ==, !=, "c", __VA_ARGS__)
#define assert_c_ne(a, b, ...) assert_cmp(char, a, b, !=, ==, "c", __VA_ARGS__)
#define assert_c_lt(a, b, ...) assert_cmp(char, a, b, <, >=, "c", __VA_ARGS__)
#define assert_c_le(a, b, ...) assert_cmp(char, a, b, <=, >, "c", __VA_ARGS__)
#define assert_c_ge(a, b, ...) assert_cmp(char, a, b, >=, <, "c", __VA_ARGS__)
#define assert_c_gt(a, b, ...) assert_cmp(char, a, b, >, <=, "c", __VA_ARGS__)
#define assert_x_eq(a, b, fmt...) assert_cmp(int, a, b, ==, !=, "#x", fmt)
#define assert_x_ne(a, b, fmt...) assert_cmp(int, a, b, !=, ==, "#x", fmt)
#define assert_x_lt(a, b, fmt...) assert_cmp(int, a, b, <, >=, "#x", fmt)
#define assert_x_le(a, b, fmt...) assert_cmp(int, a, b, <=, >, "#x", fmt)
#define assert_x_ge(a, b, fmt...) assert_cmp(int, a, b, >=, <, "#x", fmt)
#define assert_x_gt(a, b, fmt...) assert_cmp(int, a, b, >, <=, "#x", fmt)
#define assert_x_eq(a, b, ...) assert_cmp(int, a, b, ==, !=, "#x", __VA_ARGS__)
#define assert_x_ne(a, b, ...) assert_cmp(int, a, b, !=, ==, "#x", __VA_ARGS__)
#define assert_x_lt(a, b, ...) assert_cmp(int, a, b, <, >=, "#x", __VA_ARGS__)
#define assert_x_le(a, b, ...) assert_cmp(int, a, b, <=, >, "#x", __VA_ARGS__)
#define assert_x_ge(a, b, ...) assert_cmp(int, a, b, >=, <, "#x", __VA_ARGS__)
#define assert_x_gt(a, b, ...) assert_cmp(int, a, b, >, <=, "#x", __VA_ARGS__)
#define assert_d_eq(a, b, fmt...) assert_cmp(int, a, b, ==, !=, "d", fmt)
#define assert_d_ne(a, b, fmt...) assert_cmp(int, a, b, !=, ==, "d", fmt)
#define assert_d_lt(a, b, fmt...) assert_cmp(int, a, b, <, >=, "d", fmt)
#define assert_d_le(a, b, fmt...) assert_cmp(int, a, b, <=, >, "d", fmt)
#define assert_d_ge(a, b, fmt...) assert_cmp(int, a, b, >=, <, "d", fmt)
#define assert_d_gt(a, b, fmt...) assert_cmp(int, a, b, >, <=, "d", fmt)
#define assert_d_eq(a, b, ...) assert_cmp(int, a, b, ==, !=, "d", __VA_ARGS__)
#define assert_d_ne(a, b, ...) assert_cmp(int, a, b, !=, ==, "d", __VA_ARGS__)
#define assert_d_lt(a, b, ...) assert_cmp(int, a, b, <, >=, "d", __VA_ARGS__)
#define assert_d_le(a, b, ...) assert_cmp(int, a, b, <=, >, "d", __VA_ARGS__)
#define assert_d_ge(a, b, ...) assert_cmp(int, a, b, >=, <, "d", __VA_ARGS__)
#define assert_d_gt(a, b, ...) assert_cmp(int, a, b, >, <=, "d", __VA_ARGS__)
#define assert_u_eq(a, b, fmt...) assert_cmp(int, a, b, ==, !=, "u", fmt)
#define assert_u_ne(a, b, fmt...) assert_cmp(int, a, b, !=, ==, "u", fmt)
#define assert_u_lt(a, b, fmt...) assert_cmp(int, a, b, <, >=, "u", fmt)
#define assert_u_le(a, b, fmt...) assert_cmp(int, a, b, <=, >, "u", fmt)
#define assert_u_ge(a, b, fmt...) assert_cmp(int, a, b, >=, <, "u", fmt)
#define assert_u_gt(a, b, fmt...) assert_cmp(int, a, b, >, <=, "u", fmt)
#define assert_u_eq(a, b, ...) assert_cmp(int, a, b, ==, !=, "u", __VA_ARGS__)
#define assert_u_ne(a, b, ...) assert_cmp(int, a, b, !=, ==, "u", __VA_ARGS__)
#define assert_u_lt(a, b, ...) assert_cmp(int, a, b, <, >=, "u", __VA_ARGS__)
#define assert_u_le(a, b, ...) assert_cmp(int, a, b, <=, >, "u", __VA_ARGS__)
#define assert_u_ge(a, b, ...) assert_cmp(int, a, b, >=, <, "u", __VA_ARGS__)
#define assert_u_gt(a, b, ...) assert_cmp(int, a, b, >, <=, "u", __VA_ARGS__)
#define assert_ld_eq(a, b, fmt...) assert_cmp(long, a, b, ==, \
!=, "ld", fmt)
#define assert_ld_ne(a, b, fmt...) assert_cmp(long, a, b, !=, \
==, "ld", fmt)
#define assert_ld_lt(a, b, fmt...) assert_cmp(long, a, b, <, \
>=, "ld", fmt)
#define assert_ld_le(a, b, fmt...) assert_cmp(long, a, b, <=, \
>, "ld", fmt)
#define assert_ld_ge(a, b, fmt...) assert_cmp(long, a, b, >=, \
<, "ld", fmt)
#define assert_ld_gt(a, b, fmt...) assert_cmp(long, a, b, >, \
<=, "ld", fmt)
#define assert_ld_eq(a, b, ...) assert_cmp(long, a, b, ==, \
!=, "ld", __VA_ARGS__)
#define assert_ld_ne(a, b, ...) assert_cmp(long, a, b, !=, \
==, "ld", __VA_ARGS__)
#define assert_ld_lt(a, b, ...) assert_cmp(long, a, b, <, \
>=, "ld", __VA_ARGS__)
#define assert_ld_le(a, b, ...) assert_cmp(long, a, b, <=, \
>, "ld", __VA_ARGS__)
#define assert_ld_ge(a, b, ...) assert_cmp(long, a, b, >=, \
<, "ld", __VA_ARGS__)
#define assert_ld_gt(a, b, ...) assert_cmp(long, a, b, >, \
<=, "ld", __VA_ARGS__)
#define assert_lu_eq(a, b, fmt...) assert_cmp(unsigned long, \
a, b, ==, !=, "lu", fmt)
#define assert_lu_ne(a, b, fmt...) assert_cmp(unsigned long, \
a, b, !=, ==, "lu", fmt)
#define assert_lu_lt(a, b, fmt...) assert_cmp(unsigned long, \
a, b, <, >=, "lu", fmt)
#define assert_lu_le(a, b, fmt...) assert_cmp(unsigned long, \
a, b, <=, >, "lu", fmt)
#define assert_lu_ge(a, b, fmt...) assert_cmp(unsigned long, \
a, b, >=, <, "lu", fmt)
#define assert_lu_gt(a, b, fmt...) assert_cmp(unsigned long, \
a, b, >, <=, "lu", fmt)
#define assert_lu_eq(a, b, ...) assert_cmp(unsigned long, \
a, b, ==, !=, "lu", __VA_ARGS__)
#define assert_lu_ne(a, b, ...) assert_cmp(unsigned long, \
a, b, !=, ==, "lu", __VA_ARGS__)
#define assert_lu_lt(a, b, ...) assert_cmp(unsigned long, \
a, b, <, >=, "lu", __VA_ARGS__)
#define assert_lu_le(a, b, ...) assert_cmp(unsigned long, \
a, b, <=, >, "lu", __VA_ARGS__)
#define assert_lu_ge(a, b, ...) assert_cmp(unsigned long, \
a, b, >=, <, "lu", __VA_ARGS__)
#define assert_lu_gt(a, b, ...) assert_cmp(unsigned long, \
a, b, >, <=, "lu", __VA_ARGS__)
#define assert_qd_eq(a, b, fmt...) assert_cmp(long long, a, b, ==, \
!=, "qd", fmt)
#define assert_qd_ne(a, b, fmt...) assert_cmp(long long, a, b, !=, \
==, "qd", fmt)
#define assert_qd_lt(a, b, fmt...) assert_cmp(long long, a, b, <, \
>=, "qd", fmt)
#define assert_qd_le(a, b, fmt...) assert_cmp(long long, a, b, <=, \
>, "qd", fmt)
#define assert_qd_ge(a, b, fmt...) assert_cmp(long long, a, b, >=, \
<, "qd", fmt)
#define assert_qd_gt(a, b, fmt...) assert_cmp(long long, a, b, >, \
<=, "qd", fmt)
#define assert_qd_eq(a, b, ...) assert_cmp(long long, a, b, ==, \
!=, "qd", __VA_ARGS__)
#define assert_qd_ne(a, b, ...) assert_cmp(long long, a, b, !=, \
==, "qd", __VA_ARGS__)
#define assert_qd_lt(a, b, ...) assert_cmp(long long, a, b, <, \
>=, "qd", __VA_ARGS__)
#define assert_qd_le(a, b, ...) assert_cmp(long long, a, b, <=, \
>, "qd", __VA_ARGS__)
#define assert_qd_ge(a, b, ...) assert_cmp(long long, a, b, >=, \
<, "qd", __VA_ARGS__)
#define assert_qd_gt(a, b, ...) assert_cmp(long long, a, b, >, \
<=, "qd", __VA_ARGS__)
#define assert_qu_eq(a, b, fmt...) assert_cmp(unsigned long long, \
a, b, ==, !=, "qu", fmt)
#define assert_qu_ne(a, b, fmt...) assert_cmp(unsigned long long, \
a, b, !=, ==, "qu", fmt)
#define assert_qu_lt(a, b, fmt...) assert_cmp(unsigned long long, \
a, b, <, >=, "qu", fmt)
#define assert_qu_le(a, b, fmt...) assert_cmp(unsigned long long, \
a, b, <=, >, "qu", fmt)
#define assert_qu_ge(a, b, fmt...) assert_cmp(unsigned long long, \
a, b, >=, <, "qu", fmt)
#define assert_qu_gt(a, b, fmt...) assert_cmp(unsigned long long, \
a, b, >, <=, "qu", fmt)
#define assert_qu_eq(a, b, ...) assert_cmp(unsigned long long, \
a, b, ==, !=, "qu", __VA_ARGS__)
#define assert_qu_ne(a, b, ...) assert_cmp(unsigned long long, \
a, b, !=, ==, "qu", __VA_ARGS__)
#define assert_qu_lt(a, b, ...) assert_cmp(unsigned long long, \
a, b, <, >=, "qu", __VA_ARGS__)
#define assert_qu_le(a, b, ...) assert_cmp(unsigned long long, \
a, b, <=, >, "qu", __VA_ARGS__)
#define assert_qu_ge(a, b, ...) assert_cmp(unsigned long long, \
a, b, >=, <, "qu", __VA_ARGS__)
#define assert_qu_gt(a, b, ...) assert_cmp(unsigned long long, \
a, b, >, <=, "qu", __VA_ARGS__)
#define assert_jd_eq(a, b, fmt...) assert_cmp(intmax_t, a, b, ==, \
!=, "jd", fmt)
#define assert_jd_ne(a, b, fmt...) assert_cmp(intmax_t, a, b, !=, \
==, "jd", fmt)
#define assert_jd_lt(a, b, fmt...) assert_cmp(intmax_t, a, b, <, \
>=, "jd", fmt)
#define assert_jd_le(a, b, fmt...) assert_cmp(intmax_t, a, b, <=, \
>, "jd", fmt)
#define assert_jd_ge(a, b, fmt...) assert_cmp(intmax_t, a, b, >=, \
<, "jd", fmt)
#define assert_jd_gt(a, b, fmt...) assert_cmp(intmax_t, a, b, >, \
<=, "jd", fmt)
#define assert_jd_eq(a, b, ...) assert_cmp(intmax_t, a, b, ==, \
!=, "jd", __VA_ARGS__)
#define assert_jd_ne(a, b, ...) assert_cmp(intmax_t, a, b, !=, \
==, "jd", __VA_ARGS__)
#define assert_jd_lt(a, b, ...) assert_cmp(intmax_t, a, b, <, \
>=, "jd", __VA_ARGS__)
#define assert_jd_le(a, b, ...) assert_cmp(intmax_t, a, b, <=, \
>, "jd", __VA_ARGS__)
#define assert_jd_ge(a, b, ...) assert_cmp(intmax_t, a, b, >=, \
<, "jd", __VA_ARGS__)
#define assert_jd_gt(a, b, ...) assert_cmp(intmax_t, a, b, >, \
<=, "jd", __VA_ARGS__)
#define assert_ju_eq(a, b, fmt...) assert_cmp(uintmax_t, a, b, ==, \
!=, "ju", fmt)
#define assert_ju_ne(a, b, fmt...) assert_cmp(uintmax_t, a, b, !=, \
==, "ju", fmt)
#define assert_ju_lt(a, b, fmt...) assert_cmp(uintmax_t, a, b, <, \
>=, "ju", fmt)
#define assert_ju_le(a, b, fmt...) assert_cmp(uintmax_t, a, b, <=, \
>, "ju", fmt)
#define assert_ju_ge(a, b, fmt...) assert_cmp(uintmax_t, a, b, >=, \
<, "ju", fmt)
#define assert_ju_gt(a, b, fmt...) assert_cmp(uintmax_t, a, b, >, \
<=, "ju", fmt)
#define assert_ju_eq(a, b, ...) assert_cmp(uintmax_t, a, b, ==, \
!=, "ju", __VA_ARGS__)
#define assert_ju_ne(a, b, ...) assert_cmp(uintmax_t, a, b, !=, \
==, "ju", __VA_ARGS__)
#define assert_ju_lt(a, b, ...) assert_cmp(uintmax_t, a, b, <, \
>=, "ju", __VA_ARGS__)
#define assert_ju_le(a, b, ...) assert_cmp(uintmax_t, a, b, <=, \
>, "ju", __VA_ARGS__)
#define assert_ju_ge(a, b, ...) assert_cmp(uintmax_t, a, b, >=, \
<, "ju", __VA_ARGS__)
#define assert_ju_gt(a, b, ...) assert_cmp(uintmax_t, a, b, >, \
<=, "ju", __VA_ARGS__)
#define assert_zd_eq(a, b, fmt...) assert_cmp(ssize_t, a, b, ==, \
!=, "zd", fmt)
#define assert_zd_ne(a, b, fmt...) assert_cmp(ssize_t, a, b, !=, \
==, "zd", fmt)
#define assert_zd_lt(a, b, fmt...) assert_cmp(ssize_t, a, b, <, \
>=, "zd", fmt)
#define assert_zd_le(a, b, fmt...) assert_cmp(ssize_t, a, b, <=, \
>, "zd", fmt)
#define assert_zd_ge(a, b, fmt...) assert_cmp(ssize_t, a, b, >=, \
<, "zd", fmt)
#define assert_zd_gt(a, b, fmt...) assert_cmp(ssize_t, a, b, >, \
<=, "zd", fmt)
#define assert_zd_eq(a, b, ...) assert_cmp(ssize_t, a, b, ==, \
!=, "zd", __VA_ARGS__)
#define assert_zd_ne(a, b, ...) assert_cmp(ssize_t, a, b, !=, \
==, "zd", __VA_ARGS__)
#define assert_zd_lt(a, b, ...) assert_cmp(ssize_t, a, b, <, \
>=, "zd", __VA_ARGS__)
#define assert_zd_le(a, b, ...) assert_cmp(ssize_t, a, b, <=, \
>, "zd", __VA_ARGS__)
#define assert_zd_ge(a, b, ...) assert_cmp(ssize_t, a, b, >=, \
<, "zd", __VA_ARGS__)
#define assert_zd_gt(a, b, ...) assert_cmp(ssize_t, a, b, >, \
<=, "zd", __VA_ARGS__)
#define assert_zu_eq(a, b, fmt...) assert_cmp(size_t, a, b, ==, \
!=, "zu", fmt)
#define assert_zu_ne(a, b, fmt...) assert_cmp(size_t, a, b, !=, \
==, "zu", fmt)
#define assert_zu_lt(a, b, fmt...) assert_cmp(size_t, a, b, <, \
>=, "zu", fmt)
#define assert_zu_le(a, b, fmt...) assert_cmp(size_t, a, b, <=, \
>, "zu", fmt)
#define assert_zu_ge(a, b, fmt...) assert_cmp(size_t, a, b, >=, \
<, "zu", fmt)
#define assert_zu_gt(a, b, fmt...) assert_cmp(size_t, a, b, >, \
<=, "zu", fmt)
#define assert_zu_eq(a, b, ...) assert_cmp(size_t, a, b, ==, \
!=, "zu", __VA_ARGS__)
#define assert_zu_ne(a, b, ...) assert_cmp(size_t, a, b, !=, \
==, "zu", __VA_ARGS__)
#define assert_zu_lt(a, b, ...) assert_cmp(size_t, a, b, <, \
>=, "zu", __VA_ARGS__)
#define assert_zu_le(a, b, ...) assert_cmp(size_t, a, b, <=, \
>, "zu", __VA_ARGS__)
#define assert_zu_ge(a, b, ...) assert_cmp(size_t, a, b, >=, \
<, "zu", __VA_ARGS__)
#define assert_zu_gt(a, b, ...) assert_cmp(size_t, a, b, >, \
<=, "zu", __VA_ARGS__)
#define assert_d32_eq(a, b, fmt...) assert_cmp(int32_t, a, b, ==, \
!=, PRId32, fmt)
#define assert_d32_ne(a, b, fmt...) assert_cmp(int32_t, a, b, !=, \
==, PRId32, fmt)
#define assert_d32_lt(a, b, fmt...) assert_cmp(int32_t, a, b, <, \
>=, PRId32, fmt)
#define assert_d32_le(a, b, fmt...) assert_cmp(int32_t, a, b, <=, \
>, PRId32, fmt)
#define assert_d32_ge(a, b, fmt...) assert_cmp(int32_t, a, b, >=, \
<, PRId32, fmt)
#define assert_d32_gt(a, b, fmt...) assert_cmp(int32_t, a, b, >, \
<=, PRId32, fmt)
#define assert_d32_eq(a, b, ...) assert_cmp(int32_t, a, b, ==, \
!=, PRId32, __VA_ARGS__)
#define assert_d32_ne(a, b, ...) assert_cmp(int32_t, a, b, !=, \
==, PRId32, __VA_ARGS__)
#define assert_d32_lt(a, b, ...) assert_cmp(int32_t, a, b, <, \
>=, PRId32, __VA_ARGS__)
#define assert_d32_le(a, b, ...) assert_cmp(int32_t, a, b, <=, \
>, PRId32, __VA_ARGS__)
#define assert_d32_ge(a, b, ...) assert_cmp(int32_t, a, b, >=, \
<, PRId32, __VA_ARGS__)
#define assert_d32_gt(a, b, ...) assert_cmp(int32_t, a, b, >, \
<=, PRId32, __VA_ARGS__)
#define assert_u32_eq(a, b, fmt...) assert_cmp(uint32_t, a, b, ==, \
!=, PRIu32, fmt)
#define assert_u32_ne(a, b, fmt...) assert_cmp(uint32_t, a, b, !=, \
==, PRIu32, fmt)
#define assert_u32_lt(a, b, fmt...) assert_cmp(uint32_t, a, b, <, \
>=, PRIu32, fmt)
#define assert_u32_le(a, b, fmt...) assert_cmp(uint32_t, a, b, <=, \
>, PRIu32, fmt)
#define assert_u32_ge(a, b, fmt...) assert_cmp(uint32_t, a, b, >=, \
<, PRIu32, fmt)
#define assert_u32_gt(a, b, fmt...) assert_cmp(uint32_t, a, b, >, \
<=, PRIu32, fmt)
#define assert_u32_eq(a, b, ...) assert_cmp(uint32_t, a, b, ==, \
!=, PRIu32, __VA_ARGS__)
#define assert_u32_ne(a, b, ...) assert_cmp(uint32_t, a, b, !=, \
==, PRIu32, __VA_ARGS__)
#define assert_u32_lt(a, b, ...) assert_cmp(uint32_t, a, b, <, \
>=, PRIu32, __VA_ARGS__)
#define assert_u32_le(a, b, ...) assert_cmp(uint32_t, a, b, <=, \
>, PRIu32, __VA_ARGS__)
#define assert_u32_ge(a, b, ...) assert_cmp(uint32_t, a, b, >=, \
<, PRIu32, __VA_ARGS__)
#define assert_u32_gt(a, b, ...) assert_cmp(uint32_t, a, b, >, \
<=, PRIu32, __VA_ARGS__)
#define assert_d64_eq(a, b, fmt...) assert_cmp(int64_t, a, b, ==, \
!=, PRId64, fmt)
#define assert_d64_ne(a, b, fmt...) assert_cmp(int64_t, a, b, !=, \
==, PRId64, fmt)
#define assert_d64_lt(a, b, fmt...) assert_cmp(int64_t, a, b, <, \
>=, PRId64, fmt)
#define assert_d64_le(a, b, fmt...) assert_cmp(int64_t, a, b, <=, \
>, PRId64, fmt)
#define assert_d64_ge(a, b, fmt...) assert_cmp(int64_t, a, b, >=, \
<, PRId64, fmt)
#define assert_d64_gt(a, b, fmt...) assert_cmp(int64_t, a, b, >, \
<=, PRId64, fmt)
#define assert_d64_eq(a, b, ...) assert_cmp(int64_t, a, b, ==, \
!=, PRId64, __VA_ARGS__)
#define assert_d64_ne(a, b, ...) assert_cmp(int64_t, a, b, !=, \
==, PRId64, __VA_ARGS__)
#define assert_d64_lt(a, b, ...) assert_cmp(int64_t, a, b, <, \
>=, PRId64, __VA_ARGS__)
#define assert_d64_le(a, b, ...) assert_cmp(int64_t, a, b, <=, \
>, PRId64, __VA_ARGS__)
#define assert_d64_ge(a, b, ...) assert_cmp(int64_t, a, b, >=, \
<, PRId64, __VA_ARGS__)
#define assert_d64_gt(a, b, ...) assert_cmp(int64_t, a, b, >, \
<=, PRId64, __VA_ARGS__)
#define assert_u64_eq(a, b, fmt...) assert_cmp(uint64_t, a, b, ==, \
!=, PRIu64, fmt)
#define assert_u64_ne(a, b, fmt...) assert_cmp(uint64_t, a, b, !=, \
==, PRIu64, fmt)
#define assert_u64_lt(a, b, fmt...) assert_cmp(uint64_t, a, b, <, \
>=, PRIu64, fmt)
#define assert_u64_le(a, b, fmt...) assert_cmp(uint64_t, a, b, <=, \
>, PRIu64, fmt)
#define assert_u64_ge(a, b, fmt...) assert_cmp(uint64_t, a, b, >=, \
<, PRIu64, fmt)
#define assert_u64_gt(a, b, fmt...) assert_cmp(uint64_t, a, b, >, \
<=, PRIu64, fmt)
#define assert_u64_eq(a, b, ...) assert_cmp(uint64_t, a, b, ==, \
!=, PRIu64, __VA_ARGS__)
#define assert_u64_ne(a, b, ...) assert_cmp(uint64_t, a, b, !=, \
==, PRIu64, __VA_ARGS__)
#define assert_u64_lt(a, b, ...) assert_cmp(uint64_t, a, b, <, \
>=, PRIu64, __VA_ARGS__)
#define assert_u64_le(a, b, ...) assert_cmp(uint64_t, a, b, <=, \
>, PRIu64, __VA_ARGS__)
#define assert_u64_ge(a, b, ...) assert_cmp(uint64_t, a, b, >=, \
<, PRIu64, __VA_ARGS__)
#define assert_u64_gt(a, b, ...) assert_cmp(uint64_t, a, b, >, \
<=, PRIu64, __VA_ARGS__)
#define assert_b_eq(a, b, fmt...) do { \
#define assert_b_eq(a, b, ...) do { \
bool a_ = (a); \
bool b_ = (b); \
if (!(a_ == b_)) { \
@@ -222,11 +222,11 @@
__func__, __FILE__, __LINE__, \
#a, #b, a_ ? "true" : "false", \
b_ ? "true" : "false"); \
malloc_snprintf(message, sizeof(message), fmt); \
malloc_snprintf(message, sizeof(message), __VA_ARGS__); \
p_test_fail(prefix, message); \
} \
} while (0)
#define assert_b_ne(a, b, fmt...) do { \
#define assert_b_ne(a, b, ...) do { \
bool a_ = (a); \
bool b_ = (b); \
if (!(a_ != b_)) { \
@@ -238,14 +238,14 @@
__func__, __FILE__, __LINE__, \
#a, #b, a_ ? "true" : "false", \
b_ ? "true" : "false"); \
malloc_snprintf(message, sizeof(message), fmt); \
malloc_snprintf(message, sizeof(message), __VA_ARGS__); \
p_test_fail(prefix, message); \
} \
} while (0)
#define assert_true(a, fmt...) assert_b_eq(a, true, fmt)
#define assert_false(a, fmt...) assert_b_eq(a, false, fmt)
#define assert_true(a, ...) assert_b_eq(a, true, __VA_ARGS__)
#define assert_false(a, ...) assert_b_eq(a, false, __VA_ARGS__)
#define assert_str_eq(a, b, fmt...) do { \
#define assert_str_eq(a, b, ...) do { \
if (strcmp((a), (b))) { \
char prefix[ASSERT_BUFSIZE]; \
char message[ASSERT_BUFSIZE]; \
@@ -254,11 +254,11 @@
"(%s) same as (%s) --> " \
"\"%s\" differs from \"%s\": ", \
__func__, __FILE__, __LINE__, #a, #b, a, b); \
malloc_snprintf(message, sizeof(message), fmt); \
malloc_snprintf(message, sizeof(message), __VA_ARGS__); \
p_test_fail(prefix, message); \
} \
} while (0)
#define assert_str_ne(a, b, fmt...) do { \
#define assert_str_ne(a, b, ...) do { \
if (!strcmp((a), (b))) { \
char prefix[ASSERT_BUFSIZE]; \
char message[ASSERT_BUFSIZE]; \
@@ -267,18 +267,18 @@
"(%s) differs from (%s) --> " \
"\"%s\" same as \"%s\": ", \
__func__, __FILE__, __LINE__, #a, #b, a, b); \
malloc_snprintf(message, sizeof(message), fmt); \
malloc_snprintf(message, sizeof(message), __VA_ARGS__); \
p_test_fail(prefix, message); \
} \
} while (0)
#define assert_not_reached(fmt...) do { \
#define assert_not_reached(...) do { \
char prefix[ASSERT_BUFSIZE]; \
char message[ASSERT_BUFSIZE]; \
malloc_snprintf(prefix, sizeof(prefix), \
"%s:%s:%d: Unreachable code reached: ", \
__func__, __FILE__, __LINE__); \
malloc_snprintf(message, sizeof(message), fmt); \
malloc_snprintf(message, sizeof(message), __VA_ARGS__); \
p_test_fail(prefix, message); \
} while (0)
@@ -308,8 +308,8 @@ label_test_end: \
p_test_fini(); \
}
#define test(tests...) \
p_test(tests, NULL)
#define test(...) \
p_test(__VA_ARGS__, NULL)
#define test_skip_if(e) do { \
if (e) { \
@@ -323,7 +323,7 @@ void test_skip(const char *format, ...) JEMALLOC_ATTR(format(printf, 1, 2));
void test_fail(const char *format, ...) JEMALLOC_ATTR(format(printf, 1, 2));
/* For private use by macros. */
test_status_t p_test(test_t* t, ...);
test_status_t p_test(test_t *t, ...);
void p_test_init(const char *name);
void p_test_fini(void);
void p_test_fail(const char *prefix, const char *message);
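
The hunk above is a mechanical conversion from GCC's named-variadic-macro extension (fmt...) to the C99 __VA_ARGS__ form, which MSVC and other non-GNU compilers accept. A minimal illustration of the two spellings (the log_* names are hypothetical, not from the tree):

#include <stdio.h>

/* GNU extension: named variadic parameter; gcc-only. */
#define log_gnu(fmt...)	fprintf(stderr, fmt)

/* C99 standard form, as adopted by the test headers above. */
#define log_c99(...)	fprintf(stderr, __VA_ARGS__)

Both expand identically when given a format string plus arguments; only the portability differs.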

View File

@@ -1,4 +1,4 @@
/* Abstraction layer for threading in tests */
/* Abstraction layer for threading in tests. */
#ifdef _WIN32
typedef HANDLE thd_t;
#else

View File

@@ -0,0 +1,13 @@
/* Simple timer, for use in benchmark reporting. */
#include <sys/time.h>
typedef struct {
struct timeval tv0;
struct timeval tv1;
} timedelta_t;
void timer_start(timedelta_t *timer);
void timer_stop(timedelta_t *timer);
uint64_t timer_usec(const timedelta_t *timer);
void timer_ratio(timedelta_t *a, timedelta_t *b, char *buf, size_t buflen);
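
The new timer interface is small enough to be self-describing, but a usage sketch may help; this is hypothetical benchmark scaffolding, not code from the diff:

timedelta_t timer;
uint64_t usec;

timer_start(&timer);
/* ... workload being measured ... */
timer_stop(&timer);
usec = timer_usec(&timer);	/* elapsed wall-clock time in microseconds */

timer_ratio() additionally formats the ratio of two timers into buf, presumably for before/after comparisons in benchmark output.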

View File

@@ -2,6 +2,14 @@
#define NTHREADS 10
static bool have_dss =
#ifdef JEMALLOC_DSS
true
#else
false
#endif
;
void *
thd_start(void *arg)
{
@@ -18,13 +26,16 @@ thd_start(void *arg)
size_t mib[3];
size_t miblen = sizeof(mib) / sizeof(size_t);
const char *dss_precs[] = {"disabled", "primary", "secondary"};
const char *dss = dss_precs[thread_ind %
(sizeof(dss_precs)/sizeof(char*))];
unsigned prec_ind = thread_ind %
(sizeof(dss_precs)/sizeof(char*));
const char *dss = dss_precs[prec_ind];
int expected_err = (have_dss || prec_ind == 0) ? 0 : EFAULT;
assert_d_eq(mallctlnametomib("arena.0.dss", mib, &miblen), 0,
"Error in mallctlnametomib()");
mib[1] = arena_ind;
assert_d_eq(mallctlbymib(mib, miblen, NULL, NULL, (void *)&dss,
sizeof(const char *)), 0, "Error in mallctlbymib()");
sizeof(const char *)), expected_err,
"Error in mallctlbymib()");
}
p = mallocx(1, MALLOCX_ARENA(arena_ind));
@@ -34,7 +45,7 @@ thd_start(void *arg)
return (NULL);
}
TEST_BEGIN(test_ALLOCM_ARENA)
TEST_BEGIN(test_MALLOCX_ARENA)
{
thd_t thds[NTHREADS];
unsigned i;
@@ -54,5 +65,5 @@ main(void)
{
return (test(
test_ALLOCM_ARENA));
test_MALLOCX_ARENA));
}
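
This test pins allocations to specific arenas. The core pattern, sketched outside the test (and assuming the arenas.extend mallctl that jemalloc 3.x provides for creating a fresh arena):

unsigned arena_ind;
size_t sz = sizeof(arena_ind);
void *p;

/* Create a new arena, then allocate from it explicitly. */
if (mallctl("arenas.extend", &arena_ind, &sz, NULL, 0) == 0) {
	p = mallocx(42, MALLOCX_ARENA(arena_ind));
	dallocx(p, 0);
}

The renamed test additionally checks that selecting a dss precedence fails with EFAULT when the allocator was built without DSS support, which is what the expected_err logic above encodes.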

View File

@@ -1,107 +0,0 @@
#include "test/jemalloc_test.h"
#define CHUNK 0x400000
#define MAXALIGN (((size_t)1) << 25)
#define NITER 4
TEST_BEGIN(test_basic)
{
size_t nsz, rsz, sz;
void *p;
sz = 42;
nsz = 0;
assert_d_eq(nallocm(&nsz, sz, 0), ALLOCM_SUCCESS,
"Unexpected nallocm() error");
rsz = 0;
assert_d_eq(allocm(&p, &rsz, sz, 0), ALLOCM_SUCCESS,
"Unexpected allocm() error");
assert_zu_ge(rsz, sz, "Real size smaller than expected");
assert_zu_eq(nsz, rsz, "nallocm()/allocm() rsize mismatch");
assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
"Unexpected dallocm() error");
assert_d_eq(allocm(&p, NULL, sz, 0), ALLOCM_SUCCESS,
"Unexpected allocm() error");
assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
"Unexpected dallocm() error");
nsz = 0;
assert_d_eq(nallocm(&nsz, sz, ALLOCM_ZERO), ALLOCM_SUCCESS,
"Unexpected nallocm() error");
rsz = 0;
assert_d_eq(allocm(&p, &rsz, sz, ALLOCM_ZERO), ALLOCM_SUCCESS,
"Unexpected allocm() error");
assert_zu_eq(nsz, rsz, "nallocm()/allocm() rsize mismatch");
assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
"Unexpected dallocm() error");
}
TEST_END
TEST_BEGIN(test_alignment_and_size)
{
int r;
size_t nsz, rsz, sz, alignment, total;
unsigned i;
void *ps[NITER];
for (i = 0; i < NITER; i++)
ps[i] = NULL;
for (alignment = 8;
alignment <= MAXALIGN;
alignment <<= 1) {
total = 0;
for (sz = 1;
sz < 3 * alignment && sz < (1U << 31);
sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
for (i = 0; i < NITER; i++) {
nsz = 0;
r = nallocm(&nsz, sz, ALLOCM_ALIGN(alignment) |
ALLOCM_ZERO);
assert_d_eq(r, ALLOCM_SUCCESS,
"nallocm() error for alignment=%zu, "
"size=%zu (%#zx): %d",
alignment, sz, sz, r);
rsz = 0;
r = allocm(&ps[i], &rsz, sz,
ALLOCM_ALIGN(alignment) | ALLOCM_ZERO);
assert_d_eq(r, ALLOCM_SUCCESS,
"allocm() error for alignment=%zu, "
"size=%zu (%#zx): %d",
alignment, sz, sz, r);
assert_zu_ge(rsz, sz,
"Real size smaller than expected for "
"alignment=%zu, size=%zu", alignment, sz);
assert_zu_eq(nsz, rsz,
"nallocm()/allocm() rsize mismatch for "
"alignment=%zu, size=%zu", alignment, sz);
assert_ptr_null(
(void *)((uintptr_t)ps[i] & (alignment-1)),
"%p inadequately aligned for"
" alignment=%zu, size=%zu", ps[i],
alignment, sz);
sallocm(ps[i], &rsz, 0);
total += rsz;
if (total >= (MAXALIGN << 1))
break;
}
for (i = 0; i < NITER; i++) {
if (ps[i] != NULL) {
dallocm(ps[i], 0);
ps[i] = NULL;
}
}
}
}
}
TEST_END
int
main(void)
{
return (test(
test_basic,
test_alignment_and_size));
}
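
This deletion removes coverage of the experimental *allocm() API, which upstream replaced with the *allocx() family. The rough correspondence, as a sketch rather than an exhaustive mapping:

size_t nsz = nallocx(sz, 0);	/* was: nallocm(&nsz, sz, 0) */
void *p = mallocx(sz, 0);	/* was: allocm(&p, &rsz, sz, 0) */
size_t rsz = sallocx(p, 0);	/* was: sallocm(p, &rsz, 0) */
dallocx(p, 0);			/* was: dallocm(p, 0) */

Note the error-handling change: the *allocx() functions signal failure through their return values (NULL or 0) instead of ALLOCM_* status codes.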

View File

@@ -0,0 +1,59 @@
#include "test/jemalloc_test.h"
chunk_alloc_t *old_alloc;
chunk_dalloc_t *old_dalloc;
bool
chunk_dalloc(void *chunk, size_t size, unsigned arena_ind)
{
return (old_dalloc(chunk, size, arena_ind));
}
void *
chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero,
unsigned arena_ind)
{
return (old_alloc(new_addr, size, alignment, zero, arena_ind));
}
TEST_BEGIN(test_chunk)
{
void *p;
chunk_alloc_t *new_alloc;
chunk_dalloc_t *new_dalloc;
size_t old_size, new_size;
new_alloc = chunk_alloc;
new_dalloc = chunk_dalloc;
old_size = sizeof(chunk_alloc_t *);
new_size = sizeof(chunk_alloc_t *);
assert_d_eq(mallctl("arena.0.chunk.alloc", &old_alloc,
&old_size, &new_alloc, new_size), 0,
"Unexpected alloc error");
assert_ptr_ne(old_alloc, new_alloc,
"Unexpected alloc error");
assert_d_eq(mallctl("arena.0.chunk.dalloc", &old_dalloc, &old_size,
&new_dalloc, new_size), 0, "Unexpected dalloc error");
assert_ptr_ne(old_dalloc, new_dalloc, "Unexpected dalloc error");
p = mallocx(42, 0);
assert_ptr_ne(p, NULL, "Unexpected alloc error");
free(p);
assert_d_eq(mallctl("arena.0.chunk.alloc", NULL,
NULL, &old_alloc, old_size), 0,
"Unexpected alloc error");
assert_d_eq(mallctl("arena.0.chunk.dalloc", NULL, NULL, &old_dalloc,
old_size), 0, "Unexpected dalloc error");
}
TEST_END
int
main(void)
{
return (test(test_chunk));
}
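
The new chunk test saves the arena's chunk hooks, installs pass-through replacements via mallctl, and verifies allocation still succeeds. The same mechanism supports instrumentation; a hypothetical counting wrapper (the counting_chunk_alloc and nchunks names are illustrative):

static chunk_alloc_t *orig_alloc;
static size_t nchunks;

static void *
counting_chunk_alloc(void *new_addr, size_t size, size_t alignment,
    bool *zero, unsigned arena_ind)
{

	nchunks++;	/* count every chunk handed out for this arena */
	return (orig_alloc(new_addr, size, alignment, zero, arena_ind));
}

Installing it follows the exact mallctl dance in the test: read the old function pointer into orig_alloc while writing the new one in the same call.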

View File

@@ -1,45 +0,0 @@
#include "test/jemalloc_test.h"
TEST_BEGIN(test_mremap)
{
int err;
size_t sz, lg_chunk, chunksize, i;
char *p, *q;
sz = sizeof(lg_chunk);
err = mallctl("opt.lg_chunk", &lg_chunk, &sz, NULL, 0);
assert_d_eq(err, 0, "Error in mallctl(): %s", strerror(err));
chunksize = ((size_t)1U) << lg_chunk;
p = (char *)malloc(chunksize);
assert_ptr_not_null(p, "malloc(%zu) --> %p", chunksize, p);
memset(p, 'a', chunksize);
q = (char *)realloc(p, chunksize * 2);
assert_ptr_not_null(q, "realloc(%p, %zu) --> %p", p, chunksize * 2,
q);
for (i = 0; i < chunksize; i++) {
assert_c_eq(q[i], 'a',
"realloc() should preserve existing bytes across copies");
}
p = q;
q = (char *)realloc(p, chunksize);
assert_ptr_not_null(q, "realloc(%p, %zu) --> %p", p, chunksize, q);
for (i = 0; i < chunksize; i++) {
assert_c_eq(q[i], 'a',
"realloc() should preserve existing bytes across copies");
}
free(q);
}
TEST_END
int
main(void)
{
return (test(
test_mremap));
}
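
The mremap test goes away with no replacement; upstream jemalloc removed the optional mremap-based huge-reallocation path (the --enable-mremap configure option) in the development series this import tracks, so there is no longer a code path for the test to exercise. That rationale is inferred from the deletion, not stated in the diff itself.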

View File

@@ -1,111 +0,0 @@
#include "test/jemalloc_test.h"
TEST_BEGIN(test_same_size)
{
void *p, *q;
size_t sz, tsz;
assert_d_eq(allocm(&p, &sz, 42, 0), ALLOCM_SUCCESS,
"Unexpected allocm() error");
q = p;
assert_d_eq(rallocm(&q, &tsz, sz, 0, ALLOCM_NO_MOVE), ALLOCM_SUCCESS,
"Unexpected rallocm() error");
assert_ptr_eq(q, p, "Unexpected object move");
assert_zu_eq(tsz, sz, "Unexpected size change: %zu --> %zu", sz, tsz);
assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
"Unexpected dallocm() error");
}
TEST_END
TEST_BEGIN(test_extra_no_move)
{
void *p, *q;
size_t sz, tsz;
assert_d_eq(allocm(&p, &sz, 42, 0), ALLOCM_SUCCESS,
"Unexpected allocm() error");
q = p;
assert_d_eq(rallocm(&q, &tsz, sz, sz-42, ALLOCM_NO_MOVE),
ALLOCM_SUCCESS, "Unexpected rallocm() error");
assert_ptr_eq(q, p, "Unexpected object move");
assert_zu_eq(tsz, sz, "Unexpected size change: %zu --> %zu", sz, tsz);
assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
"Unexpected dallocm() error");
}
TEST_END
TEST_BEGIN(test_no_move_fail)
{
void *p, *q;
size_t sz, tsz;
assert_d_eq(allocm(&p, &sz, 42, 0), ALLOCM_SUCCESS,
"Unexpected allocm() error");
q = p;
assert_d_eq(rallocm(&q, &tsz, sz + 5, 0, ALLOCM_NO_MOVE),
ALLOCM_ERR_NOT_MOVED, "Unexpected rallocm() result");
assert_ptr_eq(q, p, "Unexpected object move");
assert_zu_eq(tsz, sz, "Unexpected size change: %zu --> %zu", sz, tsz);
assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
"Unexpected dallocm() error");
}
TEST_END
TEST_BEGIN(test_grow_and_shrink)
{
void *p, *q;
size_t tsz;
#define NCYCLES 3
unsigned i, j;
#define NSZS 2500
size_t szs[NSZS];
#define MAXSZ ZU(12 * 1024 * 1024)
assert_d_eq(allocm(&p, &szs[0], 1, 0), ALLOCM_SUCCESS,
"Unexpected allocm() error");
for (i = 0; i < NCYCLES; i++) {
for (j = 1; j < NSZS && szs[j-1] < MAXSZ; j++) {
q = p;
assert_d_eq(rallocm(&q, &szs[j], szs[j-1]+1, 0, 0),
ALLOCM_SUCCESS,
"Unexpected rallocm() error for size=%zu-->%zu",
szs[j-1], szs[j-1]+1);
assert_zu_ne(szs[j], szs[j-1]+1,
"Expected size to at least: %zu", szs[j-1]+1);
p = q;
}
for (j--; j > 0; j--) {
q = p;
assert_d_eq(rallocm(&q, &tsz, szs[j-1], 0, 0),
ALLOCM_SUCCESS,
"Unexpected rallocm() error for size=%zu-->%zu",
szs[j], szs[j-1]);
assert_zu_eq(tsz, szs[j-1],
"Expected size=%zu, got size=%zu", szs[j-1], tsz);
p = q;
}
}
assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS,
"Unexpected dallocm() error");
}
TEST_END
int
main(void)
{
return (test(
test_same_size,
test_extra_no_move,
test_no_move_fail,
test_grow_and_shrink));
}
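
The in-place-resize behavior that rallocm() exposed through ALLOCM_NO_MOVE survives as xallocx(), which never moves the allocation and returns the size it actually achieved. A sketch of the equivalent of test_no_move_fail above:

void *p = mallocx(42, 0);
size_t sz = sallocx(p, 0);

/* xallocx() resizes strictly in place; a result smaller than the
 * request means the allocation could not be grown. */
if (xallocx(p, sz + 5, 0, 0) < sz + 5) {
	/* in-place growth failed, as the old test expected */
}
dallocx(p, 0);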

View File

@@ -95,7 +95,8 @@ TEST_BEGIN(test_zero)
"Expected zeroed memory");
}
if (psz != qsz) {
memset(q+psz, FILL_BYTE, qsz-psz);
memset((void *)((uintptr_t)q+psz), FILL_BYTE,
qsz-psz);
psz = qsz;
}
p = q;
@@ -159,8 +160,9 @@ TEST_BEGIN(test_lg_align_and_zero)
} else {
assert_false(validate_fill(q, 0, 0, MAX_VALIDATE),
"Expected zeroed memory");
assert_false(validate_fill(q+sz-MAX_VALIDATE, 0, 0,
MAX_VALIDATE), "Expected zeroed memory");
assert_false(validate_fill(
(void *)((uintptr_t)q+sz-MAX_VALIDATE),
0, 0, MAX_VALIDATE), "Expected zeroed memory");
}
p = q;
}
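
Both casts above fix a portability bug rather than style: q is a void * in these tests, and pointer arithmetic on void * is a GNU extension, not standard C. The pattern, isolated into a standalone helper:

#include <stdint.h>
#include <stddef.h>

static void *
offset_ptr(void *q, size_t psz)
{

	/* return (q + psz);  -- GNU extension: arithmetic on void * */
	return ((void *)((uintptr_t)q + psz));	/* portable form used above */
}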

View File

@@ -0,0 +1,57 @@
#include "test/jemalloc_test.h"
#define MAXALIGN (((size_t)1) << 25)
#define NITER 4
TEST_BEGIN(test_basic)
{
void *ptr = mallocx(64, 0);
sdallocx(ptr, 64, 0);
}
TEST_END
TEST_BEGIN(test_alignment_and_size)
{
size_t nsz, sz, alignment, total;
unsigned i;
void *ps[NITER];
for (i = 0; i < NITER; i++)
ps[i] = NULL;
for (alignment = 8;
alignment <= MAXALIGN;
alignment <<= 1) {
total = 0;
for (sz = 1;
sz < 3 * alignment && sz < (1U << 31);
sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) {
for (i = 0; i < NITER; i++) {
nsz = nallocx(sz, MALLOCX_ALIGN(alignment) |
MALLOCX_ZERO);
ps[i] = mallocx(sz, MALLOCX_ALIGN(alignment) |
MALLOCX_ZERO);
total += nsz;
if (total >= (MAXALIGN << 1))
break;
}
for (i = 0; i < NITER; i++) {
if (ps[i] != NULL) {
sdallocx(ps[i], sz,
MALLOCX_ALIGN(alignment));
ps[i] = NULL;
}
}
}
}
}
TEST_END
int
main(void)
{
return (test(
test_basic,
test_alignment_and_size));
}
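
sdallocx() is the sized-deallocation entry point introduced by this update: the caller supplies the size, so the allocator can skip the size lookup that free() and dallocx() must perform. The contract, restated minimally:

void *p = mallocx(64, 0);
/* The size (and flags) passed to sdallocx() must map to the same
 * size class as the original allocation request. */
sdallocx(p, 64, 0);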

View File

@@ -463,11 +463,11 @@ uint32_t gen_rand32_range(sfmt_t *ctx, uint32_t limit) {
above = 0xffffffffU - (0xffffffffU % limit);
while (1) {
ret = gen_rand32(ctx);
if (ret < above) {
ret %= limit;
break;
}
}
return ret;
}
@@ -511,13 +511,13 @@ uint64_t gen_rand64(sfmt_t *ctx) {
uint64_t gen_rand64_range(sfmt_t *ctx, uint64_t limit) {
uint64_t ret, above;
above = 0xffffffffffffffffLLU - (0xffffffffffffffffLLU % limit);
above = KQU(0xffffffffffffffff) - (KQU(0xffffffffffffffff) % limit);
while (1) {
ret = gen_rand64(ctx);
if (ret < above) {
ret %= limit;
break;
}
}
return ret;
}
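
Both range generators rely on rejection sampling: draws at or above the computed threshold would make the final modulo non-uniform, so they are re-drawn. A compact restatement of the 32-bit case (the helper name is illustrative):

static uint32_t
bounded_rand32(sfmt_t *ctx, uint32_t limit)
{
	uint32_t above = 0xffffffffU - (0xffffffffU % limit);
	uint32_t ret;

	do {
		ret = gen_rand32(ctx);
	} while (ret >= above);
	return (ret % limit);
}

The KQU() change in the 64-bit variant only swaps the hard-coded LLU literal suffix for a macro that picks the right suffix per platform.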

View File

@@ -0,0 +1,8 @@
#include "test/jemalloc_test.h"
void *
btalloc(size_t size, unsigned bits)
{
return (btalloc_0(size, bits));
}

View File

@@ -0,0 +1,3 @@
#include "test/jemalloc_test.h"
btalloc_n_gen(0)

View File

@@ -0,0 +1,3 @@
#include "test/jemalloc_test.h"
btalloc_n_gen(1)
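
The two btalloc_n_gen() instantiations are tiny on purpose: as far as the harness shows, btalloc ("backtrace alloc") routes an allocation through a chain of generated functions selected by the bits argument, so profiling tests can produce allocations with distinct, predictable backtraces; btalloc_0 and btalloc_1 are simply the two generated entry points.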

View File

@@ -1,5 +1,9 @@
#include "test/jemalloc_test.h"
#ifndef _CRT_SPINCOUNT
#define _CRT_SPINCOUNT 4000
#endif
bool
mtx_init(mtx_t *mtx)
{
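
The _CRT_SPINCOUNT fallback matters on Windows, where mtx_init() presumably hands it to InitializeCriticalSectionAndSpinCount(); a sketch of the guarded call, assuming the mtx_t wrapper carries a CRITICAL_SECTION named lock:

#ifdef _WIN32
	if (!InitializeCriticalSectionAndSpinCount(&mtx->lock, _CRT_SPINCOUNT))
		return (true);
#endif

Returning true mirrors jemalloc's usual true-on-failure convention for init functions.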

View File

@@ -61,13 +61,26 @@ p_test_fini(void)
}
test_status_t
p_test(test_t* t, ...)
p_test(test_t *t, ...)
{
test_status_t ret = test_status_pass;
test_status_t ret;
va_list ap;
/*
* Make sure initialization occurs prior to running tests. Tests are
* special because they may use internal facilities prior to triggering
* initialization as a side effect of calling into the public API. This
* is a final safety that works even if jemalloc_constructor() doesn't
* run, as for MSVC builds.
*/
if (nallocx(1, 0) == 0) {
malloc_printf("Initialization error");
return (test_status_fail);
}
ret = test_status_pass;
va_start(ap, t);
for (; t != NULL; t = va_arg(ap, test_t*)) {
for (; t != NULL; t = va_arg(ap, test_t *)) {
t();
if (test_status > ret)
ret = test_status;

Some files were not shown because too many files have changed in this diff.