Merge branch 'master' into openbsd-minimal

2020-10-17 17:38:23 +02:00 · 2020-10-17 17:38:23 +02:00 · 35a7247a2c
commit 35a7247a2c
parent 161eb4a000 245d98d32d
252 changed files with 61579 additions and 20871 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -55,10 +55,7 @@ message("Configuring zig version ${ZIG_VERSION}")
 set(ZIG_STATIC off CACHE BOOL "Attempt to build a static zig executable (not compatible with glibc)")
 set(ZIG_STATIC_LLVM off CACHE BOOL "Prefer linking against static LLVM libraries")
 set(ZIG_PREFER_CLANG_CPP_DYLIB off CACHE BOOL "Try to link against -lclang-cpp")
-set(ZIG_WORKAROUND_4799 off CACHE BOOL "workaround for https://github.com/ziglang/zig/issues/4799")
-set(ZIG_WORKAROUND_POLLY_SO off CACHE STRING "workaround for https://github.com/ziglang/zig/issues/4799")
 set(ZIG_USE_CCACHE off CACHE BOOL "Use ccache if available")
-set(ZIG_WORKAROUND_6087 off CACHE BOOL "workaround for https://github.com/ziglang/zig/issues/6087")

 if(CCACHE_PROGRAM AND ZIG_USE_CCACHE)
    SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
@ -88,18 +85,12 @@ find_package(lld)

 if(APPLE AND ZIG_STATIC)
    list(REMOVE_ITEM LLVM_LIBRARIES "-lz")
-    find_library(ZLIB NAMES z zlib libz)
+    find_library(ZLIB NAMES libz.a z zlib libz)
    list(APPEND LLVM_LIBRARIES "${ZLIB}")
-endif()

-if(APPLE AND ZIG_WORKAROUND_6087)
-    list(REMOVE_ITEM LLVM_LIBRARIES "-llibxml2.tbd")
-    list(APPEND LLVM_LIBRARIES "-lxml2")
-endif()
-
-if(APPLE AND ZIG_WORKAROUND_4799)
-  # eg: ${CMAKE_PREFIX_PATH} could be /usr/local/opt/llvm/
-  list(APPEND LLVM_LIBRARIES "-Wl,${CMAKE_PREFIX_PATH}/lib/libPolly.a" "-Wl,${CMAKE_PREFIX_PATH}/lib/libPollyPPCG.a" "-Wl,${CMAKE_PREFIX_PATH}/lib/libPollyISL.a")
+    list(REMOVE_ITEM LLVM_LIBRARIES "-lcurses")
+    find_library(CURSES NAMES libcurses.a curses libcurses libncurses.a ncurses libncurses)
+    list(APPEND LLVM_LIBRARIES "${CURSES}")
 endif()

 set(ZIG_CPP_LIB_DIR "${CMAKE_BINARY_DIR}/zigcpp")
@ -363,7 +354,7 @@ endif()
 if(MSVC)
    set(EXE_CFLAGS "${EXE_CFLAGS}")
 else()
-    set(EXE_CFLAGS "${EXE_CFLAGS} -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -D_GNU_SOURCE -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -Werror=type-limits -Wno-missing-braces")
+    set(EXE_CFLAGS "${EXE_CFLAGS} -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -D_GNU_SOURCE -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -Werror=type-limits -Wno-missing-braces -Wno-comment")
    if(MINGW)
        set(EXE_CFLAGS "${EXE_CFLAGS} -Wno-format")
    endif()
@ -415,9 +406,6 @@ target_link_libraries(zigcpp LINK_PUBLIC
    ${LLVM_LIBRARIES}
    ${CMAKE_THREAD_LIBS_INIT}
 )
-if(ZIG_WORKAROUND_POLLY_SO)
-  target_link_libraries(zigcpp LINK_PUBLIC "-Wl,${ZIG_WORKAROUND_POLLY_SO}")
-endif()

 add_library(opt_c_util STATIC ${OPTIMIZED_C_SOURCES})
 set_target_properties(opt_c_util PROPERTIES
--- a/README.md
+++ b/README.md
@ -32,7 +32,7 @@ This step must be repeated when you make changes to any of the C++ source code.

 * cmake >= 2.8.5
 * gcc >= 5.0.0 or clang >= 3.6.0
- * LLVM, Clang, LLD development libraries == 10.x, compiled with the same gcc or clang version above
+ * LLVM, Clang, LLD development libraries == 11.x, compiled with the same gcc or clang version above
   - Use the system package manager, or [build from source](https://github.com/ziglang/zig/wiki/How-to-build-LLVM,-libclang,-and-liblld-from-source#posix).

 ##### Windows
@ -41,7 +41,7 @@ This step must be repeated when you make changes to any of the C++ source code.
 * Microsoft Visual Studio. Supported versions:
   - 2017 (version 15.8)
   - 2019 (version 16)
- * LLVM, Clang, LLD development libraries == 10.x
+ * LLVM, Clang, LLD development libraries == 11.x
   - Use the [pre-built binaries](https://github.com/ziglang/zig/wiki/Building-Zig-on-Windows) or [build from source](https://github.com/ziglang/zig/wiki/How-to-build-LLVM,-libclang,-and-liblld-from-source#windows).

 #### Instructions
@ -68,19 +68,6 @@ cmake .. -DCMAKE_PREFIX_PATH=$(brew --prefix llvm)
 make install
 ```

-You will now run into this issue:
-[homebrew and llvm 10 packages in apt.llvm.org are broken with undefined reference to getPollyPluginInfo](https://github.com/ziglang/zig/issues/4799)
-or
-[error: unable to create target: 'Unable to find target for this triple (no targets are registered)'](https://github.com/ziglang/zig/issues/5055),
-in which case try `-DZIG_WORKAROUND_4799=ON`
-
-This has been fixed upstream with LLVM 10.0.1.
-
-Building with LLVM 10.0.1 you might run into this problem:
-`ld: library not found for -llibxml2.tbd`
-[Building with LLVM 10.0.1 installed via Homebrew fails](https://github.com/ziglang/zig/issues/6087),
-in which case you can try `-DZIG_WORKAROUND_6087=ON`.
-
 ##### Windows

 See https://github.com/ziglang/zig/wiki/Building-Zig-on-Windows
--- a/ci/azure/linux_script
+++ b/ci/azure/linux_script
@ -5,14 +5,14 @@ set -e

 BUILDDIR="$(pwd)"

-sudo sh -c 'echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main" >> /etc/apt/sources.list'
+sudo sh -c 'echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main" >> /etc/apt/sources.list'
 wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add -
 sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
 sudo apt-get update -q

 sudo apt-get remove -y llvm-*
 sudo rm -rf /usr/local/*
-sudo apt-get install -y libxml2-dev libclang-10-dev llvm-10 llvm-10-dev liblld-10-dev cmake s3cmd gcc-7 g++-7 ninja-build tidy
+sudo apt-get install -y libxml2-dev libclang-11-dev llvm-11 llvm-11-dev liblld-11-dev cmake s3cmd gcc-7 g++-7 ninja-build tidy

 QEMUBASE="qemu-linux-x86_64-5.1.0"
 wget https://ziglang.org/deps/$QEMUBASE.tar.xz
@ -44,7 +44,7 @@ VERSION="$(./zig version)"
 if [ "${BUILD_REASON}" != "PullRequest" ]; then
  ARTIFACTSDIR="$BUILDDIR/artifacts"
  mkdir "$ARTIFACTSDIR"
-  docker run -i --mount type=bind,source="$ARTIFACTSDIR",target=/z ziglang/static-base:llvm10-x86_64-1 -j2 $BUILD_SOURCEVERSION
+  docker run -i --mount type=bind,source="$ARTIFACTSDIR",target=/z ziglang/static-base:llvm11-x86_64-1 -j2 $BUILD_SOURCEVERSION
  TARBALL="$(ls $ARTIFACTSDIR)"
  mv "$DOWNLOADSECUREFILE_SECUREFILEPATH" "$HOME/.s3cfg"
  s3cmd put -P --add-header="cache-control: public, max-age=31536000, immutable" "$ARTIFACTSDIR/$TARBALL" s3://ziglang.org/builds/
--- a/ci/azure/macos_script
+++ b/ci/azure/macos_script
@ -3,28 +3,26 @@
 set -x
 set -e

-system_profiler SPHardwareDataType
-
-brew install s3cmd gcc@9
+brew install s3cmd

 ZIGDIR="$(pwd)"
-LLVMVER="10.0.1"
 ARCH="x86_64"
-CACHE_BASENAME="llvm+clang+lld-$LLVMVER-$ARCH-macosx-gcc9-release"
+CACHE_BASENAME="zig+llvm+lld+clang-$ARCH-macos-gnu-0.6.0+1c9ef63a"
 PREFIX="$HOME/$CACHE_BASENAME"
 JOBS="-j2"

-# I tried using the system default compiler (clang), but it couldn't statically link libc++.
-# So we use gcc-9 from homebrew.
-export CC=gcc-9
-export CXX=g++-9
-
 rm -rf $PREFIX
-
 cd $HOME
-wget -nv "https://ziglang.org/builds/$CACHE_BASENAME.tar.xz"
+wget -nv "https://ziglang.org/deps/$CACHE_BASENAME.tar.xz"
 tar xf "$CACHE_BASENAME.tar.xz"

+ZIG="$PREFIX/bin/zig"
+NATIVE_LIBC_TXT="$HOME/native_libc.txt"
+$ZIG libc > "$NATIVE_LIBC_TXT"
+export ZIG_LIBC="$NATIVE_LIBC_TXT"
+export CC="$ZIG cc"
+export CXX="$ZIG c++"
+
 cd $ZIGDIR

 # Make the `zig version` number consistent.
@ -33,7 +31,21 @@ git config core.abbrev 9

 mkdir build
 cd build
-cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=$PREFIX -DCMAKE_INSTALL_PREFIX=$(pwd)/release -DZIG_STATIC=ON
+cmake .. \
+  -DCMAKE_INSTALL_PREFIX="$(pwd)/release" \
+  -DCMAKE_PREFIX_PATH="$PREFIX" \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DZIG_TARGET_TRIPLE="$ARCH-native-gnu" \
+  -DZIG_TARGET_MCPU="baseline" \
+  -DZIG_EXECUTABLE="$ZIG" \
+  -DZIG_STATIC=ON
+
+# Now cmake will use zig as the C/C++ compiler. We reset the environment variables
+# so that installation and testing do not get affected by them.
+unset CC
+unset CXX
+unset ZIG_LIBC
+
 make $JOBS install
 release/bin/zig build test

--- a/ci/azure/pipelines.yml
+++ b/ci/azure/pipelines.yml
@ -1,7 +1,7 @@
 jobs:
 - job: BuildMacOS
  pool:
-    vmImage: 'macOS 10.14'
+    vmImage: 'macOS-10.15'

  timeoutInMinutes: 360

--- a/ci/azure/windows_msvc_install
+++ b/ci/azure/windows_msvc_install
@ -7,5 +7,5 @@ pacman -Su --needed --noconfirm
 pacman -S --needed --noconfirm wget p7zip python3-pip tar xz

 pip install s3cmd
-wget -nv "https://ziglang.org/deps/llvm%2bclang%2blld-10.0.0-x86_64-windows-msvc-release-mt.tar.xz"
-tar xf llvm+clang+lld-10.0.0-x86_64-windows-msvc-release-mt.tar.xz
+wget -nv "https://ziglang.org/deps/llvm%2bclang%2blld-11.0.0-x86_64-windows-msvc-release-mt.tar.xz"
+tar xf llvm+clang+lld-11.0.0-x86_64-windows-msvc-release-mt.tar.xz
--- a/ci/azure/windows_msvc_script.bat
+++ b/ci/azure/windows_msvc_script.bat
@ -11,7 +11,7 @@ SET "MSYSTEM=%PREVMSYSTEM%"

 SET "ZIGBUILDDIR=%SRCROOT%\build"
 SET "ZIGINSTALLDIR=%ZIGBUILDDIR%\dist"
-SET "ZIGPREFIXPATH=%SRCROOT%\llvm+clang+lld-10.0.0-x86_64-windows-msvc-release-mt"
+SET "ZIGPREFIXPATH=%SRCROOT%\llvm+clang+lld-11.0.0-x86_64-windows-msvc-release-mt"

 call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64

--- a/ci/drone/drone.yml
+++ b/ci/drone/drone.yml
@ -7,7 +7,7 @@ platform:

 steps:
 - name: build-and-test
-  image: ziglang/static-base:llvm10-aarch64-1
+  image: ziglang/static-base:llvm11-aarch64-1
  environment:
    SRHT_OAUTH_TOKEN:
      from_secret: SRHT_OAUTH_TOKEN
--- a/ci/srht/freebsd_script
+++ b/ci/srht/freebsd_script
@ -7,7 +7,7 @@ sudo pkg update -fq
 sudo pkg install -y cmake py27-s3cmd wget curl jq

 ZIGDIR="$(pwd)"
-CACHE_BASENAME="llvm+clang+lld-10.0.0-x86_64-freebsd-release"
+CACHE_BASENAME="llvm+clang+lld-11.0.0-x86_64-freebsd-release"
 PREFIX="$HOME/$CACHE_BASENAME"
 JOBS="-j$(sysctl -n hw.ncpu)"

@ -31,24 +31,7 @@ cd build
 cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=$PREFIX "-DCMAKE_INSTALL_PREFIX=$(pwd)/release" -DZIG_STATIC=ON
 make $JOBS install

-release/bin/zig build test-fmt
-release/bin/zig build test-behavior
-# TODO get these tests passing on freebsd and re-enable
-#release/bin/zig build test-std
-release/bin/zig build test-compiler-rt
-release/bin/zig build test-compare-output
-release/bin/zig build test-standalone
-release/bin/zig build test-stack-traces
-release/bin/zig build test-cli
-release/bin/zig build test-asm-link
-release/bin/zig build test-runtime-safety
-release/bin/zig build test-translate-c
-release/bin/zig build test-run-translated-c
-# TODO disabled until we are shipping self-hosted
-#release/bin/zig build test-gen-h
-# TODO disabled to save time and hit that 45 minute limit
-#release/bin/zig build test-compile-errors
-release/bin/zig build docs
+release/bin/zig build test

 if [ -f ~/.s3cfg ]; then
  mv ../LICENSE release/
--- a/cmake/Findclang.cmake
+++ b/cmake/Findclang.cmake
@ -9,27 +9,27 @@

 find_path(CLANG_INCLUDE_DIRS NAMES clang/Frontend/ASTUnit.h
  PATHS
-    /usr/lib/llvm/10/include
-    /usr/lib/llvm-10/include
-    /usr/lib/llvm-10.0/include
-    /usr/local/llvm100/include
-    /usr/local/llvm10/include
+    /usr/lib/llvm/11/include
+    /usr/lib/llvm-11/include
+    /usr/lib/llvm-11.0/include
+    /usr/local/llvm110/include
+    /usr/local/llvm11/include
    /mingw64/include
 )

 if(ZIG_PREFER_CLANG_CPP_DYLIB)
  find_library(CLANG_LIBRARIES
    NAMES
-      clang-cpp-10.0
-      clang-cpp100
+      clang-cpp-11.0
+      clang-cpp110
      clang-cpp
    PATHS
      ${CLANG_LIBDIRS}
-      /usr/lib/llvm/10/lib
-      /usr/lib/llvm/10/lib64
-      /usr/lib/llvm-10/lib
-      /usr/local/llvm100/lib
-      /usr/local/llvm10/lib
+      /usr/lib/llvm/11/lib
+      /usr/lib/llvm/11/lib64
+      /usr/lib/llvm-11/lib
+      /usr/local/llvm110/lib
+      /usr/local/llvm11/lib
  )
 endif()

@ -39,11 +39,11 @@ if(NOT CLANG_LIBRARIES)
    find_library(CLANG_${_prettylibname_}_LIB NAMES ${_libname_}
      PATHS
        ${CLANG_LIBDIRS}
-        /usr/lib/llvm/10/lib
-        /usr/lib/llvm-10/lib
-        /usr/lib/llvm-10.0/lib
-        /usr/local/llvm100/lib
-        /usr/local/llvm10/lib
+        /usr/lib/llvm/11/lib
+        /usr/lib/llvm-11/lib
+        /usr/lib/llvm-11.0/lib
+        /usr/local/llvm110/lib
+        /usr/local/llvm11/lib
        /mingw64/lib
        /c/msys64/mingw64/lib
        c:\\msys64\\mingw64\\lib
@ -75,6 +75,7 @@ if(NOT CLANG_LIBRARIES)
  FIND_AND_ADD_CLANG_LIB(clangRewrite)
  FIND_AND_ADD_CLANG_LIB(clangCrossTU)
  FIND_AND_ADD_CLANG_LIB(clangIndex)
+  FIND_AND_ADD_CLANG_LIB(clangToolingCore)
 endif()

 include(FindPackageHandleStandardArgs)
--- a/cmake/Findlld.cmake
+++ b/cmake/Findlld.cmake
@ -8,16 +8,16 @@

 find_path(LLD_INCLUDE_DIRS NAMES lld/Common/Driver.h
    PATHS
-        /usr/lib/llvm-10/include
-        /usr/local/llvm100/include
-        /usr/local/llvm10/include
+        /usr/lib/llvm-11/include
+        /usr/local/llvm110/include
+        /usr/local/llvm11/include
        /mingw64/include)

-find_library(LLD_LIBRARY NAMES lld-10.0 lld100 lld
+find_library(LLD_LIBRARY NAMES lld-11.0 lld110 lld
    PATHS
-        /usr/lib/llvm-10/lib
-        /usr/local/llvm100/lib
-        /usr/local/llvm10/lib
+        /usr/lib/llvm-11/lib
+        /usr/local/llvm110/lib
+        /usr/local/llvm11/lib
 )
 if(EXISTS ${LLD_LIBRARY})
    set(LLD_LIBRARIES ${LLD_LIBRARY})
@ -27,9 +27,9 @@ else()
        find_library(LLD_${_prettylibname_}_LIB NAMES ${_libname_}
            PATHS
                ${LLD_LIBDIRS}
-                /usr/lib/llvm-10/lib
-                /usr/local/llvm100/lib
-                /usr/local/llvm10/lib
+                /usr/lib/llvm-11/lib
+                /usr/local/llvm110/lib
+                /usr/local/llvm11/lib
                /mingw64/lib
                /c/msys64/mingw64/lib
                c:/msys64/mingw64/lib)
--- a/cmake/Findllvm.cmake
+++ b/cmake/Findllvm.cmake
@ -9,37 +9,37 @@

 find_path(LLVM_INCLUDE_DIRS NAMES llvm/IR/IRBuilder.h
  PATHS
-    /usr/lib/llvm/10/include
-    /usr/lib/llvm-10/include
-    /usr/lib/llvm-10.0/include
-    /usr/local/llvm10/include
-    /usr/local/llvm100/include
+    /usr/lib/llvm/11/include
+    /usr/lib/llvm-11/include
+    /usr/lib/llvm-11.0/include
+    /usr/local/llvm11/include
+    /usr/local/llvm110/include
    /mingw64/include
 )

 if(ZIG_PREFER_CLANG_CPP_DYLIB)
  find_library(LLVM_LIBRARIES
    NAMES
-      LLVM-10.0
-      LLVM-10
-      LLVM-100
+      LLVM-11.0
+      LLVM-11
+      LLVM-110
      LLVM
    PATHS
      ${LLVM_LIBDIRS}
-      /usr/lib/llvm/10/lib
-      /usr/lib/llvm/10/lib64
-      /usr/lib/llvm-10/lib
-      /usr/local/llvm10/lib
-      /usr/local/llvm100/lib
+      /usr/lib/llvm/11/lib
+      /usr/lib/llvm/11/lib64
+      /usr/lib/llvm-11/lib
+      /usr/local/llvm11/lib
+      /usr/local/llvm110/lib
  )
-elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG)
+
  find_program(LLVM_CONFIG_EXE
-      NAMES llvm-config-10 llvm-config-10.0 llvm-config100 llvm-config10 llvm-config
+      NAMES llvm-config-11 llvm-config-11.0 llvm-config110 llvm-config11 llvm-config
      PATHS
          "/mingw64/bin"
          "/c/msys64/mingw64/bin"
          "c:/msys64/mingw64/bin"
-          "C:/Libraries/llvm-10.0.0/bin")
+          "C:/Libraries/llvm-11.0.0/bin")

  if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND")
    message(FATAL_ERROR "unable to find llvm-config")
@ -54,14 +54,45 @@ elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG)
    OUTPUT_VARIABLE LLVM_CONFIG_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE)

-  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 10)
-    message(FATAL_ERROR "expected LLVM 10.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 11)
+    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 11)
-    message(FATAL_ERROR "expected LLVM 10.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 12)
+    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 11)
-    message(FATAL_ERROR "expected LLVM 10.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  endif()
+elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG)
+  find_program(LLVM_CONFIG_EXE
+      NAMES llvm-config-11 llvm-config-11.0 llvm-config110 llvm-config11 llvm-config
+      PATHS
+          "/mingw64/bin"
+          "/c/msys64/mingw64/bin"
+          "c:/msys64/mingw64/bin"
+          "C:/Libraries/llvm-11.0.0/bin")
+
+  if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND")
+    message(FATAL_ERROR "unable to find llvm-config")
+  endif()
+
+  if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND")
+    message(FATAL_ERROR "unable to find llvm-config")
+  endif()
+
+  execute_process(
+    COMMAND ${LLVM_CONFIG_EXE} --version
+    OUTPUT_VARIABLE LLVM_CONFIG_VERSION
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 11)
+    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  endif()
+  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 12)
+    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  endif()
+  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 11)
+    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()

  execute_process(
@ -78,6 +109,7 @@ elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG)
  NEED_TARGET("AArch64")
  NEED_TARGET("AMDGPU")
  NEED_TARGET("ARM")
+  NEED_TARGET("AVR")
  NEED_TARGET("BPF")
  NEED_TARGET("Hexagon")
  NEED_TARGET("Lanai")
@ -134,26 +166,25 @@ elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG)
  set(LLVM_LIBRARIES ${LLVM_LIBRARIES} ${LLVM_SYSTEM_LIBS})

  if(NOT LLVM_LIBRARIES)
-    find_library(LLVM_LIBRARIES NAMES LLVM LLVM-10 LLVM-10.0)
+    find_library(LLVM_LIBRARIES NAMES LLVM LLVM-11 LLVM-11.0)
  endif()

  link_directories("${CMAKE_PREFIX_PATH}/lib")
  link_directories("${LLVM_LIBDIRS}")
 else()
  # Here we assume that we're cross compiling with Zig, of course. No reason
-  # to support more complicated setups. We also assume the experimental target
-  # AVR is enabled.
+  # to support more complicated setups.

  macro(FIND_AND_ADD_LLVM_LIB _libname_)
    string(TOUPPER ${_libname_} _prettylibname_)
    find_library(LLVM_${_prettylibname_}_LIB NAMES ${_libname_}
      PATHS
      ${LLVM_LIBDIRS}
-      /usr/lib/llvm/10/lib
-      /usr/lib/llvm-10/lib
-      /usr/lib/llvm-10.0/lib
-      /usr/local/llvm100/lib
-      /usr/local/llvm10/lib
+      /usr/lib/llvm/11/lib
+      /usr/lib/llvm-11/lib
+      /usr/lib/llvm-11.0/lib
+      /usr/local/llvm110/lib
+      /usr/local/llvm11/lib
      /mingw64/lib
      /c/msys64/mingw64/lib
      c:\\msys64\\mingw64\\lib)
@ -174,12 +205,13 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMMCA)
  FIND_AND_ADD_LLVM_LIB(LLVMLTO)
  FIND_AND_ADD_LLVM_LIB(LLVMPasses)
+  FIND_AND_ADD_LLVM_LIB(LLVMCoroutines)
  FIND_AND_ADD_LLVM_LIB(LLVMObjCARCOpts)
+  FIND_AND_ADD_LLVM_LIB(LLVMExtensions)
  FIND_AND_ADD_LLVM_LIB(LLVMLineEditor)
  FIND_AND_ADD_LLVM_LIB(LLVMLibDriver)
  FIND_AND_ADD_LLVM_LIB(LLVMInterpreter)
  FIND_AND_ADD_LLVM_LIB(LLVMFuzzMutate)
-  FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenMP)
  FIND_AND_ADD_LLVM_LIB(LLVMMCJIT)
  FIND_AND_ADD_LLVM_LIB(LLVMExecutionEngine)
  FIND_AND_ADD_LLVM_LIB(LLVMRuntimeDyld)
@ -188,21 +220,14 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMOption)
  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoGSYM)
  FIND_AND_ADD_LLVM_LIB(LLVMCoverage)
-  FIND_AND_ADD_LLVM_LIB(LLVMCoroutines)
-  FIND_AND_ADD_LLVM_LIB(LLVMAVRDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMAVRCodeGen)
-  FIND_AND_ADD_LLVM_LIB(LLVMAVRAsmParser)
-  FIND_AND_ADD_LLVM_LIB(LLVMAVRDesc)
-  FIND_AND_ADD_LLVM_LIB(LLVMAVRInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreDisassembler)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMX86Disassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMX86AsmParser)
  FIND_AND_ADD_LLVM_LIB(LLVMX86CodeGen)
+  FIND_AND_ADD_LLVM_LIB(LLVMX86AsmParser)
  FIND_AND_ADD_LLVM_LIB(LLVMX86Desc)
-  FIND_AND_ADD_LLVM_LIB(LLVMX86Utils)
  FIND_AND_ADD_LLVM_LIB(LLVMX86Info)
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyDisassembler)
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyCodeGen)
@ -258,6 +283,11 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMBPFAsmParser)
  FIND_AND_ADD_LLVM_LIB(LLVMBPFDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMBPFInfo)
+  FIND_AND_ADD_LLVM_LIB(LLVMAVRDisassembler)
+  FIND_AND_ADD_LLVM_LIB(LLVMAVRCodeGen)
+  FIND_AND_ADD_LLVM_LIB(LLVMAVRAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMAVRDesc)
+  FIND_AND_ADD_LLVM_LIB(LLVMAVRInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMARMDisassembler)
  FIND_AND_ADD_LLVM_LIB(LLVMARMCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMARMAsmParser)
@ -273,6 +303,7 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMLinker)
  FIND_AND_ADD_LLVM_LIB(LLVMIRReader)
  FIND_AND_ADD_LLVM_LIB(LLVMAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenMP)
  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUAsmParser)
  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUUtils)
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@ -2839,81 +2839,81 @@ const mem = @import("std").mem;

 // Declare an enum.
 const Type = enum {
-    Ok,
-    NotOk,
+    ok,
+    not_ok,
 };

 // Declare a specific instance of the enum variant.
-const c = Type.Ok;
+const c = Type.ok;

 // If you want access to the ordinal value of an enum, you
 // can specify the tag type.
 const Value = enum(u2) {
-    Zero,
-    One,
-    Two,
+    zero,
+    one,
+    two,
 };

 // Now you can cast between u2 and Value.
 // The ordinal value starts from 0, counting up for each member.
 test "enum ordinal value" {
-    assert(@enumToInt(Value.Zero) == 0);
-    assert(@enumToInt(Value.One) == 1);
-    assert(@enumToInt(Value.Two) == 2);
+    assert(@enumToInt(Value.zero) == 0);
+    assert(@enumToInt(Value.one) == 1);
+    assert(@enumToInt(Value.two) == 2);
 }

 // You can override the ordinal value for an enum.
 const Value2 = enum(u32) {
-    Hundred = 100,
-    Thousand = 1000,
-    Million = 1000000,
+    hundred = 100,
+    thousand = 1000,
+    million = 1000000,
 };
 test "set enum ordinal value" {
-    assert(@enumToInt(Value2.Hundred) == 100);
-    assert(@enumToInt(Value2.Thousand) == 1000);
-    assert(@enumToInt(Value2.Million) == 1000000);
+    assert(@enumToInt(Value2.hundred) == 100);
+    assert(@enumToInt(Value2.thousand) == 1000);
+    assert(@enumToInt(Value2.million) == 1000000);
 }

 // Enums can have methods, the same as structs and unions.
 // Enum methods are not special, they are only namespaced
 // functions that you can call with dot syntax.
 const Suit = enum {
-    Clubs,
-    Spades,
-    Diamonds,
-    Hearts,
+    clubs,
+    spades,
+    diamonds,
+    hearts,

    pub fn isClubs(self: Suit) bool {
-        return self == Suit.Clubs;
+        return self == Suit.clubs;
    }
 };
 test "enum method" {
-    const p = Suit.Spades;
+    const p = Suit.spades;
    assert(!p.isClubs());
 }

 // An enum variant of different types can be switched upon.
 const Foo = enum {
-    String,
-    Number,
-    None,
+    string,
+    number,
+    none,
 };
 test "enum variant switch" {
-    const p = Foo.Number;
+    const p = Foo.number;
    const what_is_it = switch (p) {
-        Foo.String => "this is a string",
-        Foo.Number => "this is a number",
-        Foo.None => "this is a none",
+        Foo.string => "this is a string",
+        Foo.number => "this is a number",
+        Foo.none => "this is a none",
    };
    assert(mem.eql(u8, what_is_it, "this is a number"));
 }

 // @TagType can be used to access the integer tag type of an enum.
 const Small = enum {
-    One,
-    Two,
-    Three,
-    Four,
+    one,
+    two,
+    three,
+    four,
 };
 test "@TagType" {
    assert(@TagType(Small) == u2);
@ -2922,12 +2922,12 @@ test "@TagType" {
 // @typeInfo tells us the field count and the fields names:
 test "@typeInfo" {
    assert(@typeInfo(Small).Enum.fields.len == 4);
-    assert(mem.eql(u8, @typeInfo(Small).Enum.fields[1].name, "Two"));
+    assert(mem.eql(u8, @typeInfo(Small).Enum.fields[1].name, "two"));
 }

 // @tagName gives a []const u8 representation of an enum value:
 test "@tagName" {
-    assert(mem.eql(u8, @tagName(Small.Three), "Three"));
+    assert(mem.eql(u8, @tagName(Small.three), "three"));
 }
      {#code_end#}
      {#see_also|@typeInfo|@tagName|@sizeOf#}
@ -2937,14 +2937,14 @@ test "@tagName" {
      By default, enums are not guaranteed to be compatible with the C ABI:
      </p>
      {#code_begin|obj_err|parameter of type 'Foo' not allowed in function with calling convention 'C'#}
-const Foo = enum { A, B, C };
+const Foo = enum { a, b, c };
 export fn entry(foo: Foo) void { }
      {#code_end#}
      <p>
      For a C-ABI-compatible enum, use {#syntax#}extern enum{#endsyntax#}:
      </p>
      {#code_begin|obj#}
-const Foo = extern enum { A, B, C };
+const Foo = extern enum { a, b, c };
 export fn entry(foo: Foo) void { }
      {#code_end#}
      {#header_close#}
@ -2958,9 +2958,9 @@ const std = @import("std");

 test "packed enum" {
    const Number = packed enum(u8) {
-        One,
-        Two,
-        Three,
+        one,
+        two,
+        three,
    };
    std.debug.assert(@sizeOf(Number) == @sizeOf(u8));
 }
@ -2977,23 +2977,23 @@ const std = @import("std");
 const assert = std.debug.assert;

 const Color = enum {
-    Auto,
-    Off,
-    On,
+    auto,
+    off,
+    on,
 };

 test "enum literals" {
-    const color1: Color = .Auto;
-    const color2 = Color.Auto;
+    const color1: Color = .auto;
+    const color2 = Color.auto;
    assert(color1 == color2);
 }

 test "switch using enum literals" {
-    const color = Color.On;
+    const color = Color.on;
    const result = switch (color) {
-        .Auto => false,
-        .On => true,
-        .Off => false,
+        .auto => false,
+        .on => true,
+        .off => false,
    };
    assert(result);
 }
@ -3017,23 +3017,23 @@ const std = @import("std");
 const assert = std.debug.assert;

 const Number = enum(u8) {
-    One,
-    Two,
-    Three,
+    one,
+    two,
+    three,
    _,
 };

 test "switch on non-exhaustive enum" {
-    const number = Number.One;
+    const number = Number.one;
    const result = switch (number) {
-        .One => true,
-        .Two,
-        .Three => false,
+        .one => true,
+        .two,
+        .three => false,
        _ => false,
    };
    assert(result);
    const is_one = switch (number) {
-        .One => true,
+        .one => true,
        else => false,
    };
    assert(is_one);
@ -3055,13 +3055,13 @@ test "switch on non-exhaustive enum" {
      </p>
      {#code_begin|test_err|inactive union field#}
 const Payload = union {
-    Int: i64,
-    Float: f64,
-    Bool: bool,
+    int: i64,
+    float: f64,
+    boolean: bool,
 };
 test "simple union" {
-    var payload = Payload{ .Int = 1234 };
-    payload.Float = 12.34;
+    var payload = Payload{ .int = 1234 };
+    payload.float = 12.34;
 }
      {#code_end#}
      <p>You can activate another field by assigning the entire union:</p>
@ -3070,15 +3070,15 @@ const std = @import("std");
 const assert = std.debug.assert;

 const Payload = union {
-    Int: i64,
-    Float: f64,
-    Bool: bool,
+    int: i64,
+    float: f64,
+    boolean: bool,
 };
 test "simple union" {
-    var payload = Payload{ .Int = 1234 };
-    assert(payload.Int == 1234);
-    payload = Payload{ .Float = 12.34 };
-    assert(payload.Float == 12.34);
+    var payload = Payload{ .int = 1234 };
+    assert(payload.int == 1234);
+    payload = Payload{ .float = 12.34 };
+    assert(payload.float == 12.34);
 }
      {#code_end#}
      <p>
@ -3100,21 +3100,21 @@ const std = @import("std");
 const assert = std.debug.assert;

 const ComplexTypeTag = enum {
-    Ok,
-    NotOk,
+    ok,
+    not_ok,
 };
 const ComplexType = union(ComplexTypeTag) {
-    Ok: u8,
-    NotOk: void,
+    ok: u8,
+    not_ok: void,
 };

 test "switch on tagged union" {
-    const c = ComplexType{ .Ok = 42 };
-    assert(@as(ComplexTypeTag, c) == ComplexTypeTag.Ok);
+    const c = ComplexType{ .ok = 42 };
+    assert(@as(ComplexTypeTag, c) == ComplexTypeTag.ok);

    switch (c) {
-        ComplexTypeTag.Ok => |value| assert(value == 42),
-        ComplexTypeTag.NotOk => unreachable,
+        ComplexTypeTag.ok => |value| assert(value == 42),
+        ComplexTypeTag.not_ok => unreachable,
    }
 }

@ -3123,11 +3123,11 @@ test "@TagType" {
 }

 test "coerce to enum" {
-    const c1 = ComplexType{ .Ok = 42 };
-    const c2 = ComplexType.NotOk;
+    const c1 = ComplexType{ .ok = 42 };
+    const c2 = ComplexType.not_ok;

-    assert(c1 == .Ok);
-    assert(c2 == .NotOk);
+    assert(c1 == .ok);
+    assert(c2 == .not_ok);
 }
      {#code_end#}
      <p>In order to modify the payload of a tagged union in a switch expression,
@ -3138,24 +3138,24 @@ const std = @import("std");
 const assert = std.debug.assert;

 const ComplexTypeTag = enum {
-    Ok,
-    NotOk,
+    ok,
+    not_ok,
 };
 const ComplexType = union(ComplexTypeTag) {
-    Ok: u8,
-    NotOk: void,
+    ok: u8,
+    not_ok: void,
 };

 test "modify tagged union in switch" {
-    var c = ComplexType{ .Ok = 42 };
-    assert(@as(ComplexTypeTag, c) == ComplexTypeTag.Ok);
+    var c = ComplexType{ .ok = 42 };
+    assert(@as(ComplexTypeTag, c) == ComplexTypeTag.ok);

    switch (c) {
-        ComplexTypeTag.Ok => |*value| value.* += 1,
-        ComplexTypeTag.NotOk => unreachable,
+        ComplexTypeTag.ok => |*value| value.* += 1,
+        ComplexTypeTag.not_ok => unreachable,
    }

-    assert(c.Ok == 43);
+    assert(c.ok == 43);
 }
      {#code_end#}
      <p>
@ -3167,24 +3167,24 @@ const std = @import("std");
 const assert = std.debug.assert;

 const Variant = union(enum) {
-    Int: i32,
-    Bool: bool,
+    int: i32,
+    boolean: bool,

    // void can be omitted when inferring enum tag type.
-    None,
+    none,

    fn truthy(self: Variant) bool {
        return switch (self) {
-            Variant.Int => |x_int| x_int != 0,
-            Variant.Bool => |x_bool| x_bool,
-            Variant.None => false,
+            Variant.int => |x_int| x_int != 0,
+            Variant.boolean => |x_bool| x_bool,
+            Variant.none => false,
        };
    }
 };

 test "union method" {
-    var v1 = Variant{ .Int = 1 };
-    var v2 = Variant{ .Bool = false };
+    var v1 = Variant{ .int = 1 };
+    var v2 = Variant{ .boolean = false };

    assert(v1.truthy());
    assert(!v2.truthy());
@ -3199,12 +3199,12 @@ const std = @import("std");
 const assert = std.debug.assert;

 const Small2 = union(enum) {
-    A: i32,
-    B: bool,
-    C: u8,
+    a: i32,
+    b: bool,
+    c: u8,
 };
 test "@tagName" {
-    assert(std.mem.eql(u8, @tagName(Small2.C), "C"));
+    assert(std.mem.eql(u8, @tagName(Small2.a), "a"));
 }
      {#code_end#}
      {#header_close#}
@ -3249,6 +3249,31 @@ fn makeNumber() Number {

      {#header_close#}

+      {#header_open|opaque#}
+      <p>
+      {#syntax#}opaque {}{#endsyntax#} declares a new type with an unknown (but non-zero) size and alignment.
+      It can contain declarations the same as {#link|structs|struct#}, {#link|unions|union#},
+      and {#link|enums|enum#}.
+      </p>
+      <p>
+      This is typically used for type safety when interacting with C code that does not expose struct details.
+      Example:
+      </p>
+      {#code_begin|test_err|expected type '*Derp', found '*Wat'#}
+const Derp = opaque {};
+const Wat = opaque {};
+
+extern fn bar(d: *Derp) void;
+fn foo(w: *Wat) callconv(.C) void {
+    bar(w);
+}
+
+test "call foo" {
+    foo(undefined);
+}
+      {#code_end#}
+      {#header_close#}
+
      {#header_open|blocks#}
      <p>
      Blocks are used to limit the scope of variable declarations:
@ -3392,33 +3417,33 @@ test "switch on tagged union" {
        y: u8,
    };
    const Item = union(enum) {
-        A: u32,
-        C: Point,
-        D,
-        E: u32,
+        a: u32,
+        c: Point,
+        d,
+        e: u32,
    };

-    var a = Item{ .C = Point{ .x = 1, .y = 2 } };
+    var a = Item{ .c = Point{ .x = 1, .y = 2 } };

    // Switching on more complex enums is allowed.
    const b = switch (a) {
        // A capture group is allowed on a match, and will return the enum
        // value matched. If the payload types of both cases are the same
        // they can be put into the same switch prong.
-        Item.A, Item.E => |item| item,
+        Item.a, Item.e => |item| item,

        // A reference to the matched value can be obtained using `*` syntax.
-        Item.C => |*item| blk: {
+        Item.c => |*item| blk: {
            item.*.x += 1;
            break :blk 6;
        },

        // No else is required if the types cases was exhaustively handled
-        Item.D => 8,
+        Item.d => 8,
    };

    assert(b == 6);
-    assert(a.C.x == 2);
+    assert(a.c.x == 2);
 }
      {#code_end#}
      {#see_also|comptime|enum|@compileError|Compile Variables#}
@ -3430,16 +3455,16 @@ test "switch on tagged union" {
      </p>
      {#code_begin|test_err|not handled in switch#}
 const Color = enum {
-    Auto,
-    Off,
-    On,
+    auto,
+    off,
+    on,
 };

 test "exhaustive switching" {
-    const color = Color.Off;
+    const color = Color.off;
    switch (color) {
-        Color.Auto => {},
-        Color.On => {},
+        Color.auto => {},
+        Color.on => {},
    }
 }
      {#code_end#}
@ -3455,17 +3480,17 @@ const std = @import("std");
 const assert = std.debug.assert;

 const Color = enum {
-    Auto,
-    Off,
-    On,
+    auto,
+    off,
+    on,
 };

 test "enum literals with switch" {
-    const color = Color.Off;
+    const color = Color.off;
    const result = switch (color) {
-        .Auto => false,
-        .On => false,
-        .Off => true,
+        .auto => false,
+        .on => false,
+        .off => true,
    };
    assert(result);
 }
@ -5302,25 +5327,25 @@ const std = @import("std");
 const assert = std.debug.assert;

 const E = enum {
-    One,
-    Two,
-    Three,
+    one,
+    two,
+    three,
 };

 const U = union(E) {
-    One: i32,
-    Two: f32,
-    Three,
+    one: i32,
+    two: f32,
+    three,
 };

 test "coercion between unions and enums" {
-    var u = U{ .Two = 12.34 };
+    var u = U{ .two = 12.34 };
    var e: E = u;
-    assert(e == E.Two);
+    assert(e == E.two);

-    const three = E.Three;
+    const three = E.three;
    var another_u: U = three;
-    assert(another_u == E.Three);
+    assert(another_u == E.three);
 }
      {#code_end#}
      {#see_also|union|enum#}
@ -6096,44 +6121,44 @@ pub fn main() void {
 /// Calls print and then flushes the buffer.
 pub fn printf(self: *OutStream, comptime format: []const u8, args: anytype) anyerror!void {
    const State = enum {
-        Start,
-        OpenBrace,
-        CloseBrace,
+        start,
+        open_brace,
+        close_brace,
    };

    comptime var start_index: usize = 0;
-    comptime var state = State.Start;
+    comptime var state = State.start;
    comptime var next_arg: usize = 0;

    inline for (format) |c, i| {
        switch (state) {
-            State.Start => switch (c) {
+            State.start => switch (c) {
                '{' => {
                    if (start_index < i) try self.write(format[start_index..i]);
-                    state = State.OpenBrace;
+                    state = State.open_brace;
                },
                '}' => {
                    if (start_index < i) try self.write(format[start_index..i]);
-                    state = State.CloseBrace;
+                    state = State.close_brace;
                },
                else => {},
            },
-            State.OpenBrace => switch (c) {
+            State.open_brace => switch (c) {
                '{' => {
-                    state = State.Start;
+                    state = State.start;
                    start_index = i;
                },
                '}' => {
                    try self.printValue(args[next_arg]);
                    next_arg += 1;
-                    state = State.Start;
+                    state = State.start;
                    start_index = i + 1;
                },
                else => @compileError("Unknown format character: " ++ c),
            },
-            State.CloseBrace => switch (c) {
+            State.close_brace => switch (c) {
                '}' => {
-                    state = State.Start;
+                    state = State.start;
                    start_index = i;
                },
                else => @compileError("Single '}' encountered in format string"),
@ -8547,30 +8572,6 @@ fn foo(comptime T: type, ptr: *T) T {
      {#header_close#}
      {#header_close#}

-      {#header_open|opaque#}
-      <p>
-      {#syntax#}opaque {}{#endsyntax#} declares a new type with an unknown (but non-zero) size and alignment.
-      It can have declarations like structs, unions, or enums.
-      </p>
-      <p>
-      This is typically used for type safety when interacting with C code that does not expose struct details.
-      Example:
-      </p>
-      {#code_begin|test_err|expected type '*Derp', found '*Wat'#}
-const Derp = opaque {};
-const Wat = opaque {};
-
-extern fn bar(d: *Derp) void;
-fn foo(w: *Wat) callconv(.C) void {
-    bar(w);
-}
-
-test "call foo" {
-    foo(undefined);
-}
-      {#code_end#}
-      {#header_close#}
-
      {#header_open|Build Mode#}
      <p>
      Zig has four build modes:
@ -9069,9 +9070,9 @@ pub fn main() void {
      <p>At compile-time:</p>
      {#code_begin|test_err|has no tag matching integer value 3#}
 const Foo = enum {
-    A,
-    B,
-    C,
+    a,
+    b,
+    c,
 };
 comptime {
    const a: u2 = 3;
@ -9083,9 +9084,9 @@ comptime {
 const std = @import("std");

 const Foo = enum {
-    A,
-    B,
-    C,
+    a,
+    b,
+    c,
 };

 pub fn main() void {
@ -9625,10 +9626,102 @@ test "assert in release fast mode" {
      isolation.
      </p>
      {#header_close#}
+
      {#header_open|Zig Build System#}
-      <p>TODO: explain purpose, it's supposed to replace make/cmake</p>
-      <p>TODO: example of building a zig executable</p>
-      <p>TODO: example of building a C library</p>
+      <p>
+      The Zig Build System provides a cross-platform, dependency-free way to declare
+      the logic required to build a project. With this system, the logic to build
+      a project is written in a build.zig file, using the Zig Build System API to
+      declare and configure build artifacts and other tasks.
+      </p>
+      <p>
+      Some examples of tasks the build system can help with:
+      </p>
+      <ul>
+        <li>Creating build artifacts by executing the Zig compiler. This includes
+          building Zig source code as well as C and C++ source code.</li>
+        <li>Capturing user-configured options and using those options to configure
+          the build.</li>
+        <li>Surfacing build configuration as {#link|comptime#} values by providing a
+          file that can be {#link|imported|@import#} by Zig code.</li>
+        <li>Caching build artifacts to avoid unnecessarily repeating steps.</li>
+        <li>Executing build artifacts or system-installed tools.</li>
+        <li>Running tests and verifying the output of executing a build artifact matches
+        the expected value.</li>
+        <li>Running <code>zig fmt</code> on a codebase or a subset of it.</li>
+        <li>Custom tasks.</li>
+      </ul>
+      <p>
+      To use the build system, run <code class="shell">zig build --help</code>
+      to see a command-line usage help menu. This will include project-specific
+      options that were declared in the build.zig script.
+      </p>
+      
+      {#header_open|Building an Executable#}
+      <p>This <code>build.zig</code> file is automatically generated
+        by <code>zig init-exe</code>.</p>
+      {#code_begin|syntax|build#}
+const Builder = @import("std").build.Builder;
+
+pub fn build(b: *Builder) void {
+    // Standard target options allows the person running `zig build` to choose
+    // what target to build for. Here we do not override the defaults, which
+    // means any target is allowed, and the default is native. Other options
+    // for restricting supported target set are available.
+    const target = b.standardTargetOptions(.{});
+
+    // Standard release options allow the person running `zig build` to select
+    // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall.
+    const mode = b.standardReleaseOptions();
+
+    const exe = b.addExecutable("example", "src/main.zig");
+    exe.setTarget(target);
+    exe.setBuildMode(mode);
+    exe.install();
+
+    const run_cmd = exe.run();
+    run_cmd.step.dependOn(b.getInstallStep());
+    if (b.args) |args| {
+        run_cmd.addArgs(args);
+    }
+
+    const run_step = b.step("run", "Run the app");
+    run_step.dependOn(&run_cmd.step);
+}
+      {#code_end#}
+      {#header_close#}
+
+      {#header_open|Building a Library#}
+      <p>This <code>build.zig</code> file is automatically generated
+        by <code>zig init-lib</code>.</p>
+      {#code_begin|syntax|build#}
+const Builder = @import("std").build.Builder;
+
+pub fn build(b: *Builder) void {
+    const mode = b.standardReleaseOptions();
+    const lib = b.addStaticLibrary("example", "src/main.zig");
+    lib.setBuildMode(mode);
+    lib.install();
+
+    var main_tests = b.addTest("src/main.zig");
+    main_tests.setBuildMode(mode);
+
+    const test_step = b.step("test", "Run library tests");
+    test_step.dependOn(&main_tests.step);
+}
+      {#code_end#}
+      {#header_close#}
+
+      {#header_open|Compiling C Source Code#}
+      <pre>{#syntax#}
+lib.addCSourceFile("src/lib.c", &[_][]const u8{
+    "-Wall",
+    "-Wextra",
+    "-Werror",
+});
+      {#endsyntax#}</pre>
+      {#header_close#}
+
      {#header_close#}
      {#header_open|C#}
      <p>
@ -9905,7 +9998,10 @@ const std = @import("std");
 const PreopenList = std.fs.wasi.PreopenList;

 pub fn main() !void {
-    var preopens = PreopenList.init(std.heap.page_allocator);
+    var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{}){};
+    const gpa = &general_purpose_allocator.allocator;
+
+    var preopens = PreopenList.init(gpa);
    defer preopens.deinit();

    try preopens.populate();
--- a/lib/include/__clang_cuda_cmath.h
+++ b/lib/include/__clang_cuda_cmath.h
@ -12,7 +12,9 @@
 #error "This file is for CUDA compilation only."
 #endif

+#ifndef __OPENMP_NVPTX__
 #include <limits>
+#endif

 // CUDA lets us use various std math functions on the device side.  This file
 // works in concert with __clang_cuda_math_forward_declares.h to make this work.
@ -30,32 +32,16 @@
 // implementation.  Declaring in the global namespace and pulling into namespace
 // std covers all of the known knowns.

-#ifdef _OPENMP
-#define __DEVICE__ static __attribute__((always_inline))
+#ifdef __OPENMP_NVPTX__
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
 #else
 #define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
 #endif

-// For C++ 17 we need to include noexcept attribute to be compatible
-// with the header-defined version. This may be removed once
-// variant is supported.
-#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
-#define __NOEXCEPT noexcept
-#else
-#define __NOEXCEPT
-#endif
-
-#if !(defined(_OPENMP) && defined(__cplusplus))
 __DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
 __DEVICE__ long abs(long __n) { return ::labs(__n); }
 __DEVICE__ float abs(float __x) { return ::fabsf(__x); }
 __DEVICE__ double abs(double __x) { return ::fabs(__x); }
-#endif
-// TODO: remove once variat is supported.
-#if defined(_OPENMP) && defined(__cplusplus)
-__DEVICE__ const float abs(const float __x) { return ::fabsf((float)__x); }
-__DEVICE__ const double abs(const double __x) { return ::fabs((double)__x); }
-#endif
 __DEVICE__ float acos(float __x) { return ::acosf(__x); }
 __DEVICE__ float asin(float __x) { return ::asinf(__x); }
 __DEVICE__ float atan(float __x) { return ::atanf(__x); }
@ -64,11 +50,9 @@ __DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
 __DEVICE__ float cos(float __x) { return ::cosf(__x); }
 __DEVICE__ float cosh(float __x) { return ::coshf(__x); }
 __DEVICE__ float exp(float __x) { return ::expf(__x); }
-__DEVICE__ float fabs(float __x) __NOEXCEPT { return ::fabsf(__x); }
+__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
 __DEVICE__ float floor(float __x) { return ::floorf(__x); }
 __DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
-// TODO: remove when variant is supported
-#ifndef _OPENMP
 __DEVICE__ int fpclassify(float __x) {
  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                              FP_ZERO, __x);
@ -77,14 +61,15 @@ __DEVICE__ int fpclassify(double __x) {
  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                              FP_ZERO, __x);
 }
-#endif
 __DEVICE__ float frexp(float __arg, int *__exp) {
  return ::frexpf(__arg, __exp);
 }

 // For inscrutable reasons, the CUDA headers define these functions for us on
-// Windows.
-#ifndef _MSC_VER
+// Windows. For OpenMP we omit these as some old system headers have
+// non-conforming `isinf(float)` and `isnan(float)` implementations that return
+// an `int`. The system versions of these functions should be fine anyway.
+#if !defined(_MSC_VER) && !defined(__OPENMP_NVPTX__)
 __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
 __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
 __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
@ -161,6 +146,8 @@ __DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
 // libdevice doesn't provide an implementation, and we don't want to be in the
 // business of implementing tricky libm functions in this header.

+#ifndef __OPENMP_NVPTX__
+
 // Now we've defined everything we promised we'd define in
 // __clang_cuda_math_forward_declares.h.  We need to do two additional things to
 // fix up our math functions.
@ -457,10 +444,7 @@ using ::remainderf;
 using ::remquof;
 using ::rintf;
 using ::roundf;
-// TODO: remove once variant is supported
-#ifndef _OPENMP
 using ::scalblnf;
-#endif
 using ::scalbnf;
 using ::sinf;
 using ::sinhf;
@ -479,7 +463,8 @@ _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
 #endif

-#undef __NOEXCEPT
+#endif // __OPENMP_NVPTX__
+
 #undef __DEVICE__

 #endif
--- a/lib/include/__clang_cuda_complex_builtins.h
+++ b/lib/include/__clang_cuda_complex_builtins.h
@ -13,10 +13,57 @@
 // This header defines __muldc3, __mulsc3, __divdc3, and __divsc3.  These are
 // libgcc functions that clang assumes are available when compiling c99 complex
 // operations.  (These implementations come from libc++, and have been modified
-// to work with CUDA.)
+// to work with CUDA and OpenMP target offloading [in C and C++ mode].)

-extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
-                                                      double __c, double __d) {
+#pragma push_macro("__DEVICE__")
+#ifdef _OPENMP
+#pragma omp declare target
+#define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
+#else
+#define __DEVICE__ __device__ inline
+#endif
+
+// To make the algorithms available for C and C++ in CUDA and OpenMP we select
+// different but equivalent function versions. TODO: For OpenMP we currently
+// select the native builtins as the overload support for templates is lacking.
+#if !defined(_OPENMP)
+#define _ISNANd std::isnan
+#define _ISNANf std::isnan
+#define _ISINFd std::isinf
+#define _ISINFf std::isinf
+#define _ISFINITEd std::isfinite
+#define _ISFINITEf std::isfinite
+#define _COPYSIGNd std::copysign
+#define _COPYSIGNf std::copysign
+#define _SCALBNd std::scalbn
+#define _SCALBNf std::scalbn
+#define _ABSd std::abs
+#define _ABSf std::abs
+#define _LOGBd std::logb
+#define _LOGBf std::logb
+#else
+#define _ISNANd __nv_isnand
+#define _ISNANf __nv_isnanf
+#define _ISINFd __nv_isinfd
+#define _ISINFf __nv_isinff
+#define _ISFINITEd __nv_isfinited
+#define _ISFINITEf __nv_finitef
+#define _COPYSIGNd __nv_copysign
+#define _COPYSIGNf __nv_copysignf
+#define _SCALBNd __nv_scalbn
+#define _SCALBNf __nv_scalbnf
+#define _ABSd __nv_fabs
+#define _ABSf __nv_fabsf
+#define _LOGBd __nv_logb
+#define _LOGBf __nv_logbf
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+__DEVICE__ double _Complex __muldc3(double __a, double __b, double __c,
+                                    double __d) {
  double __ac = __a * __c;
  double __bd = __b * __d;
  double __ad = __a * __d;
@ -24,50 +71,49 @@ extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
  double _Complex z;
  __real__(z) = __ac - __bd;
  __imag__(z) = __ad + __bc;
-  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+  if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
    int __recalc = 0;
-    if (std::isinf(__a) || std::isinf(__b)) {
-      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
-      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
-      if (std::isnan(__c))
-        __c = std::copysign(0, __c);
-      if (std::isnan(__d))
-        __d = std::copysign(0, __d);
+    if (_ISINFd(__a) || _ISINFd(__b)) {
+      __a = _COPYSIGNd(_ISINFd(__a) ? 1 : 0, __a);
+      __b = _COPYSIGNd(_ISINFd(__b) ? 1 : 0, __b);
+      if (_ISNANd(__c))
+        __c = _COPYSIGNd(0, __c);
+      if (_ISNANd(__d))
+        __d = _COPYSIGNd(0, __d);
      __recalc = 1;
    }
-    if (std::isinf(__c) || std::isinf(__d)) {
-      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
-      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
-      if (std::isnan(__a))
-        __a = std::copysign(0, __a);
-      if (std::isnan(__b))
-        __b = std::copysign(0, __b);
+    if (_ISINFd(__c) || _ISINFd(__d)) {
+      __c = _COPYSIGNd(_ISINFd(__c) ? 1 : 0, __c);
+      __d = _COPYSIGNd(_ISINFd(__d) ? 1 : 0, __d);
+      if (_ISNANd(__a))
+        __a = _COPYSIGNd(0, __a);
+      if (_ISNANd(__b))
+        __b = _COPYSIGNd(0, __b);
      __recalc = 1;
    }
-    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
-                      std::isinf(__ad) || std::isinf(__bc))) {
-      if (std::isnan(__a))
-        __a = std::copysign(0, __a);
-      if (std::isnan(__b))
-        __b = std::copysign(0, __b);
-      if (std::isnan(__c))
-        __c = std::copysign(0, __c);
-      if (std::isnan(__d))
-        __d = std::copysign(0, __d);
+    if (!__recalc &&
+        (_ISINFd(__ac) || _ISINFd(__bd) || _ISINFd(__ad) || _ISINFd(__bc))) {
+      if (_ISNANd(__a))
+        __a = _COPYSIGNd(0, __a);
+      if (_ISNANd(__b))
+        __b = _COPYSIGNd(0, __b);
+      if (_ISNANd(__c))
+        __c = _COPYSIGNd(0, __c);
+      if (_ISNANd(__d))
+        __d = _COPYSIGNd(0, __d);
      __recalc = 1;
    }
    if (__recalc) {
      // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
      // a device overload (and isn't constexpr before C++11, naturally).
-      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
-      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+      __real__(z) = __builtin_huge_val() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_val() * (__a * __d + __b * __c);
    }
  }
  return z;
 }

-extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
-                                                     float __c, float __d) {
+__DEVICE__ float _Complex __mulsc3(float __a, float __b, float __c, float __d) {
  float __ac = __a * __c;
  float __bd = __b * __d;
  float __ad = __a * __d;
@ -75,36 +121,36 @@ extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
  float _Complex z;
  __real__(z) = __ac - __bd;
  __imag__(z) = __ad + __bc;
-  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+  if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
    int __recalc = 0;
-    if (std::isinf(__a) || std::isinf(__b)) {
-      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
-      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
-      if (std::isnan(__c))
-        __c = std::copysign(0, __c);
-      if (std::isnan(__d))
-        __d = std::copysign(0, __d);
+    if (_ISINFf(__a) || _ISINFf(__b)) {
+      __a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
+      __b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
+      if (_ISNANf(__c))
+        __c = _COPYSIGNf(0, __c);
+      if (_ISNANf(__d))
+        __d = _COPYSIGNf(0, __d);
      __recalc = 1;
    }
-    if (std::isinf(__c) || std::isinf(__d)) {
-      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
-      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
-      if (std::isnan(__a))
-        __a = std::copysign(0, __a);
-      if (std::isnan(__b))
-        __b = std::copysign(0, __b);
+    if (_ISINFf(__c) || _ISINFf(__d)) {
+      __c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
+      __d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
+      if (_ISNANf(__a))
+        __a = _COPYSIGNf(0, __a);
+      if (_ISNANf(__b))
+        __b = _COPYSIGNf(0, __b);
      __recalc = 1;
    }
-    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
-                      std::isinf(__ad) || std::isinf(__bc))) {
-      if (std::isnan(__a))
-        __a = std::copysign(0, __a);
-      if (std::isnan(__b))
-        __b = std::copysign(0, __b);
-      if (std::isnan(__c))
-        __c = std::copysign(0, __c);
-      if (std::isnan(__d))
-        __d = std::copysign(0, __d);
+    if (!__recalc &&
+        (_ISINFf(__ac) || _ISINFf(__bd) || _ISINFf(__ad) || _ISINFf(__bc))) {
+      if (_ISNANf(__a))
+        __a = _COPYSIGNf(0, __a);
+      if (_ISNANf(__b))
+        __b = _COPYSIGNf(0, __b);
+      if (_ISNANf(__c))
+        __c = _COPYSIGNf(0, __c);
+      if (_ISNANf(__d))
+        __d = _COPYSIGNf(0, __d);
      __recalc = 1;
    }
    if (__recalc) {
@ -115,36 +161,36 @@ extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
  return z;
 }

-extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
-                                                      double __c, double __d) {
+__DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,
+                                    double __d) {
  int __ilogbw = 0;
  // Can't use std::max, because that's defined in <algorithm>, and we don't
  // want to pull that in for every compile.  The CUDA headers define
  // ::max(float, float) and ::max(double, double), which is sufficient for us.
-  double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
-  if (std::isfinite(__logbw)) {
+  double __logbw = _LOGBd(max(_ABSd(__c), _ABSd(__d)));
+  if (_ISFINITEd(__logbw)) {
    __ilogbw = (int)__logbw;
-    __c = std::scalbn(__c, -__ilogbw);
-    __d = std::scalbn(__d, -__ilogbw);
+    __c = _SCALBNd(__c, -__ilogbw);
+    __d = _SCALBNd(__d, -__ilogbw);
  }
  double __denom = __c * __c + __d * __d;
  double _Complex z;
-  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
-  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
-  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
-    if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
-      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
-      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
-    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
-               std::isfinite(__d)) {
-      __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
-      __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
-      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
-      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
-    } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
-               std::isfinite(__b)) {
-      __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
-      __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
+  __real__(z) = _SCALBNd((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = _SCALBNd((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
+    if ((__denom == 0.0) && (!_ISNANd(__a) || !_ISNANd(__b))) {
+      __real__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __a;
+      __imag__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __b;
+    } else if ((_ISINFd(__a) || _ISINFd(__b)) && _ISFINITEd(__c) &&
+               _ISFINITEd(__d)) {
+      __a = _COPYSIGNd(_ISINFd(__a) ? 1.0 : 0.0, __a);
+      __b = _COPYSIGNd(_ISINFd(__b) ? 1.0 : 0.0, __b);
+      __real__(z) = __builtin_huge_val() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_val() * (__b * __c - __a * __d);
+    } else if (_ISINFd(__logbw) && __logbw > 0.0 && _ISFINITEd(__a) &&
+               _ISFINITEd(__b)) {
+      __c = _COPYSIGNd(_ISINFd(__c) ? 1.0 : 0.0, __c);
+      __d = _COPYSIGNd(_ISINFd(__d) ? 1.0 : 0.0, __d);
      __real__(z) = 0.0 * (__a * __c + __b * __d);
      __imag__(z) = 0.0 * (__b * __c - __a * __d);
    }
@ -152,33 +198,32 @@ extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
  return z;
 }

-extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
-                                                     float __c, float __d) {
+__DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
  int __ilogbw = 0;
-  float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
-  if (std::isfinite(__logbw)) {
+  float __logbw = _LOGBf(max(_ABSf(__c), _ABSf(__d)));
+  if (_ISFINITEf(__logbw)) {
    __ilogbw = (int)__logbw;
-    __c = std::scalbn(__c, -__ilogbw);
-    __d = std::scalbn(__d, -__ilogbw);
+    __c = _SCALBNf(__c, -__ilogbw);
+    __d = _SCALBNf(__d, -__ilogbw);
  }
  float __denom = __c * __c + __d * __d;
  float _Complex z;
-  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
-  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
-  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
-    if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
-      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
-      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
-    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
-               std::isfinite(__d)) {
-      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
-      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+  __real__(z) = _SCALBNf((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = _SCALBNf((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
+    if ((__denom == 0) && (!_ISNANf(__a) || !_ISNANf(__b))) {
+      __real__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __b;
+    } else if ((_ISINFf(__a) || _ISINFf(__b)) && _ISFINITEf(__c) &&
+               _ISFINITEf(__d)) {
+      __a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
+      __b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
-    } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
-               std::isfinite(__b)) {
-      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
-      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+    } else if (_ISINFf(__logbw) && __logbw > 0 && _ISFINITEf(__a) &&
+               _ISFINITEf(__b)) {
+      __c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
+      __d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
      __real__(z) = 0 * (__a * __c + __b * __d);
      __imag__(z) = 0 * (__b * __c - __a * __d);
    }
@ -186,4 +231,29 @@ extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
  return z;
 }

+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#undef _ISNANd
+#undef _ISNANf
+#undef _ISINFd
+#undef _ISINFf
+#undef _COPYSIGNd
+#undef _COPYSIGNf
+#undef _ISFINITEd
+#undef _ISFINITEf
+#undef _SCALBNd
+#undef _SCALBNf
+#undef _ABSd
+#undef _ABSf
+#undef _LOGBd
+#undef _LOGBf
+
+#ifdef _OPENMP
+#pragma omp end declare target
+#endif
+
+#pragma pop_macro("__DEVICE__")
+
 #endif // __CLANG_CUDA_COMPLEX_BUILTINS
--- a/lib/include/__clang_cuda_device_functions.h
+++ b/lib/include/__clang_cuda_device_functions.h
@ -10,7 +10,7 @@
 #ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__
 #define __CLANG_CUDA_DEVICE_FUNCTIONS_H__

-#ifndef _OPENMP
+#ifndef __OPENMP_NVPTX__
 #if CUDA_VERSION < 9000
 #error This file is intended to be used with CUDA-9+ only.
 #endif
@ -20,32 +20,12 @@
 // we implement in this file. We need static in order to avoid emitting unused
 // functions and __forceinline__ helps inlining these wrappers at -O1.
 #pragma push_macro("__DEVICE__")
-#ifdef _OPENMP
-#define __DEVICE__ static __attribute__((always_inline))
+#ifdef __OPENMP_NVPTX__
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
 #else
 #define __DEVICE__ static __device__ __forceinline__
 #endif

-// libdevice provides fast low precision and slow full-recision implementations
-// for some functions. Which one gets selected depends on
-// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
-// -ffast-math or -fcuda-approx-transcendentals are in effect.
-#pragma push_macro("__FAST_OR_SLOW")
-#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
-#define __FAST_OR_SLOW(fast, slow) fast
-#else
-#define __FAST_OR_SLOW(fast, slow) slow
-#endif
-
-// For C++ 17 we need to include noexcept attribute to be compatible
-// with the header-defined version. This may be removed once
-// variant is supported.
-#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
-#define __NOEXCEPT noexcept
-#else
-#define __NOEXCEPT
-#endif
-
 __DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); }
 __DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); }
 __DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); }
@ -359,10 +339,10 @@ __DEVICE__ int __iAtomicAdd(int *__p, int __v) {
  return __nvvm_atom_add_gen_i(__p, __v);
 }
 __DEVICE__ int __iAtomicAdd_block(int *__p, int __v) {
-  __nvvm_atom_cta_add_gen_i(__p, __v);
+  return __nvvm_atom_cta_add_gen_i(__p, __v);
 }
 __DEVICE__ int __iAtomicAdd_system(int *__p, int __v) {
-  __nvvm_atom_sys_add_gen_i(__p, __v);
+  return __nvvm_atom_sys_add_gen_i(__p, __v);
 }
 __DEVICE__ int __iAtomicAnd(int *__p, int __v) {
  return __nvvm_atom_and_gen_i(__p, __v);
@ -1483,152 +1463,17 @@ __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
  return r;
 }
 #endif // CUDA_VERSION >= 9020
-__DEVICE__ int abs(int __a) __NOEXCEPT { return __nv_abs(__a); }
-__DEVICE__ double fabs(double __a) __NOEXCEPT { return __nv_fabs(__a); }
-__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
-__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
-__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
-__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); }
-__DEVICE__ double asin(double __a) { return __nv_asin(__a); }
-__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); }
-__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); }
-__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); }
-__DEVICE__ double atan(double __a) { return __nv_atan(__a); }
-__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); }
-__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); }
-__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); }
-__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); }
-__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); }
-__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
-__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
-__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
-__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
-#ifndef _OPENMP
-__DEVICE__ int clock() { return __nvvm_read_ptx_sreg_clock(); }
+
+// For OpenMP we require the user to include <time.h> as we need to know what
+// clock_t is on the system.
+#ifndef __OPENMP_NVPTX__
+__DEVICE__ /* clock_t= */ int clock() { return __nvvm_read_ptx_sreg_clock(); }
+#endif
 __DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); }
-#endif
-__DEVICE__ double copysign(double __a, double __b) {
-  return __nv_copysign(__a, __b);
-}
-__DEVICE__ float copysignf(float __a, float __b) {
-  return __nv_copysignf(__a, __b);
-}
-__DEVICE__ double cos(double __a) { return __nv_cos(__a); }
-__DEVICE__ float cosf(float __a) {
-  return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
-}
-__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); }
-__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); }
-__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); }
-__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); }
-__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); }
-__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); }
-__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); }
-__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); }
-__DEVICE__ double erf(double __a) { return __nv_erf(__a); }
-__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); }
-__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); }
-__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); }
-__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); }
-__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); }
-__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); }
-__DEVICE__ float erff(float __a) { return __nv_erff(__a); }
-__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); }
-__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); }
-__DEVICE__ double exp(double __a) { return __nv_exp(__a); }
-__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); }
-__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); }
-__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); }
-__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
-__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
-__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
-__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
-__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
-__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
-__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
-__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
-__DEVICE__ float fdividef(float __a, float __b) {
-#if __FAST_MATH__ && !__CUDA_PREC_DIV
-  return __nv_fast_fdividef(__a, __b);
-#else
-  return __a / __b;
-#endif
-}
-__DEVICE__ double floor(double __f) { return __nv_floor(__f); }
-__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); }
-__DEVICE__ double fma(double __a, double __b, double __c) {
-  return __nv_fma(__a, __b, __c);
-}
-__DEVICE__ float fmaf(float __a, float __b, float __c) {
-  return __nv_fmaf(__a, __b, __c);
-}
-__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); }
-__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); }
-__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); }
-__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); }
-__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); }
-__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); }
-__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); }
-__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); }
-__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); }
-__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); }
-__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); }
-__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); }
-__DEVICE__ double j0(double __a) { return __nv_j0(__a); }
-__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); }
-__DEVICE__ double j1(double __a) { return __nv_j1(__a); }
-__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
-__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
-__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
-#if defined(__LP64__) || defined(_WIN64)
-__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_llabs(__a); };
-#else
-__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_abs(__a); };
-#endif
-__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
-__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
-__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
-__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
-__DEVICE__ long long llabs(long long __a) __NOEXCEPT { return __nv_llabs(__a); }
-__DEVICE__ long long llmax(long long __a, long long __b) {
-  return __nv_llmax(__a, __b);
-}
-__DEVICE__ long long llmin(long long __a, long long __b) {
-  return __nv_llmin(__a, __b);
-}
-__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
-__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
-__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
-__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
-__DEVICE__ double log(double __a) { return __nv_log(__a); }
-__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
-__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
-__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); }
-__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); }
-__DEVICE__ double log2(double __a) { return __nv_log2(__a); }
-__DEVICE__ float log2f(float __a) {
-  return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a);
-}
-__DEVICE__ double logb(double __a) { return __nv_logb(__a); }
-__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
-__DEVICE__ float logf(float __a) {
-  return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
-}
-#if defined(__LP64__) || defined(_WIN64)
-__DEVICE__ long lrint(double __a) { return llrint(__a); }
-__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
-__DEVICE__ long lround(double __a) { return llround(__a); }
-__DEVICE__ long lroundf(float __a) { return llroundf(__a); }
-#else
-__DEVICE__ long lrint(double __a) { return (long)rint(__a); }
-__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); }
-__DEVICE__ long lround(double __a) { return round(__a); }
-__DEVICE__ long lroundf(float __a) { return roundf(__a); }
-#endif
-__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
+
 // These functions shouldn't be declared when including this header
 // for math function resolution purposes.
-#ifndef _OPENMP
+#ifndef __OPENMP_NVPTX__
 __DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) {
  return __builtin_memcpy(__a, __b, __c);
 }
@ -1636,158 +1481,6 @@ __DEVICE__ void *memset(void *__a, int __b, size_t __c) {
  return __builtin_memset(__a, __b, __c);
 }
 #endif
-__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
-__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
-__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
-__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); }
-__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); }
-__DEVICE__ double nextafter(double __a, double __b) {
-  return __nv_nextafter(__a, __b);
-}
-__DEVICE__ float nextafterf(float __a, float __b) {
-  return __nv_nextafterf(__a, __b);
-}
-__DEVICE__ double norm(int __dim, const double *__t) {
-  return __nv_norm(__dim, __t);
-}
-__DEVICE__ double norm3d(double __a, double __b, double __c) {
-  return __nv_norm3d(__a, __b, __c);
-}
-__DEVICE__ float norm3df(float __a, float __b, float __c) {
-  return __nv_norm3df(__a, __b, __c);
-}
-__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) {
-  return __nv_norm4d(__a, __b, __c, __d);
-}
-__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) {
-  return __nv_norm4df(__a, __b, __c, __d);
-}
-__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); }
-__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); }
-__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); }
-__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); }
-__DEVICE__ float normf(int __dim, const float *__t) {
-  return __nv_normf(__dim, __t);
-}
-__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); }
-__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); }
-__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); }
-__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); }
-__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); }
-__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); }
-__DEVICE__ double remainder(double __a, double __b) {
-  return __nv_remainder(__a, __b);
-}
-__DEVICE__ float remainderf(float __a, float __b) {
-  return __nv_remainderf(__a, __b);
-}
-__DEVICE__ double remquo(double __a, double __b, int *__c) {
-  return __nv_remquo(__a, __b, __c);
-}
-__DEVICE__ float remquof(float __a, float __b, int *__c) {
-  return __nv_remquof(__a, __b, __c);
-}
-__DEVICE__ double rhypot(double __a, double __b) {
-  return __nv_rhypot(__a, __b);
-}
-__DEVICE__ float rhypotf(float __a, float __b) {
-  return __nv_rhypotf(__a, __b);
-}
-__DEVICE__ double rint(double __a) { return __nv_rint(__a); }
-__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); }
-__DEVICE__ double rnorm(int __a, const double *__b) {
-  return __nv_rnorm(__a, __b);
-}
-__DEVICE__ double rnorm3d(double __a, double __b, double __c) {
-  return __nv_rnorm3d(__a, __b, __c);
-}
-__DEVICE__ float rnorm3df(float __a, float __b, float __c) {
-  return __nv_rnorm3df(__a, __b, __c);
-}
-__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) {
-  return __nv_rnorm4d(__a, __b, __c, __d);
-}
-__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
-  return __nv_rnorm4df(__a, __b, __c, __d);
-}
-__DEVICE__ float rnormf(int __dim, const float *__t) {
-  return __nv_rnormf(__dim, __t);
-}
-__DEVICE__ double round(double __a) { return __nv_round(__a); }
-__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
-__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
-__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
-__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
-__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
-// TODO: remove once variant is supported
-#ifndef _OPENMP
-__DEVICE__ double scalbln(double __a, long __b) {
-  if (__b > INT_MAX)
-    return __a > 0 ? HUGE_VAL : -HUGE_VAL;
-  if (__b < INT_MIN)
-    return __a > 0 ? 0.0 : -0.0;
-  return scalbn(__a, (int)__b);
-}
-__DEVICE__ float scalblnf(float __a, long __b) {
-  if (__b > INT_MAX)
-    return __a > 0 ? HUGE_VALF : -HUGE_VALF;
-  if (__b < INT_MIN)
-    return __a > 0 ? 0.f : -0.f;
-  return scalbnf(__a, (int)__b);
-}
-#endif
-__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
-__DEVICE__ void sincos(double __a, double *__s, double *__c) {
-  return __nv_sincos(__a, __s, __c);
-}
-__DEVICE__ void sincosf(float __a, float *__s, float *__c) {
-  return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
-}
-__DEVICE__ void sincospi(double __a, double *__s, double *__c) {
-  return __nv_sincospi(__a, __s, __c);
-}
-__DEVICE__ void sincospif(float __a, float *__s, float *__c) {
-  return __nv_sincospif(__a, __s, __c);
-}
-__DEVICE__ float sinf(float __a) {
-  return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
-}
-__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); }
-__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
-__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
-__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
-__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
-__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
-__DEVICE__ double tan(double __a) { return __nv_tan(__a); }
-__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
-__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
-__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); }
-__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); }
-__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); }
-__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); }
-__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); }
-__DEVICE__ unsigned long long ullmax(unsigned long long __a,
-                                     unsigned long long __b) {
-  return __nv_ullmax(__a, __b);
-}
-__DEVICE__ unsigned long long ullmin(unsigned long long __a,
-                                     unsigned long long __b) {
-  return __nv_ullmin(__a, __b);
-}
-__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) {
-  return __nv_umax(__a, __b);
-}
-__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) {
-  return __nv_umin(__a, __b);
-}
-__DEVICE__ double y0(double __a) { return __nv_y0(__a); }
-__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); }
-__DEVICE__ double y1(double __a) { return __nv_y1(__a); }
-__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
-__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
-__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }

-#undef __NOEXCEPT
 #pragma pop_macro("__DEVICE__")
-#pragma pop_macro("__FAST_OR_SLOW")
 #endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
--- a/lib/include/__clang_cuda_libdevice_declares.h
+++ b/lib/include/__clang_cuda_libdevice_declares.h
@ -14,7 +14,7 @@
 extern "C" {
 #endif

-#if defined(_OPENMP)
+#if defined(__OPENMP_NVPTX__)
 #define __DEVICE__
 #elif defined(__CUDA__)
 #define __DEVICE__ __device__
--- a/lib/include/__clang_cuda_math.h
+++ b/lib/include/__clang_cuda_math.h
@ -0,0 +1,347 @@
+/*===---- __clang_cuda_math.h - Device-side CUDA math support --------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_MATH_H__
+#define __CLANG_CUDA_MATH_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+#ifndef __OPENMP_NVPTX__
+#if CUDA_VERSION < 9000
+#error This file is intended to be used with CUDA-9+ only.
+#endif
+#endif
+
+// __DEVICE__ is a helper macro with common set of attributes for the wrappers
+// we implement in this file. We need static in order to avoid emitting unused
+// functions and __forceinline__ helps inlining these wrappers at -O1.
+#pragma push_macro("__DEVICE__")
+#ifdef __OPENMP_NVPTX__
+#if defined(__cplusplus)
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+#else
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
+#endif
+#else
+#define __DEVICE__ static __device__ __forceinline__
+#endif
+
+// Specialized version of __DEVICE__ for functions with void return type. Needed
+// because the OpenMP overlay requires constexpr functions here but prior to
+// c++14 void return functions could not be constexpr.
+#pragma push_macro("__DEVICE_VOID__")
+#ifdef __OPENMP_NVPTX__ && defined(__cplusplus) && __cplusplus < 201402L
+#define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
+#else
+#define __DEVICE_VOID__ __DEVICE__
+#endif
+
+// libdevice provides fast low precision and slow full-recision implementations
+// for some functions. Which one gets selected depends on
+// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
+// -ffast-math or -fcuda-approx-transcendentals are in effect.
+#pragma push_macro("__FAST_OR_SLOW")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __FAST_OR_SLOW(fast, slow) fast
+#else
+#define __FAST_OR_SLOW(fast, slow) slow
+#endif
+
+__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
+__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
+__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
+__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
+__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
+__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); }
+__DEVICE__ double asin(double __a) { return __nv_asin(__a); }
+__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); }
+__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); }
+__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); }
+__DEVICE__ double atan(double __a) { return __nv_atan(__a); }
+__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); }
+__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); }
+__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); }
+__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); }
+__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); }
+__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
+__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
+__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
+__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
+__DEVICE__ double copysign(double __a, double __b) {
+  return __nv_copysign(__a, __b);
+}
+__DEVICE__ float copysignf(float __a, float __b) {
+  return __nv_copysignf(__a, __b);
+}
+__DEVICE__ double cos(double __a) { return __nv_cos(__a); }
+__DEVICE__ float cosf(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
+}
+__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); }
+__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); }
+__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); }
+__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); }
+__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); }
+__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); }
+__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); }
+__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); }
+__DEVICE__ double erf(double __a) { return __nv_erf(__a); }
+__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); }
+__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); }
+__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); }
+__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); }
+__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); }
+__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); }
+__DEVICE__ float erff(float __a) { return __nv_erff(__a); }
+__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); }
+__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); }
+__DEVICE__ double exp(double __a) { return __nv_exp(__a); }
+__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); }
+__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); }
+__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); }
+__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
+__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
+__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
+__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
+__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
+__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
+__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
+__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
+__DEVICE__ float fdividef(float __a, float __b) {
+#if __FAST_MATH__ && !__CUDA_PREC_DIV
+  return __nv_fast_fdividef(__a, __b);
+#else
+  return __a / __b;
+#endif
+}
+__DEVICE__ double floor(double __f) { return __nv_floor(__f); }
+__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); }
+__DEVICE__ double fma(double __a, double __b, double __c) {
+  return __nv_fma(__a, __b, __c);
+}
+__DEVICE__ float fmaf(float __a, float __b, float __c) {
+  return __nv_fmaf(__a, __b, __c);
+}
+__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); }
+__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); }
+__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); }
+__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); }
+__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); }
+__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); }
+__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); }
+__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); }
+__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); }
+__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); }
+__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); }
+__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); }
+__DEVICE__ double j0(double __a) { return __nv_j0(__a); }
+__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); }
+__DEVICE__ double j1(double __a) { return __nv_j1(__a); }
+__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
+__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
+__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
+#if defined(__LP64__) || defined(_WIN64)
+__DEVICE__ long labs(long __a) { return __nv_llabs(__a); };
+#else
+__DEVICE__ long labs(long __a) { return __nv_abs(__a); };
+#endif
+__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
+__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
+__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
+__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
+__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); }
+__DEVICE__ long long llmax(long long __a, long long __b) {
+  return __nv_llmax(__a, __b);
+}
+__DEVICE__ long long llmin(long long __a, long long __b) {
+  return __nv_llmin(__a, __b);
+}
+__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
+__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
+__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
+__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
+__DEVICE__ double log(double __a) { return __nv_log(__a); }
+__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
+__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
+__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); }
+__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); }
+__DEVICE__ double log2(double __a) { return __nv_log2(__a); }
+__DEVICE__ float log2f(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a);
+}
+__DEVICE__ double logb(double __a) { return __nv_logb(__a); }
+__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
+__DEVICE__ float logf(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
+}
+#if defined(__LP64__) || defined(_WIN64)
+__DEVICE__ long lrint(double __a) { return llrint(__a); }
+__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
+__DEVICE__ long lround(double __a) { return llround(__a); }
+__DEVICE__ long lroundf(float __a) { return llroundf(__a); }
+#else
+__DEVICE__ long lrint(double __a) { return (long)rint(__a); }
+__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); }
+__DEVICE__ long lround(double __a) { return round(__a); }
+__DEVICE__ long lroundf(float __a) { return roundf(__a); }
+#endif
+__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
+__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
+__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
+__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
+__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); }
+__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); }
+__DEVICE__ double nextafter(double __a, double __b) {
+  return __nv_nextafter(__a, __b);
+}
+__DEVICE__ float nextafterf(float __a, float __b) {
+  return __nv_nextafterf(__a, __b);
+}
+__DEVICE__ double norm(int __dim, const double *__t) {
+  return __nv_norm(__dim, __t);
+}
+__DEVICE__ double norm3d(double __a, double __b, double __c) {
+  return __nv_norm3d(__a, __b, __c);
+}
+__DEVICE__ float norm3df(float __a, float __b, float __c) {
+  return __nv_norm3df(__a, __b, __c);
+}
+__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) {
+  return __nv_norm4d(__a, __b, __c, __d);
+}
+__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) {
+  return __nv_norm4df(__a, __b, __c, __d);
+}
+__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); }
+__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); }
+__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); }
+__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); }
+__DEVICE__ float normf(int __dim, const float *__t) {
+  return __nv_normf(__dim, __t);
+}
+__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); }
+__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); }
+__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); }
+__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); }
+__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); }
+__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); }
+__DEVICE__ double remainder(double __a, double __b) {
+  return __nv_remainder(__a, __b);
+}
+__DEVICE__ float remainderf(float __a, float __b) {
+  return __nv_remainderf(__a, __b);
+}
+__DEVICE__ double remquo(double __a, double __b, int *__c) {
+  return __nv_remquo(__a, __b, __c);
+}
+__DEVICE__ float remquof(float __a, float __b, int *__c) {
+  return __nv_remquof(__a, __b, __c);
+}
+__DEVICE__ double rhypot(double __a, double __b) {
+  return __nv_rhypot(__a, __b);
+}
+__DEVICE__ float rhypotf(float __a, float __b) {
+  return __nv_rhypotf(__a, __b);
+}
+__DEVICE__ double rint(double __a) { return __nv_rint(__a); }
+__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); }
+__DEVICE__ double rnorm(int __a, const double *__b) {
+  return __nv_rnorm(__a, __b);
+}
+__DEVICE__ double rnorm3d(double __a, double __b, double __c) {
+  return __nv_rnorm3d(__a, __b, __c);
+}
+__DEVICE__ float rnorm3df(float __a, float __b, float __c) {
+  return __nv_rnorm3df(__a, __b, __c);
+}
+__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) {
+  return __nv_rnorm4d(__a, __b, __c, __d);
+}
+__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
+  return __nv_rnorm4df(__a, __b, __c, __d);
+}
+__DEVICE__ float rnormf(int __dim, const float *__t) {
+  return __nv_rnormf(__dim, __t);
+}
+__DEVICE__ double round(double __a) { return __nv_round(__a); }
+__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
+__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
+__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
+__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
+__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
+__DEVICE__ double scalbln(double __a, long __b) {
+  if (__b > INT_MAX)
+    return __a > 0 ? HUGE_VAL : -HUGE_VAL;
+  if (__b < INT_MIN)
+    return __a > 0 ? 0.0 : -0.0;
+  return scalbn(__a, (int)__b);
+}
+__DEVICE__ float scalblnf(float __a, long __b) {
+  if (__b > INT_MAX)
+    return __a > 0 ? HUGE_VALF : -HUGE_VALF;
+  if (__b < INT_MIN)
+    return __a > 0 ? 0.f : -0.f;
+  return scalbnf(__a, (int)__b);
+}
+__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
+__DEVICE_VOID__ void sincos(double __a, double *__s, double *__c) {
+  return __nv_sincos(__a, __s, __c);
+}
+__DEVICE_VOID__ void sincosf(float __a, float *__s, float *__c) {
+  return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
+}
+__DEVICE_VOID__ void sincospi(double __a, double *__s, double *__c) {
+  return __nv_sincospi(__a, __s, __c);
+}
+__DEVICE_VOID__ void sincospif(float __a, float *__s, float *__c) {
+  return __nv_sincospif(__a, __s, __c);
+}
+__DEVICE__ float sinf(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
+}
+__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); }
+__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
+__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
+__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
+__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
+__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
+__DEVICE__ double tan(double __a) { return __nv_tan(__a); }
+__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
+__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
+__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); }
+__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); }
+__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); }
+__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); }
+__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); }
+__DEVICE__ unsigned long long ullmax(unsigned long long __a,
+                                     unsigned long long __b) {
+  return __nv_ullmax(__a, __b);
+}
+__DEVICE__ unsigned long long ullmin(unsigned long long __a,
+                                     unsigned long long __b) {
+  return __nv_ullmin(__a, __b);
+}
+__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) {
+  return __nv_umax(__a, __b);
+}
+__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) {
+  return __nv_umin(__a, __b);
+}
+__DEVICE__ double y0(double __a) { return __nv_y0(__a); }
+__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); }
+__DEVICE__ double y1(double __a) { return __nv_y1(__a); }
+__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
+__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
+__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
+
+#pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__DEVICE_VOID__")
+#pragma pop_macro("__FAST_OR_SLOW")
+
+#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
--- a/lib/include/__clang_cuda_math_forward_declares.h
+++ b/lib/include/__clang_cuda_math_forward_declares.h
@ -8,8 +8,8 @@
 */
 #ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
 #define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
-#ifndef __CUDA__
-#error "This file is for CUDA compilation only."
+#if !defined(__CUDA__) && !__HIP__
+#error "This file is for CUDA/HIP compilation only."
 #endif

 // This file forward-declares of some math functions we (or the CUDA headers)
@ -20,37 +20,14 @@
 // would preclude the use of our own __device__ overloads for these functions.

 #pragma push_macro("__DEVICE__")
-#ifdef _OPENMP
-#define __DEVICE__ static __inline__ __attribute__((always_inline))
-#else
 #define __DEVICE__                                                             \
  static __inline__ __attribute__((always_inline)) __attribute__((device))
-#endif

-// For C++ 17 we need to include noexcept attribute to be compatible
-// with the header-defined version. This may be removed once
-// variant is supported.
-#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
-#define __NOEXCEPT noexcept
-#else
-#define __NOEXCEPT
-#endif
-
-#if !(defined(_OPENMP) && defined(__cplusplus))
 __DEVICE__ long abs(long);
 __DEVICE__ long long abs(long long);
 __DEVICE__ double abs(double);
 __DEVICE__ float abs(float);
-#endif
-// While providing the CUDA declarations and definitions for math functions,
-// we may manually define additional functions.
-// TODO: Once variant is supported the additional functions will have
-// to be removed.
-#if defined(_OPENMP) && defined(__cplusplus)
-__DEVICE__ const double abs(const double);
-__DEVICE__ const float abs(const float);
-#endif
-__DEVICE__ int abs(int) __NOEXCEPT;
+__DEVICE__ int abs(int);
 __DEVICE__ double acos(double);
 __DEVICE__ float acos(float);
 __DEVICE__ double acosh(double);
@ -85,8 +62,8 @@ __DEVICE__ double exp(double);
 __DEVICE__ float exp(float);
 __DEVICE__ double expm1(double);
 __DEVICE__ float expm1(float);
-__DEVICE__ double fabs(double) __NOEXCEPT;
-__DEVICE__ float fabs(float) __NOEXCEPT;
+__DEVICE__ double fabs(double);
+__DEVICE__ float fabs(float);
 __DEVICE__ double fdim(double, double);
 __DEVICE__ float fdim(float, float);
 __DEVICE__ double floor(double);
@ -136,12 +113,12 @@ __DEVICE__ bool isnormal(double);
 __DEVICE__ bool isnormal(float);
 __DEVICE__ bool isunordered(double, double);
 __DEVICE__ bool isunordered(float, float);
-__DEVICE__ long labs(long) __NOEXCEPT;
+__DEVICE__ long labs(long);
 __DEVICE__ double ldexp(double, int);
 __DEVICE__ float ldexp(float, int);
 __DEVICE__ double lgamma(double);
 __DEVICE__ float lgamma(float);
-__DEVICE__ long long llabs(long long) __NOEXCEPT;
+__DEVICE__ long long llabs(long long);
 __DEVICE__ long long llrint(double);
 __DEVICE__ long long llrint(float);
 __DEVICE__ double log10(double);
@ -152,9 +129,6 @@ __DEVICE__ double log2(double);
 __DEVICE__ float log2(float);
 __DEVICE__ double logb(double);
 __DEVICE__ float logb(float);
-#if defined(_OPENMP) && defined(__cplusplus)
-__DEVICE__ long double log(long double);
-#endif
 __DEVICE__ double log(double);
 __DEVICE__ float log(float);
 __DEVICE__ long lrint(double);
@ -302,7 +276,6 @@ _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
 #endif

-#undef __NOEXCEPT
 #pragma pop_macro("__DEVICE__")

 #endif
--- a/lib/include/__clang_cuda_runtime_wrapper.h
+++ b/lib/include/__clang_cuda_runtime_wrapper.h
@ -31,11 +31,17 @@
 // Include some forward declares that must come before cmath.
 #include <__clang_cuda_math_forward_declares.h>

+// Define __CUDACC__ early as libstdc++ standard headers with GNU extensions
+// enabled depend on it to avoid using __float128, which is unsupported in
+// CUDA.
+#define __CUDACC__
+
 // Include some standard headers to avoid CUDA headers including them
 // while some required macros (like __THROW) are in a weird state.
 #include <cmath>
 #include <cstdlib>
 #include <stdlib.h>
+#undef __CUDACC__

 // Preserve common macros that will be changed below by us or by CUDA
 // headers.
@ -83,13 +89,15 @@
 #if CUDA_VERSION < 9000
 #define __CUDABE__
 #else
+#define __CUDACC__
 #define __CUDA_LIBDEVICE__
 #endif
 // Disables definitions of device-side runtime support stubs in
 // cuda_device_runtime_api.h
+#include "host_defines.h"
+#undef __CUDACC__
 #include "driver_types.h"
 #include "host_config.h"
-#include "host_defines.h"

 // Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
 // cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the
@ -141,11 +149,12 @@ inline __host__ double __signbitd(double x) {
 // to provide our own.
 #include <__clang_cuda_libdevice_declares.h>

-// Wrappers for many device-side standard library functions became compiler
-// builtins in CUDA-9 and have been removed from the CUDA headers. Clang now
-// provides its own implementation of the wrappers.
+// Wrappers for many device-side standard library functions, incl. math
+// functions, became compiler builtins in CUDA-9 and have been removed from the
+// CUDA headers. Clang now provides its own implementation of the wrappers.
 #if CUDA_VERSION >= 9000
 #include <__clang_cuda_device_functions.h>
+#include <__clang_cuda_math.h>
 #endif

 // __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
--- a/lib/include/__clang_hip_libdevice_declares.h
+++ b/lib/include/__clang_hip_libdevice_declares.h
@ -0,0 +1,326 @@
+/*===---- __clang_hip_libdevice_declares.h - HIP device library decls -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
+#define __CLANG_HIP_LIBDEVICE_DECLARES_H__
+
+extern "C" {
+
+// BEGIN FLOAT
+__device__ __attribute__((const)) float __ocml_acos_f32(float);
+__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
+__device__ __attribute__((const)) float __ocml_asin_f32(float);
+__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
+__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
+__device__ __attribute__((const)) float __ocml_atan_f32(float);
+__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
+__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_ceil_f32(float);
+__device__ __attribute__((const)) __device__ float __ocml_copysign_f32(float,
+                                                                       float);
+__device__ float __ocml_cos_f32(float);
+__device__ float __ocml_native_cos_f32(float);
+__device__ __attribute__((pure)) __device__ float __ocml_cosh_f32(float);
+__device__ float __ocml_cospi_f32(float);
+__device__ float __ocml_i0_f32(float);
+__device__ float __ocml_i1_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
+__device__ __attribute__((pure)) float __ocml_erf_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
+__device__ __attribute__((const)) float __ocml_fabs_f32(float);
+__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
+__device__ __attribute__((const)) float __ocml_floor_f32(float);
+__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
+__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
+__device__ __attribute__((const)) __device__ float __ocml_fmod_f32(float,
+                                                                   float);
+__device__ float __ocml_frexp_f32(float,
+                                  __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
+__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
+__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
+__device__ __attribute__((const)) int __ocml_isinf_f32(float);
+__device__ __attribute__((const)) int __ocml_isnan_f32(float);
+__device__ float __ocml_j0_f32(float);
+__device__ float __ocml_j1_f32(float);
+__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
+__device__ float __ocml_lgamma_f32(float);
+__device__ __attribute__((pure)) float __ocml_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
+__device__ __attribute__((pure)) float __ocml_log2_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
+__device__ __attribute__((const)) float __ocml_logb_f32(float);
+__device__ __attribute__((pure)) float __ocml_log_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
+__device__ float __ocml_modf_f32(float,
+                                 __attribute__((address_space(5))) float *);
+__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
+__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
+__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
+                                                        float);
+__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
+__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
+__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
+__device__ float __ocml_remquo_f32(float, float,
+                                   __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
+__device__ __attribute__((const)) float __ocml_rint_f32(float);
+__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
+                                                         float);
+__device__ __attribute__((const)) float __ocml_round_f32(float);
+__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
+__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
+__device__ __attribute__((const)) int __ocml_signbit_f32(float);
+__device__ float __ocml_sincos_f32(float,
+                                   __attribute__((address_space(5))) float *);
+__device__ float __ocml_sincospi_f32(float,
+                                     __attribute__((address_space(5))) float *);
+__device__ float __ocml_sin_f32(float);
+__device__ float __ocml_native_sin_f32(float);
+__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
+__device__ float __ocml_sinpi_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
+__device__ float __ocml_tan_f32(float);
+__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
+__device__ float __ocml_tgamma_f32(float);
+__device__ __attribute__((const)) float __ocml_trunc_f32(float);
+__device__ float __ocml_y0_f32(float);
+__device__ float __ocml_y1_f32(float);
+
+// BEGIN INTRINSICS
+__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
+
+__device__ __attribute__((const)) float
+__llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
+__device__ __attribute__((const)) float
+__llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
+__device__ __attribute__((const)) float
+__llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
+__device__ __attribute__((const)) float
+__llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__device__ __attribute__((const)) double __ocml_acos_f64(double);
+__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
+__device__ __attribute__((const)) double __ocml_asin_f64(double);
+__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
+__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
+__device__ __attribute__((const)) double __ocml_atan_f64(double);
+__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
+__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_ceil_f64(double);
+__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
+__device__ double __ocml_cos_f64(double);
+__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
+__device__ double __ocml_cospi_f64(double);
+__device__ double __ocml_i0_f64(double);
+__device__ double __ocml_i1_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
+__device__ __attribute__((pure)) double __ocml_erf_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp_f64(double);
+__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
+__device__ __attribute__((const)) double __ocml_fabs_f64(double);
+__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
+__device__ __attribute__((const)) double __ocml_floor_f64(double);
+__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
+__device__ double __ocml_frexp_f64(double,
+                                   __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
+__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
+__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
+__device__ __attribute__((const)) int __ocml_isinf_f64(double);
+__device__ __attribute__((const)) int __ocml_isnan_f64(double);
+__device__ double __ocml_j0_f64(double);
+__device__ double __ocml_j1_f64(double);
+__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
+__device__ double __ocml_lgamma_f64(double);
+__device__ __attribute__((pure)) double __ocml_log10_f64(double);
+__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
+__device__ __attribute__((pure)) double __ocml_log2_f64(double);
+__device__ __attribute__((const)) double __ocml_logb_f64(double);
+__device__ __attribute__((pure)) double __ocml_log_f64(double);
+__device__ double __ocml_modf_f64(double,
+                                  __attribute__((address_space(5))) double *);
+__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
+__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
+__device__ __attribute__((const)) double __ocml_len3_f64(double, double,
+                                                         double);
+__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
+                                                         double);
+__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
+__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
+__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
+__device__ double __ocml_remquo_f64(double, double,
+                                    __attribute__((address_space(5))) int *);
+__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
+__device__ __attribute__((const)) double __ocml_rint_f64(double);
+__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double,
+                                                          double);
+__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double,
+                                                          double, double);
+__device__ __attribute__((const)) double __ocml_round_f64(double);
+__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
+__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
+__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
+__device__ __attribute__((const)) int __ocml_signbit_f64(double);
+__device__ double __ocml_sincos_f64(double,
+                                    __attribute__((address_space(5))) double *);
+__device__ double
+__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
+__device__ double __ocml_sin_f64(double);
+__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
+__device__ double __ocml_sinpi_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
+__device__ double __ocml_tan_f64(double);
+__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
+__device__ double __ocml_tgamma_f64(double);
+__device__ __attribute__((const)) double __ocml_trunc_f64(double);
+__device__ double __ocml_y0_f64(double);
+__device__ double __ocml_y1_f64(double);
+
+// BEGIN INTRINSICS
+__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
+                                                            double);
+__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
+                                                            double);
+__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
+                                                            double);
+__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
+                                                            double);
+
+__device__ __attribute__((const)) double
+__llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
+__device__ __attribute__((const)) double
+__llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
+
+__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
+__device__ _Float16 __ocml_cos_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
+                                                          _Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
+__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
+__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
+__device__ _Float16 __ocml_sin_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
+
+typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
+typedef short __2i16 __attribute__((ext_vector_type(2)));
+
+__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
+                                                     float c, bool s);
+__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
+__device__ __2f16 __ocml_cos_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
+__device__ __attribute__((const))
+__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
+__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
+__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
+__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
+__device__ inline __2f16
+__llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL.
+{
+  return __2f16{__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)};
+}
+__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
+__device__ __2f16 __ocml_sin_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
+
+} // extern "C"
+
+#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__
--- a/lib/include/__clang_hip_math.h
+++ b/lib/include/__clang_hip_math.h
--- a/lib/include/__clang_hip_runtime_wrapper.h
+++ b/lib/include/__clang_hip_runtime_wrapper.h
@ -0,0 +1,64 @@
+/*===---- __clang_hip_runtime_wrapper.h - HIP runtime support ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * WARNING: This header is intended to be directly -include'd by
+ * the compiler and is not supposed to be included by users.
+ *
+ */
+
+#ifndef __CLANG_HIP_RUNTIME_WRAPPER_H__
+#define __CLANG_HIP_RUNTIME_WRAPPER_H__
+
+#if __HIP__
+
+#include <cmath>
+#include <cstdlib>
+#include <stdlib.h>
+
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+#if __HIP_ENABLE_DEVICE_MALLOC__
+extern "C" __device__ void *__hip_malloc(size_t __size);
+extern "C" __device__ void *__hip_free(void *__ptr);
+static inline __device__ void *malloc(size_t __size) {
+  return __hip_malloc(__size);
+}
+static inline __device__ void *free(void *__ptr) { return __hip_free(__ptr); }
+#else
+static inline __device__ void *malloc(size_t __size) {
+  __builtin_trap();
+  return nullptr;
+}
+static inline __device__ void *free(void *__ptr) {
+  __builtin_trap();
+  return nullptr;
+}
+#endif
+
+#include <__clang_hip_libdevice_declares.h>
+#include <__clang_hip_math.h>
+
+#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#include <__clang_cuda_math_forward_declares.h>
+#include <__clang_cuda_complex_builtins.h>
+
+#include <algorithm>
+#include <complex>
+#include <new>
+#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+
+#define __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 1
+
+#endif // __HIP__
+#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
--- a/lib/include/altivec.h
+++ b/lib/include/altivec.h
@ -16761,6 +16761,408 @@ static vector signed short __ATTRS_o_ai vec_nabs(vector signed short __a) {
 static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) {
  return __builtin_altivec_vminsb(__a, -__a);
 }
+
+#ifdef __POWER10_VECTOR__
+/* vec_pdep */
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_pdep(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vpdepd(__a, __b);
+}
+
+/* vec_pext */
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_pext(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vpextd(__a, __b);
+}
+
+/* vec_cfuge */
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cfuge(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vcfuged(__a, __b);
+}
+
+/* vec_gnb */
+
+#define vec_gnb(__a, __b) __builtin_altivec_vgnb(__a, __b)
+
+/* vec_ternarylogic */
+#ifdef __VSX__
+#define vec_ternarylogic(__a, __b, __c, __imm)                                 \
+  _Generic((__a), vector unsigned char                                         \
+           : __builtin_vsx_xxeval((vector unsigned long long)(__a),            \
+                                  (vector unsigned long long)(__b),            \
+                                  (vector unsigned long long)(__c), (__imm)),  \
+             vector unsigned short                                             \
+           : __builtin_vsx_xxeval((vector unsigned long long)(__a),            \
+                                  (vector unsigned long long)(__b),            \
+                                  (vector unsigned long long)(__c), (__imm)),  \
+             vector unsigned int                                               \
+           : __builtin_vsx_xxeval((vector unsigned long long)(__a),            \
+                                  (vector unsigned long long)(__b),            \
+                                  (vector unsigned long long)(__c), (__imm)),  \
+             vector unsigned long long                                         \
+           : __builtin_vsx_xxeval((vector unsigned long long)(__a),            \
+                                  (vector unsigned long long)(__b),            \
+                                  (vector unsigned long long)(__c), (__imm)),  \
+             vector unsigned __int128                                          \
+           : __builtin_vsx_xxeval((vector unsigned long long)(__a),            \
+                                  (vector unsigned long long)(__b),            \
+                                  (vector unsigned long long)(__c), (__imm)))
+#endif /* __VSX__ */
+
+/* vec_genpcvm */
+
+#ifdef __VSX__
+#define vec_genpcvm(__a, __imm)                                                \
+  _Generic((__a), vector unsigned char                                         \
+           : __builtin_vsx_xxgenpcvbm((__a), (int)(__imm)),                    \
+             vector unsigned short                                             \
+           : __builtin_vsx_xxgenpcvhm((__a), (int)(__imm)),                    \
+             vector unsigned int                                               \
+           : __builtin_vsx_xxgenpcvwm((__a), (int)(__imm)),                    \
+             vector unsigned long long                                         \
+           : __builtin_vsx_xxgenpcvdm((__a), (int)(__imm)))
+#endif /* __VSX__ */
+
+/* vec_clrl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_clrl(vector signed char __a, unsigned int __n) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclrrb(__a, __n);
+#else
+  return __builtin_altivec_vclrlb( __a, __n);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_clrl(vector unsigned char __a, unsigned int __n) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclrrb((vector signed char)__a, __n);
+#else
+  return __builtin_altivec_vclrlb((vector signed char)__a, __n);
+#endif
+}
+
+/* vec_clrr */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_clrr(vector signed char __a, unsigned int __n) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclrlb(__a, __n);
+#else
+  return __builtin_altivec_vclrrb( __a, __n);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_clrr(vector unsigned char __a, unsigned int __n) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclrlb((vector signed char)__a, __n);
+#else
+  return __builtin_altivec_vclrrb((vector signed char)__a, __n);
+#endif
+}
+
+/* vec_cntlzm */
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cntlzm(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vclzdm(__a, __b);
+}
+
+/* vec_cnttzm */
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cnttzm(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vctzdm(__a, __b);
+}
+
+/* vec_sldbi */
+
+#define vec_sldb(__a, __b, __c) __builtin_altivec_vsldbi(__a, __b, (__c & 0x7))
+
+/* vec_srdbi */
+
+#define vec_srdb(__a, __b, __c) __builtin_altivec_vsrdbi(__a, __b, (__c & 0x7))
+
+/* vec_insertl */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_insertl(unsigned char __a, vector unsigned char __b, unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinsbrx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinsblx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_insertl(unsigned short __a, vector unsigned short __b, unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinshrx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinshlx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_insertl(unsigned int __a, vector unsigned int __b, unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinswrx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinswlx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_insertl(unsigned long long __a, vector unsigned long long __b,
+            unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinsdrx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinsdlx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_insertl(vector unsigned char __a, vector unsigned char __b,
+            unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinsbvrx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinsbvlx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_insertl(vector unsigned short __a, vector unsigned short __b,
+            unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinshvrx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinshvlx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_insertl(vector unsigned int __a, vector unsigned int __b,
+            unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinswvrx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinswvlx(__b, __c, __a);
+#endif
+}
+
+/* vec_inserth */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_inserth(unsigned char __a, vector unsigned char __b, unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinsblx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinsbrx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_inserth(unsigned short __a, vector unsigned short __b, unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinshlx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinshrx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_inserth(unsigned int __a, vector unsigned int __b, unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinswlx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinswrx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_inserth(unsigned long long __a, vector unsigned long long __b,
+            unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinsdlx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinsdrx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_inserth(vector unsigned char __a, vector unsigned char __b,
+            unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinsbvlx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinsbvrx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_inserth(vector unsigned short __a, vector unsigned short __b,
+            unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinshvlx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinshvrx(__b, __c, __a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_inserth(vector unsigned int __a, vector unsigned int __b,
+            unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vinswvlx(__b, __c, __a);
+#else
+  return __builtin_altivec_vinswvrx(__b, __c, __a);
+#endif
+}
+
+#ifdef __VSX__
+
+/* vec_permx */
+
+#define vec_permx(__a, __b, __c, __d)                                          \
+  __builtin_vsx_xxpermx((__a), (__b), (__c), (__d))
+
+/* vec_blendv */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_blendv(vector signed char __a, vector signed char __b,
+           vector unsigned char __c) {
+  return __builtin_vsx_xxblendvb(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_blendv(vector unsigned char __a, vector unsigned char __b,
+           vector unsigned char __c) {
+  return __builtin_vsx_xxblendvb(__a, __b, __c);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_blendv(vector signed short __a, vector signed short __b,
+           vector unsigned short __c) {
+  return __builtin_vsx_xxblendvh(__a, __b, __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_blendv(vector unsigned short __a, vector unsigned short __b,
+           vector unsigned short __c) {
+  return __builtin_vsx_xxblendvh(__a, __b, __c);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_blendv(vector signed int __a, vector signed int __b,
+           vector unsigned int __c) {
+  return __builtin_vsx_xxblendvw(__a, __b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_blendv(vector unsigned int __a, vector unsigned int __b,
+           vector unsigned int __c) {
+  return __builtin_vsx_xxblendvw(__a, __b, __c);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_blendv(vector signed long long __a, vector signed long long __b,
+           vector unsigned long long __c) {
+  return __builtin_vsx_xxblendvd(__a, __b, __c);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_blendv(vector unsigned long long __a, vector unsigned long long __b,
+           vector unsigned long long __c) {
+  return __builtin_vsx_xxblendvd(__a, __b, __c);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_blendv(vector float __a, vector float __b, vector unsigned int __c) {
+  return __builtin_vsx_xxblendvw(__a, __b, __c);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_blendv(vector double __a, vector double __b,
+           vector unsigned long long __c) {
+  return __builtin_vsx_xxblendvd(__a, __b, __c);
+}
+
+/* vec_splati */
+
+#define vec_splati(__a)                                                        \
+  _Generic((__a), signed int                                                   \
+           : ((vector signed int)__a), unsigned int                            \
+           : ((vector unsigned int)__a), float                                 \
+           : ((vector float)__a))
+
+/* vec_spatid */
+
+static __inline__ vector double __ATTRS_o_ai vec_splatid(const float __a) {
+  return ((vector double)((double)__a));
+}
+
+/* vec_splati_ins */
+
+static __inline__ vector signed int __ATTRS_o_ai vec_splati_ins(
+    vector signed int __a, const unsigned int __b, const signed int __c) {
+#ifdef __LITTLE_ENDIAN__
+  __a[1 - __b] = __c;
+  __a[3 - __b] = __c;
+#else
+  __a[__b] = __c;
+  __a[2 + __b] = __c;
+#endif
+  return __a;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_splati_ins(
+    vector unsigned int __a, const unsigned int __b, const unsigned int __c) {
+#ifdef __LITTLE_ENDIAN__
+  __a[1 - __b] = __c;
+  __a[3 - __b] = __c;
+#else
+  __a[__b] = __c;
+  __a[2 + __b] = __c;
+#endif
+  return __a;
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_splati_ins(vector float __a, const unsigned int __b, const float __c) {
+#ifdef __LITTLE_ENDIAN__
+  __a[1 - __b] = __c;
+  __a[3 - __b] = __c;
+#else
+  __a[__b] = __c;
+  __a[2 + __b] = __c;
+#endif
+  return __a;
+}
+
+/* vec_test_lsbb_all_ones */
+
+static __inline__ int __ATTRS_o_ai
+vec_test_lsbb_all_ones(vector unsigned char __a) {
+  return __builtin_vsx_xvtlsbb(__a, 1);
+}
+
+/* vec_test_lsbb_all_zeros */
+
+static __inline__ int __ATTRS_o_ai
+vec_test_lsbb_all_zeros(vector unsigned char __a) {
+  return __builtin_vsx_xvtlsbb(__a, 0);
+}
+#endif /* __VSX__ */
+#endif /* __POWER10_VECTOR__ */
+
 #undef __ATTRS_o_ai

 #endif /* __ALTIVEC_H */
--- a/lib/include/amxintrin.h
+++ b/lib/include/amxintrin.h
@ -0,0 +1,225 @@
+/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXINTRIN_H
+#define __AMXINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("amx-tile")))
+
+/// Load tile configuration from a 64-byte memory location specified by
+/// "mem_addr". The tile configuration includes the tile type palette, the
+/// number of bytes per row, and the number of rows. If the specified
+/// palette_id is zero, that signifies the init state for both the tile
+/// config and the tile data, and the tiles are zeroed. Any invalid
+/// configurations will result in #GP fault.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
+///
+/// \param __config
+///    A pointer to 512-bits configuration
+static __inline__ void __DEFAULT_FN_ATTRS
+_tile_loadconfig(const void *__config)
+{
+  __builtin_ia32_tile_loadconfig(__config);
+}
+
+/// Stores the current tile configuration to a 64-byte memory location
+/// specified by "mem_addr". The tile configuration includes the tile type
+/// palette, the number of bytes per row, and the number of rows. If tiles
+/// are not configured, all zeroes will be stored to memory.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
+///
+/// \param __config
+///    A pointer to 512-bits configuration
+static __inline__ void __DEFAULT_FN_ATTRS
+_tile_storeconfig(void *__config)
+{
+  __builtin_ia32_tile_storeconfig(__config);
+}
+
+/// Release the tile configuration to return to the init state, which
+/// releases all storage it currently holds.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
+static __inline__ void __DEFAULT_FN_ATTRS
+_tile_release(void)
+{
+  __builtin_ia32_tilerelease();
+}
+
+/// Load tile rows from memory specifieid by "base" address and "stride" into
+/// destination tile "dst" using the tile configuration previously configured
+/// via "_tile_loadconfig".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
+///
+/// \param dst
+///    A destination tile. Max size is 1024 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+#define _tile_loadd(dst, base, stride) \
+  __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+
+/// Load tile rows from memory specifieid by "base" address and "stride" into
+/// destination tile "dst" using the tile configuration previously configured
+/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
+/// that the data will likely not be reused in the near future and the data
+/// caching can be optimized accordingly.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
+///
+/// \param dst
+///    A destination tile. Max size is 1024 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+#define _tile_stream_loadd(dst, base, stride) \
+  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+
+/// Store the tile specified by "src" to memory specifieid by "base" address and
+/// "stride" using the tile configuration previously configured via
+/// "_tile_loadconfig".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
+///
+/// \param dst
+///    A destination tile. Max size is 1024 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be stored in memory.
+#define _tile_stored(dst, base, stride) \
+  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
+
+/// Zero the tile specified by "tdest".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
+///
+/// \param tile
+///    The destination tile to be zero. Max size is 1024 Bytes.
+#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
+/// "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1))
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbf16ps(dst, src0, src1) \
+  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMXINTRIN_H */
--- a/lib/include/arm_acle.h
+++ b/lib/include/arm_acle.h
@ -22,31 +22,43 @@ extern "C" {

 /* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
 /* 8.3 Memory barriers */
-#if !defined(_MSC_VER)
+#if !__has_builtin(__dmb)
 #define __dmb(i) __builtin_arm_dmb(i)
+#endif
+#if !__has_builtin(__dsb)
 #define __dsb(i) __builtin_arm_dsb(i)
+#endif
+#if !__has_builtin(__isb)
 #define __isb(i) __builtin_arm_isb(i)
 #endif

 /* 8.4 Hints */

-#if !defined(_MSC_VER)
+#if !__has_builtin(__wfi)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
  __builtin_arm_wfi();
 }
+#endif

+#if !__has_builtin(__wfe)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
  __builtin_arm_wfe();
 }
+#endif

+#if !__has_builtin(__sev)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
  __builtin_arm_sev();
 }
+#endif

+#if !__has_builtin(__sevl)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
  __builtin_arm_sevl();
 }
+#endif

+#if !__has_builtin(__yield)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
  __builtin_arm_yield();
 }
--- a/lib/include/arm_bf16.h
+++ b/lib/include/arm_bf16.h
@ -0,0 +1,20 @@
+/*===---- arm_bf16.h - ARM BF16 intrinsics -----------------------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_BF16_H
+#define __ARM_BF16_H
+
+typedef __bf16 bfloat16_t;
+#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
+
+
+#undef __ai
+
+#endif
--- a/lib/include/arm_cde.h
+++ b/lib/include/arm_cde.h
@ -0,0 +1,410 @@
+/*===---- arm_cde.h - ARM CDE intrinsics -----------------------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_CDE_H
+#define __ARM_CDE_H
+
+#if !__ARM_FEATURE_CDE
+#error "CDE support not enabled"
+#endif
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1)))
+uint32_t __arm_cx1(int, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1a)))
+uint32_t __arm_cx1a(int, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1d)))
+uint64_t __arm_cx1d(int, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1da)))
+uint64_t __arm_cx1da(int, uint64_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2)))
+uint32_t __arm_cx2(int, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2a)))
+uint32_t __arm_cx2a(int, uint32_t, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2d)))
+uint64_t __arm_cx2d(int, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2da)))
+uint64_t __arm_cx2da(int, uint64_t, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3)))
+uint32_t __arm_cx3(int, uint32_t, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3a)))
+uint32_t __arm_cx3a(int, uint32_t, uint32_t, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3d)))
+uint64_t __arm_cx3d(int, uint32_t, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3da)))
+uint64_t __arm_cx3da(int, uint64_t, uint32_t, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1_u32)))
+uint32_t __arm_vcx1_u32(int, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1a_u32)))
+uint32_t __arm_vcx1a_u32(int, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1d_u64)))
+uint64_t __arm_vcx1d_u64(int, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1da_u64)))
+uint64_t __arm_vcx1da_u64(int, uint64_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2_u32)))
+uint32_t __arm_vcx2_u32(int, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2a_u32)))
+uint32_t __arm_vcx2a_u32(int, uint32_t, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2d_u64)))
+uint64_t __arm_vcx2d_u64(int, uint64_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2da_u64)))
+uint64_t __arm_vcx2da_u64(int, uint64_t, uint64_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3_u32)))
+uint32_t __arm_vcx3_u32(int, uint32_t, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3a_u32)))
+uint32_t __arm_vcx3a_u32(int, uint32_t, uint32_t, uint32_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3d_u64)))
+uint64_t __arm_vcx3d_u64(int, uint64_t, uint64_t, uint32_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3da_u64)))
+uint64_t __arm_vcx3da_u64(int, uint64_t, uint64_t, uint64_t, uint32_t);
+
+#if __ARM_FEATURE_MVE
+
+typedef uint16_t mve_pred16_t;
+typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
+typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
+typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
+typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
+typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
+typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
+typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
+typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;
+
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s16)))
+int16x8_t __arm_vcx1q_m(int, int16x8_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s32)))
+int32x4_t __arm_vcx1q_m(int, int32x4_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s64)))
+int64x2_t __arm_vcx1q_m(int, int64x2_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s8)))
+int8x16_t __arm_vcx1q_m(int, int8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u16)))
+uint16x8_t __arm_vcx1q_m(int, uint16x8_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u32)))
+uint32x4_t __arm_vcx1q_m(int, uint32x4_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u64)))
+uint64x2_t __arm_vcx1q_m(int, uint64x2_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u8)))
+uint8x16_t __arm_vcx1q_m(int, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_u8)))
+uint8x16_t __arm_vcx1q_u8(int, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s16)))
+int16x8_t __arm_vcx1qa_m(int, int16x8_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s32)))
+int32x4_t __arm_vcx1qa_m(int, int32x4_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s64)))
+int64x2_t __arm_vcx1qa_m(int, int64x2_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s8)))
+int8x16_t __arm_vcx1qa_m(int, int8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u16)))
+uint16x8_t __arm_vcx1qa_m(int, uint16x8_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u32)))
+uint32x4_t __arm_vcx1qa_m(int, uint32x4_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u64)))
+uint64x2_t __arm_vcx1qa_m(int, uint64x2_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u8)))
+uint8x16_t __arm_vcx1qa_m(int, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s16)))
+int16x8_t __arm_vcx1qa(int, int16x8_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s32)))
+int32x4_t __arm_vcx1qa(int, int32x4_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s64)))
+int64x2_t __arm_vcx1qa(int, int64x2_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s8)))
+int8x16_t __arm_vcx1qa(int, int8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u16)))
+uint16x8_t __arm_vcx1qa(int, uint16x8_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u32)))
+uint32x4_t __arm_vcx1qa(int, uint32x4_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u64)))
+uint64x2_t __arm_vcx1qa(int, uint64x2_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u8)))
+uint8x16_t __arm_vcx1qa(int, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s16)))
+int16x8_t __arm_vcx2q_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s32)))
+int32x4_t __arm_vcx2q_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s64)))
+int64x2_t __arm_vcx2q_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s8)))
+int8x16_t __arm_vcx2q_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u16)))
+uint16x8_t __arm_vcx2q_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u32)))
+uint32x4_t __arm_vcx2q_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u64)))
+uint64x2_t __arm_vcx2q_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u8)))
+uint8x16_t __arm_vcx2q_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s16)))
+int16x8_t __arm_vcx2q(int, int16x8_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s32)))
+int32x4_t __arm_vcx2q(int, int32x4_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s64)))
+int64x2_t __arm_vcx2q(int, int64x2_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s8)))
+int8x16_t __arm_vcx2q(int, int8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u16)))
+uint16x8_t __arm_vcx2q(int, uint16x8_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u32)))
+uint32x4_t __arm_vcx2q(int, uint32x4_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u64)))
+uint64x2_t __arm_vcx2q(int, uint64x2_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8)))
+uint8x16_t __arm_vcx2q(int, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s16)))
+uint8x16_t __arm_vcx2q_u8(int, int16x8_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s32)))
+uint8x16_t __arm_vcx2q_u8(int, int32x4_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s64)))
+uint8x16_t __arm_vcx2q_u8(int, int64x2_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s8)))
+uint8x16_t __arm_vcx2q_u8(int, int8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u16)))
+uint8x16_t __arm_vcx2q_u8(int, uint16x8_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u32)))
+uint8x16_t __arm_vcx2q_u8(int, uint32x4_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u64)))
+uint8x16_t __arm_vcx2q_u8(int, uint64x2_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u8)))
+uint8x16_t __arm_vcx2q_u8(int, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s16)))
+int16x8_t __arm_vcx2qa_impl(int, int16x8_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s32)))
+int32x4_t __arm_vcx2qa_impl(int, int32x4_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s64)))
+int64x2_t __arm_vcx2qa_impl(int, int64x2_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s8)))
+int8x16_t __arm_vcx2qa_impl(int, int8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u16)))
+uint16x8_t __arm_vcx2qa_impl(int, uint16x8_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u32)))
+uint32x4_t __arm_vcx2qa_impl(int, uint32x4_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u64)))
+uint64x2_t __arm_vcx2qa_impl(int, uint64x2_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u8)))
+uint8x16_t __arm_vcx2qa_impl(int, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s16)))
+int16x8_t __arm_vcx2qa_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s32)))
+int32x4_t __arm_vcx2qa_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s64)))
+int64x2_t __arm_vcx2qa_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s8)))
+int8x16_t __arm_vcx2qa_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u16)))
+uint16x8_t __arm_vcx2qa_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u32)))
+uint32x4_t __arm_vcx2qa_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u64)))
+uint64x2_t __arm_vcx2qa_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u8)))
+uint8x16_t __arm_vcx2qa_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s16)))
+int16x8_t __arm_vcx3q_impl(int, int16x8_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s32)))
+int32x4_t __arm_vcx3q_impl(int, int32x4_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s64)))
+int64x2_t __arm_vcx3q_impl(int, int64x2_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s8)))
+int8x16_t __arm_vcx3q_impl(int, int8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u16)))
+uint16x8_t __arm_vcx3q_impl(int, uint16x8_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u32)))
+uint32x4_t __arm_vcx3q_impl(int, uint32x4_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u64)))
+uint64x2_t __arm_vcx3q_impl(int, uint64x2_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u8)))
+uint8x16_t __arm_vcx3q_impl(int, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s16)))
+int16x8_t __arm_vcx3q_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s32)))
+int32x4_t __arm_vcx3q_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s64)))
+int64x2_t __arm_vcx3q_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s8)))
+int8x16_t __arm_vcx3q_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u16)))
+uint16x8_t __arm_vcx3q_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u32)))
+uint32x4_t __arm_vcx3q_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u64)))
+uint64x2_t __arm_vcx3q_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u8)))
+uint8x16_t __arm_vcx3q_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s16)))
+uint8x16_t __arm_vcx3q_u8_impl(int, int16x8_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s32)))
+uint8x16_t __arm_vcx3q_u8_impl(int, int32x4_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s64)))
+uint8x16_t __arm_vcx3q_u8_impl(int, int64x2_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s8)))
+uint8x16_t __arm_vcx3q_u8_impl(int, int8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u16)))
+uint8x16_t __arm_vcx3q_u8_impl(int, uint16x8_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u32)))
+uint8x16_t __arm_vcx3q_u8_impl(int, uint32x4_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u64)))
+uint8x16_t __arm_vcx3q_u8_impl(int, uint64x2_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u8)))
+uint8x16_t __arm_vcx3q_u8_impl(int, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s16)))
+int16x8_t __arm_vcx3qa_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s32)))
+int32x4_t __arm_vcx3qa_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s64)))
+int64x2_t __arm_vcx3qa_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s8)))
+int8x16_t __arm_vcx3qa_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u16)))
+uint16x8_t __arm_vcx3qa_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u32)))
+uint32x4_t __arm_vcx3qa_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u64)))
+uint64x2_t __arm_vcx3qa_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u8)))
+uint8x16_t __arm_vcx3qa_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s16)))
+int16x8_t __arm_vcx3qa_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s32)))
+int32x4_t __arm_vcx3qa_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s64)))
+int64x2_t __arm_vcx3qa_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s8)))
+int8x16_t __arm_vcx3qa_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u16)))
+uint16x8_t __arm_vcx3qa_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u32)))
+uint32x4_t __arm_vcx3qa_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u64)))
+uint64x2_t __arm_vcx3qa_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u8)))
+uint8x16_t __arm_vcx3qa_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
+int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
+int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
+int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
+int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
+uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
+uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
+uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
+uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
+uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
+uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
+uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
+uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
+uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
+uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vreinterpretq_u8_u8)))
+uint8x16_t __arm_vreinterpretq_u8(uint8x16_t);
+#define __arm_vcx2q_m(cp, inactive, n, imm, pred) __arm_vcx2q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), (imm), (pred))
+#define __arm_vcx2qa(cp, acc, n, imm) __arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))
+#define __arm_vcx2qa_m(cp, acc, n, imm, pred) __arm_vcx2qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm), (pred))
+#define __arm_vcx3q(cp, n, m, imm) __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
+#define __arm_vcx3q_m(cp, inactive, n, m, imm, pred) __arm_vcx3q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
+#define __arm_vcx3q_u8(cp, n, m, imm) __arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
+#define __arm_vcx3qa(cp, acc, n, m, imm) __arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm))
+#define __arm_vcx3qa_m(cp, acc, n, m, imm, pred) __arm_vcx3qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
+
+#endif /* __ARM_FEATURE_MVE */
+
+#if __ARM_FEATURE_MVE & 2
+
+typedef __fp16 float16_t;
+typedef float float32_t;
+typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
+typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;
+
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f16)))
+float16x8_t __arm_vcx1q_m(int, float16x8_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f32)))
+float32x4_t __arm_vcx1q_m(int, float32x4_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f16)))
+float16x8_t __arm_vcx1qa(int, float16x8_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f32)))
+float32x4_t __arm_vcx1qa(int, float32x4_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f16)))
+float16x8_t __arm_vcx1qa_m(int, float16x8_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f32)))
+float32x4_t __arm_vcx1qa_m(int, float32x4_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f16)))
+float16x8_t __arm_vcx2q(int, float16x8_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f32)))
+float32x4_t __arm_vcx2q(int, float32x4_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f16)))
+float16x8_t __arm_vcx2q_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f32)))
+float32x4_t __arm_vcx2q_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f16)))
+uint8x16_t __arm_vcx2q_u8(int, float16x8_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f32)))
+uint8x16_t __arm_vcx2q_u8(int, float32x4_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f16)))
+float16x8_t __arm_vcx2qa_impl(int, float16x8_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f32)))
+float32x4_t __arm_vcx2qa_impl(int, float32x4_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f16)))
+float16x8_t __arm_vcx2qa_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f32)))
+float32x4_t __arm_vcx2qa_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f16)))
+float16x8_t __arm_vcx3q_impl(int, float16x8_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f32)))
+float32x4_t __arm_vcx3q_impl(int, float32x4_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f16)))
+float16x8_t __arm_vcx3q_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f32)))
+float32x4_t __arm_vcx3q_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f16)))
+uint8x16_t __arm_vcx3q_u8_impl(int, float16x8_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f32)))
+uint8x16_t __arm_vcx3q_u8_impl(int, float32x4_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f16)))
+float16x8_t __arm_vcx3qa_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f32)))
+float32x4_t __arm_vcx3qa_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f16)))
+float16x8_t __arm_vcx3qa_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f32)))
+float32x4_t __arm_vcx3qa_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
+float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
+static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
+float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
+uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
+static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
+uint8x16_t __arm_vreinterpretq_u8(float32x4_t);
+
+#endif /* __ARM_FEATURE_MVE & 2 */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* __ARM_CDE_H */
--- a/lib/include/arm_mve.h
+++ b/lib/include/arm_mve.h
--- a/lib/include/arm_neon.h
+++ b/lib/include/arm_neon.h
--- a/lib/include/arm_sve.h
+++ b/lib/include/arm_sve.h
--- a/lib/include/avx2intrin.h
+++ b/lib/include/avx2intrin.h
@ -740,6 +740,8 @@ _mm256_broadcastsi128_si256(__m128i __X)
  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
 }

+#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
+
 #define _mm_blend_epi32(V1, V2, M) \
  (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                     (__v4si)(__m128i)(V2), (int)(M))
--- a/lib/include/avx512bwintrin.h
+++ b/lib/include/avx512bwintrin.h
@ -1504,13 +1504,14 @@ _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_slli_epi16(__m512i __A, int __B)
+_mm512_slli_epi16(__m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+                       unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                         (__v32hi)_mm512_slli_epi16(__A, __B),
@ -1518,7 +1519,7 @@ _mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
+_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                         (__v32hi)_mm512_slli_epi16(__A, __B),
@ -1595,13 +1596,14 @@ _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srai_epi16(__m512i __A, int __B)
+_mm512_srai_epi16(__m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+                       unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                         (__v32hi)_mm512_srai_epi16(__A, __B),
@ -1609,7 +1611,7 @@ _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B)
+_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                         (__v32hi)_mm512_srai_epi16(__A, __B),
@ -1639,13 +1641,14 @@ _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srli_epi16(__m512i __A, int __B)
+_mm512_srli_epi16(__m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+                       unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                         (__v32hi)_mm512_srli_epi16(__A, __B),
--- a/lib/include/avx512fintrin.h
+++ b/lib/include/avx512fintrin.h
@ -5111,13 +5111,14 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
                                      (__v8di)_mm512_setzero_si512())

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_slli_epi32(__m512i __A, int __B)
+_mm512_slli_epi32(__m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
+                       unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_slli_epi32(__A, __B),
@ -5125,20 +5126,20 @@ _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B) {
+_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_slli_epi32(__A, __B),
                                         (__v16si)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_slli_epi64(__m512i __A, int __B)
+_mm512_slli_epi64(__m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_slli_epi64(__A, __B),
@ -5146,7 +5147,7 @@ _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
+_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_slli_epi64(__A, __B),
@ -5154,13 +5155,14 @@ _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srli_epi32(__m512i __A, int __B)
+_mm512_srli_epi32(__m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
+                       unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srli_epi32(__A, __B),
@ -5168,20 +5170,21 @@ _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) {
+_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srli_epi32(__A, __B),
                                         (__v16si)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srli_epi64(__m512i __A, int __B)
+_mm512_srli_epi64(__m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
+                       unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_srli_epi64(__A, __B),
@ -5189,7 +5192,8 @@ _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
+_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
+                        unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_srli_epi64(__A, __B),
@ -6593,13 +6597,14 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
                                             (int)(R))

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srai_epi32(__m512i __A, int __B)
+_mm512_srai_epi32(__m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
+                       unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srai_epi32(__A, __B),
@ -6607,20 +6612,21 @@ _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) {
+_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
+                        unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srai_epi32(__A, __B),
                                         (__v16si)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srai_epi64(__m512i __A, int __B)
+_mm512_srai_epi64(__m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_srai_epi64(__A, __B),
@ -6628,7 +6634,7 @@ _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
+_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
 {
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_srai_epi64(__A, __B),
--- a/lib/include/avx512vlbwintrin.h
+++ b/lib/include/avx512vlbwintrin.h
@ -1939,7 +1939,7 @@ _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                             (__v8hi)_mm_slli_epi16(__A, __B),
@ -1947,7 +1947,7 @@ _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                             (__v8hi)_mm_slli_epi16(__A, __B),
@ -1955,7 +1955,8 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A,
+                       unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                         (__v16hi)_mm256_slli_epi16(__A, __B),
@ -1963,7 +1964,7 @@ _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B)
+_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                         (__v16hi)_mm256_slli_epi16(__A, __B),
@ -2091,7 +2092,7 @@ _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                             (__v8hi)_mm_srai_epi16(__A, __B),
@ -2099,7 +2100,7 @@ _mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                             (__v8hi)_mm_srai_epi16(__A, __B),
@ -2107,7 +2108,8 @@ _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A,
+                       unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                         (__v16hi)_mm256_srai_epi16(__A, __B),
@ -2115,7 +2117,7 @@ _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B)
+_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                         (__v16hi)_mm256_srai_epi16(__A, __B),
--- a/lib/include/avx512vlintrin.h
+++ b/lib/include/avx512vlintrin.h
@ -4522,7 +4522,7 @@ _mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
                                             (__v4si)_mm_slli_epi32(__A, __B),
@ -4530,7 +4530,7 @@ _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
                                             (__v4si)_mm_slli_epi32(__A, __B),
@ -4538,7 +4538,7 @@ _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
                                             (__v8si)_mm256_slli_epi32(__A, __B),
@ -4546,7 +4546,7 @@ _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B)
+_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
                                             (__v8si)_mm256_slli_epi32(__A, __B),
@ -4586,7 +4586,7 @@ _mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                             (__v2di)_mm_slli_epi64(__A, __B),
@ -4594,7 +4594,7 @@ _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                             (__v2di)_mm_slli_epi64(__A, __B),
@ -4602,7 +4602,7 @@ _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                             (__v4di)_mm256_slli_epi64(__A, __B),
@ -4610,7 +4610,7 @@ _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B)
+_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                             (__v4di)_mm256_slli_epi64(__A, __B),
@ -4866,7 +4866,7 @@ _mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
                                             (__v4si)_mm_srli_epi32(__A, __B),
@ -4874,7 +4874,7 @@ _mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
                                             (__v4si)_mm_srli_epi32(__A, __B),
@ -4882,7 +4882,7 @@ _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
                                             (__v8si)_mm256_srli_epi32(__A, __B),
@ -4890,7 +4890,7 @@ _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B)
+_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
                                             (__v8si)_mm256_srli_epi32(__A, __B),
@ -4930,7 +4930,7 @@ _mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                             (__v2di)_mm_srli_epi64(__A, __B),
@ -4938,7 +4938,7 @@ _mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                             (__v2di)_mm_srli_epi64(__A, __B),
@ -4946,7 +4946,7 @@ _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                             (__v4di)_mm256_srli_epi64(__A, __B),
@ -4954,7 +4954,7 @@ _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B)
+_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                             (__v4di)_mm256_srli_epi64(__A, __B),
@ -6405,7 +6405,7 @@ _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
                                             (__v4si)_mm_srai_epi32(__A, __B),
@ -6413,7 +6413,7 @@ _mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B)
+_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
 {
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
                                             (__v4si)_mm_srai_epi32(__A, __B),
@ -6421,7 +6421,7 @@ _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
                                             (__v8si)_mm256_srai_epi32(__A, __B),
@ -6429,7 +6429,7 @@ _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B)
+_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
 {
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
                                             (__v8si)_mm256_srai_epi32(__A, __B),
@ -6481,13 +6481,13 @@ _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srai_epi64(__m128i __A, int __imm)
+_mm_srai_epi64(__m128i __A, unsigned int __imm)
 {
  return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm)
+_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm)
 {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
                                           (__v2di)_mm_srai_epi64(__A, __imm), \
@ -6495,7 +6495,7 @@ _mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm)
 }

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm)
+_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm)
 {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
                                           (__v2di)_mm_srai_epi64(__A, __imm), \
@ -6503,13 +6503,14 @@ _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srai_epi64(__m256i __A, int __imm)
+_mm256_srai_epi64(__m256i __A, unsigned int __imm)
 {
  return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm)
+_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A,
+                       unsigned int __imm)
 {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
                                        (__v4di)_mm256_srai_epi64(__A, __imm), \
@ -6517,7 +6518,7 @@ _mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm)
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm)
+_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
 {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
                                        (__v4di)_mm256_srai_epi64(__A, __imm), \
--- a/lib/include/bmiintrin.h
+++ b/lib/include/bmiintrin.h
@ -111,7 +111,8 @@ _mm_tzcnt_64(unsigned long long __X)

 #undef __RELAXED_FN_ATTRS

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__BMI__)

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
@ -192,6 +193,28 @@ _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }

+/* Intel-specified, single-leading-underscore version of BEXTR2 */
+/// Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
+///    specify the index of the least significant bit. Bits [15:8] specify the
+///    number of bits to be extracted.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
+/// \see __bextr_u32
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bextr2_u32(unsigned int __X, unsigned int __Y) {
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
 /// Clears all bits in the source except for the least significant bit
 ///    containing a value of 1 and returns the result.
 ///
@ -321,6 +344,28 @@ _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }

+/* Intel-specified, single-leading-underscore version of BEXTR2 */
+/// Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
+///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
+///    the number of bits to be extracted.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
+/// \see __bextr_u64
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bextr2_u64(unsigned long long __X, unsigned long long __Y) {
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
 /// Clears all bits in the source except for the least significant bit
 ///    containing a value of 1 and returns the result.
 ///
@ -376,6 +421,7 @@ __blsr_u64(unsigned long long __X)

 #undef __DEFAULT_FN_ATTRS

-#endif /* !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__) */
+#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules)   \
+          || defined(__BMI__) */

 #endif /* __BMIINTRIN_H */
--- a/lib/include/cet.h
+++ b/lib/include/cet.h
@ -0,0 +1,66 @@
+/*===------ cet.h -Control-flow Enforcement Technology  feature ------------===
+ * Add x86 feature with IBT and/or SHSTK bits to ELF program property if they
+ * are enabled. Otherwise, contents in this header file are unused. This file
+ * is mainly design for assembly source code which want to enable CET.
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CET_H
+#define __CET_H
+
+#ifdef __ASSEMBLER__
+
+#ifndef __CET__
+# define _CET_ENDBR
+#endif
+
+#ifdef __CET__
+
+# ifdef __LP64__
+#  if __CET__ & 0x1
+#    define _CET_ENDBR endbr64
+#  else
+#    define _CET_ENDBR
+#  endif
+# else
+#  if __CET__ & 0x1
+#    define _CET_ENDBR endbr32
+#  else
+#    define _CET_ENDBR
+#  endif
+# endif
+
+
+#  ifdef __LP64__
+#   define __PROPERTY_ALIGN 3
+#  else
+#   define __PROPERTY_ALIGN 2
+#  endif
+
+	.pushsection ".note.gnu.property", "a"
+	.p2align __PROPERTY_ALIGN
+	.long 1f - 0f		/* name length.  */
+	.long 4f - 1f		/* data length.  */
+	/* NT_GNU_PROPERTY_TYPE_0.   */
+	.long 5			/* note type.  */
+0:
+	.asciz "GNU"		/* vendor name.  */
+1:
+	.p2align __PROPERTY_ALIGN
+	/* GNU_PROPERTY_X86_FEATURE_1_AND.  */
+	.long 0xc0000002	/* pr_type.  */
+	.long 3f - 2f		/* pr_datasz.  */
+2:
+	/* GNU_PROPERTY_X86_FEATURE_1_XXX.  */
+	.long __CET__
+3:
+	.p2align __PROPERTY_ALIGN
+4:
+	.popsection
+#endif
+#endif
+#endif
--- a/lib/include/cldemoteintrin.h
+++ b/lib/include/cldemoteintrin.h
@ -18,11 +18,19 @@
 #define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__,  __target__("cldemote")))

+/// Hint to hardware that the cache line that contains \p __P should be demoted
+/// from the cache closest to the processor core to a level more distant from
+/// the processor core.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CLDEMOTE </c> instruction.
 static __inline__ void __DEFAULT_FN_ATTRS
 _cldemote(const void * __P) {
  __builtin_ia32_cldemote(__P);
 }

+#define _mm_cldemote(p) _cldemote(p)
 #undef __DEFAULT_FN_ATTRS

 #endif
--- a/lib/include/cpuid.h
+++ b/lib/include/cpuid.h
@ -24,6 +24,10 @@
 #define signature_CYRIX_ebx 0x69727943
 #define signature_CYRIX_edx 0x736e4978
 #define signature_CYRIX_ecx 0x64616574
+/* HYGON:   "HygonGenuine" */
+#define signature_HYGON_ebx 0x6f677948
+#define signature_HYGON_edx 0x6e65476e
+#define signature_HYGON_ecx 0x656e6975
 /* INTEL:   "GenuineIntel" */
 #define signature_INTEL_ebx 0x756e6547
 #define signature_INTEL_edx 0x49656e69
@ -182,8 +186,13 @@
 /* Features in %edx for leaf 7 sub-leaf 0 */
 #define bit_AVX5124VNNIW  0x00000004
 #define bit_AVX5124FMAPS  0x00000008
+#define bit_SERIALIZE     0x00004000
+#define bit_TSXLDTRK      0x00010000
 #define bit_PCONFIG       0x00040000
 #define bit_IBT           0x00100000
+#define bit_AMXBF16       0x00400000
+#define bit_AMXTILE       0x01000000
+#define bit_AMXINT8       0x02000000

 /* Features in %eax for leaf 7 sub-leaf 1 */
 #define bit_AVX512BF16    0x00000020
--- a/lib/include/emmintrin.h
+++ b/lib/include/emmintrin.h
@ -4970,10 +4970,10 @@ void _mm_pause(void);

 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

-#define _MM_DENORMALS_ZERO_ON   (0x0040)
-#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+#define _MM_DENORMALS_ZERO_ON   (0x0040U)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000U)

-#define _MM_DENORMALS_ZERO_MASK (0x0040)
+#define _MM_DENORMALS_ZERO_MASK (0x0040U)

 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
--- a/lib/include/immintrin.h
+++ b/lib/include/immintrin.h
@ -10,198 +10,231 @@
 #ifndef __IMMINTRIN_H
 #define __IMMINTRIN_H

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MMX__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__MMX__)
 #include <mmintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SSE__)
 #include <xmmintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE2__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SSE2__)
 #include <emmintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE3__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SSE3__)
 #include <pmmintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSSE3__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SSSE3__)
 #include <tmmintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__SSE4_2__) || defined(__SSE4_1__))
 #include <smmintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AES__) || defined(__PCLMUL__))
 #include <wmmintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLFLUSHOPT__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__CLFLUSHOPT__)
 #include <clflushoptintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLWB__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__CLWB__)
 #include <clwbintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX__)
 #include <avxintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX2__)
 #include <avx2intrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__F16C__)
 #include <f16cintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__VPCLMULQDQ__)
 #include <vpclmulqdqintrin.h>
 #endif

 /* No feature check desired due to internal checks */
 #include <bmiintrin.h>

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__BMI2__)
 #include <bmi2intrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__LZCNT__)
 #include <lzcntintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__POPCNT__)
 #include <popcntintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__FMA__)
 #include <fmaintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512F__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512F__)
 #include <avx512fintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VL__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512VL__)
 #include <avx512vlintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BW__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512BW__)
 #include <avx512bwintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512BITALG__)
 #include <avx512bitalgintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512CD__)
 #include <avx512cdintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512VPOPCNTDQ__)
 #include <avx512vpopcntdqintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
 #include <avx512vpopcntdqvlintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512VNNI__)
 #include <avx512vnniintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512VL__) && defined(__AVX512VNNI__))
 #include <avx512vlvnniintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512VL__) && defined(__AVX512BITALG__))
 #include <avx512vlbitalgintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512VL__) && defined(__AVX512BW__))
 #include <avx512vlbwintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512VL__) && defined(__AVX512CD__))
 #include <avx512vlcdintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512VL__) && defined(__AVX512DQ__))
 #include <avx512vldqintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512ER__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512ER__)
 #include <avx512erintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512IFMA__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512IFMA__)
 #include <avx512ifmaintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
 #include <avx512ifmavlintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512VBMI__)
 #include <avx512vbmiintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
 #include <avx512vbmivlintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512VBMI2__)
 #include <avx512vbmi2intrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
 #include <avx512vlvbmi2intrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512PF__)
 #include <avx512pfintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BF16__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512BF16__)
 #include <avx512bf16intrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    (defined(__AVX512VL__) && defined(__AVX512BF16__))
 #include <avx512vlbf16intrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__PKU__)
 #include <pkuintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__VAES__)
 #include <vaesintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__GFNI__)
 #include <gfniintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDPID__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__RDPID__)
 /// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
 ///
 /// \headerfile <immintrin.h>
@ -213,7 +246,8 @@ _rdpid_u32(void) {
 }
 #endif // __RDPID__

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__RDRND__)
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand16_step(unsigned short *__p)
 {
@ -235,7 +269,8 @@ _rdrand64_step(unsigned long long *__p)
 #endif
 #endif /* __RDRND__ */

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__FSGSBASE__)
 #ifdef __x86_64__
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readfsbase_u32(void)
@ -288,7 +323,8 @@ _writegsbase_u64(unsigned long long __V)
 #endif
 #endif /* __FSGSBASE__ */

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MOVBE__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__MOVBE__)

 /* The structs used below are to force the load/store to be unaligned. This
 * is accomplished with the __packed__ attribute. The __may_alias__ prevents
@ -347,35 +383,42 @@ _storebe_i64(void * __P, long long __D) {
 #endif
 #endif /* __MOVBE */

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__RTM__)
 #include <rtmintrin.h>
 #include <xtestintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHA__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SHA__)
 #include <shaintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FXSR__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__FXSR__)
 #include <fxsrintrin.h>
 #endif

 /* No feature check desired due to internal MSC_VER checks */
 #include <xsaveintrin.h>

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__XSAVEOPT__)
 #include <xsaveoptintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEC__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__XSAVEC__)
 #include <xsavecintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVES__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__XSAVES__)
 #include <xsavesintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SHSTK__)
 #include <cetintrin.h>
 #endif

@ -383,57 +426,81 @@ _storebe_i64(void * __P, long long __D) {
 * whereas others are also available at all times. */
 #include <adxintrin.h>

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__RDSEED__)
 #include <rdseedintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WBNOINVD__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__WBNOINVD__)
 #include <wbnoinvdintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLDEMOTE__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__CLDEMOTE__)
 #include <cldemoteintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WAITPKG__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__WAITPKG__)
 #include <waitpkgintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
-  defined(__MOVDIRI__) || defined(__MOVDIR64B__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__MOVDIRI__) || defined(__MOVDIR64B__)
 #include <movdirintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PCONFIG__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__PCONFIG__)
 #include <pconfigintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SGX__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SGX__)
 #include <sgxintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PTWRITE__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__PTWRITE__)
 #include <ptwriteintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__INVPCID__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__INVPCID__)
 #include <invpcidintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
-  defined(__AVX512VP2INTERSECT__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
+#include <amxintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || \
-  (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
 #include <avx512vlvp2intersectintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__ENQCMD__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__ENQCMD__)
 #include <enqcmdintrin.h>
 #endif

+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SERIALIZE__)
+#include <serializeintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__TSXLDTRK__)
+#include <tsxldtrkintrin.h>
+#endif
+
 #if defined(_MSC_VER) && __has_extension(gnu_asm)
 /* Define the default attributes for these intrinsics */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
--- a/lib/include/intrin.h
+++ b/lib/include/intrin.h
@ -289,6 +289,9 @@ unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
 static __inline__
 unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);

+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
 static __inline__
 __int64 _InterlockedDecrement64(__int64 volatile *_Addend);
 static __inline__
--- a/lib/include/module.modulemap
+++ b/lib/include/module.modulemap
@ -27,6 +27,12 @@ module _Builtin_intrinsics [system] [extern_c] {
      header "arm_fp16.h"
      export *
    }
+
+    explicit module sve {
+      requires sve
+      header "arm_sve.h"
+      export *
+    }
  }

  explicit module intel {
--- a/lib/include/msa.h
+++ b/lib/include/msa.h
@ -212,10 +212,14 @@ typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
 #define __msa_ld_h __builtin_msa_ld_h
 #define __msa_ld_w __builtin_msa_ld_w
 #define __msa_ld_d __builtin_msa_ld_d
+#define __msa_ldr_d __builtin_msa_ldr_d
+#define __msa_ldr_w __builtin_msa_ldrq_w
 #define __msa_st_b __builtin_msa_st_b
 #define __msa_st_h __builtin_msa_st_h
 #define __msa_st_w __builtin_msa_st_w
 #define __msa_st_d __builtin_msa_st_d
+#define __msa_str_d __builtin_msa_str_d
+#define __msa_str_w __builtin_msa_strq_w
 #define __msa_sat_s_b __builtin_msa_sat_s_b
 #define __msa_sat_s_h __builtin_msa_sat_s_h
 #define __msa_sat_s_w __builtin_msa_sat_s_w
--- a/lib/include/opencl-c.h
+++ b/lib/include/opencl-c.h
@ -13432,18 +13432,12 @@ int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, m
 uint __ovld atomic_fetch_min(volatile atomic_uint *object, uint operand);
 uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order);
 uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
-uint __ovld atomic_fetch_min(volatile atomic_uint *object, int operand);
-uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order);
-uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
 int __ovld atomic_fetch_max(volatile atomic_int *object, int operand);
 int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order);
 int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
 uint __ovld atomic_fetch_max(volatile atomic_uint *object, uint operand);
 uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order);
 uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
-uint __ovld atomic_fetch_max(volatile atomic_uint *object, int operand);
-uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order);
-uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);

 #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
 long __ovld atomic_fetch_add(volatile atomic_long *object, long operand);
@ -13482,18 +13476,12 @@ long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand
 ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, ulong operand);
 ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
 ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
-ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, long operand);
-ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order);
-ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
 long __ovld atomic_fetch_max(volatile atomic_long *object, long operand);
 long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order);
 long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
 ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, ulong operand);
 ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
 ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
-ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, long operand);
-ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order);
-ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
 #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)

 // OpenCL v2.0 s6.13.11.7.5:
@ -14682,7 +14670,7 @@ void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, flo

 // OpenCL Extension v2.0 s9.18 - Mipmaps
 #if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
-#ifdef cl_khr_mipmap_image
+#if defined(cl_khr_mipmap_image_writes)
 void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color);
 void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color);
 void __ovld write_imageui(write_only image1d_t image, int coord, int lod, uint4 color);
@ -14699,15 +14687,16 @@ void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, int
 void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int lod, int4 color);
 void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, int lod, uint4 color);

-void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color);
-void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color);
+void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float depth);
+void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float depth);

 #ifdef cl_khr_3d_image_writes
 void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color);
 void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color);
 void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color);
-#endif
-#endif //cl_khr_mipmap_image
+#endif //cl_khr_3d_image_writes
+
+#endif //defined(cl_khr_mipmap_image_writes)
 #endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)

 // Image write functions for half4 type
@ -14756,7 +14745,7 @@ void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, flo
 #endif //cl_khr_depth_images

 #if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
-#ifdef cl_khr_mipmap_image
+#if defined(cl_khr_mipmap_image_writes)
 void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color);
 void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color);
 void __ovld write_imageui(read_write image1d_t image, int coord, int lod, uint4 color);
@ -14780,8 +14769,9 @@ void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int
 void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color);
 void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color);
 void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color);
-#endif
-#endif //cl_khr_mipmap_image
+#endif //cl_khr_3d_image_writes
+
+#endif //cl_khr_mipmap_image_writes
 #endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)

 // Image write functions for half4 type
@ -15470,6 +15460,674 @@ double  __ovld __conv sub_group_scan_inclusive_max(double x);

 #endif //cl_khr_subgroups cl_intel_subgroups

+#if defined(cl_khr_subgroup_extended_types)
+char __ovld __conv sub_group_broadcast( char value, uint index );
+char2 __ovld __conv sub_group_broadcast( char2 value, uint index );
+char3 __ovld __conv sub_group_broadcast( char3 value, uint index );
+char4 __ovld __conv sub_group_broadcast( char4 value, uint index );
+char8 __ovld __conv sub_group_broadcast( char8 value, uint index );
+char16 __ovld __conv sub_group_broadcast( char16 value, uint index );
+
+uchar __ovld __conv sub_group_broadcast( uchar value, uint index );
+uchar2 __ovld __conv sub_group_broadcast( uchar2 value, uint index );
+uchar3 __ovld __conv sub_group_broadcast( uchar3 value, uint index );
+uchar4 __ovld __conv sub_group_broadcast( uchar4 value, uint index );
+uchar8 __ovld __conv sub_group_broadcast( uchar8 value, uint index );
+uchar16 __ovld __conv sub_group_broadcast( uchar16 value, uint index );
+
+short __ovld __conv sub_group_broadcast( short value, uint index );
+short2 __ovld __conv sub_group_broadcast( short2 value, uint index );
+short3 __ovld __conv sub_group_broadcast( short3 value, uint index );
+short4 __ovld __conv sub_group_broadcast( short4 value, uint index );
+short8 __ovld __conv sub_group_broadcast( short8 value, uint index );
+short16 __ovld __conv sub_group_broadcast( short16 value, uint index );
+
+ushort __ovld __conv sub_group_broadcast( ushort value, uint index );
+ushort2 __ovld __conv sub_group_broadcast( ushort2 value, uint index );
+ushort3 __ovld __conv sub_group_broadcast( ushort3 value, uint index );
+ushort4 __ovld __conv sub_group_broadcast( ushort4 value, uint index );
+ushort8 __ovld __conv sub_group_broadcast( ushort8 value, uint index );
+ushort16 __ovld __conv sub_group_broadcast( ushort16 value, uint index );
+
+// scalar int broadcast is part of cl_khr_subgroups
+int2 __ovld __conv sub_group_broadcast( int2 value, uint index );
+int3 __ovld __conv sub_group_broadcast( int3 value, uint index );
+int4 __ovld __conv sub_group_broadcast( int4 value, uint index );
+int8 __ovld __conv sub_group_broadcast( int8 value, uint index );
+int16 __ovld __conv sub_group_broadcast( int16 value, uint index );
+
+// scalar uint broadcast is part of cl_khr_subgroups
+uint2 __ovld __conv sub_group_broadcast( uint2 value, uint index );
+uint3 __ovld __conv sub_group_broadcast( uint3 value, uint index );
+uint4 __ovld __conv sub_group_broadcast( uint4 value, uint index );
+uint8 __ovld __conv sub_group_broadcast( uint8 value, uint index );
+uint16 __ovld __conv sub_group_broadcast( uint16 value, uint index );
+
+// scalar long broadcast is part of cl_khr_subgroups
+long2 __ovld __conv sub_group_broadcast( long2 value, uint index );
+long3 __ovld __conv sub_group_broadcast( long3 value, uint index );
+long4 __ovld __conv sub_group_broadcast( long4 value, uint index );
+long8 __ovld __conv sub_group_broadcast( long8 value, uint index );
+long16 __ovld __conv sub_group_broadcast( long16 value, uint index );
+
+// scalar ulong broadcast is part of cl_khr_subgroups
+ulong2 __ovld __conv sub_group_broadcast( ulong2 value, uint index );
+ulong3 __ovld __conv sub_group_broadcast( ulong3 value, uint index );
+ulong4 __ovld __conv sub_group_broadcast( ulong4 value, uint index );
+ulong8 __ovld __conv sub_group_broadcast( ulong8 value, uint index );
+ulong16 __ovld __conv sub_group_broadcast( ulong16 value, uint index );
+
+// scalar float broadcast is part of cl_khr_subgroups
+float2 __ovld __conv sub_group_broadcast( float2 value, uint index );
+float3 __ovld __conv sub_group_broadcast( float3 value, uint index );
+float4 __ovld __conv sub_group_broadcast( float4 value, uint index );
+float8 __ovld __conv sub_group_broadcast( float8 value, uint index );
+float16 __ovld __conv sub_group_broadcast( float16 value, uint index );
+
+char __ovld __conv sub_group_reduce_add( char value );
+uchar __ovld __conv sub_group_reduce_add( uchar value );
+short __ovld __conv sub_group_reduce_add( short value );
+ushort __ovld __conv sub_group_reduce_add( ushort value );
+
+char __ovld __conv sub_group_reduce_min( char value );
+uchar __ovld __conv sub_group_reduce_min( uchar value );
+short __ovld __conv sub_group_reduce_min( short value );
+ushort __ovld __conv sub_group_reduce_min( ushort value );
+
+char __ovld __conv sub_group_reduce_max( char value );
+uchar __ovld __conv sub_group_reduce_max( uchar value );
+short __ovld __conv sub_group_reduce_max( short value );
+ushort __ovld __conv sub_group_reduce_max( ushort value );
+
+char __ovld __conv sub_group_scan_inclusive_add( char value );
+uchar __ovld __conv sub_group_scan_inclusive_add( uchar value );
+short __ovld __conv sub_group_scan_inclusive_add( short value );
+ushort __ovld __conv sub_group_scan_inclusive_add( ushort value );
+
+char __ovld __conv sub_group_scan_inclusive_min( char value );
+uchar __ovld __conv sub_group_scan_inclusive_min( uchar value );
+short __ovld __conv sub_group_scan_inclusive_min( short value );
+ushort __ovld __conv sub_group_scan_inclusive_min( ushort value );
+
+char __ovld __conv sub_group_scan_inclusive_max( char value );
+uchar __ovld __conv sub_group_scan_inclusive_max( uchar value );
+short __ovld __conv sub_group_scan_inclusive_max( short value );
+ushort __ovld __conv sub_group_scan_inclusive_max( ushort value );
+
+char __ovld __conv sub_group_scan_exclusive_add( char value );
+uchar __ovld __conv sub_group_scan_exclusive_add( uchar value );
+short __ovld __conv sub_group_scan_exclusive_add( short value );
+ushort __ovld __conv sub_group_scan_exclusive_add( ushort value );
+
+char __ovld __conv sub_group_scan_exclusive_min( char value );
+uchar __ovld __conv sub_group_scan_exclusive_min( uchar value );
+short __ovld __conv sub_group_scan_exclusive_min( short value );
+ushort __ovld __conv sub_group_scan_exclusive_min( ushort value );
+
+char __ovld __conv sub_group_scan_exclusive_max( char value );
+uchar __ovld __conv sub_group_scan_exclusive_max( uchar value );
+short __ovld __conv sub_group_scan_exclusive_max( short value );
+ushort __ovld __conv sub_group_scan_exclusive_max( ushort value );
+
+#if defined(cl_khr_fp16)
+// scalar half broadcast is part of cl_khr_subgroups
+half2 __ovld __conv sub_group_broadcast( half2 value, uint index );
+half3 __ovld __conv sub_group_broadcast( half3 value, uint index );
+half4 __ovld __conv sub_group_broadcast( half4 value, uint index );
+half8 __ovld __conv sub_group_broadcast( half8 value, uint index );
+half16 __ovld __conv sub_group_broadcast( half16 value, uint index );
+#endif  // cl_khr_fp16
+
+#if defined(cl_khr_fp64)
+// scalar double broadcast is part of cl_khr_subgroups
+double2 __ovld __conv sub_group_broadcast( double2 value, uint index );
+double3 __ovld __conv sub_group_broadcast( double3 value, uint index );
+double4 __ovld __conv sub_group_broadcast( double4 value, uint index );
+double8 __ovld __conv sub_group_broadcast( double8 value, uint index );
+double16 __ovld __conv sub_group_broadcast( double16 value, uint index );
+#endif  // cl_khr_fp64
+
+#endif  // cl_khr_subgroup_extended_types
+
+#if defined(cl_khr_subgroup_non_uniform_vote)
+int     __ovld sub_group_elect(void);
+int     __ovld sub_group_non_uniform_all( int predicate );
+int     __ovld sub_group_non_uniform_any( int predicate );
+
+int     __ovld sub_group_non_uniform_all_equal( char value );
+int     __ovld sub_group_non_uniform_all_equal( uchar value );
+int     __ovld sub_group_non_uniform_all_equal( short value );
+int     __ovld sub_group_non_uniform_all_equal( ushort value );
+int     __ovld sub_group_non_uniform_all_equal( int value );
+int     __ovld sub_group_non_uniform_all_equal( uint value );
+int     __ovld sub_group_non_uniform_all_equal( long value );
+int     __ovld sub_group_non_uniform_all_equal( ulong value );
+int     __ovld sub_group_non_uniform_all_equal( float value );
+
+#if defined(cl_khr_fp16)
+int     __ovld sub_group_non_uniform_all_equal( half value );
+#endif // cl_khr_fp16
+
+#if defined(cl_khr_fp64)
+int     __ovld sub_group_non_uniform_all_equal( double value );
+#endif // cl_khr_fp64
+
+#endif // cl_khr_subgroup_non_uniform_vote
+
+#if defined(cl_khr_subgroup_ballot)
+char    __ovld sub_group_non_uniform_broadcast( char value, uint index );
+char2   __ovld sub_group_non_uniform_broadcast( char2 value, uint index );
+char3   __ovld sub_group_non_uniform_broadcast( char3 value, uint index );
+char4   __ovld sub_group_non_uniform_broadcast( char4 value, uint index );
+char8   __ovld sub_group_non_uniform_broadcast( char8 value, uint index );
+char16  __ovld sub_group_non_uniform_broadcast( char16 value, uint index );
+
+uchar   __ovld sub_group_non_uniform_broadcast( uchar value, uint index );
+uchar2  __ovld sub_group_non_uniform_broadcast( uchar2 value, uint index );
+uchar3  __ovld sub_group_non_uniform_broadcast( uchar3 value, uint index );
+uchar4  __ovld sub_group_non_uniform_broadcast( uchar4 value, uint index );
+uchar8  __ovld sub_group_non_uniform_broadcast( uchar8 value, uint index );
+uchar16 __ovld sub_group_non_uniform_broadcast( uchar16 value, uint index );
+
+short   __ovld sub_group_non_uniform_broadcast( short value, uint index );
+short2  __ovld sub_group_non_uniform_broadcast( short2 value, uint index );
+short3  __ovld sub_group_non_uniform_broadcast( short3 value, uint index );
+short4  __ovld sub_group_non_uniform_broadcast( short4 value, uint index );
+short8  __ovld sub_group_non_uniform_broadcast( short8 value, uint index );
+short16 __ovld sub_group_non_uniform_broadcast( short16 value, uint index );
+
+ushort  __ovld sub_group_non_uniform_broadcast( ushort value, uint index );
+ushort2 __ovld sub_group_non_uniform_broadcast( ushort2 value, uint index );
+ushort3 __ovld sub_group_non_uniform_broadcast( ushort3 value, uint index );
+ushort4 __ovld sub_group_non_uniform_broadcast( ushort4 value, uint index );
+ushort8 __ovld sub_group_non_uniform_broadcast( ushort8 value, uint index );
+ushort16 __ovld sub_group_non_uniform_broadcast( ushort16 value, uint index );
+
+int     __ovld sub_group_non_uniform_broadcast( int value, uint index );
+int2    __ovld sub_group_non_uniform_broadcast( int2 value, uint index );
+int3    __ovld sub_group_non_uniform_broadcast( int3 value, uint index );
+int4    __ovld sub_group_non_uniform_broadcast( int4 value, uint index );
+int8    __ovld sub_group_non_uniform_broadcast( int8 value, uint index );
+int16   __ovld sub_group_non_uniform_broadcast( int16 value, uint index );
+
+uint    __ovld sub_group_non_uniform_broadcast( uint value, uint index );
+uint2   __ovld sub_group_non_uniform_broadcast( uint2 value, uint index );
+uint3   __ovld sub_group_non_uniform_broadcast( uint3 value, uint index );
+uint4   __ovld sub_group_non_uniform_broadcast( uint4 value, uint index );
+uint8   __ovld sub_group_non_uniform_broadcast( uint8 value, uint index );
+uint16  __ovld sub_group_non_uniform_broadcast( uint16 value, uint index );
+
+long    __ovld sub_group_non_uniform_broadcast( long value, uint index );
+long2   __ovld sub_group_non_uniform_broadcast( long2 value, uint index );
+long3   __ovld sub_group_non_uniform_broadcast( long3 value, uint index );
+long4   __ovld sub_group_non_uniform_broadcast( long4 value, uint index );
+long8   __ovld sub_group_non_uniform_broadcast( long8 value, uint index );
+long16  __ovld sub_group_non_uniform_broadcast( long16 value, uint index );
+
+ulong   __ovld sub_group_non_uniform_broadcast( ulong value, uint index );
+ulong2  __ovld sub_group_non_uniform_broadcast( ulong2 value, uint index );
+ulong3  __ovld sub_group_non_uniform_broadcast( ulong3 value, uint index );
+ulong4  __ovld sub_group_non_uniform_broadcast( ulong4 value, uint index );
+ulong8  __ovld sub_group_non_uniform_broadcast( ulong8 value, uint index );
+ulong16 __ovld sub_group_non_uniform_broadcast( ulong16 value, uint index );
+
+float   __ovld sub_group_non_uniform_broadcast( float value, uint index );
+float2  __ovld sub_group_non_uniform_broadcast( float2 value, uint index );
+float3  __ovld sub_group_non_uniform_broadcast( float3 value, uint index );
+float4  __ovld sub_group_non_uniform_broadcast( float4 value, uint index );
+float8  __ovld sub_group_non_uniform_broadcast( float8 value, uint index );
+float16 __ovld sub_group_non_uniform_broadcast( float16 value, uint index );
+
+char    __ovld sub_group_broadcast_first( char value );
+uchar   __ovld sub_group_broadcast_first( uchar value );
+short   __ovld sub_group_broadcast_first( short value );
+ushort  __ovld sub_group_broadcast_first( ushort value );
+int     __ovld sub_group_broadcast_first( int value );
+uint    __ovld sub_group_broadcast_first( uint value );
+long    __ovld sub_group_broadcast_first( long value );
+ulong   __ovld sub_group_broadcast_first( ulong value );
+float   __ovld sub_group_broadcast_first( float value );
+
+uint4   __ovld sub_group_ballot( int predicate );
+int     __ovld __cnfn sub_group_inverse_ballot( uint4 value );
+int     __ovld __cnfn sub_group_ballot_bit_extract( uint4 value, uint index );
+uint    __ovld __cnfn sub_group_ballot_bit_count( uint4 value );
+
+uint    __ovld sub_group_ballot_inclusive_scan( uint4 value );
+uint    __ovld sub_group_ballot_exclusive_scan( uint4 value );
+uint    __ovld sub_group_ballot_find_lsb( uint4 value );
+uint    __ovld sub_group_ballot_find_msb( uint4 value );
+
+uint4   __ovld __cnfn get_sub_group_eq_mask(void);
+uint4   __ovld __cnfn get_sub_group_ge_mask(void);
+uint4   __ovld __cnfn get_sub_group_gt_mask(void);
+uint4   __ovld __cnfn get_sub_group_le_mask(void);
+uint4   __ovld __cnfn get_sub_group_lt_mask(void);
+
+#if defined(cl_khr_fp16)
+half    __ovld sub_group_non_uniform_broadcast( half value, uint index );
+half2   __ovld sub_group_non_uniform_broadcast( half2 value, uint index );
+half3   __ovld sub_group_non_uniform_broadcast( half3 value, uint index );
+half4   __ovld sub_group_non_uniform_broadcast( half4 value, uint index );
+half8   __ovld sub_group_non_uniform_broadcast( half8 value, uint index );
+half16  __ovld sub_group_non_uniform_broadcast( half16 value, uint index );
+
+half    __ovld sub_group_broadcast_first( half value );
+#endif // cl_khr_fp16
+
+#if defined(cl_khr_fp64)
+double   __ovld sub_group_non_uniform_broadcast( double value, uint index );
+double2  __ovld sub_group_non_uniform_broadcast( double2 value, uint index );
+double3  __ovld sub_group_non_uniform_broadcast( double3 value, uint index );
+double4  __ovld sub_group_non_uniform_broadcast( double4 value, uint index );
+double8  __ovld sub_group_non_uniform_broadcast( double8 value, uint index );
+double16 __ovld sub_group_non_uniform_broadcast( double16 value, uint index );
+
+double   __ovld sub_group_broadcast_first( double value );
+#endif // cl_khr_fp64
+
+#endif // cl_khr_subgroup_ballot
+
+#if defined(cl_khr_subgroup_non_uniform_arithmetic)
+char    __ovld sub_group_non_uniform_reduce_add( char value );
+uchar   __ovld sub_group_non_uniform_reduce_add( uchar value );
+short   __ovld sub_group_non_uniform_reduce_add( short value );
+ushort  __ovld sub_group_non_uniform_reduce_add( ushort value );
+int     __ovld sub_group_non_uniform_reduce_add( int value );
+uint    __ovld sub_group_non_uniform_reduce_add( uint value );
+long    __ovld sub_group_non_uniform_reduce_add( long value );
+ulong   __ovld sub_group_non_uniform_reduce_add( ulong value );
+float   __ovld sub_group_non_uniform_reduce_add( float value );
+
+char    __ovld sub_group_non_uniform_reduce_mul( char value );
+uchar   __ovld sub_group_non_uniform_reduce_mul( uchar value );
+short   __ovld sub_group_non_uniform_reduce_mul( short value );
+ushort  __ovld sub_group_non_uniform_reduce_mul( ushort value );
+int     __ovld sub_group_non_uniform_reduce_mul( int value );
+uint    __ovld sub_group_non_uniform_reduce_mul( uint value );
+long    __ovld sub_group_non_uniform_reduce_mul( long value );
+ulong   __ovld sub_group_non_uniform_reduce_mul( ulong value );
+float   __ovld sub_group_non_uniform_reduce_mul( float value );
+
+char    __ovld sub_group_non_uniform_reduce_min( char value );
+uchar   __ovld sub_group_non_uniform_reduce_min( uchar value );
+short   __ovld sub_group_non_uniform_reduce_min( short value );
+ushort  __ovld sub_group_non_uniform_reduce_min( ushort value );
+int     __ovld sub_group_non_uniform_reduce_min( int value );
+uint    __ovld sub_group_non_uniform_reduce_min( uint value );
+long    __ovld sub_group_non_uniform_reduce_min( long value );
+ulong   __ovld sub_group_non_uniform_reduce_min( ulong value );
+float   __ovld sub_group_non_uniform_reduce_min( float value );
+
+char    __ovld sub_group_non_uniform_reduce_max( char value );
+uchar   __ovld sub_group_non_uniform_reduce_max( uchar value );
+short   __ovld sub_group_non_uniform_reduce_max( short value );
+ushort  __ovld sub_group_non_uniform_reduce_max( ushort value );
+int     __ovld sub_group_non_uniform_reduce_max( int value );
+uint    __ovld sub_group_non_uniform_reduce_max( uint value );
+long    __ovld sub_group_non_uniform_reduce_max( long value );
+ulong   __ovld sub_group_non_uniform_reduce_max( ulong value );
+float   __ovld sub_group_non_uniform_reduce_max( float value );
+
+char    __ovld sub_group_non_uniform_scan_inclusive_add( char value );
+uchar   __ovld sub_group_non_uniform_scan_inclusive_add( uchar value );
+short   __ovld sub_group_non_uniform_scan_inclusive_add( short value );
+ushort  __ovld sub_group_non_uniform_scan_inclusive_add( ushort value );
+int     __ovld sub_group_non_uniform_scan_inclusive_add( int value );
+uint    __ovld sub_group_non_uniform_scan_inclusive_add( uint value );
+long    __ovld sub_group_non_uniform_scan_inclusive_add( long value );
+ulong   __ovld sub_group_non_uniform_scan_inclusive_add( ulong value );
+float   __ovld sub_group_non_uniform_scan_inclusive_add( float value );
+
+char    __ovld sub_group_non_uniform_scan_inclusive_mul( char value );
+uchar   __ovld sub_group_non_uniform_scan_inclusive_mul( uchar value );
+short   __ovld sub_group_non_uniform_scan_inclusive_mul( short value );
+ushort  __ovld sub_group_non_uniform_scan_inclusive_mul( ushort value );
+int     __ovld sub_group_non_uniform_scan_inclusive_mul( int value );
+uint    __ovld sub_group_non_uniform_scan_inclusive_mul( uint value );
+long    __ovld sub_group_non_uniform_scan_inclusive_mul( long value );
+ulong   __ovld sub_group_non_uniform_scan_inclusive_mul( ulong value );
+float   __ovld sub_group_non_uniform_scan_inclusive_mul( float value );
+
+char    __ovld sub_group_non_uniform_scan_inclusive_min( char value );
+uchar   __ovld sub_group_non_uniform_scan_inclusive_min( uchar value );
+short   __ovld sub_group_non_uniform_scan_inclusive_min( short value );
+ushort  __ovld sub_group_non_uniform_scan_inclusive_min( ushort value );
+int     __ovld sub_group_non_uniform_scan_inclusive_min( int value );
+uint    __ovld sub_group_non_uniform_scan_inclusive_min( uint value );
+long    __ovld sub_group_non_uniform_scan_inclusive_min( long value );
+ulong   __ovld sub_group_non_uniform_scan_inclusive_min( ulong value );
+float   __ovld sub_group_non_uniform_scan_inclusive_min( float value );
+
+char    __ovld sub_group_non_uniform_scan_inclusive_max( char value );
+uchar   __ovld sub_group_non_uniform_scan_inclusive_max( uchar value );
+short   __ovld sub_group_non_uniform_scan_inclusive_max( short value );
+ushort  __ovld sub_group_non_uniform_scan_inclusive_max( ushort value );
+int     __ovld sub_group_non_uniform_scan_inclusive_max( int value );
+uint    __ovld sub_group_non_uniform_scan_inclusive_max( uint value );
+long    __ovld sub_group_non_uniform_scan_inclusive_max( long value );
+ulong   __ovld sub_group_non_uniform_scan_inclusive_max( ulong value );
+float   __ovld sub_group_non_uniform_scan_inclusive_max( float value );
+
+char    __ovld sub_group_non_uniform_scan_exclusive_add( char value );
+uchar   __ovld sub_group_non_uniform_scan_exclusive_add( uchar value );
+short   __ovld sub_group_non_uniform_scan_exclusive_add( short value );
+ushort  __ovld sub_group_non_uniform_scan_exclusive_add( ushort value );
+int     __ovld sub_group_non_uniform_scan_exclusive_add( int value );
+uint    __ovld sub_group_non_uniform_scan_exclusive_add( uint value );
+long    __ovld sub_group_non_uniform_scan_exclusive_add( long value );
+ulong   __ovld sub_group_non_uniform_scan_exclusive_add( ulong value );
+float   __ovld sub_group_non_uniform_scan_exclusive_add( float value );
+
+char    __ovld sub_group_non_uniform_scan_exclusive_mul( char value );
+uchar   __ovld sub_group_non_uniform_scan_exclusive_mul( uchar value );
+short   __ovld sub_group_non_uniform_scan_exclusive_mul( short value );
+ushort  __ovld sub_group_non_uniform_scan_exclusive_mul( ushort value );
+int     __ovld sub_group_non_uniform_scan_exclusive_mul( int value );
+uint    __ovld sub_group_non_uniform_scan_exclusive_mul( uint value );
+long    __ovld sub_group_non_uniform_scan_exclusive_mul( long value );
+ulong   __ovld sub_group_non_uniform_scan_exclusive_mul( ulong value );
+float   __ovld sub_group_non_uniform_scan_exclusive_mul( float value );
+
+char    __ovld sub_group_non_uniform_scan_exclusive_min( char value );
+uchar   __ovld sub_group_non_uniform_scan_exclusive_min( uchar value );
+short   __ovld sub_group_non_uniform_scan_exclusive_min( short value );
+ushort  __ovld sub_group_non_uniform_scan_exclusive_min( ushort value );
+int     __ovld sub_group_non_uniform_scan_exclusive_min( int value );
+uint    __ovld sub_group_non_uniform_scan_exclusive_min( uint value );
+long    __ovld sub_group_non_uniform_scan_exclusive_min( long value );
+ulong   __ovld sub_group_non_uniform_scan_exclusive_min( ulong value );
+float   __ovld sub_group_non_uniform_scan_exclusive_min( float value );
+
+char    __ovld sub_group_non_uniform_scan_exclusive_max( char value );
+uchar   __ovld sub_group_non_uniform_scan_exclusive_max( uchar value );
+short   __ovld sub_group_non_uniform_scan_exclusive_max( short value );
+ushort  __ovld sub_group_non_uniform_scan_exclusive_max( ushort value );
+int     __ovld sub_group_non_uniform_scan_exclusive_max( int value );
+uint    __ovld sub_group_non_uniform_scan_exclusive_max( uint value );
+long    __ovld sub_group_non_uniform_scan_exclusive_max( long value );
+ulong   __ovld sub_group_non_uniform_scan_exclusive_max( ulong value );
+float   __ovld sub_group_non_uniform_scan_exclusive_max( float value );
+
+char    __ovld sub_group_non_uniform_reduce_and( char value );
+uchar   __ovld sub_group_non_uniform_reduce_and( uchar value );
+short   __ovld sub_group_non_uniform_reduce_and( short value );
+ushort  __ovld sub_group_non_uniform_reduce_and( ushort value );
+int     __ovld sub_group_non_uniform_reduce_and( int value );
+uint    __ovld sub_group_non_uniform_reduce_and( uint value );
+long    __ovld sub_group_non_uniform_reduce_and( long value );
+ulong   __ovld sub_group_non_uniform_reduce_and( ulong value );
+
+char    __ovld sub_group_non_uniform_reduce_or( char value );
+uchar   __ovld sub_group_non_uniform_reduce_or( uchar value );
+short   __ovld sub_group_non_uniform_reduce_or( short value );
+ushort  __ovld sub_group_non_uniform_reduce_or( ushort value );
+int     __ovld sub_group_non_uniform_reduce_or( int value );
+uint    __ovld sub_group_non_uniform_reduce_or( uint value );
+long    __ovld sub_group_non_uniform_reduce_or( long value );
+ulong   __ovld sub_group_non_uniform_reduce_or( ulong value );
+
+char    __ovld sub_group_non_uniform_reduce_xor( char value );
+uchar   __ovld sub_group_non_uniform_reduce_xor( uchar value );
+short   __ovld sub_group_non_uniform_reduce_xor( short value );
+ushort  __ovld sub_group_non_uniform_reduce_xor( ushort value );
+int     __ovld sub_group_non_uniform_reduce_xor( int value );
+uint    __ovld sub_group_non_uniform_reduce_xor( uint value );
+long    __ovld sub_group_non_uniform_reduce_xor( long value );
+ulong   __ovld sub_group_non_uniform_reduce_xor( ulong value );
+
+char    __ovld sub_group_non_uniform_scan_inclusive_and( char value );
+uchar   __ovld sub_group_non_uniform_scan_inclusive_and( uchar value );
+short   __ovld sub_group_non_uniform_scan_inclusive_and( short value );
+ushort  __ovld sub_group_non_uniform_scan_inclusive_and( ushort value );
+int     __ovld sub_group_non_uniform_scan_inclusive_and( int value );
+uint    __ovld sub_group_non_uniform_scan_inclusive_and( uint value );
+long    __ovld sub_group_non_uniform_scan_inclusive_and( long value );
+ulong   __ovld sub_group_non_uniform_scan_inclusive_and( ulong value );
+
+char    __ovld sub_group_non_uniform_scan_inclusive_or( char value );
+uchar   __ovld sub_group_non_uniform_scan_inclusive_or( uchar value );
+short   __ovld sub_group_non_uniform_scan_inclusive_or( short value );
+ushort  __ovld sub_group_non_uniform_scan_inclusive_or( ushort value );
+int     __ovld sub_group_non_uniform_scan_inclusive_or( int value );
+uint    __ovld sub_group_non_uniform_scan_inclusive_or( uint value );
+long    __ovld sub_group_non_uniform_scan_inclusive_or( long value );
+ulong   __ovld sub_group_non_uniform_scan_inclusive_or( ulong value );
+
+char    __ovld sub_group_non_uniform_scan_inclusive_xor( char value );
+uchar   __ovld sub_group_non_uniform_scan_inclusive_xor( uchar value );
+short   __ovld sub_group_non_uniform_scan_inclusive_xor( short value );
+ushort  __ovld sub_group_non_uniform_scan_inclusive_xor( ushort value );
+int     __ovld sub_group_non_uniform_scan_inclusive_xor( int value );
+uint    __ovld sub_group_non_uniform_scan_inclusive_xor( uint value );
+long    __ovld sub_group_non_uniform_scan_inclusive_xor( long value );
+ulong   __ovld sub_group_non_uniform_scan_inclusive_xor( ulong value );
+
+char    __ovld sub_group_non_uniform_scan_exclusive_and( char value );
+uchar   __ovld sub_group_non_uniform_scan_exclusive_and( uchar value );
+short   __ovld sub_group_non_uniform_scan_exclusive_and( short value );
+ushort  __ovld sub_group_non_uniform_scan_exclusive_and( ushort value );
+int     __ovld sub_group_non_uniform_scan_exclusive_and( int value );
+uint    __ovld sub_group_non_uniform_scan_exclusive_and( uint value );
+long    __ovld sub_group_non_uniform_scan_exclusive_and( long value );
+ulong   __ovld sub_group_non_uniform_scan_exclusive_and( ulong value );
+
+char    __ovld sub_group_non_uniform_scan_exclusive_or( char value );
+uchar   __ovld sub_group_non_uniform_scan_exclusive_or( uchar value );
+short   __ovld sub_group_non_uniform_scan_exclusive_or( short value );
+ushort  __ovld sub_group_non_uniform_scan_exclusive_or( ushort value );
+int     __ovld sub_group_non_uniform_scan_exclusive_or( int value );
+uint    __ovld sub_group_non_uniform_scan_exclusive_or( uint value );
+long    __ovld sub_group_non_uniform_scan_exclusive_or( long value );
+ulong   __ovld sub_group_non_uniform_scan_exclusive_or( ulong value );
+
+char    __ovld sub_group_non_uniform_scan_exclusive_xor( char value );
+uchar   __ovld sub_group_non_uniform_scan_exclusive_xor( uchar value );
+short   __ovld sub_group_non_uniform_scan_exclusive_xor( short value );
+ushort  __ovld sub_group_non_uniform_scan_exclusive_xor( ushort value );
+int     __ovld sub_group_non_uniform_scan_exclusive_xor( int value );
+uint    __ovld sub_group_non_uniform_scan_exclusive_xor( uint value );
+long    __ovld sub_group_non_uniform_scan_exclusive_xor( long value );
+ulong   __ovld sub_group_non_uniform_scan_exclusive_xor( ulong value );
+
+int     __ovld sub_group_non_uniform_reduce_logical_and( int predicate );
+int     __ovld sub_group_non_uniform_reduce_logical_or( int predicate );
+int     __ovld sub_group_non_uniform_reduce_logical_xor( int predicate );
+
+int     __ovld sub_group_non_uniform_scan_inclusive_logical_and( int predicate );
+int     __ovld sub_group_non_uniform_scan_inclusive_logical_or( int predicate );
+int     __ovld sub_group_non_uniform_scan_inclusive_logical_xor( int predicate );
+
+int     __ovld sub_group_non_uniform_scan_exclusive_logical_and( int predicate );
+int     __ovld sub_group_non_uniform_scan_exclusive_logical_or( int predicate );
+int     __ovld sub_group_non_uniform_scan_exclusive_logical_xor( int predicate );
+
+#if defined(cl_khr_fp16)
+half    __ovld sub_group_non_uniform_reduce_add( half value );
+half    __ovld sub_group_non_uniform_reduce_mul( half value );
+half    __ovld sub_group_non_uniform_reduce_min( half value );
+half    __ovld sub_group_non_uniform_reduce_max( half value );
+half    __ovld sub_group_non_uniform_scan_inclusive_add( half value );
+half    __ovld sub_group_non_uniform_scan_inclusive_mul( half value );
+half    __ovld sub_group_non_uniform_scan_inclusive_min( half value );
+half    __ovld sub_group_non_uniform_scan_inclusive_max( half value );
+half    __ovld sub_group_non_uniform_scan_exclusive_add( half value );
+half    __ovld sub_group_non_uniform_scan_exclusive_mul( half value );
+half    __ovld sub_group_non_uniform_scan_exclusive_min( half value );
+half    __ovld sub_group_non_uniform_scan_exclusive_max( half value );
+#endif // cl_khr_fp16
+
+#if defined(cl_khr_fp64)
+double  __ovld sub_group_non_uniform_reduce_add( double value );
+double  __ovld sub_group_non_uniform_reduce_mul( double value );
+double  __ovld sub_group_non_uniform_reduce_min( double value );
+double  __ovld sub_group_non_uniform_reduce_max( double value );
+double  __ovld sub_group_non_uniform_scan_inclusive_add( double value );
+double  __ovld sub_group_non_uniform_scan_inclusive_mul( double value );
+double  __ovld sub_group_non_uniform_scan_inclusive_min( double value );
+double  __ovld sub_group_non_uniform_scan_inclusive_max( double value );
+double  __ovld sub_group_non_uniform_scan_exclusive_add( double value );
+double  __ovld sub_group_non_uniform_scan_exclusive_mul( double value );
+double  __ovld sub_group_non_uniform_scan_exclusive_min( double value );
+double  __ovld sub_group_non_uniform_scan_exclusive_max( double value );
+#endif // cl_khr_fp64
+
+#endif // cl_khr_subgroup_non_uniform_arithmetic
+
+#if defined(cl_khr_subgroup_shuffle)
+char    __ovld sub_group_shuffle( char value, uint index );
+uchar   __ovld sub_group_shuffle( uchar value, uint index );
+short   __ovld sub_group_shuffle( short value, uint index );
+ushort  __ovld sub_group_shuffle( ushort value, uint index );
+int     __ovld sub_group_shuffle( int value, uint index );
+uint    __ovld sub_group_shuffle( uint value, uint index );
+long    __ovld sub_group_shuffle( long value, uint index );
+ulong   __ovld sub_group_shuffle( ulong value, uint index );
+float   __ovld sub_group_shuffle( float value, uint index );
+
+char    __ovld sub_group_shuffle_xor( char value, uint mask );
+uchar   __ovld sub_group_shuffle_xor( uchar value, uint mask );
+short   __ovld sub_group_shuffle_xor( short value, uint mask );
+ushort  __ovld sub_group_shuffle_xor( ushort value, uint mask );
+int     __ovld sub_group_shuffle_xor( int value, uint mask );
+uint    __ovld sub_group_shuffle_xor( uint value, uint mask );
+long    __ovld sub_group_shuffle_xor( long value, uint mask );
+ulong   __ovld sub_group_shuffle_xor( ulong value, uint mask );
+float   __ovld sub_group_shuffle_xor( float value, uint mask );
+
+#if defined(cl_khr_fp16)
+half    __ovld sub_group_shuffle( half value, uint index );
+half    __ovld sub_group_shuffle_xor( half value, uint mask );
+#endif // cl_khr_fp16
+
+#if defined(cl_khr_fp64)
+double  __ovld sub_group_shuffle( double value, uint index );
+double  __ovld sub_group_shuffle_xor( double value, uint mask );
+#endif // cl_khr_fp64
+
+#endif // cl_khr_subgroup_shuffle
+
+#if defined(cl_khr_subgroup_shuffle_relative)
+char    __ovld sub_group_shuffle_up( char value, uint delta );
+uchar   __ovld sub_group_shuffle_up( uchar value, uint delta );
+short   __ovld sub_group_shuffle_up( short value, uint delta );
+ushort  __ovld sub_group_shuffle_up( ushort value, uint delta );
+int     __ovld sub_group_shuffle_up( int value, uint delta );
+uint    __ovld sub_group_shuffle_up( uint value, uint delta );
+long    __ovld sub_group_shuffle_up( long value, uint delta );
+ulong   __ovld sub_group_shuffle_up( ulong value, uint delta );
+float   __ovld sub_group_shuffle_up( float value, uint delta );
+
+char    __ovld sub_group_shuffle_down( char value, uint delta );
+uchar   __ovld sub_group_shuffle_down( uchar value, uint delta );
+short   __ovld sub_group_shuffle_down( short value, uint delta );
+ushort  __ovld sub_group_shuffle_down( ushort value, uint delta );
+int     __ovld sub_group_shuffle_down( int value, uint delta );
+uint    __ovld sub_group_shuffle_down( uint value, uint delta );
+long    __ovld sub_group_shuffle_down( long value, uint delta );
+ulong   __ovld sub_group_shuffle_down( ulong value, uint delta );
+float   __ovld sub_group_shuffle_down( float value, uint delta );
+
+#if defined(cl_khr_fp16)
+half    __ovld sub_group_shuffle_up( half value, uint delta );
+half    __ovld sub_group_shuffle_down( half value, uint delta );
+#endif // cl_khr_fp16
+
+#if defined(cl_khr_fp64)
+double  __ovld sub_group_shuffle_up( double value, uint delta );
+double  __ovld sub_group_shuffle_down( double value, uint delta );
+#endif // cl_khr_fp64
+
+#endif // cl_khr_subgroup_shuffle_relative
+
+#if defined(cl_khr_subgroup_clustered_reduce)
+char    __ovld sub_group_clustered_reduce_add( char value, uint clustersize );
+uchar   __ovld sub_group_clustered_reduce_add( uchar value, uint clustersize );
+short   __ovld sub_group_clustered_reduce_add( short value, uint clustersize );
+ushort  __ovld sub_group_clustered_reduce_add( ushort value, uint clustersize );
+int     __ovld sub_group_clustered_reduce_add( int value, uint clustersize );
+uint    __ovld sub_group_clustered_reduce_add( uint value, uint clustersize );
+long    __ovld sub_group_clustered_reduce_add( long value, uint clustersize );
+ulong   __ovld sub_group_clustered_reduce_add( ulong value, uint clustersize );
+float   __ovld sub_group_clustered_reduce_add( float value, uint clustersize );
+
+char    __ovld sub_group_clustered_reduce_mul( char value, uint clustersize );
+uchar   __ovld sub_group_clustered_reduce_mul( uchar value, uint clustersize );
+short   __ovld sub_group_clustered_reduce_mul( short value, uint clustersize );
+ushort  __ovld sub_group_clustered_reduce_mul( ushort value, uint clustersize );
+int     __ovld sub_group_clustered_reduce_mul( int value, uint clustersize );
+uint    __ovld sub_group_clustered_reduce_mul( uint value, uint clustersize );
+long    __ovld sub_group_clustered_reduce_mul( long value, uint clustersize );
+ulong   __ovld sub_group_clustered_reduce_mul( ulong value, uint clustersize );
+float   __ovld sub_group_clustered_reduce_mul( float value, uint clustersize );
+
+char    __ovld sub_group_clustered_reduce_min( char value, uint clustersize );
+uchar   __ovld sub_group_clustered_reduce_min( uchar value, uint clustersize );
+short   __ovld sub_group_clustered_reduce_min( short value, uint clustersize );
+ushort  __ovld sub_group_clustered_reduce_min( ushort value, uint clustersize );
+int     __ovld sub_group_clustered_reduce_min( int value, uint clustersize );
+uint    __ovld sub_group_clustered_reduce_min( uint value, uint clustersize );
+long    __ovld sub_group_clustered_reduce_min( long value, uint clustersize );
+ulong   __ovld sub_group_clustered_reduce_min( ulong value, uint clustersize );
+float   __ovld sub_group_clustered_reduce_min( float value, uint clustersize );
+
+char    __ovld sub_group_clustered_reduce_max( char value, uint clustersize );
+uchar   __ovld sub_group_clustered_reduce_max( uchar value, uint clustersize );
+short   __ovld sub_group_clustered_reduce_max( short value, uint clustersize );
+ushort  __ovld sub_group_clustered_reduce_max( ushort value, uint clustersize );
+int     __ovld sub_group_clustered_reduce_max( int value, uint clustersize );
+uint    __ovld sub_group_clustered_reduce_max( uint value, uint clustersize );
+long    __ovld sub_group_clustered_reduce_max( long value, uint clustersize );
+ulong   __ovld sub_group_clustered_reduce_max( ulong value, uint clustersize );
+float   __ovld sub_group_clustered_reduce_max( float value, uint clustersize );
+
+char    __ovld sub_group_clustered_reduce_and( char value, uint clustersize );
+uchar   __ovld sub_group_clustered_reduce_and( uchar value, uint clustersize );
+short   __ovld sub_group_clustered_reduce_and( short value, uint clustersize );
+ushort  __ovld sub_group_clustered_reduce_and( ushort value, uint clustersize );
+int     __ovld sub_group_clustered_reduce_and( int value, uint clustersize );
+uint    __ovld sub_group_clustered_reduce_and( uint value, uint clustersize );
+long    __ovld sub_group_clustered_reduce_and( long value, uint clustersize );
+ulong   __ovld sub_group_clustered_reduce_and( ulong value, uint clustersize );
+
+char    __ovld sub_group_clustered_reduce_or( char value, uint clustersize );
+uchar   __ovld sub_group_clustered_reduce_or( uchar value, uint clustersize );
+short   __ovld sub_group_clustered_reduce_or( short value, uint clustersize );
+ushort  __ovld sub_group_clustered_reduce_or( ushort value, uint clustersize );
+int     __ovld sub_group_clustered_reduce_or( int value, uint clustersize );
+uint    __ovld sub_group_clustered_reduce_or( uint value, uint clustersize );
+long    __ovld sub_group_clustered_reduce_or( long value, uint clustersize );
+ulong   __ovld sub_group_clustered_reduce_or( ulong value, uint clustersize );
+
+char    __ovld sub_group_clustered_reduce_xor( char value, uint clustersize );
+uchar   __ovld sub_group_clustered_reduce_xor( uchar value, uint clustersize );
+short   __ovld sub_group_clustered_reduce_xor( short value, uint clustersize );
+ushort  __ovld sub_group_clustered_reduce_xor( ushort value, uint clustersize );
+int     __ovld sub_group_clustered_reduce_xor( int value, uint clustersize );
+uint    __ovld sub_group_clustered_reduce_xor( uint value, uint clustersize );
+long    __ovld sub_group_clustered_reduce_xor( long value, uint clustersize );
+ulong   __ovld sub_group_clustered_reduce_xor( ulong value, uint clustersize );
+
+int     __ovld sub_group_clustered_reduce_logical_and( int predicate, uint clustersize );
+int     __ovld sub_group_clustered_reduce_logical_or( int predicate, uint clustersize );
+int     __ovld sub_group_clustered_reduce_logical_xor( int predicate, uint clustersize );
+
+#if defined(cl_khr_fp16)
+half    __ovld sub_group_clustered_reduce_add( half value, uint clustersize );
+half    __ovld sub_group_clustered_reduce_mul( half value, uint clustersize );
+half    __ovld sub_group_clustered_reduce_min( half value, uint clustersize );
+half    __ovld sub_group_clustered_reduce_max( half value, uint clustersize );
+#endif // cl_khr_fp16
+
+#if defined(cl_khr_fp64)
+double  __ovld sub_group_clustered_reduce_add( double value, uint clustersize );
+double  __ovld sub_group_clustered_reduce_mul( double value, uint clustersize );
+double  __ovld sub_group_clustered_reduce_min( double value, uint clustersize );
+double  __ovld sub_group_clustered_reduce_max( double value, uint clustersize );
+#endif // cl_khr_fp64
+
+#endif // cl_khr_subgroup_clustered_reduce
+
 #if defined(cl_intel_subgroups)
 // Intel-Specific Sub Group Functions
 float   __ovld __conv intel_sub_group_shuffle( float  x, uint c );
--- a/lib/include/openmp_wrappers/__clang_openmp_device_functions.h
+++ b/lib/include/openmp_wrappers/__clang_openmp_device_functions.h
@ -1,4 +1,4 @@
-/*===---- __clang_openmp_math_declares.h - OpenMP math declares ------------===
+/*===- __clang_openmp_device_functions.h - OpenMP device function declares -===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
@ -7,27 +7,36 @@
 *===-----------------------------------------------------------------------===
 */

-#ifndef __CLANG_OPENMP_MATH_DECLARES_H__
-#define __CLANG_OPENMP_MATH_DECLARES_H__
+#ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
+#define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__

 #ifndef _OPENMP
 #error "This file is for OpenMP compilation only."
 #endif

-#if defined(__NVPTX__) && defined(_OPENMP)
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+#ifdef __cplusplus
+extern "C" {
+#endif

 #define __CUDA__
-
-#if defined(__cplusplus)
-  #include <__clang_cuda_math_forward_declares.h>
-#endif
+#define __OPENMP_NVPTX__

 /// Include declarations for libdevice functions.
 #include <__clang_cuda_libdevice_declares.h>
+
 /// Provide definitions for these functions.
 #include <__clang_cuda_device_functions.h>

+#undef __OPENMP_NVPTX__
 #undef __CUDA__

+#ifdef __cplusplus
+} // extern "C"
 #endif
+
+#pragma omp end declare variant
+
 #endif
--- a/lib/include/openmp_wrappers/__clang_openmp_math.h
+++ b/lib/include/openmp_wrappers/__clang_openmp_math.h
@ -1,35 +0,0 @@
-/*===---- __clang_openmp_math.h - OpenMP target math support ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if defined(__NVPTX__) && defined(_OPENMP)
-/// TODO:
-/// We are currently reusing the functionality of the Clang-CUDA code path
-/// as an alternative to the host declarations provided by math.h and cmath.
-/// This is suboptimal.
-///
-/// We should instead declare the device functions in a similar way, e.g.,
-/// through OpenMP 5.0 variants, and afterwards populate the module with the
-/// host declarations by unconditionally including the host math.h or cmath,
-/// respectively. This is actually what the Clang-CUDA code path does, using
-/// __device__ instead of variants to avoid redeclarations and get the desired
-/// overload resolution.
-
-#define __CUDA__
-
-#if defined(__cplusplus)
-  #include <__clang_cuda_cmath.h>
-#endif
-
-#undef __CUDA__
-
-/// Magic macro for stopping the math.h/cmath host header from being included.
-#define __CLANG_NO_HOST_MATH__
-
-#endif
-
--- a/lib/include/openmp_wrappers/cmath
+++ b/lib/include/openmp_wrappers/cmath
@ -1,4 +1,4 @@
-/*===-------------- cmath - Alternative cmath header -----------------------===
+/*===-- __clang_openmp_device_functions.h - OpenMP math declares ------ c++ -===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
@ -7,10 +7,69 @@
 *===-----------------------------------------------------------------------===
 */

-#include <__clang_openmp_math.h>
+#ifndef __CLANG_OPENMP_CMATH_H__
+#define __CLANG_OPENMP_CMATH_H__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+#include_next <cmath>
+
+// Make sure we include our math.h overlay, it probably happend already but we
+// need to be sure.
+#include <math.h>
+
+// We (might) need cstdlib because __clang_cuda_cmath.h below declares `abs`
+// which might live in cstdlib.
+#include <cstdlib>
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+#define __CUDA__
+#define __OPENMP_NVPTX__
+#include <__clang_cuda_cmath.h>
+#undef __OPENMP_NVPTX__
+#undef __CUDA__
+
+// Overloads not provided by the CUDA wrappers but by the CUDA system headers.
+// Since we do not include the latter we define them ourselves.
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+
+__DEVICE__ float acosh(float __x) { return ::acoshf(__x); }
+__DEVICE__ float asinh(float __x) { return ::asinhf(__x); }
+__DEVICE__ float atanh(float __x) { return ::atanhf(__x); }
+__DEVICE__ float cbrt(float __x) { return ::cbrtf(__x); }
+__DEVICE__ float erf(float __x) { return ::erff(__x); }
+__DEVICE__ float erfc(float __x) { return ::erfcf(__x); }
+__DEVICE__ float exp2(float __x) { return ::exp2f(__x); }
+__DEVICE__ float expm1(float __x) { return ::expm1f(__x); }
+__DEVICE__ float fdim(float __x, float __y) { return ::fdimf(__x, __y); }
+__DEVICE__ float hypot(float __x, float __y) { return ::hypotf(__x, __y); }
+__DEVICE__ int ilogb(float __x) { return ::ilogbf(__x); }
+__DEVICE__ float lgamma(float __x) { return ::lgammaf(__x); }
+__DEVICE__ long long int llrint(float __x) { return ::llrintf(__x); }
+__DEVICE__ long long int llround(float __x) { return ::llroundf(__x); }
+__DEVICE__ float log1p(float __x) { return ::log1pf(__x); }
+__DEVICE__ float log2(float __x) { return ::log2f(__x); }
+__DEVICE__ float logb(float __x) { return ::logbf(__x); }
+__DEVICE__ long int lrint(float __x) { return ::lrintf(__x); }
+__DEVICE__ long int lround(float __x) { return ::lroundf(__x); }
+__DEVICE__ float nextafter(float __x, float __y) {
+  return ::nextafterf(__x, __y);
+}
+__DEVICE__ float remainder(float __x, float __y) {
+  return ::remainderf(__x, __y);
+}
+__DEVICE__ float scalbln(float __x, long int __y) {
+  return ::scalblnf(__x, __y);
+}
+__DEVICE__ float scalbn(float __x, int __y) { return ::scalbnf(__x, __y); }
+__DEVICE__ float tgamma(float __x) { return ::tgammaf(__x); }
+
+#undef __DEVICE__
+
+#pragma omp end declare variant

-#ifndef __CLANG_NO_HOST_MATH__
-#include_next <cmath>
-#else
-#undef __CLANG_NO_HOST_MATH__
 #endif
--- a/lib/include/openmp_wrappers/complex
+++ b/lib/include/openmp_wrappers/complex
@ -0,0 +1,25 @@
+/*===-- complex --- OpenMP complex wrapper for target regions --------- c++ -===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_OPENMP_COMPLEX__
+#define __CLANG_OPENMP_COMPLEX__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+// We require std::math functions in the complex builtins below.
+#include <cmath>
+
+#define __CUDA__
+#include <__clang_cuda_complex_builtins.h>
+#endif
+
+// Grab the host header too.
+#include_next <complex>
--- a/lib/include/openmp_wrappers/complex.h
+++ b/lib/include/openmp_wrappers/complex.h
@ -0,0 +1,25 @@
+/*===-- complex --- OpenMP complex wrapper for target regions --------- c++ -===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_OPENMP_COMPLEX_H__
+#define __CLANG_OPENMP_COMPLEX_H__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+// We require math functions in the complex builtins below.
+#include <math.h>
+
+#define __CUDA__
+#include <__clang_cuda_complex_builtins.h>
+#endif
+
+// Grab the host header too.
+#include_next <complex.h>
--- a/lib/include/openmp_wrappers/math.h
+++ b/lib/include/openmp_wrappers/math.h
@ -1,4 +1,4 @@
-/*===------------- math.h - Alternative math.h header ----------------------===
+/*===---- openmp_wrapper/math.h -------- OpenMP math.h intercept ------ c++ -===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
@ -7,11 +7,45 @@
 *===-----------------------------------------------------------------------===
 */

-#include <__clang_openmp_math.h>
-
-#ifndef __CLANG_NO_HOST_MATH__
-#include_next <math.h>
-#else
-#undef __CLANG_NO_HOST_MATH__
+// If we are in C++ mode and include <math.h> (not <cmath>) first, we still need
+// to make sure <cmath> is read first. The problem otherwise is that we haven't
+// seen the declarations of the math.h functions when the system math.h includes
+// our cmath overlay. However, our cmath overlay, or better the underlying
+// overlay, e.g. CUDA, uses the math.h functions. Since we haven't declared them
+// yet we get errors. CUDA avoids this by eagerly declaring all math functions
+// (in the __device__ space) but we cannot do this. Instead we break the
+// dependence by forcing cmath to go first. While our cmath will in turn include
+// this file, the cmath guards will prevent recursion.
+#ifdef __cplusplus
+#include <cmath>
 #endif

+#ifndef __CLANG_OPENMP_MATH_H__
+#define __CLANG_OPENMP_MATH_H__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+#include_next <math.h>
+
+// We need limits.h for __clang_cuda_math.h below and because it should not hurt
+// we include it eagerly here.
+#include <limits.h>
+
+// We need stdlib.h because (for now) __clang_cuda_math.h below declares `abs`
+// which should live in stdlib.h.
+#include <stdlib.h>
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+#define __CUDA__
+#define __OPENMP_NVPTX__
+#include <__clang_cuda_math.h>
+#undef __OPENMP_NVPTX__
+#undef __CUDA__
+
+#pragma omp end declare variant
+
+#endif
--- a/lib/include/openmp_wrappers/new
+++ b/lib/include/openmp_wrappers/new
@ -0,0 +1,70 @@
+//===--------- new - OPENMP wrapper for <new> ------------------------------===
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===
+
+#ifndef __CLANG_OPENMP_WRAPPERS_NEW
+#define __CLANG_OPENMP_WRAPPERS_NEW
+
+#include_next <new>
+
+#if defined(__NVPTX__) && defined(_OPENMP)
+
+#include <cstdlib>
+
+#pragma push_macro("OPENMP_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define OPENMP_NOEXCEPT noexcept
+#else
+#define OPENMP_NOEXCEPT
+#endif
+
+// Device overrides for non-placement new and delete.
+inline void *operator new(__SIZE_TYPE__ size) {
+  if (size == 0)
+    size = 1;
+  return ::malloc(size);
+}
+inline void *operator new(__SIZE_TYPE__ size,
+                          const std::nothrow_t &) OPENMP_NOEXCEPT {
+  return ::operator new(size);
+}
+
+inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
+inline void *operator new[](__SIZE_TYPE__ size, const std::nothrow_t &) {
+  return ::operator new(size);
+}
+
+inline void operator delete(void *ptr)OPENMP_NOEXCEPT {
+  if (ptr)
+    ::free(ptr);
+}
+inline void operator delete(void *ptr, const std::nothrow_t &)OPENMP_NOEXCEPT {
+  ::operator delete(ptr);
+}
+
+inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {
+  ::operator delete(ptr);
+}
+inline void operator delete[](void *ptr,
+                              const std::nothrow_t &) OPENMP_NOEXCEPT {
+  ::operator delete(ptr);
+}
+
+// Sized delete, C++14 only.
+#if __cplusplus >= 201402L
+inline void operator delete(void *ptr, __SIZE_TYPE__ size)OPENMP_NOEXCEPT {
+  ::operator delete(ptr);
+}
+inline void operator delete[](void *ptr, __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
+  ::operator delete(ptr);
+}
+#endif
+
+#pragma pop_macro("OPENMP_NOEXCEPT")
+#endif
+
+#endif // include guard
--- a/lib/include/serializeintrin.h
+++ b/lib/include/serializeintrin.h
@ -0,0 +1,30 @@
+/*===--------------- serializeintrin.h - serialize intrinsics --------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <serializeintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SERIALIZEINTRIN_H
+#define __SERIALIZEINTRIN_H
+
+/// Serialize instruction fetch and execution.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SERIALIZE </c> instruction.
+///
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__, __target__("serialize")))
+_serialize (void)
+{
+  __builtin_ia32_serialize ();
+}
+
+#endif /* __SERIALIZEINTRIN_H */
--- a/lib/include/tsxldtrkintrin.h
+++ b/lib/include/tsxldtrkintrin.h
@ -0,0 +1,56 @@
+/*===------------- tsxldtrkintrin.h - tsxldtrk intrinsics ------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <tsxldtrkintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __TSXLDTRKINTRIN_H
+#define __TSXLDTRKINTRIN_H
+
+/* Define the default attributes for the functions in this file */
+#define _DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("tsxldtrk")))
+
+/// Marks the start of an TSX (RTM) suspend load address tracking region. If
+///    this intrinsic is used inside a transactional region, subsequent loads
+///    are not added to the read set of the transaction. If it's used inside a
+///    suspend load address tracking region it will cause transaction abort.
+///    If it's used outside of a transactional region it behaves like a NOP.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c XSUSLDTRK instruction.
+///
+static __inline__ void _DEFAULT_FN_ATTRS
+_xsusldtrk (void)
+{
+    __builtin_ia32_xsusldtrk();
+}
+
+/// Marks the end of an TSX (RTM) suspend load address tracking region. If this
+///    intrinsic is used inside a suspend load address tracking region it will
+///    end the suspend region and all following load addresses will be added to
+///    the transaction read set. If it's used inside an active transaction but
+///    not in a suspend region it will cause transaction abort. If it's used
+///    outside of a transactional region it behaves like a NOP.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c XRESLDTRK instruction.
+///
+static __inline__ void _DEFAULT_FN_ATTRS
+_xresldtrk (void)
+{
+    __builtin_ia32_xresldtrk();
+}
+
+#undef _DEFAULT_FN_ATTRS
+
+#endif /* __TSXLDTRKINTRIN_H */
--- a/lib/include/vecintrin.h
+++ b/lib/include/vecintrin.h
--- a/lib/include/wasm_simd128.h
+++ b/lib/include/wasm_simd128.h
--- a/lib/include/x86intrin.h
+++ b/lib/include/x86intrin.h
@ -14,39 +14,48 @@

 #include <immintrin.h>

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__3dNOW__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__3dNOW__)
 #include <mm3dnow.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__PRFCHW__)
 #include <prfchwintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE4A__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__SSE4A__)
 #include <ammintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA4__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__FMA4__)
 #include <fma4intrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XOP__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__XOP__)
 #include <xopintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__TBM__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__TBM__)
 #include <tbmintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LWP__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__LWP__)
 #include <lwpintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__MWAITX__)
 #include <mwaitxintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLZERO__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__CLZERO__)
 #include <clzerointrin.h>
 #endif

--- a/lib/include/xmmintrin.h
+++ b/lib/include/xmmintrin.h
@ -2931,31 +2931,31 @@ _mm_movemask_ps(__m128 __a)

 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

-#define _MM_EXCEPT_INVALID    (0x0001)
-#define _MM_EXCEPT_DENORM     (0x0002)
-#define _MM_EXCEPT_DIV_ZERO   (0x0004)
-#define _MM_EXCEPT_OVERFLOW   (0x0008)
-#define _MM_EXCEPT_UNDERFLOW  (0x0010)
-#define _MM_EXCEPT_INEXACT    (0x0020)
-#define _MM_EXCEPT_MASK       (0x003f)
+#define _MM_EXCEPT_INVALID    (0x0001U)
+#define _MM_EXCEPT_DENORM     (0x0002U)
+#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
+#define _MM_EXCEPT_OVERFLOW   (0x0008U)
+#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
+#define _MM_EXCEPT_INEXACT    (0x0020U)
+#define _MM_EXCEPT_MASK       (0x003fU)

-#define _MM_MASK_INVALID      (0x0080)
-#define _MM_MASK_DENORM       (0x0100)
-#define _MM_MASK_DIV_ZERO     (0x0200)
-#define _MM_MASK_OVERFLOW     (0x0400)
-#define _MM_MASK_UNDERFLOW    (0x0800)
-#define _MM_MASK_INEXACT      (0x1000)
-#define _MM_MASK_MASK         (0x1f80)
+#define _MM_MASK_INVALID      (0x0080U)
+#define _MM_MASK_DENORM       (0x0100U)
+#define _MM_MASK_DIV_ZERO     (0x0200U)
+#define _MM_MASK_OVERFLOW     (0x0400U)
+#define _MM_MASK_UNDERFLOW    (0x0800U)
+#define _MM_MASK_INEXACT      (0x1000U)
+#define _MM_MASK_MASK         (0x1f80U)

-#define _MM_ROUND_NEAREST     (0x0000)
-#define _MM_ROUND_DOWN        (0x2000)
-#define _MM_ROUND_UP          (0x4000)
-#define _MM_ROUND_TOWARD_ZERO (0x6000)
-#define _MM_ROUND_MASK        (0x6000)
+#define _MM_ROUND_NEAREST     (0x0000U)
+#define _MM_ROUND_DOWN        (0x2000U)
+#define _MM_ROUND_UP          (0x4000U)
+#define _MM_ROUND_TOWARD_ZERO (0x6000U)
+#define _MM_ROUND_MASK        (0x6000U)

-#define _MM_FLUSH_ZERO_MASK   (0x8000)
-#define _MM_FLUSH_ZERO_ON     (0x8000)
-#define _MM_FLUSH_ZERO_OFF    (0x0000)
+#define _MM_FLUSH_ZERO_MASK   (0x8000U)
+#define _MM_FLUSH_ZERO_ON     (0x8000U)
+#define _MM_FLUSH_ZERO_OFF    (0x0000U)

 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
--- a/lib/libcxx/include/__bit_reference
+++ b/lib/libcxx/include/__bit_reference
@ -1122,6 +1122,21 @@ public:
    __bit_iterator(const __type_for_copy_to_const& __it) _NOEXCEPT
        : __seg_(__it.__seg_), __ctz_(__it.__ctz_) {}

+    // The non-const __bit_iterator has historically had a non-trivial
+    // copy constructor (as a quirk of its construction). We need to maintain
+    // this for ABI purposes.
+    using __type_for_abi_non_trivial_copy_ctor =
+      _If<!_IsConst, __bit_iterator, struct __private_nat>;
+
+    _LIBCPP_INLINE_VISIBILITY
+    __bit_iterator(__type_for_abi_non_trivial_copy_ctor const& __it) _NOEXCEPT
+      : __seg_(__it.__seg_), __ctz_(__it.__ctz_) {}
+
+    // Always declare the copy assignment operator since the implicit declaration
+    // is deprecated.
+    _LIBCPP_INLINE_VISIBILITY
+    __bit_iterator& operator=(__bit_iterator const&) = default;
+
    _LIBCPP_INLINE_VISIBILITY reference operator*() const _NOEXCEPT
        {return reference(__seg_, __storage_type(1) << __ctz_);}

--- a/lib/libcxx/include/__config
+++ b/lib/libcxx/include/__config
@ -32,7 +32,7 @@
 #  define _GNUC_VER_NEW 0
 #endif

-#define _LIBCPP_VERSION 10000
+#define _LIBCPP_VERSION 11000

 #ifndef _LIBCPP_ABI_VERSION
 #  define _LIBCPP_ABI_VERSION 1
@ -102,6 +102,9 @@
 #  define _LIBCPP_ABI_OPTIMIZED_FUNCTION
 // All the regex constants must be distinct and nonzero.
 #  define _LIBCPP_ABI_REGEX_CONSTANTS_NONZERO
+// Re-worked external template instantiations for std::string with a focus on
+// performance and fast-path inlining.
+#  define _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION
 #elif _LIBCPP_ABI_VERSION == 1
 #  if !defined(_LIBCPP_OBJECT_FORMAT_COFF)
 // Enable compiling copies of now inline methods into the dylib to support
@ -395,12 +398,12 @@

 #if defined(_LIBCPP_COMPILER_CLANG)

-// _LIBCPP_ALTERNATE_STRING_LAYOUT is an old name for
-// _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT left here for backward compatibility.
-#if (defined(__APPLE__) && !defined(__i386__) && !defined(__x86_64__) &&       \
-     (!defined(__arm__) || __ARM_ARCH_7K__ >= 2)) ||                           \
-    defined(_LIBCPP_ALTERNATE_STRING_LAYOUT)
-#define _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+#if defined(_LIBCPP_ALTERNATE_STRING_LAYOUT)
+#  error _LIBCPP_ALTERNATE_STRING_LAYOUT is deprecated, please use _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT instead
+#endif
+#if defined(__APPLE__) && !defined(__i386__) && !defined(__x86_64__) &&       \
+    (!defined(__arm__) || __ARM_ARCH_7K__ >= 2)
+#  define _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
 #endif

 #if __has_feature(cxx_alignas)
@ -416,12 +419,8 @@ typedef __char16_t char16_t;
 typedef __char32_t char32_t;
 #endif

-#if !(__has_feature(cxx_exceptions)) && !defined(_LIBCPP_NO_EXCEPTIONS)
-#define _LIBCPP_NO_EXCEPTIONS
-#endif
-
-#if !(__has_feature(cxx_rtti)) && !defined(_LIBCPP_NO_RTTI)
-#define _LIBCPP_NO_RTTI
+#if !__has_feature(cxx_exceptions)
+#  define _LIBCPP_NO_EXCEPTIONS
 #endif

 #if !(__has_feature(cxx_strong_enums))
@ -467,6 +466,14 @@ typedef __char32_t char32_t;
 #define _LIBCPP_HAS_OBJC_ARC_WEAK
 #endif

+#if __has_extension(blocks)
+#  define _LIBCPP_HAS_EXTENSION_BLOCKS
+#endif
+
+#if defined(_LIBCPP_HAS_EXTENSION_BLOCKS) && defined(__APPLE__)
+#  define _LIBCPP_HAS_BLOCKS_RUNTIME
+#endif
+
 #if !(__has_feature(cxx_relaxed_constexpr))
 #define _LIBCPP_HAS_NO_CXX14_CONSTEXPR
 #endif
@ -492,6 +499,10 @@ typedef __char32_t char32_t;
 #define _LIBCPP_COMPILER_HAS_BUILTIN_LAUNDER
 #endif

+#if __has_builtin(__builtin_constant_p)
+#define _LIBCPP_COMPILER_HAS_BUILTIN_CONSTANT_P
+#endif
+
 #if !__is_identifier(__has_unique_object_representations)
 #define _LIBCPP_HAS_UNIQUE_OBJECT_REPRESENTATIONS
 #endif
@ -513,8 +524,8 @@ typedef __char32_t char32_t;

 #define _LIBCPP_NORETURN __attribute__((noreturn))

-#if !__EXCEPTIONS && !defined(_LIBCPP_NO_EXCEPTIONS)
-#define _LIBCPP_NO_EXCEPTIONS
+#if !__EXCEPTIONS
+#  define _LIBCPP_NO_EXCEPTIONS
 #endif

 // Determine if GCC supports relaxed constexpr
@ -533,9 +544,7 @@ typedef __char32_t char32_t;

 #if _GNUC_VER >= 700
 #define _LIBCPP_COMPILER_HAS_BUILTIN_LAUNDER
-#endif
-
-#if _GNUC_VER >= 700
+#define _LIBCPP_COMPILER_HAS_BUILTIN_CONSTANT_P
 #define _LIBCPP_HAS_UNIQUE_OBJECT_REPRESENTATIONS
 #endif

@ -745,14 +754,14 @@ typedef __char32_t char32_t;
 #  endif
 #endif

-#ifndef _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT
-# ifdef _LIBCPP_OBJECT_FORMAT_COFF // Windows binaries can't merge typeinfos.
-# define _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT 0
-#else
-// TODO: This isn't strictly correct on ELF platforms due to llvm.org/PR37398
-// And we should consider defaulting to OFF.
-# define _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT 1
-#endif
+#ifndef _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION
+#  ifdef _LIBCPP_OBJECT_FORMAT_COFF // Windows binaries can't merge typeinfos.
+#    define _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION 2
+#  else
+     // TODO: This isn't strictly correct on ELF platforms due to llvm.org/PR37398
+     // And we should consider defaulting to OFF.
+#    define _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION 1
+#  endif
 #endif

 #ifndef _LIBCPP_HIDE_FROM_ABI
@ -901,6 +910,10 @@ typedef unsigned int   char32_t;
 #define _LIBCPP_EXTERN_TEMPLATE2(...) extern template __VA_ARGS__;
 #endif

+#ifndef _LIBCPP_EXTERN_TEMPLATE_DEFINE
+#define _LIBCPP_EXTERN_TEMPLATE_DEFINE(...) template __VA_ARGS__;
+#endif
+
 #if defined(__APPLE__) || defined(__FreeBSD__) || defined(_LIBCPP_MSVCRT_LIKE) || \
    defined(__sun__) || defined(__NetBSD__) || defined(__CloudABI__)
 #define _LIBCPP_LOCALE__L_EXTENSIONS 1
@ -1092,17 +1105,12 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(
 #endif

 // Try to find out if RTTI is disabled.
-// g++ and cl.exe have RTTI on by default and define a macro when it is.
-// g++ only defines the macro in 4.3.2 and onwards.
-#if !defined(_LIBCPP_NO_RTTI)
-#  if defined(__GNUC__) && \
-      ((__GNUC__ >= 5) || \
-       (__GNUC__ == 4 && (__GNUC_MINOR__ >= 3 || __GNUC_PATCHLEVEL__ >= 2))) && \
-      !defined(__GXX_RTTI)
-#    define _LIBCPP_NO_RTTI
-#  elif defined(_LIBCPP_COMPILER_MSVC) && !defined(_CPPRTTI)
-#    define _LIBCPP_NO_RTTI
-#  endif
+#if defined(_LIBCPP_COMPILER_CLANG) && !__has_feature(cxx_rtti)
+#  define _LIBCPP_NO_RTTI
+#elif defined(__GNUC__) && !defined(__GXX_RTTI)
+#  define _LIBCPP_NO_RTTI
+#elif defined(_LIBCPP_COMPILER_MSVC) && !defined(_CPPRTTI)
+#  define _LIBCPP_NO_RTTI
 #endif

 #ifndef _LIBCPP_WEAK
@ -1125,7 +1133,8 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(
      (defined(__MINGW32__) && __has_include(<pthread.h>))
 #    define _LIBCPP_HAS_THREAD_API_PTHREAD
 #  elif defined(__Fuchsia__)
-#    define _LIBCPP_HAS_THREAD_API_C11
+     // TODO(44575): Switch to C11 thread API when possible.
+#    define _LIBCPP_HAS_THREAD_API_PTHREAD
 #  elif defined(_LIBCPP_WIN32API)
 #    define _LIBCPP_HAS_THREAD_API_WIN32
 #  else
@ -1360,6 +1369,7 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(

 // Decide whether to use availability macros.
 #if !defined(_LIBCPP_BUILDING_LIBRARY) &&                                      \
+    !defined(_LIBCXXABI_BUILDING_LIBRARY) &&                                   \
    !defined(_LIBCPP_DISABLE_AVAILABILITY) &&                                  \
    __has_feature(attribute_availability_with_strict) &&                       \
    __has_feature(attribute_availability_in_templates) &&                      \
@ -1377,10 +1387,10 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(
     __attribute__((availability(tvos,strict,introduced=10.0)))                \
     __attribute__((availability(watchos,strict,introduced=3.0)))
 #  define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS                             \
-     __attribute__((availability(macosx,strict,introduced=10.14)))             \
-     __attribute__((availability(ios,strict,introduced=12.0)))                 \
-     __attribute__((availability(tvos,strict,introduced=12.0)))                \
-     __attribute__((availability(watchos,strict,introduced=5.0)))
+     __attribute__((availability(macosx,strict,introduced=10.13)))             \
+     __attribute__((availability(ios,strict,introduced=11.0)))                 \
+     __attribute__((availability(tvos,strict,introduced=11.0)))                \
+     __attribute__((availability(watchos,strict,introduced=4.0)))
 #  define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS                              \
     _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS
 #  define _LIBCPP_AVAILABILITY_BAD_ANY_CAST                                    \
@ -1421,6 +1431,10 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(
     _Pragma("clang attribute pop")                                            \
     _Pragma("clang attribute pop")                                            \
     _Pragma("clang attribute pop")
+#  define _LIBCPP_AVAILABILITY_TO_CHARS                                        \
+     _LIBCPP_AVAILABILITY_FILESYSTEM
+#  define _LIBCPP_AVAILABILITY_SYNC                                            \
+     __attribute__((unavailable))
 #else
 #  define _LIBCPP_AVAILABILITY_SHARED_MUTEX
 #  define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS
@ -1435,6 +1449,8 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(
 #  define _LIBCPP_AVAILABILITY_FILESYSTEM
 #  define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH
 #  define _LIBCPP_AVAILABILITY_FILESYSTEM_POP
+#  define _LIBCPP_AVAILABILITY_TO_CHARS
+#  define _LIBCPP_AVAILABILITY_SYNC
 #endif

 // Define availability that depends on _LIBCPP_NO_EXCEPTIONS.
@ -1518,6 +1534,19 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(
 #  define _LIBCPP_FOPEN_CLOEXEC_MODE
 #endif

+#ifdef _LIBCPP_COMPILER_HAS_BUILTIN_CONSTANT_P
+#define _LIBCPP_BUILTIN_CONSTANT_P(x) __builtin_constant_p(x)
+#else
+#define _LIBCPP_BUILTIN_CONSTANT_P(x) false
+#endif
+
+// Support for _FILE_OFFSET_BITS=64 landed gradually in Android, so the full set
+// of functions used in cstdio may not be available for low API levels when
+// using 64-bit file offsets on LP32.
+#if defined(__BIONIC__) && defined(__USE_FILE_OFFSET64) && __ANDROID_API__ < 24
+#define _LIBCPP_HAS_NO_FGETPOS_FSETPOS
+#endif
+
 #endif // __cplusplus

 #endif // _LIBCPP_CONFIG
--- a/lib/libcxx/include/__config_site.in
+++ b/lib/libcxx/include/__config_site.in
@ -27,7 +27,9 @@
 #cmakedefine _LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL
 #cmakedefine _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS
 #cmakedefine _LIBCPP_NO_VCRUNTIME
-#cmakedefine01 _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT
+#ifndef _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION
+#cmakedefine _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION @_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION@
+#endif
 #cmakedefine _LIBCPP_ABI_NAMESPACE @_LIBCPP_ABI_NAMESPACE@
 #cmakedefine _LIBCPP_HAS_PARALLEL_ALGORITHMS

--- a/lib/libcxx/include/__functional_base
+++ b/lib/libcxx/include/__functional_base
@ -522,7 +522,7 @@ inline _LIBCPP_INLINE_VISIBILITY
 reference_wrapper<_Tp>
 ref(reference_wrapper<_Tp> __t) _NOEXCEPT
 {
-    return ref(__t.get());
+    return _VSTD::ref(__t.get());
 }

 template <class _Tp>
@ -538,7 +538,7 @@ inline _LIBCPP_INLINE_VISIBILITY
 reference_wrapper<const _Tp>
 cref(reference_wrapper<_Tp> __t) _NOEXCEPT
 {
-    return cref(__t.get());
+    return _VSTD::cref(__t.get());
 }

 #ifndef _LIBCPP_CXX03_LANG
--- a/lib/libcxx/include/__libcpp_version
+++ b/lib/libcxx/include/__libcpp_version
@ -1 +1 @@
-10000
+11000
--- a/lib/libcxx/include/__locale
+++ b/lib/libcxx/include/__locale
@ -496,7 +496,13 @@ public:
    static const mask punct  = 1<<7;
    static const mask xdigit = 1<<8;
    static const mask blank  = 1<<9;
+#if defined(__BIONIC__)
+    // Historically this was a part of regex_traits rather than ctype_base. The
+    // historical value of the constant is preserved for ABI compatibility.
+    static const mask __regex_word = 0x8000;
+#else
    static const mask __regex_word = 1<<10;
+#endif // defined(__BIONIC__)
 #endif
    static const mask alnum  = alpha | digit;
    static const mask graph  = alnum | punct;
--- a/lib/libcxx/include/__string
+++ b/lib/libcxx/include/__string
@ -70,6 +70,123 @@ _LIBCPP_PUSH_MACROS

 _LIBCPP_BEGIN_NAMESPACE_STD

+// The the extern template ABI lists are kept outside of <string> to improve the
+// readability of that header.
+
+// The extern template ABI lists are kept outside of <string> to improve the
+// readability of that header. We maintain 2 ABI lists:
+// - _LIBCPP_STRING_V1_EXTERN_TEMPLATE_LIST
+// - _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST
+// As the name implies, the ABI lists define the V1 (Stable) and unstable ABI.
+//
+// For unstable, we may explicitly remove function that are external in V1,
+// and add (new) external functions to better control inlining and compiler
+// optimization opportunities.
+//
+// For stable, the ABI list should rarely change, except for adding new
+// functions supporting new c++ version / API changes. Typically entries
+// must never be removed from the stable list.
+#define _LIBCPP_STRING_V1_EXTERN_TEMPLATE_LIST(_Func, _CharType) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::rfind(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__init(value_type const*, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::basic_string(basic_string const&)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::basic_string(basic_string const&, std::allocator<_CharType> const&)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find_last_not_of(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::~basic_string()) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find_first_not_of(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::insert(size_type, size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::operator=(value_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__init(value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS const _CharType& basic_string<_CharType>::at(size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::insert(size_type, value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find_first_of(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::assign(value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::reserve(size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::append(value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::assign(basic_string const&, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::copy(value_type*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::basic_string(basic_string const&, size_type, size_type, std::allocator<_CharType> const&)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find(value_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__init(size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::insert(size_type, value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find_last_of(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__grow_by(size_type, size_type, size_type, size_type, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__grow_by_and_replace(size_type, size_type, size_type, size_type, size_type, size_type, value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::push_back(value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::append(size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::rfind(value_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS const basic_string<_CharType>::size_type basic_string<_CharType>::npos) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::assign(size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::erase(size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::append(basic_string const&, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS int basic_string<_CharType>::compare(value_type const*) const) \
+  _Func(_LIBCPP_FUNC_VIS int basic_string<_CharType>::compare(size_type, size_type, value_type const*) const) \
+  _Func(_LIBCPP_FUNC_VIS _CharType& basic_string<_CharType>::at(size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::assign(value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS int basic_string<_CharType>::compare(size_type, size_type, basic_string const&, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS int basic_string<_CharType>::compare(size_type, size_type, value_type const*, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::operator=(basic_string const&)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::append(value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, basic_string const&, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::iterator basic_string<_CharType>::insert(basic_string::const_iterator, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::resize(size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::insert(size_type, basic_string const&, size_type, size_type))
+
+#define _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST(_Func, _CharType) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::rfind(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__init(value_type const*, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find_last_not_of(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::~basic_string()) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find_first_not_of(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::insert(size_type, size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::operator=(value_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__init(value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__init_copy_ctor_external(value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS const _CharType& basic_string<_CharType>::at(size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::insert(size_type, value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find_first_of(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::__assign_external(value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::__assign_external(value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::reserve(size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::append(value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::assign(basic_string const&, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::copy(value_type*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::basic_string(basic_string const&, size_type, size_type, std::allocator<_CharType> const&)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find(value_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__init(size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::insert(size_type, value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find_last_of(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__grow_by(size_type, size_type, size_type, size_type, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__grow_by_and_replace(size_type, size_type, size_type, size_type, size_type, size_type, value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::__assign_no_alias<false>(value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::__assign_no_alias<true>(value_type const*, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::push_back(value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::append(size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::rfind(value_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS const basic_string<_CharType>::size_type basic_string<_CharType>::npos) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::assign(size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::__erase_external_with_move(size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::append(basic_string const&, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS int basic_string<_CharType>::compare(value_type const*) const) \
+  _Func(_LIBCPP_FUNC_VIS int basic_string<_CharType>::compare(size_type, size_type, value_type const*) const) \
+  _Func(_LIBCPP_FUNC_VIS _CharType& basic_string<_CharType>::at(size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::size_type basic_string<_CharType>::find(value_type const*, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS int basic_string<_CharType>::compare(size_type, size_type, basic_string const&, size_type, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS int basic_string<_CharType>::compare(size_type, size_type, value_type const*, size_type) const) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::append(value_type const*)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, basic_string const&, size_type, size_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>::iterator basic_string<_CharType>::insert(basic_string::const_iterator, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS void basic_string<_CharType>::resize(size_type, value_type)) \
+  _Func(_LIBCPP_FUNC_VIS basic_string<_CharType>& basic_string<_CharType>::insert(size_type, basic_string const&, size_type, size_type))
+
+
 // char_traits

 template <class _CharT>
--- a/lib/libcxx/include/__threading_support
+++ b/lib/libcxx/include/__threading_support
@ -26,6 +26,12 @@
 #if defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
 # include <pthread.h>
 # include <sched.h>
+# ifdef __APPLE__
+#  define _LIBCPP_NO_NATIVE_SEMAPHORES
+# endif
+# ifndef _LIBCPP_NO_NATIVE_SEMAPHORES
+# include <semaphore.h>
+# endif
 #elif defined(_LIBCPP_HAS_THREAD_API_C11)
 # include <threads.h>
 #endif
@ -65,6 +71,12 @@ typedef pthread_mutex_t __libcpp_recursive_mutex_t;
 typedef pthread_cond_t __libcpp_condvar_t;
 #define _LIBCPP_CONDVAR_INITIALIZER PTHREAD_COND_INITIALIZER

+#ifndef _LIBCPP_NO_NATIVE_SEMAPHORES
+// Semaphore
+typedef sem_t __libcpp_semaphore_t;
+# define _LIBCPP_SEMAPHORE_MAX SEM_VALUE_MAX
+#endif
+
 // Execute once
 typedef pthread_once_t __libcpp_exec_once_flag;
 #define _LIBCPP_EXEC_ONCE_INITIALIZER PTHREAD_ONCE_INIT
@ -127,6 +139,9 @@ typedef void* __libcpp_recursive_mutex_t[5];
 typedef void* __libcpp_condvar_t;
 #define _LIBCPP_CONDVAR_INITIALIZER 0

+// Semaphore
+typedef void* __libcpp_semaphore_t;
+
 // Execute Once
 typedef void* __libcpp_exec_once_flag;
 #define _LIBCPP_EXEC_ONCE_INITIALIZER 0
@ -191,6 +206,26 @@ int __libcpp_condvar_timedwait(__libcpp_condvar_t *__cv, __libcpp_mutex_t *__m,
 _LIBCPP_THREAD_ABI_VISIBILITY
 int __libcpp_condvar_destroy(__libcpp_condvar_t* __cv);

+#ifndef _LIBCPP_NO_NATIVE_SEMAPHORES
+
+// Semaphore
+_LIBCPP_THREAD_ABI_VISIBILITY
+bool __libcpp_semaphore_init(__libcpp_semaphore_t* __sem, int __init);
+
+_LIBCPP_THREAD_ABI_VISIBILITY
+bool __libcpp_semaphore_destroy(__libcpp_semaphore_t* __sem);
+
+_LIBCPP_THREAD_ABI_VISIBILITY
+bool __libcpp_semaphore_post(__libcpp_semaphore_t* __sem);
+
+_LIBCPP_THREAD_ABI_VISIBILITY
+bool __libcpp_semaphore_wait(__libcpp_semaphore_t* __sem);
+
+_LIBCPP_THREAD_ABI_VISIBILITY
+bool __libcpp_semaphore_wait_timed(__libcpp_semaphore_t* __sem, chrono::nanoseconds const& __ns);
+
+#endif // _LIBCPP_NO_NATIVE_SEMAPHORES
+
 // Execute once
 _LIBCPP_THREAD_ABI_VISIBILITY
 int __libcpp_execute_once(__libcpp_exec_once_flag *flag,
@ -242,9 +277,52 @@ int __libcpp_tls_set(__libcpp_tls_key __key, void *__p);

 #endif // !defined(_LIBCPP_HAS_THREAD_API_EXTERNAL)

+struct __libcpp_timed_backoff_policy {
+  _LIBCPP_THREAD_ABI_VISIBILITY
+  bool operator()(chrono::nanoseconds __elapsed) const;
+};
+
+inline _LIBCPP_INLINE_VISIBILITY
+bool __libcpp_timed_backoff_policy::operator()(chrono::nanoseconds __elapsed) const
+{
+    if(__elapsed > chrono::milliseconds(128))
+        __libcpp_thread_sleep_for(chrono::milliseconds(8));
+    else if(__elapsed > chrono::microseconds(64))
+        __libcpp_thread_sleep_for(__elapsed / 2);
+    else if(__elapsed > chrono::microseconds(4))
+      __libcpp_thread_yield();
+    else
+      ; // poll
+    return false;
+}
+
+static _LIBCPP_CONSTEXPR const int __libcpp_polling_count = 64;
+
+template<class _Fn, class _BFn>
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+bool __libcpp_thread_poll_with_backoff(
+  _Fn && __f, _BFn && __bf, chrono::nanoseconds __max_elapsed = chrono::nanoseconds::zero())
+{
+    auto const __start = chrono::high_resolution_clock::now();
+    for(int __count = 0;;) {
+      if(__f())
+        return true; // _Fn completion means success
+      if(__count < __libcpp_polling_count) {
+        __count += 1;
+        continue;
+      }
+      chrono::nanoseconds const __elapsed = chrono::high_resolution_clock::now() - __start;
+      if(__max_elapsed != chrono::nanoseconds::zero() && __max_elapsed < __elapsed)
+          return false; // timeout failure
+      if(__bf(__elapsed))
+        return false; // _BFn completion means failure
+    }
+}
+
 #if (!defined(_LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL) || \
     defined(_LIBCPP_BUILDING_THREAD_LIBRARY_EXTERNAL))

+
 namespace __thread_detail {

 inline __libcpp_timespec_t __convert_to_timespec(const chrono::nanoseconds& __ns)
@ -364,6 +442,38 @@ int __libcpp_condvar_destroy(__libcpp_condvar_t *__cv)
  return pthread_cond_destroy(__cv);
 }

+#ifndef _LIBCPP_NO_NATIVE_SEMAPHORES
+
+// Semaphore
+bool __libcpp_semaphore_init(__libcpp_semaphore_t* __sem, int __init)
+{
+    return sem_init(__sem, 0, __init) == 0;
+}
+
+bool __libcpp_semaphore_destroy(__libcpp_semaphore_t* __sem)
+{
+    return sem_destroy(__sem) == 0;
+}
+
+bool __libcpp_semaphore_post(__libcpp_semaphore_t* __sem)
+{
+    return sem_post(__sem) == 0;
+}
+
+bool __libcpp_semaphore_wait(__libcpp_semaphore_t* __sem)
+{
+    return sem_wait(__sem) == 0;
+}
+
+bool __libcpp_semaphore_wait_timed(__libcpp_semaphore_t* __sem, chrono::nanoseconds const& __ns)
+{
+    auto const __abs_time = chrono::system_clock::now().time_since_epoch() + __ns;
+    __libcpp_timespec_t __ts = __thread_detail::__convert_to_timespec(__abs_time);
+    return sem_timedwait(__sem, &__ts) == 0;
+}
+
+#endif //_LIBCPP_NO_NATIVE_SEMAPHORES
+
 // Execute once
 int __libcpp_execute_once(__libcpp_exec_once_flag *flag,
                          void (*init_routine)()) {
@ -600,6 +710,7 @@ int __libcpp_tls_set(__libcpp_tls_key __key, void *__p)

 #endif

+
 #endif // !_LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL || _LIBCPP_BUILDING_THREAD_LIBRARY_EXTERNAL

 class _LIBCPP_TYPE_VIS thread;
--- a/lib/libcxx/include/array
+++ b/lib/libcxx/include/array
@ -32,24 +32,24 @@ struct array
    typedef std::reverse_iterator<const_iterator> const_reverse_iterator;

    // No explicit construct/copy/destroy for aggregate type
-    void fill(const T& u);
-    void swap(array& a) noexcept(is_nothrow_swappable_v<T>);
+    void fill(const T& u);                                      // constexpr in C++20
+    void swap(array& a) noexcept(is_nothrow_swappable_v<T>);    // constexpr in C++20

    // iterators:
-    iterator begin() noexcept;
-    const_iterator begin() const noexcept;
-    iterator end() noexcept;
-    const_iterator end() const noexcept;
+    iterator begin() noexcept;                                  // constexpr in C++17
+    const_iterator begin() const noexcept;                      // constexpr in C++17
+    iterator end() noexcept;                                    // constexpr in C++17
+    const_iterator end() const noexcept;                        // constexpr in C++17

-    reverse_iterator rbegin() noexcept;
-    const_reverse_iterator rbegin() const noexcept;
-    reverse_iterator rend() noexcept;
-    const_reverse_iterator rend() const noexcept;
+    reverse_iterator rbegin() noexcept;                         // constexpr in C++17
+    const_reverse_iterator rbegin() const noexcept;             // constexpr in C++17
+    reverse_iterator rend() noexcept;                           // constexpr in C++17
+    const_reverse_iterator rend() const noexcept;               // constexpr in C++17

-    const_iterator cbegin() const noexcept;
-    const_iterator cend() const noexcept;
-    const_reverse_iterator crbegin() const noexcept;
-    const_reverse_iterator crend() const noexcept;
+    const_iterator cbegin() const noexcept;                     // constexpr in C++17
+    const_iterator cend() const noexcept;                       // constexpr in C++17
+    const_reverse_iterator crbegin() const noexcept;            // constexpr in C++17
+    const_reverse_iterator crend() const noexcept;              // constexpr in C++17

    // capacity:
    constexpr size_type size() const noexcept;
@ -57,46 +57,51 @@ struct array
    constexpr bool empty() const noexcept;

    // element access:
-    reference operator[](size_type n);
-    const_reference operator[](size_type n) const; // constexpr in C++14
-    const_reference at(size_type n) const; // constexpr in C++14
-    reference at(size_type n);
+    reference operator[](size_type n);                          // constexpr in C++17
+    const_reference operator[](size_type n) const;              // constexpr in C++14
+    reference at(size_type n);                                  // constexpr in C++17
+    const_reference at(size_type n) const;                      // constexpr in C++14

-    reference front();
-    const_reference front() const; // constexpr in C++14
-    reference back();
-    const_reference back() const; // constexpr in C++14
+    reference front();                                          // constexpr in C++17
+    const_reference front() const;                              // constexpr in C++14
+    reference back();                                           // constexpr in C++17
+    const_reference back() const;                               // constexpr in C++14

-    T* data() noexcept;
-    const T* data() const noexcept;
+    T* data() noexcept;                                         // constexpr in C++17
+    const T* data() const noexcept;                             // constexpr in C++17
 };

-  template <class T, class... U>
-    array(T, U...) -> array<T, 1 + sizeof...(U)>;
+template <class T, class... U>
+  array(T, U...) -> array<T, 1 + sizeof...(U)>;                 // C++17

 template <class T, size_t N>
-  bool operator==(const array<T,N>& x, const array<T,N>& y);
+  bool operator==(const array<T,N>& x, const array<T,N>& y);    // constexpr in C++20
 template <class T, size_t N>
-  bool operator!=(const array<T,N>& x, const array<T,N>& y);
+  bool operator!=(const array<T,N>& x, const array<T,N>& y);    // constexpr in C++20
 template <class T, size_t N>
-  bool operator<(const array<T,N>& x, const array<T,N>& y);
+  bool operator<(const array<T,N>& x, const array<T,N>& y);     // constexpr in C++20
 template <class T, size_t N>
-  bool operator>(const array<T,N>& x, const array<T,N>& y);
+  bool operator>(const array<T,N>& x, const array<T,N>& y);     // constexpr in C++20
 template <class T, size_t N>
-  bool operator<=(const array<T,N>& x, const array<T,N>& y);
+  bool operator<=(const array<T,N>& x, const array<T,N>& y);    // constexpr in C++20
 template <class T, size_t N>
-  bool operator>=(const array<T,N>& x, const array<T,N>& y);
+  bool operator>=(const array<T,N>& x, const array<T,N>& y);    // constexpr in C++20

 template <class T, size_t N >
-  void swap(array<T,N>& x, array<T,N>& y) noexcept(noexcept(x.swap(y))); // C++17
+  void swap(array<T,N>& x, array<T,N>& y) noexcept(noexcept(x.swap(y))); // constexpr in C++20
+
+template <class T, size_t N>
+  constexpr array<remove_cv_t<T>, N> to_array(T (&a)[N]);  // C++20
+template <class T, size_t N>
+  constexpr array<remove_cv_t<T>, N> to_array(T (&&a)[N]); // C++20

 template <class T> struct tuple_size;
 template <size_t I, class T> struct tuple_element;
 template <class T, size_t N> struct tuple_size<array<T, N>>;
 template <size_t I, class T, size_t N> struct tuple_element<I, array<T, N>>;
-template <size_t I, class T, size_t N> T& get(array<T, N>&) noexcept; // constexpr in C++14
-template <size_t I, class T, size_t N> const T& get(const array<T, N>&) noexcept; // constexpr in C++14
-template <size_t I, class T, size_t N> T&& get(array<T, N>&&) noexcept; // constexpr in C++14
+template <size_t I, class T, size_t N> T& get(array<T, N>&) noexcept;               // constexpr in C++14
+template <size_t I, class T, size_t N> const T& get(const array<T, N>&) noexcept;   // constexpr in C++14
+template <size_t I, class T, size_t N> T&& get(array<T, N>&&) noexcept;             // constexpr in C++14
 template <size_t I, class T, size_t N> const T&& get(const array<T, N>&&) noexcept; // constexpr in C++14

 }  // std
@ -143,13 +148,14 @@ struct _LIBCPP_TEMPLATE_VIS array
    _Tp __elems_[_Size];

    // No explicit construct/copy/destroy for aggregate type
-    _LIBCPP_INLINE_VISIBILITY void fill(const value_type& __u) {
-      _VSTD::fill_n(__elems_, _Size, __u);
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
+    void fill(const value_type& __u) {
+        _VSTD::fill_n(data(), _Size, __u);
    }

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
    void swap(array& __a) _NOEXCEPT_(__is_nothrow_swappable<_Tp>::value) {
-      std::swap_ranges(__elems_, __elems_ + _Size, __a.__elems_);
+        std::swap_ranges(data(), data() + _Size, __a.data());
    }

    // iterators:
@ -186,21 +192,38 @@ struct _LIBCPP_TEMPLATE_VIS array
    _LIBCPP_INLINE_VISIBILITY
    _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT {return _Size;}
    _LIBCPP_NODISCARD_AFTER_CXX17 _LIBCPP_INLINE_VISIBILITY
-    _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT {return false; }
+    _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT {return _Size == 0;}

    // element access:
    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
-    reference operator[](size_type __n)             _NOEXCEPT {return __elems_[__n];}
+    reference operator[](size_type __n) _NOEXCEPT {
+        _LIBCPP_ASSERT(__n < _Size, "out-of-bounds access in std::array<T, N>");
+        return __elems_[__n];
+    }
    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11
-    const_reference operator[](size_type __n) const _NOEXCEPT {return __elems_[__n];}
+    const_reference operator[](size_type __n) const _NOEXCEPT {
+        _LIBCPP_ASSERT(__n < _Size, "out-of-bounds access in std::array<T, N>");
+        return __elems_[__n];
+    }

-    _LIBCPP_CONSTEXPR_AFTER_CXX14       reference at(size_type __n);
-    _LIBCPP_CONSTEXPR_AFTER_CXX11 const_reference at(size_type __n) const;
+    _LIBCPP_CONSTEXPR_AFTER_CXX14 reference at(size_type __n)
+    {
+        if (__n >= _Size)
+            __throw_out_of_range("array::at");
+        return __elems_[__n];
+    }

-    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 reference front()             _NOEXCEPT {return __elems_[0];}
-    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11 const_reference front() const _NOEXCEPT {return __elems_[0];}
-    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 reference back()              _NOEXCEPT {return __elems_[_Size - 1];}
-    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11 const_reference back() const  _NOEXCEPT {return __elems_[_Size - 1];}
+    _LIBCPP_CONSTEXPR_AFTER_CXX11 const_reference at(size_type __n) const
+    {
+        if (__n >= _Size)
+            __throw_out_of_range("array::at");
+        return __elems_[__n];
+    }
+
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 reference front()             _NOEXCEPT {return (*this)[0];}
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11 const_reference front() const _NOEXCEPT {return (*this)[0];}
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14 reference back()              _NOEXCEPT {return (*this)[_Size - 1];}
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11 const_reference back() const  _NOEXCEPT {return (*this)[_Size - 1];}

    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    value_type* data() _NOEXCEPT {return __elems_;}
@ -208,28 +231,6 @@ struct _LIBCPP_TEMPLATE_VIS array
    const value_type* data() const _NOEXCEPT {return __elems_;}
 };

-
-template <class _Tp, size_t _Size>
-_LIBCPP_CONSTEXPR_AFTER_CXX14
-typename array<_Tp, _Size>::reference
-array<_Tp, _Size>::at(size_type __n)
-{
-    if (__n >= _Size)
-        __throw_out_of_range("array::at");
-
-    return __elems_[__n];
-}
-
-template <class _Tp, size_t _Size>
-_LIBCPP_CONSTEXPR_AFTER_CXX11
-typename array<_Tp, _Size>::const_reference
-array<_Tp, _Size>::at(size_type __n) const
-{
-    if (__n >= _Size)
-        __throw_out_of_range("array::at");
-    return __elems_[__n];
-}
-
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0>
 {
@ -253,44 +254,50 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0>
    struct  _ArrayInStructT { _Tp __data_[1]; };
    _ALIGNAS_TYPE(_ArrayInStructT) _CharType __elems_[sizeof(_ArrayInStructT)];

+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
+    value_type* data() _NOEXCEPT {return nullptr;}
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
+    const value_type* data() const _NOEXCEPT {return nullptr;}
+
    // No explicit construct/copy/destroy for aggregate type
-    _LIBCPP_INLINE_VISIBILITY void fill(const value_type&) {
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
+    void fill(const value_type&) {
      static_assert(!is_const<_Tp>::value,
                    "cannot fill zero-sized array of type 'const T'");
    }

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
    void swap(array&) _NOEXCEPT {
      static_assert(!is_const<_Tp>::value,
                    "cannot swap zero-sized array of type 'const T'");
    }

    // iterators:
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    iterator begin() _NOEXCEPT {return iterator(data());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    const_iterator begin() const _NOEXCEPT {return const_iterator(data());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    iterator end() _NOEXCEPT {return iterator(data());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    const_iterator end() const _NOEXCEPT {return const_iterator(data());}

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    reverse_iterator rbegin() _NOEXCEPT {return reverse_iterator(end());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    const_reverse_iterator rbegin() const _NOEXCEPT {return const_reverse_iterator(end());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    reverse_iterator rend() _NOEXCEPT {return reverse_iterator(begin());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    const_reverse_iterator rend() const _NOEXCEPT {return const_reverse_iterator(begin());}

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    const_iterator cbegin() const _NOEXCEPT {return begin();}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    const_iterator cend() const _NOEXCEPT {return end();}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    const_reverse_iterator crbegin() const _NOEXCEPT {return rbegin();}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    const_reverse_iterator crend() const _NOEXCEPT {return rend();}

    // capacity:
@ -302,7 +309,7 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0>
    _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT {return true;}

    // element access:
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    reference operator[](size_type) _NOEXCEPT {
      _LIBCPP_ASSERT(false, "cannot call array<T, 0>::operator[] on a zero-sized array");
      _LIBCPP_UNREACHABLE();
@ -314,52 +321,47 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0>
      _LIBCPP_UNREACHABLE();
    }

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    reference at(size_type) {
      __throw_out_of_range("array<T, 0>::at");
      _LIBCPP_UNREACHABLE();
    }

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11
    const_reference at(size_type) const {
      __throw_out_of_range("array<T, 0>::at");
      _LIBCPP_UNREACHABLE();
    }

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    reference front() _NOEXCEPT {
      _LIBCPP_ASSERT(false, "cannot call array<T, 0>::front() on a zero-sized array");
      _LIBCPP_UNREACHABLE();
    }

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11
    const_reference front() const _NOEXCEPT {
      _LIBCPP_ASSERT(false, "cannot call array<T, 0>::front() on a zero-sized array");
      _LIBCPP_UNREACHABLE();
    }

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
    reference back() _NOEXCEPT {
      _LIBCPP_ASSERT(false, "cannot call array<T, 0>::back() on a zero-sized array");
      _LIBCPP_UNREACHABLE();
    }

-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11
    const_reference back() const _NOEXCEPT {
      _LIBCPP_ASSERT(false, "cannot call array<T, 0>::back() on a zero-sized array");
      _LIBCPP_UNREACHABLE();
    }
-
-    _LIBCPP_INLINE_VISIBILITY
-    value_type* data() _NOEXCEPT {return reinterpret_cast<value_type*>(__elems_);}
-    _LIBCPP_INLINE_VISIBILITY
-    const value_type* data() const _NOEXCEPT {return reinterpret_cast<const value_type*>(__elems_);}
 };


 #ifndef _LIBCPP_HAS_NO_DEDUCTION_GUIDES
 template<class _Tp, class... _Args,
-         class = typename enable_if<(is_same_v<_Tp, _Args> && ...), void>::type
+         class = _EnableIf<__all<_IsSame<_Tp, _Args>::value...>::value>
         >
 array(_Tp, _Args...)
  -> array<_Tp, 1 + sizeof...(_Args)>;
@ -415,7 +417,7 @@ operator>=(const array<_Tp, _Size>& __x, const array<_Tp, _Size>& __y)
 }

 template <class _Tp, size_t _Size>
-inline _LIBCPP_INLINE_VISIBILITY
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
 typename enable_if
 <
    _Size == 0 ||
@ -479,6 +481,47 @@ get(const array<_Tp, _Size>&& __a) _NOEXCEPT

 #endif  // !_LIBCPP_CXX03_LANG

+#if _LIBCPP_STD_VER > 17
+
+template <typename _Tp, size_t _Size, size_t... _Index>
+_LIBCPP_INLINE_VISIBILITY constexpr array<remove_cv_t<_Tp>, _Size>
+__to_array_lvalue_impl(_Tp (&__arr)[_Size], index_sequence<_Index...>) {
+  return {{__arr[_Index]...}};
+}
+
+template <typename _Tp, size_t _Size, size_t... _Index>
+_LIBCPP_INLINE_VISIBILITY constexpr array<remove_cv_t<_Tp>, _Size>
+__to_array_rvalue_impl(_Tp(&&__arr)[_Size], index_sequence<_Index...>) {
+  return {{_VSTD::move(__arr[_Index])...}};
+}
+
+template <typename _Tp, size_t _Size>
+_LIBCPP_INLINE_VISIBILITY constexpr array<remove_cv_t<_Tp>, _Size>
+to_array(_Tp (&__arr)[_Size]) noexcept(is_nothrow_constructible_v<_Tp, _Tp&>) {
+  static_assert(
+      !is_array_v<_Tp>,
+      "[array.creation]/1: to_array does not accept multidimensional arrays.");
+  static_assert(
+      is_constructible_v<_Tp, _Tp&>,
+      "[array.creation]/1: to_array requires copy constructible elements.");
+  return __to_array_lvalue_impl(__arr, make_index_sequence<_Size>());
+}
+
+template <typename _Tp, size_t _Size>
+_LIBCPP_INLINE_VISIBILITY constexpr array<remove_cv_t<_Tp>, _Size>
+to_array(_Tp(&&__arr)[_Size]) noexcept(is_nothrow_move_constructible_v<_Tp>) {
+  static_assert(
+      !is_array_v<_Tp>,
+      "[array.creation]/4: to_array does not accept multidimensional arrays.");
+  static_assert(
+      is_move_constructible_v<_Tp>,
+      "[array.creation]/4: to_array requires move constructible elements.");
+  return __to_array_rvalue_impl(_VSTD::move(__arr),
+                                make_index_sequence<_Size>());
+}
+
+#endif // _LIBCPP_STD_VER > 17
+
 _LIBCPP_END_NAMESPACE_STD

 #endif  // _LIBCPP_ARRAY
--- a/lib/libcxx/include/atomic
+++ b/lib/libcxx/include/atomic
--- a/lib/libcxx/include/barrier
+++ b/lib/libcxx/include/barrier
@ -0,0 +1,322 @@
+// -*- C++ -*-
+//===--------------------------- barrier ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_BARRIER
+#define _LIBCPP_BARRIER
+
+/*
+    barrier synopsis
+
+namespace std
+{
+
+  template<class CompletionFunction = see below>
+  class barrier
+  {
+  public:
+    using arrival_token = see below;
+
+    constexpr explicit barrier(ptrdiff_t phase_count,
+                               CompletionFunction f = CompletionFunction());
+    ~barrier();
+
+    barrier(const barrier&) = delete;
+    barrier& operator=(const barrier&) = delete;
+
+    [[nodiscard]] arrival_token arrive(ptrdiff_t update = 1);
+    void wait(arrival_token&& arrival) const;
+
+    void arrive_and_wait();
+    void arrive_and_drop();
+
+  private:
+    CompletionFunction completion; // exposition only
+  };
+
+}
+
+*/
+
+#include <__config>
+#include <atomic>
+#ifndef _LIBCPP_HAS_NO_TREE_BARRIER
+# include <memory>
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+
+#ifdef _LIBCPP_HAS_NO_THREADS
+# error <barrier> is not supported on this single threaded system
+#endif
+
+#if _LIBCPP_STD_VER >= 14
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct __empty_completion
+{
+    inline _LIBCPP_INLINE_VISIBILITY
+    void operator()() noexcept
+    {
+    }
+};
+
+#ifndef _LIBCPP_HAS_NO_TREE_BARRIER
+
+/*
+
+The default implementation of __barrier_base is a classic tree barrier.
+
+It looks different from literature pseudocode for two main reasons:
+ 1. Threads that call into std::barrier functions do not provide indices,
+    so a numbering step is added before the actual barrier algorithm,
+    appearing as an N+1 round to the N rounds of the tree barrier.
+ 2. A great deal of attention has been paid to avoid cache line thrashing
+    by flattening the tree structure into cache-line sized arrays, that
+    are indexed in an efficient way.
+
+*/
+
+using __barrier_phase_t = uint8_t;
+
+class __barrier_algorithm_base;
+
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI
+__barrier_algorithm_base* __construct_barrier_algorithm_base(ptrdiff_t& __expected);
+
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI
+bool __arrive_barrier_algorithm_base(__barrier_algorithm_base* __barrier,
+                                     __barrier_phase_t __old_phase);
+
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI
+void __destroy_barrier_algorithm_base(__barrier_algorithm_base* __barrier);
+
+template<class _CompletionF>
+class __barrier_base {
+
+    ptrdiff_t                                               __expected;
+    unique_ptr<__barrier_algorithm_base,
+               void (*)(__barrier_algorithm_base*)>         __base;
+    __atomic_base<ptrdiff_t>                                __expected_adjustment;
+    _CompletionF                                            __completion;
+    __atomic_base<__barrier_phase_t>                        __phase;
+
+public:
+    using arrival_token = __barrier_phase_t;
+
+    static constexpr ptrdiff_t max() noexcept {
+        return numeric_limits<ptrdiff_t>::max();
+    }
+
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
+            : __expected(__expected), __base(__construct_barrier_algorithm_base(this->__expected),
+                                             &__destroy_barrier_algorithm_base),
+              __expected_adjustment(0), __completion(move(__completion)), __phase(0)
+    {
+    }
+    [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    arrival_token arrive(ptrdiff_t update)
+    {
+        auto const __old_phase = __phase.load(memory_order_relaxed);
+        for(; update; --update)
+            if(__arrive_barrier_algorithm_base(__base.get(), __old_phase)) {
+                __completion();
+                __expected += __expected_adjustment.load(memory_order_relaxed);
+                __expected_adjustment.store(0, memory_order_relaxed);
+                __phase.store(__old_phase + 2, memory_order_release);
+                __phase.notify_all();
+            }
+        return __old_phase;
+    }
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void wait(arrival_token&& __old_phase) const
+    {
+        auto const __test_fn = [=]() -> bool {
+            return __phase.load(memory_order_acquire) != __old_phase;
+        };
+        __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy());
+    }
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void arrive_and_drop()
+    {
+        __expected_adjustment.fetch_sub(1, memory_order_relaxed);
+        (void)arrive(1);
+    }
+};
+
+#else
+
+/*
+
+The alternative implementation of __barrier_base is a central barrier.
+
+Two versions of this algorithm are provided:
+ 1. A fairly straightforward implementation of the litterature for the
+    general case where the completion function is not empty.
+ 2. An optimized implementation that exploits 2's complement arithmetic
+    and well-defined overflow in atomic arithmetic, to handle the phase
+    roll-over for free.
+
+*/
+
+template<class _CompletionF>
+class __barrier_base {
+
+    __atomic_base<ptrdiff_t> __expected;
+    __atomic_base<ptrdiff_t> __arrived;
+    _CompletionF             __completion;
+    __atomic_base<bool>      __phase;
+public:
+    using arrival_token = bool;
+
+    static constexpr ptrdiff_t max() noexcept {
+        return numeric_limits<ptrdiff_t>::max();
+    }
+
+    _LIBCPP_INLINE_VISIBILITY
+    __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
+        : __expected(__expected), __arrived(__expected), __completion(move(__completion)), __phase(false)
+    {
+    }
+    [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    arrival_token arrive(ptrdiff_t update)
+    {
+        auto const __old_phase = __phase.load(memory_order_relaxed);
+        auto const __result = __arrived.fetch_sub(update, memory_order_acq_rel) - update;
+        auto const new_expected = __expected.load(memory_order_relaxed);
+        if(0 == __result) {
+            __completion();
+            __arrived.store(new_expected, memory_order_relaxed);
+            __phase.store(!__old_phase, memory_order_release);
+            __phase.notify_all();
+        }
+        return __old_phase;
+    }
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void wait(arrival_token&& __old_phase) const
+    {
+        __phase.wait(__old_phase, memory_order_acquire);
+    }
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void arrive_and_drop()
+    {
+        __expected.fetch_sub(1, memory_order_relaxed);
+        (void)arrive(1);
+    }
+};
+
+template<>
+class __barrier_base<__empty_completion> {
+
+    static constexpr uint64_t __expected_unit = 1ull;
+    static constexpr uint64_t __arrived_unit = 1ull << 32;
+    static constexpr uint64_t __expected_mask = __arrived_unit - 1;
+    static constexpr uint64_t __phase_bit = 1ull << 63;
+    static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask;
+
+    __atomic_base<uint64_t>   __phase_arrived_expected;
+
+    static _LIBCPP_INLINE_VISIBILITY
+    constexpr uint64_t __init(ptrdiff_t __count) _NOEXCEPT
+    {
+        return ((uint64_t(1u << 31) - __count) << 32)
+              | (uint64_t(1u << 31) - __count);
+    }
+
+public:
+    using arrival_token = uint64_t;
+
+    static constexpr ptrdiff_t max() noexcept {
+        return ptrdiff_t(1u << 31) - 1;
+    }
+
+    _LIBCPP_INLINE_VISIBILITY
+    explicit inline __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion())
+        : __phase_arrived_expected(__init(__count))
+    {
+    }
+    [[nodiscard]] inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    arrival_token arrive(ptrdiff_t update)
+    {
+        auto const __inc = __arrived_unit * update;
+        auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel);
+        if((__old ^ (__old + __inc)) & __phase_bit) {
+            __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed);
+            __phase_arrived_expected.notify_all();
+        }
+        return __old & __phase_bit;
+    }
+    inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void wait(arrival_token&& __phase) const
+    {
+        auto const __test_fn = [=]() -> bool {
+            uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
+            return ((__current & __phase_bit) != __phase);
+        };
+        __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy());
+    }
+    inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void arrive_and_drop()
+    {
+        __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);
+        (void)arrive(1);
+    }
+};
+
+#endif //_LIBCPP_HAS_NO_TREE_BARRIER
+
+template<class _CompletionF = __empty_completion>
+class barrier {
+
+    __barrier_base<_CompletionF> __b;
+public:
+    using arrival_token = typename __barrier_base<_CompletionF>::arrival_token;
+
+    static constexpr ptrdiff_t max() noexcept {
+        return __barrier_base<_CompletionF>::max();
+    }
+
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF())
+        : __b(__count, std::move(__completion)) {
+    }
+
+    barrier(barrier const&) = delete;
+    barrier& operator=(barrier const&) = delete;
+
+    [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    arrival_token arrive(ptrdiff_t update = 1)
+    {
+        return __b.arrive(update);
+    }
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void wait(arrival_token&& __phase) const
+    {
+        __b.wait(std::move(__phase));
+    }
+	_LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void arrive_and_wait()
+    {
+        wait(arrive());
+	}
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void arrive_and_drop()
+    {
+        __b.arrive_and_drop();
+    }
+};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 14
+
+#endif //_LIBCPP_BARRIER
--- a/lib/libcxx/include/bit
+++ b/lib/libcxx/include/bit
@ -15,6 +15,7 @@

 namespace std {

+  // [bit.pow.two], integral powers of 2
  template <class T>
    constexpr bool ispow2(T x) noexcept; // C++20
  template <class T>
@ -24,13 +25,13 @@ namespace std {
  template <class T>
    constexpr T log2p1(T x) noexcept;    // C++20

-  // 23.20.2, rotating
+  // [bit.rotate], rotating
  template<class T>
    constexpr T rotl(T x, unsigned int s) noexcept; // C++20
  template<class T>
    constexpr T rotr(T x, unsigned int s) noexcept; // C++20

-  // 23.20.3, counting
+  // [bit.count], counting
  template<class T>
    constexpr int countl_zero(T x) noexcept;  // C++20
  template<class T>
@ -42,7 +43,7 @@ namespace std {
  template<class T>
    constexpr int popcount(T x) noexcept;     // C++20

-  // 20.15.9, endian
+  // [bit.endian], endian
  enum class endian {
    little = see below,        // C++20
    big = see below,           // C++20
@ -350,7 +351,7 @@ _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
 bool __ispow2(_Tp __t) _NOEXCEPT
 {
    static_assert(__bitop_unsigned_integer<_Tp>::value, "__ispow2 requires unsigned");
-	return __t != 0 && (((__t & (__t - 1)) == 0));
+    return __t != 0 && (((__t & (__t - 1)) == 0));
 }


--- a/lib/libcxx/include/charconv
+++ b/lib/libcxx/include/charconv
@ -73,6 +73,7 @@ namespace std {

 */

+#include <__config>
 #include <__errc>
 #include <type_traits>
 #include <limits>
@ -92,8 +93,8 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD

 namespace __itoa {
-_LIBCPP_FUNC_VIS char* __u64toa(uint64_t __value, char* __buffer);
-_LIBCPP_FUNC_VIS char* __u32toa(uint32_t __value, char* __buffer);
+_LIBCPP_AVAILABILITY_TO_CHARS _LIBCPP_FUNC_VIS char* __u64toa(uint64_t __value, char* __buffer) _NOEXCEPT;
+_LIBCPP_AVAILABILITY_TO_CHARS _LIBCPP_FUNC_VIS char* __u32toa(uint32_t __value, char* __buffer) _NOEXCEPT;
 }

 #ifndef _LIBCPP_CXX03_LANG
@ -167,6 +168,7 @@ struct _LIBCPP_HIDDEN __traits_base
    }
 #endif

+    _LIBCPP_AVAILABILITY_TO_CHARS
    static _LIBCPP_INLINE_VISIBILITY char* __convert(_Tp __v, char* __p)
    {
        return __u64toa(__v, __p);
@ -189,6 +191,7 @@ struct _LIBCPP_HIDDEN
    }
 #endif

+    _LIBCPP_AVAILABILITY_TO_CHARS
    static _LIBCPP_INLINE_VISIBILITY char* __convert(_Tp __v, char* __p)
    {
        return __u32toa(__v, __p);
@ -292,6 +295,7 @@ __to_unsigned(_Tp __x)
 }

 template <typename _Tp>
+_LIBCPP_AVAILABILITY_TO_CHARS
 inline _LIBCPP_INLINE_VISIBILITY to_chars_result
 __to_chars_itoa(char* __first, char* __last, _Tp __value, true_type)
 {
@ -306,6 +310,7 @@ __to_chars_itoa(char* __first, char* __last, _Tp __value, true_type)
 }

 template <typename _Tp>
+_LIBCPP_AVAILABILITY_TO_CHARS
 inline _LIBCPP_INLINE_VISIBILITY to_chars_result
 __to_chars_itoa(char* __first, char* __last, _Tp __value, false_type)
 {
@ -337,6 +342,7 @@ __to_chars_itoa(char* __first, char* __last, _Tp __value, false_type)
 }

 template <typename _Tp>
+_LIBCPP_AVAILABILITY_TO_CHARS
 inline _LIBCPP_INLINE_VISIBILITY to_chars_result
 __to_chars_integral(char* __first, char* __last, _Tp __value, int __base,
                    true_type)
@ -352,6 +358,7 @@ __to_chars_integral(char* __first, char* __last, _Tp __value, int __base,
 }

 template <typename _Tp>
+_LIBCPP_AVAILABILITY_TO_CHARS
 inline _LIBCPP_INLINE_VISIBILITY to_chars_result
 __to_chars_integral(char* __first, char* __last, _Tp __value, int __base,
                    false_type)
@ -380,6 +387,7 @@ __to_chars_integral(char* __first, char* __last, _Tp __value, int __base,
 }

 template <typename _Tp, typename enable_if<is_integral<_Tp>::value, int>::type = 0>
+_LIBCPP_AVAILABILITY_TO_CHARS
 inline _LIBCPP_INLINE_VISIBILITY to_chars_result
 to_chars(char* __first, char* __last, _Tp __value)
 {
@ -387,6 +395,7 @@ to_chars(char* __first, char* __last, _Tp __value)
 }

 template <typename _Tp, typename enable_if<is_integral<_Tp>::value, int>::type = 0>
+_LIBCPP_AVAILABILITY_TO_CHARS
 inline _LIBCPP_INLINE_VISIBILITY to_chars_result
 to_chars(char* __first, char* __last, _Tp __value, int __base)
 {
--- a/lib/libcxx/include/chrono
+++ b/lib/libcxx/include/chrono
@ -2454,7 +2454,7 @@ chrono::day year_month_day_last::day() const noexcept
        chrono::day(31), chrono::day(31), chrono::day(30),
        chrono::day(31), chrono::day(30), chrono::day(31)
    };
-    return month() != February || !__y.is_leap() ?
+    return (month() != February || !__y.is_leap()) && month().ok() ?
        __d[static_cast<unsigned>(month()) - 1] : chrono::day{29};
 }

--- a/lib/libcxx/include/cmath
+++ b/lib/libcxx/include/cmath
@ -296,6 +296,10 @@ floating_point trunc (arithmetic x);
 float          truncf(float x);
 long double    truncl(long double x);

+constexpr float       lerp(float a, float b, float t) noexcept;                   // C++20
+constexpr double      lerp(double a, double b, double t) noexcept;                // C++20
+constexpr long double lerp(long double a, long double b, long double t) noexcept; // C++20
+
 }  // std

 */
--- a/lib/libcxx/include/codecvt
+++ b/lib/libcxx/include/codecvt
@ -102,11 +102,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <>
@ -137,11 +137,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <>
@ -172,11 +172,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <class _Elem, unsigned long _Maxcode = 0x10ffff,
@ -225,11 +225,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <>
@ -260,11 +260,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <>
@ -295,11 +295,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <>
@ -330,11 +330,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <>
@ -365,11 +365,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <>
@ -400,11 +400,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <class _Elem, unsigned long _Maxcode = 0x10ffff,
@ -453,11 +453,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <>
@ -488,11 +488,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <>
@ -523,11 +523,11 @@ protected:
    virtual result
        do_unshift(state_type& __st,
                   extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
-    virtual int do_encoding() const throw();
-    virtual bool do_always_noconv() const throw();
+    virtual int do_encoding() const _NOEXCEPT;
+    virtual bool do_always_noconv() const _NOEXCEPT;
    virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end,
                          size_t __mx) const;
-    virtual int do_max_length() const throw();
+    virtual int do_max_length() const _NOEXCEPT;
 };

 template <class _Elem, unsigned long _Maxcode = 0x10ffff,
--- a/lib/libcxx/include/compare
+++ b/lib/libcxx/include/compare
@ -43,6 +43,84 @@ namespace std {
  template<class T> constexpr partial_ordering partial_order(const T& a, const T& b);
  template<class T> constexpr strong_equality strong_equal(const T& a, const T& b);
  template<class T> constexpr weak_equality weak_equal(const T& a, const T& b);
+
+  // [cmp.partialord], Class partial_ordering
+  class partial_ordering {
+  public:
+    // valid values
+    static const partial_ordering less;
+    static const partial_ordering equivalent;
+    static const partial_ordering greater;
+    static const partial_ordering unordered;
+
+    // comparisons
+    friend constexpr bool operator==(partial_ordering v, unspecified) noexcept;
+    friend constexpr bool operator==(partial_ordering v, partial_ordering w) noexcept = default;
+    friend constexpr bool operator< (partial_ordering v, unspecified) noexcept;
+    friend constexpr bool operator> (partial_ordering v, unspecified) noexcept;
+    friend constexpr bool operator<=(partial_ordering v, unspecified) noexcept;
+    friend constexpr bool operator>=(partial_ordering v, unspecified) noexcept;
+    friend constexpr bool operator< (unspecified, partial_ordering v) noexcept;
+    friend constexpr bool operator> (unspecified, partial_ordering v) noexcept;
+    friend constexpr bool operator<=(unspecified, partial_ordering v) noexcept;
+    friend constexpr bool operator>=(unspecified, partial_ordering v) noexcept;
+    friend constexpr partial_ordering operator<=>(partial_ordering v, unspecified) noexcept;
+    friend constexpr partial_ordering operator<=>(unspecified, partial_ordering v) noexcept;
+  };
+
+  // [cmp.weakord], Class weak_ordering
+  class weak_ordering {
+  public:
+    // valid values
+    static const weak_ordering less;
+    static const weak_ordering equivalent;
+    static const weak_ordering greater;
+
+    // conversions
+    constexpr operator partial_ordering() const noexcept;
+
+    // comparisons
+    friend constexpr bool operator==(weak_ordering v, unspecified) noexcept;
+    friend constexpr bool operator==(weak_ordering v, weak_ordering w) noexcept = default;
+    friend constexpr bool operator< (weak_ordering v, unspecified) noexcept;
+    friend constexpr bool operator> (weak_ordering v, unspecified) noexcept;
+    friend constexpr bool operator<=(weak_ordering v, unspecified) noexcept;
+    friend constexpr bool operator>=(weak_ordering v, unspecified) noexcept;
+    friend constexpr bool operator< (unspecified, weak_ordering v) noexcept;
+    friend constexpr bool operator> (unspecified, weak_ordering v) noexcept;
+    friend constexpr bool operator<=(unspecified, weak_ordering v) noexcept;
+    friend constexpr bool operator>=(unspecified, weak_ordering v) noexcept;
+    friend constexpr weak_ordering operator<=>(weak_ordering v, unspecified) noexcept;
+    friend constexpr weak_ordering operator<=>(unspecified, weak_ordering v) noexcept;
+  };
+
+  // [cmp.strongord], Class strong_ordering
+  class strong_ordering {
+  public:
+    // valid values
+    static const strong_ordering less;
+    static const strong_ordering equal;
+    static const strong_ordering equivalent;
+    static const strong_ordering greater;
+
+    // conversions
+    constexpr operator partial_ordering() const noexcept;
+    constexpr operator weak_ordering() const noexcept;
+
+    // comparisons
+    friend constexpr bool operator==(strong_ordering v, unspecified) noexcept;
+    friend constexpr bool operator==(strong_ordering v, strong_ordering w) noexcept = default;
+    friend constexpr bool operator< (strong_ordering v, unspecified) noexcept;
+    friend constexpr bool operator> (strong_ordering v, unspecified) noexcept;
+    friend constexpr bool operator<=(strong_ordering v, unspecified) noexcept;
+    friend constexpr bool operator>=(strong_ordering v, unspecified) noexcept;
+    friend constexpr bool operator< (unspecified, strong_ordering v) noexcept;
+    friend constexpr bool operator> (unspecified, strong_ordering v) noexcept;
+    friend constexpr bool operator<=(unspecified, strong_ordering v) noexcept;
+    friend constexpr bool operator>=(unspecified, strong_ordering v) noexcept;
+    friend constexpr strong_ordering operator<=>(strong_ordering v, unspecified) noexcept;
+    friend constexpr strong_ordering operator<=>(unspecified, strong_ordering v) noexcept;
+  };
 }
 */

@ -248,6 +326,8 @@ public:
  _LIBCPP_INLINE_VISIBILITY friend constexpr bool operator>=(_CmpUnspecifiedParam, partial_ordering __v) noexcept;

 #ifndef _LIBCPP_HAS_NO_SPACESHIP_OPERATOR
+  _LIBCPP_INLINE_VISIBILITY friend constexpr bool operator==(partial_ordering, partial_ordering) noexcept = default;
+
  _LIBCPP_INLINE_VISIBILITY friend constexpr partial_ordering operator<=>(partial_ordering __v, _CmpUnspecifiedParam) noexcept;
  _LIBCPP_INLINE_VISIBILITY friend constexpr partial_ordering operator<=>(_CmpUnspecifiedParam, partial_ordering __v) noexcept;
 #endif
@ -364,6 +444,8 @@ public:
  _LIBCPP_INLINE_VISIBILITY friend constexpr bool operator>=(_CmpUnspecifiedParam, weak_ordering __v) noexcept;

 #ifndef _LIBCPP_HAS_NO_SPACESHIP_OPERATOR
+  _LIBCPP_INLINE_VISIBILITY friend constexpr bool operator==(weak_ordering, weak_ordering) noexcept = default;
+
  _LIBCPP_INLINE_VISIBILITY friend constexpr weak_ordering operator<=>(weak_ordering __v, _CmpUnspecifiedParam) noexcept;
  _LIBCPP_INLINE_VISIBILITY friend constexpr weak_ordering operator<=>(_CmpUnspecifiedParam, weak_ordering __v) noexcept;
 #endif
@ -490,6 +572,8 @@ public:
  _LIBCPP_INLINE_VISIBILITY friend constexpr bool operator>=(_CmpUnspecifiedParam, strong_ordering __v) noexcept;

 #ifndef _LIBCPP_HAS_NO_SPACESHIP_OPERATOR
+  _LIBCPP_INLINE_VISIBILITY friend constexpr bool operator==(strong_ordering, strong_ordering) noexcept = default;
+
  _LIBCPP_INLINE_VISIBILITY friend constexpr strong_ordering operator<=>(strong_ordering __v, _CmpUnspecifiedParam) noexcept;
  _LIBCPP_INLINE_VISIBILITY friend constexpr strong_ordering operator<=>(_CmpUnspecifiedParam, strong_ordering __v) noexcept;
 #endif
--- a/lib/libcxx/include/complex
+++ b/lib/libcxx/include/complex
@ -243,6 +243,7 @@ template<class T, class charT, class traits>
 #include <type_traits>
 #include <stdexcept>
 #include <cmath>
+#include <iosfwd>
 #include <sstream>
 #include <version>

@ -1406,10 +1407,10 @@ operator>>(basic_istream<_CharT, _Traits>& __is, complex<_Tp>& __x)
                            __x = complex<_Tp>(__r, __i);
                        }
                        else
-                            __is.setstate(ios_base::failbit);
+                            __is.setstate(__is.failbit);
                    }
                    else
-                        __is.setstate(ios_base::failbit);
+                        __is.setstate(__is.failbit);
                }
                else if (__c == _CharT(')'))
                {
@ -1417,10 +1418,10 @@ operator>>(basic_istream<_CharT, _Traits>& __is, complex<_Tp>& __x)
                    __x = complex<_Tp>(__r, _Tp(0));
                }
                else
-                    __is.setstate(ios_base::failbit);
+                    __is.setstate(__is.failbit);
            }
            else
-                __is.setstate(ios_base::failbit);
+                __is.setstate(__is.failbit);
        }
        else
        {
@ -1429,11 +1430,11 @@ operator>>(basic_istream<_CharT, _Traits>& __is, complex<_Tp>& __x)
            if (!__is.fail())
                __x = complex<_Tp>(__r, _Tp(0));
            else
-                __is.setstate(ios_base::failbit);
+                __is.setstate(__is.failbit);
        }
    }
    else
-        __is.setstate(ios_base::failbit);
+        __is.setstate(__is.failbit);
    return __is;
 }

--- a/lib/libcxx/include/concepts
+++ b/lib/libcxx/include/concepts
@ -0,0 +1,166 @@
+// -*- C++ -*-
+//===-------------------------- concepts ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_CONCEPTS
+#define _LIBCPP_CONCEPTS
+
+/*
+    concepts synopsis
+namespace std {
+  // [concepts.lang], language-related concepts
+  // [concept.same], concept same_as
+  template<class T, class U>
+    concept same_as = see below;
+
+  // [concept.derived], concept derived_from
+  template<class Derived, class Base>
+    concept derived_from = see below;
+
+  // [concept.convertible], concept convertible_to
+  template<class From, class To>
+    concept convertible_to = see below;
+
+  // [concept.commonref], concept common_reference_with
+  template<class T, class U>
+    concept common_reference_with = see below;
+
+  // [concept.common], concept common_with
+  template<class T, class U>
+    concept common_with = see below;
+
+  // [concepts.arithmetic], arithmetic concepts
+  template<class T>
+    concept integral = see below;
+  template<class T>
+    concept signed_integral = see below;
+  template<class T>
+    concept unsigned_integral = see below;
+  template<class T>
+    concept floating_point = see below;
+
+  // [concept.assignable], concept assignable_from
+  template<class LHS, class RHS>
+    concept assignable_from = see below;
+
+  // [concept.swappable], concept swappable
+  namespace ranges {
+    inline namespace unspecified {
+      inline constexpr unspecified swap = unspecified;
+    }
+  }
+  template<class T>
+    concept swappable = see below;
+  template<class T, class U>
+    concept swappable_with = see below;
+
+  // [concept.destructible], concept destructible
+  template<class T>
+    concept destructible = see below;
+
+  // [concept.constructible], concept constructible_from
+  template<class T, class... Args>
+    concept constructible_from = see below;
+
+  // [concept.defaultconstructible], concept default_constructible
+  template<class T>
+    concept default_constructible = see below;
+
+  // [concept.moveconstructible], concept move_constructible
+  template<class T>
+    concept move_constructible = see below;
+
+  // [concept.copyconstructible], concept copy_constructible
+  template<class T>
+    concept copy_constructible = see below;
+
+  // [concepts.compare], comparison concepts
+  // [concept.boolean], concept boolean
+  template<class B>
+    concept boolean = see below;
+
+  // [concept.equalitycomparable], concept equality_comparable
+  template<class T>
+    concept equality_comparable = see below;
+  template<class T, class U>
+    concept equality_comparable_with = see below;
+
+  // [concept.totallyordered], concept totally_ordered
+  template<class T>
+    concept totally_ordered = see below;
+  template<class T, class U>
+    concept totally_ordered_with = see below;
+
+  // [concepts.object], object concepts
+  template<class T>
+    concept movable = see below;
+  template<class T>
+    concept copyable = see below;
+  template<class T>
+    concept semiregular = see below;
+  template<class T>
+    concept regular = see below;
+
+  // [concepts.callable], callable concepts
+  // [concept.invocable], concept invocable
+  template<class F, class... Args>
+    concept invocable = see below;
+
+  // [concept.regularinvocable], concept regular_invocable
+  template<class F, class... Args>
+    concept regular_invocable = see below;
+
+  // [concept.predicate], concept predicate
+  template<class F, class... Args>
+    concept predicate = see below;
+
+  // [concept.relation], concept relation
+  template<class R, class T, class U>
+    concept relation = see below;
+
+  // [concept.equiv], concept equivalence_relation
+  template<class R, class T, class U>
+    concept equivalence_relation = see below;
+
+  // [concept.strictweakorder], concept strict_weak_order
+  template<class R, class T, class U>
+    concept strict_weak_order = see below;
+}
+
+*/
+
+#include <__config>
+#include <type_traits>
+#include <version>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER > 17 && defined(__cpp_concepts) && __cpp_concepts >= 201811L
+
+// [concept.same]
+
+template<class _Tp, class _Up>
+concept __same_as_impl = _VSTD::_IsSame<_Tp, _Up>::value;
+
+template<class _Tp, class _Up>
+concept same_as = __same_as_impl<_Tp, _Up> && __same_as_impl<_Up, _Tp>;
+
+#endif //_LIBCPP_STD_VER > 17 && defined(__cpp_concepts) && __cpp_concepts >= 201811L
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP_CONCEPTS
--- a/lib/libcxx/include/cstddef
+++ b/lib/libcxx/include/cstddef
@ -25,7 +25,7 @@ Types:

    ptrdiff_t
    size_t
-    max_align_t
+    max_align_t // C++11
    nullptr_t
    byte // C++17

@ -49,12 +49,34 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 using ::ptrdiff_t;
 using ::size_t;

-#if defined(__CLANG_MAX_ALIGN_T_DEFINED) || defined(_GCC_MAX_ALIGN_T) || \
-    defined(__DEFINED_max_align_t) || defined(__NetBSD__)
-// Re-use the compiler's <stddef.h> max_align_t where possible.
+#if !defined(_LIBCPP_CXX03_LANG)
 using ::max_align_t;
-#else
-typedef long double max_align_t;
+#endif
+
+template <class _Tp> struct __libcpp_is_integral                     { enum { value = 0 }; };
+template <>          struct __libcpp_is_integral<bool>               { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<char>               { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<signed char>        { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<unsigned char>      { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<wchar_t>            { enum { value = 1 }; };
+#ifndef _LIBCPP_NO_HAS_CHAR8_T
+template <>          struct __libcpp_is_integral<char8_t>            { enum { value = 1 }; };
+#endif
+#ifndef _LIBCPP_HAS_NO_UNICODE_CHARS
+template <>          struct __libcpp_is_integral<char16_t>           { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<char32_t>           { enum { value = 1 }; };
+#endif  // _LIBCPP_HAS_NO_UNICODE_CHARS
+template <>          struct __libcpp_is_integral<short>              { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<unsigned short>     { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<int>                { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<unsigned int>       { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<long>               { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<unsigned long>      { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<long long>          { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<unsigned long long> { enum { value = 1 }; };
+#ifndef _LIBCPP_HAS_NO_INT128
+template <>          struct __libcpp_is_integral<__int128_t>         { enum { value = 1 }; };
+template <>          struct __libcpp_is_integral<__uint128_t>        { enum { value = 1 }; };
 #endif

 _LIBCPP_END_NAMESPACE_STD
@ -64,6 +86,11 @@ namespace std  // purposefully not versioned
 {
 enum class byte : unsigned char {};

+
+template <bool> struct __enable_if_integral_imp {};
+template <> struct __enable_if_integral_imp<true> { using type = byte; };
+template <class _Tp> using _EnableByteOverload = typename __enable_if_integral_imp<__libcpp_is_integral<_Tp>::value>::type;
+
 constexpr byte  operator| (byte  __lhs, byte __rhs) noexcept
 {
    return static_cast<byte>(
@ -104,10 +131,31 @@ constexpr byte  operator~ (byte __b) noexcept
        ~static_cast<unsigned int>(__b)
    ));
 }
+template <class _Integer>
+  constexpr _EnableByteOverload<_Integer> &
+  operator<<=(byte& __lhs, _Integer __shift) noexcept
+  { return __lhs = __lhs << __shift; }

+template <class _Integer>
+  constexpr _EnableByteOverload<_Integer>
+  operator<< (byte  __lhs, _Integer __shift) noexcept
+  { return static_cast<byte>(static_cast<unsigned char>(static_cast<unsigned int>(__lhs) << __shift)); }
+
+template <class _Integer>
+  constexpr _EnableByteOverload<_Integer> &
+  operator>>=(byte& __lhs, _Integer __shift) noexcept
+  { return __lhs = __lhs >> __shift; }
+
+template <class _Integer>
+  constexpr _EnableByteOverload<_Integer>
+  operator>> (byte  __lhs, _Integer __shift) noexcept
+  { return static_cast<byte>(static_cast<unsigned char>(static_cast<unsigned int>(__lhs) >> __shift)); }
+
+template <class _Integer, class = _EnableByteOverload<_Integer> >
+  constexpr _Integer
+  to_integer(byte __b) noexcept { return static_cast<_Integer>(__b); }
 }

-#include <type_traits>  // rest of byte
 #endif

 #endif  // _LIBCPP_CSTDDEF
--- a/lib/libcxx/include/cstdio
+++ b/lib/libcxx/include/cstdio
@ -131,9 +131,13 @@ using ::putc;
 using ::ungetc;
 using ::fread;
 using ::fwrite;
+#ifndef _LIBCPP_HAS_NO_FGETPOS_FSETPOS
 using ::fgetpos;
+#endif
 using ::fseek;
+#ifndef _LIBCPP_HAS_NO_FGETPOS_FSETPOS
 using ::fsetpos;
+#endif
 using ::ftell;
 using ::rewind;
 using ::clearerr;
--- a/lib/libcxx/include/deque
+++ b/lib/libcxx/include/deque
@ -150,9 +150,11 @@ template <class T, class Allocator>
         noexcept(noexcept(x.swap(y)));

 template <class T, class Allocator, class U>
-    void erase(deque<T, Allocator>& c, const U& value);       // C++20
+    typename deque<T, Allocator>::size_type
+    erase(deque<T, Allocator>& c, const U& value);       // C++20
 template <class T, class Allocator, class Predicate>
-    void erase_if(deque<T, Allocator>& c, Predicate pred);    // C++20
+    typename deque<T, Allocator>::size_type
+    erase_if(deque<T, Allocator>& c, Predicate pred);    // C++20

 }  // std

@ -3021,14 +3023,20 @@ swap(deque<_Tp, _Allocator>& __x, deque<_Tp, _Allocator>& __y)

 #if _LIBCPP_STD_VER > 17
 template <class _Tp, class _Allocator, class _Up>
-inline _LIBCPP_INLINE_VISIBILITY
-void erase(deque<_Tp, _Allocator>& __c, const _Up& __v)
-{ __c.erase(_VSTD::remove(__c.begin(), __c.end(), __v), __c.end()); }
+inline _LIBCPP_INLINE_VISIBILITY typename deque<_Tp, _Allocator>::size_type
+erase(deque<_Tp, _Allocator>& __c, const _Up& __v) {
+  auto __old_size = __c.size();
+  __c.erase(_VSTD::remove(__c.begin(), __c.end(), __v), __c.end());
+  return __old_size - __c.size();
+}

 template <class _Tp, class _Allocator, class _Predicate>
-inline _LIBCPP_INLINE_VISIBILITY
-void erase_if(deque<_Tp, _Allocator>& __c, _Predicate __pred)
-{ __c.erase(_VSTD::remove_if(__c.begin(), __c.end(), __pred), __c.end()); }
+inline _LIBCPP_INLINE_VISIBILITY typename deque<_Tp, _Allocator>::size_type
+erase_if(deque<_Tp, _Allocator>& __c, _Predicate __pred) {
+  auto __old_size = __c.size();
+  __c.erase(_VSTD::remove_if(__c.begin(), __c.end(), __pred), __c.end());
+  return __old_size - __c.size();
+}
 #endif


--- a/lib/libcxx/include/exception
+++ b/lib/libcxx/include/exception
@ -98,6 +98,8 @@ class _LIBCPP_EXCEPTION_ABI exception
 {
 public:
    _LIBCPP_INLINE_VISIBILITY exception() _NOEXCEPT {}
+    _LIBCPP_INLINE_VISIBILITY exception(const exception&) _NOEXCEPT = default;
+
    virtual ~exception() _NOEXCEPT;
    virtual const char* what() const _NOEXCEPT;
 };
--- a/lib/libcxx/include/filesystem
+++ b/lib/libcxx/include/filesystem
@ -1346,6 +1346,7 @@ public:
  _LIBCPP_INLINE_VISIBILITY
  const path& path2() const noexcept { return __storage_->__p2_; }

+  filesystem_error(const filesystem_error&) = default;
  ~filesystem_error() override; // key function

  _LIBCPP_INLINE_VISIBILITY
--- a/lib/libcxx/include/forward_list
+++ b/lib/libcxx/include/forward_list
@ -169,9 +169,11 @@ template <class T, class Allocator>
         noexcept(noexcept(x.swap(y)));

 template <class T, class Allocator, class U>
-    void erase(forward_list<T, Allocator>& c, const U& value);       // C++20
+    typename forward_list<T, Allocator>::size_type
+    erase(forward_list<T, Allocator>& c, const U& value);       // C++20
 template <class T, class Allocator, class Predicate>
-    void erase_if(forward_list<T, Allocator>& c, Predicate pred);    // C++20
+    typename forward_list<T, Allocator>::size_type
+    erase_if(forward_list<T, Allocator>& c, Predicate pred);    // C++20

 }  // std

@ -1765,13 +1767,17 @@ swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y)
 #if _LIBCPP_STD_VER > 17
 template <class _Tp, class _Allocator, class _Predicate>
 inline _LIBCPP_INLINE_VISIBILITY
-void erase_if(forward_list<_Tp, _Allocator>& __c, _Predicate __pred)
-{ __c.remove_if(__pred); }
+    typename forward_list<_Tp, _Allocator>::size_type
+    erase_if(forward_list<_Tp, _Allocator>& __c, _Predicate __pred) {
+  return __c.remove_if(__pred);
+}

 template <class _Tp, class _Allocator, class _Up>
 inline _LIBCPP_INLINE_VISIBILITY
-void erase(forward_list<_Tp, _Allocator>& __c, const _Up& __v)
-{ _VSTD::erase_if(__c, [&](auto& __elem) { return __elem == __v; }); }
+    typename forward_list<_Tp, _Allocator>::size_type
+    erase(forward_list<_Tp, _Allocator>& __c, const _Up& __v) {
+  return _VSTD::erase_if(__c, [&](auto& __elem) { return __elem == __v; });
+}
 #endif

 _LIBCPP_END_NAMESPACE_STD
--- a/lib/libcxx/include/functional
+++ b/lib/libcxx/include/functional
@ -508,6 +508,10 @@ POLICY:  For non-variadic implementations, the number of arguments is limited

 #include <__functional_base>

+#if defined(_LIBCPP_HAS_BLOCKS_RUNTIME) && !defined(_LIBCPP_HAS_OBJC_ARC)
+#include <Block.h>
+#endif
+
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #pragma GCC system_header
 #endif
@ -1434,7 +1438,14 @@ void __throw_bad_function_call()
 #endif
 }

-template<class _Fp> class _LIBCPP_TEMPLATE_VIS function; // undefined
+#if defined(_LIBCPP_CXX03_LANG) && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) && __has_attribute(deprecated)
+#   define _LIBCPP_DEPRECATED_CXX03_FUNCTION \
+        __attribute__((deprecated("Using std::function in C++03 is not supported anymore. Please upgrade to C++11 or later, or use a different type")))
+#else
+#   define _LIBCPP_DEPRECATED_CXX03_FUNCTION /* nothing */
+#endif
+
+template<class _Fp> class _LIBCPP_DEPRECATED_CXX03_FUNCTION _LIBCPP_TEMPLATE_VIS function; // undefined

 namespace __function
 {
@ -1477,6 +1488,12 @@ template <class _Fp>
 _LIBCPP_INLINE_VISIBILITY
 bool __not_null(function<_Fp> const& __f) { return !!__f; }

+#ifdef _LIBCPP_HAS_EXTENSION_BLOCKS
+template <class _Rp, class ..._Args>
+_LIBCPP_INLINE_VISIBILITY
+bool __not_null(_Rp (^__p)(_Args...)) { return __p; }
+#endif
+
 } // namespace __function

 #ifndef _LIBCPP_CXX03_LANG
@ -1611,7 +1628,7 @@ public:

 // __base provides an abstract interface for copyable functors.

-template<class _Fp> class __base;
+template<class _Fp> class _LIBCPP_TEMPLATE_VIS __base;

 template<class _Rp, class ..._ArgTypes>
 class __base<_Rp(_ArgTypes...)>
@ -2238,6 +2255,72 @@ template <class _Rp, class... _ArgTypes> class __policy_func<_Rp(_ArgTypes...)>
 #endif // _LIBCPP_NO_RTTI
 };

+#if defined(_LIBCPP_HAS_BLOCKS_RUNTIME) && !defined(_LIBCPP_HAS_OBJC_ARC)
+
+template<class _Rp1, class ..._ArgTypes1, class _Alloc, class _Rp, class ..._ArgTypes>
+class __func<_Rp1(^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)>
+    : public  __base<_Rp(_ArgTypes...)>
+{
+    typedef _Rp1(^__block_type)(_ArgTypes1...);
+    __block_type __f_;
+
+public:
+    _LIBCPP_INLINE_VISIBILITY
+    explicit __func(__block_type const& __f)
+        : __f_(__f ? Block_copy(__f) : (__block_type)0)
+    { }
+
+    // [TODO] add && to save on a retain
+
+    _LIBCPP_INLINE_VISIBILITY
+    explicit __func(__block_type __f, const _Alloc& /* unused */)
+        : __f_(__f ? Block_copy(__f) : (__block_type)0)
+    { }
+
+    virtual __base<_Rp(_ArgTypes...)>* __clone() const {
+        _LIBCPP_ASSERT(false,
+            "Block pointers are just pointers, so they should always fit into "
+            "std::function's small buffer optimization. This function should "
+            "never be invoked.");
+        return nullptr;
+    }
+
+    virtual void __clone(__base<_Rp(_ArgTypes...)>* __p) const {
+        ::new (__p) __func(__f_);
+    }
+
+    virtual void destroy() _NOEXCEPT {
+        if (__f_)
+            Block_release(__f_);
+        __f_ = 0;
+    }
+
+    virtual void destroy_deallocate() _NOEXCEPT {
+        _LIBCPP_ASSERT(false,
+            "Block pointers are just pointers, so they should always fit into "
+            "std::function's small buffer optimization. This function should "
+            "never be invoked.");
+    }
+
+    virtual _Rp operator()(_ArgTypes&& ... __arg) {
+        return __invoke(__f_, _VSTD::forward<_ArgTypes>(__arg)...);
+    }
+
+#ifndef _LIBCPP_NO_RTTI
+    virtual const void* target(type_info const& __ti) const _NOEXCEPT {
+        if (__ti == typeid(__func::__block_type))
+            return &__f_;
+        return (const void*)nullptr;
+    }
+
+    virtual const std::type_info& target_type() const _NOEXCEPT {
+        return typeid(__func::__block_type);
+    }
+#endif  // _LIBCPP_NO_RTTI
+};
+
+#endif  // _LIBCPP_HAS_EXTENSION_BLOCKS && !_LIBCPP_HAS_OBJC_ARC
+
 }  // __function

 template<class _Rp, class ..._ArgTypes>
@ -2255,14 +2338,14 @@ class _LIBCPP_TEMPLATE_VIS function<_Rp(_ArgTypes...)>

    template <class _Fp, bool = _And<
        _IsNotSame<__uncvref_t<_Fp>, function>,
-        __invokable<_Fp&, _ArgTypes...>
+        __invokable<_Fp, _ArgTypes...>
    >::value>
    struct __callable;
    template <class _Fp>
        struct __callable<_Fp, true>
        {
            static const bool value = is_same<void, _Rp>::value ||
-                is_convertible<typename __invoke_of<_Fp&, _ArgTypes...>::type,
+                is_convertible<typename __invoke_of<_Fp, _ArgTypes...>::type,
                               _Rp>::value;
        };
    template <class _Fp>
@ -2272,7 +2355,7 @@ class _LIBCPP_TEMPLATE_VIS function<_Rp(_ArgTypes...)>
        };

  template <class _Fp>
-  using _EnableIfCallable = typename enable_if<__callable<_Fp>::value>::type;
+  using _EnableIfLValueCallable = typename enable_if<__callable<_Fp&>::value>::type;
 public:
    typedef _Rp result_type;

@ -2283,7 +2366,7 @@ public:
    function(nullptr_t) _NOEXCEPT {}
    function(const function&);
    function(function&&) _NOEXCEPT;
-    template<class _Fp, class = _EnableIfCallable<_Fp>>
+    template<class _Fp, class = _EnableIfLValueCallable<_Fp>>
    function(_Fp);

 #if _LIBCPP_STD_VER <= 14
@ -2297,14 +2380,14 @@ public:
      function(allocator_arg_t, const _Alloc&, const function&);
    template<class _Alloc>
      function(allocator_arg_t, const _Alloc&, function&&);
-    template<class _Fp, class _Alloc, class = _EnableIfCallable<_Fp>>
+    template<class _Fp, class _Alloc, class = _EnableIfLValueCallable<_Fp>>
      function(allocator_arg_t, const _Alloc& __a, _Fp __f);
 #endif

    function& operator=(const function&);
    function& operator=(function&&) _NOEXCEPT;
    function& operator=(nullptr_t) _NOEXCEPT;
-    template<class _Fp, class = _EnableIfCallable<_Fp>>
+    template<class _Fp, class = _EnableIfLValueCallable<typename decay<_Fp>::type>>
    function& operator=(_Fp&&);

    ~function();
@ -2967,14 +3050,14 @@ __search(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
         forward_iterator_tag, forward_iterator_tag)
 {
    if (__first2 == __last2)
-        return make_pair(__first1, __first1);  // Everything matches an empty sequence
+        return _VSTD::make_pair(__first1, __first1);  // Everything matches an empty sequence
    while (true)
    {
        // Find first element in sequence 1 that matchs *__first2, with a mininum of loop checks
        while (true)
        {
            if (__first1 == __last1)  // return __last1 if no element matches *__first2
-                return make_pair(__last1, __last1);
+                return _VSTD::make_pair(__last1, __last1);
            if (__pred(*__first1, *__first2))
                break;
            ++__first1;
@ -2985,9 +3068,9 @@ __search(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
        while (true)
        {
            if (++__m2 == __last2)  // If pattern exhausted, __first1 is the answer (works for 1 element pattern)
-                return make_pair(__first1, __m1);
+                return _VSTD::make_pair(__first1, __m1);
            if (++__m1 == __last1)  // Otherwise if source exhaused, pattern not found
-                return make_pair(__last1, __last1);
+                return _VSTD::make_pair(__last1, __last1);
            if (!__pred(*__m1, *__m2))  // if there is a mismatch, restart with a new __first1
            {
                ++__first1;
@ -3009,10 +3092,10 @@ __search(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1,
    // Take advantage of knowing source and pattern lengths.  Stop short when source is smaller than pattern
    const _D2 __len2 = __last2 - __first2;
    if (__len2 == 0)
-        return make_pair(__first1, __first1);
+        return _VSTD::make_pair(__first1, __first1);
    const _D1 __len1 = __last1 - __first1;
    if (__len1 < __len2)
-        return make_pair(__last1, __last1);
+        return _VSTD::make_pair(__last1, __last1);
    const _RandomAccessIterator1 __s = __last1 - (__len2 - 1);  // Start of pattern match can't go beyond here

    while (true)
@ -3020,7 +3103,7 @@ __search(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1,
        while (true)
        {
            if (__first1 == __s)
-                return make_pair(__last1, __last1);
+                return _VSTD::make_pair(__last1, __last1);
            if (__pred(*__first1, *__first2))
                break;
            ++__first1;
@ -3031,7 +3114,7 @@ __search(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1,
         while (true)
         {
             if (++__m2 == __last2)
-                 return make_pair(__first1, __first1 + __len2);
+                 return _VSTD::make_pair(__first1, __first1 + __len2);
             ++__m1;          // no need to check range on __m1 because __s guarantees we have enough source
             if (!__pred(*__m1, *__m2))
             {
@ -3080,15 +3163,19 @@ using unwrap_ref_decay_t = typename unwrap_ref_decay<_Tp>::type;
 #endif // > C++17

 template <class _Container, class _Predicate>
-inline void __libcpp_erase_if_container( _Container& __c, _Predicate __pred)
-{
-	for (typename _Container::iterator __iter = __c.begin(), __last = __c.end(); __iter != __last;)
-	{
-		if (__pred(*__iter))
-			__iter = __c.erase(__iter);
-		else
-			++__iter;
-	}
+inline typename _Container::size_type
+__libcpp_erase_if_container(_Container& __c, _Predicate __pred) {
+  typename _Container::size_type __old_size = __c.size();
+
+  const typename _Container::iterator __last = __c.end();
+  for (typename _Container::iterator __iter = __c.begin(); __iter != __last;) {
+    if (__pred(*__iter))
+      __iter = __c.erase(__iter);
+    else
+      ++__iter;
+  }
+
+  return __old_size - __c.size();
 }

 _LIBCPP_END_NAMESPACE_STD
--- a/lib/libcxx/include/future
+++ b/lib/libcxx/include/future
@ -506,6 +506,7 @@ public:
    _LIBCPP_INLINE_VISIBILITY
    const error_code& code() const _NOEXCEPT {return __ec_;}

+    future_error(const future_error&) _NOEXCEPT = default;
    virtual ~future_error() _NOEXCEPT;
 };

--- a/lib/libcxx/include/ios
+++ b/lib/libcxx/include/ios
@ -431,7 +431,8 @@ class _LIBCPP_EXCEPTION_ABI ios_base::failure
 public:
    explicit failure(const string& __msg, const error_code& __ec = io_errc::stream);
    explicit failure(const char* __msg, const error_code& __ec = io_errc::stream);
-    virtual ~failure() throw();
+    failure(const failure&) _NOEXCEPT = default;
+    virtual ~failure() _NOEXCEPT;
 };

 _LIBCPP_NORETURN inline _LIBCPP_INLINE_VISIBILITY
@ -842,7 +843,7 @@ basic_ios<_CharT, _Traits>::set_rdbuf(basic_streambuf<char_type, traits_type>* _
    ios_base::set_rdbuf(__sb);
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 boolalpha(ios_base& __str)
 {
@ -850,7 +851,7 @@ boolalpha(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 noboolalpha(ios_base& __str)
 {
@ -858,7 +859,7 @@ noboolalpha(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 showbase(ios_base& __str)
 {
@ -866,7 +867,7 @@ showbase(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 noshowbase(ios_base& __str)
 {
@ -874,7 +875,7 @@ noshowbase(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 showpoint(ios_base& __str)
 {
@ -882,7 +883,7 @@ showpoint(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 noshowpoint(ios_base& __str)
 {
@ -890,7 +891,7 @@ noshowpoint(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 showpos(ios_base& __str)
 {
@ -898,7 +899,7 @@ showpos(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 noshowpos(ios_base& __str)
 {
@ -906,7 +907,7 @@ noshowpos(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 skipws(ios_base& __str)
 {
@ -914,7 +915,7 @@ skipws(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 noskipws(ios_base& __str)
 {
@ -922,7 +923,7 @@ noskipws(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 uppercase(ios_base& __str)
 {
@ -930,7 +931,7 @@ uppercase(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 nouppercase(ios_base& __str)
 {
@ -938,7 +939,7 @@ nouppercase(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 unitbuf(ios_base& __str)
 {
@ -946,7 +947,7 @@ unitbuf(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 nounitbuf(ios_base& __str)
 {
@ -954,7 +955,7 @@ nounitbuf(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 internal(ios_base& __str)
 {
@ -962,7 +963,7 @@ internal(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 left(ios_base& __str)
 {
@ -970,7 +971,7 @@ left(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 right(ios_base& __str)
 {
@ -978,7 +979,7 @@ right(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 dec(ios_base& __str)
 {
@ -986,7 +987,7 @@ dec(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 hex(ios_base& __str)
 {
@ -994,7 +995,7 @@ hex(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 oct(ios_base& __str)
 {
@ -1002,7 +1003,7 @@ oct(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 fixed(ios_base& __str)
 {
@ -1010,7 +1011,7 @@ fixed(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 scientific(ios_base& __str)
 {
@ -1018,7 +1019,7 @@ scientific(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 hexfloat(ios_base& __str)
 {
@ -1026,7 +1027,7 @@ hexfloat(ios_base& __str)
    return __str;
 }

-inline _LIBCPP_INLINE_VISIBILITY
+inline
 ios_base&
 defaultfloat(ios_base& __str)
 {
--- a/lib/libcxx/include/iterator
+++ b/lib/libcxx/include/iterator
@ -54,10 +54,8 @@ struct bidirectional_iterator_tag : public forward_iterator_tag       {};
 struct random_access_iterator_tag : public bidirectional_iterator_tag {};

 // 27.4.3, iterator operations
-// extension: second argument not conforming to C++03
-template <class InputIterator>  // constexpr in C++17
-  constexpr void advance(InputIterator& i,
-             typename iterator_traits<InputIterator>::difference_type n);
+template <class InputIterator, class Distance>  // constexpr in C++17
+  constexpr void advance(InputIterator& i, Distance n);

 template <class InputIterator>  // constexpr in C++17
  constexpr typename iterator_traits<InputIterator>::difference_type
@ -663,13 +661,14 @@ void __advance(_RandIter& __i,
   __i += __n;
 }

-template <class _InputIter>
+template <class _InputIter, class _Distance>
 inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
-void advance(_InputIter& __i,
-             typename iterator_traits<_InputIter>::difference_type __n)
+void advance(_InputIter& __i, _Distance __orig_n)
 {
-    _LIBCPP_ASSERT(__n >= 0 || __is_cpp17_bidirectional_iterator<_InputIter>::value,
-                       "Attempt to advance(it, -n) on a non-bidi iterator");
+    _LIBCPP_ASSERT(__orig_n >= 0 || __is_cpp17_bidirectional_iterator<_InputIter>::value,
+                   "Attempt to advance(it, n) with negative n on a non-bidirectional iterator");
+    typedef decltype(__convert_to_integral(__orig_n)) _IntegralSize;
+    _IntegralSize __n = __orig_n;
    __advance(__i, __n, typename iterator_traits<_InputIter>::iterator_category());
 }

@ -711,7 +710,7 @@ next(_InputIter __x,
     typename iterator_traits<_InputIter>::difference_type __n = 1)
 {
    _LIBCPP_ASSERT(__n >= 0 || __is_cpp17_bidirectional_iterator<_InputIter>::value,
-                       "Attempt to next(it, -n) on a non-bidi iterator");
+                       "Attempt to next(it, n) with negative n on a non-bidirectional iterator");

    _VSTD::advance(__x, __n);
    return __x;
@ -728,7 +727,7 @@ prev(_InputIter __x,
     typename iterator_traits<_InputIter>::difference_type __n = 1)
 {
    _LIBCPP_ASSERT(__n <= 0 || __is_cpp17_bidirectional_iterator<_InputIter>::value,
-                       "Attempt to prev(it, +n) on a non-bidi iterator");
+                       "Attempt to prev(it, n) with a positive n on a non-bidirectional iterator");
    _VSTD::advance(__x, -__n);
    return __x;
 }
--- a/lib/libcxx/include/latch
+++ b/lib/libcxx/include/latch
@ -0,0 +1,104 @@
+// -*- C++ -*-
+//===--------------------------- latch -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_LATCH
+#define _LIBCPP_LATCH
+
+/*
+    latch synopsis
+
+namespace std
+{
+
+  class latch
+  {
+  public:
+    constexpr explicit latch(ptrdiff_t __expected);
+    ~latch();
+
+    latch(const latch&) = delete;
+    latch& operator=(const latch&) = delete;
+
+    void count_down(ptrdiff_t __update = 1);
+    bool try_wait() const noexcept;
+    void wait() const;
+    void arrive_and_wait(ptrdiff_t __update = 1);
+
+  private:
+    ptrdiff_t __counter; // exposition only
+  };
+
+}
+
+*/
+
+#include <__config>
+#include <atomic>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+
+#ifdef _LIBCPP_HAS_NO_THREADS
+# error <latch> is not supported on this single threaded system
+#endif
+
+#if _LIBCPP_STD_VER >= 14
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+class latch
+{
+    __atomic_base<ptrdiff_t> __a;
+
+public:
+    static constexpr ptrdiff_t max() noexcept {
+        return numeric_limits<ptrdiff_t>::max();
+    }
+
+    inline _LIBCPP_INLINE_VISIBILITY
+    constexpr explicit latch(ptrdiff_t __expected) : __a(__expected) { }
+
+    ~latch() = default;
+    latch(const latch&) = delete;
+    latch& operator=(const latch&) = delete;
+
+    inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void count_down(ptrdiff_t __update = 1)
+    {
+        auto const __old = __a.fetch_sub(__update, memory_order_release);
+        if(__old == __update)
+            __a.notify_all();
+    }
+    inline _LIBCPP_INLINE_VISIBILITY
+    bool try_wait() const noexcept
+    {
+        return 0 == __a.load(memory_order_acquire);
+    }
+    inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void wait() const
+    {
+        auto const __test_fn = [=]() -> bool {
+            return try_wait();
+        };
+        __cxx_atomic_wait(&__a.__a_, __test_fn);
+    }
+    inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void arrive_and_wait(ptrdiff_t __update = 1)
+    {
+        count_down(__update);
+        wait();
+    }
+};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 14
+
+#endif //_LIBCPP_LATCH
--- a/lib/libcxx/include/list
+++ b/lib/libcxx/include/list
@ -170,9 +170,11 @@ template <class T, class Alloc>
         noexcept(noexcept(x.swap(y)));

 template <class T, class Allocator, class U>
-    void erase(list<T, Allocator>& c, const U& value);       // C++20
+    typename list<T, Allocator>::size_type
+    erase(list<T, Allocator>& c, const U& value);       // C++20
 template <class T, class Allocator, class Predicate>
-    void erase_if(list<T, Allocator>& c, Predicate pred);    // C++20
+    typename list<T, Allocator>::size_type
+    erase_if(list<T, Allocator>& c, Predicate pred);    // C++20

 }  // std

@ -2471,14 +2473,16 @@ swap(list<_Tp, _Alloc>& __x, list<_Tp, _Alloc>& __y)

 #if _LIBCPP_STD_VER > 17
 template <class _Tp, class _Allocator, class _Predicate>
-inline _LIBCPP_INLINE_VISIBILITY
-void erase_if(list<_Tp, _Allocator>& __c, _Predicate __pred)
-{ __c.remove_if(__pred); }
+inline _LIBCPP_INLINE_VISIBILITY typename list<_Tp, _Allocator>::size_type
+erase_if(list<_Tp, _Allocator>& __c, _Predicate __pred) {
+  return __c.remove_if(__pred);
+}

 template <class _Tp, class _Allocator, class _Up>
-inline _LIBCPP_INLINE_VISIBILITY
-void erase(list<_Tp, _Allocator>& __c, const _Up& __v)
-{ _VSTD::erase_if(__c, [&](auto& __elem) { return __elem == __v; }); }
+inline _LIBCPP_INLINE_VISIBILITY typename list<_Tp, _Allocator>::size_type
+erase(list<_Tp, _Allocator>& __c, const _Up& __v) {
+  return _VSTD::erase_if(__c, [&](auto& __elem) { return __elem == __v; });
+}
 #endif

 _LIBCPP_END_NAMESPACE_STD
--- a/lib/libcxx/include/map
+++ b/lib/libcxx/include/map
@ -254,7 +254,8 @@ swap(map<Key, T, Compare, Allocator>& x, map<Key, T, Compare, Allocator>& y)
    noexcept(noexcept(x.swap(y)));

 template <class Key, class T, class Compare, class Allocator, class Predicate>
-  void erase_if(map<Key, T, Compare, Allocator>& c, Predicate pred);  // C++20
+typename map<Key, T, Compare, Allocator>::size_type
+erase_if(map<Key, T, Compare, Allocator>& c, Predicate pred);  // C++20


 template <class Key, class T, class Compare = less<Key>,
@ -469,7 +470,8 @@ swap(multimap<Key, T, Compare, Allocator>& x,
    noexcept(noexcept(x.swap(y)));

 template <class Key, class T, class Compare, class Allocator, class Predicate>
-  void erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred);  // C++20
+typename multimap<Key, T, Compare, Allocator>::size_type
+erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred);  // C++20

 }  // std

@ -1653,10 +1655,13 @@ swap(map<_Key, _Tp, _Compare, _Allocator>& __x,
 }

 #if _LIBCPP_STD_VER > 17
-template <class _Key, class _Tp, class _Compare, class _Allocator, class _Predicate>
+template <class _Key, class _Tp, class _Compare, class _Allocator,
+          class _Predicate>
 inline _LIBCPP_INLINE_VISIBILITY
-void erase_if(map<_Key, _Tp, _Compare, _Allocator>& __c, _Predicate __pred)
-{ __libcpp_erase_if_container(__c, __pred); }
+    typename map<_Key, _Tp, _Compare, _Allocator>::size_type
+    erase_if(map<_Key, _Tp, _Compare, _Allocator>& __c, _Predicate __pred) {
+  return __libcpp_erase_if_container(__c, __pred);
+}
 #endif


@ -2235,10 +2240,14 @@ swap(multimap<_Key, _Tp, _Compare, _Allocator>& __x,
 }

 #if _LIBCPP_STD_VER > 17
-template <class _Key, class _Tp, class _Compare, class _Allocator, class _Predicate>
+template <class _Key, class _Tp, class _Compare, class _Allocator,
+          class _Predicate>
 inline _LIBCPP_INLINE_VISIBILITY
-void erase_if(multimap<_Key, _Tp, _Compare, _Allocator>& __c, _Predicate __pred)
-{ __libcpp_erase_if_container(__c, __pred); }
+    typename multimap<_Key, _Tp, _Compare, _Allocator>::size_type
+    erase_if(multimap<_Key, _Tp, _Compare, _Allocator>& __c,
+             _Predicate __pred) {
+  return __libcpp_erase_if_container(__c, __pred);
+}
 #endif

 _LIBCPP_END_NAMESPACE_STD
--- a/lib/libcxx/include/math.h
+++ b/lib/libcxx/include/math.h
@ -297,9 +297,6 @@ long double    truncl(long double x);
 #pragma GCC system_header
 #endif

-#define _LIBCPP_STDLIB_INCLUDE_NEXT
-#include <stdlib.h>
-
 #include_next <math.h>

 #ifdef __cplusplus
@ -308,6 +305,7 @@ long double    truncl(long double x);
 // back to C++ linkage before including these C++ headers.
 extern "C++" {

+#include <stdlib.h>
 #include <type_traits>
 #include <limits>

@ -760,61 +758,12 @@ isunordered(_A1 __lcpp_x, _A2 __lcpp_y) _NOEXCEPT
 #endif  // isunordered

 // abs
-
-#undef abs
-#undef labs
-#ifndef _LIBCPP_HAS_NO_LONG_LONG
-#undef llabs
-#endif
-
-// MSVCRT already has the correct prototype in <stdlib.h> if __cplusplus is defined
-#if !defined(_LIBCPP_MSVCRT) && !defined(__sun__) && !defined(_AIX)
-inline _LIBCPP_INLINE_VISIBILITY long abs(long __x) _NOEXCEPT {
-  return ::labs(__x);
-}
-#ifndef _LIBCPP_HAS_NO_LONG_LONG
-inline _LIBCPP_INLINE_VISIBILITY long long abs(long long __x) _NOEXCEPT {
-  return ::llabs(__x);
-}
-#endif // _LIBCPP_HAS_NO_LONG_LONG
-#endif // !defined(_LIBCPP_MSVCRT) && !defined(__sun__) && !defined(_AIX)
-
-
-#if !(defined(_AIX) || defined(__sun__))
-inline _LIBCPP_INLINE_VISIBILITY float abs(float __lcpp_x) _NOEXCEPT {
-  return ::fabsf(__lcpp_x);
-}
-
-inline _LIBCPP_INLINE_VISIBILITY double abs(double __lcpp_x) _NOEXCEPT {
-  return ::fabs(__lcpp_x);
-}
-
-inline _LIBCPP_INLINE_VISIBILITY long double
-abs(long double __lcpp_x) _NOEXCEPT {
-  return ::fabsl(__lcpp_x);
-}
-#endif // !(defined(_AIX) || defined(__sun__))
+//
+// handled in stdlib.h

 // div
-
-#undef div
-#undef ldiv
-#ifndef _LIBCPP_HAS_NO_LONG_LONG
-#undef lldiv
-#endif
-
-// MSVCRT already has the correct prototype in <stdlib.h> if __cplusplus is defined
-#if !defined(_LIBCPP_MSVCRT) && !defined(__sun__) && !defined(_AIX)
-inline _LIBCPP_INLINE_VISIBILITY ldiv_t div(long __x, long __y) _NOEXCEPT {
-  return ::ldiv(__x, __y);
-}
-#ifndef _LIBCPP_HAS_NO_LONG_LONG
-inline _LIBCPP_INLINE_VISIBILITY lldiv_t div(long long __x,
-                                             long long __y) _NOEXCEPT {
-  return ::lldiv(__x, __y);
-}
-#endif // _LIBCPP_HAS_NO_LONG_LONG
-#endif // _LIBCPP_MSVCRT / __sun__ / _AIX
+//
+// handled in stdlib.h

 // acos

--- a/lib/libcxx/include/memory
+++ b/lib/libcxx/include/memory
--- a/lib/libcxx/include/module.modulemap
+++ b/lib/libcxx/include/module.modulemap
@ -231,6 +231,11 @@ module std [system] {
    header "atomic"
    export *
  }
+  module barrier {
+    requires cplusplus14
+    header "barrier"
+    export *
+  }
  module bit {
    header "bit"
    export *
@ -262,6 +267,10 @@ module std [system] {
    header "complex"
    export *
  }
+  module concepts {
+    header "concepts"
+    export *
+  }
  module condition_variable {
    header "condition_variable"
    export *
@ -334,6 +343,11 @@ module std [system] {
    header "iterator"
    export *
  }
+  module latch {
+    requires cplusplus14
+    header "latch"
+    export *
+  }
  module limits {
    header "limits"
    export *
@ -364,6 +378,10 @@ module std [system] {
    header "new"
    export *
  }
+  module numbers {
+    header "numbers"
+    export *
+  }
  module numeric {
    header "numeric"
    export *
@ -400,6 +418,11 @@ module std [system] {
    header "scoped_allocator"
    export *
  }
+  module semaphore {
+    requires cplusplus14
+    header "semaphore"
+    export *
+  }
  module set {
    header "set"
    export initializer_list
--- a/lib/libcxx/include/numbers
+++ b/lib/libcxx/include/numbers
@ -0,0 +1,141 @@
+// -*- C++ -*-
+//===---------------------------- numbers ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_NUMBERS
+#define _LIBCPP_NUMBERS
+
+/*
+    numbers synopsis
+
+namespace std::numbers {
+  template<class T> inline constexpr T e_v          = unspecified;
+  template<class T> inline constexpr T log2e_v      = unspecified;
+  template<class T> inline constexpr T log10e_v     = unspecified;
+  template<class T> inline constexpr T pi_v         = unspecified;
+  template<class T> inline constexpr T inv_pi_v     = unspecified;
+  template<class T> inline constexpr T inv_sqrtpi_v = unspecified;
+  template<class T> inline constexpr T ln2_v        = unspecified;
+  template<class T> inline constexpr T ln10_v       = unspecified;
+  template<class T> inline constexpr T sqrt2_v      = unspecified;
+  template<class T> inline constexpr T sqrt3_v      = unspecified;
+  template<class T> inline constexpr T inv_sqrt3_v  = unspecified;
+  template<class T> inline constexpr T egamma_v     = unspecified;
+  template<class T> inline constexpr T phi_v        = unspecified;
+
+  template<floating_point T> inline constexpr T e_v<T>          = see below;
+  template<floating_point T> inline constexpr T log2e_v<T>      = see below;
+  template<floating_point T> inline constexpr T log10e_v<T>     = see below;
+  template<floating_point T> inline constexpr T pi_v<T>         = see below;
+  template<floating_point T> inline constexpr T inv_pi_v<T>     = see below;
+  template<floating_point T> inline constexpr T inv_sqrtpi_v<T> = see below;
+  template<floating_point T> inline constexpr T ln2_v<T>        = see below;
+  template<floating_point T> inline constexpr T ln10_v<T>       = see below;
+  template<floating_point T> inline constexpr T sqrt2_v<T>      = see below;
+  template<floating_point T> inline constexpr T sqrt3_v<T>      = see below;
+  template<floating_point T> inline constexpr T inv_sqrt3_v<T>  = see below;
+  template<floating_point T> inline constexpr T egamma_v<T>     = see below;
+  template<floating_point T> inline constexpr T phi_v<T>        = see below;
+
+  inline constexpr double e          = e_v<double>;
+  inline constexpr double log2e      = log2e_v<double>;
+  inline constexpr double log10e     = log10e_v<double>;
+  inline constexpr double pi         = pi_v<double>;
+  inline constexpr double inv_pi     = inv_pi_v<double>;
+  inline constexpr double inv_sqrtpi = inv_sqrtpi_v<double>;
+  inline constexpr double ln2        = ln2_v<double>;
+  inline constexpr double ln10       = ln10_v<double>;
+  inline constexpr double sqrt2      = sqrt2_v<double>;
+  inline constexpr double sqrt3      = sqrt3_v<double>;
+  inline constexpr double inv_sqrt3  = inv_sqrt3_v<double>;
+  inline constexpr double egamma     = egamma_v<double>;
+  inline constexpr double phi        = phi_v<double>;
+}
+*/
+
+#include <__config>
+
+#if _LIBCPP_STD_VER > 17 && defined(__cpp_concepts) && __cpp_concepts >= 201811L
+
+#include <type_traits>
+#include <version>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace numbers {
+
+template <class T>
+inline constexpr bool __false = false;
+
+template <class T>
+struct __illformed
+{
+  static_assert(__false<T>, "A program that instantiates a primary template of a mathematical constant variable template is ill-formed.");
+};
+
+template <class T> inline constexpr T e_v =          __illformed<T>{};
+template <class T> inline constexpr T log2e_v =      __illformed<T>{};
+template <class T> inline constexpr T log10e_v =     __illformed<T>{};
+template <class T> inline constexpr T pi_v =         __illformed<T>{};
+template <class T> inline constexpr T inv_pi_v =     __illformed<T>{};
+template <class T> inline constexpr T inv_sqrtpi_v = __illformed<T>{};
+template <class T> inline constexpr T ln2_v =        __illformed<T>{};
+template <class T> inline constexpr T ln10_v =       __illformed<T>{};
+template <class T> inline constexpr T sqrt2_v =      __illformed<T>{};
+template <class T> inline constexpr T sqrt3_v =      __illformed<T>{};
+template <class T> inline constexpr T inv_sqrt3_v =  __illformed<T>{};
+template <class T> inline constexpr T egamma_v =     __illformed<T>{};
+template <class T> inline constexpr T phi_v =        __illformed<T>{};
+
+template <class T>
+concept __floating_point = std::is_floating_point_v<T>;
+
+template <__floating_point T> inline constexpr T e_v<T>          = 2.718281828459045235360287471352662;
+template <__floating_point T> inline constexpr T log2e_v<T>      = 1.442695040888963407359924681001892;
+template <__floating_point T> inline constexpr T log10e_v<T>     = 0.434294481903251827651128918916605;
+template <__floating_point T> inline constexpr T pi_v<T>         = 3.141592653589793238462643383279502;
+template <__floating_point T> inline constexpr T inv_pi_v<T>     = 0.318309886183790671537767526745028;
+template <__floating_point T> inline constexpr T inv_sqrtpi_v<T> = 0.564189583547756286948079451560772;
+template <__floating_point T> inline constexpr T ln2_v<T>        = 0.693147180559945309417232121458176;
+template <__floating_point T> inline constexpr T ln10_v<T>       = 2.302585092994045684017991454684364;
+template <__floating_point T> inline constexpr T sqrt2_v<T>      = 1.414213562373095048801688724209698;
+template <__floating_point T> inline constexpr T sqrt3_v<T>      = 1.732050807568877293527446341505872;
+template <__floating_point T> inline constexpr T inv_sqrt3_v<T>  = 0.577350269189625764509148780501957;
+template <__floating_point T> inline constexpr T egamma_v<T>     = 0.577215664901532860606512090082402;
+template <__floating_point T> inline constexpr T phi_v<T>        = 1.618033988749894848204586834365638;
+
+inline constexpr double e          = e_v<double>;
+inline constexpr double log2e      = log2e_v<double>;
+inline constexpr double log10e     = log10e_v<double>;
+inline constexpr double pi         = pi_v<double>;
+inline constexpr double inv_pi     = inv_pi_v<double>;
+inline constexpr double inv_sqrtpi = inv_sqrtpi_v<double>;
+inline constexpr double ln2        = ln2_v<double>;
+inline constexpr double ln10       = ln10_v<double>;
+inline constexpr double sqrt2      = sqrt2_v<double>;
+inline constexpr double sqrt3      = sqrt3_v<double>;
+inline constexpr double inv_sqrt3  = inv_sqrt3_v<double>;
+inline constexpr double egamma     = egamma_v<double>;
+inline constexpr double phi        = phi_v<double>;
+
+} // namespace numbers
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif //_LIBCPP_STD_VER > 17 && defined(__cpp_concepts) && __cpp_concepts >= 201811L
+
+#endif // _LIBCPP_NUMBERS
--- a/lib/libcxx/include/ostream
+++ b/lib/libcxx/include/ostream
@ -999,7 +999,7 @@ basic_ostream<_CharT, _Traits>::seekp(off_type __off, ios_base::seekdir __dir)
 }

 template <class _CharT, class _Traits>
-inline _LIBCPP_INLINE_VISIBILITY
+inline
 basic_ostream<_CharT, _Traits>&
 endl(basic_ostream<_CharT, _Traits>& __os)
 {
@ -1009,7 +1009,7 @@ endl(basic_ostream<_CharT, _Traits>& __os)
 }

 template <class _CharT, class _Traits>
-inline _LIBCPP_INLINE_VISIBILITY
+inline
 basic_ostream<_CharT, _Traits>&
 ends(basic_ostream<_CharT, _Traits>& __os)
 {
@ -1018,7 +1018,7 @@ ends(basic_ostream<_CharT, _Traits>& __os)
 }

 template <class _CharT, class _Traits>
-inline _LIBCPP_INLINE_VISIBILITY
+inline
 basic_ostream<_CharT, _Traits>&
 flush(basic_ostream<_CharT, _Traits>& __os)
 {
--- a/lib/libcxx/include/random
+++ b/lib/libcxx/include/random
@ -4048,10 +4048,12 @@ binomial_distribution<_IntType>::operator()(_URNG& __g, const param_type& __pr)
    result_type __rd = __ru;
    while (true)
    {
+        bool __break = true;
        if (__rd >= 1)
        {
            __pd *= __rd / (__pr.__odds_ratio_ * (__pr.__t_ - __rd + 1));
            __u -= __pd;
+            __break = false;
            if (__u < 0)
                return __rd - 1;
        }
@ -4062,9 +4064,12 @@ binomial_distribution<_IntType>::operator()(_URNG& __g, const param_type& __pr)
        {
            __pu *= (__pr.__t_ - __ru + 1) * __pr.__odds_ratio_ / __ru;
            __u -= __pu;
+            __break = false;
            if (__u < 0)
                return __ru;
        }
+        if (__break)
+            return 0;
    }
 }

--- a/lib/libcxx/include/regex
+++ b/lib/libcxx/include/regex
@ -21,7 +21,7 @@ namespace std
 namespace regex_constants
 {

-emum syntax_option_type
+enum syntax_option_type
 {
    icase      = unspecified,
    nosubs     = unspecified,
@ -631,7 +631,7 @@ template <class OutputIterator, class BidirectionalIterator,
                  const basic_regex<charT, traits>& e, const charT* fmt,
                  regex_constants::match_flag_type flags = regex_constants::match_default);

-template <class traits, class charT, class ST, class SA, class FST, class FSA>>
+template <class traits, class charT, class ST, class SA, class FST, class FSA>
    basic_string<charT, ST, SA>
    regex_replace(const basic_string<charT, ST, SA>& s,
                  const basic_regex<charT, traits>& e,
@ -675,9 +675,9 @@ public:
    regex_iterator(BidirectionalIterator a, BidirectionalIterator b,
                   const regex_type& re,
                   regex_constants::match_flag_type m = regex_constants::match_default);
-    regex_iterator(_BidirectionalIterator __a, _BidirectionalIterator __b,
-                   const regex_type&& __re,
-                   regex_constants::match_flag_type __m
+    regex_iterator(BidirectionalIterator a, BidirectionalIterator b,
+                   const regex_type&& re,
+                   regex_constants::match_flag_type m
                                     = regex_constants::match_default) = delete; // C++14
    regex_iterator(const regex_iterator&);
    regex_iterator& operator=(const regex_iterator&);
@ -698,7 +698,7 @@ typedef regex_iterator<string::const_iterator>  sregex_iterator;
 typedef regex_iterator<wstring::const_iterator> wsregex_iterator;

 template <class BidirectionalIterator,
-          class charT = typename iterator_traits< BidirectionalIterator>::value_type,
+          class charT = typename iterator_traits<BidirectionalIterator>::value_type,
          class traits = regex_traits<charT>>
 class regex_token_iterator
 {
@ -735,8 +735,8 @@ public:
                             regex_constants::match_flag_type m = regex_constants::match_default);
    template <size_t N>
        regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b,
-                             const regex_type& re, const int (&submatches)[N],
-                             regex_constants::match_flag_type m = regex_constants::match_default) = delete // C++14;
+                             const regex_type&& re, const int (&submatches)[N],
+                             regex_constants::match_flag_type m = regex_constants::match_default) = delete; // C++14
    regex_token_iterator(const regex_token_iterator&);
    regex_token_iterator& operator=(const regex_token_iterator&);

@ -977,7 +977,8 @@ class _LIBCPP_EXCEPTION_ABI regex_error
    regex_constants::error_type __code_;
 public:
    explicit regex_error(regex_constants::error_type __ecode);
-    virtual ~regex_error() throw();
+    regex_error(const regex_error&) _NOEXCEPT = default;
+    virtual ~regex_error() _NOEXCEPT;
     _LIBCPP_INLINE_VISIBILITY
    regex_constants::error_type code() const {return __code_;}
 };
@ -1000,7 +1001,19 @@ public:
    typedef _CharT                  char_type;
    typedef basic_string<char_type> string_type;
    typedef locale                  locale_type;
+#ifdef __BIONIC__
+    // Originally bionic's ctype_base used its own ctype masks because the
+    // builtin ctype implementation wasn't in libc++ yet. Bionic's ctype mask
+    // was only 8 bits wide and already saturated, so it used a wider type here
+    // to make room for __regex_word (then a part of this class rather than
+    // ctype_base). Bionic has since moved to the builtin ctype_base
+    // implementation, but this was not updated to match. Since then Android has
+    // needed to maintain a stable libc++ ABI, and this can't be changed without
+    // an ABI break.
+    typedef uint16_t char_class_type;
+#else
    typedef ctype_base::mask        char_class_type;
+#endif

    static const char_class_type __regex_word = ctype_base::__regex_word;
 private:
@ -2837,6 +2850,8 @@ private:
        __parse_awk_escape(_ForwardIterator __first, _ForwardIterator __last,
                          basic_string<_CharT>* __str = nullptr);

+    bool __test_back_ref(_CharT c);
+
    _LIBCPP_INLINE_VISIBILITY
    void __push_l_anchor();
    void __push_r_anchor();
@ -3408,18 +3423,8 @@ basic_regex<_CharT, _Traits>::__parse_BACKREF(_ForwardIterator __first,
    if (__first != __last)
    {
        _ForwardIterator __temp = _VSTD::next(__first);
-        if (__temp != __last)
-        {
-            if (*__first == '\\')
-            {
-                int __val = __traits_.value(*__temp, 10);
-                if (__val >= 1 && __val <= 9)
-                {
-                    __push_back_ref(__val);
-                    __first = ++__temp;
-                }
-            }
-        }
+        if (__temp != __last && *__first == '\\' && __test_back_ref(*__temp))
+            __first = ++__temp;
    }
    return __first;
 }
@ -3547,6 +3552,8 @@ basic_regex<_CharT, _Traits>::__parse_QUOTED_CHAR_ERE(_ForwardIterator __first,
                default:
                    if (__get_grammar(__flags_) == awk)
                        __first = __parse_awk_escape(++__first, __last);
+                    else if(__test_back_ref(*__temp))
+                        __first = ++__temp;
                    break;
                }
            }
@ -4660,6 +4667,22 @@ basic_regex<_CharT, _Traits>::__parse_egrep(_ForwardIterator __first,
    return __first;
 }

+template <class _CharT, class _Traits>
+bool
+basic_regex<_CharT, _Traits>::__test_back_ref(_CharT c)
+{
+    unsigned __val = __traits_.value(c, 10);
+    if (__val >= 1 && __val <= 9)
+    {
+        if (__val > mark_count())
+            __throw_regex_error<regex_constants::error_backref>();
+        __push_back_ref(__val);
+        return true;
+    }
+
+    return false;
+}
+
 template <class _CharT, class _Traits>
 void
 basic_regex<_CharT, _Traits>::__push_loop(size_t __min, size_t __max,
@ -5917,6 +5940,9 @@ basic_regex<_CharT, _Traits>::__search(
        match_results<const _CharT*, _Allocator>& __m,
        regex_constants::match_flag_type __flags) const
 {
+    if (__flags & regex_constants::match_prev_avail)
+        __flags &= ~(regex_constants::match_not_bol | regex_constants::match_not_bow);
+
    __m.__init(1 + mark_count(), __first, __last,
                                    __flags & regex_constants::__no_update_pos);
    if (__match_at_start(__first, __last, __m, __flags,
--- a/lib/libcxx/include/semaphore
+++ b/lib/libcxx/include/semaphore
@ -0,0 +1,235 @@
+// -*- C++ -*-
+//===--------------------------- semaphore --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_SEMAPHORE
+#define _LIBCPP_SEMAPHORE
+
+/*
+    semaphore synopsis
+
+namespace std {
+
+template<ptrdiff_t least_max_value = implementation-defined>
+class counting_semaphore
+{
+public:
+static constexpr ptrdiff_t max() noexcept;
+
+constexpr explicit counting_semaphore(ptrdiff_t desired);
+~counting_semaphore();
+
+counting_semaphore(const counting_semaphore&) = delete;
+counting_semaphore& operator=(const counting_semaphore&) = delete;
+
+void release(ptrdiff_t update = 1);
+void acquire();
+bool try_acquire() noexcept;
+template<class Rep, class Period>
+    bool try_acquire_for(const chrono::duration<Rep, Period>& rel_time);
+template<class Clock, class Duration>
+    bool try_acquire_until(const chrono::time_point<Clock, Duration>& abs_time);
+
+private:
+ptrdiff_t counter; // exposition only
+};
+
+using binary_semaphore = counting_semaphore<1>;
+
+}
+
+*/
+
+#include <__config>
+#include <__threading_support>
+#include <atomic>
+#include <cassert>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+
+#ifdef _LIBCPP_HAS_NO_THREADS
+# error <semaphore> is not supported on this single threaded system
+#endif
+
+#if _LIBCPP_STD_VER >= 14
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+/*
+
+__atomic_semaphore_base is the general-case implementation, to be used for
+user-requested least-max values that exceed the OS implementation support
+(incl. when the OS has no support of its own) and for binary semaphores.
+
+It is a typical Dijsktra semaphore algorithm over atomics, wait and notify
+functions. It avoids contention against users' own use of those facilities.
+
+*/
+
+class __atomic_semaphore_base
+{
+    __atomic_base<ptrdiff_t> __a;
+
+public:
+    _LIBCPP_INLINE_VISIBILITY
+    __atomic_semaphore_base(ptrdiff_t __count) : __a(__count)
+    {
+    }
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void release(ptrdiff_t __update = 1)
+    {
+        if(0 < __a.fetch_add(__update, memory_order_release))
+            ;
+        else if(__update > 1)
+            __a.notify_all();
+        else
+            __a.notify_one();
+    }
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void acquire()
+    {
+        auto const __test_fn = [=]() -> bool {
+            auto __old = __a.load(memory_order_relaxed);
+            return (__old != 0) && __a.compare_exchange_strong(__old, __old - 1, memory_order_acquire, memory_order_relaxed);
+        };
+        __cxx_atomic_wait(&__a.__a_, __test_fn);
+    }
+    template <class Rep, class Period>
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    bool try_acquire_for(chrono::duration<Rep, Period> const& __rel_time)
+    {
+        auto const __test_fn = [=]() -> bool {
+            auto __old = __a.load(memory_order_acquire);
+            while(1) {
+                if (__old == 0)
+                    return false;
+                if(__a.compare_exchange_strong(__old, __old - 1, memory_order_acquire, memory_order_relaxed))
+                    return true;
+            }
+        };
+        return __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy(), __rel_time);
+    }
+};
+
+#ifndef _LIBCPP_NO_NATIVE_SEMAPHORES
+
+/*
+
+__platform_semaphore_base a simple wrapper for the OS semaphore type. That
+is, every call is routed to the OS in the most direct manner possible.
+
+*/
+
+class __platform_semaphore_base
+{
+    __libcpp_semaphore_t __semaphore;
+
+public:
+    _LIBCPP_INLINE_VISIBILITY
+    __platform_semaphore_base(ptrdiff_t __count) :
+        __semaphore()
+    {
+        __libcpp_semaphore_init(&__semaphore, __count);
+    }
+    _LIBCPP_INLINE_VISIBILITY
+    ~__platform_semaphore_base() {
+        __libcpp_semaphore_destroy(&__semaphore);
+    }
+    _LIBCPP_INLINE_VISIBILITY
+    void release(ptrdiff_t __update)
+    {
+        for(; __update; --__update)
+            __libcpp_semaphore_post(&__semaphore);
+    }
+    _LIBCPP_INLINE_VISIBILITY
+    void acquire()
+    {
+        __libcpp_semaphore_wait(&__semaphore);
+    }
+    _LIBCPP_INLINE_VISIBILITY
+    bool try_acquire_for(chrono::nanoseconds __rel_time)
+    {
+        return __libcpp_semaphore_wait_timed(&__semaphore, __rel_time);
+    }
+};
+
+template<ptrdiff_t __least_max_value>
+using __semaphore_base =
+    typename conditional<(__least_max_value > 1 && __least_max_value <= _LIBCPP_SEMAPHORE_MAX),
+                         __platform_semaphore_base,
+                         __atomic_semaphore_base>::type;
+
+#else
+
+template<ptrdiff_t __least_max_value>
+using __semaphore_base =
+    __atomic_semaphore_base;
+
+#define _LIBCPP_SEMAPHORE_MAX (numeric_limits<ptrdiff_t>::max())
+
+#endif //_LIBCPP_NO_NATIVE_SEMAPHORES
+
+template<ptrdiff_t __least_max_value = _LIBCPP_SEMAPHORE_MAX>
+class counting_semaphore
+{
+    __semaphore_base<__least_max_value> __semaphore;
+
+public:
+    static constexpr ptrdiff_t max() noexcept {
+        return __least_max_value;
+    }
+
+    _LIBCPP_INLINE_VISIBILITY
+    counting_semaphore(ptrdiff_t __count = 0) : __semaphore(__count) { }
+    ~counting_semaphore() = default;
+
+    counting_semaphore(const counting_semaphore&) = delete;
+    counting_semaphore& operator=(const counting_semaphore&) = delete;
+
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void release(ptrdiff_t __update = 1)
+    {
+        __semaphore.release(__update);
+    }
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    void acquire()
+    {
+        __semaphore.acquire();
+    }
+    template<class Rep, class Period>
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    bool try_acquire_for(chrono::duration<Rep, Period> const& __rel_time)
+    {
+        return __semaphore.try_acquire_for(chrono::duration_cast<chrono::nanoseconds>(__rel_time));
+    }
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    bool try_acquire()
+    {
+        return try_acquire_for(chrono::nanoseconds::zero());
+    }
+    template <class Clock, class Duration>
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    bool try_acquire_until(chrono::time_point<Clock, Duration> const& __abs_time)
+    {
+        auto const current = Clock::now();
+        if(current >= __abs_time)
+            return try_acquire();
+        else
+            return try_acquire_for(__abs_time - current);
+    }
+};
+
+using binary_semaphore = counting_semaphore<1>;
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 14
+
+#endif //_LIBCPP_SEMAPHORE
--- a/lib/libcxx/include/set
+++ b/lib/libcxx/include/set
@ -216,7 +216,8 @@ swap(set<Key, Compare, Allocator>& x, set<Key, Compare, Allocator>& y)
    noexcept(noexcept(x.swap(y)));

 template <class Key, class Compare, class Allocator, class Predicate>
-  void erase_if(set<Key, Compare, Allocator>& c, Predicate pred);  // C++20
+typename set<Key, Compare, Allocator>::size_type
+erase_if(set<Key, Compare, Allocator>& c, Predicate pred);  // C++20

 template <class Key, class Compare = less<Key>,
          class Allocator = allocator<Key>>
@ -417,7 +418,8 @@ swap(multiset<Key, Compare, Allocator>& x, multiset<Key, Compare, Allocator>& y)
    noexcept(noexcept(x.swap(y)));

 template <class Key, class Compare, class Allocator, class Predicate>
-  void erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred);  // C++20
+typename multiset<Key, Compare, Allocator>::size_type
+erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred);  // C++20

 }  // std

@ -960,8 +962,10 @@ swap(set<_Key, _Compare, _Allocator>& __x,
 #if _LIBCPP_STD_VER > 17
 template <class _Key, class _Compare, class _Allocator, class _Predicate>
 inline _LIBCPP_INLINE_VISIBILITY
-void erase_if(set<_Key, _Compare, _Allocator>& __c, _Predicate __pred)
-{ __libcpp_erase_if_container(__c, __pred); }
+    typename set<_Key, _Compare, _Allocator>::size_type
+    erase_if(set<_Key, _Compare, _Allocator>& __c, _Predicate __pred) {
+  return __libcpp_erase_if_container(__c, __pred);
+}
 #endif

 template <class _Key, class _Compare = less<_Key>,
@ -1484,8 +1488,10 @@ swap(multiset<_Key, _Compare, _Allocator>& __x,
 #if _LIBCPP_STD_VER > 17
 template <class _Key, class _Compare, class _Allocator, class _Predicate>
 inline _LIBCPP_INLINE_VISIBILITY
-void erase_if(multiset<_Key, _Compare, _Allocator>& __c, _Predicate __pred)
-{ __libcpp_erase_if_container(__c, __pred); }
+    typename multiset<_Key, _Compare, _Allocator>::size_type
+    erase_if(multiset<_Key, _Compare, _Allocator>& __c, _Predicate __pred) {
+  return __libcpp_erase_if_container(__c, __pred);
+}
 #endif

 _LIBCPP_END_NAMESPACE_STD
--- a/Show More
+++ b/Show More