changeset 32729:31b285853bb7

Merge
author thartmann
date Mon, 14 Sep 2015 07:03:04 +0000
parents c94c99524ae9 320855c2baef
children 9918bb9fe8b6
files hotspot/src/share/vm/classfile/imageDecompressor.cpp hotspot/src/share/vm/classfile/imageDecompressor.hpp hotspot/src/share/vm/classfile/imageFile.cpp hotspot/src/share/vm/classfile/imageFile.hpp hotspot/src/share/vm/utilities/endian.cpp hotspot/src/share/vm/utilities/endian.hpp hotspot/test/runtime/modules/ImageFile/ImageAttributeOffsetsTest.java hotspot/test/runtime/modules/ImageFile/ImageCloseTest.java hotspot/test/runtime/modules/ImageFile/ImageFileHeaderTest.java hotspot/test/runtime/modules/ImageFile/ImageFindAttributesTest.java hotspot/test/runtime/modules/ImageFile/ImageGetAttributesTest.java hotspot/test/runtime/modules/ImageFile/ImageGetDataAddressTest.java hotspot/test/runtime/modules/ImageFile/ImageGetIndexAddressTest.java hotspot/test/runtime/modules/ImageFile/ImageGetStringBytesTest.java hotspot/test/runtime/modules/ImageFile/ImageOpenTest.java hotspot/test/runtime/modules/ImageFile/ImageReadTest.java hotspot/test/runtime/modules/ImageFile/LocationConstants.java jdk/src/java.base/unix/classes/sun/nio/fs/GnomeFileTypeDetector.java jdk/src/java.base/unix/native/libnio/fs/GnomeFileTypeDetector.c langtools/test/tools/javac/TestBootstrapMethodsCount.java langtools/test/tools/javac/lib/JavacTestingAbstractThreadedTest.java nashorn/src/jdk.scripting.nashorn/share/classes/jdk/nashorn/internal/codegen/AstSerializer.java nashorn/src/jdk.scripting.nashorn/share/classes/jdk/nashorn/internal/runtime/arrays/InvalidArrayIndexException.java
diffstat 569 files changed, 16164 insertions(+), 11632 deletions(-)
line diff
--- a/.hgtags	Mon Sep 14 07:02:50 2015 +0200
+++ b/.hgtags	Mon Sep 14 07:03:04 2015 +0000
@@ -322,3 +322,4 @@
 c25e882cee9622ec75c4e9d60633539a2f0a8809 jdk9-b77
 c8753d0be1778944dc512ec86a459941ea1ad2c3 jdk9-b78
 3966bd3b8167419aa05c6718a4af1cf54b1e3c58 jdk9-b79
+3c9f5bd909ae7187f24622ee4b69f8a5756a9271 jdk9-b80
--- a/.hgtags-top-repo	Mon Sep 14 07:02:50 2015 +0200
+++ b/.hgtags-top-repo	Mon Sep 14 07:03:04 2015 +0000
@@ -322,3 +322,4 @@
 7972dc8f2a47f0c4cd8f02fa5662af41f028aa14 jdk9-b77
 8c40d4143ee13bdf8170c68cc384c36ab1e9fadb jdk9-b78
 ba08a9f79b9849716bae1f39f71333d47f604012 jdk9-b79
+f7c5ae2933c0b8510a420d1713a955e4ffc7ad0b jdk9-b80
--- a/common/bin/logger.sh	Mon Sep 14 07:02:50 2015 +0200
+++ b/common/bin/logger.sh	Mon Sep 14 07:03:04 2015 +0000
@@ -41,5 +41,19 @@
 trap "rm -rf \"$RCDIR\"" EXIT
 LOGFILE=$1
 shift
+
+# We need to handle command lines like "VAR1=val1 /usr/bin/cmd VAR2=val2".
+# Do this by shifting away the prepended variable assignments and exporting
+# them instead.
+is_prefix=true
+for opt; do
+  if [[ "$is_prefix" = true && "$opt" =~ ^.*=.*$ ]]; then
+    export $opt
+    shift
+  else
+    is_prefix=false
+  fi
+done
+
 (exec 3>&1 ; ("$@" 2>&1 1>&3; echo $? > "$RCDIR/rc") | tee -a $LOGFILE 1>&2 ; exec 3>&-) | tee -a $LOGFILE
 exit `cat "$RCDIR/rc"`
--- a/corba/.hgtags	Mon Sep 14 07:02:50 2015 +0200
+++ b/corba/.hgtags	Mon Sep 14 07:03:04 2015 +0000
@@ -322,3 +322,4 @@
 8bb2441c0fec8b28f7bf11a0ca3ec1642e7ef457 jdk9-b77
 182bb7accc5253bcfefd8edc1d4997ec8f9f8694 jdk9-b78
 4ab250b8fac66ef8cd15ee78c40f0c651c96e16a jdk9-b79
+821a0373ef2d1642a9824facb938b901ad010413 jdk9-b80
--- a/hotspot/.hgtags	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/.hgtags	Mon Sep 14 07:03:04 2015 +0000
@@ -482,3 +482,4 @@
 e66c3813789debfc06f206afde1bf7a84cb08451 jdk9-b77
 20dc06b04fe5ec373879414d60ef82ac70faef98 jdk9-b78
 e9e63d93bbfe2c6c23447e2c1f5cc71c98671cba jdk9-b79
+8e8377739c06b99b9011c003c77e0bef84c91e09 jdk9-b80
--- a/hotspot/make/Makefile	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/Makefile	Mon Sep 14 07:03:04 2015 +0000
@@ -633,9 +633,9 @@
 
 update_jdk: export_product_jdk export_fastdebug_jdk test_jdk
 
-copy_jdk: $(JDK_IMAGE_DIR)/jre/lib/rt.jar
+copy_jdk: $(JDK_IMAGE_DIR)/bin/java
 
-$(JDK_IMAGE_DIR)/jre/lib/rt.jar:
+$(JDK_IMAGE_DIR)/bin/java:
 	$(RM) -r $(JDK_IMAGE_DIR)
 	$(MKDIR) -p $(JDK_IMAGE_DIR)
 	($(CD) $(JDK_IMPORT_PATH) && \
--- a/hotspot/make/aix/makefiles/mapfile-vers-debug	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/aix/makefiles/mapfile-vers-debug	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/hotspot/make/aix/makefiles/mapfile-vers-product	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/aix/makefiles/mapfile-vers-product	Mon Sep 14 07:03:04 2015 +0000
@@ -139,18 +139,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/hotspot/make/bsd/makefiles/mapfile-vers-darwin-debug	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/bsd/makefiles/mapfile-vers-darwin-debug	Mon Sep 14 07:03:04 2015 +0000
@@ -139,18 +139,6 @@
                 _JVM_Halt
                 _JVM_HoldsLock
                 _JVM_IHashCode
-                _JVM_ImageAttributeOffsets
-                _JVM_ImageAttributeOffsetsLength
-                _JVM_ImageClose
-                _JVM_ImageFindAttributes
-                _JVM_ImageGetAttributes
-                _JVM_ImageGetAttributesCount
-                _JVM_ImageGetDataAddress
-                _JVM_ImageGetIndexAddress
-                _JVM_ImageGetStringBytes
-                _JVM_ImageOpen
-                _JVM_ImageRead
-                _JVM_ImageReadCompressed
                 _JVM_InitAgentProperties
                 _JVM_InitProperties
                 _JVM_InternString
--- a/hotspot/make/bsd/makefiles/mapfile-vers-darwin-product	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/bsd/makefiles/mapfile-vers-darwin-product	Mon Sep 14 07:03:04 2015 +0000
@@ -139,18 +139,6 @@
                 _JVM_Halt
                 _JVM_HoldsLock
                 _JVM_IHashCode
-                _JVM_ImageAttributeOffsets
-                _JVM_ImageAttributeOffsetsLength
-                _JVM_ImageClose
-                _JVM_ImageFindAttributes
-                _JVM_ImageGetAttributes
-                _JVM_ImageGetAttributesCount
-                _JVM_ImageGetDataAddress
-                _JVM_ImageGetIndexAddress
-                _JVM_ImageGetStringBytes
-                _JVM_ImageOpen
-                _JVM_ImageRead
-                _JVM_ImageReadCompressed
                 _JVM_InitAgentProperties
                 _JVM_InitProperties
                 _JVM_InternString
--- a/hotspot/make/bsd/makefiles/mapfile-vers-debug	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/bsd/makefiles/mapfile-vers-debug	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/hotspot/make/bsd/makefiles/mapfile-vers-product	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/bsd/makefiles/mapfile-vers-product	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/hotspot/make/bsd/makefiles/vm.make	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/bsd/makefiles/vm.make	Mon Sep 14 07:03:04 2015 +0000
@@ -131,7 +131,7 @@
 # By default, link the *.o into the library, not the executable.
 LINK_INTO$(LINK_INTO) = LIBJVM
 
-JDK_LIBDIR = $(JAVA_HOME)/jre/lib/$(LIBARCH)
+JDK_LIBDIR = $(JAVA_HOME)/lib/$(LIBARCH)
 
 #----------------------------------------------------------------------
 # jvm_db & dtrace
--- a/hotspot/make/build.sh	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/build.sh	Mon Sep 14 07:03:04 2015 +0000
@@ -49,7 +49,7 @@
 # Just in case:
 JAVA_HOME=`( cd $JAVA_HOME; pwd )`
 
-if [ "${ALT_BOOTDIR-}" = ""  -o  ! -d "${ALT_BOOTDIR-}" -o ! -d ${ALT_BOOTDIR-}/jre/lib/ ]; then
+if [ "${ALT_BOOTDIR-}" = ""  -o  ! -d "${ALT_BOOTDIR-}" -o ! -d ${ALT_BOOTDIR-}/lib/ ]; then
     ALT_BOOTDIR=${JAVA_HOME}
 fi
 
--- a/hotspot/make/hotspot.script	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/hotspot.script	Mon Sep 14 07:03:04 2015 +0000
@@ -127,7 +127,7 @@
 #     o		$JRE/lib/$ARCH
 # followed by the user's previous effective LD_LIBRARY_PATH, if
 # any.
-JRE=$JDK/jre
+JRE=$JDK
 JAVA_HOME=$JDK
 export JAVA_HOME
 
--- a/hotspot/make/linux/makefiles/mapfile-vers-debug	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/linux/makefiles/mapfile-vers-debug	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/hotspot/make/linux/makefiles/mapfile-vers-product	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/linux/makefiles/mapfile-vers-product	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/hotspot/make/solaris/makefiles/adlc.make	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/solaris/makefiles/adlc.make	Mon Sep 14 07:03:04 2015 +0000
@@ -76,6 +76,11 @@
 ifeq ($(shell expr $(COMPILER_REV_NUMERIC) \>= 509), 1)
   CFLAGS_WARN = +w -errwarn
 endif
+# When using compiler version 5.13 (Solaris Studio 12.4), calls to explicitly 
+# instantiated template functions trigger this warning when +w is active.
+ifeq ($(shell expr $(COMPILER_REV_NUMERIC) \>= 513), 1)
+  CFLAGS_WARN += -erroff=notemsource
+endif
 CFLAGS += $(CFLAGS_WARN)
 
 ifeq ("${Platform_compiler}", "sparcWorks")
--- a/hotspot/make/solaris/makefiles/buildtree.make	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/solaris/makefiles/buildtree.make	Mon Sep 14 07:03:04 2015 +0000
@@ -270,6 +270,7 @@
 	echo "CP ?= cp"; \
 	echo "MV ?= mv"; \
 	echo "include \$$(GAMMADIR)/make/$(OS_FAMILY)/makefiles/$(VARIANT).make"; \
+	echo "include \$$(GAMMADIR)/make/excludeSrc.make"; \
 	echo "include \$$(GAMMADIR)/make/$(OS_FAMILY)/makefiles/$(COMPILER).make"; \
 	) > $@
 
--- a/hotspot/make/solaris/makefiles/mapfile-vers	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/solaris/makefiles/mapfile-vers	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/hotspot/make/solaris/makefiles/vm.make	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/make/solaris/makefiles/vm.make	Mon Sep 14 07:03:04 2015 +0000
@@ -197,7 +197,7 @@
 Src_Dirs/COMPILER2 := $(CORE_PATHS) $(COMPILER2_PATHS)
 Src_Dirs/TIERED    := $(CORE_PATHS) $(COMPILER1_PATHS) $(COMPILER2_PATHS)
 Src_Dirs/ZERO      := $(CORE_PATHS)
-Src_Dirs/SHARK     := $(CORE_PATHS)
+Src_Dirs/SHARK     := $(CORE_PATHS) $(SHARK_PATHS)
 Src_Dirs := $(Src_Dirs/$(TYPE))
 
 COMPILER2_SPECIFIC_FILES := opto libadt bcEscapeAnalyzer.cpp c2_\* runtime_\*
@@ -206,7 +206,7 @@
 ZERO_SPECIFIC_FILES      := zero
 
 # Always exclude these.
-Src_Files_EXCLUDE := dtrace jsig.c jvmtiEnvRecommended.cpp jvmtiEnvStub.cpp
+Src_Files_EXCLUDE += dtrace jsig.c jvmtiEnvRecommended.cpp jvmtiEnvStub.cpp
 
 # Exclude per type.
 Src_Files_EXCLUDE/CORE      := $(COMPILER1_SPECIFIC_FILES) $(COMPILER2_SPECIFIC_FILES) $(ZERO_SPECIFIC_FILES) $(SHARK_SPECIFIC_FILES) ciTypeFlow.cpp
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad	Mon Sep 14 07:03:04 2015 +0000
@@ -3803,81 +3803,37 @@
 
   enc_class aarch64_enc_cmpxchg(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
     MacroAssembler _masm(&cbuf);
-    Register old_reg = as_Register($oldval$$reg);
-    Register new_reg = as_Register($newval$$reg);
-    Register base = as_Register($mem$$base);
-    Register addr_reg;
-    int index = $mem$$index;
-    int scale = $mem$$scale;
-    int disp = $mem$$disp;
-    if (index == -1) {
-       if (disp != 0) {
-        __ lea(rscratch2, Address(base, disp));
-        addr_reg = rscratch2;
-      } else {
-        // TODO
-        // should we ever get anything other than this case?
-        addr_reg = base;
-      }
-    } else {
-      Register index_reg = as_Register(index);
-      if (disp == 0) {
-        __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      } else {
-        __ lea(rscratch2, Address(base, disp));
-        __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      }
-    }
-    Label retry_load, done;
-    __ bind(retry_load);
-    __ ldxr(rscratch1, addr_reg);
-    __ cmp(rscratch1, old_reg);
-    __ br(Assembler::NE, done);
-    __ stlxr(rscratch1, new_reg, addr_reg);
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(done);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldxr, &MacroAssembler::cmp, &Assembler::stlxr);
   %}
 
   enc_class aarch64_enc_cmpxchgw(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
     MacroAssembler _masm(&cbuf);
-    Register old_reg = as_Register($oldval$$reg);
-    Register new_reg = as_Register($newval$$reg);
-    Register base = as_Register($mem$$base);
-    Register addr_reg;
-    int index = $mem$$index;
-    int scale = $mem$$scale;
-    int disp = $mem$$disp;
-    if (index == -1) {
-       if (disp != 0) {
-        __ lea(rscratch2, Address(base, disp));
-        addr_reg = rscratch2;
-      } else {
-        // TODO
-        // should we ever get anything other than this case?
-        addr_reg = base;
-      }
-    } else {
-      Register index_reg = as_Register(index);
-      if (disp == 0) {
-        __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      } else {
-        __ lea(rscratch2, Address(base, disp));
-        __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      }
-    }
-    Label retry_load, done;
-    __ bind(retry_load);
-    __ ldxrw(rscratch1, addr_reg);
-    __ cmpw(rscratch1, old_reg);
-    __ br(Assembler::NE, done);
-    __ stlxrw(rscratch1, new_reg, addr_reg);
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(done);
-  %}
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
+  %}
+
+
+  // The only difference between aarch64_enc_cmpxchg and
+  // aarch64_enc_cmpxchg_acq is that we use load-acquire in the
+  // CompareAndSwap sequence to serve as a barrier on acquiring a
+  // lock.
+  enc_class aarch64_enc_cmpxchg_acq(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
+    MacroAssembler _masm(&cbuf);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldaxr, &MacroAssembler::cmp, &Assembler::stlxr);
+  %}
+
+  enc_class aarch64_enc_cmpxchgw_acq(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
+    MacroAssembler _masm(&cbuf);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldaxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
+  %}
+
 
   // auxiliary used for CompareAndSwapX to set result register
   enc_class aarch64_enc_cset_eq(iRegINoSp res) %{
@@ -4373,12 +4329,12 @@
       return;
     }
 
-    if (UseBiasedLocking) {
-      __ biased_locking_enter(disp_hdr, oop, box, tmp, true, cont);
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
+      __ biased_locking_enter(box, oop, disp_hdr, tmp, true, cont);
     }
 
     // Handle existing monitor
-    if (EmitSync & 0x02) {
+    if ((EmitSync & 0x02) == 0) {
       // we can use AArch64's bit test and branch here but
       // markoopDesc does not define a bit index just the bit value
       // so assert in case the bit pos changes
@@ -4398,13 +4354,10 @@
 
     // Compare object markOop with mark and if equal exchange scratch1
     // with object markOop.
-    // Note that this is simply a CAS: it does not generate any
-    // barriers.  These are separately generated by
-    // membar_acquire_lock().
     {
       Label retry_load;
       __ bind(retry_load);
-      __ ldxr(tmp, oop);
+      __ ldaxr(tmp, oop);
       __ cmp(tmp, disp_hdr);
       __ br(Assembler::NE, cas_failed);
       // use stlxr to ensure update is immediately visible
@@ -4454,7 +4407,7 @@
       {
         Label retry_load, fail;
         __ bind(retry_load);
-        __ ldxr(rscratch1, tmp);
+        __ ldaxr(rscratch1, tmp);
         __ cmp(disp_hdr, rscratch1);
         __ br(Assembler::NE, fail);
         // use stlxr to ensure update is immediately visible
@@ -4518,7 +4471,7 @@
       return;
     }
 
-    if (UseBiasedLocking) {
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
       __ biased_locking_exit(oop, tmp, cont);
     }
 
@@ -8017,10 +7970,10 @@
   match(MemBarAcquireLock);
   ins_cost(VOLATILE_REF_COST);
 
-  format %{ "membar_acquire_lock" %}
-
-  ins_encode %{
-    __ membar(Assembler::LoadLoad|Assembler::LoadStore);
+  format %{ "membar_acquire_lock (elided)" %}
+
+  ins_encode %{
+    __ block_comment("membar_acquire_lock (elided)");
   %}
 
   ins_pipe(pipe_serial);
@@ -8080,10 +8033,10 @@
   match(MemBarReleaseLock);
   ins_cost(VOLATILE_REF_COST);
 
-  format %{ "membar_release_lock" %}
-
-  ins_encode %{
-    __ membar(Assembler::LoadStore|Assembler::StoreStore);
+  format %{ "membar_release_lock (elided)" %}
+
+  ins_encode %{
+    __ block_comment("membar_release_lock (elided)");
   %}
 
   ins_pipe(pipe_serial);
@@ -8369,7 +8322,11 @@
   ins_pipe(pipe_serial);
 %}
 
-// this has to be implemented as a CAS
+
+// storeLConditional is used by PhaseMacroExpand::expand_lock_node
+// when attempting to rebias a lock towards the current thread.  We
+// must use the acquire form of cmpxchg in order to guarantee acquire
+// semantics in this case.
 instruct storeLConditional(indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr)
 %{
   match(Set cr (StoreLConditional mem (Binary oldval newval)));
@@ -8381,12 +8338,14 @@
     "cmpw rscratch1, zr\t# EQ on successful write"
   %}
 
-  ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval));
+  ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval));
 
   ins_pipe(pipe_slow);
 %}
 
-// this has to be implemented as a CAS
+// storeIConditional also has acquire semantics, for no better reason
+// than matching storeLConditional.  At the time of writing this
+// comment storeIConditional was not used anywhere by AArch64.
 instruct storeIConditional(indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr)
 %{
   match(Set cr (StoreIConditional mem (Binary oldval newval)));
@@ -8398,7 +8357,7 @@
     "cmpw rscratch1, zr\t# EQ on successful write"
   %}
 
-  ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval));
+  ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval));
 
   ins_pipe(pipe_slow);
 %}
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1210,7 +1210,7 @@
 
   INSN(ldrs, 0b00, 1);
   INSN(ldrd, 0b01, 1);
-  INSN(ldrq, 0x10, 1);
+  INSN(ldrq, 0b10, 1);
 
 #undef INSN
 
@@ -2285,13 +2285,13 @@
 #undef INSN
 
   // Table vector lookup
-#define INSN(NAME, op)                                                                                       \
-  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, unsigned registers, FloatRegister Vm) {  \
-    starti;                                                                                                  \
-    assert(T == T8B || T == T16B, "invalid arrangement");                                                    \
-    assert(0 < registers && registers <= 4, "invalid number of registers");                                  \
-    f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21), rf(Vm, 16), f(0, 15);                               \
-    f(registers - 1, 14, 13), f(op, 12),f(0b00, 11, 10), rf(Vn, 5), rf(Vd, 0);                               \
+#define INSN(NAME, op)                                                  \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, unsigned registers, FloatRegister Vm) { \
+    starti;                                                             \
+    assert(T == T8B || T == T16B, "invalid arrangement");               \
+    assert(0 < registers && registers <= 4, "invalid number of registers"); \
+    f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21), rf(Vm, 16), f(0, 15); \
+    f(registers - 1, 14, 13), f(op, 12),f(0b00, 11, 10), rf(Vn, 5), rf(Vd, 0); \
   }
 
   INSN(tbl, 0);
@@ -2299,6 +2299,7 @@
 
 #undef INSN
 
+  // AdvSIMD two-reg misc
 #define INSN(NAME, U, opcode)                                                       \
   void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {               \
        starti;                                                                      \
@@ -2316,10 +2317,19 @@
 
 #define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H)
   INSN(rev32, 1, 0b00000);
+private:
+  INSN(_rbit, 1, 0b00101);
+public:
+
 #undef ASSERTION
 
 #define ASSERTION (T == T8B || T == T16B)
   INSN(rev16, 0, 0b00001);
+  // RBIT only allows T8B and T16B but encodes them oddly.  Argh...
+  void rbit(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {
+    assert((ASSERTION), MSG);
+    _rbit(Vd, SIMD_Arrangement(T & 1 | 0b010), Vn);
+  }
 #undef ASSERTION
 
 #undef MSG
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -3043,7 +3043,9 @@
   // register obj is destroyed afterwards.
 
   BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+  assert(bs->kind() == BarrierSet::CardTableForRS ||
+         bs->kind() == BarrierSet::CardTableExtension,
+         "Wrong barrier set kind");
 
   CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -917,6 +917,8 @@
 
   void cmpptr(Register src1, Address src2);
 
+  // Various forms of CAS
+
   void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                   Label &suceed, Label *fail);
 
@@ -938,6 +940,23 @@
     str(rscratch2, adr);
   }
 
+  // A generic CAS; success or failure is in the EQ flag.
+  template <typename T1, typename T2>
+  void cmpxchg(Register addr, Register expected, Register new_val,
+               T1 load_insn,
+               void (MacroAssembler::*cmp_insn)(Register, Register),
+               T2 store_insn,
+               Register tmp = rscratch1) {
+    Label retry_load, done;
+    bind(retry_load);
+    (this->*load_insn)(tmp, addr);
+    (this->*cmp_insn)(tmp, expected);
+    br(Assembler::NE, done);
+    (this->*store_insn)(tmp, new_val, addr);
+    cbnzw(tmp, retry_load);
+    bind(done);
+  }
+
   // Calls
 
   address trampoline_call(Address entry, CodeBuffer *cbuf = NULL);
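The template above leaves the choice of load, compare, and store to the caller via member-function pointers; the encoding classes in aarch64.ad earlier in this changeset pass &Assembler::ldxr or &Assembler::ldaxr depending on whether the CAS needs acquire semantics. A minimal, self-contained C++ sketch of that dispatch pattern follows; the Emitter type and its methods are illustrative stand-ins, not HotSpot code, and the sketch omits the retry loop and NE branch that the real template wraps around the three calls.

    #include <iostream>

    struct Emitter {
      void ldxr(int rt, int rn)  { std::cout << "ldxr  w" << rt << ", [x" << rn << "]\n"; }
      void ldaxr(int rt, int rn) { std::cout << "ldaxr w" << rt << ", [x" << rn << "]\n"; }
      void cmp(int ra, int rb)   { std::cout << "cmp   w" << ra << ", w" << rb << "\n"; }
      void stlxr(int rs, int rt, int rn) { std::cout << "stlxr w" << rs << ", w" << rt << ", [x" << rn << "]\n"; }

      // Same shape as MacroAssembler::cmpxchg: the caller selects the flavour
      // of each step by passing pointers to member functions.
      template <typename Load, typename Cmp, typename Store>
      void cmpxchg(int addr, int expected, int new_val,
                   Load load, Cmp compare, Store store, int tmp = 8) {
        (this->*load)(tmp, addr);            // exclusive load into tmp
        (this->*compare)(tmp, expected);     // sets the flags
        (this->*store)(tmp, new_val, addr);  // conditional store
      }
    };

    int main() {
      Emitter e;
      e.cmpxchg(0, 1, 2, &Emitter::ldxr,  &Emitter::cmp, &Emitter::stlxr);  // relaxed form
      e.cmpxchg(0, 1, 2, &Emitter::ldaxr, &Emitter::cmp, &Emitter::stlxr);  // acquiring form
    }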
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -691,7 +691,7 @@
         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
         __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp        }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -731,7 +731,7 @@
           __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp        }
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
@@ -2364,7 +2364,7 @@
    *   c_rarg3   - int* table
    *
   * Output:
-   *       rax   - int crc result
+   *       r0   - int crc result
    */
   address generate_updateBytesCRC32C() {
     assert(UseCRC32CIntrinsics, "what are we doing here?");
@@ -2435,6 +2435,69 @@
     return start;
   }
 
+  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
+                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
+                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
+    // Karatsuba multiplication performs a 128*128 -> 256-bit
+    // multiplication in three 128-bit multiplications and a few
+    // additions.
+    //
+    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
+    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
+    //
+    // Inputs:
+    //
+    // A0 in a.d[0]     (subkey)
+    // A1 in a.d[1]
+    // (A1+A0) in a1_xor_a0.d[0]
+    //
+    // B0 in b.d[0]     (state)
+    // B1 in b.d[1]
+
+    __ ext(tmp1, __ T16B, b, b, 0x08);
+    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
+    __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
+    __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
+    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
+
+    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
+    __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
+    __ eor(tmp2, __ T16B, tmp2, tmp4);
+    __ eor(tmp2, __ T16B, tmp2, tmp3);
+
+    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
+    __ ins(result_hi, __ D, tmp2, 0, 1);
+    __ ins(result_lo, __ D, tmp2, 1, 0);
+  }
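Spelling out the step the Karatsuba comment above leaves implicit, with every "+" meaning XOR in GF(2) (a restatement for reference, using the A/B/C/D/E names from the comment):

    \[(A_0 + A_1)(B_0 + B_1) = A_1 B_1 + A_0 B_0 + (A_1 B_0 + A_0 B_1)\]

so the cross terms that form the middle 128 bits of the product are

    \[A_1 B_0 + A_0 B_1 = E + C + D, \qquad E = (A_0 + A_1)(B_0 + B_1),\ C = A_1 B_1,\ D = A_0 B_0,\]

which matches the eor instructions applied to tmp2 above before its two halves are inserted into the middle of <result_hi:result_lo>.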
+
+  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
+                    FloatRegister p, FloatRegister z, FloatRegister t1) {
+    const FloatRegister t0 = result;
+
+    // The GCM field polynomial f is z^128 + p(z), where p =
+    // z^7+z^2+z+1.
+    //
+    //    z^128 === -p(z)  (mod (z^128 + p(z)))
+    //
+    // so, given that the product we're reducing is
+    //    a == lo + hi * z^128
+    // substituting,
+    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
+    //
+    // we reduce by multiplying hi by p(z) and subtracting the result
+    // from (i.e. XORing it with) lo.  Because p has no nonzero high
+    // bits we can do this with two 64-bit multiplications, lo*p and
+    // hi*p.
+
+    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
+    __ ext(t1, __ T16B, t0, z, 8);
+    __ eor(hi, __ T16B, hi, t1);
+    __ ext(t1, __ T16B, z, t0, 8);
+    __ eor(lo, __ T16B, lo, t1);
+    __ pmull(t0, __ T1Q, hi, p, __ T1D);
+    __ eor(result, __ T16B, lo, t0);
+  }
+
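For reference, the reduction identity the comment above uses, written out (subtraction is XOR in GF(2), so -p(z) = p(z)):

    \[z^{128} \equiv p(z) \pmod{z^{128} + p(z)}, \qquad p(z) = z^7 + z^2 + z + 1\]
    \[a = lo + hi \cdot z^{128} \;\equiv\; lo + hi \cdot p(z) \pmod{z^{128} + p(z)}\]

Because p(z) occupies only the low 8 bits (its bit pattern is 0x87, the constant the stub below loads into both halves of a 128-bit vector), hi * p(z) can be formed with two 64-bit carry-less multiplies, one per half of hi, which is what the pmull2/pmull pair above does.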
   /**
    *  Arguments:
    *
@@ -2448,10 +2511,27 @@
    *  Updated state at c_rarg0
    */
   address generate_ghash_processBlocks() {
+    // Bafflingly, GCM uses little-endian for the byte order, but
+    // big-endian for the bit order.  For example, the polynomial 1 is
+    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
+    //
+    // So, we must either reverse the bytes in each word and do
+    // everything big-endian or reverse the bits in each byte and do
+    // it little-endian.  On AArch64 it's more idiomatic to reverse
+    // the bits in each byte (we have an instruction, RBIT, to do
+    // that) and keep the data in little-endian bit order throughout the
+    // calculation, bit-reversing the inputs and outputs.
+
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    __ align(wordSize * 2);
+    address p = __ pc();
+    __ emit_int64(0x87);  // The low-order bits of the field
+                          // polynomial (i.e. p = z^7+z^2+z+1)
+                          // repeated in the low and high parts of a
+                          // 128-bit vector
+    __ emit_int64(0x87);
+
     __ align(CodeEntryAlignment);
-    Label L_ghash_loop, L_exit;
-
-    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
     address start = __ pc();
 
     Register state   = c_rarg0;
@@ -2462,104 +2542,43 @@
     FloatRegister vzr = v30;
     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 
-    __ mov(v26, __ T16B, 1);
-    __ mov(v27, __ T16B, 63);
-    __ mov(v28, __ T16B, 62);
-    __ mov(v29, __ T16B, 57);
-
-    __ ldrq(v6, Address(state));
-    __ ldrq(v16, Address(subkeyH));
-
-    __ ext(v0, __ T16B, v6, v6, 0x08);
-    __ ext(v1, __ T16B, v16, v16, 0x08);
-    __ eor(v16, __ T16B, v16, v1);
-
-    __ bind(L_ghash_loop);
-
-    __ ldrq(v2, Address(__ post(data, 0x10)));
-    __ rev64(v2, __ T16B, v2); // swap data
-
-    __ ext(v6, __ T16B, v0, v0, 0x08);
-    __ eor(v6, __ T16B, v6, v2);
-    __ ext(v2, __ T16B, v6, v6, 0x08);
-
-    __ pmull2(v7, __ T1Q, v2, v1, __ T2D);  // A1*B1
-    __ eor(v6, __ T16B, v6, v2);
-    __ pmull(v5,  __ T1Q, v2, v1, __ T1D);  // A0*B0
-    __ pmull(v20, __ T1Q, v6, v16, __ T1D);  // (A1 + A0)(B1 + B0)
-
-    __ ext(v21, __ T16B, v5, v7, 0x08);
-    __ eor(v18, __ T16B, v7, v5); // A1*B1 xor A0*B0
-    __ eor(v20, __ T16B, v20, v21);
-    __ eor(v20, __ T16B, v20, v18);
-
-    // Registers pair <v7:v5> holds the result of carry-less multiplication
-    __ ins(v7, __ D, v20, 0, 1);
-    __ ins(v5, __ D, v20, 1, 0);
-
-    // Result of the multiplication is shifted by one bit position
-    // [X3:X2:X1:X0] = [X3:X2:X1:X0] << 1
-    __ ushr(v18, __ T2D, v5, -63 & 63);
-    __ ins(v25, __ D, v18, 1, 0);
-    __ ins(v25, __ D, vzr, 0, 0);
-    __ ushl(v5, __ T2D, v5, v26);
-    __ orr(v5, __ T16B, v5, v25);
-
-    __ ushr(v19, __ T2D, v7, -63 & 63);
-    __ ins(v19, __ D, v19, 1, 0);
-    __ ins(v19, __ D, v18, 0, 1);
-    __ ushl(v7, __ T2D, v7, v26);
-    __ orr(v6, __ T16B, v7, v19);
-
-    __ ins(v24, __ D, v5, 0, 1);
-
-    // A = X0 << 63
-    __ ushl(v21, __ T2D, v5, v27);
-
-    // A = X0 << 62
-    __ ushl(v22, __ T2D, v5, v28);
-
-    // A = X0 << 57
-    __ ushl(v23, __ T2D, v5, v29);
-
-    // D = X1^A^B^C
-    __ eor(v21, __ T16B, v21, v22);
-    __ eor(v21, __ T16B, v21, v23);
-    __ eor(v21, __ T16B, v21, v24);
-    __ ins(v5, __ D, v21, 1, 0);
-
-    // [E1:E0] = [D:X0] >> 1
-    __ ushr(v20, __ T2D, v5, -1 & 63);
-    __ ushl(v18, __ T2D, v5, v27);
-    __ ext(v25, __ T16B, v18, vzr, 0x08);
-    __ orr(v19, __ T16B, v20, v25);
-
-    __ eor(v7, __ T16B, v5, v19);
-
-    // [F1:F0] = [D:X0] >> 2
-    __ ushr(v20, __ T2D, v5, -2 & 63);
-    __ ushl(v18, __ T2D, v5, v28);
-    __ ins(v25, __ D, v18, 0, 1);
-    __ orr(v19, __ T16B, v20, v25);
-
-    __ eor(v7, __ T16B, v7, v19);
-
-    // [G1:G0] = [D:X0] >> 7
-    __ ushr(v20, __ T2D, v5, -7 & 63);
-    __ ushl(v18, __ T2D, v5, v29);
-    __ ins(v25, __ D, v18, 0, 1);
-    __ orr(v19, __ T16B, v20, v25);
-
-    // [H1:H0] = [D^E1^F1^G1:X0^E0^F0^G0]
-    __ eor(v7, __ T16B, v7, v19);
-
-    // Result = [H1:H0]^[X3:X2]
-    __ eor(v0, __ T16B, v7, v6);
-
-    __ subs(blocks, blocks, 1);
-    __ cbnz(blocks, L_ghash_loop);
-
-    __ ext(v1, __ T16B, v0, v0, 0x08);
+    __ ldrq(v0, Address(state));
+    __ ldrq(v1, Address(subkeyH));
+
+    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
+    __ rbit(v0, __ T16B, v0);
+    __ rev64(v1, __ T16B, v1);
+    __ rbit(v1, __ T16B, v1);
+
+    __ ldrq(v26, p);
+
+    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
+    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+
+    {
+      Label L_ghash_loop;
+      __ bind(L_ghash_loop);
+
+      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
+                                                 // reversing each byte
+      __ rbit(v2, __ T16B, v2);
+      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
+
+      // Multiply state in v2 by subkey in v1
+      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
+                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
+                     /*temps*/v6, v20, v18, v21);
+      // Reduce v7:v5 by the field polynomial
+      ghash_reduce(v0, v5, v7, v26, vzr, v20);
+
+      __ sub(blocks, blocks, 1);
+      __ cbnz(blocks, L_ghash_loop);
+    }
+
+    // The bit-reversed result is at this point in v0
+    __ rev64(v1, __ T16B, v0);
+    __ rbit(v1, __ T16B, v1);
+
     __ st1(v1, __ T16B, state);
     __ ret(lr);
 
--- a/hotspot/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -186,7 +186,7 @@
       }
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       {
         if (val == noreg) {
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -177,6 +177,12 @@
   if (UseCRC32 && (auxv & HWCAP_CRC32) == 0) {
     warning("UseCRC32 specified, but not supported on this CPU");
   }
+
+  if (UseAdler32Intrinsics) {
+    warning("Adler32Intrinsics not available on this CPU.");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
   if (auxv & HWCAP_AES) {
     UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
     UseAESIntrinsics =
--- a/hotspot/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -2614,7 +2614,7 @@
 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
   CardTableModRefBS* bs =
     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
-  assert(bs->kind() == BarrierSet::CardTableModRef ||
+  assert(bs->kind() == BarrierSet::CardTableForRS ||
          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
 #ifdef ASSERT
   cmpdi(CCR0, Rnew_val, 0);
--- a/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -656,7 +656,7 @@
           __ bind(filtered);
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -697,7 +697,7 @@
           }
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           Label Lskip_loop, Lstore_loop;
--- a/hotspot/src/cpu/ppc/vm/templateTable_ppc_64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/ppc/vm/templateTable_ppc_64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -105,7 +105,7 @@
       }
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       {
         Label Lnull, Ldone;
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -200,6 +200,11 @@
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
+  if (UseAdler32Intrinsics) {
+    warning("Adler32Intrinsics not available on this CPU.");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     UseMultiplyToLenIntrinsic = true;
   }
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -3958,7 +3958,7 @@
   if (new_val == G0) return;
   CardTableModRefBS* bs =
     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
-  assert(bs->kind() == BarrierSet::CardTableModRef ||
+  assert(bs->kind() == BarrierSet::CardTableForRS ||
          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
   card_table_write(bs->byte_map_base, tmp, store_addr);
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/sparc/vm/memset_with_concurrent_readers_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+
+#include "gc/shared/memset_with_concurrent_readers.hpp"
+#include "runtime/prefetch.inline.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/macros.hpp"
+
+#if INCLUDE_ALL_GCS
+
+// An implementation of memset, for use when there may be concurrent
+// readers of the region being stored into.
+//
+// We can't use the standard library memset if it is implemented using
+// block initializing stores.  Doing so can result in concurrent readers
+// seeing spurious zeros.
+//
+// We can't use the obvious C/C++ for-loop, because the compiler may
+// recognize the idiomatic loop and optimize it into a call to the
+// standard library memset; we've seen exactly this happen with, for
+// example, Solaris Studio 12.3.  Hence the use of inline assembly
+// code, hiding loops from the compiler's optimizer.
+//
+// We don't attempt to use the standard library memset when it is safe
+// to do so.  We could conservatively do so by detecting the presence
+// of block initializing stores (VM_Version::has_blk_init()), but the
+// implementation provided here should be sufficient.
+
+inline void fill_subword(void* start, void* end, int value) {
+  STATIC_ASSERT(BytesPerWord == 8);
+  assert(pointer_delta(end, start, 1) < BytesPerWord, "precondition");
+  // Dispatch on (end - start).
+  void* pc;
+  __asm__ volatile(
+    // offset := (7 - (end - start)) + 3
+    //   3 instructions from rdpc to DISPATCH
+    " sub %[offset], %[end], %[offset]\n\t" // offset := start - end
+    " sllx %[offset], 2, %[offset]\n\t" // scale offset for instruction size of 4
+    " add %[offset], 40, %[offset]\n\t" // offset += 10 * instruction size
+    " rd %pc, %[pc]\n\t"                // dispatch on scaled offset
+    " jmpl %[pc]+%[offset], %g0\n\t"
+    "  nop\n\t"
+    // DISPATCH: no direct reference, but without it the store block may be elided.
+    "1:\n\t"
+    " stb %[value], [%[end]-7]\n\t" // end[-7] = value
+    " stb %[value], [%[end]-6]\n\t"
+    " stb %[value], [%[end]-5]\n\t"
+    " stb %[value], [%[end]-4]\n\t"
+    " stb %[value], [%[end]-3]\n\t"
+    " stb %[value], [%[end]-2]\n\t"
+    " stb %[value], [%[end]-1]\n\t" // end[-1] = value
+    : /* no outputs */
+      [pc] "&=r" (pc)               // temp
+    : [offset] "&+r" (start),
+      [end] "r" (end),
+      [value] "r" (value)
+    : "memory");
+}
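The rdpc/jmpl pair above computes a jump into the run of seven stb instructions so that exactly (end - start) trailing bytes are stored, with no loop the compiler could recognize. A plain C++ sketch of the same dispatch, for readability only (hypothetical, not code from this file):

    // Store the final (end - start) bytes, 0..7 of them, one store per byte.
    static void fill_subword_sketch(unsigned char* start, unsigned char* end, int value) {
      const unsigned char v = static_cast<unsigned char>(value);
      switch (end - start) {      // same dispatch the computed jump performs
      case 7: end[-7] = v;        // fall through
      case 6: end[-6] = v;        // fall through
      case 5: end[-5] = v;        // fall through
      case 4: end[-4] = v;        // fall through
      case 3: end[-3] = v;        // fall through
      case 2: end[-2] = v;        // fall through
      case 1: end[-1] = v;        // fall through
      default: break;
      }
    }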
+
+void memset_with_concurrent_readers(void* to, int value, size_t size) {
+  Prefetch::write(to, 0);
+  void* end = static_cast<char*>(to) + size;
+  if (size >= BytesPerWord) {
+    // Fill any partial word prefix.
+    uintx* aligned_to = static_cast<uintx*>(align_ptr_up(to, BytesPerWord));
+    fill_subword(to, aligned_to, value);
+
+    // Compute fill word.
+    STATIC_ASSERT(BitsPerByte == 8);
+    STATIC_ASSERT(BitsPerWord == 64);
+    uintx xvalue = value & 0xff;
+    xvalue |= (xvalue << 8);
+    xvalue |= (xvalue << 16);
+    xvalue |= (xvalue << 32);
+
+    uintx* aligned_end = static_cast<uintx*>(align_ptr_down(end, BytesPerWord));
+    assert(aligned_to <= aligned_end, "invariant");
+
+    // for ( ; aligned_to < aligned_end; ++aligned_to) {
+    //   *aligned_to = xvalue;
+    // }
+    uintptr_t temp;
+    __asm__ volatile(
+      // Unroll loop x8.
+      " sub %[aend], %[ato], %[temp]\n\t"
+      " cmp %[temp], 56\n\t"           // cc := (aligned_end - aligned_to) > 7 words
+      " ba %xcc, 2f\n\t"               // goto TEST always
+      "  sub %[aend], 56, %[temp]\n\t" // limit := aligned_end - 7 words
+      // LOOP:
+      "1:\n\t"                         // unrolled x8 store loop top
+      " cmp %[temp], %[ato]\n\t"       // cc := limit > (next) aligned_to
+      " stx %[xvalue], [%[ato]-64]\n\t" // store 8 words, aligned_to pre-incremented
+      " stx %[xvalue], [%[ato]-56]\n\t"
+      " stx %[xvalue], [%[ato]-48]\n\t"
+      " stx %[xvalue], [%[ato]-40]\n\t"
+      " stx %[xvalue], [%[ato]-32]\n\t"
+      " stx %[xvalue], [%[ato]-24]\n\t"
+      " stx %[xvalue], [%[ato]-16]\n\t"
+      " stx %[xvalue], [%[ato]-8]\n\t"
+      // TEST:
+      "2:\n\t"
+      " bgu,a %xcc, 1b\n\t"            // goto LOOP if more than 7 words remaining
+      "  add %[ato], 64, %[ato]\n\t"   // aligned_to += 8, for next iteration
+      // Fill remaining < 8 full words.
+      // Dispatch on (aligned_end - aligned_to).
+      // offset := (7 - (aligned_end - aligned_to)) + 3
+      //   3 instructions from rdpc to DISPATCH
+      " sub %[ato], %[aend], %[ato]\n\t" // offset := aligned_to - aligned_end
+      " srax %[ato], 1, %[ato]\n\t"      // scale offset for instruction size of 4
+      " add %[ato], 40, %[ato]\n\t"      // offset += 10 * instruction size
+      " rd %pc, %[temp]\n\t"             // dispatch on scaled offset
+      " jmpl %[temp]+%[ato], %g0\n\t"
+      "  nop\n\t"
+      // DISPATCH: no direct reference, but without it the store block may be elided.
+      "3:\n\t"
+      " stx %[xvalue], [%[aend]-56]\n\t" // aligned_end[-7] = xvalue
+      " stx %[xvalue], [%[aend]-48]\n\t"
+      " stx %[xvalue], [%[aend]-40]\n\t"
+      " stx %[xvalue], [%[aend]-32]\n\t"
+      " stx %[xvalue], [%[aend]-24]\n\t"
+      " stx %[xvalue], [%[aend]-16]\n\t"
+      " stx %[xvalue], [%[aend]-8]\n\t"  // aligned_end[-1] = xvalue
+      : /* no outputs */
+        [temp] "&=r" (temp)
+      : [ato] "&+r" (aligned_to),
+        [aend] "r" (aligned_end),
+        [xvalue] "r" (xvalue)
+      : "cc", "memory");
+    to = aligned_end;           // setup for suffix
+  }
+  // Fill any partial word suffix.  Also the prefix if size < BytesPerWord.
+  fill_subword(to, end, value);
+}
+
+#endif // INCLUDE_ALL_GCS
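Read as ordinary C++, the function above does the following; this sketch is only to make the control flow of the assembly easier to follow and, as the file's header comment explains, must not be used on SPARC because the compiler may turn the loops back into a library memset call (names are illustrative, not from this file):

    #include <cstddef>
    #include <cstdint>

    static void memset_with_concurrent_readers_sketch(void* to, int value, size_t size) {
      unsigned char* p   = static_cast<unsigned char*>(to);
      unsigned char* end = p + size;
      // Byte prefix up to an 8-byte boundary (fill_subword on the prefix).
      while (p < end && (reinterpret_cast<uintptr_t>(p) & 7) != 0) {
        *p++ = static_cast<unsigned char>(value);
      }
      // Replicate the byte into a 64-bit fill word, exactly as the real code does.
      uint64_t xvalue = static_cast<uint64_t>(value & 0xff);
      xvalue |= (xvalue << 8);
      xvalue |= (xvalue << 16);
      xvalue |= (xvalue << 32);
      // Aligned word body (the unrolled stx loop and its trailing dispatch).
      while (end - p >= 8) {
        *reinterpret_cast<uint64_t*>(p) = xvalue;
        p += 8;
      }
      // Byte suffix, and the whole job when size is below a word (fill_subword on the suffix).
      while (p < end) {
        *p++ = static_cast<unsigned char>(value);
      }
    }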
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -981,7 +981,7 @@
           __ restore();
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -1014,7 +1014,7 @@
           __ restore();
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
@@ -5110,6 +5110,188 @@
     return start;
   }
 
+#define ADLER32_NUM_TEMPS 16
+
+  /**
+   *  Arguments:
+   *
+   * Inputs:
+   *   O0   - int   adler
+   *   O1   - byte* buff
+   *   O2   - int   len
+   *
+   * Output:
+   *   O0   - int adler result
+   */
+  address generate_updateBytesAdler32() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
+    address start = __ pc();
+
+    Label L_cleanup_loop, L_cleanup_loop_check;
+    Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
+    Label L_nmax_check_done;
+
+    // Aliases
+    Register s1     = O0;
+    Register s2     = O3;
+    Register buff   = O1;
+    Register len    = O2;
+    Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};
+
+    // Max number of bytes we can process before having to take the mod
+    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
+    unsigned long NMAX = 0x15B0;
+
+    // Zero-out the upper bits of len
+    __ clruwu(len);
+
+    // Create the mask 0xFFFF
+    __ set64(0x00FFFF, O4, O5); // O5 is the temp register
+
+    // s1 is initialized to the lower 16 bits of adler
+    // s2 is initialized to the upper 16 bits of adler
+    __ srlx(O0, 16, O5); // adler >> 16
+    __ and3(O0, O4, s1); // s1  = (adler & 0xFFFF)
+    __ and3(O5, O4, s2); // s2  = ((adler >> 16) & 0xFFFF)
+
+    // The pipelined loop needs at least 16 elements for 1 iteration
+    // It does check this, but it is more effective to skip to the cleanup loop
+    // Setup the constant for cutoff checking
+    __ mov(15, O4);
+
+    // Check if we are above the cutoff, if not go to the cleanup loop immediately
+    __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);
+
+    // Free up some registers for our use
+    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
+      __ movxtod(temp[i], as_FloatRegister(2*i));
+    }
+
+    // Loop maintenance stuff is done at the end of the loop, so skip to there
+    __ ba_short(L_main_loop_check);
+
+    __ BIND(L_main_loop);
+
+    // Prologue for inner loop
+    __ ldub(buff, 0, L0);
+    __ dec(O5);
+
+    for (int i = 1; i < 8; i++) {
+      __ ldub(buff, i, temp[i]);
+    }
+
+    __ inc(buff, 8);
+
+    // Inner loop processes 16 elements at a time; it might never execute if only 16
+    // elements are to be processed by the outer loop
+    __ ba_short(L_inner_loop_check);
+
+    __ BIND(L_inner_loop);
+
+    for (int i = 0; i < 8; i++) {
+      __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
+      __ add(s1, temp[i], s1);
+      __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
+      __ add(s2, s1, s2);
+    }
+
+    // Original temp 0-7 used and new loads to temp 0-7 issued
+    // temp 8-15 ready to be consumed
+    __ add(s1, I0, s1);
+    __ dec(O5);
+    __ add(s2, s1, s2);
+    __ add(s1, I1, s1);
+    __ inc(buff, 16);
+    __ add(s2, s1, s2);
+
+    for (int i = 0; i < 6; i++) {
+      __ add(s1, temp[10+i], s1);
+      __ add(s2, s1, s2);
+    }
+
+    __ BIND(L_inner_loop_check);
+    __ nop();
+    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);
+
+    // Epilogue
+    for (int i = 0; i < 4; i++) {
+      __ ldub(buff, (2*i), temp[8+(2*i)]);
+      __ add(s1, temp[i], s1);
+      __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
+      __ add(s2, s1, s2);
+    }
+
+    __ add(s1, temp[4], s1);
+    __ inc(buff, 8);
+
+    for (int i = 0; i < 11; i++) {
+      __ add(s2, s1, s2);
+      __ add(s1, temp[5+i], s1);
+    }
+
+    __ add(s2, s1, s2);
+
+    // Take the mod for s1 and s2
+    __ set64(0xFFF1, L0, L1);
+    __ udivx(s1, L0, L1);
+    __ udivx(s2, L0, L2);
+    __ mulx(L0, L1, L1);
+    __ mulx(L0, L2, L2);
+    __ sub(s1, L1, s1);
+    __ sub(s2, L2, s2);
+
+    // Make sure there is something left to process
+    __ BIND(L_main_loop_check);
+    __ set64(NMAX, L0, L1);
+    // k = len < NMAX ? len : NMAX
+    __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
+    __ andn(len, 0x0F, L0); // only loop a multiple of 16 times
+    __ BIND(L_nmax_check_done);
+    __ mov(L0, O5);
+    __ sub(len, L0, len); // len -= k
+
+    __ srlx(O5, 4, O5); // multiplies of 16
+    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);
+
+    // Restore anything we used, take the mod one last time, combine and return
+    // Restore any registers we saved
+    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
+      __ movdtox(as_FloatRegister(2*i), temp[i]);
+    }
+
+    // There might be nothing left to process
+    __ ba_short(L_cleanup_loop_check);
+
+    __ BIND(L_cleanup_loop);
+    __ ldub(buff, 0, O4); // load single byte from buffer
+    __ inc(buff); // buff++
+    __ add(s1, O4, s1); // s1 += *buff++;
+    __ dec(len); // len--
+    __ add(s1, s2, s2); // s2 += s1;
+    __ BIND(L_cleanup_loop_check);
+    __ nop();
+    __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);
+
+    // Take the mod one last time
+    __ set64(0xFFF1, O1, O2);
+    __ udivx(s1, O1, O2);
+    __ udivx(s2, O1, O5);
+    __ mulx(O1, O2, O2);
+    __ mulx(O1, O5, O5);
+    __ sub(s1, O2, s1);
+    __ sub(s2, O5, s2);
+
+    // Combine lower bits and higher bits
+    __ sllx(s2, 16, s2); // s2 = s2 << 16
+    __ or3(s1, s2, s1);  // adler = s2 | s1
+    // Final return value is in O0
+    __ retl();
+    __ delayed()->nop();
+
+    return start;
+  }
+
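For reference, the scalar Adler-32 computation that the stub above implements, with the unrolling and register shuffling removed; BASE (0xFFF1 = 65521) and NMAX (0x15B0 = 5552) match the constants used in the stub, and the function name is illustrative only:

    #include <cstddef>
    #include <cstdint>

    static uint32_t adler32_sketch(uint32_t adler, const uint8_t* buff, size_t len) {
      uint32_t s1 = adler & 0xFFFF;          // s1 = lower 16 bits of adler
      uint32_t s2 = (adler >> 16) & 0xFFFF;  // s2 = upper 16 bits of adler
      while (len > 0) {
        // k = len < NMAX ? len : NMAX -- the most bytes that can be added before
        // s2 could overflow 32 bits, so the mod is only taken once per batch.
        size_t k = len < 5552 ? len : 5552;
        len -= k;
        while (k-- > 0) {
          s1 += *buff++;   // s1 += *buff++;
          s2 += s1;        // s2 += s1;
        }
        s1 %= 65521;       // take the mod (0xFFF1)
        s2 %= 65521;
      }
      return (s2 << 16) | s1;  // combine lower and upper halves
    }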
   void generate_initial() {
     // Generates all stubs and initializes the entry points
 
@@ -5206,6 +5388,11 @@
     if (UseCRC32CIntrinsics) {
       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
     }
+
+    // generate Adler32 intrinsics code
+    if (UseAdler32Intrinsics) {
+      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
+    }
   }
 
 
--- a/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -41,7 +41,7 @@
 enum /* platform_dependent_constants */ {
   // %%%%%%%% May be able to shrink this a lot
   code_size1 = 20000,           // simply increase if too small (assembler will crash if too small)
-  code_size2 = 24000            // simply increase if too small (assembler will crash if too small)
+  code_size2 = 27000            // simply increase if too small (assembler will crash if too small)
 };
 
 class Sparc {
--- a/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -91,7 +91,7 @@
       }
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       {
         if (index == noreg ) {
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -85,27 +85,6 @@
   _supports_cx8 = has_v9();
   _supports_atomic_getset4 = true; // swap instruction
 
-  // There are Fujitsu Sparc64 CPUs which support blk_init as well so
-  // we have to take this check out of the 'is_niagara()' block below.
-  if (has_blk_init()) {
-    // When using CMS or G1, we cannot use memset() in BOT updates
-    // because the sun4v/CMT version in libc_psr uses BIS which
-    // exposes "phantom zeros" to concurrent readers. See 6948537.
-    if (FLAG_IS_DEFAULT(UseMemSetInBOT) && (UseConcMarkSweepGC || UseG1GC)) {
-      FLAG_SET_DEFAULT(UseMemSetInBOT, false);
-    }
-    // Issue a stern warning if the user has explicitly set
-    // UseMemSetInBOT (it is known to cause issues), but allow
-    // use for experimentation and debugging.
-    if (UseConcMarkSweepGC || UseG1GC) {
-      if (UseMemSetInBOT) {
-        assert(!FLAG_IS_DEFAULT(UseMemSetInBOT), "Error");
-        warning("Experimental flag -XX:+UseMemSetInBOT is known to cause instability"
-                " on sun4v; please understand that you are using at your own risk!");
-      }
-    }
-  }
-
   if (is_niagara()) {
     // Indirect branch is the same cost as direct
     if (FLAG_IS_DEFAULT(UseInlineCaches)) {
@@ -377,6 +356,15 @@
     FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
   }
 
+  if (UseVIS > 2) {
+    if (FLAG_IS_DEFAULT(UseAdler32Intrinsics)) {
+      FLAG_SET_DEFAULT(UseAdler32Intrinsics, true);
+    }
+  } else if (UseAdler32Intrinsics) {
+    warning("SPARC Adler32 intrinsics require VIS3 instruction support. Intrinsics will be disabled.");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
     (cache_line_size > ContendedPaddingWidth))
     ContendedPaddingWidth = cache_line_size;
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -394,25 +394,25 @@
   int mod_idx = 0;
   // We will test if the displacement fits the compressed format and if so
   // apply the compression to the displacement iff the result is 8-bit.
-  if (VM_Version::supports_evex() && is_evex_instruction) {
-    switch (tuple_type) {
+  if (VM_Version::supports_evex() && _is_evex_instruction) {
+    switch (_tuple_type) {
     case EVEX_FV:
-      if ((evex_encoding & VEX_W) == VEX_W) {
-        mod_idx += 2 + ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      if ((_evex_encoding & VEX_W) == VEX_W) {
+        mod_idx += 2 + ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
       } else {
-        mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+        mod_idx = ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
       }
       break;
 
     case EVEX_HV:
-      mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      mod_idx = ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
       break;
 
     case EVEX_FVM:
       break;
 
     case EVEX_T1S:
-      switch (input_size_in_bits) {
+      switch (_input_size_in_bits) {
       case EVEX_8bit:
         break;
 
@@ -433,7 +433,7 @@
     case EVEX_T1F:
     case EVEX_T2:
     case EVEX_T4:
-      mod_idx = (input_size_in_bits == EVEX_64bit) ? 1 : 0;
+      mod_idx = (_input_size_in_bits == EVEX_64bit) ? 1 : 0;
       break;
 
     case EVEX_T8:
@@ -459,8 +459,8 @@
       break;
     }
 
-    if (avx_vector_len >= AVX_128bit && avx_vector_len <= AVX_512bit) {
-      int disp_factor = tuple_table[tuple_type + mod_idx][avx_vector_len];
+    if (_avx_vector_len >= AVX_128bit && _avx_vector_len <= AVX_512bit) {
+      int disp_factor = tuple_table[_tuple_type + mod_idx][_avx_vector_len];
       if ((disp % disp_factor) == 0) {
         int new_disp = disp / disp_factor;
         if (is8bit(new_disp)) {
@@ -591,7 +591,7 @@
       emit_data(disp, rspec, disp32_operand);
     }
   }
-  is_evex_instruction = false;
+  _is_evex_instruction = false;
 }
 
 void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
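The renamed fields above (_tuple_type, _input_size_in_bits, _evex_encoding, _avx_vector_len) feed the EVEX disp8*N check: a displacement may use the compressed 8-bit form only if it is an exact multiple of the tuple's scaling factor N and the scaled value fits in a signed byte. A minimal sketch of that test, assuming disp_factor is the N looked up from tuple_table as in the hunk above:

    // is8bit() mirrors the assembler's signed-byte range check.
    static inline bool is8bit(int x) { return -0x80 <= x && x < 0x80; }

    // True if 'disp' can be emitted in the compressed EVEX disp8*N form.
    static bool fits_disp8xN(int disp, int disp_factor /* N from tuple_table */) {
      if (disp_factor <= 0) return false;
      if ((disp % disp_factor) != 0) return false;   // must be a multiple of N
      return is8bit(disp / disp_factor);             // scaled value fits in int8
    }
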
@@ -1229,8 +1229,8 @@
 void Assembler::addsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x58, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
@@ -1245,8 +1245,8 @@
 void Assembler::addss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }
@@ -1254,16 +1254,16 @@
 void Assembler::aesdec(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDE);
   emit_operand(dst, src);
 }
 
 void Assembler::aesdec(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38,  /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDE);
   emit_int8(0xC0 | encode);
 }
@@ -1271,16 +1271,16 @@
 void Assembler::aesdeclast(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit,  /* legacy_mode */ true);
   emit_int8((unsigned char)0xDF);
   emit_operand(dst, src);
 }
 
 void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38,  /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDF);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1288,16 +1288,16 @@
 void Assembler::aesenc(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDC);
   emit_operand(dst, src);
 }
 
 void Assembler::aesenc(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDC);
   emit_int8(0xC0 | encode);
 }
@@ -1305,21 +1305,20 @@
 void Assembler::aesenclast(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit,  /* legacy_mode */ true);
   emit_int8((unsigned char)0xDD);
   emit_operand(dst, src);
 }
 
 void Assembler::aesenclast(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDD);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-
 void Assembler::andl(Address dst, int32_t imm32) {
   InstructionMark im(this);
   prefix(dst);
@@ -1347,7 +1346,7 @@
 
 void Assembler::andnl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1355,7 +1354,7 @@
 void Assembler::andnl(Register dst, Register src1, Address src2) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_legacy(dst, src1, src2, false);
+  vex_prefix_0F38_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
@@ -1382,7 +1381,7 @@
 
 void Assembler::blsil(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1390,14 +1389,14 @@
 void Assembler::blsil(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_legacy(rbx, dst, src, false);
+  vex_prefix_0F38_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1405,14 +1404,14 @@
 void Assembler::blsmskl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rdx, dst, src, false);
+  vex_prefix_0F38_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rdx, src);
 }
 
 void Assembler::blsrl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1420,7 +1419,7 @@
 void Assembler::blsrl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_legacy(rcx, dst, src, false);
+  vex_prefix_0F38_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
@@ -1569,9 +1568,9 @@
   // 0x66 is there. Strangely ucomisd comes out correct
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
   }
@@ -1580,7 +1579,7 @@
 void Assembler::comisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true);
+    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
   }
@@ -1588,16 +1587,16 @@
 
 void Assembler::comiss(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::comiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::cpuid() {
@@ -1607,12 +1606,12 @@
 
 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3);
+  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3, /* no_mask_reg */ false, /* legacy_mode */ true);
 }
 
 void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE);
+  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ true);
 }
 
 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
@@ -1627,8 +1626,8 @@
 void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1F;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1F;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x5A, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
@@ -1637,12 +1636,7 @@
 
 void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = 0;
-  if (VM_Version::supports_evex()) {
-    encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true);
-  } else {
-    encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, false);
-  }
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VM_Version::supports_evex());
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1650,9 +1644,9 @@
 void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-    emit_simd_arith_q(0x2A, dst, src, VEX_SIMD_F2, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+    emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   } else {
     emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
   }
@@ -1660,23 +1654,23 @@
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true);
+  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
 }
 
 void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1688,8 +1682,8 @@
 
 void Assembler::cvtss2sd(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
@@ -1698,14 +1692,14 @@
 
 void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvttss2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1721,8 +1715,8 @@
 void Assembler::divsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
@@ -1740,8 +1734,8 @@
 
 void Assembler::divss(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
@@ -1995,8 +1989,16 @@
 
 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex()) {
-    emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66, true);
+  if (VM_Version::supports_avx512novl()) {
+    int vector_len = AVX_512bit;
+    int dst_enc = dst->encoding();
+    int src_enc = src->encoding();
+    int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F,
+                                       /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+    emit_int8(0x28);
+    emit_int8((unsigned char)(0xC0 | encode));
+  } else if (VM_Version::supports_evex()) {
+    emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66);
   } else {
     emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
   }
@@ -2004,13 +2006,19 @@
 
 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
+  if (VM_Version::supports_avx512novl()) {
+    int vector_len = AVX_512bit;
+    int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, vector_len);
+    emit_int8(0x28);
+    emit_int8((unsigned char)(0xC0 | encode));
+  } else {
+    emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
+  }
 }
 
 void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, true, VEX_OPCODE_0F,
-                                      false, AVX_128bit);
+  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2023,48 +2031,54 @@
   emit_operand(dst, src);
 }
 
-void Assembler::kmovq(KRegister dst, KRegister src) {
+void Assembler::kmovql(KRegister dst, KRegister src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE,
-                                      true, VEX_OPCODE_0F, true);
+                                      /* no_mask_reg */ true, VEX_OPCODE_0F, /* rex_w */ true);
   emit_int8((unsigned char)0x90);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::kmovq(KRegister dst, Address src) {
+void Assembler::kmovql(KRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   int dst_enc = dst->encoding();
   int nds_enc = 0;
   vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_NONE,
-             VEX_OPCODE_0F, true, AVX_128bit, true, true);
+             VEX_OPCODE_0F, /* vex_w */  true, AVX_128bit, /* legacy_mode */ true, /* no_reg_mask */ true);
   emit_int8((unsigned char)0x90);
   emit_operand((Register)dst, src);
 }
 
-void Assembler::kmovq(Address dst, KRegister src) {
+void Assembler::kmovql(Address dst, KRegister src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   int src_enc = src->encoding();
   int nds_enc = 0;
   vex_prefix(dst, nds_enc, src_enc, VEX_SIMD_NONE,
-             VEX_OPCODE_0F, true, AVX_128bit, true, true);
+             VEX_OPCODE_0F, /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_reg_mask */ true);
   emit_int8((unsigned char)0x90);
   emit_operand((Register)src, dst);
 }
 
 void Assembler::kmovql(KRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  bool supports_bw = VM_Version::supports_avx512bw();
-  VexSimdPrefix pre = supports_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true,
-                                      VEX_OPCODE_0F, supports_bw);
+  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* legacy_mode */ !_legacy_mode_bw);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::kmovdl(KRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  VexSimdPrefix pre = VM_Version::supports_avx512bw() ? VEX_SIMD_F2 : VEX_SIMD_NONE;
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true, VEX_OPCODE_0F, false);
+  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, /* no_mask_reg */ true);
+  emit_int8((unsigned char)0x92);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovwl(KRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2088,7 +2102,7 @@
 
 void Assembler::movdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x6E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2096,7 +2110,7 @@
 void Assembler::movdl(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x7E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2104,11 +2118,11 @@
 void Assembler::movdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, true, VEX_OPCODE_0F);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_reg_mask */ true);
   emit_int8(0x6E);
   emit_operand(dst, src);
 }
@@ -2116,58 +2130,61 @@
 void Assembler::movdl(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_reg_mask */ true);
   emit_int8(0x7E);
   emit_operand(src, dst);
 }
 
 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movdqa(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movdqu(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movdqu(Address dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3, false);
+    _tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ false);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
 
 // Move Unaligned 256bit Vector
 void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
   int vector_len = AVX_256bit;
   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
@@ -2175,67 +2192,100 @@
 }
 
 void Assembler::vmovdqu(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   InstructionMark im(this);
   int vector_len = AVX_256bit;
-  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
+  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
   emit_operand(dst, src);
 }
 
 void Assembler::vmovdqu(Address dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   InstructionMark im(this);
   int vector_len = AVX_256bit;
   // swap src<->dst for encoding
   assert(src != xnoreg, "sanity");
-  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
+  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
 
 // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
-void Assembler::evmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
+void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F,
-                                     true, vector_len, false, false);
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evmovdqu(XMMRegister dst, Address src, int vector_len) {
+void Assembler::evmovdqul(XMMRegister dst, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-    vex_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
-  } else {
-    vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
-  }
+    _tuple_type = EVEX_FVM;
+  }
+  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
   emit_operand(dst, src);
 }
 
-void Assembler::evmovdqu(Address dst, XMMRegister src, int vector_len) {
+void Assembler::evmovdqul(Address dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   InstructionMark im(this);
   assert(src != xnoreg, "sanity");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-    // swap src<->dst for encoding
-    vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
-  } else {
-    // swap src<->dst for encoding
-    vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
-  }
+    _tuple_type = EVEX_FVM;
+  }
+  // swap src<->dst for encoding
+  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
+void Assembler::evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(UseAVX > 0, "");
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  emit_int8(0x6F);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evmovdquq(XMMRegister dst, Address src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(UseAVX > 2, "");
+  InstructionMark im(this);
+  _tuple_type = EVEX_FVM;
+  vex_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::evmovdquq(Address dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(UseAVX > 2, "");
+  InstructionMark im(this);
+  assert(src != xnoreg, "sanity");
+  _tuple_type = EVEX_FVM;
+  // swap src<->dst for encoding
+  vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
@@ -2282,10 +2332,12 @@
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x12, dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+    emit_simd_arith_q(0x12, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
+  } else {
+    emit_simd_arith(0x12, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
+  }
 }
 
 void Assembler::movq( MMXRegister dst, Address src ) {
@@ -2312,11 +2364,11 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   } else {
-    simd_prefix(dst, src, VEX_SIMD_F3, true, VEX_OPCODE_0F);
+    simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   }
   emit_int8(0x7E);
   emit_operand(dst, src);
@@ -2326,12 +2378,12 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    simd_prefix(src, xnoreg, dst, VEX_SIMD_66, true,
-                VEX_OPCODE_0F, true, AVX_128bit);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    simd_prefix(src, xnoreg, dst, VEX_SIMD_66, /* no_mask_reg */ true,
+                VEX_OPCODE_0F, /* rex_w */ true);
   } else {
-    simd_prefix(dst, src, VEX_SIMD_66, true);
+    simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   }
   emit_int8((unsigned char)0xD6);
   emit_operand(src, dst);
@@ -2356,7 +2408,7 @@
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, true);
+    emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   } else {
     emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
   }
@@ -2365,9 +2417,9 @@
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    emit_simd_arith_nonds_q(0x10, dst, src, VEX_SIMD_F2, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x10, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
   }
@@ -2377,11 +2429,11 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     simd_prefix_q(src, xnoreg, dst, VEX_SIMD_F2);
   } else {
-    simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, false);
+    simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, /* no_mask_reg */ false);
   }
   emit_int8(0x11);
   emit_operand(src, dst);
@@ -2389,26 +2441,26 @@
 
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, true);
+  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
 }
 
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
 }
 
 void Assembler::movss(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3, false);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ false);
   emit_int8(0x11);
   emit_operand(src, dst);
 }
@@ -2501,8 +2553,8 @@
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x59, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
@@ -2521,8 +2573,8 @@
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
 }
@@ -2831,29 +2883,27 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x67, dst, src, VEX_SIMD_66,
-                  false, (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x67, dst, src, VEX_SIMD_66,
-                  false, (VM_Version::supports_avx512dq() == false));
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "some form of AVX must be enabled");
-  emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len,
-                 false, (VM_Version::supports_avx512dq() == false));
+  emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx2(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_3A, true, vector_len);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ true, vector_len);
   emit_int8(0x00);
   emit_int8(0xC0 | encode);
   emit_int8(imm8);
@@ -2867,8 +2917,8 @@
 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
   InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_3A,
-              false, AVX_128bit, true);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_3A,
+              /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x61);
   emit_operand(dst, src);
   emit_int8(imm8);
@@ -2876,8 +2926,8 @@
 
 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_3A, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x61);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2885,8 +2935,8 @@
 
 void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2894,8 +2944,8 @@
 
 void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */  true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ true, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2903,8 +2953,8 @@
 
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2912,8 +2962,8 @@
 
 void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ true, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2922,17 +2972,17 @@
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_HVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+    _tuple_type = EVEX_HVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_38);
   emit_int8(0x30);
   emit_operand(dst, src);
 }
 
 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_38);
   emit_int8(0x30);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3035,8 +3085,8 @@
 
 void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_ssse3(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38,
-                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x00);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3044,33 +3094,34 @@
 void Assembler::pshufb(XMMRegister dst, Address src) {
   assert(VM_Version::supports_ssse3(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38,
-              false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x00);
   emit_operand(dst, src);
 }
 
 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
+  _instruction_uses_vl = true;
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66);
   emit_int8(mode & 0xFF);
-
 }
 
 void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
+  _instruction_uses_vl = true;
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, false);
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x70);
   emit_operand(dst, src);
   emit_int8(mode & 0xFF);
@@ -3079,8 +3130,7 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, false,
-                        (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(mode & 0xFF);
 }
 
@@ -3089,29 +3139,33 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, false, VEX_OPCODE_0F,
-              false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, /* no_mask_reg */ false,
+              VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x70);
   emit_operand(dst, src);
   emit_int8(mode & 0xFF);
 }
 
 void Assembler::psrldq(XMMRegister dst, int shift) {
-  // Shift 128 bit value in xmm register by number of bytes.
+  // Shift right 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  // XMM3 is for /3 encoding: 66 0F 73 /3 ib
+  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
 }
 
 void Assembler::pslldq(XMMRegister dst, int shift) {
-  // Shift left 128 bit value in xmm register by number of bytes.
+  // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  // XMM7 is for /7 encoding: 66 0F 73 /7 ib
+  int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
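On the xmm3/xmm7 operands above: for "66 0F 73 /3 ib" (psrldq) and "66 0F 73 /7 ib" (pslldq) the ModRM reg field carries the opcode extension (/digit) rather than a real register, so the assembler passes xmm3 or xmm7 purely to place 3 or 7 in those bits. A small sketch of the register-direct ModRM byte, with illustrative names:

    #include <stdint.h>

    // Register-direct ModRM byte: mod=11b, reg=/digit opcode extension,
    // rm=the low 3 bits of the destination register's encoding.
    static inline uint8_t modrm_digit_form(int digit, int rm_reg_enc) {
      return (uint8_t)(0xC0 | ((digit & 7) << 3) | (rm_reg_enc & 7));
    }

    // e.g. psrldq xmm1, imm8 encodes as 66 0F 73 /3 ib, and
    // modrm_digit_form(3, 1) == 0xD9, matching the 0xC0 | encode emitted above.
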
@@ -3121,16 +3175,16 @@
   assert(VM_Version::supports_sse4_1(), "");
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3142,7 +3196,8 @@
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len, true, false);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* rex_w */ false,
+             vector_len, /* legacy_mode  */ true, /* no_mask_reg */ false);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
@@ -3150,8 +3205,7 @@
 void Assembler::vptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3160,34 +3214,41 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_vlbw);
 }
 
 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_vlbw);
 }
 
 void Assembler::punpckldq(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x6C, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::push(int32_t imm32) {
@@ -3396,8 +3457,8 @@
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x51, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
@@ -3416,8 +3477,8 @@
 void Assembler::sqrtss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
 }
@@ -3479,10 +3540,14 @@
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-  }
-  emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+  }
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
@@ -3493,8 +3558,8 @@
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
 }
@@ -3553,9 +3618,9 @@
 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
   }
@@ -3564,7 +3629,7 @@
 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true);
+    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
   }
@@ -3573,15 +3638,15 @@
 void Assembler::ucomiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::xabort(int8_t imm8) {
@@ -3664,8 +3729,8 @@
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3684,8 +3749,8 @@
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3698,8 +3763,8 @@
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3718,8 +3783,8 @@
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3732,8 +3797,8 @@
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3752,8 +3817,8 @@
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3766,8 +3831,8 @@
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3786,8 +3851,8 @@
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3802,6 +3867,7 @@
 // Floating-point vector arithmetic
 
 void Assembler::addpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x58, dst, src, VEX_SIMD_66);
@@ -3811,11 +3877,13 @@
 }
 
 void Assembler::addps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3825,15 +3893,17 @@
 }
 
 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3841,15 +3911,17 @@
 }
 
 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::subpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_66);
@@ -3859,11 +3931,13 @@
 }
 
 void Assembler::subps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3873,15 +3947,17 @@
 }
 
 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3889,15 +3965,17 @@
 }
 
 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x59, dst, src, VEX_SIMD_66);
@@ -3907,11 +3985,13 @@
 }
 
 void Assembler::mulps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3921,15 +4001,17 @@
 }
 
 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3937,15 +4019,17 @@
 }
 
 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::divpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_66);
@@ -3955,11 +4039,13 @@
 }
 
 void Assembler::divps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3969,15 +4055,17 @@
 }
 
 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3985,164 +4073,178 @@
 }
 
 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
+void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(VM_Version::supports_avx(), "");
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vsqrtpd(XMMRegister dst, Address src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(VM_Version::supports_avx(), "");
+  if (VM_Version::supports_evex()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  }
+}
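+
+// Illustrative call (assumed caller, not taken from the surrounding sources):
+//   __ vsqrtpd(xmm0, xmm1, Assembler::AVX_512bit);
+// would request a packed double-precision square root across a full 512-bit
+// register, taking the quad-word EVEX path when supports_evex() is true.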
+
 void Assembler::andpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::andps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, false,
-                  (VM_Version::supports_avx512dq() == false));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::andps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE,
-                  false, (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::andpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  bool legacy_mode = (VM_Version::supports_avx512dq() == false);
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, legacy_mode);
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false,  /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len,
-                 (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::xorps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE,
-                  false, (VM_Version::supports_avx512dq() == false));
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::xorpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::xorps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, false,
-                  (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len,
-                 (VM_Version::supports_avx512dq() == false));
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len,
-                 (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
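+// Note: _legacy_mode_dq above is assumed to cache
+// (VM_Version::supports_avx512dq() == false), so these and/xor forms fall back
+// to the legacy VEX encoding whenever AVX-512DQ is unavailable, while the
+// explicit supports_avx512dq() checks keep the quad-word (_q) paths EVEX-only.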
 
 // Integer vector arithmetic
 void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len,
-                                     VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4150,28 +4252,29 @@
 void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len,
-                                     VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::paddb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::paddw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xFD, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xFD, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::paddd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xFE, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::paddq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xD4, dst, src, VEX_SIMD_66);
@@ -4182,38 +4285,38 @@
 
 void Assembler::phaddw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse3(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse3(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4225,33 +4328,35 @@
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len);
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len);
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4260,20 +4365,22 @@
 
 void Assembler::psubb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF8, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xF8, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psubw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF9, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xF9, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psubd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xFA, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::psubq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xFB, dst, src, VEX_SIMD_66);
@@ -4284,22 +4391,22 @@
 
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4311,35 +4418,35 @@
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4348,28 +4455,27 @@
 
 void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_sse4_1(), "");
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66,
-                                      false, VEX_OPCODE_0F_38);
+                                      /* no_mask_reg */ false, VEX_OPCODE_0F_38);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4379,8 +4485,8 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4388,22 +4494,23 @@
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len);
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   InstructionMark im(this);
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66,
-             VEX_OPCODE_0F_38, false, vector_len);
+             VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -4411,13 +4518,14 @@
 void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
   }
   InstructionMark im(this);
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66,
+             VEX_OPCODE_0F_38, /* vex_w */ true, vector_len, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -4426,26 +4534,28 @@
 void Assembler::psllw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
-                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F,
+                                      /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::pslld(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false);
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psllq(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F, /* rex_w */ true);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -4453,16 +4563,17 @@
 
 void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, false,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::pslld(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66);
 }
 
 void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xF3, dst, shift, VEX_SIMD_66);
@@ -4474,12 +4585,12 @@
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
-  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
   emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector_len);
@@ -4487,6 +4598,7 @@
 }
 
 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
   if (VM_Version::supports_evex()) {
@@ -4499,16 +4611,17 @@
 
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xF3, dst, src, shift, VEX_SIMD_66, vector_len);
@@ -4521,33 +4634,31 @@
 void Assembler::psrlw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
-                                      (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psrld(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 72 /2 ib
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false);
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psrlq(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   // Do not confuse it with the psrldq SSE2 instruction, which
   // shifts a 128-bit value in an xmm register by a number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
-  int encode = 0;
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512bw()) {
-    encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false);
-  } else {
-    encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true);
-  }
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F, /* rex_w */ VM_Version::supports_evex());
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -4555,16 +4666,17 @@
 
 void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, false,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psrld(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66);
 }
 
 void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xD3, dst, shift, VEX_SIMD_66);
@@ -4575,20 +4687,21 @@
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
-  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  // XMM2 is for /2 encoding: 66 0F 71 /2 ib
+  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
-  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
+  // XMM2 is for /2 encoding: 66 0F 72 /2 ib
   emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector_len);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
   if (VM_Version::supports_evex()) {
@@ -4601,16 +4714,17 @@
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xD3, dst, src, shift, VEX_SIMD_66, vector_len);
@@ -4623,17 +4737,18 @@
 void Assembler::psraw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
-  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
-                                      (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psrad(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM4 is for /4 encoding: 66 0F 72 /4 ib
-  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false);
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -4641,11 +4756,11 @@
 
 void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66);
 }
@@ -4653,12 +4768,12 @@
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
-  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM4 is for /4 encoding: 66 0F 72 /4 ib
   emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector_len);
@@ -4667,11 +4782,11 @@
 
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
@@ -4684,53 +4799,61 @@
 }
 
 void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::por(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xEB, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xEF, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len);
 }
@@ -4739,6 +4862,9 @@
 void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4753,8 +4879,8 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66,
-                                     VEX_OPCODE_0F_3A, true, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x1A);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
@@ -4763,35 +4889,70 @@
 }
 
 void Assembler::vinsertf64x4h(XMMRegister dst, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_64bit;
-  }
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   int vector_len = AVX_512bit;
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, true, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ true, vector_len);
   emit_int8(0x1A);
   emit_operand(dst, src);
   // 0x01 - insert into upper 256 bits
   emit_int8(0x01);
 }
 
-void Assembler::vinsertf128h(XMMRegister dst, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+void Assembler::vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  emit_int8(0x18);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vinsertf32x4h(XMMRegister dst, Address src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  int vector_len = AVX_512bit;
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(value & 0x3);
+}
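+
+// Illustrative call (assumed caller, not taken from the surrounding sources):
+//   __ vinsertf32x4h(xmm0, Address(rsp, 16), 2);
+// would load 128 bits from the stack into lane q2 (bits 383:256) of a 512-bit
+// register; the immediate is masked to two bits by emit_int8(value & 0x3).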
+
+void Assembler::vinsertf128h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
@@ -4801,6 +4962,9 @@
 void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4811,15 +4975,16 @@
 
 void Assembler::vextractf128h(Address dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
+  int vector_len = AVX_256bit;
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
-  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x19);
   emit_operand(src, dst);
   // 0x01 - extract from upper 128 bits
@@ -4829,6 +4994,9 @@
 void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4844,7 +5012,7 @@
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
@@ -4854,16 +5022,17 @@
 
 void Assembler::vinserti128h(XMMRegister dst, Address src) {
   assert(VM_Version::supports_avx2(), "");
+  int vector_len = AVX_256bit;
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x38);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
@@ -4873,6 +5042,9 @@
 void Assembler::vextracti128h(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4883,15 +5055,16 @@
 
 void Assembler::vextracti128h(Address dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
+  int vector_len = AVX_256bit;
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
-  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x39);
   emit_operand(src, dst);
   // 0x01 - extract from upper 128 bits
@@ -4904,7 +5077,7 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     true, vector_len, false, false);
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x3B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from upper 256 bits
@@ -4916,8 +5089,14 @@
   int vector_len = AVX_512bit;
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
-  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+  int encode;
+  if (VM_Version::supports_avx512dq()) {
+    encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                   /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  } else {
+    encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                   /* vex_w */ false, vector_len, /* legacy_mode */ true, /* no_mask_reg */ false);
+  }
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from bits 255:128
@@ -4932,7 +5111,7 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x1B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from upper 256 bits
@@ -4940,18 +5119,18 @@
 }
 
 void Assembler::vextractf64x4h(Address dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
-  tuple_type = EVEX_T4;
-  input_size_in_bits = EVEX_64bit;
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   int vector_len = AVX_512bit;
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
   vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-             VM_Version::supports_avx512dq(), vector_len);
+             /* vex_w */ true, vector_len);
   emit_int8(0x1B);
   emit_operand(src, dst);
-  // 0x01 - extract from upper 128 bits
+  // 0x01 - extract from upper 256 bits
   emit_int8(0x01);
 }
 
@@ -4960,8 +5139,42 @@
   int vector_len = AVX_512bit;
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
-  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66,
-                                     VEX_OPCODE_0F_3A, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vextractf32x4h(Address dst, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  int vector_len = AVX_512bit;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
+  emit_int8(0x19);
+  emit_operand(src, dst);
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ !_legacy_mode_dq, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from bits 255:128
@@ -4970,195 +5183,192 @@
   emit_int8(value & 0x3);
 }
 
-void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) {
-  assert(VM_Version::supports_evex(), "");
-  int vector_len = AVX_512bit;
-  int src_enc = src->encoding();
-  int dst_enc = dst->encoding();
-  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
-  emit_int8(0x19);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x01 - extract from bits 255:128
-  // 0x02 - extract from bits 383:256
-  // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
-}
-
 // duplicate 4-bytes integer data from src into 8 locations in dest
 void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
   int vector_len = AVX_256bit;
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x78);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_8bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_8bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x78);
   emit_operand(dst, src);
 }
 
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_16bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_16bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x79);
   emit_operand(dst, src);
 }
 
 // duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_32bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_32bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
 
 // duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_64bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ true, vector_len);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
 
 // duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_32bit;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_32bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x18);
   emit_operand(dst, src);
 }
 
 // duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /*vex_w */ true, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_64bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ true, vector_len);
   emit_int8(0x19);
   emit_operand(dst, src);
 }
 
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /*vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7B);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5166,8 +5376,8 @@
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_3A, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -5177,8 +5387,7 @@
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
   int vector_len = AVX_128bit;
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_3A, true);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A, /* legacy_mode */ true);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -5737,7 +5946,7 @@
                             int vector_len, bool no_mask_reg ){
   // EVEX 0x62 prefix
   prefix(EVEX_4bytes);
-  evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0);
+  _evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0);
 
   // P0: byte 2, initialized to RXBR`00mm
   // instead of not'd
@@ -5776,10 +5985,10 @@
   bool vex_r = ((xreg_enc & 8) == 8) ? 1 : 0;
   bool vex_b = adr.base_needs_rex();
   bool vex_x = adr.index_needs_rex();
-  avx_vector_len = vector_len;
-
-  // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit
-  if (VM_Version::supports_avx512vl() == false) {
+  _avx_vector_len = vector_len;
+
+  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
+  if (_legacy_mode_vl && _instruction_uses_vl) {
     switch (vector_len) {
     case AVX_128bit:
     case AVX_256bit:
@@ -5792,11 +6001,12 @@
   {
     bool evex_r = (xreg_enc >= 16);
     bool evex_v = (nds_enc >= 16);
-    is_evex_instruction = true;
+    _is_evex_instruction = true;
     evex_prefix(vex_r, vex_b, vex_x, vex_w, evex_r, evex_v, nds_enc, pre, opc, false, false, vector_len, no_mask_reg);
   } else {
     vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len);
   }
+  _instruction_uses_vl = false;
 }
 
 int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
@@ -5804,10 +6014,10 @@
   bool vex_r = ((dst_enc & 8) == 8) ? 1 : 0;
   bool vex_b = ((src_enc & 8) == 8) ? 1 : 0;
   bool vex_x = false;
-  avx_vector_len = vector_len;
-
-  // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit
-  if (VM_Version::supports_avx512vl() == false) {
+  _avx_vector_len = vector_len;
+
+  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
+  if (_legacy_mode_vl && _instruction_uses_vl) {
     switch (vector_len) {
     case AVX_128bit:
     case AVX_256bit:
@@ -5827,6 +6037,8 @@
     vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len);
   }
 
+  _instruction_uses_vl = false;
+
   // return modrm byte components for operands
   return (((dst_enc & 7) << 3) | (src_enc & 7));
 }
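
The two hunks above replace a per-call VM_Version::supports_avx512vl() query with flags cached at init time (_legacy_mode_vl) plus a per-instruction opt-in (_instruction_uses_vl) that is cleared again after every prefix is emitted, so each instruction has to opt in explicitly. A minimal standalone sketch of the resulting decision, with assumed names and under the assumption that EVEX is otherwise available; it is not the HotSpot API:

    #include <cstdio>

    enum VectorLen { AVX_128bit, AVX_256bit, AVX_512bit };

    // Without AVX512VL (legacy_mode_vl), a VL-aware instruction at 128/256-bit
    // length falls back to the VEX (plain AVX) prefix; 512-bit always needs EVEX.
    bool use_evex(bool legacy_mode_vl, bool instruction_uses_vl, VectorLen len) {
      if (len == AVX_512bit) return true;
      if (legacy_mode_vl && instruction_uses_vl) return false;
      return true;
    }

    int main() {
      printf("%d\n", use_evex(true, true, AVX_256bit));  // 0: revert to VEX
      printf("%d\n", use_evex(true, true, AVX_512bit));  // 1: EVEX required
      return 0;
    }
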
@@ -5915,13 +6127,13 @@
 }
 
 void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) {
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, legacy_mode, AVX_128bit);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, false, AVX_128bit, legacy_mode);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true, AVX_128bit);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5945,7 +6157,7 @@
 
 void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src,
                                VexSimdPrefix pre, int vector_len, bool no_mask_reg, bool legacy_mode) {
-  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, VEX_OPCODE_0F, false, no_mask_reg);
+  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, VEX_OPCODE_0F, legacy_mode, no_mask_reg);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6594,7 +6806,7 @@
 
 void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6602,11 +6814,11 @@
 void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix_q(dst, dst, src, VEX_SIMD_F2, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_operand(dst, src);
 }
@@ -6614,25 +6826,25 @@
 void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix_q(dst, dst, src, VEX_SIMD_F3, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_operand(dst, src);
 }
 
 void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvttss2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6668,6 +6880,13 @@
   emit_operand(as_Register(1), src);
 }
 
+void Assembler::xrstor(Address src) {
+  prefixq(src);
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAE);
+  emit_operand(as_Register(5), src);
+}
+
 void Assembler::fxsave(Address dst) {
   prefixq(dst);
   emit_int8(0x0F);
@@ -6675,6 +6894,13 @@
   emit_operand(as_Register(0), dst);
 }
 
+void Assembler::xsave(Address dst) {
+  prefixq(dst);
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAE);
+  emit_operand(as_Register(4), dst);
+}
+
 void Assembler::idivq(Register src) {
   int encode = prefixq_and_encode(src->encoding());
   emit_int8((unsigned char)0xF7);
@@ -6801,7 +7027,7 @@
 void Assembler::movdq(XMMRegister dst, Register src) {
   // table D-1 says MMX/SSE2
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x6E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6810,7 +7036,7 @@
   // table D-1 says MMX/SSE2
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x7E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6943,8 +7169,8 @@
 
 void Assembler::mulxq(Register dst1, Register dst2, Register src) {
   assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(),
-                                     VEX_SIMD_F2, VEX_OPCODE_0F_38, true, AVX_128bit, true, false);
+  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38,
+                                    /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ false);
   emit_int8((unsigned char)0xF6);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -7106,8 +7332,8 @@
 
 void Assembler::rorxq(Register dst, Register src, int imm8) {
   assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2,
-                                     VEX_OPCODE_0F_3A, true, AVX_128bit, true, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ false);
   emit_int8((unsigned char)0xF0);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -438,7 +438,9 @@
 
 };
 
-const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);
+// On 64-bit this reflects the fxsave size, which is 512 bytes, plus the new xsave area on EVEX, which is another 2176 bytes
+// See the fxsave and xsave (EVEX enabled) documentation for the layout
+const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);
 
 // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
 // level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
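
A compile-time check of the arithmetic behind the new FPUStateSizeInWords value (a sketch with assumed names, not VM code): the 512-byte fxsave image plus the extra 2176 bytes of EVEX xsave state gives 2688 bytes, i.e. 336 words on a 64-bit VM.

    #include <cstddef>

    constexpr std::size_t fxsave_bytes           = 512;   // legacy fxsave image
    constexpr std::size_t evex_xsave_extra_bytes = 2176;  // additional xsave state on EVEX
    constexpr std::size_t word_size              = 8;     // wordSize on LP64

    static_assert(fxsave_bytes + evex_xsave_extra_bytes == 2688,
                  "matches the 2688 used above");
    static_assert((fxsave_bytes + evex_xsave_extra_bytes) / word_size == 336,
                  "FPUStateSizeInWords on LP64");
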
@@ -594,11 +596,16 @@
 
 private:
 
-  int evex_encoding;
-  int input_size_in_bits;
-  int avx_vector_len;
-  int tuple_type;
-  bool is_evex_instruction;
+  int _evex_encoding;
+  int _input_size_in_bits;
+  int _avx_vector_len;
+  int _tuple_type;
+  bool _is_evex_instruction;
+  bool _legacy_mode_bw;
+  bool _legacy_mode_dq;
+  bool _legacy_mode_vl;
+  bool _legacy_mode_vlbw;
+  bool _instruction_uses_vl;
 
   // 64bit prefixes
   int prefix_and_encode(int reg_enc, bool byteinst = false);
@@ -972,11 +979,16 @@
   // belong in macro assembler but there is no need for both varieties to exist
 
   void init_attributes(void) {
-    evex_encoding = 0;
-    input_size_in_bits = 0;
-    avx_vector_len = AVX_NoVec;
-    tuple_type = EVEX_ETUP;
-    is_evex_instruction = false;
+    _evex_encoding = 0;
+    _input_size_in_bits = 0;
+    _avx_vector_len = AVX_NoVec;
+    _tuple_type = EVEX_ETUP;
+    _is_evex_instruction = false;
+    _legacy_mode_bw = (VM_Version::supports_avx512bw() == false);
+    _legacy_mode_dq = (VM_Version::supports_avx512dq() == false);
+    _legacy_mode_vl = (VM_Version::supports_avx512vl() == false);
+    _legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false);
+    _instruction_uses_vl = false;
   }
 
   void lea(Register dst, Address src);
@@ -1344,8 +1356,10 @@
   void fxch(int i = 1);
 
   void fxrstor(Address src);
+  void xrstor(Address src);
 
   void fxsave(Address dst);
+  void xsave(Address dst);
 
   void fyl2x();
   void frndint();
@@ -1479,11 +1493,12 @@
   void movb(Address dst, int imm8);
   void movb(Register dst, Address src);
 
-  void kmovq(KRegister dst, KRegister src);
+  void kmovql(KRegister dst, KRegister src);
   void kmovql(KRegister dst, Register src);
   void kmovdl(KRegister dst, Register src);
-  void kmovq(Address dst, KRegister src);
-  void kmovq(KRegister dst, Address src);
+  void kmovwl(KRegister dst, Register src);
+  void kmovql(Address dst, KRegister src);
+  void kmovql(KRegister dst, Address src);
 
   void movdl(XMMRegister dst, Register src);
   void movdl(Register dst, XMMRegister src);
@@ -1509,9 +1524,12 @@
   void vmovdqu(XMMRegister dst, XMMRegister src);
 
    // Move Unaligned 512bit Vector
-  void evmovdqu(Address dst, XMMRegister src, int vector_len);
-  void evmovdqu(XMMRegister dst, Address src, int vector_len);
-  void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdqul(Address dst, XMMRegister src, int vector_len);
+  void evmovdqul(XMMRegister dst, Address src, int vector_len);
+  void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdquq(Address dst, XMMRegister src, int vector_len);
+  void evmovdquq(XMMRegister dst, Address src, int vector_len);
+  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
 
   // Move lower 64bit to high 64bit in 128bit register
   void movlhps(XMMRegister dst, XMMRegister src);
@@ -1643,6 +1661,7 @@
 
   // Pemutation of 64bit words
   void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
+  void vpermq(XMMRegister dst, XMMRegister src, int imm8);
 
   void pause();
 
@@ -1920,6 +1939,10 @@
   void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  // Sqrt Packed Floating-Point Values - Double precision only
+  void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
+  void vsqrtpd(XMMRegister dst, Address src, int vector_len);
+
   // Bitwise Logical AND of Packed Floating-Point Values
   void andpd(XMMRegister dst, XMMRegister src);
   void andps(XMMRegister dst, XMMRegister src);
@@ -2057,6 +2080,9 @@
   void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
   void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
   void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
+  void vextractf32x4h(Address dst, XMMRegister src, int value);
+  void vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
+  void vinsertf32x4h(XMMRegister dst, Address src, int value);
 
   // duplicate 4-bytes integer data from src into 8 locations in dest
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
--- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -3798,16 +3798,24 @@
     if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
       __ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
     }
-    __ xorps(dest->as_xmm_float_reg(),
-             ExternalAddress((address)float_signflip_pool));
-
+    if (UseAVX > 1) {
+      __ vnegatess(dest->as_xmm_float_reg(), dest->as_xmm_float_reg(),
+                   ExternalAddress((address)float_signflip_pool));
+    } else {
+      __ xorps(dest->as_xmm_float_reg(),
+               ExternalAddress((address)float_signflip_pool));
+    }
   } else if (dest->is_double_xmm()) {
     if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
       __ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
     }
-    __ xorpd(dest->as_xmm_double_reg(),
-             ExternalAddress((address)double_signflip_pool));
-
+    if (UseAVX > 1) {
+      __ vnegatesd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg(),
+                   ExternalAddress((address)double_signflip_pool));
+    } else {
+      __ xorpd(dest->as_xmm_double_reg(),
+               ExternalAddress((address)double_signflip_pool));
+    }
   } else if (left->is_single_fpu() || left->is_double_fpu()) {
     assert(left->fpu() == 0, "arg must be on TOS");
     assert(dest->fpu() == 0, "dest must be TOS");
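
The new UseAVX > 1 branches above route negation through the vnegatess/vnegatesd helpers instead of a bare xorps/xorpd, but both paths boil down to XOR-ing against a sign-flip constant. A small standalone illustration (not VM code) of why that negates an IEEE-754 value: the sign lives in the top bit, so flipping it leaves exponent and mantissa untouched.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    float negate_via_xor(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof bits);
      bits ^= 0x80000000u;               // flip only the sign bit
      std::memcpy(&f, &bits, sizeof f);
      return f;
    }

    int main() {
      printf("%f\n", negate_via_xor(1.5f));   // -1.500000
      printf("%f\n", negate_via_xor(-2.0f));  //  2.000000
      return 0;
    }
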
--- a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -401,11 +401,9 @@
 
     } else if (UseSSE == 1) {
       int xmm_off = xmm_regs_as_doubles_off;
-      for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
-        if (n < xmm_bypass_limit) {
-          VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
-          map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
-        }
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
+        map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
         xmm_off += 2;
       }
       assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm registers");
@@ -452,14 +450,11 @@
       __ frstor(Address(rsp, fpu_state_off * VMRegImpl::stack_slot_size));
 
       // Save the FPU registers in de-opt-able form
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
+      int offset = 0;
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
+        offset += 8;
+      }
     }
 
     if (UseSSE >= 2) {
@@ -468,52 +463,26 @@
       // so always save them as doubles.
       // note that float values are _not_ converted automatically, so for float values
       // the second word contains only garbage data.
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0), xmm0);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8), xmm1);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
+      int xmm_bypass_limit = FrameMap::nof_xmm_regs;
+      int offset = 0;
 #ifdef _LP64
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64), xmm8);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72), xmm9);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80), xmm10);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88), xmm11);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96), xmm12);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15);
-      if (UseAVX > 2) {
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), xmm26);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31);
+      if (UseAVX < 3) {
+        xmm_bypass_limit = xmm_bypass_limit / 2;
       }
-#endif // _LP64
+#endif
+      for (int n = 0; n < xmm_bypass_limit; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
+        offset += 8;
+      }
     } else if (UseSSE == 1) {
-      // save XMM registers as float because double not supported without SSE2
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0), xmm0);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8), xmm1);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
+      // save XMM registers as float because double is not supported without SSE2 (num MMX == num fpu)
+      int offset = 0;
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
+        offset += 8;
+      }
     }
   }
 
@@ -528,52 +497,26 @@
   if (restore_fpu_registers) {
     if (UseSSE >= 2) {
       // restore XMM registers
-      __ movdbl(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0));
-      __ movdbl(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8));
-      __ movdbl(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
-      __ movdbl(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
-      __ movdbl(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
-      __ movdbl(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
-      __ movdbl(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
-      __ movdbl(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
+      int xmm_bypass_limit = FrameMap::nof_xmm_regs;
 #ifdef _LP64
-      __ movdbl(xmm8, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64));
-      __ movdbl(xmm9, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72));
-      __ movdbl(xmm10, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80));
-      __ movdbl(xmm11, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88));
-      __ movdbl(xmm12, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96));
-      __ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104));
-      __ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112));
-      __ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120));
-      if (UseAVX > 2) {
-        __ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128));
-        __ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136));
-        __ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144));
-        __ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152));
-        __ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160));
-        __ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168));
-        __ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176));
-        __ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184));
-        __ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192));
-        __ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200));
-        __ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208));
-        __ movdbl(xmm27, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216));
-        __ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224));
-        __ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232));
-        __ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240));
-        __ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248));
+      if (UseAVX < 3) {
+        xmm_bypass_limit = xmm_bypass_limit / 2;
       }
-#endif // _LP64
+#endif
+      int offset = 0;
+      for (int n = 0; n < xmm_bypass_limit; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movdbl(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
+        offset += 8;
+      }
     } else if (UseSSE == 1) {
-      // restore XMM registers
-      __ movflt(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0));
-      __ movflt(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8));
-      __ movflt(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
-      __ movflt(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
-      __ movflt(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
-      __ movflt(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
-      __ movflt(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
-      __ movflt(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
+      // restore XMM registers (num MMX == num fpu)
+      int offset = 0;
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movflt(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
+        offset += 8;
+      }
     }
 
     if (UseSSE < 2) {
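
The unrolled movdbl/movflt sequences above become loops over as_XMMRegister(n) with an 8-byte stride, and on LP64 the bound is halved when UseAVX < 3 since only xmm0..xmm15 exist without AVX-512. A small sketch of the resulting save-area layout, with assumed constants rather than the real FrameMap values:

    #include <cstdio>

    int main() {
      const int  nof_xmm_regs = 32;     // assumed LP64 register file with AVX-512
      const bool use_avx512   = false;  // i.e. UseAVX < 3
      int limit = use_avx512 ? nof_xmm_regs : nof_xmm_regs / 2;
      for (int n = 0, offset = 0; n < limit; n++, offset += 8) {
        printf("xmm%-2d -> xmm_regs_as_doubles area + %3d bytes\n", n, offset);
      }
      return 0;
    }
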
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -3751,8 +3751,31 @@
 }
 
 void MacroAssembler::pop_FPU_state() {
-  NOT_LP64(frstor(Address(rsp, 0));)
-  LP64_ONLY(fxrstor(Address(rsp, 0));)
+#ifndef _LP64
+  frstor(Address(rsp, 0));
+#else
+  // AVX will continue to use the fxsave area.
+  // EVEX needs to utilize the xsave area, which is under different
+  // management.
+  if(VM_Version::supports_evex()) {
+    // EDX:EAX describe the XSAVE header and
+    // are obtained while fetching info for XCR0 via cpuid.
+    // These two registers make up 64 bits in the header, of which bits
+    // 62:10 are currently reserved for future implementations and unused.  Bit 63
+    // is unused for our implementation as we do not utilize
+    // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
+    // the functionality for PKRU state and MSR tracing.
+    // Ergo we are primarily concerned with bits 7..0, which define
+    // which ISA extensions and features are enabled for a given machine; they are
+    // defined in XemXcr0Eax and used to map the XSAVE area
+    // for restoring registers as described via XCR0.
+    movl(rdx,VM_Version::get_xsave_header_upper_segment());
+    movl(rax,VM_Version::get_xsave_header_lower_segment());
+    xrstor(Address(rsp, 0));
+  } else {
+    fxrstor(Address(rsp, 0));
+  }
+#endif
   addptr(rsp, FPUStateSizeInWords * wordSize);
 }
 
@@ -3769,13 +3792,49 @@
   push_FPU_state();
 }
 
+#ifdef _LP64
+#define XSTATE_BV 0x200
+#endif
+
 void MacroAssembler::push_FPU_state() {
   subptr(rsp, FPUStateSizeInWords * wordSize);
 #ifndef _LP64
   fnsave(Address(rsp, 0));
   fwait();
 #else
-  fxsave(Address(rsp, 0));
+  // AVX will continue to use the fxsave area.
+  // EVEX needs to utilize the xsave area, which is under different
+  // management.
+  if(VM_Version::supports_evex()) {
+    // Save a copy of EAX and EDX
+    push(rax);
+    push(rdx);
+    // EDX:EAX describe the XSAVE header and
+    // are obtained while fetching info for XCR0 via cpuid.
+    // These two registers make up 64 bits in the header, of which bits
+    // 62:10 are currently reserved for future implementations and unused.  Bit 63
+    // is unused for our implementation as we do not utilize
+    // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
+    // the functionality for PKRU state and MSR tracing.
+    // Ergo we are primarily concerned with bits 7..0, which define
+    // which ISA extensions and features are enabled for a given machine; they are
+    // defined in XemXcr0Eax and used to program the XSAVE area
+    // for saving the required registers as defined in XCR0.
+    int xcr0_edx = VM_Version::get_xsave_header_upper_segment();
+    int xcr0_eax = VM_Version::get_xsave_header_lower_segment();
+    movl(rdx,xcr0_edx);
+    movl(rax,xcr0_eax);
+    xsave(Address(rsp, wordSize*2));
+    // Now apply the control bits and clear bytes 8..23 in the header
+    pop(rdx);
+    pop(rax);
+    movl(Address(rsp, XSTATE_BV), xcr0_eax);
+    movl(Address(rsp, XSTATE_BV+4), xcr0_edx);
+    andq(Address(rsp, XSTATE_BV+8), 0);
+    andq(Address(rsp, XSTATE_BV+16), 0);
+  } else {
+    fxsave(Address(rsp, 0));
+  }
 #endif // LP64
 }
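
A compact sketch of the header fix-up the EVEX branch above performs after xsave, with constants assumed from the patch rather than taken from the VM: EDX:EAX carry the requested-feature bitmap, XSTATE_BV occupies the first 8 bytes of the xsave header at offset 0x200, and the following 16 header bytes are cleared so a later xrstor sees a non-compacted image.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    constexpr std::size_t kXStateBVOffset = 0x200;  // first 8 bytes of the xsave header

    void fixup_xsave_header(uint8_t* save_area, uint32_t xcr0_eax, uint32_t xcr0_edx) {
      // record which states were actually saved (mirrors the two movl stores above)
      uint64_t xstate_bv = (uint64_t(xcr0_edx) << 32) | xcr0_eax;
      std::memcpy(save_area + kXStateBVOffset, &xstate_bv, sizeof xstate_bv);
      // clear header bytes 8..23, as the two andq instructions do
      std::memset(save_area + kXStateBVOffset + 8, 0, 16);
    }
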
 
@@ -4082,6 +4141,84 @@
   }
 }
 
+void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  int nds_enc = nds->encoding();
+  int dst_enc = dst->encoding();
+  bool dst_upper_bank = (dst_enc > 15);
+  bool nds_upper_bank = (nds_enc > 15);
+  if (VM_Version::supports_avx512novl() &&
+      (nds_upper_bank || dst_upper_bank)) {
+    if (dst_upper_bank) {
+      subptr(rsp, 64);
+      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+      movflt(xmm0, nds);
+      if (reachable(src)) {
+        vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+      movflt(dst, xmm0);
+      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+      addptr(rsp, 64);
+    } else {
+      movflt(dst, nds);
+      if (reachable(src)) {
+        vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+    }
+  } else {
+    if (reachable(src)) {
+      vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit);
+    } else {
+      lea(rscratch1, src);
+      vxorps(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
+    }
+  }
+}
+
+void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  int nds_enc = nds->encoding();
+  int dst_enc = dst->encoding();
+  bool dst_upper_bank = (dst_enc > 15);
+  bool nds_upper_bank = (nds_enc > 15);
+  if (VM_Version::supports_avx512novl() &&
+      (nds_upper_bank || dst_upper_bank)) {
+    if (dst_upper_bank) {
+      subptr(rsp, 64);
+      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+      movdbl(xmm0, nds);
+      if (reachable(src)) {
+        vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+      movdbl(dst, xmm0);
+      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+      addptr(rsp, 64);
+    } else {
+      movdbl(dst, nds);
+      if (reachable(src)) {
+        vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+    }
+  } else {
+    if (reachable(src)) {
+      vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit);
+    } else {
+      lea(rscratch1, src);
+      vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
+    }
+  }
+}
+
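
Both helpers above guard for the case where AVX512VL is absent (supports_avx512novl) but an operand sits in the EVEX-only upper register bank (xmm16..xmm31), which a 128-bit VEX-encoded vxorps/vxorpd cannot address. A sketch of the path selection with hypothetical names, not the MacroAssembler API; only an upper-bank dst forces the spill of xmm0 to the stack, while an upper-bank nds alone is handled by copying through the addressable dst.

    #include <cstdio>

    enum class NegatePath { kDirect, kCopyToDstThenXor, kDetourThroughXmm0 };

    // An upper-bank dst forces the spill-xmm0-and-bounce path; an upper-bank nds
    // alone is handled by first copying the value into the addressable dst.
    NegatePath pick_negate_path(int dst_enc, int nds_enc, bool avx512novl) {
      bool upper_bank = (dst_enc > 15) || (nds_enc > 15);
      if (!avx512novl || !upper_bank) return NegatePath::kDirect;
      return (dst_enc > 15) ? NegatePath::kDetourThroughXmm0
                            : NegatePath::kCopyToDstThenXor;
    }

    int main() {
      printf("%d\n", (int)pick_negate_path(17, 3, true));   // 2: bounce through xmm0
      printf("%d\n", (int)pick_negate_path(3, 17, true));   // 1: copy into dst first
      printf("%d\n", (int)pick_negate_path(17, 3, false));  // 0: EVEX addresses it directly
      return 0;
    }
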
 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
   if (reachable(src)) {
     vxorpd(dst, nds, as_Address(src), vector_len);
@@ -4318,9 +4455,10 @@
 void MacroAssembler::store_check(Register obj) {
   // Does a store check for the oop in register obj. The content of
   // register obj is destroyed afterwards.
-
   BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+  assert(bs->kind() == BarrierSet::CardTableForRS ||
+         bs->kind() == BarrierSet::CardTableExtension,
+         "Wrong barrier set kind");
 
   CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
@@ -4570,69 +4708,58 @@
 
   // if we are coming from c1, xmm registers may be live
   int off = 0;
+  int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
+  if (UseAVX > 2) {
+    num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
+  }
+
   if (UseSSE == 1)  {
     subptr(rsp, sizeof(jdouble)*8);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
+    for (int n = 0; n < 8; n++) {
+      movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n));
+    }
   } else if (UseSSE >= 2)  {
     if (UseAVX > 2) {
+      push(rbx);
       movl(rbx, 0xffff);
-#ifdef _LP64
-      kmovql(k1, rbx);
-#else
-      kmovdl(k1, rbx);
-#endif
+      kmovwl(k1, rbx);
+      pop(rbx);
     }
 #ifdef COMPILER2
     if (MaxVectorSize > 16) {
-      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+      if(UseAVX > 2) {
+        // Save upper half of ZMM registers
+        subptr(rsp, 32*num_xmm_regs);
+        for (int n = 0; n < num_xmm_regs; n++) {
+          vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
+        }
+        off = 0;
+      }
+      assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
       // Save upper half of YMM registes
-      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
-      vextractf128h(Address(rsp,  0),xmm0);
-      vextractf128h(Address(rsp, 16),xmm1);
-      vextractf128h(Address(rsp, 32),xmm2);
-      vextractf128h(Address(rsp, 48),xmm3);
-      vextractf128h(Address(rsp, 64),xmm4);
-      vextractf128h(Address(rsp, 80),xmm5);
-      vextractf128h(Address(rsp, 96),xmm6);
-      vextractf128h(Address(rsp,112),xmm7);
-#ifdef _LP64
-      vextractf128h(Address(rsp,128),xmm8);
-      vextractf128h(Address(rsp,144),xmm9);
-      vextractf128h(Address(rsp,160),xmm10);
-      vextractf128h(Address(rsp,176),xmm11);
-      vextractf128h(Address(rsp,192),xmm12);
-      vextractf128h(Address(rsp,208),xmm13);
-      vextractf128h(Address(rsp,224),xmm14);
-      vextractf128h(Address(rsp,240),xmm15);
-#endif
+      subptr(rsp, 16*num_xmm_regs);
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
+      }
     }
 #endif
-    // Save whole 128bit (16 bytes) XMM regiters
-    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
-    movdqu(Address(rsp,off++*16),xmm0);
-    movdqu(Address(rsp,off++*16),xmm1);
-    movdqu(Address(rsp,off++*16),xmm2);
-    movdqu(Address(rsp,off++*16),xmm3);
-    movdqu(Address(rsp,off++*16),xmm4);
-    movdqu(Address(rsp,off++*16),xmm5);
-    movdqu(Address(rsp,off++*16),xmm6);
-    movdqu(Address(rsp,off++*16),xmm7);
+    // Save whole 128bit (16 bytes) XMM registers
+    subptr(rsp, 16*num_xmm_regs);
+    off = 0;
 #ifdef _LP64
-    movdqu(Address(rsp,off++*16),xmm8);
-    movdqu(Address(rsp,off++*16),xmm9);
-    movdqu(Address(rsp,off++*16),xmm10);
-    movdqu(Address(rsp,off++*16),xmm11);
-    movdqu(Address(rsp,off++*16),xmm12);
-    movdqu(Address(rsp,off++*16),xmm13);
-    movdqu(Address(rsp,off++*16),xmm14);
-    movdqu(Address(rsp,off++*16),xmm15);
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0);
+      }
+    } else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        movdqu(Address(rsp, off++*16), as_XMMRegister(n));
+      }
+    }
+#else
+    for (int n = 0; n < num_xmm_regs; n++) {
+      movdqu(Address(rsp, off++*16), as_XMMRegister(n));
+    }
 #endif
   }
 
@@ -4687,7 +4814,7 @@
   movsd(Address(rsp, 0), xmm0);
   fld_d(Address(rsp, 0));
 #endif // _LP64
-  addptr(rsp, sizeof(jdouble) * nb_args);
+  addptr(rsp, sizeof(jdouble)*nb_args);
   if (num_fpu_regs_in_use > 1) {
     // Must save return value to stack and then restore entire FPU
     // stack except incoming arguments
@@ -4697,63 +4824,50 @@
       addptr(rsp, sizeof(jdouble));
     }
     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
-    addptr(rsp, sizeof(jdouble) * nb_args);
+    addptr(rsp, sizeof(jdouble)*nb_args);
   }
 
   off = 0;
   if (UseSSE == 1)  {
-    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+    for (int n = 0; n < 8; n++) {
+      movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble)));
+    }
     addptr(rsp, sizeof(jdouble)*8);
   } else if (UseSSE >= 2)  {
     // Restore whole 128bit (16 bytes) XMM regiters
-    movdqu(xmm0, Address(rsp,off++*16));
-    movdqu(xmm1, Address(rsp,off++*16));
-    movdqu(xmm2, Address(rsp,off++*16));
-    movdqu(xmm3, Address(rsp,off++*16));
-    movdqu(xmm4, Address(rsp,off++*16));
-    movdqu(xmm5, Address(rsp,off++*16));
-    movdqu(xmm6, Address(rsp,off++*16));
-    movdqu(xmm7, Address(rsp,off++*16));
 #ifdef _LP64
-    movdqu(xmm8, Address(rsp,off++*16));
-    movdqu(xmm9, Address(rsp,off++*16));
-    movdqu(xmm10, Address(rsp,off++*16));
-    movdqu(xmm11, Address(rsp,off++*16));
-    movdqu(xmm12, Address(rsp,off++*16));
-    movdqu(xmm13, Address(rsp,off++*16));
-    movdqu(xmm14, Address(rsp,off++*16));
-    movdqu(xmm15, Address(rsp,off++*16));
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0);
+      }
+    }
+    else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        movdqu(as_XMMRegister(n), Address(rsp, off++*16));
+      }
+    }
+#else
+    for (int n = 0; n < num_xmm_regs; n++) {
+      movdqu(as_XMMRegister(n), Address(rsp, off++ * 16));
+    }
 #endif
-    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+    addptr(rsp, 16*num_xmm_regs);
+
 #ifdef COMPILER2
     if (MaxVectorSize > 16) {
       // Restore upper half of YMM registers.
-      vinsertf128h(xmm0, Address(rsp,  0));
-      vinsertf128h(xmm1, Address(rsp, 16));
-      vinsertf128h(xmm2, Address(rsp, 32));
-      vinsertf128h(xmm3, Address(rsp, 48));
-      vinsertf128h(xmm4, Address(rsp, 64));
-      vinsertf128h(xmm5, Address(rsp, 80));
-      vinsertf128h(xmm6, Address(rsp, 96));
-      vinsertf128h(xmm7, Address(rsp,112));
-#ifdef _LP64
-      vinsertf128h(xmm8, Address(rsp,128));
-      vinsertf128h(xmm9, Address(rsp,144));
-      vinsertf128h(xmm10, Address(rsp,160));
-      vinsertf128h(xmm11, Address(rsp,176));
-      vinsertf128h(xmm12, Address(rsp,192));
-      vinsertf128h(xmm13, Address(rsp,208));
-      vinsertf128h(xmm14, Address(rsp,224));
-      vinsertf128h(xmm15, Address(rsp,240));
-#endif
-      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+      off = 0;
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
+      }
+      addptr(rsp, 16*num_xmm_regs);
+      if (UseAVX > 2) {
+        off = 0;
+        for (int n = 0; n < num_xmm_regs; n++) {
+          vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
+        }
+        addptr(rsp, 32*num_xmm_regs);
+      }
     }
 #endif
   }
@@ -7093,11 +7207,7 @@
       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
       if (UseAVX > 2) {
         movl(rtmp, 0xffff);
-#ifdef _LP64
-        kmovql(k1, rtmp);
-#else
-        kmovdl(k1, rtmp);
-#endif
+        kmovwl(k1, rtmp);
       }
       movdl(xtmp, value);
       if (UseAVX > 2 && UseUnalignedLoadStores) {
@@ -7110,7 +7220,7 @@
         align(16);
 
         BIND(L_fill_64_bytes_loop);
-        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
+        evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
         addptr(to, 64);
         subl(count, 16 << shift);
         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
@@ -7118,7 +7228,7 @@
         BIND(L_check_fill_32_bytes);
         addl(count, 8 << shift);
         jccb(Assembler::less, L_check_fill_8_bytes);
-        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
+        evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit);
         addptr(to, 32);
         subl(count, 8 << shift);
 
@@ -8397,6 +8507,14 @@
   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
 
+  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+  // context for the registers used, where all instructions below are using 128-bit mode
+  // On EVEX without VL and BW, these instructions will all be AVX.
+  if (VM_Version::supports_avx512vlbw()) {
+    movl(tmp, 0xffff);
+    kmovwl(k1, tmp);
+  }
+
   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
   notl(crc); // ~crc
   cmpl(len, 16);
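
The save/restore hunks above replace unrolled per-register moves with loops over as_XMMRegister(n); below is a minimal standalone sketch of the stack-space arithmetic those loops rely on. The per-register sizes (16 bytes per XMM image, 16 per YMM upper half, 32 per ZMM upper half) come from the subptr/addptr operands in the hunks; the function name and parameters are illustrative only.

// Sketch (not HotSpot code): bytes reserved on the stack by the save loops,
// assuming 16 bytes per saved XMM register plus the YMM/ZMM upper halves
// when vectors are saved and AVX-512 (UseAVX > 2) is in use.
#include <cstdio>

static int xmm_save_area_bytes(int num_xmm_regs, bool save_vectors, bool use_avx512) {
  int bytes = 16 * num_xmm_regs;      // full 128-bit XMM images
  if (save_vectors) {
    bytes += 16 * num_xmm_regs;       // YMM upper halves
    if (use_avx512) {
      bytes += 32 * num_xmm_regs;     // ZMM upper halves
    }
  }
  return bytes;
}

int main() {
  printf("8 regs, no vectors : %d bytes\n", xmm_save_area_bytes(8,  false, false));
  printf("16 regs, AVX2      : %d bytes\n", xmm_save_area_bytes(16, true,  false));
  printf("32 regs, AVX-512   : %d bytes\n", xmm_save_area_bytes(32, true,  true));
  return 0;
}
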
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1069,6 +1069,9 @@
   void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
+  void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+  void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
   // AVX Vector instructions
 
   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -115,6 +115,7 @@
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words,
                                            int* total_frame_words, bool verify_fpu, bool save_vectors) {
   int vect_words = 0;
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 #ifdef COMPILER2
   if (save_vectors) {
     assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
@@ -173,59 +174,50 @@
     __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
   }
 
+  int off = st0_off;
+  int delta = st1_off - off;
+
   // Save the FPU registers in de-opt-able form
-
-  __ fstp_d(Address(rsp, st0_off*wordSize)); // st(0)
-  __ fstp_d(Address(rsp, st1_off*wordSize)); // st(1)
-  __ fstp_d(Address(rsp, st2_off*wordSize)); // st(2)
-  __ fstp_d(Address(rsp, st3_off*wordSize)); // st(3)
-  __ fstp_d(Address(rsp, st4_off*wordSize)); // st(4)
-  __ fstp_d(Address(rsp, st5_off*wordSize)); // st(5)
-  __ fstp_d(Address(rsp, st6_off*wordSize)); // st(6)
-  __ fstp_d(Address(rsp, st7_off*wordSize)); // st(7)
-
-  if( UseSSE == 1 ) {           // Save the XMM state
-    __ movflt(Address(rsp,xmm0_off*wordSize),xmm0);
-    __ movflt(Address(rsp,xmm1_off*wordSize),xmm1);
-    __ movflt(Address(rsp,xmm2_off*wordSize),xmm2);
-    __ movflt(Address(rsp,xmm3_off*wordSize),xmm3);
-    __ movflt(Address(rsp,xmm4_off*wordSize),xmm4);
-    __ movflt(Address(rsp,xmm5_off*wordSize),xmm5);
-    __ movflt(Address(rsp,xmm6_off*wordSize),xmm6);
-    __ movflt(Address(rsp,xmm7_off*wordSize),xmm7);
-  } else if( UseSSE >= 2 ) {
+  for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
+    __ fstp_d(Address(rsp, off*wordSize));
+    off += delta;
+  }
+
+  off = xmm0_off;
+  delta = xmm1_off - off;
+  if (UseSSE == 1) {          // Save the XMM state
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movflt(Address(rsp, off*wordSize), as_XMMRegister(n));
+      off += delta;
+    }
+  } else if (UseSSE >= 2) {
     // Save whole 128bit (16 bytes) XMM registers
-    __ movdqu(Address(rsp,xmm0_off*wordSize),xmm0);
-    __ movdqu(Address(rsp,xmm1_off*wordSize),xmm1);
-    __ movdqu(Address(rsp,xmm2_off*wordSize),xmm2);
-    __ movdqu(Address(rsp,xmm3_off*wordSize),xmm3);
-    __ movdqu(Address(rsp,xmm4_off*wordSize),xmm4);
-    __ movdqu(Address(rsp,xmm5_off*wordSize),xmm5);
-    __ movdqu(Address(rsp,xmm6_off*wordSize),xmm6);
-    __ movdqu(Address(rsp,xmm7_off*wordSize),xmm7);
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vextractf32x4h(Address(rsp, off*wordSize), as_XMMRegister(n), 0);
+        off += delta;
+      }
+    } else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ movdqu(Address(rsp, off*wordSize), as_XMMRegister(n));
+        off += delta;
+      }
+    }
   }
 
   if (vect_words > 0) {
     assert(vect_words*wordSize == 128, "");
     __ subptr(rsp, 128); // Save upper half of YMM registers
-    __ vextractf128h(Address(rsp,  0),xmm0);
-    __ vextractf128h(Address(rsp, 16),xmm1);
-    __ vextractf128h(Address(rsp, 32),xmm2);
-    __ vextractf128h(Address(rsp, 48),xmm3);
-    __ vextractf128h(Address(rsp, 64),xmm4);
-    __ vextractf128h(Address(rsp, 80),xmm5);
-    __ vextractf128h(Address(rsp, 96),xmm6);
-    __ vextractf128h(Address(rsp,112),xmm7);
+    off = 0;
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
+    }
     if (UseAVX > 2) {
       __ subptr(rsp, 256); // Save upper half of ZMM registers
-      __ vextractf64x4h(Address(rsp, 0), xmm0);
-      __ vextractf64x4h(Address(rsp, 32), xmm1);
-      __ vextractf64x4h(Address(rsp, 64), xmm2);
-      __ vextractf64x4h(Address(rsp, 96), xmm3);
-      __ vextractf64x4h(Address(rsp, 128), xmm4);
-      __ vextractf64x4h(Address(rsp, 160), xmm5);
-      __ vextractf64x4h(Address(rsp, 192), xmm6);
-      __ vextractf64x4h(Address(rsp, 224), xmm7);
+      off = 0;
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
+      }
     }
   }
 
@@ -238,58 +230,40 @@
   OopMap* map =  new OopMap( frame_words, 0 );
 
 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_words)
-
-  map->set_callee_saved(STACK_OFFSET( rax_off), rax->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rcx_off), rcx->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rdx_off), rdx->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rbx_off), rbx->as_VMReg());
+#define NEXTREG(x) (x)->as_VMReg()->next()
+
+  map->set_callee_saved(STACK_OFFSET(rax_off), rax->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rcx_off), rcx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rdx_off), rdx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rbx_off), rbx->as_VMReg());
   // rbp, location is known implicitly, no oopMap
-  map->set_callee_saved(STACK_OFFSET( rsi_off), rsi->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rdi_off), rdi->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st0_off), as_FloatRegister(0)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st1_off), as_FloatRegister(1)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st2_off), as_FloatRegister(2)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st3_off), as_FloatRegister(3)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st4_off), as_FloatRegister(4)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st5_off), as_FloatRegister(5)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st6_off), as_FloatRegister(6)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st7_off), as_FloatRegister(7)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm0_off), xmm0->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm1_off), xmm1->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm2_off), xmm2->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm3_off), xmm3->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm4_off), xmm4->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm5_off), xmm5->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm6_off), xmm6->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm7_off), xmm7->as_VMReg());
-  // %%% This is really a waste but we'll keep things as they were for now
-  if (true) {
-#define NEXTREG(x) (x)->as_VMReg()->next()
-    map->set_callee_saved(STACK_OFFSET(st0H_off), NEXTREG(as_FloatRegister(0)));
-    map->set_callee_saved(STACK_OFFSET(st1H_off), NEXTREG(as_FloatRegister(1)));
-    map->set_callee_saved(STACK_OFFSET(st2H_off), NEXTREG(as_FloatRegister(2)));
-    map->set_callee_saved(STACK_OFFSET(st3H_off), NEXTREG(as_FloatRegister(3)));
-    map->set_callee_saved(STACK_OFFSET(st4H_off), NEXTREG(as_FloatRegister(4)));
-    map->set_callee_saved(STACK_OFFSET(st5H_off), NEXTREG(as_FloatRegister(5)));
-    map->set_callee_saved(STACK_OFFSET(st6H_off), NEXTREG(as_FloatRegister(6)));
-    map->set_callee_saved(STACK_OFFSET(st7H_off), NEXTREG(as_FloatRegister(7)));
-    map->set_callee_saved(STACK_OFFSET(xmm0H_off), NEXTREG(xmm0));
-    map->set_callee_saved(STACK_OFFSET(xmm1H_off), NEXTREG(xmm1));
-    map->set_callee_saved(STACK_OFFSET(xmm2H_off), NEXTREG(xmm2));
-    map->set_callee_saved(STACK_OFFSET(xmm3H_off), NEXTREG(xmm3));
-    map->set_callee_saved(STACK_OFFSET(xmm4H_off), NEXTREG(xmm4));
-    map->set_callee_saved(STACK_OFFSET(xmm5H_off), NEXTREG(xmm5));
-    map->set_callee_saved(STACK_OFFSET(xmm6H_off), NEXTREG(xmm6));
-    map->set_callee_saved(STACK_OFFSET(xmm7H_off), NEXTREG(xmm7));
+  map->set_callee_saved(STACK_OFFSET(rsi_off), rsi->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rdi_off), rdi->as_VMReg());
+  // %%% This is really a waste but we'll keep things as they were for now for the upper component
+  off = st0_off;
+  delta = st1_off - off;
+  for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
+    FloatRegister freg_name = as_FloatRegister(n);
+    map->set_callee_saved(STACK_OFFSET(off), freg_name->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(freg_name));
+    off += delta;
+  }
+  off = xmm0_off;
+  delta = xmm1_off - off;
+  for (int n = 0; n < num_xmm_regs; n++) {
+    XMMRegister xmm_name = as_XMMRegister(n);
+    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(xmm_name));
+    off += delta;
+  }
 #undef NEXTREG
 #undef STACK_OFFSET
-  }
 
   return map;
-
 }
 
 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
   // Recover XMM & FPU state
   int additional_frame_bytes = 0;
 #ifdef COMPILER2
@@ -301,52 +275,43 @@
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
 #endif
+  int off = xmm0_off;
+  int delta = xmm1_off - off;
+
   if (UseSSE == 1) {
     assert(additional_frame_bytes == 0, "");
-    __ movflt(xmm0,Address(rsp,xmm0_off*wordSize));
-    __ movflt(xmm1,Address(rsp,xmm1_off*wordSize));
-    __ movflt(xmm2,Address(rsp,xmm2_off*wordSize));
-    __ movflt(xmm3,Address(rsp,xmm3_off*wordSize));
-    __ movflt(xmm4,Address(rsp,xmm4_off*wordSize));
-    __ movflt(xmm5,Address(rsp,xmm5_off*wordSize));
-    __ movflt(xmm6,Address(rsp,xmm6_off*wordSize));
-    __ movflt(xmm7,Address(rsp,xmm7_off*wordSize));
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movflt(as_XMMRegister(n), Address(rsp, off*wordSize));
+      off += delta;
+    }
   } else if (UseSSE >= 2) {
-#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes)
-    __ movdqu(xmm0,STACK_ADDRESS(xmm0_off));
-    __ movdqu(xmm1,STACK_ADDRESS(xmm1_off));
-    __ movdqu(xmm2,STACK_ADDRESS(xmm2_off));
-    __ movdqu(xmm3,STACK_ADDRESS(xmm3_off));
-    __ movdqu(xmm4,STACK_ADDRESS(xmm4_off));
-    __ movdqu(xmm5,STACK_ADDRESS(xmm5_off));
-    __ movdqu(xmm6,STACK_ADDRESS(xmm6_off));
-    __ movdqu(xmm7,STACK_ADDRESS(xmm7_off));
-#undef STACK_ADDRESS
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vinsertf32x4h(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes), 0);
+        off += delta;
+      }
+    } else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
+        off += delta;
+      }
+    }
   }
   if (restore_vectors) {
+    if (UseAVX > 2) {
+      off = 0;
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
+      }
+      __ addptr(rsp, additional_frame_bytes*2); // Pop the saved upper halves of the ZMM registers
+    }
     // Restore upper half of YMM registers.
     assert(additional_frame_bytes == 128, "");
-    __ vinsertf128h(xmm0, Address(rsp,  0));
-    __ vinsertf128h(xmm1, Address(rsp, 16));
-    __ vinsertf128h(xmm2, Address(rsp, 32));
-    __ vinsertf128h(xmm3, Address(rsp, 48));
-    __ vinsertf128h(xmm4, Address(rsp, 64));
-    __ vinsertf128h(xmm5, Address(rsp, 80));
-    __ vinsertf128h(xmm6, Address(rsp, 96));
-    __ vinsertf128h(xmm7, Address(rsp,112));
-    __ addptr(rsp, additional_frame_bytes);
-    if (UseAVX > 2) {
-      additional_frame_bytes = 256;
-      __ vinsertf64x4h(xmm0, Address(rsp, 0));
-      __ vinsertf64x4h(xmm1, Address(rsp, 32));
-      __ vinsertf64x4h(xmm2, Address(rsp, 64));
-      __ vinsertf64x4h(xmm3, Address(rsp, 96));
-      __ vinsertf64x4h(xmm4, Address(rsp, 128));
-      __ vinsertf64x4h(xmm5, Address(rsp, 160));
-      __ vinsertf64x4h(xmm6, Address(rsp, 192));
-      __ vinsertf64x4h(xmm7, Address(rsp, 224));
-      __ addptr(rsp, additional_frame_bytes);
+    off = 0;
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
     }
+    __ addptr(rsp, additional_frame_bytes); // Pop the saved upper halves of the YMM registers
   }
   __ pop_FPU_state();
   __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
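
The rewritten 32-bit RegisterSaver walks register slots with an off/delta pair instead of one STACK_OFFSET line per register; below is a minimal standalone sketch of that indexing. The slot spacing (xmm1_off - xmm0_off) and the off+1 high-half slot mirror the loops above; the concrete base values are made up for illustration.

// Sketch (not HotSpot code): each register occupies two adjacent jint
// slots (low, high) in the OopMap, and consecutive registers sit 'delta'
// slots apart. The base slot numbers here are illustrative only.
#include <cstdio>

int main() {
  const int xmm0_off = 4;             // illustrative base slot
  const int xmm1_off = 6;             // illustrative next-register slot
  const int num_xmm_regs = 8;

  int off = xmm0_off;
  int delta = xmm1_off - xmm0_off;    // spacing between consecutive registers
  for (int n = 0; n < num_xmm_regs; n++) {
    printf("xmm%-2d -> slots (%d, %d)\n", n, off, off + 1);
    off += delta;
  }
  return 0;
}
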
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -69,7 +69,9 @@
 class RegisterSaver {
   // Capture info about frame layout.  Layout offsets are in jint
   // units because compiler frame slots are jints.
+#define HALF_ZMM_BANK_WORDS 128
 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
+#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
   enum layout {
     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
     xmm_off       = fpu_state_off + 160/BytesPerInt,            // offset in fxsave save area
@@ -89,23 +91,24 @@
     DEF_XMM_OFFS(13),
     DEF_XMM_OFFS(14),
     DEF_XMM_OFFS(15),
-    DEF_XMM_OFFS(16),
-    DEF_XMM_OFFS(17),
-    DEF_XMM_OFFS(18),
-    DEF_XMM_OFFS(19),
-    DEF_XMM_OFFS(20),
-    DEF_XMM_OFFS(21),
-    DEF_XMM_OFFS(22),
-    DEF_XMM_OFFS(23),
-    DEF_XMM_OFFS(24),
-    DEF_XMM_OFFS(25),
-    DEF_XMM_OFFS(26),
-    DEF_XMM_OFFS(27),
-    DEF_XMM_OFFS(28),
-    DEF_XMM_OFFS(29),
-    DEF_XMM_OFFS(30),
-    DEF_XMM_OFFS(31),
-    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt),
+    zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt),
+    DEF_ZMM_OFFS(16),
+    DEF_ZMM_OFFS(17),
+    DEF_ZMM_OFFS(18),
+    DEF_ZMM_OFFS(19),
+    DEF_ZMM_OFFS(20),
+    DEF_ZMM_OFFS(21),
+    DEF_ZMM_OFFS(22),
+    DEF_ZMM_OFFS(23),
+    DEF_ZMM_OFFS(24),
+    DEF_ZMM_OFFS(25),
+    DEF_ZMM_OFFS(26),
+    DEF_ZMM_OFFS(27),
+    DEF_ZMM_OFFS(28),
+    DEF_ZMM_OFFS(29),
+    DEF_ZMM_OFFS(30),
+    DEF_ZMM_OFFS(31),
+    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
     fpu_stateH_end,
     r15_off, r15H_off,
     r14_off, r14H_off,
@@ -155,9 +158,10 @@
 
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
   int vect_words = 0;
-  int num_xmm_regs = 16;
-  if (UseAVX > 2) {
-    num_xmm_regs = 32;
+  int off = 0;
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
+  if (UseAVX < 3) {
+    num_xmm_regs = num_xmm_regs/2;
   }
 #ifdef COMPILER2
   if (save_vectors) {
@@ -165,9 +169,7 @@
     assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
     // Save upper half of YMM registers
     vect_words = 16 * num_xmm_regs / wordSize;
-    additional_frame_words += vect_words;
-    if (UseAVX > 2) {
-      // Save upper half of ZMM registers as well
+    if (UseAVX < 3) {
       additional_frame_words += vect_words;
     }
   }
@@ -195,77 +197,13 @@
   __ enter();          // rsp becomes 16-byte aligned here
   __ push_CPU_state(); // Push a multiple of 16 bytes
 
-  if (vect_words > 0) {
+  // push_CPU_state handles this on EVEX-enabled targets
+  if ((vect_words > 0) && (UseAVX < 3)) {
     assert(vect_words*wordSize >= 256, "");
-    __ subptr(rsp, 256); // Save upper half of YMM registes(0..15)
-    __ vextractf128h(Address(rsp, 0), xmm0);
-    __ vextractf128h(Address(rsp, 16), xmm1);
-    __ vextractf128h(Address(rsp, 32), xmm2);
-    __ vextractf128h(Address(rsp, 48), xmm3);
-    __ vextractf128h(Address(rsp, 64), xmm4);
-    __ vextractf128h(Address(rsp, 80), xmm5);
-    __ vextractf128h(Address(rsp, 96), xmm6);
-    __ vextractf128h(Address(rsp, 112), xmm7);
-    __ vextractf128h(Address(rsp, 128), xmm8);
-    __ vextractf128h(Address(rsp, 144), xmm9);
-    __ vextractf128h(Address(rsp, 160), xmm10);
-    __ vextractf128h(Address(rsp, 176), xmm11);
-    __ vextractf128h(Address(rsp, 192), xmm12);
-    __ vextractf128h(Address(rsp, 208), xmm13);
-    __ vextractf128h(Address(rsp, 224), xmm14);
-    __ vextractf128h(Address(rsp, 240), xmm15);
-    if (UseAVX > 2) {
-      __ subptr(rsp, 256); // Save upper half of YMM registes(16..31)
-      __ vextractf128h(Address(rsp, 0), xmm16);
-      __ vextractf128h(Address(rsp, 16), xmm17);
-      __ vextractf128h(Address(rsp, 32), xmm18);
-      __ vextractf128h(Address(rsp, 48), xmm19);
-      __ vextractf128h(Address(rsp, 64), xmm20);
-      __ vextractf128h(Address(rsp, 80), xmm21);
-      __ vextractf128h(Address(rsp, 96), xmm22);
-      __ vextractf128h(Address(rsp, 112), xmm23);
-      __ vextractf128h(Address(rsp, 128), xmm24);
-      __ vextractf128h(Address(rsp, 144), xmm25);
-      __ vextractf128h(Address(rsp, 160), xmm26);
-      __ vextractf128h(Address(rsp, 176), xmm27);
-      __ vextractf128h(Address(rsp, 192), xmm28);
-      __ vextractf128h(Address(rsp, 208), xmm29);
-      __ vextractf128h(Address(rsp, 224), xmm30);
-      __ vextractf128h(Address(rsp, 240), xmm31);
-      // Now handle the ZMM registers (0..31)
-      __ subptr(rsp, 1024); // Save upper half of ZMM registes
-      __ vextractf64x4h(Address(rsp, 0), xmm0);
-      __ vextractf64x4h(Address(rsp, 32), xmm1);
-      __ vextractf64x4h(Address(rsp, 64), xmm2);
-      __ vextractf64x4h(Address(rsp, 96), xmm3);
-      __ vextractf64x4h(Address(rsp, 128), xmm4);
-      __ vextractf64x4h(Address(rsp, 160), xmm5);
-      __ vextractf64x4h(Address(rsp, 192), xmm6);
-      __ vextractf64x4h(Address(rsp, 224), xmm7);
-      __ vextractf64x4h(Address(rsp, 256), xmm8);
-      __ vextractf64x4h(Address(rsp, 288), xmm9);
-      __ vextractf64x4h(Address(rsp, 320), xmm10);
-      __ vextractf64x4h(Address(rsp, 352), xmm11);
-      __ vextractf64x4h(Address(rsp, 384), xmm12);
-      __ vextractf64x4h(Address(rsp, 416), xmm13);
-      __ vextractf64x4h(Address(rsp, 448), xmm14);
-      __ vextractf64x4h(Address(rsp, 480), xmm15);
-      __ vextractf64x4h(Address(rsp, 512), xmm16);
-      __ vextractf64x4h(Address(rsp, 544), xmm17);
-      __ vextractf64x4h(Address(rsp, 576), xmm18);
-      __ vextractf64x4h(Address(rsp, 608), xmm19);
-      __ vextractf64x4h(Address(rsp, 640), xmm20);
-      __ vextractf64x4h(Address(rsp, 672), xmm21);
-      __ vextractf64x4h(Address(rsp, 704), xmm22);
-      __ vextractf64x4h(Address(rsp, 736), xmm23);
-      __ vextractf64x4h(Address(rsp, 768), xmm24);
-      __ vextractf64x4h(Address(rsp, 800), xmm25);
-      __ vextractf64x4h(Address(rsp, 832), xmm26);
-      __ vextractf64x4h(Address(rsp, 864), xmm27);
-      __ vextractf64x4h(Address(rsp, 896), xmm28);
-      __ vextractf64x4h(Address(rsp, 928), xmm29);
-      __ vextractf64x4h(Address(rsp, 960), xmm30);
-      __ vextractf64x4h(Address(rsp, 992), xmm31);
+    // Save upper half of YMM registers (0..num_xmm_regs)
+    __ subptr(rsp, num_xmm_regs*16);
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
     }
   }
   if (frame::arg_reg_save_area_bytes != 0) {
@@ -299,39 +237,24 @@
   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
-  if (UseAVX > 2) {
-    map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg());
+  // For both AVX and EVEX we use the legacy FXSAVE area for xmm0..xmm15;
+  // on EVEX-enabled targets it is already included in the XSAVE area
+  off = xmm0_off;
+  int delta = xmm1_off - off;
+  for (int n = 0; n < 16; n++) {
+    XMMRegister xmm_name = as_XMMRegister(n);
+    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+    off += delta;
+  }
+  if (UseAVX > 2) {
+    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
+    off = zmm16_off;
+    delta = zmm17_off - off;
+    for (int n = 16; n < num_xmm_regs; n++) {
+      XMMRegister xmm_name = as_XMMRegister(n);
+      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+      off += delta;
+    }
   }
 
   // %%% These should all be a waste but we'll keep things as they were for now
@@ -351,39 +274,24 @@
     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
+    // For both AVX and EVEX we use the legacy FXSAVE area for xmm0..xmm15;
+    // on EVEX-enabled targets it is already included in the XSAVE area
+    off = xmm0H_off;
+    delta = xmm1H_off - off;
+    for (int n = 0; n < 16; n++) {
+      XMMRegister xmm_name = as_XMMRegister(n);
+      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
+      off += delta;
+    }
     if (UseAVX > 2) {
-      map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next());
+      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
+      off = zmm16H_off;
+      delta = zmm17H_off - off;
+      for (int n = 16; n < num_xmm_regs; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
+        off += delta;
+      }
     }
   }
 
@@ -391,86 +299,25 @@
 }
 
 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
+  if (UseAVX < 3) {
+    num_xmm_regs = num_xmm_regs/2;
+  }
   if (frame::arg_reg_save_area_bytes != 0) {
     // Pop arg register save area
     __ addptr(rsp, frame::arg_reg_save_area_bytes);
   }
 #ifdef COMPILER2
-  if (restore_vectors) {
-    // Restore upper half of YMM registes (0..15)
-    assert(UseAVX > 0, "512bit vectors are supported only with AVX");
-    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
-    __ vinsertf128h(xmm0, Address(rsp,  0));
-    __ vinsertf128h(xmm1, Address(rsp, 16));
-    __ vinsertf128h(xmm2, Address(rsp, 32));
-    __ vinsertf128h(xmm3, Address(rsp, 48));
-    __ vinsertf128h(xmm4, Address(rsp, 64));
-    __ vinsertf128h(xmm5, Address(rsp, 80));
-    __ vinsertf128h(xmm6, Address(rsp, 96));
-    __ vinsertf128h(xmm7, Address(rsp,112));
-    __ vinsertf128h(xmm8, Address(rsp,128));
-    __ vinsertf128h(xmm9, Address(rsp,144));
-    __ vinsertf128h(xmm10, Address(rsp,160));
-    __ vinsertf128h(xmm11, Address(rsp,176));
-    __ vinsertf128h(xmm12, Address(rsp,192));
-    __ vinsertf128h(xmm13, Address(rsp,208));
-    __ vinsertf128h(xmm14, Address(rsp,224));
-    __ vinsertf128h(xmm15, Address(rsp,240));
-    __ addptr(rsp, 256);
-    if (UseAVX > 2) {
-      // Restore upper half of YMM registes (16..31)
-      __ vinsertf128h(xmm16, Address(rsp,  0));
-      __ vinsertf128h(xmm17, Address(rsp, 16));
-      __ vinsertf128h(xmm18, Address(rsp, 32));
-      __ vinsertf128h(xmm19, Address(rsp, 48));
-      __ vinsertf128h(xmm20, Address(rsp, 64));
-      __ vinsertf128h(xmm21, Address(rsp, 80));
-      __ vinsertf128h(xmm22, Address(rsp, 96));
-      __ vinsertf128h(xmm23, Address(rsp,112));
-      __ vinsertf128h(xmm24, Address(rsp,128));
-      __ vinsertf128h(xmm25, Address(rsp,144));
-      __ vinsertf128h(xmm26, Address(rsp,160));
-      __ vinsertf128h(xmm27, Address(rsp,176));
-      __ vinsertf128h(xmm28, Address(rsp,192));
-      __ vinsertf128h(xmm29, Address(rsp,208));
-      __ vinsertf128h(xmm30, Address(rsp,224));
-      __ vinsertf128h(xmm31, Address(rsp,240));
-      __ addptr(rsp, 256);
-      // Restore upper half of ZMM registes.
-      __ vinsertf64x4h(xmm0, Address(rsp, 0));
-      __ vinsertf64x4h(xmm1, Address(rsp, 32));
-      __ vinsertf64x4h(xmm2, Address(rsp, 64));
-      __ vinsertf64x4h(xmm3, Address(rsp, 96));
-      __ vinsertf64x4h(xmm4, Address(rsp, 128));
-      __ vinsertf64x4h(xmm5, Address(rsp, 160));
-      __ vinsertf64x4h(xmm6, Address(rsp, 192));
-      __ vinsertf64x4h(xmm7, Address(rsp, 224));
-      __ vinsertf64x4h(xmm8, Address(rsp, 256));
-      __ vinsertf64x4h(xmm9, Address(rsp, 288));
-      __ vinsertf64x4h(xmm10, Address(rsp, 320));
-      __ vinsertf64x4h(xmm11, Address(rsp, 352));
-      __ vinsertf64x4h(xmm12, Address(rsp, 384));
-      __ vinsertf64x4h(xmm13, Address(rsp, 416));
-      __ vinsertf64x4h(xmm14, Address(rsp, 448));
-      __ vinsertf64x4h(xmm15, Address(rsp, 480));
-      __ vinsertf64x4h(xmm16, Address(rsp, 512));
-      __ vinsertf64x4h(xmm17, Address(rsp, 544));
-      __ vinsertf64x4h(xmm18, Address(rsp, 576));
-      __ vinsertf64x4h(xmm19, Address(rsp, 608));
-      __ vinsertf64x4h(xmm20, Address(rsp, 640));
-      __ vinsertf64x4h(xmm21, Address(rsp, 672));
-      __ vinsertf64x4h(xmm22, Address(rsp, 704));
-      __ vinsertf64x4h(xmm23, Address(rsp, 736));
-      __ vinsertf64x4h(xmm24, Address(rsp, 768));
-      __ vinsertf64x4h(xmm25, Address(rsp, 800));
-      __ vinsertf64x4h(xmm26, Address(rsp, 832));
-      __ vinsertf64x4h(xmm27, Address(rsp, 864));
-      __ vinsertf64x4h(xmm28, Address(rsp, 896));
-      __ vinsertf64x4h(xmm29, Address(rsp, 928));
-      __ vinsertf64x4h(xmm30, Address(rsp, 960));
-      __ vinsertf64x4h(xmm31, Address(rsp, 992));
-      __ addptr(rsp, 1024);
+  // On EVEX-enabled targets everything is handled when the FPU state is popped
+  if ((restore_vectors) && (UseAVX < 3)) {
+    assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 64, "up to 512bit vectors are supported now");
+    int off = 0;
+    // Restore upper half of YMM registers (0..num_xmm_regs)
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vinsertf128h(as_XMMRegister(n), Address(rsp,  off++*16));
     }
+    __ addptr(rsp, num_xmm_regs*16);
   }
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
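
The new DEF_ZMM_OFFS macro in this file lays zmm16..zmm31 out at 64-byte strides from zmm_off, expressed in jint slots; below is a minimal standalone sketch of that arithmetic. BytesPerInt = 4 and the formula match the macro; the zmm_off base used here is illustrative, since the real value depends on FPUStateSizeInWords.

// Sketch (not HotSpot code): offsets produced by
//   zmm##regnum##_off = zmm_off + (regnum-16)*64/BytesPerInt
// for regnum 16..31, with an illustrative zmm_off of 0.
#include <cstdio>

int main() {
  const int BytesPerInt = 4;
  const int zmm_off = 0;              // illustrative base slot
  for (int regnum = 16; regnum < 32; regnum++) {
    int off = zmm_off + (regnum - 16) * 64 / BytesPerInt;
    printf("zmm%d_off = %d jint slots\n", regnum, off);
  }
  return 0;
}
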
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -722,7 +722,7 @@
            __ popa();
          }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -754,7 +754,7 @@
         }
         break;
 
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
@@ -795,6 +795,12 @@
   void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
     assert( UseSSE >= 2, "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    if (UseAVX > 2) {
+      __ push(rbx);
+      __ movl(rbx, 0xffff);
+      __ kmovdl(k1, rbx);
+      __ pop(rbx);
+    }
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
     __ align(OptoLoopAlignment);
@@ -802,8 +808,8 @@
 
     if (UseUnalignedLoadStores) {
       if (UseAVX > 2) {
-        __ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit);
-        __ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
+        __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
+        __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(from,  0));
         __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
@@ -2217,6 +2223,15 @@
     const XMMRegister xmm_temp4  = xmm5;
 
     __ enter();   // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     __ movptr(from, from_param);
     __ movptr(key, key_param);
 
@@ -2315,6 +2330,15 @@
     const XMMRegister xmm_temp4  = xmm5;
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     __ movptr(from, from_param);
     __ movptr(key, key_param);
 
@@ -2441,6 +2465,14 @@
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     handleSOERegisters(true /*saving*/);
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     // load registers from incoming parameters
     const Address  from_param(rbp, 8+0);
     const Address  to_param  (rbp, 8+4);
@@ -2602,6 +2634,14 @@
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     handleSOERegisters(true /*saving*/);
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     // load registers from incoming parameters
     const Address  from_param(rbp, 8+0);
     const Address  to_param  (rbp, 8+4);
@@ -2782,6 +2822,14 @@
     __ enter();
     handleSOERegisters(true);  // Save registers
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     __ movptr(state, state_param);
     __ movptr(subkeyH, subkeyH_param);
     __ movptr(data, data_param);
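
The stubs above repeatedly seed opmask k1 with 0xffff (movl plus kmovdl/kmovwl) before running EVEX-encoded 128-bit code; below is a minimal standalone sketch of what that mask means when a 512-bit register is viewed as 16 dword lanes. The masked-move simulation and its names are illustrative only; with an all-ones mask every lane is written, so the EVEX instruction behaves like its unmasked form.

// Sketch (not HotSpot code): bit n of an opmask gates lane n of a
// 16 x 32-bit register. 0xffff enables every lane, which is why the
// stubs load it into k1 before the EVEX-encoded instructions.
#include <cstdint>
#include <cstdio>

static void masked_move_dwords(uint32_t dst[16], const uint32_t src[16], uint16_t mask) {
  for (int lane = 0; lane < 16; lane++) {
    if (mask & (1u << lane)) {
      dst[lane] = src[lane];          // enabled lane: copied
    }                                 // disabled lane: left unchanged (merge masking)
  }
}

int main() {
  uint32_t src[16], dst[16];
  for (int i = 0; i < 16; i++) { src[i] = 0x11111111u * (i + 1); dst[i] = 0; }
  masked_move_dwords(dst, src, 0xffff);   // all lanes written
  printf("lane 0 = 0x%08x, lane 15 = 0x%08x\n", dst[0], dst[15]);
  return 0;
}
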
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -269,12 +269,16 @@
       __ kmovql(k1, rbx);
     }
 #ifdef _WIN64
+    int last_reg = 15;
     if (UseAVX > 2) {
-      for (int i = 6; i <= 31; i++) {
-        __ movdqu(xmm_save(i), as_XMMRegister(i));
+      last_reg = 31;
+    }
+    if (VM_Version::supports_avx512novl()) {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
+        __ vextractf32x4h(xmm_save(i), as_XMMRegister(i), 0);
       }
     } else {
-      for (int i = 6; i <= 15; i++) {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
         __ movdqu(xmm_save(i), as_XMMRegister(i));
       }
     }
@@ -367,28 +371,34 @@
 #ifdef ASSERT
     // verify that threads correspond
     {
-      Label L, S;
+      Label L1, L2, L3;
       __ cmpptr(r15_thread, thread);
-      __ jcc(Assembler::notEqual, S);
+      __ jcc(Assembler::equal, L1);
+      __ stop("StubRoutines::call_stub: r15_thread is corrupted");
+      __ bind(L1);
       __ get_thread(rbx);
+      __ cmpptr(r15_thread, thread);
+      __ jcc(Assembler::equal, L2);
+      __ stop("StubRoutines::call_stub: r15_thread is modified by call");
+      __ bind(L2);
       __ cmpptr(r15_thread, rbx);
-      __ jcc(Assembler::equal, L);
-      __ bind(S);
-      __ jcc(Assembler::equal, L);
+      __ jcc(Assembler::equal, L3);
       __ stop("StubRoutines::call_stub: threads must correspond");
-      __ bind(L);
+      __ bind(L3);
     }
 #endif
 
     // restore regs belonging to calling function
 #ifdef _WIN64
-    int xmm_ub = 15;
-    if (UseAVX > 2) {
-      xmm_ub = 31;
-    }
     // emit the restores for xmm regs
-    for (int i = 6; i <= xmm_ub; i++) {
-      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    if (VM_Version::supports_avx512novl()) {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
+        __ vinsertf32x4h(as_XMMRegister(i), xmm_save(i), 0);
+      }
+    } else {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
+        __ movdqu(as_XMMRegister(i), xmm_save(i));
+      }
     }
 #endif
     __ movptr(r15, r15_save);
@@ -450,15 +460,20 @@
 #ifdef ASSERT
     // verify that threads correspond
     {
-      Label L, S;
+      Label L1, L2, L3;
       __ cmpptr(r15_thread, thread);
-      __ jcc(Assembler::notEqual, S);
+      __ jcc(Assembler::equal, L1);
+      __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
+      __ bind(L1);
       __ get_thread(rbx);
+      __ cmpptr(r15_thread, thread);
+      __ jcc(Assembler::equal, L2);
+      __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
+      __ bind(L2);
       __ cmpptr(r15_thread, rbx);
-      __ jcc(Assembler::equal, L);
-      __ bind(S);
+      __ jcc(Assembler::equal, L3);
       __ stop("StubRoutines::catch_exception: threads must correspond");
-      __ bind(L);
+      __ bind(L3);
     }
 #endif
 
@@ -1244,7 +1259,7 @@
            __ popa();
         }
          break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -1284,7 +1299,7 @@
           __ popa();
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
@@ -1333,11 +1348,15 @@
     __ align(OptoLoopAlignment);
     if (UseUnalignedLoadStores) {
       Label L_end;
+      if (UseAVX > 2) {
+        __ movl(to, 0xffff);
+        __ kmovql(k1, to);
+      }
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
       if (UseAVX > 2) {
-        __ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
-        __ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
+        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
+        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
@@ -1413,11 +1432,15 @@
     __ align(OptoLoopAlignment);
     if (UseUnalignedLoadStores) {
       Label L_end;
+      if (UseAVX > 2) {
+        __ movl(to, 0xffff);
+        __ kmovql(k1, to);
+      }
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
       if (UseAVX > 2) {
-        __ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
-        __ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
+        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
+        __ evmovdqul(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
@@ -3097,6 +3120,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 
@@ -3191,6 +3222,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 
@@ -3303,6 +3342,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
 #ifdef _WIN64
     // on win64, fill len_reg from stack position
     __ movl(len_reg, len_mem);
@@ -3499,6 +3546,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
 #ifdef _WIN64
     // on win64, fill len_reg from stack position
     __ movl(len_reg, len_mem);
@@ -3737,6 +3792,14 @@
 
     __ enter();
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
 #ifdef _WIN64
     // save the xmm registers which must be preserved 6-10
     __ subptr(rsp, -rsp_after_call_off * wordSize);
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -31,7 +31,7 @@
 
 enum platform_dependent_constants {
   code_size1 =  9000,           // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000            // simply increase if too small (assembler will crash if too small)
+  code_size2 = 30000            // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -33,7 +33,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 24000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 32000           // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/hotspot/src/cpu/x86/vm/templateTable_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/templateTable_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -200,7 +200,7 @@
       }
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       {
         if (val == noreg) {
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -367,16 +367,12 @@
     __ movl(rcx, VM_Version::ymm_test_value());
     __ movdl(xmm0, rcx);
     __ movl(rcx, 0xffff);
+    __ kmovwl(k1, rcx);
+    __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit);
 #ifdef _LP64
-    __ kmovql(k1, rcx);
-#else
-    __ kmovdl(k1, rcx);
-#endif
-    __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
-    __ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit);
-#ifdef _LP64
-    __ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit);
-    __ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(xmm8, xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(xmm31, xmm0, Assembler::AVX_512bit);
 #endif
     VM_Version::clean_cpuFeatures();
     __ jmp(save_restore_except);
@@ -427,11 +423,11 @@
     UseAVX = 3;
     UseSSE = 2;
     __ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset())));
-    __ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
-    __ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
 #ifdef _LP64
-    __ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
-    __ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
 #endif
     VM_Version::clean_cpuFeatures();
     UseAVX = saved_useavx;
@@ -714,6 +710,11 @@
     FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
   }
 
+  if (UseAdler32Intrinsics) {
+    warning("Adler32Intrinsics not available on this CPU.");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
   // Adjust RTM (Restricted Transactional Memory) flags
   if (!supports_rtm() && UseRTMLocking) {
     // Can't continue because UseRTMLocking affects UseBiasedLocking flag
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -227,14 +227,15 @@
   union XemXcr0Eax {
     uint32_t value;
     struct {
-      uint32_t x87    : 1,
-               sse    : 1,
-               ymm    : 1,
-                      : 2,
-               opmask : 1,
-               zmm512 : 1,
-                zmm32 : 1,
-                      : 24;
+      uint32_t x87     : 1,
+               sse     : 1,
+               ymm     : 1,
+               bndregs : 1,
+               bndcsr  : 1,
+               opmask  : 1,
+               zmm512  : 1,
+               zmm32   : 1,
+                       : 24;
     } bits;
   };
 
@@ -703,6 +704,7 @@
   static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
   static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
   static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
+  static bool supports_avx512novl() { return (supports_evex() && !supports_avx512vl()); }
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
@@ -817,6 +819,12 @@
     intx count = PrefetchFieldsAhead;
     return count >= 0 ? count : 1;
   }
+  static uint32_t get_xsave_header_lower_segment() {
+    return _cpuid_info.xem_xcr0_eax.value;
+  }
+  static uint32_t get_xsave_header_upper_segment() {
+    return _cpuid_info.xem_xcr0_edx;
+  }
 };
 
 #endif // CPU_X86_VM_VM_VERSION_X86_HPP
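
The widened XemXcr0Eax bitfield above names every XCR0 feature bit that matters here; below is a minimal standalone sketch that decodes the same bits by shifting and masking. The bit positions are taken from the struct (x87=0, sse=1, ymm=2, bndregs=3, bndcsr=4, opmask=5, zmm512=6, zmm32=7); the sample value is made up for illustration.

// Sketch (not HotSpot code): print the low XCR0 feature bits named in
// XemXcr0Eax for an illustrative sample value.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t xcr0 = 0xe7;   // illustrative: x87+sse+ymm+opmask+zmm512+zmm32
  const char* names[8] = {"x87", "sse", "ymm", "bndregs", "bndcsr",
                          "opmask", "zmm512", "zmm32"};
  for (int bit = 0; bit < 8; bit++) {
    printf("%-8s : %u\n", names[bit], (xcr0 >> bit) & 1u);
  }
  return 0;
}
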
--- a/hotspot/src/cpu/x86/vm/x86.ad	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/x86.ad	Mon Sep 14 07:03:04 2015 +0000
@@ -1661,46 +1661,55 @@
   if (!has_match_rule(opcode))
     return false;
 
+  bool ret_value = true;
   switch (opcode) {
     case Op_PopCountI:
     case Op_PopCountL:
       if (!UsePopCountInstruction)
-        return false;
-    break;
+        ret_value = false;
+      break;
     case Op_MulVI:
       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
-        return false;
-    break;
+        ret_value = false;
+      break;
     case Op_MulVL:
     case Op_MulReductionVL:
       if (VM_Version::supports_avx512dq() == false)
-        return false;
+        ret_value = false;
+      break;
     case Op_AddReductionVL:
       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
-        return false;
+        ret_value = false;
+      break;
     case Op_AddReductionVI:
       if (UseSSE < 3) // requires at least SSE3
-        return false;
+        ret_value = false;
+      break;
     case Op_MulReductionVI:
       if (UseSSE < 4) // requires at least SSE4
-        return false;
+        ret_value = false;
+      break;
     case Op_AddReductionVF:
     case Op_AddReductionVD:
     case Op_MulReductionVF:
     case Op_MulReductionVD:
       if (UseSSE < 1) // requires at least SSE
-        return false;
-    break;
+        ret_value = false;
+      break;
+    case Op_SqrtVD:
+      if (UseAVX < 1) // enabled for AVX only
+        ret_value = false;
+      break;
     case Op_CompareAndSwapL:
 #ifdef _LP64
     case Op_CompareAndSwapP:
 #endif
       if (!VM_Version::supports_cx8())
-        return false;
-    break;
+        ret_value = false;
+      break;
   }
 
-  return true;  // Per default match rules are supported.
+  return ret_value;  // Per default match rules are supported.
 }
 
 // Max vector size in bytes. 0 if not supported.
@@ -1721,14 +1730,24 @@
   case T_DOUBLE:
   case T_LONG:
     if (size < 16) return 0;
+    break;
   case T_FLOAT:
   case T_INT:
     if (size < 8) return 0;
+    break;
   case T_BOOLEAN:
+    if (size < 4) return 0;
+    break;
+  case T_CHAR:
+    if (size < 4) return 0;
+    break;
   case T_BYTE:
-  case T_CHAR:
+    if (size < 4) return 0;
+    if ((size > 32) && !VM_Version::supports_avx512bw()) return 0;
+    break;
   case T_SHORT:
     if (size < 4) return 0;
+    if ((size > 16) && !VM_Version::supports_avx512bw()) return 0;
     break;
   default:
     ShouldNotReachHere();
@@ -1800,7 +1819,7 @@
       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
       break;
     case Op_VecZ:
-      __ evmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
+      __ evmovdqul(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
       break;
     default:
       ShouldNotReachHere();
@@ -1855,7 +1874,7 @@
         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
         break;
       case Op_VecZ:
-        __ evmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
+        __ evmovdqul(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
         break;
       default:
         ShouldNotReachHere();
@@ -1875,7 +1894,7 @@
         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
         break;
       case Op_VecZ:
-        __ evmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
+        __ evmovdqul(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
         break;
       default:
         ShouldNotReachHere();
@@ -1929,9 +1948,40 @@
     }
 #endif
   }
-  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
+  bool is_single_byte = false;
+  int vec_len = 0;
+  if ((UseAVX > 2) && (stack_offset != 0)) {
+    switch (ireg) {
+    case Op_VecS:
+    case Op_VecD:
+    case Op_VecX:
+      break;
+    case Op_VecY:
+      vec_len = 1;
+      break;
+    case Op_VecZ:
+      vec_len = 2;
+      break;
+    }
+    is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, Assembler::EVEX_FVM, Assembler::EVEX_32bit, 0);
+  }
+  int offset_size = 0;
+  int size = 5;
+  if (UseAVX > 2) {
+    if ((VM_Version::supports_avx512vl() == false) && (vec_len == 2)) { 
+      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
+      size += 2; // Need an additional two bytes for EVEX encoding
+    } else if ((VM_Version::supports_avx512vl() == false) && (vec_len < 2)) { 
+      offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
+    } else {
+      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
+      size += 2; // Need an additional two bytes for EVEX encoding
+    }
+  } else {
+    offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
+  }
   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
-  return 5+offset_size;
+  return size+offset_size;
 }
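
(Illustrative sketch, not part of the changeset: the size estimate above starts from a 5-byte VEX-encoded move, adds 2 bytes when an EVEX prefix is needed, and then 0, 1 or 4 bytes for the stack displacement. Reduced to its arithmetic:)

    // Sketch only -- e.g. an EVEX spill whose displacement compresses to one byte
    // costs 5 + 2 + 1 = 8 bytes.
    static int spill_size(bool needs_evex, int disp_bytes /* 0, 1 or 4 */) {
      int size = 5;                  // base SIMD/VEX move
      if (needs_evex) size += 2;     // extra EVEX prefix bytes
      return size + disp_bytes;
    }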
 
 static inline jfloat replicate4_imm(int con, int width) {
@@ -2675,11 +2725,10 @@
   predicate(UseAVX > 0);
   match(Set dst (NegF src));
   ins_cost(150);
-  format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signflip()), vector_len);
+  format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
+  ins_encode %{
+    __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
+                 ExternalAddress(float_signflip()));
   %}
   ins_pipe(pipe_slow);
 %}
@@ -2700,12 +2749,11 @@
   predicate(UseAVX > 0);
   match(Set dst (NegD src));
   ins_cost(150);
-  format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
+  format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
             "# neg double by sign flipping" %}
   ins_encode %{
-    int vector_len = 0;
-    __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signflip()), vector_len);
+    __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
+                 ExternalAddress(double_signflip()));
   %}
   ins_pipe(pipe_slow);
 %}
@@ -2838,7 +2886,7 @@
   format %{ "vmovdqu $dst k0,$mem\t! load vector (64 bytes)" %}
   ins_encode %{
     int vector_len = 2;
-    __ evmovdqu($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -2895,7 +2943,7 @@
   format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %}
   ins_encode %{
     int vector_len = 2;
-    __ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len);
+    __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3315,6 +3363,37 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl2F_zero(vecD dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F_zero(vecX dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_zero(vecY dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
+  match(Set dst (ReplicateF zero));
+  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 instruct Repl2D_mem(vecX dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD (LoadD mem)));
@@ -3349,6 +3428,28 @@
   ins_pipe( pipe_slow );
 %}
 
+// Replicate double (8 byte) scalar zero to be vector
+instruct Repl2D_zero(vecX dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
+  match(Set dst (ReplicateD zero));
+  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
+  ins_encode %{
+    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_zero(vecY dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
+  match(Set dst (ReplicateD zero));
+  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // ====================GENERIC REPLICATE==========================================
 
 // Replicate byte scalar to be vector
@@ -3680,38 +3781,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate float (4 byte) scalar zero to be vector
-instruct Repl2F_zero(vecD dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateF zero));
-  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
-  ins_encode %{
-    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4F_zero(vecX dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateF zero));
-  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
-  ins_encode %{
-    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl8F_zero(vecY dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateF zero));
-  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate double (8 bytes) scalar to be vector
 instruct Repl2D(vecX dst, regD src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3723,28 +3792,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate double (8 byte) scalar zero to be vector
-instruct Repl2D_zero(vecX dst, immD0 zero) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateD zero));
-  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
-  ins_encode %{
-    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4D_zero(vecY dst, immD0 zero) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateD zero));
-  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // ====================EVEX REPLICATE=============================================
 
 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
@@ -3814,7 +3861,7 @@
 %}
 
 instruct Repl64B_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB src));
   format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
   ins_encode %{
@@ -3825,7 +3872,7 @@
 %}
 
 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
   ins_encode %{
@@ -3862,7 +3909,7 @@
 %}
 
 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
@@ -3953,7 +4000,7 @@
 %}
 
 instruct Repl32S_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS src));
   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
   ins_encode %{
@@ -3964,7 +4011,7 @@
 %}
 
 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
   ins_encode %{
@@ -4001,7 +4048,7 @@
 %}
 
 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastw $dst,$dst\t! replicate32S" %}
@@ -4318,13 +4365,50 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
   match(Set dst (ReplicateF zero));
-  format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
-  ins_encode %{
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
     int vector_len = 2;
-    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -4373,13 +4457,38 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
+  match(Set dst (ReplicateD zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
+  match(Set dst (ReplicateD zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateD zero));
-  format %{ "vxorpd  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
-  ins_encode %{
+  format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
     int vector_len = 2;
-    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -7474,6 +7583,75 @@
   ins_pipe( pipe_slow );
 %}
 
+// --------------------------------- Sqrt --------------------------------------
+
+// Floating point vector sqrt - double precision only
+instruct vsqrt2D_reg(vecX dst, vecX src) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SqrtVD src));
+  format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt2D_mem(vecX dst, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SqrtVD (LoadVector mem)));
+  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt4D_reg(vecY dst, vecY src) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SqrtVD src));
+  format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt4D_mem(vecY dst, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SqrtVD (LoadVector mem)));
+  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SqrtVD src));
+  format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt8D_mem(vecZ dst, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SqrtVD (LoadVector mem)));
+  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ------------------------------ LeftShift -----------------------------------
 
 // Shorts/Chars vector left shift
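
(Illustrative sketch, not part of the changeset: the new SqrtVD entries above, like the other AD blocks in this file, pass a vector_len selector to the assembler -- 0 for 128-bit vecX, 1 for 256-bit vecY, 2 for 512-bit vecZ. As a stand-alone mapping:)

    // Sketch only -- hypothetical helper naming the vector_len convention used above.
    static int avx_vector_len_for_bytes(int bytes) {
      switch (bytes) {
      case 16: return 0;   // vecX, e.g. vsqrt2D_reg
      case 32: return 1;   // vecY, e.g. vsqrt4D_reg
      case 64: return 2;   // vecZ, e.g. vsqrt8D_reg
      default: return -1;  // not a vector width handled here
      }
    }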
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Mon Sep 14 07:03:04 2015 +0000
@@ -1004,10 +1004,10 @@
       __ vmovdqu(Address(rsp, dst_offset), xmm0);
       __ vmovdqu(xmm0, Address(rsp, -32));
     case Op_VecZ:
-      __ evmovdqu(Address(rsp, -64), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, src_offset), 2);
-      __ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, -64), 2);
+      __ evmovdqul(Address(rsp, -64), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Mon Sep 14 07:03:04 2015 +0000
@@ -1075,10 +1075,10 @@
       __ vmovdqu(Address(rsp, dst_offset), xmm0);
       __ vmovdqu(xmm0, Address(rsp, -32));
     case Op_VecZ:
-      __ evmovdqu(Address(rsp, -64), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, src_offset), 2);
-      __ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, -64), 2);
+      __ evmovdqul(Address(rsp, -64), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
--- a/hotspot/src/os_cpu/linux_sparc/vm/vm_version_linux_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/os_cpu/linux_sparc/vm/vm_version_linux_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -53,6 +53,10 @@
   return cpuinfo_field_contains("cpu", "Niagara");
 }
 
+static bool detect_M_family() {
+  return cpuinfo_field_contains("cpu", "SPARC-M");
+}
+
 static bool detect_blkinit() {
   return cpuinfo_field_contains("cpucaps", "blkinit");
 }
@@ -66,6 +70,11 @@
     features = niagara1_m | T_family_m;
   }
 
+  if (detect_M_family()) {
+    NOT_PRODUCT(if (PrintMiscellaneous && Verbose) tty->print_cr("Detected Linux on M family");)
+    features = sun4v_m | generic_v9_m | M_family_m | T_family_m;
+  }
+
   if (detect_blkinit()) {
     features |= blk_init_instructions_m;
   }
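
(Illustrative sketch, not part of the changeset: detect_M_family() above relies on the existing cpuinfo_field_contains() helper. A generic stand-in that scans /proc/cpuinfo the same way might look like this; the function name is made up.)

    #include <stdio.h>
    #include <string.h>

    // Sketch only -- true if a /proc/cpuinfo line for `field` contains `needle`,
    // e.g. cpuinfo_contains("cpu", "SPARC-M").
    static bool cpuinfo_contains(const char* field, const char* needle) {
      FILE* f = fopen("/proc/cpuinfo", "r");
      if (f == NULL) return false;
      char line[1024];
      bool found = false;
      while (!found && fgets(line, sizeof(line), f) != NULL) {
        if (strncmp(line, field, strlen(field)) == 0 && strstr(line, needle) != NULL) {
          found = true;
        }
      }
      fclose(f);
      return found;
    }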
--- a/hotspot/src/share/vm/adlc/Doc/Syntax.doc	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/share/vm/adlc/Doc/Syntax.doc	Mon Sep 14 07:03:04 2015 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 1997, 1998, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -33,7 +33,7 @@
 the architecture of a processor, and is the input to the ADL Compiler.  The
 ADL Compiler compiles an ADL file into code which is incorporated into the
 Optimizing Just In Time Compiler (OJIT) to generate efficient and correct code
-for the target architecture.  The ADL describes three bassic different types
+for the target architecture.  The ADL describes three basic different types
 of architectural features.  It describes the instruction set (and associated
 operands) of the target architecture.  It describes the register set of the
 target architecture along with relevant information for the register allocator.
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -4143,6 +4143,7 @@
     "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
     "MulVS","MulVI","MulVL","MulVF","MulVD",
     "DivVF","DivVD",
+    "SqrtVD",
     "AndV" ,"XorV" ,"OrV",
     "AddReductionVI", "AddReductionVL",
     "AddReductionVF", "AddReductionVD",
--- a/hotspot/src/share/vm/c1/c1_Compiler.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/share/vm/c1/c1_Compiler.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -32,7 +32,6 @@
 #include "c1/c1_Runtime1.hpp"
 #include "c1/c1_ValueType.hpp"
 #include "compiler/compileBroker.hpp"
-#include "compiler/compilerOracle.hpp"
 #include "interpreter/linkResolver.hpp"
 #include "memory/allocation.hpp"
 #include "memory/allocation.inline.hpp"
--- a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -4212,7 +4212,7 @@
   if (!PrintInlining && !compilation()->method()->has_option("PrintInlining")) {
     return;
   }
-  CompileTask::print_inlining(callee, scope()->level(), bci(), msg);
+  CompileTask::print_inlining_tty(callee, scope()->level(), bci(), msg);
   if (success && CIPrintMethodCodes) {
     callee->print_codes();
   }
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1425,7 +1425,7 @@
       G1SATBCardTableModRef_pre_barrier(addr_opr, pre_val, do_load, patch, info);
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       // No pre barriers
       break;
@@ -1445,7 +1445,7 @@
       G1SATBCardTableModRef_post_barrier(addr,  new_val);
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       CardTableModRef_post_barrier(addr,  new_val);
       break;
--- a/hotspot/src/share/vm/ci/bcEscapeAnalyzer.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/share/vm/ci/bcEscapeAnalyzer.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1447,7 +1447,6 @@
 
     if (methodData() == NULL)
       return;
-    bool printit = _method->should_print_assembly();
     if (methodData()->has_escape_info()) {
       TRACE_BCEA(2, tty->print_cr("[EA] Reading previous results for %s.%s",
                                   method->holder()->name()->as_utf8(),
--- a/hotspot/src/share/vm/classfile/classLoader.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/share/vm/classfile/classLoader.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -28,8 +28,8 @@
 #include "classfile/classLoader.hpp"
 #include "classfile/classLoaderData.inline.hpp"
 #include "classfile/classLoaderExt.hpp"
-#include "classfile/imageFile.hpp"
 #include "classfile/javaClasses.hpp"
+#include "classfile/jimage.hpp"
 #include "classfile/systemDictionary.hpp"
 #include "classfile/vmSymbols.hpp"
 #include "compiler/compileBroker.hpp"
@@ -58,6 +58,7 @@
 #include "runtime/os.hpp"
 #include "runtime/threadCritical.hpp"
 #include "runtime/timer.hpp"
+#include "runtime/vm_version.hpp"
 #include "services/management.hpp"
 #include "services/threadService.hpp"
 #include "utilities/events.hpp"
@@ -68,7 +69,7 @@
 #include "classfile/sharedPathsMiscInfo.hpp"
 #endif
 
-// Entry points in zip.dll for loading zip/jar file entries and image file entries
+// Entry points in zip.dll for loading zip/jar file entries
 
 typedef void * * (JNICALL *ZipOpen_t)(const char *name, char **pmsg);
 typedef void (JNICALL *ZipClose_t)(jzfile *zip);
@@ -89,6 +90,15 @@
 static ZipInflateFully_t ZipInflateFully    = NULL;
 static Crc32_t           Crc32              = NULL;
 
+// Entry points for jimage.dll for loading jimage file entries
+
+static JImageOpen_t                    JImageOpen                    = NULL;
+static JImageClose_t                   JImageClose                   = NULL;
+static JImagePackageToModule_t         JImagePackageToModule         = NULL;
+static JImageFindResource_t            JImageFindResource            = NULL;
+static JImageGetResource_t             JImageGetResource             = NULL;
+static JImageResourceIterator_t        JImageResourceIterator        = NULL;
+
 // Globals
 
 PerfCounter*    ClassLoader::_perf_accumulated_time = NULL;
@@ -141,6 +151,15 @@
   return (strncmp(str, str_to_find, str_to_find_len) == 0);
 }
 
+static const char* get_jimage_version_string() {
+  static char version_string[10] = "";
+  if (version_string[0] == '\0') {
+    jio_snprintf(version_string, sizeof(version_string), "%d.%d",
+                 Abstract_VM_Version::vm_minor_version(), Abstract_VM_Version::vm_micro_version());
+  }
+  return (const char*)version_string;
+}
+
 bool string_ends_with(const char* str, const char* str_to_find) {
   size_t str_len = strlen(str);
   size_t str_to_find_len = strlen(str_to_find);
@@ -272,98 +291,114 @@
   }
 }
 
-ClassPathImageEntry::ClassPathImageEntry(ImageFileReader* image) :
+ClassPathImageEntry::ClassPathImageEntry(JImageFile* jimage, const char* name) :
   ClassPathEntry(),
-  _image(image),
-  _module_data(NULL) {
-  guarantee(image != NULL, "image file is null");
-
-  char module_data_name[JVM_MAXPATHLEN];
-  ImageModuleData::module_data_name(module_data_name, _image->name());
-  _module_data = new ImageModuleData(_image, module_data_name);
+  _jimage(jimage) {
+  guarantee(jimage != NULL, "jimage file is null");
+  guarantee(name != NULL, "jimage file name is null");
+  size_t len = strlen(name) + 1;
+  _name = NEW_C_HEAP_ARRAY(const char, len, mtClass);
+  strncpy((char *)_name, name, len);
 }
 
 ClassPathImageEntry::~ClassPathImageEntry() {
-  if (_module_data != NULL) {
-    delete _module_data;
-    _module_data = NULL;
+  if (_name != NULL) {
+    FREE_C_HEAP_ARRAY(const char, _name);
+    _name = NULL;
   }
-
-  if (_image != NULL) {
-    ImageFileReader::close(_image);
-    _image = NULL;
+  if (_jimage != NULL) {
+    (*JImageClose)(_jimage);
+    _jimage = NULL;
   }
 }
 
-const char* ClassPathImageEntry::name() {
-  return _image ? _image->name() : "";
+void ClassPathImageEntry::name_to_package(const char* name, char* buffer, int length) {
+  const char *pslash = strrchr(name, '/');
+  if (pslash == NULL) {
+    buffer[0] = '\0';
+    return;
+  }
+  int len = pslash - name;
+#if INCLUDE_CDS
+  if (len <= 0 && DumpSharedSpaces) {
+    buffer[0] = '\0';
+    return;
+  }
+#endif
+  assert(len > 0, "Bad length for package name");
+  if (len >= length) {
+    buffer[0] = '\0';
+    return;
+  }
+  // drop name after last slash (including slash)
+  // Ex., "java/lang/String.class" => "java/lang"
+  strncpy(buffer, name, len);
+  // ensure string termination (strncpy does not guarantee)
+  buffer[len] = '\0';
 }
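
(Illustrative usage, not part of the changeset: name_to_package() above drops everything from the last slash onward, so for example:)

    char pkg[64];
    ClassPathImageEntry::name_to_package("java/lang/String.class", pkg, sizeof(pkg));
    // pkg is now "java/lang"; a name with no slash, or one too long for the
    // buffer, yields the empty string instead.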
 
+// For a class in a named module, look it up in the jimage file using this syntax:
+//    /<module-name>/<package-name>/<base-class>
+//
+// Assumptions:
+//     1. There are no unnamed modules in the jimage file.
+//     2. A package is in at most one module in the jimage file.
+//
 ClassFileStream* ClassPathImageEntry::open_stream(const char* name, TRAPS) {
-  ImageLocation location;
-  bool found = _image->find_location(name, location);
+  jlong size;
+  JImageLocationRef location = (*JImageFindResource)(_jimage, "", get_jimage_version_string(), name, &size);
 
-  if (!found) {
-    const char *pslash = strrchr(name, '/');
-    int len = pslash - name;
-
-    // NOTE: IMAGE_MAX_PATH is used here since this path is internal to the jimage
-    // (effectively unlimited.)  There are several JCK tests that use paths over
-    // 1024 characters long, the limit on Windows systems.
-    if (pslash && 0 < len && len < IMAGE_MAX_PATH) {
-
-      char path[IMAGE_MAX_PATH];
-      strncpy(path, name, len);
-      path[len] = '\0';
-      const char* moduleName = _module_data->package_to_module(path);
-
-      if (moduleName != NULL && (len + strlen(moduleName) + 2) < IMAGE_MAX_PATH) {
-        jio_snprintf(path, IMAGE_MAX_PATH - 1, "/%s/%s", moduleName, name);
-        location.clear_data();
-        found = _image->find_location(path, location);
-      }
+  if (location == 0) {
+    char package[JIMAGE_MAX_PATH];
+    name_to_package(name, package, JIMAGE_MAX_PATH);
+    if (package[0] != '\0') {
+      const char* module = (*JImagePackageToModule)(_jimage, package);
+      if (module == NULL) {
+        module = "java.base";
+      }
+      location = (*JImageFindResource)(_jimage, module, get_jimage_version_string(), name, &size);
     }
   }
 
-  if (found) {
-    u8 size = location.get_attribute(ImageLocation::ATTRIBUTE_UNCOMPRESSED);
+  if (location != 0) {
     if (UsePerfData) {
       ClassLoader::perf_sys_classfile_bytes_read()->inc(size);
     }
-    u1* data = NEW_RESOURCE_ARRAY(u1, size);
-    _image->get_resource(location, data);
-    return new ClassFileStream(data, (int)size, _image->name());  // Resource allocated
+    char* data = NEW_RESOURCE_ARRAY(char, size);
+    (*JImageGetResource)(_jimage, location, data, size);
+    return new ClassFileStream((u1*)data, (int)size, _name);  // Resource allocated
   }
 
   return NULL;
 }
 
 #ifndef PRODUCT
+bool ctw_visitor(JImageFile* jimage,
+        const char* module_name, const char* version, const char* package,
+        const char* name, const char* extension, void* arg) {
+  if (strcmp(extension, "class") == 0) {
+    Thread* THREAD = Thread::current();
+    char path[JIMAGE_MAX_PATH];
+    jio_snprintf(path, JIMAGE_MAX_PATH - 1, "%s/%s.class", package, name);
+    ClassLoader::compile_the_world_in(path, *(Handle*)arg, THREAD);
+    return !HAS_PENDING_EXCEPTION;
+  }
+  return true;
+}
+
 void ClassPathImageEntry::compile_the_world(Handle loader, TRAPS) {
   tty->print_cr("CompileTheWorld : Compiling all classes in %s", name());
   tty->cr();
-  const ImageStrings strings = _image->get_strings();
-  // Retrieve each path component string.
-  u4 length = _image->table_length();
-  for (u4 i = 0; i < length; i++) {
-    u1* location_data = _image->get_location_data(i);
-
-    if (location_data != NULL) {
-       ImageLocation location(location_data);
-       char path[IMAGE_MAX_PATH];
-       _image->location_path(location, path, IMAGE_MAX_PATH);
-       ClassLoader::compile_the_world_in(path, loader, CHECK);
+  (*JImageResourceIterator)(_jimage, (JImageResourceVisitor_t)ctw_visitor, (void *)&loader);
+  if (HAS_PENDING_EXCEPTION) {
+    if (PENDING_EXCEPTION->is_a(SystemDictionary::OutOfMemoryError_klass())) {
+      CLEAR_PENDING_EXCEPTION;
+      tty->print_cr("\nCompileTheWorld : Ran out of memory\n");
+      tty->print_cr("Increase class metadata storage if a limit was set");
+    } else {
+      tty->print_cr("\nCompileTheWorld : Unexpected exception occurred\n");
     }
   }
-  if (HAS_PENDING_EXCEPTION) {
-  if (PENDING_EXCEPTION->is_a(SystemDictionary::OutOfMemoryError_klass())) {
-    CLEAR_PENDING_EXCEPTION;
-    tty->print_cr("\nCompileTheWorld : Ran out of memory\n");
-    tty->print_cr("Increase class metadata storage if a limit was set");
-  } else {
-    tty->print_cr("\nCompileTheWorld : Unexpected exception occurred\n");
-  }
-  }
 }
 
 bool ClassPathImageEntry::is_jrt() {
@@ -490,7 +525,7 @@
   JavaThread* thread = JavaThread::current();
   ClassPathEntry* new_entry = NULL;
   if ((st->st_mode & S_IFREG) == S_IFREG) {
-    // Regular file, should be a zip or image file
+    // Regular file, should be a zip or jimage file
     // Canonicalized filename
     char canonical_path[JVM_MAXPATHLEN];
     if (!get_canonical_path(path, canonical_path, JVM_MAXPATHLEN)) {
@@ -501,9 +536,10 @@
         return NULL;
       }
     }
-    ImageFileReader* image = ImageFileReader::open(canonical_path);
-    if (image != NULL) {
-      new_entry = new ClassPathImageEntry(image);
+    jint error;
+    JImageFile* jimage = (*JImageOpen)(canonical_path, &error);
+    if (jimage != NULL) {
+      new_entry = new ClassPathImageEntry(jimage, canonical_path);
     } else {
       char* error_msg = NULL;
       jzfile* zip;
@@ -682,6 +718,35 @@
   // This lookup only works on 1.3. Do not check for non-null here
 }
 
+void ClassLoader::load_jimage_library() {
+  // First make sure native library is loaded
+  os::native_java_library();
+  // Load jimage library
+  char path[JVM_MAXPATHLEN];
+  char ebuf[1024];
+  void* handle = NULL;
+  if (os::dll_build_name(path, sizeof(path), Arguments::get_dll_dir(), "jimage")) {
+    handle = os::dll_load(path, ebuf, sizeof ebuf);
+  }
+  if (handle == NULL) {
+    vm_exit_during_initialization("Unable to load jimage library", path);
+  }
+
+  // Lookup jimage entry points
+  JImageOpen = CAST_TO_FN_PTR(JImageOpen_t, os::dll_lookup(handle, "JIMAGE_Open"));
+  guarantee(JImageOpen != NULL, "function JIMAGE_Open not found");
+  JImageClose = CAST_TO_FN_PTR(JImageClose_t, os::dll_lookup(handle, "JIMAGE_Close"));
+  guarantee(JImageClose != NULL, "function JIMAGE_Close not found");
+  JImagePackageToModule = CAST_TO_FN_PTR(JImagePackageToModule_t, os::dll_lookup(handle, "JIMAGE_PackageToModule"));
+  guarantee(JImagePackageToModule != NULL, "function JIMAGE_PackageToModule not found");
+  JImageFindResource = CAST_TO_FN_PTR(JImageFindResource_t, os::dll_lookup(handle, "JIMAGE_FindResource"));
+  guarantee(JImageFindResource != NULL, "function JIMAGE_FindResource not found");
+  JImageGetResource = CAST_TO_FN_PTR(JImageGetResource_t, os::dll_lookup(handle, "JIMAGE_GetResource"));
+  guarantee(JImageGetResource != NULL, "function JIMAGE_GetResource not found");
+  JImageResourceIterator = CAST_TO_FN_PTR(JImageResourceIterator_t, os::dll_lookup(handle, "JIMAGE_ResourceIterator"));
+  guarantee(JImageResourceIterator != NULL, "function JIMAGE_ResourceIterator not found");
+}
+
 jboolean ClassLoader::decompress(void *in, u8 inSize, void *out, u8 outSize, char **pmsg) {
   return (*ZipInflateFully)(in, inSize, out, outSize, pmsg);
 }
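
(Illustrative sketch, not part of the changeset: load_jimage_library() above uses HotSpot's os::dll_load/os::dll_lookup wrappers; the same open-then-resolve pattern in plain POSIX, with a made-up entry-point type, is:)

    #include <dlfcn.h>
    #include <stdio.h>

    typedef void* (*SomeEntry_t)(const char* path, int* error);   // hypothetical signature

    static SomeEntry_t resolve_entry(const char* lib_path, const char* symbol) {
      void* handle = dlopen(lib_path, RTLD_NOW);                   // analogous to os::dll_load
      if (handle == NULL) {
        fprintf(stderr, "cannot load %s: %s\n", lib_path, dlerror());
        return NULL;
      }
      return (SomeEntry_t)dlsym(handle, symbol);                   // analogous to os::dll_lookup
    }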
@@ -1086,6 +1151,8 @@
 
   // lookup zip library entry points
   load_zip_library();
+  // lookup jimage library entry points
+  load_jimage_library();
 #if INCLUDE_CDS
   // initialize search path
   if (DumpSharedSpaces) {
--- a/hotspot/src/share/vm/classfile/classLoader.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/hotspot/src/share/vm/classfile/classLoader.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -37,8 +37,7 @@
 
 // Class path entry (directory or zip file)
 
-class ImageFileReader;
-class ImageModuleData;
+class JImageFile;
 
 class ClassPathEntry: public CHeapObj<mtClass> {
  private:
@@ -52,7 +51,7 @@
   }
   virtual bool is_jar_file() = 0;
   virtual const char* name() = 0;
-  virtual ImageFileReader* image() = 0;
+  virtual JImageFile* jimage() = 0;
   // Constructor
   ClassPathEntry();
   // Attempt to locate file_name through this class path entry.
@@ -70,7 +69,7 @@
  public:
   bool is_jar_file()       { return false;  }
   const char* name()       { return _dir; }
-  ImageFileReader* image() { return NULL; }
+  JImageFile* jimage()     { return NULL; }
   ClassPathDirEntry(const char* dir);
   ClassFileStream* open_stream(const char* name, TRAPS);
   // Debugging
@@ -100,7 +99,7 @@
  public:
   bool is_jar_file()       { return true;  }
   const char* name()       { return _zip_name; }
-  ImageFileReader* image() { return NULL; }
+  JImageFile* jimage()     { return NULL; }
   ClassPathZipEntry(jzfile* zip, const char* zip_name);
   ~ClassPathZipEntry();
   u1* open_entry(const char* name, jint* filesize, bool nul_terminate, TRAPS);
@@ -115,16 +114,16 @@
 // For java image files
 class ClassPathImageEntry: public ClassPathEntry {
 private:
-  ImageFileReader* _image;
-  ImageModuleData* _module_data;
+  JImageFile* _jimage;
+  const char* _name;
 public:
   bool is_jar_file()  { return false;  }
-  bool is_open()  { return _image != NULL; }
-  const char* name();
-  ImageFileReader* image() { return _image; }
-  ImageModuleData* module_data() { return _module_data; }
-  ClassPathImageEntry(ImageFileReader* image);
+  bool is_open()  { return _jimage != NULL; }
+  const char* name() { return _name == NULL ? "" : _name; }
+  JImageFile* jimage() { return _jimage; }
+  ClassPathImageEntry(JImageFile* jimage, const char* name);
   ~ClassPathImageEntry();
+  static void name_to_package(const char* name, char* buffer, int length);
   ClassFileStream* open_stream(const char* name, TRAPS);
 
   // Debugging
@@ -206,6 +205,7 @@
   static void setup_search_path(const char *class_path);
 
   static void load_zip_library();
+  static void load_jimage_library();
   static ClassPathEntry* create_class_path_entry(const char *path, const struct stat* st,
                                                  bool throw_exception, TRAPS);
 
--- a/hotspot/src/share/vm/classfile/imageDecompressor.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#include "precompiled.hpp"
-#include "runtime/thread.inline.hpp"
-#include "classfile/imageDecompressor.hpp"
-#include "runtime/thread.hpp"
-#include "utilities/bytes.hpp"
-
-/*
- * Allocate in C Heap not in resource area, otherwise JVM crashes.
- * This array life time is the VM life time. Array is never freed and
- * is not expected to contain more than few references.
- */
-GrowableArray<ImageDecompressor*>* ImageDecompressor::_decompressors =
-  new(ResourceObj::C_HEAP, mtInternal) GrowableArray<ImageDecompressor*>(2, true);
-
-static Symbol* createSymbol(const char* str) {
-  Thread* THREAD = Thread::current();
-  Symbol* sym = SymbolTable::lookup(str, (int) strlen(str), THREAD);
-  if (HAS_PENDING_EXCEPTION) {
-    warning("can't create symbol\n");
-    CLEAR_PENDING_EXCEPTION;
-    return NULL;
-  }
-  return sym;
-}
-
-/*
- * Initialize the array of decompressors.
- */
-bool image_decompressor_init() {
-  Symbol* zipSymbol = createSymbol("zip");
-  if (zipSymbol == NULL) {
-    return false;
-  }
-  ImageDecompressor::add_decompressor(new ZipDecompressor(zipSymbol));
-
-  return true;
-}
-
-/*
- * Decompression entry point. Called from ImageFileReader::get_resource.
- */
-void ImageDecompressor::decompress_resource(u1* compressed, u1* uncompressed,
-        u4 uncompressed_size, const ImageStrings* strings, bool is_C_heap) {
-  bool has_header = false;
-  u1* decompressed_resource = compressed;
-  u1* compressed_resource = compressed;
-
-  // Resource could have been transformed by a stack of decompressors.
-  // Iterate and decompress resources until there is no more header.
-  do {
-    ResourceHeader _header;
-    memcpy(&_header, compressed_resource, sizeof (ResourceHeader));
-    has_header = _header._magic == ResourceHeader::resource_header_magic;
-    if (has_header) {
-      // decompressed_resource array contains the result of decompression
-      // when a resource content is terminal, it means that it is an actual resource,
-      // not an intermediate not fully uncompressed content. In this case
-      // the resource is allocated as an mtClass, otherwise as an mtOther
-      decompressed_resource = is_C_heap && _header._is_terminal ?
-              NEW_C_HEAP_ARRAY(u1, _header._uncompressed_size, mtClass) :
-              NEW_C_HEAP_ARRAY(u1, _header._uncompressed_size, mtOther);
-      // Retrieve the decompressor name
-      const char* decompressor_name = strings->get(_header._decompressor_name_offset);
-      if (decompressor_name == NULL) warning("image decompressor not found\n");
-      guarantee(decompressor_name, "image decompressor not found");
-      // Retrieve the decompressor instance
-      ImageDecompressor* decompressor = get_decompressor(decompressor_name);
-      if (decompressor == NULL) {
-        warning("image decompressor %s not found\n", decompressor_name);
-      }
-      guarantee(decompressor, "image decompressor not found");
-      u1* compressed_resource_base = compressed_resource;
-      compressed_resource += ResourceHeader::resource_header_length;
-      // Ask the decompressor to decompress the compressed content
-      decompressor->decompress_resource(compressed_resource, decompressed_resource,
-        &_header, strings);
-      if (compressed_resource_base != compressed) {
-        FREE_C_HEAP_ARRAY(char, compressed_resource_base);
-      }
-      compressed_resource = decompressed_resource;
-    }
-  } while (has_header);
-  memcpy(uncompressed, decompressed_resource, uncompressed_size);
-}
-
-// Zip decompressor
-
-void ZipDecompressor::decompress_resource(u1* data, u1* uncompressed,
-        ResourceHeader* header, const ImageStrings* strings) {
-  char* msg = NULL;
-  jboolean res = ClassLoader::decompress(data, header->_size, uncompressed,
-          header->_uncompressed_size, &msg);
-  if (!res) warning("decompression failed due to %s\n", msg);
-  guarantee(res, "decompression failed");
-}
-
-// END Zip Decompressor
--- a/hotspot/src/share/vm/classfile/imageDecompressor.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_VM_CLASSFILE_IMAGEDECOMPRESSOR_HPP
-#define SHARE_VM_CLASSFILE_IMAGEDECOMPRESSOR_HPP
-
-#include "runtime/thread.inline.hpp"
-#include "classfile/classLoader.hpp"
-#include "classfile/imageFile.hpp"
-#include "classfile/symbolTable.hpp"
-#include "oops/symbol.hpp"
-#include "utilities/growableArray.hpp"
-
-/*
- * Compressed resources located in image have an header.
- * This header contains:
- * - _magic: A magic u4, required to retrieved the header in the compressed content
- * - _size: The size of the compressed resource.
- * - _uncompressed_size: The uncompressed size of the compressed resource.
- * - _decompressor_name_offset: The ImageDecompressor instance name StringsTable offset.
- * - _decompressor_config_offset: StringsTable offset of configuration that could be needed by
- *   the decompressor in order to decompress.
- * - _is_terminal: 1: the compressed content is terminal. Uncompressing it would
- *   create the actual resource. 0: the compressed content is not terminal. Uncompressing it
- *   will result in a compressed content to be decompressed (This occurs when a stack of compressors
- *   have been used to compress the resource.
- */
-struct ResourceHeader {
-  /* Length of header, needed to retrieve content offset */
-  static const u1 resource_header_length = 21;
-  /* magic bytes that identifies a compressed resource header*/
-  static const u4 resource_header_magic = 0xCAFEFAFA;
-  u4 _magic; // Resource header
-  u4 _size;  // Resource size
-  u4 _uncompressed_size;  // Expected uncompressed size
-  u4 _decompressor_name_offset;  // Strings table decompressor offset
-  u4 _decompressor_config_offset; // Strings table config offset
-  u1 _is_terminal; // Last decompressor 1, otherwise 0.
-};
-
-/*
- * Resources located in jimage file can be compressed. Compression occurs at
- * jimage file creation time. When compressed a resource is added an header that
- * contains the name of the compressor that compressed it.
- * Various compression strategies can be applied to compress a resource.
- * The same resource can even be compressed multiple time by a stack of compressors.
- * At runtime, a resource is decompressed in a loop until there is no more header
- * meaning that the resource is equivalent to the not compressed resource.
- * In each iteration, the name of the compressor located in the current header
- * is used to retrieve the associated instance of ImageDecompressor.
- * For example “zip” is the name of the compressor that compresses resources
- * using the zip algorithm. The ZipDecompressor class name is also “zip”.
- * ImageDecompressor instances are retrieved from a static array in which
- * they are registered.
- */
-class ImageDecompressor: public CHeapObj<mtClass> {
-
-private:
-  const Symbol* _name;
-
-  /*
-   * Array of concrete decompressors. This array is used to retrieve the decompressor
-   * that can handle resource decompression.
-   */
-  static GrowableArray<ImageDecompressor*>* _decompressors;
-
-  /*
-   * Identifier of a decompressor. This name is the identification key to retrieve
-   * decompressor from a resource header.
-   */
-  inline const Symbol* get_name() const { return _name; }
-
-protected:
-  ImageDecompressor(const Symbol* name) : _name(name) {
-  }
-  virtual void decompress_resource(u1* data, u1* uncompressed,
-    ResourceHeader* header, const ImageStrings* strings) = 0;
-
-public:
-  inline static void add_decompressor(ImageDecompressor* decompressor) {
-    _decompressors->append(decompressor);
-  }
-  inline static ImageDecompressor* get_decompressor(const char * decompressor_name) {
-    Thread* THREAD = Thread::current();
-    TempNewSymbol sym = SymbolTable::new_symbol(decompressor_name,
-            (int) strlen(decompressor_name), CHECK_NULL);
-    if (HAS_PENDING_EXCEPTION) {
-      warning("can't create symbol\n");
-      CLEAR_PENDING_EXCEPTION;
-      return NULL;
-    }
-    for (int i = 0; i < _decompressors->length(); i++) {
-      ImageDecompressor* decompressor = _decompressors->at(i);
-      if (decompressor->get_name()->fast_compare(sym) == 0) {
-        return decompressor;
-      }
-    }
-    guarantee(false, "No decompressor found.");
-    return NULL;
-  }
-  static void decompress_resource(u1* compressed, u1* uncompressed,
-    u4 uncompressed_size, const ImageStrings* strings, bool is_C_heap);
-};
-
-/**
- * Zip decompressor.
- */
-class ZipDecompressor : public ImageDecompressor {
-public:
-  ZipDecompressor(const Symbol* sym) : ImageDecompressor(sym) { }
-  void decompress_resource(u1* data, u1* uncompressed, ResourceHeader* header,
-    const ImageStrings* strings);
-};
-
-#endif // SHARE_VM_CLASSFILE_IMAGEDECOMPRESSOR_HPP
--- a/hotspot/src/share/vm/classfile/imageFile.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,546 +0,0 @@
-/*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License