changeset 9285:720d0ff40323

Merge
author thartmann
date Mon, 14 Sep 2015 07:03:04 +0000
parents 69ed894b4045 41b6cb9246fe
children 486680e6ed5e
files src/share/vm/classfile/imageDecompressor.cpp src/share/vm/classfile/imageDecompressor.hpp src/share/vm/classfile/imageFile.cpp src/share/vm/classfile/imageFile.hpp src/share/vm/utilities/endian.cpp src/share/vm/utilities/endian.hpp test/runtime/modules/ImageFile/ImageAttributeOffsetsTest.java test/runtime/modules/ImageFile/ImageCloseTest.java test/runtime/modules/ImageFile/ImageFileHeaderTest.java test/runtime/modules/ImageFile/ImageFindAttributesTest.java test/runtime/modules/ImageFile/ImageGetAttributesTest.java test/runtime/modules/ImageFile/ImageGetDataAddressTest.java test/runtime/modules/ImageFile/ImageGetIndexAddressTest.java test/runtime/modules/ImageFile/ImageGetStringBytesTest.java test/runtime/modules/ImageFile/ImageOpenTest.java test/runtime/modules/ImageFile/ImageReadTest.java test/runtime/modules/ImageFile/LocationConstants.java
diffstat 279 files changed, 8420 insertions(+), 6919 deletions(-) [+]
line wrap: on
line diff
--- a/.hgtags	Mon Sep 14 07:02:50 2015 +0200
+++ b/.hgtags	Mon Sep 14 07:03:04 2015 +0000
@@ -482,3 +482,4 @@
 e66c3813789debfc06f206afde1bf7a84cb08451 jdk9-b77
 20dc06b04fe5ec373879414d60ef82ac70faef98 jdk9-b78
 e9e63d93bbfe2c6c23447e2c1f5cc71c98671cba jdk9-b79
+8e8377739c06b99b9011c003c77e0bef84c91e09 jdk9-b80
--- a/make/Makefile	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/Makefile	Mon Sep 14 07:03:04 2015 +0000
@@ -633,9 +633,9 @@
 
 update_jdk: export_product_jdk export_fastdebug_jdk test_jdk
 
-copy_jdk: $(JDK_IMAGE_DIR)/jre/lib/rt.jar
+copy_jdk: $(JDK_IMAGE_DIR)/bin/java
 
-$(JDK_IMAGE_DIR)/jre/lib/rt.jar:
+$(JDK_IMAGE_DIR)/bin/java:
 	$(RM) -r $(JDK_IMAGE_DIR)
 	$(MKDIR) -p $(JDK_IMAGE_DIR)
 	($(CD) $(JDK_IMPORT_PATH) && \
--- a/make/aix/makefiles/mapfile-vers-debug	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/aix/makefiles/mapfile-vers-debug	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/make/aix/makefiles/mapfile-vers-product	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/aix/makefiles/mapfile-vers-product	Mon Sep 14 07:03:04 2015 +0000
@@ -139,18 +139,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/make/bsd/makefiles/mapfile-vers-darwin-debug	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/bsd/makefiles/mapfile-vers-darwin-debug	Mon Sep 14 07:03:04 2015 +0000
@@ -139,18 +139,6 @@
                 _JVM_Halt
                 _JVM_HoldsLock
                 _JVM_IHashCode
-                _JVM_ImageAttributeOffsets
-                _JVM_ImageAttributeOffsetsLength
-                _JVM_ImageClose
-                _JVM_ImageFindAttributes
-                _JVM_ImageGetAttributes
-                _JVM_ImageGetAttributesCount
-                _JVM_ImageGetDataAddress
-                _JVM_ImageGetIndexAddress
-                _JVM_ImageGetStringBytes
-                _JVM_ImageOpen
-                _JVM_ImageRead
-                _JVM_ImageReadCompressed
                 _JVM_InitAgentProperties
                 _JVM_InitProperties
                 _JVM_InternString
--- a/make/bsd/makefiles/mapfile-vers-darwin-product	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/bsd/makefiles/mapfile-vers-darwin-product	Mon Sep 14 07:03:04 2015 +0000
@@ -139,18 +139,6 @@
                 _JVM_Halt
                 _JVM_HoldsLock
                 _JVM_IHashCode
-                _JVM_ImageAttributeOffsets
-                _JVM_ImageAttributeOffsetsLength
-                _JVM_ImageClose
-                _JVM_ImageFindAttributes
-                _JVM_ImageGetAttributes
-                _JVM_ImageGetAttributesCount
-                _JVM_ImageGetDataAddress
-                _JVM_ImageGetIndexAddress
-                _JVM_ImageGetStringBytes
-                _JVM_ImageOpen
-                _JVM_ImageRead
-                _JVM_ImageReadCompressed
                 _JVM_InitAgentProperties
                 _JVM_InitProperties
                 _JVM_InternString
--- a/make/bsd/makefiles/mapfile-vers-debug	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/bsd/makefiles/mapfile-vers-debug	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/make/bsd/makefiles/mapfile-vers-product	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/bsd/makefiles/mapfile-vers-product	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/make/bsd/makefiles/vm.make	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/bsd/makefiles/vm.make	Mon Sep 14 07:03:04 2015 +0000
@@ -131,7 +131,7 @@
 # By default, link the *.o into the library, not the executable.
 LINK_INTO$(LINK_INTO) = LIBJVM
 
-JDK_LIBDIR = $(JAVA_HOME)/jre/lib/$(LIBARCH)
+JDK_LIBDIR = $(JAVA_HOME)/lib/$(LIBARCH)
 
 #----------------------------------------------------------------------
 # jvm_db & dtrace
--- a/make/build.sh	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/build.sh	Mon Sep 14 07:03:04 2015 +0000
@@ -49,7 +49,7 @@
 # Just in case:
 JAVA_HOME=`( cd $JAVA_HOME; pwd )`
 
-if [ "${ALT_BOOTDIR-}" = ""  -o  ! -d "${ALT_BOOTDIR-}" -o ! -d ${ALT_BOOTDIR-}/jre/lib/ ]; then
+if [ "${ALT_BOOTDIR-}" = ""  -o  ! -d "${ALT_BOOTDIR-}" -o ! -d ${ALT_BOOTDIR-}/lib/ ]; then
     ALT_BOOTDIR=${JAVA_HOME}
 fi
 
--- a/make/hotspot.script	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/hotspot.script	Mon Sep 14 07:03:04 2015 +0000
@@ -127,7 +127,7 @@
 #     o		$JRE/lib/$ARCH
 # followed by the user's previous effective LD_LIBRARY_PATH, if
 # any.
-JRE=$JDK/jre
+JRE=$JDK
 JAVA_HOME=$JDK
 export JAVA_HOME
 
--- a/make/linux/makefiles/mapfile-vers-debug	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/linux/makefiles/mapfile-vers-debug	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/make/linux/makefiles/mapfile-vers-product	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/linux/makefiles/mapfile-vers-product	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/make/solaris/makefiles/adlc.make	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/solaris/makefiles/adlc.make	Mon Sep 14 07:03:04 2015 +0000
@@ -76,6 +76,11 @@
 ifeq ($(shell expr $(COMPILER_REV_NUMERIC) \>= 509), 1)
   CFLAGS_WARN = +w -errwarn
 endif
+# When using compiler version 5.13 (Solaris Studio 12.4), calls to explicitly 
+# instantiated template functions trigger this warning when +w is active.
+ifeq ($(shell expr $(COMPILER_REV_NUMERIC) \>= 513), 1)
+  CFLAGS_WARN += -erroff=notemsource
+endif
 CFLAGS += $(CFLAGS_WARN)
 
 ifeq ("${Platform_compiler}", "sparcWorks")
--- a/make/solaris/makefiles/buildtree.make	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/solaris/makefiles/buildtree.make	Mon Sep 14 07:03:04 2015 +0000
@@ -270,6 +270,7 @@
 	echo "CP ?= cp"; \
 	echo "MV ?= mv"; \
 	echo "include \$$(GAMMADIR)/make/$(OS_FAMILY)/makefiles/$(VARIANT).make"; \
+	echo "include \$$(GAMMADIR)/make/excludeSrc.make"; \
 	echo "include \$$(GAMMADIR)/make/$(OS_FAMILY)/makefiles/$(COMPILER).make"; \
 	) > $@
 
--- a/make/solaris/makefiles/mapfile-vers	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/solaris/makefiles/mapfile-vers	Mon Sep 14 07:03:04 2015 +0000
@@ -141,18 +141,6 @@
                 JVM_Halt;
                 JVM_HoldsLock;
                 JVM_IHashCode;
-                JVM_ImageAttributeOffsets;
-                JVM_ImageAttributeOffsetsLength;
-                JVM_ImageClose;
-                JVM_ImageFindAttributes;
-                JVM_ImageGetAttributes;
-                JVM_ImageGetAttributesCount;
-                JVM_ImageGetDataAddress;
-                JVM_ImageGetIndexAddress;
-                JVM_ImageGetStringBytes;
-                JVM_ImageOpen;
-                JVM_ImageRead;
-                JVM_ImageReadCompressed;
                 JVM_InitAgentProperties;
                 JVM_InitProperties;
                 JVM_InternString;
--- a/make/solaris/makefiles/vm.make	Mon Sep 14 07:02:50 2015 +0200
+++ b/make/solaris/makefiles/vm.make	Mon Sep 14 07:03:04 2015 +0000
@@ -197,7 +197,7 @@
 Src_Dirs/COMPILER2 := $(CORE_PATHS) $(COMPILER2_PATHS)
 Src_Dirs/TIERED    := $(CORE_PATHS) $(COMPILER1_PATHS) $(COMPILER2_PATHS)
 Src_Dirs/ZERO      := $(CORE_PATHS)
-Src_Dirs/SHARK     := $(CORE_PATHS)
+Src_Dirs/SHARK     := $(CORE_PATHS) $(SHARK_PATHS)
 Src_Dirs := $(Src_Dirs/$(TYPE))
 
 COMPILER2_SPECIFIC_FILES := opto libadt bcEscapeAnalyzer.cpp c2_\* runtime_\*
@@ -206,7 +206,7 @@
 ZERO_SPECIFIC_FILES      := zero
 
 # Always exclude these.
-Src_Files_EXCLUDE := dtrace jsig.c jvmtiEnvRecommended.cpp jvmtiEnvStub.cpp
+Src_Files_EXCLUDE += dtrace jsig.c jvmtiEnvRecommended.cpp jvmtiEnvStub.cpp
 
 # Exclude per type.
 Src_Files_EXCLUDE/CORE      := $(COMPILER1_SPECIFIC_FILES) $(COMPILER2_SPECIFIC_FILES) $(ZERO_SPECIFIC_FILES) $(SHARK_SPECIFIC_FILES) ciTypeFlow.cpp
--- a/src/cpu/aarch64/vm/aarch64.ad	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/aarch64/vm/aarch64.ad	Mon Sep 14 07:03:04 2015 +0000
@@ -3803,81 +3803,37 @@
 
   enc_class aarch64_enc_cmpxchg(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
     MacroAssembler _masm(&cbuf);
-    Register old_reg = as_Register($oldval$$reg);
-    Register new_reg = as_Register($newval$$reg);
-    Register base = as_Register($mem$$base);
-    Register addr_reg;
-    int index = $mem$$index;
-    int scale = $mem$$scale;
-    int disp = $mem$$disp;
-    if (index == -1) {
-       if (disp != 0) {
-        __ lea(rscratch2, Address(base, disp));
-        addr_reg = rscratch2;
-      } else {
-        // TODO
-        // should we ever get anything other than this case?
-        addr_reg = base;
-      }
-    } else {
-      Register index_reg = as_Register(index);
-      if (disp == 0) {
-        __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      } else {
-        __ lea(rscratch2, Address(base, disp));
-        __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      }
-    }
-    Label retry_load, done;
-    __ bind(retry_load);
-    __ ldxr(rscratch1, addr_reg);
-    __ cmp(rscratch1, old_reg);
-    __ br(Assembler::NE, done);
-    __ stlxr(rscratch1, new_reg, addr_reg);
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(done);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldxr, &MacroAssembler::cmp, &Assembler::stlxr);
   %}
 
   enc_class aarch64_enc_cmpxchgw(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
     MacroAssembler _masm(&cbuf);
-    Register old_reg = as_Register($oldval$$reg);
-    Register new_reg = as_Register($newval$$reg);
-    Register base = as_Register($mem$$base);
-    Register addr_reg;
-    int index = $mem$$index;
-    int scale = $mem$$scale;
-    int disp = $mem$$disp;
-    if (index == -1) {
-       if (disp != 0) {
-        __ lea(rscratch2, Address(base, disp));
-        addr_reg = rscratch2;
-      } else {
-        // TODO
-        // should we ever get anything other than this case?
-        addr_reg = base;
-      }
-    } else {
-      Register index_reg = as_Register(index);
-      if (disp == 0) {
-        __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      } else {
-        __ lea(rscratch2, Address(base, disp));
-        __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      }
-    }
-    Label retry_load, done;
-    __ bind(retry_load);
-    __ ldxrw(rscratch1, addr_reg);
-    __ cmpw(rscratch1, old_reg);
-    __ br(Assembler::NE, done);
-    __ stlxrw(rscratch1, new_reg, addr_reg);
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(done);
-  %}
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
+  %}
+
+
+  // The only difference between aarch64_enc_cmpxchg and
+  // aarch64_enc_cmpxchg_acq is that we use load-acquire in the
+  // CompareAndSwap sequence to serve as a barrier on acquiring a
+  // lock.
+  enc_class aarch64_enc_cmpxchg_acq(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
+    MacroAssembler _masm(&cbuf);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldaxr, &MacroAssembler::cmp, &Assembler::stlxr);
+  %}
+
+  enc_class aarch64_enc_cmpxchgw_acq(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
+    MacroAssembler _masm(&cbuf);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldaxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
+  %}
+
 
   // auxiliary used for CompareAndSwapX to set result register
   enc_class aarch64_enc_cset_eq(iRegINoSp res) %{
@@ -4373,12 +4329,12 @@
       return;
     }
 
-    if (UseBiasedLocking) {
-      __ biased_locking_enter(disp_hdr, oop, box, tmp, true, cont);
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
+      __ biased_locking_enter(box, oop, disp_hdr, tmp, true, cont);
     }
 
     // Handle existing monitor
-    if (EmitSync & 0x02) {
+    if ((EmitSync & 0x02) == 0) {
       // we can use AArch64's bit test and branch here but
       // markoopDesc does not define a bit index just the bit value
       // so assert in case the bit pos changes
@@ -4398,13 +4354,10 @@
 
     // Compare object markOop with mark and if equal exchange scratch1
     // with object markOop.
-    // Note that this is simply a CAS: it does not generate any
-    // barriers.  These are separately generated by
-    // membar_acquire_lock().
     {
       Label retry_load;
       __ bind(retry_load);
-      __ ldxr(tmp, oop);
+      __ ldaxr(tmp, oop);
       __ cmp(tmp, disp_hdr);
       __ br(Assembler::NE, cas_failed);
       // use stlxr to ensure update is immediately visible
@@ -4454,7 +4407,7 @@
       {
         Label retry_load, fail;
         __ bind(retry_load);
-        __ ldxr(rscratch1, tmp);
+        __ ldaxr(rscratch1, tmp);
         __ cmp(disp_hdr, rscratch1);
         __ br(Assembler::NE, fail);
         // use stlxr to ensure update is immediately visible
@@ -4518,7 +4471,7 @@
       return;
     }
 
-    if (UseBiasedLocking) {
+    if (UseBiasedLocking && !UseOptoBiasInlining) {
       __ biased_locking_exit(oop, tmp, cont);
     }
 
@@ -8017,10 +7970,10 @@
   match(MemBarAcquireLock);
   ins_cost(VOLATILE_REF_COST);
 
-  format %{ "membar_acquire_lock" %}
-
-  ins_encode %{
-    __ membar(Assembler::LoadLoad|Assembler::LoadStore);
+  format %{ "membar_acquire_lock (elided)" %}
+
+  ins_encode %{
+    __ block_comment("membar_acquire_lock (elided)");
   %}
 
   ins_pipe(pipe_serial);
@@ -8080,10 +8033,10 @@
   match(MemBarReleaseLock);
   ins_cost(VOLATILE_REF_COST);
 
-  format %{ "membar_release_lock" %}
-
-  ins_encode %{
-    __ membar(Assembler::LoadStore|Assembler::StoreStore);
+  format %{ "membar_release_lock (elided)" %}
+
+  ins_encode %{
+    __ block_comment("membar_release_lock (elided)");
   %}
 
   ins_pipe(pipe_serial);
@@ -8369,7 +8322,11 @@
   ins_pipe(pipe_serial);
 %}
 
-// this has to be implemented as a CAS
+
+// storeLConditional is used by PhaseMacroExpand::expand_lock_node
+// when attempting to rebias a lock towards the current thread.  We
+// must use the acquire form of cmpxchg in order to guarantee acquire
+// semantics in this case.
 instruct storeLConditional(indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr)
 %{
   match(Set cr (StoreLConditional mem (Binary oldval newval)));
@@ -8381,12 +8338,14 @@
     "cmpw rscratch1, zr\t# EQ on successful write"
   %}
 
-  ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval));
+  ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval));
 
   ins_pipe(pipe_slow);
 %}
 
-// this has to be implemented as a CAS
+// storeIConditional also has acquire semantics, for no better reason
+// than matching storeLConditional.  At the time of writing this
+// comment storeIConditional was not used anywhere by AArch64.
 instruct storeIConditional(indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr)
 %{
   match(Set cr (StoreIConditional mem (Binary oldval newval)));
@@ -8398,7 +8357,7 @@
     "cmpw rscratch1, zr\t# EQ on successful write"
   %}
 
-  ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval));
+  ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval));
 
   ins_pipe(pipe_slow);
 %}
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1210,7 +1210,7 @@
 
   INSN(ldrs, 0b00, 1);
   INSN(ldrd, 0b01, 1);
-  INSN(ldrq, 0x10, 1);
+  INSN(ldrq, 0b10, 1);
 
 #undef INSN
 
@@ -2285,13 +2285,13 @@
 #undef INSN
 
   // Table vector lookup
-#define INSN(NAME, op)                                                                                       \
-  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, unsigned registers, FloatRegister Vm) {  \
-    starti;                                                                                                  \
-    assert(T == T8B || T == T16B, "invalid arrangement");                                                    \
-    assert(0 < registers && registers <= 4, "invalid number of registers");                                  \
-    f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21), rf(Vm, 16), f(0, 15);                               \
-    f(registers - 1, 14, 13), f(op, 12),f(0b00, 11, 10), rf(Vn, 5), rf(Vd, 0);                               \
+#define INSN(NAME, op)                                                  \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, unsigned registers, FloatRegister Vm) { \
+    starti;                                                             \
+    assert(T == T8B || T == T16B, "invalid arrangement");               \
+    assert(0 < registers && registers <= 4, "invalid number of registers"); \
+    f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21), rf(Vm, 16), f(0, 15); \
+    f(registers - 1, 14, 13), f(op, 12),f(0b00, 11, 10), rf(Vn, 5), rf(Vd, 0); \
   }
 
   INSN(tbl, 0);
@@ -2299,6 +2299,7 @@
 
 #undef INSN
 
+  // AdvSIMD two-reg misc
 #define INSN(NAME, U, opcode)                                                       \
   void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {               \
        starti;                                                                      \
@@ -2316,10 +2317,19 @@
 
 #define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H)
   INSN(rev32, 1, 0b00000);
+private:
+  INSN(_rbit, 1, 0b00101);
+public:
+
 #undef ASSERTION
 
 #define ASSERTION (T == T8B || T == T16B)
   INSN(rev16, 0, 0b00001);
+  // RBIT only allows T8B and T16B but encodes them oddly.  Argh...
+  void rbit(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {
+    assert((ASSERTION), MSG);
+    _rbit(Vd, SIMD_Arrangement(T & 1 | 0b010), Vn);
+  }
 #undef ASSERTION
 
 #undef MSG
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -3043,7 +3043,9 @@
   // register obj is destroyed afterwards.
 
   BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+  assert(bs->kind() == BarrierSet::CardTableForRS ||
+         bs->kind() == BarrierSet::CardTableExtension,
+         "Wrong barrier set kind");
 
   CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -917,6 +917,8 @@
 
   void cmpptr(Register src1, Address src2);
 
+  // Various forms of CAS
+
   void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                   Label &suceed, Label *fail);
 
@@ -938,6 +940,23 @@
     str(rscratch2, adr);
   }
 
+  // A generic CAS; success or failure is in the EQ flag.
+  template <typename T1, typename T2>
+  void cmpxchg(Register addr, Register expected, Register new_val,
+               T1 load_insn,
+               void (MacroAssembler::*cmp_insn)(Register, Register),
+               T2 store_insn,
+               Register tmp = rscratch1) {
+    Label retry_load, done;
+    bind(retry_load);
+    (this->*load_insn)(tmp, addr);
+    (this->*cmp_insn)(tmp, expected);
+    br(Assembler::NE, done);
+    (this->*store_insn)(tmp, new_val, addr);
+    cbnzw(tmp, retry_load);
+    bind(done);
+  }
+
   // Calls
 
   address trampoline_call(Address entry, CodeBuffer *cbuf = NULL);
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -691,7 +691,7 @@
         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
         __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp        }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -731,7 +731,7 @@
           __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp        }
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
@@ -2364,7 +2364,7 @@
    *   c_rarg3   - int* table
    *
    * Ouput:
-   *       rax   - int crc result
+   *       r0   - int crc result
    */
   address generate_updateBytesCRC32C() {
     assert(UseCRC32CIntrinsics, "what are we doing here?");
@@ -2435,6 +2435,69 @@
     return start;
   }
 
+  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
+                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
+                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
+    // Karatsuba multiplication performs a 128*128 -> 256-bit
+    // multiplication in three 64*64 -> 128-bit multiplications and a
+    // few additions.
+    //
+    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
+    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
+    //
+    // Inputs:
+    //
+    // A0 in a.d[0]     (subkey)
+    // A1 in a.d[1]
+    // (A1+A0) in a1_xor_a0.d[0]
+    //
+    // B0 in b.d[0]     (state)
+    // B1 in b.d[1]
+
+    __ ext(tmp1, __ T16B, b, b, 0x08);
+    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
+    __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
+    __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
+    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
+
+    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
+    __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
+    __ eor(tmp2, __ T16B, tmp2, tmp4);
+    __ eor(tmp2, __ T16B, tmp2, tmp3);
+
+    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
+    __ ins(result_hi, __ D, tmp2, 0, 1);
+    __ ins(result_lo, __ D, tmp2, 1, 0);
+  }
+
+  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
+                    FloatRegister p, FloatRegister z, FloatRegister t1) {
+    const FloatRegister t0 = result;
+
+    // The GCM field polynomial f is z^128 + p(z), where p =
+    // z^7+z^2+z+1.
+    //
+    //    z^128 === -p(z)  (mod (z^128 + p(z)))
+    //
+    // so, given that the product we're reducing is
+    //    a == lo + hi * z^128
+    // substituting,
+    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
+    //
+    // we reduce by multiplying hi by p(z) and subtracting the result
+    // from (i.e. XORing it with) lo.  Because p has no nonzero high
+    // bits we can do this with two 64-bit multiplications: the high
+    // and low 64-bit halves of hi, each multiplied by p.
+
+    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
+    __ ext(t1, __ T16B, t0, z, 8);
+    __ eor(hi, __ T16B, hi, t1);
+    __ ext(t1, __ T16B, z, t0, 8);
+    __ eor(lo, __ T16B, lo, t1);
+    __ pmull(t0, __ T1Q, hi, p, __ T1D);
+    __ eor(result, __ T16B, lo, t0);
+  }
+
   /**
    *  Arguments:
    *
@@ -2448,10 +2511,27 @@
    *  Updated state at c_rarg0
    */
   address generate_ghash_processBlocks() {
+    // Bafflingly, GCM uses little-endian for the byte order, but
+    // big-endian for the bit order.  For example, the polynomial 1 is
+    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
+    //
+    // So, we must either reverse the bytes in each word and do
+    // everything big-endian or reverse the bits in each byte and do
+    // it little-endian.  On AArch64 it's more idiomatic to reverse
+    // the bits in each byte (we have an instruction, RBIT, to do
+    // that) and keep the data in little-endian bit order throughout the
+    // calculation, bit-reversing the inputs and outputs.
+
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+    __ align(wordSize * 2);
+    address p = __ pc();
+    __ emit_int64(0x87);  // The low-order bits of the field
+                          // polynomial (i.e. p = z^7+z^2+z+1)
+                          // repeated in the low and high parts of a
+                          // 128-bit vector
+    __ emit_int64(0x87);
+
     __ align(CodeEntryAlignment);
-    Label L_ghash_loop, L_exit;
-
-    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
     address start = __ pc();
 
     Register state   = c_rarg0;
@@ -2462,104 +2542,43 @@
     FloatRegister vzr = v30;
     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 
-    __ mov(v26, __ T16B, 1);
-    __ mov(v27, __ T16B, 63);
-    __ mov(v28, __ T16B, 62);
-    __ mov(v29, __ T16B, 57);
-
-    __ ldrq(v6, Address(state));
-    __ ldrq(v16, Address(subkeyH));
-
-    __ ext(v0, __ T16B, v6, v6, 0x08);
-    __ ext(v1, __ T16B, v16, v16, 0x08);
-    __ eor(v16, __ T16B, v16, v1);
-
-    __ bind(L_ghash_loop);
-
-    __ ldrq(v2, Address(__ post(data, 0x10)));
-    __ rev64(v2, __ T16B, v2); // swap data
-
-    __ ext(v6, __ T16B, v0, v0, 0x08);
-    __ eor(v6, __ T16B, v6, v2);
-    __ ext(v2, __ T16B, v6, v6, 0x08);
-
-    __ pmull2(v7, __ T1Q, v2, v1, __ T2D);  // A1*B1
-    __ eor(v6, __ T16B, v6, v2);
-    __ pmull(v5,  __ T1Q, v2, v1, __ T1D);  // A0*B0
-    __ pmull(v20, __ T1Q, v6, v16, __ T1D);  // (A1 + A0)(B1 + B0)
-
-    __ ext(v21, __ T16B, v5, v7, 0x08);
-    __ eor(v18, __ T16B, v7, v5); // A1*B1 xor A0*B0
-    __ eor(v20, __ T16B, v20, v21);
-    __ eor(v20, __ T16B, v20, v18);
-
-    // Registers pair <v7:v5> holds the result of carry-less multiplication
-    __ ins(v7, __ D, v20, 0, 1);
-    __ ins(v5, __ D, v20, 1, 0);
-
-    // Result of the multiplication is shifted by one bit position
-    // [X3:X2:X1:X0] = [X3:X2:X1:X0] << 1
-    __ ushr(v18, __ T2D, v5, -63 & 63);
-    __ ins(v25, __ D, v18, 1, 0);
-    __ ins(v25, __ D, vzr, 0, 0);
-    __ ushl(v5, __ T2D, v5, v26);
-    __ orr(v5, __ T16B, v5, v25);
-
-    __ ushr(v19, __ T2D, v7, -63 & 63);
-    __ ins(v19, __ D, v19, 1, 0);
-    __ ins(v19, __ D, v18, 0, 1);
-    __ ushl(v7, __ T2D, v7, v26);
-    __ orr(v6, __ T16B, v7, v19);
-
-    __ ins(v24, __ D, v5, 0, 1);
-
-    // A = X0 << 63
-    __ ushl(v21, __ T2D, v5, v27);
-
-    // A = X0 << 62
-    __ ushl(v22, __ T2D, v5, v28);
-
-    // A = X0 << 57
-    __ ushl(v23, __ T2D, v5, v29);
-
-    // D = X1^A^B^C
-    __ eor(v21, __ T16B, v21, v22);
-    __ eor(v21, __ T16B, v21, v23);
-    __ eor(v21, __ T16B, v21, v24);
-    __ ins(v5, __ D, v21, 1, 0);
-
-    // [E1:E0] = [D:X0] >> 1
-    __ ushr(v20, __ T2D, v5, -1 & 63);
-    __ ushl(v18, __ T2D, v5, v27);
-    __ ext(v25, __ T16B, v18, vzr, 0x08);
-    __ orr(v19, __ T16B, v20, v25);
-
-    __ eor(v7, __ T16B, v5, v19);
-
-    // [F1:F0] = [D:X0] >> 2
-    __ ushr(v20, __ T2D, v5, -2 & 63);
-    __ ushl(v18, __ T2D, v5, v28);
-    __ ins(v25, __ D, v18, 0, 1);
-    __ orr(v19, __ T16B, v20, v25);
-
-    __ eor(v7, __ T16B, v7, v19);
-
-    // [G1:G0] = [D:X0] >> 7
-    __ ushr(v20, __ T2D, v5, -7 & 63);
-    __ ushl(v18, __ T2D, v5, v29);
-    __ ins(v25, __ D, v18, 0, 1);
-    __ orr(v19, __ T16B, v20, v25);
-
-    // [H1:H0] = [D^E1^F1^G1:X0^E0^F0^G0]
-    __ eor(v7, __ T16B, v7, v19);
-
-    // Result = [H1:H0]^[X3:X2]
-    __ eor(v0, __ T16B, v7, v6);
-
-    __ subs(blocks, blocks, 1);
-    __ cbnz(blocks, L_ghash_loop);
-
-    __ ext(v1, __ T16B, v0, v0, 0x08);
+    __ ldrq(v0, Address(state));
+    __ ldrq(v1, Address(subkeyH));
+
+    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
+    __ rbit(v0, __ T16B, v0);
+    __ rev64(v1, __ T16B, v1);
+    __ rbit(v1, __ T16B, v1);
+
+    __ ldrq(v26, p);
+
+    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
+    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+
+    {
+      Label L_ghash_loop;
+      __ bind(L_ghash_loop);
+
+      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
+                                                 // reversing each byte
+      __ rbit(v2, __ T16B, v2);
+      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
+
+      // Multiply state in v2 by subkey in v1
+      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
+                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
+                     /*temps*/v6, v20, v18, v21);
+      // Reduce v7:v5 by the field polynomial
+      ghash_reduce(v0, v5, v7, v26, vzr, v20);
+
+      __ sub(blocks, blocks, 1);
+      __ cbnz(blocks, L_ghash_loop);
+    }
+
+    // The bit-reversed result is at this point in v0
+    __ rev64(v1, __ T16B, v0);
+    __ rbit(v1, __ T16B, v1);
+
     __ st1(v1, __ T16B, state);
     __ ret(lr);
 
--- a/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -186,7 +186,7 @@
       }
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       {
         if (val == noreg) {
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -177,6 +177,12 @@
   if (UseCRC32 && (auxv & HWCAP_CRC32) == 0) {
     warning("UseCRC32 specified, but not supported on this CPU");
   }
+
+  if (UseAdler32Intrinsics) {
+    warning("Adler32Intrinsics not available on this CPU.");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
   if (auxv & HWCAP_AES) {
     UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
     UseAESIntrinsics =
--- a/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -2614,7 +2614,7 @@
 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
   CardTableModRefBS* bs =
     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
-  assert(bs->kind() == BarrierSet::CardTableModRef ||
+  assert(bs->kind() == BarrierSet::CardTableForRS ||
          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
 #ifdef ASSERT
   cmpdi(CCR0, Rnew_val, 0);
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -656,7 +656,7 @@
           __ bind(filtered);
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -697,7 +697,7 @@
           }
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           Label Lskip_loop, Lstore_loop;
--- a/src/cpu/ppc/vm/templateTable_ppc_64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/ppc/vm/templateTable_ppc_64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -105,7 +105,7 @@
       }
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       {
         Label Lnull, Ldone;
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -200,6 +200,11 @@
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
+  if (UseAdler32Intrinsics) {
+    warning("Adler32Intrinsics not available on this CPU.");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     UseMultiplyToLenIntrinsic = true;
   }
--- a/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -3958,7 +3958,7 @@
   if (new_val == G0) return;
   CardTableModRefBS* bs =
     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
-  assert(bs->kind() == BarrierSet::CardTableModRef ||
+  assert(bs->kind() == BarrierSet::CardTableForRS ||
          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
   card_table_write(bs->byte_map_base, tmp, store_addr);
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cpu/sparc/vm/memset_with_concurrent_readers_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+
+#include "gc/shared/memset_with_concurrent_readers.hpp"
+#include "runtime/prefetch.inline.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/macros.hpp"
+
+#if INCLUDE_ALL_GCS
+
+// An implementation of memset, for use when there may be concurrent
+// readers of the region being stored into.
+//
+// We can't use the standard library memset if it is implemented using
+// block initializing stores.  Doing so can result in concurrent readers
+// seeing spurious zeros.
+//
+// We can't use the obvious C/C++ for-loop, because the compiler may
+// recognize the idiomatic loop and optimize it into a call to the
+// standard library memset; we've seen exactly this happen with, for
+// example, Solaris Studio 12.3.  Hence the use of inline assembly
+// code, hiding loops from the compiler's optimizer.
+//
+// We don't attempt to use the standard library memset when it is safe
+// to do so.  We could conservatively do so by detecting the presence
+// of block initializing stores (VM_Version::has_blk_init()), but the
+// implementation provided here should be sufficient.
+
+inline void fill_subword(void* start, void* end, int value) {
+  STATIC_ASSERT(BytesPerWord == 8);
+  assert(pointer_delta(end, start, 1) < BytesPerWord, "precondition");
+  // Store value into each of the (fewer than 8) bytes of [start, end) by jumping into a run of 7 byte stores.
+  void* pc;
+  __asm__ volatile(
+    // offset := (7 - (end - start)) + 3
+    //   3 instructions from rdpc to DISPATCH
+    " sub %[offset], %[end], %[offset]\n\t" // offset := start - end
+    " sllx %[offset], 2, %[offset]\n\t" // scale offset for instruction size of 4
+    " add %[offset], 40, %[offset]\n\t" // offset += 10 * instruction size
+    " rd %%pc, %[pc]\n\t"               // dispatch on scaled offset ('%%' emits a literal '%')
+    " jmpl %[pc]+%[offset], %%g0\n\t"
+    "  nop\n\t"
+    // DISPATCH: no direct reference, but without it the store block may be elided.
+    "1:\n\t"
+    " stb %[value], [%[end]-7]\n\t" // end[-7] = value
+    " stb %[value], [%[end]-6]\n\t"
+    " stb %[value], [%[end]-5]\n\t"
+    " stb %[value], [%[end]-4]\n\t"
+    " stb %[value], [%[end]-3]\n\t"
+    " stb %[value], [%[end]-2]\n\t"
+    " stb %[value], [%[end]-1]\n\t" // end[-1] = value
+    : /* only temporaries/overwritten outputs */
+      [pc] "=&r" (pc),              // temp, early-clobber
+      [offset] "+&r" (start)        // read-write: start in, scaled jump offset out
+    : [end] "r" (end),
+      [value] "r" (value)
+    : "memory");
+}
+
+void memset_with_concurrent_readers(void* to, int value, size_t size) {
+  Prefetch::write(to, 0);
+  void* end = static_cast<char*>(to) + size;
+  if (size >= BytesPerWord) {
+    // Fill any partial word prefix.
+    uintx* aligned_to = static_cast<uintx*>(align_ptr_up(to, BytesPerWord));
+    fill_subword(to, aligned_to, value);
+
+    // Compute fill word: replicate the low byte of value into all 8 bytes.
+    STATIC_ASSERT(BitsPerByte == 8);
+    STATIC_ASSERT(BitsPerWord == 64);
+    uintx xvalue = value & 0xff;
+    xvalue |= (xvalue << 8);
+    xvalue |= (xvalue << 16);
+    xvalue |= (xvalue << 32);
+
+    uintx* aligned_end = static_cast<uintx*>(align_ptr_down(end, BytesPerWord));
+    assert(aligned_to <= aligned_end, "invariant");
+
+    // for ( ; aligned_to < aligned_end; ++aligned_to) {
+    //   *aligned_to = xvalue;
+    // }
+    uintptr_t temp;
+    __asm__ volatile(
+      // Unroll loop x8.
+      " sub %[aend], %[ato], %[temp]\n\t"
+      " cmp %[temp], 56\n\t"           // cc := (aligned_end - aligned_to) > 7 words
+      " ba %%xcc, 2f\n\t"              // goto TEST always
+      "  sub %[aend], 56, %[temp]\n\t" // limit := aligned_end - 7 words
+      // LOOP:
+      "1:\n\t"                         // unrolled x8 store loop top
+      " cmp %[temp], %[ato]\n\t"       // cc := limit > (next) aligned_to
+      " stx %[xvalue], [%[ato]-64]\n\t" // store 8 words, aligned_to pre-incremented
+      " stx %[xvalue], [%[ato]-56]\n\t"
+      " stx %[xvalue], [%[ato]-48]\n\t"
+      " stx %[xvalue], [%[ato]-40]\n\t"
+      " stx %[xvalue], [%[ato]-32]\n\t"
+      " stx %[xvalue], [%[ato]-24]\n\t"
+      " stx %[xvalue], [%[ato]-16]\n\t"
+      " stx %[xvalue], [%[ato]-8]\n\t"
+      // TEST:
+      "2:\n\t"
+      " bgu,a %%xcc, 1b\n\t"           // goto LOOP if more than 7 words remaining
+      "  add %[ato], 64, %[ato]\n\t"   // aligned_to += 8, for next iteration
+      // Fill remaining < 8 full words.
+      // Dispatch on (aligned_end - aligned_to).
+      // offset := (7 - (aligned_end - aligned_to)) + 3
+      //   3 instructions from rdpc to DISPATCH
+      " sub %[ato], %[aend], %[ato]\n\t" // offset := aligned_to - aligned_end
+      " srax %[ato], 1, %[ato]\n\t"      // scale byte offset: one 4-byte stx per 8-byte word
+      " add %[ato], 40, %[ato]\n\t"      // offset += 10 * instruction size
+      " rd %%pc, %[temp]\n\t"            // dispatch on scaled offset
+      " jmpl %[temp]+%[ato], %%g0\n\t"
+      "  nop\n\t"
+      // DISPATCH: no direct reference, but without it the store block may be elided.
+      "3:\n\t"
+      " stx %[xvalue], [%[aend]-56]\n\t" // aligned_end[-7] = xvalue
+      " stx %[xvalue], [%[aend]-48]\n\t"
+      " stx %[xvalue], [%[aend]-40]\n\t"
+      " stx %[xvalue], [%[aend]-32]\n\t"
+      " stx %[xvalue], [%[aend]-24]\n\t"
+      " stx %[xvalue], [%[aend]-16]\n\t"
+      " stx %[xvalue], [%[aend]-8]\n\t"  // aligned_end[-1] = xvalue
+      : /* only temporaries/overwritten outputs */
+        [temp] "=&r" (temp),
+        [ato] "+&r" (aligned_to)
+      : [aend] "r" (aligned_end),
+        [xvalue] "r" (xvalue)
+      : "cc", "memory");
+    to = aligned_end;           // setup for suffix
+  }
+  // Fill any partial word suffix.  Also the prefix if size < BytesPerWord.
+  fill_subword(to, end, value);
+}
+
+#endif // INCLUDE_ALL_GCS
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -981,7 +981,7 @@
           __ restore();
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -1014,7 +1014,7 @@
           __ restore();
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
@@ -5110,6 +5110,188 @@
     return start;
   }
 
+#define ADLER32_NUM_TEMPS 16
+
+  /**
+   *  Arguments:
+   *
+   * Inputs:
+   *   O0   - int   adler
+   *   O1   - byte* buff
+   *   O2   - int   len
+   *
+   * Output:
+   *   O0   - int adler result
+   */
+  address generate_updateBytesAdler32() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
+    address start = __ pc();
+
+    Label L_cleanup_loop, L_cleanup_loop_check;
+    Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
+    Label L_nmax_check_done;
+
+    // Aliases
+    Register s1     = O0;
+    Register s2     = O3;
+    Register buff   = O1;
+    Register len    = O2;
+    Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};
+
+    // Max number of bytes we can process before having to take the mod
+    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
+    unsigned long NMAX = 0x15B0;
+
+    // Zero-out the upper bits of len
+    __ clruwu(len);
+
+    // Create the mask 0xFFFF
+    __ set64(0x00FFFF, O4, O5); // O5 is the temp register
+
+    // s1 is initialized to the lower 16 bits of adler
+    // s2 is initialized to the upper 16 bits of adler
+    __ srlx(O0, 16, O5); // adler >> 16
+    __ and3(O0, O4, s1); // s1  = (adler & 0xFFFF)
+    __ and3(O5, O4, s2); // s2  = ((adler >> 16) & 0xFFFF)
+
+    // The pipelined loop needs at least 16 elements for 1 iteration
+    // It does check this, but it is more effective to skip to the cleanup loop
+    // Setup the constant for cutoff checking
+    __ mov(15, O4);
+
+    // Check if we are above the cutoff, if not go to the cleanup loop immediately
+    __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);
+
+    // Free up some registers for our use
+    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
+      __ movxtod(temp[i], as_FloatRegister(2*i));
+    }
+
+    // Loop maintenance stuff is done at the end of the loop, so skip to there
+    __ ba_short(L_main_loop_check);
+
+    __ BIND(L_main_loop);
+
+    // Prologue for inner loop
+    __ ldub(buff, 0, L0);
+    __ dec(O5);
+
+    for (int i = 1; i < 8; i++) {
+      __ ldub(buff, i, temp[i]);
+    }
+
+    __ inc(buff, 8);
+
+    // Inner loop processes 16 elements at a time, might never execute if only 16 elements
+    // to be processed by the outer loop
+    __ ba_short(L_inner_loop_check);
+
+    __ BIND(L_inner_loop);
+
+    for (int i = 0; i < 8; i++) {
+      __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
+      __ add(s1, temp[i], s1);
+      __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
+      __ add(s2, s1, s2);
+    }
+
+    // Original temp 0-7 used and new loads to temp 0-7 issued
+    // temp 8-15 ready to be consumed
+    __ add(s1, I0, s1);
+    __ dec(O5);
+    __ add(s2, s1, s2);
+    __ add(s1, I1, s1);
+    __ inc(buff, 16);
+    __ add(s2, s1, s2);
+
+    for (int i = 0; i < 6; i++) {
+      __ add(s1, temp[10+i], s1);
+      __ add(s2, s1, s2);
+    }
+
+    __ BIND(L_inner_loop_check);
+    __ nop();
+    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);
+
+    // Epilogue
+    for (int i = 0; i < 4; i++) {
+      __ ldub(buff, (2*i), temp[8+(2*i)]);
+      __ add(s1, temp[i], s1);
+      __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
+      __ add(s2, s1, s2);
+    }
+
+    __ add(s1, temp[4], s1);
+    __ inc(buff, 8);
+
+    for (int i = 0; i < 11; i++) {
+      __ add(s2, s1, s2);
+      __ add(s1, temp[5+i], s1);
+    }
+
+    __ add(s2, s1, s2);
+
+    // Take the mod for s1 and s2
+    __ set64(0xFFF1, L0, L1);
+    __ udivx(s1, L0, L1);
+    __ udivx(s2, L0, L2);
+    __ mulx(L0, L1, L1);
+    __ mulx(L0, L2, L2);
+    __ sub(s1, L1, s1);
+    __ sub(s2, L2, s2);
+
+    // Make sure there is something left to process
+    __ BIND(L_main_loop_check);
+    __ set64(NMAX, L0, L1);
+    // k = len < NMAX ? len : NMAX
+    __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
+    __ andn(len, 0x0F, L0); // only loop a multiple of 16 times
+    __ BIND(L_nmax_check_done);
+    __ mov(L0, O5);
+    __ sub(len, L0, len); // len -= k
+
+    __ srlx(O5, 4, O5); // O5 = number of 16-byte blocks
+    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);
+
+    // Restore anything we used, take the mod one last time, combine and return
+    // Restore any registers we saved
+    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
+      __ movdtox(as_FloatRegister(2*i), temp[i]);
+    }
+
+    // There might be nothing left to process
+    __ ba_short(L_cleanup_loop_check);
+
+    __ BIND(L_cleanup_loop);
+    __ ldub(buff, 0, O4); // load single byte from buffer
+    __ inc(buff); // buff++
+    __ add(s1, O4, s1); // s1 += *buff++;
+    __ dec(len); // len--
+    __ add(s1, s2, s2); // s2 += s1;
+    __ BIND(L_cleanup_loop_check);
+    __ nop();
+    __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);
+
+    // Take the mod one last time
+    __ set64(0xFFF1, O1, O2);
+    __ udivx(s1, O1, O2);
+    __ udivx(s2, O1, O5);
+    __ mulx(O1, O2, O2);
+    __ mulx(O1, O5, O5);
+    __ sub(s1, O2, s1);
+    __ sub(s2, O5, s2);
+
+    // Combine lower bits and higher bits
+    __ sllx(s2, 16, s2); // s2 = s2 << 16
+    __ or3(s1, s2, s1);  // adler = s2 | s1
+    // Final return value is in O0
+    __ retl();
+    __ delayed()->nop();
+
+    return start;
+  }
+
   void generate_initial() {
     // Generates all stubs and initializes the entry points
 
@@ -5206,6 +5388,11 @@
     if (UseCRC32CIntrinsics) {
       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
     }
+
+    // generate Adler32 intrinsics code
+    if (UseAdler32Intrinsics) {
+      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
+    }
   }
 
 
--- a/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -41,7 +41,7 @@
 enum /* platform_dependent_constants */ {
   // %%%%%%%% May be able to shrink this a lot
   code_size1 = 20000,           // simply increase if too small (assembler will crash if too small)
-  code_size2 = 24000            // simply increase if too small (assembler will crash if too small)
+  code_size2 = 27000            // simply increase if too small (assembler will crash if too small)
 };
 
 class Sparc {
--- a/src/cpu/sparc/vm/templateTable_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/sparc/vm/templateTable_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -91,7 +91,7 @@
       }
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       {
         if (index == noreg ) {
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -85,27 +85,6 @@
   _supports_cx8 = has_v9();
   _supports_atomic_getset4 = true; // swap instruction
 
-  // There are Fujitsu Sparc64 CPUs which support blk_init as well so
-  // we have to take this check out of the 'is_niagara()' block below.
-  if (has_blk_init()) {
-    // When using CMS or G1, we cannot use memset() in BOT updates
-    // because the sun4v/CMT version in libc_psr uses BIS which
-    // exposes "phantom zeros" to concurrent readers. See 6948537.
-    if (FLAG_IS_DEFAULT(UseMemSetInBOT) && (UseConcMarkSweepGC || UseG1GC)) {
-      FLAG_SET_DEFAULT(UseMemSetInBOT, false);
-    }
-    // Issue a stern warning if the user has explicitly set
-    // UseMemSetInBOT (it is known to cause issues), but allow
-    // use for experimentation and debugging.
-    if (UseConcMarkSweepGC || UseG1GC) {
-      if (UseMemSetInBOT) {
-        assert(!FLAG_IS_DEFAULT(UseMemSetInBOT), "Error");
-        warning("Experimental flag -XX:+UseMemSetInBOT is known to cause instability"
-                " on sun4v; please understand that you are using at your own risk!");
-      }
-    }
-  }
-
   if (is_niagara()) {
     // Indirect branch is the same cost as direct
     if (FLAG_IS_DEFAULT(UseInlineCaches)) {
@@ -377,6 +356,15 @@
     FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
   }
 
+  if (UseVIS > 2) {
+    if (FLAG_IS_DEFAULT(UseAdler32Intrinsics)) {
+      FLAG_SET_DEFAULT(UseAdler32Intrinsics, true);
+    }
+  } else if (UseAdler32Intrinsics) {
+    warning("SPARC Adler32 intrinsics require VIS3 instruction support. Intrinsics will be disabled.");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
     (cache_line_size > ContendedPaddingWidth))
     ContendedPaddingWidth = cache_line_size;
--- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -394,25 +394,25 @@
   int mod_idx = 0;
   // We will test if the displacement fits the compressed format and if so
   // apply the compression to the displacment iff the result is8bit.
-  if (VM_Version::supports_evex() && is_evex_instruction) {
-    switch (tuple_type) {
+  if (VM_Version::supports_evex() && _is_evex_instruction) {
+    switch (_tuple_type) {
     case EVEX_FV:
-      if ((evex_encoding & VEX_W) == VEX_W) {
-        mod_idx += 2 + ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      if ((_evex_encoding & VEX_W) == VEX_W) {
+        mod_idx += 2 + ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
       } else {
-        mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+        mod_idx = ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
       }
       break;
 
     case EVEX_HV:
-      mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      mod_idx = ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
       break;
 
     case EVEX_FVM:
       break;
 
     case EVEX_T1S:
-      switch (input_size_in_bits) {
+      switch (_input_size_in_bits) {
       case EVEX_8bit:
         break;
 
@@ -433,7 +433,7 @@
     case EVEX_T1F:
     case EVEX_T2:
     case EVEX_T4:
-      mod_idx = (input_size_in_bits == EVEX_64bit) ? 1 : 0;
+      mod_idx = (_input_size_in_bits == EVEX_64bit) ? 1 : 0;
       break;
 
     case EVEX_T8:
@@ -459,8 +459,8 @@
       break;
     }
 
-    if (avx_vector_len >= AVX_128bit && avx_vector_len <= AVX_512bit) {
-      int disp_factor = tuple_table[tuple_type + mod_idx][avx_vector_len];
+    if (_avx_vector_len >= AVX_128bit && _avx_vector_len <= AVX_512bit) {
+      int disp_factor = tuple_table[_tuple_type + mod_idx][_avx_vector_len];
       if ((disp % disp_factor) == 0) {
         int new_disp = disp / disp_factor;
         if (is8bit(new_disp)) {
@@ -591,7 +591,7 @@
       emit_data(disp, rspec, disp32_operand);
     }
   }
-  is_evex_instruction = false;
+  _is_evex_instruction = false;
 }
 
 void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
@@ -1229,8 +1229,8 @@
 void Assembler::addsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x58, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
@@ -1245,8 +1245,8 @@
 void Assembler::addss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }
@@ -1254,16 +1254,16 @@
 void Assembler::aesdec(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDE);
   emit_operand(dst, src);
 }
 
 void Assembler::aesdec(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38,  /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDE);
   emit_int8(0xC0 | encode);
 }
@@ -1271,16 +1271,16 @@
 void Assembler::aesdeclast(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit,  /* legacy_mode */ true);
   emit_int8((unsigned char)0xDF);
   emit_operand(dst, src);
 }
 
 void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38,  /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDF);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1288,16 +1288,16 @@
 void Assembler::aesenc(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDC);
   emit_operand(dst, src);
 }
 
 void Assembler::aesenc(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDC);
   emit_int8(0xC0 | encode);
 }
@@ -1305,21 +1305,20 @@
 void Assembler::aesenclast(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit,  /* legacy_mode */ true);
   emit_int8((unsigned char)0xDD);
   emit_operand(dst, src);
 }
 
 void Assembler::aesenclast(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDD);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-
 void Assembler::andl(Address dst, int32_t imm32) {
   InstructionMark im(this);
   prefix(dst);
@@ -1347,7 +1346,7 @@
 
 void Assembler::andnl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1355,7 +1354,7 @@
 void Assembler::andnl(Register dst, Register src1, Address src2) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_legacy(dst, src1, src2, false);
+  vex_prefix_0F38_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
@@ -1382,7 +1381,7 @@
 
 void Assembler::blsil(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1390,14 +1389,14 @@
 void Assembler::blsil(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_legacy(rbx, dst, src, false);
+  vex_prefix_0F38_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1405,14 +1404,14 @@
 void Assembler::blsmskl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rdx, dst, src, false);
+  vex_prefix_0F38_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rdx, src);
 }
 
 void Assembler::blsrl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1420,7 +1419,7 @@
 void Assembler::blsrl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_legacy(rcx, dst, src, false);
+  vex_prefix_0F38_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
@@ -1569,9 +1568,9 @@
   // 0x66 is there. Strangly ucomisd comes out correct
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
   }
@@ -1580,7 +1579,7 @@
 void Assembler::comisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true);
+    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
   }
@@ -1588,16 +1587,16 @@
 
 void Assembler::comiss(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::comiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::cpuid() {
@@ -1607,12 +1606,12 @@
 
 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3);
+  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3, /* no_mask_reg */ false, /* legacy_mode */ true);
 }
 
 void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE);
+  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ true);
 }
 
 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
@@ -1627,8 +1626,8 @@
 void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1F;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1F;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x5A, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
@@ -1637,12 +1636,7 @@
 
 void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = 0;
-  if (VM_Version::supports_evex()) {
-    encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true);
-  } else {
-    encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, false);
-  }
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, /* no_mask_reg */ VM_Version::supports_evex());
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1650,9 +1644,9 @@
 void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-    emit_simd_arith_q(0x2A, dst, src, VEX_SIMD_F2, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+    emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   } else {
     emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
   }
@@ -1660,23 +1654,23 @@
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true);
+  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
 }
 
 void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1688,8 +1682,8 @@
 
 void Assembler::cvtss2sd(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
@@ -1698,14 +1692,14 @@
 
 void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvttss2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1721,8 +1715,8 @@
 void Assembler::divsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
@@ -1740,8 +1734,8 @@
 
 void Assembler::divss(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
@@ -1995,8 +1989,16 @@
 
 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex()) {
-    emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66, true);
+  if (VM_Version::supports_avx512novl()) {
+    int vector_len = AVX_512bit;
+    int dst_enc = dst->encoding();
+    int src_enc = src->encoding();
+    int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F,
+                                       /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+    emit_int8(0x28);
+    emit_int8((unsigned char)(0xC0 | encode));
+  } else if (VM_Version::supports_evex()) {
+    emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66);
   } else {
     emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
   }
@@ -2004,13 +2006,19 @@
 
 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
+  if (VM_Version::supports_avx512novl()) {
+    int vector_len = AVX_512bit;
+    int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, vector_len);
+    emit_int8(0x28);
+    emit_int8((unsigned char)(0xC0 | encode));
+  } else {
+    emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
+  }
 }
 
 void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, true, VEX_OPCODE_0F,
-                                      false, AVX_128bit);
+  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2023,48 +2031,54 @@
   emit_operand(dst, src);
 }
 
-void Assembler::kmovq(KRegister dst, KRegister src) {
+void Assembler::kmovql(KRegister dst, KRegister src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE,
-                                      true, VEX_OPCODE_0F, true);
+                                      /* no_mask_reg */ true, VEX_OPCODE_0F, /* rex_w */ true);
   emit_int8((unsigned char)0x90);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::kmovq(KRegister dst, Address src) {
+void Assembler::kmovql(KRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   int dst_enc = dst->encoding();
   int nds_enc = 0;
   vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_NONE,
-             VEX_OPCODE_0F, true, AVX_128bit, true, true);
+             VEX_OPCODE_0F, /* vex_w */  true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ true);
   emit_int8((unsigned char)0x90);
   emit_operand((Register)dst, src);
 }
 
-void Assembler::kmovq(Address dst, KRegister src) {
+void Assembler::kmovql(Address dst, KRegister src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   int src_enc = src->encoding();
   int nds_enc = 0;
   vex_prefix(dst, nds_enc, src_enc, VEX_SIMD_NONE,
-             VEX_OPCODE_0F, true, AVX_128bit, true, true);
+             VEX_OPCODE_0F, /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ true);
   emit_int8((unsigned char)0x90);
   emit_operand((Register)src, dst);
 }
 
 void Assembler::kmovql(KRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  bool supports_bw = VM_Version::supports_avx512bw();
-  VexSimdPrefix pre = supports_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true,
-                                      VEX_OPCODE_0F, supports_bw);
+  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* rex_w */ !_legacy_mode_bw);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::kmovdl(KRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  VexSimdPrefix pre = VM_Version::supports_avx512bw() ? VEX_SIMD_F2 : VEX_SIMD_NONE;
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true, VEX_OPCODE_0F, false);
+  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, /* no_mask_reg */ true);
+  emit_int8((unsigned char)0x92);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovwl(KRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2088,7 +2102,7 @@
 
 void Assembler::movdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x6E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2096,7 +2110,7 @@
 void Assembler::movdl(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x7E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2104,11 +2118,11 @@
 void Assembler::movdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, true, VEX_OPCODE_0F);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x6E);
   emit_operand(dst, src);
 }
@@ -2116,58 +2130,61 @@
 void Assembler::movdl(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x7E);
   emit_operand(src, dst);
 }
 
 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movdqa(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movdqu(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movdqu(Address dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3, false);
+    _tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ false);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
 
 // Move Unaligned 256bit Vector
 void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
   int vector_len = AVX_256bit;
   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
@@ -2175,67 +2192,100 @@
 }
 
 void Assembler::vmovdqu(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   InstructionMark im(this);
   int vector_len = AVX_256bit;
-  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
+  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
   emit_operand(dst, src);
 }
 
 void Assembler::vmovdqu(Address dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   InstructionMark im(this);
   int vector_len = AVX_256bit;
   // swap src<->dst for encoding
   assert(src != xnoreg, "sanity");
-  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
+  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
 
 // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
-void Assembler::evmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
+void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F,
-                                     true, vector_len, false, false);
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evmovdqu(XMMRegister dst, Address src, int vector_len) {
+void Assembler::evmovdqul(XMMRegister dst, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-    vex_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
-  } else {
-    vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
-  }
+    _tuple_type = EVEX_FVM;
+  }
+  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
   emit_operand(dst, src);
 }
 
-void Assembler::evmovdqu(Address dst, XMMRegister src, int vector_len) {
+void Assembler::evmovdqul(Address dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   InstructionMark im(this);
   assert(src != xnoreg, "sanity");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-    // swap src<->dst for encoding
-    vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
-  } else {
-    // swap src<->dst for encoding
-    vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
-  }
+    _tuple_type = EVEX_FVM;
+  }
+  // swap src<->dst for encoding
+  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
+void Assembler::evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(UseAVX > 0, "");
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  emit_int8(0x6F);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evmovdquq(XMMRegister dst, Address src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(UseAVX > 2, "");
+  InstructionMark im(this);
+  _tuple_type = EVEX_FVM;
+  vex_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::evmovdquq(Address dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(UseAVX > 2, "");
+  InstructionMark im(this);
+  assert(src != xnoreg, "sanity");
+  _tuple_type = EVEX_FVM;
+  // swap src<->dst for encoding
+  vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
@@ -2282,10 +2332,12 @@
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x12, dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;  // 64-bit element: must agree with the _q (W=1) form used below
+    emit_simd_arith_q(0x12, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
+  } else {
+    emit_simd_arith(0x12, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
+  }
 }
 
 void Assembler::movq( MMXRegister dst, Address src ) {
@@ -2312,11 +2364,11 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   } else {
-    simd_prefix(dst, src, VEX_SIMD_F3, true, VEX_OPCODE_0F);
+    simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   }
   emit_int8(0x7E);
   emit_operand(dst, src);
@@ -2326,12 +2378,12 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    simd_prefix(src, xnoreg, dst, VEX_SIMD_66, true,
-                VEX_OPCODE_0F, true, AVX_128bit);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    simd_prefix(src, xnoreg, dst, VEX_SIMD_66, /* no_mask_reg */ true,
+                VEX_OPCODE_0F, /* rex_w */ true);
   } else {
-    simd_prefix(dst, src, VEX_SIMD_66, true);
+    simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   }
   emit_int8((unsigned char)0xD6);
   emit_operand(src, dst);
@@ -2356,7 +2408,7 @@
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, true);
+    emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   } else {
     emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
   }
@@ -2365,9 +2417,9 @@
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    emit_simd_arith_nonds_q(0x10, dst, src, VEX_SIMD_F2, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x10, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
   }
@@ -2377,11 +2429,11 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     simd_prefix_q(src, xnoreg, dst, VEX_SIMD_F2);
   } else {
-    simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, false);
+    simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, /* no_mask_reg */ false);
   }
   emit_int8(0x11);
   emit_operand(src, dst);
@@ -2389,26 +2441,26 @@
 
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, true);
+  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
 }
 
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
 }
 
 void Assembler::movss(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3, false);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ false);
   emit_int8(0x11);
   emit_operand(src, dst);
 }
@@ -2501,8 +2553,8 @@
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x59, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
@@ -2521,8 +2573,8 @@
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
 }
@@ -2831,29 +2883,27 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x67, dst, src, VEX_SIMD_66,
-                  false, (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x67, dst, src, VEX_SIMD_66,
-                  false, (VM_Version::supports_avx512dq() == false));
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "some form of AVX must be enabled");
-  emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len,
-                 false, (VM_Version::supports_avx512dq() == false));
+  emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx2(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_3A, true, vector_len);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ true, vector_len);
   emit_int8(0x00);
   emit_int8(0xC0 | encode);
   emit_int8(imm8);
@@ -2867,8 +2917,8 @@
 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
   InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_3A,
-              false, AVX_128bit, true);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_3A,
+              /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x61);
   emit_operand(dst, src);
   emit_int8(imm8);
@@ -2876,8 +2926,8 @@
 
 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_3A, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x61);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2885,8 +2935,8 @@
 
 void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2894,8 +2944,8 @@
 
 void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */  true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ true, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2903,8 +2953,8 @@
 
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2912,8 +2962,8 @@
 
 void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ true, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2922,17 +2972,17 @@
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_HVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+    _tuple_type = EVEX_HVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_38);
   emit_int8(0x30);
   emit_operand(dst, src);
 }
 
 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_38);
   emit_int8(0x30);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3035,8 +3085,8 @@
 
 void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_ssse3(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38,
-                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x00);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3044,33 +3094,34 @@
 void Assembler::pshufb(XMMRegister dst, Address src) {
   assert(VM_Version::supports_ssse3(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38,
-              false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x00);
   emit_operand(dst, src);
 }
 
 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
+  _instruction_uses_vl = true;
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66);
   emit_int8(mode & 0xFF);
-
 }
 
 void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
+  _instruction_uses_vl = true;
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, false);
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x70);
   emit_operand(dst, src);
   emit_int8(mode & 0xFF);
@@ -3079,8 +3130,7 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, false,
-                        (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(mode & 0xFF);
 }
 
@@ -3089,29 +3139,33 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, false, VEX_OPCODE_0F,
-              false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, /* no_mask_reg */ false,
+              VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x70);
   emit_operand(dst, src);
   emit_int8(mode & 0xFF);
 }
 
 void Assembler::psrldq(XMMRegister dst, int shift) {
-  // Shift 128 bit value in xmm register by number of bytes.
+  // Shift right 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  // XMM3 is for /3 encoding: 66 0F 73 /3 ib
+  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
 }
 
 void Assembler::pslldq(XMMRegister dst, int shift) {
-  // Shift left 128 bit value in xmm register by number of bytes.
+  // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  // XMM7 is for /7 encoding: 66 0F 73 /7 ib
+  int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
@@ -3121,16 +3175,16 @@
   assert(VM_Version::supports_sse4_1(), "");
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3142,7 +3196,8 @@
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len, true, false);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* rex_w */ false,
+             vector_len, /* legacy_mode */ true, /* no_mask_reg */ false);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
@@ -3150,8 +3205,7 @@
 void Assembler::vptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3160,34 +3214,41 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_vlbw);
 }
 
 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_vlbw);
 }
 
 void Assembler::punpckldq(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x6C, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::push(int32_t imm32) {
@@ -3396,8 +3457,8 @@
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x51, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
@@ -3416,8 +3477,8 @@
 void Assembler::sqrtss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
 }
@@ -3479,10 +3540,14 @@
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-  }
-  emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+  }
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
@@ -3493,8 +3558,8 @@
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
 }
@@ -3553,9 +3618,9 @@
 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
   }
@@ -3564,7 +3629,7 @@
 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true);
+    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
   }
@@ -3573,15 +3638,15 @@
 void Assembler::ucomiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::xabort(int8_t imm8) {
@@ -3664,8 +3729,8 @@
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3684,8 +3749,8 @@
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3698,8 +3763,8 @@
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3718,8 +3783,8 @@
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3732,8 +3797,8 @@
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3752,8 +3817,8 @@
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3766,8 +3831,8 @@
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3786,8 +3851,8 @@
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3802,6 +3867,7 @@
 // Float-point vector arithmetic
 
 void Assembler::addpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x58, dst, src, VEX_SIMD_66);
@@ -3811,11 +3877,13 @@
 }
 
 void Assembler::addps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3825,15 +3893,17 @@
 }
 
 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3841,15 +3911,17 @@
 }
 
 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::subpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_66);
@@ -3859,11 +3931,13 @@
 }
 
 void Assembler::subps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3873,15 +3947,17 @@
 }
 
 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3889,15 +3965,17 @@
 }
 
 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x59, dst, src, VEX_SIMD_66);
@@ -3907,11 +3985,13 @@
 }
 
 void Assembler::mulps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3921,15 +4001,17 @@
 }
 
 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3937,15 +4019,17 @@
 }
 
 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::divpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_66);
@@ -3955,11 +4039,13 @@
 }
 
 void Assembler::divps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3969,15 +4055,17 @@
 }
 
 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3985,164 +4073,178 @@
 }
 
 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
+void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(VM_Version::supports_avx(), "");
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vsqrtpd(XMMRegister dst, Address src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(VM_Version::supports_avx(), "");
+  if (VM_Version::supports_evex()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  }
+}
+
 void Assembler::andpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::andps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, false,
-                  (VM_Version::supports_avx512dq() == false));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::andps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE,
-                  false, (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::andpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  bool legacy_mode = (VM_Version::supports_avx512dq() == false);
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, legacy_mode);
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false,  /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len,
-                 (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::xorps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE,
-                  false, (VM_Version::supports_avx512dq() == false));
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::xorpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::xorps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, false,
-                  (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len,
-                 (VM_Version::supports_avx512dq() == false));
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len,
-                 (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 // Integer vector arithmetic
 void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len,
-                                     VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4150,28 +4252,29 @@
 void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len,
-                                     VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::paddb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::paddw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xFD, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xFD, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::paddd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xFE, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::paddq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xD4, dst, src, VEX_SIMD_66);
@@ -4182,38 +4285,38 @@
 
 void Assembler::phaddw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse3(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse3(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4225,33 +4328,35 @@
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len);
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len);
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4260,20 +4365,22 @@
 
 void Assembler::psubb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF8, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xF8, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psubw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF9, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xF9, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psubd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xFA, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::psubq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xFB, dst, src, VEX_SIMD_66);
@@ -4284,22 +4391,22 @@
 
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4311,35 +4418,35 @@
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4348,28 +4455,27 @@
 
 void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_sse4_1(), "");
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66,
-                                      false, VEX_OPCODE_0F_38);
+                                      /* no_mask_reg */ false, VEX_OPCODE_0F_38);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4379,8 +4485,8 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4388,22 +4494,23 @@
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len);
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   InstructionMark im(this);
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66,
-             VEX_OPCODE_0F_38, false, vector_len);
+             VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -4411,13 +4518,14 @@
 void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
   }
   InstructionMark im(this);
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66,
+             VEX_OPCODE_0F_38, /* vex_w */ true, vector_len, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -4426,26 +4534,28 @@
 void Assembler::psllw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
-                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F,
+                                      /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::pslld(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false);
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psllq(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F, /* rex_w */ true);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -4453,16 +4563,17 @@
 
 void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, false,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::pslld(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66);
 }
 
 void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xF3, dst, shift, VEX_SIMD_66);
@@ -4474,12 +4585,12 @@
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
-  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
   emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector_len);
@@ -4487,6 +4598,7 @@
 }
 
 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
   if (VM_Version::supports_evex()) {
@@ -4499,16 +4611,17 @@
 
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xF3, dst, src, shift, VEX_SIMD_66, vector_len);
@@ -4521,33 +4634,31 @@
 void Assembler::psrlw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
-                                      (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psrld(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 72 /2 ib
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false);
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psrlq(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   // Do not confuse it with psrldq SSE2 instruction which
   // shifts 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
-  int encode = 0;
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512bw()) {
-    encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false);
-  } else {
-    encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true);
-  }
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F, /* rex_w */ VM_Version::supports_evex());
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -4555,16 +4666,17 @@
 
 void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, false,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psrld(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66);
 }
 
 void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xD3, dst, shift, VEX_SIMD_66);
@@ -4575,20 +4687,21 @@
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
-  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  // XMM2 is for /2 encoding: 66 0F 71 /2 ib
+  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
-  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
+  // XMM2 is for /2 encoding: 66 0F 72 /2 ib
   emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector_len);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
   if (VM_Version::supports_evex()) {
@@ -4601,16 +4714,17 @@
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xD3, dst, src, shift, VEX_SIMD_66, vector_len);
@@ -4623,17 +4737,18 @@
 void Assembler::psraw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
-  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
-                                      (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psrad(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM4 is for /4 encoding: 66 0F 72 /4 ib
-  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false);
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -4641,11 +4756,11 @@
 
 void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66);
 }
@@ -4653,12 +4768,12 @@
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
-  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM4 is for /4 encoding: 66 0F 72 /4 ib
   emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector_len);
@@ -4667,11 +4782,11 @@
 
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
@@ -4684,53 +4799,61 @@
 }
 
 void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::por(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xEB, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xEF, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len);
 }
@@ -4739,6 +4862,9 @@
 void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4753,8 +4879,8 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66,
-                                     VEX_OPCODE_0F_3A, true, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x1A);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
@@ -4763,35 +4889,70 @@
 }
 
 void Assembler::vinsertf64x4h(XMMRegister dst, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_64bit;
-  }
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   int vector_len = AVX_512bit;
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, true, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ true, vector_len);
   emit_int8(0x1A);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
   emit_int8(0x01);
 }
 
-void Assembler::vinsertf128h(XMMRegister dst, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+void Assembler::vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  emit_int8(0x18);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vinsertf32x4h(XMMRegister dst, Address src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  int vector_len = AVX_512bit;
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vinsertf128h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
@@ -4801,6 +4962,9 @@
 void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4811,15 +4975,16 @@
 
 void Assembler::vextractf128h(Address dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
+  int vector_len = AVX_256bit;
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
-  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x19);
   emit_operand(src, dst);
   // 0x01 - extract from upper 128 bits
@@ -4829,6 +4994,9 @@
 void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4844,7 +5012,7 @@
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_reg_mask */ false);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
@@ -4854,16 +5022,17 @@
 
 void Assembler::vinserti128h(XMMRegister dst, Address src) {
   assert(VM_Version::supports_avx2(), "");
+  int vector_len = AVX_256bit;
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x38);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
@@ -4873,6 +5042,9 @@
 void Assembler::vextracti128h(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4883,15 +5055,16 @@
 
 void Assembler::vextracti128h(Address dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
+  int vector_len = AVX_256bit;
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
-  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x39);
   emit_operand(src, dst);
   // 0x01 - extract from upper 128 bits
@@ -4904,7 +5077,7 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     true, vector_len, false, false);
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x3B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from upper 256 bits
@@ -4916,8 +5089,14 @@
   int vector_len = AVX_512bit;
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
-  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+  int encode;
+  if (VM_Version::supports_avx512dq()) {
+    encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                   /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  } else {
+    encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                   /* vex_w */ false, vector_len, /* legacy_mode */ true, /* no_mask_reg */ false);
+  }
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from bits 255:128
@@ -4932,7 +5111,7 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x1B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from upper 256 bits
@@ -4940,18 +5119,18 @@
 }
 
 void Assembler::vextractf64x4h(Address dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
-  tuple_type = EVEX_T4;
-  input_size_in_bits = EVEX_64bit;
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   int vector_len = AVX_512bit;
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
   vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-             VM_Version::supports_avx512dq(), vector_len);
+             /* vex_w */ true, vector_len);
   emit_int8(0x1B);
   emit_operand(src, dst);
-  // 0x01 - extract from upper 128 bits
+  // 0x01 - extract from upper 256 bits
   emit_int8(0x01);
 }
 
@@ -4960,8 +5139,42 @@
   int vector_len = AVX_512bit;
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
-  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66,
-                                     VEX_OPCODE_0F_3A, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vextractf32x4h(Address dst, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  int vector_len = AVX_512bit;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
+  emit_int8(0x19);
+  emit_operand(src, dst);
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ !_legacy_mode_dq, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from bits 255:128
@@ -4970,195 +5183,192 @@
   emit_int8(value & 0x3);
 }
 
-void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) {
-  assert(VM_Version::supports_evex(), "");
-  int vector_len = AVX_512bit;
-  int src_enc = src->encoding();
-  int dst_enc = dst->encoding();
-  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
-  emit_int8(0x19);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x01 - extract from bits 255:128
-  // 0x02 - extract from bits 383:256
-  // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
-}
-
 // duplicate 4-bytes integer data from src into 8 locations in dest
 void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
   int vector_len = AVX_256bit;
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x78);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_8bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_8bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x78);
   emit_operand(dst, src);
 }
 
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_16bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_16bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x79);
   emit_operand(dst, src);
 }
 
 // duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_32bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_32bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
 
 // duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_64bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ true, vector_len);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
 
 // duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_32bit;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_32bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x18);
   emit_operand(dst, src);
 }
 
 // duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /*vex_w */ true, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_64bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ true, vector_len);
   emit_int8(0x19);
   emit_operand(dst, src);
 }
 
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /*vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7B);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5166,8 +5376,8 @@
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_3A, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -5177,8 +5387,7 @@
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
   int vector_len = AVX_128bit;
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_3A, true);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A, /* legacy_mode */ true);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -5737,7 +5946,7 @@
                             int vector_len, bool no_mask_reg ){
   // EVEX 0x62 prefix
   prefix(EVEX_4bytes);
-  evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0);
+  _evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0);
 
   // P0: byte 2, initialized to RXBR`00mm
   // instead of not'd
@@ -5776,10 +5985,10 @@
   bool vex_r = ((xreg_enc & 8) == 8) ? 1 : 0;
   bool vex_b = adr.base_needs_rex();
   bool vex_x = adr.index_needs_rex();
-  avx_vector_len = vector_len;
-
-  // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit
-  if (VM_Version::supports_avx512vl() == false) {
+  _avx_vector_len = vector_len;
+
+  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
+  if (_legacy_mode_vl && _instruction_uses_vl) {
     switch (vector_len) {
     case AVX_128bit:
     case AVX_256bit:
@@ -5792,11 +6001,12 @@
   {
     bool evex_r = (xreg_enc >= 16);
     bool evex_v = (nds_enc >= 16);
-    is_evex_instruction = true;
+    _is_evex_instruction = true;
     evex_prefix(vex_r, vex_b, vex_x, vex_w, evex_r, evex_v, nds_enc, pre, opc, false, false, vector_len, no_mask_reg);
   } else {
     vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len);
   }
+  _instruction_uses_vl = false;
 }
 
 int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
@@ -5804,10 +6014,10 @@
   bool vex_r = ((dst_enc & 8) == 8) ? 1 : 0;
   bool vex_b = ((src_enc & 8) == 8) ? 1 : 0;
   bool vex_x = false;
-  avx_vector_len = vector_len;
-
-  // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit
-  if (VM_Version::supports_avx512vl() == false) {
+  _avx_vector_len = vector_len;
+
+  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
+  if (_legacy_mode_vl && _instruction_uses_vl) {
     switch (vector_len) {
     case AVX_128bit:
     case AVX_256bit:
@@ -5827,6 +6037,8 @@
     vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len);
   }
 
+  _instruction_uses_vl = false;
+
   // return modrm byte components for operands
   return (((dst_enc & 7) << 3) | (src_enc & 7));
 }
@@ -5915,13 +6127,13 @@
 }
 
 void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) {
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, legacy_mode, AVX_128bit);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, false, AVX_128bit, legacy_mode);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true, AVX_128bit);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5945,7 +6157,7 @@
 
 void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src,
                                VexSimdPrefix pre, int vector_len, bool no_mask_reg, bool legacy_mode) {
-  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, VEX_OPCODE_0F, false, no_mask_reg);
+  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, VEX_OPCODE_0F, legacy_mode, no_mask_reg);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6594,7 +6806,7 @@
 
 void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6602,11 +6814,11 @@
 void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix_q(dst, dst, src, VEX_SIMD_F2, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_operand(dst, src);
 }
@@ -6614,25 +6826,25 @@
 void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix_q(dst, dst, src, VEX_SIMD_F3, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_operand(dst, src);
 }
 
 void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvttss2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6668,6 +6880,13 @@
   emit_operand(as_Register(1), src);
 }
 
+void Assembler::xrstor(Address src) {
+  prefixq(src);
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAE);
+  emit_operand(as_Register(5), src);
+}
+
 void Assembler::fxsave(Address dst) {
   prefixq(dst);
   emit_int8(0x0F);
@@ -6675,6 +6894,13 @@
   emit_operand(as_Register(0), dst);
 }
 
+void Assembler::xsave(Address dst) {
+  prefixq(dst);
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAE);
+  emit_operand(as_Register(4), dst);
+}
+
 void Assembler::idivq(Register src) {
   int encode = prefixq_and_encode(src->encoding());
   emit_int8((unsigned char)0xF7);
@@ -6801,7 +7027,7 @@
 void Assembler::movdq(XMMRegister dst, Register src) {
   // table D-1 says MMX/SSE2
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x6E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6810,7 +7036,7 @@
   // table D-1 says MMX/SSE2
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x7E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6943,8 +7169,8 @@
 
 void Assembler::mulxq(Register dst1, Register dst2, Register src) {
   assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(),
-                                     VEX_SIMD_F2, VEX_OPCODE_0F_38, true, AVX_128bit, true, false);
+  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38,
+                                    /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ false);
   emit_int8((unsigned char)0xF6);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -7106,8 +7332,8 @@
 
 void Assembler::rorxq(Register dst, Register src, int imm8) {
   assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2,
-                                     VEX_OPCODE_0F_3A, true, AVX_128bit, true, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ false);
   emit_int8((unsigned char)0xF0);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
--- a/src/cpu/x86/vm/assembler_x86.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -438,7 +438,9 @@
 
 };
 
-const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);
+// 64-bit reflects the fxsave size, which is 512 bytes, plus the new xsave area on EVEX, which is another 2176 bytes
+// See fxsave and xsave(EVEX enabled) documentation for layout
+const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);
 
 // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
 // level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
@@ -594,11 +596,16 @@
 
 private:
 
-  int evex_encoding;
-  int input_size_in_bits;
-  int avx_vector_len;
-  int tuple_type;
-  bool is_evex_instruction;
+  int _evex_encoding;
+  int _input_size_in_bits;
+  int _avx_vector_len;
+  int _tuple_type;
+  bool _is_evex_instruction;
+  bool _legacy_mode_bw;
+  bool _legacy_mode_dq;
+  bool _legacy_mode_vl;
+  bool _legacy_mode_vlbw;
+  bool _instruction_uses_vl;
 
   // 64bit prefixes
   int prefix_and_encode(int reg_enc, bool byteinst = false);
@@ -972,11 +979,16 @@
   // belong in macro assembler but there is no need for both varieties to exist
 
   void init_attributes(void) {
-    evex_encoding = 0;
-    input_size_in_bits = 0;
-    avx_vector_len = AVX_NoVec;
-    tuple_type = EVEX_ETUP;
-    is_evex_instruction = false;
+    _evex_encoding = 0;
+    _input_size_in_bits = 0;
+    _avx_vector_len = AVX_NoVec;
+    _tuple_type = EVEX_ETUP;
+    _is_evex_instruction = false;
+    _legacy_mode_bw = (VM_Version::supports_avx512bw() == false);
+    _legacy_mode_dq = (VM_Version::supports_avx512dq() == false);
+    _legacy_mode_vl = (VM_Version::supports_avx512vl() == false);
+    _legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false);
+    _instruction_uses_vl = false;
   }
 
   void lea(Register dst, Address src);
@@ -1344,8 +1356,10 @@
   void fxch(int i = 1);
 
   void fxrstor(Address src);
+  void xrstor(Address src);
 
   void fxsave(Address dst);
+  void xsave(Address dst);
 
   void fyl2x();
   void frndint();
@@ -1479,11 +1493,12 @@
   void movb(Address dst, int imm8);
   void movb(Register dst, Address src);
 
-  void kmovq(KRegister dst, KRegister src);
+  void kmovql(KRegister dst, KRegister src);
   void kmovql(KRegister dst, Register src);
   void kmovdl(KRegister dst, Register src);
-  void kmovq(Address dst, KRegister src);
-  void kmovq(KRegister dst, Address src);
+  void kmovwl(KRegister dst, Register src);
+  void kmovql(Address dst, KRegister src);
+  void kmovql(KRegister dst, Address src);
 
   void movdl(XMMRegister dst, Register src);
   void movdl(Register dst, XMMRegister src);
@@ -1509,9 +1524,12 @@
   void vmovdqu(XMMRegister dst, XMMRegister src);
 
    // Move Unaligned 512bit Vector
-  void evmovdqu(Address dst, XMMRegister src, int vector_len);
-  void evmovdqu(XMMRegister dst, Address src, int vector_len);
-  void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdqul(Address dst, XMMRegister src, int vector_len);
+  void evmovdqul(XMMRegister dst, Address src, int vector_len);
+  void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdquq(Address dst, XMMRegister src, int vector_len);
+  void evmovdquq(XMMRegister dst, Address src, int vector_len);
+  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
 
   // Move lower 64bit to high 64bit in 128bit register
   void movlhps(XMMRegister dst, XMMRegister src);
@@ -1643,6 +1661,7 @@
 
   // Pemutation of 64bit words
   void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
+  void vpermq(XMMRegister dst, XMMRegister src, int imm8);
 
   void pause();
 
@@ -1920,6 +1939,10 @@
   void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  // Sqrt Packed Floating-Point Values - Double precision only
+  void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
+  void vsqrtpd(XMMRegister dst, Address src, int vector_len);
+
   // Bitwise Logical AND of Packed Floating-Point Values
   void andpd(XMMRegister dst, XMMRegister src);
   void andps(XMMRegister dst, XMMRegister src);
@@ -2057,6 +2080,9 @@
   void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
   void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
   void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
+  void vextractf32x4h(Address dst, XMMRegister src, int value);
+  void vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
+  void vinsertf32x4h(XMMRegister dst, Address src, int value);
 
   // duplicate 4-bytes integer data from src into 8 locations in dest
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
--- a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -3798,16 +3798,24 @@
     if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
       __ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
     }
-    __ xorps(dest->as_xmm_float_reg(),
-             ExternalAddress((address)float_signflip_pool));
-
+    if (UseAVX > 1) {
+      __ vnegatess(dest->as_xmm_float_reg(), dest->as_xmm_float_reg(),
+                   ExternalAddress((address)float_signflip_pool));
+    } else {
+      __ xorps(dest->as_xmm_float_reg(),
+               ExternalAddress((address)float_signflip_pool));
+    }
   } else if (dest->is_double_xmm()) {
     if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
       __ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
     }
-    __ xorpd(dest->as_xmm_double_reg(),
-             ExternalAddress((address)double_signflip_pool));
-
+    if (UseAVX > 1) {
+      __ vnegatesd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg(),
+                   ExternalAddress((address)double_signflip_pool));
+    } else {
+      __ xorpd(dest->as_xmm_double_reg(),
+               ExternalAddress((address)double_signflip_pool));
+    }
   } else if (left->is_single_fpu() || left->is_double_fpu()) {
     assert(left->fpu() == 0, "arg must be on TOS");
     assert(dest->fpu() == 0, "dest must be TOS");
--- a/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -401,11 +401,9 @@
 
     } else if (UseSSE == 1) {
       int xmm_off = xmm_regs_as_doubles_off;
-      for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
-        if (n < xmm_bypass_limit) {
-          VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
-          map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
-        }
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
+        map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
         xmm_off += 2;
       }
       assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm registers");
@@ -452,14 +450,11 @@
       __ frstor(Address(rsp, fpu_state_off * VMRegImpl::stack_slot_size));
 
       // Save the FPU registers in de-opt-able form
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
+      int offset = 0;
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
+        offset += 8;
+      }
     }
 
     if (UseSSE >= 2) {
@@ -468,52 +463,26 @@
       // so always save them as doubles.
       // note that float values are _not_ converted automatically, so for float values
       // the second word contains only garbage data.
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0), xmm0);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8), xmm1);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
+      int xmm_bypass_limit = FrameMap::nof_xmm_regs;
+      int offset = 0;
 #ifdef _LP64
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64), xmm8);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72), xmm9);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80), xmm10);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88), xmm11);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96), xmm12);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15);
-      if (UseAVX > 2) {
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), xmm26);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31);
+      if (UseAVX < 3) {
+        xmm_bypass_limit = xmm_bypass_limit / 2;
       }
-#endif // _LP64
+#endif
+      for (int n = 0; n < xmm_bypass_limit; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
+        offset += 8;
+      }
     } else if (UseSSE == 1) {
-      // save XMM registers as float because double not supported without SSE2
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0), xmm0);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8), xmm1);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
+      // save XMM registers as float because double is not supported without SSE2 (num MMX == num fpu)
+      int offset = 0;
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
+        offset += 8;
+      }
     }
   }
 
@@ -528,52 +497,26 @@
   if (restore_fpu_registers) {
     if (UseSSE >= 2) {
       // restore XMM registers
-      __ movdbl(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0));
-      __ movdbl(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8));
-      __ movdbl(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
-      __ movdbl(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
-      __ movdbl(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
-      __ movdbl(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
-      __ movdbl(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
-      __ movdbl(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
+      int xmm_bypass_limit = FrameMap::nof_xmm_regs;
 #ifdef _LP64
-      __ movdbl(xmm8, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64));
-      __ movdbl(xmm9, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72));
-      __ movdbl(xmm10, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80));
-      __ movdbl(xmm11, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88));
-      __ movdbl(xmm12, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96));
-      __ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104));
-      __ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112));
-      __ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120));
-      if (UseAVX > 2) {
-        __ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128));
-        __ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136));
-        __ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144));
-        __ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152));
-        __ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160));
-        __ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168));
-        __ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176));
-        __ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184));
-        __ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192));
-        __ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200));
-        __ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208));
-        __ movdbl(xmm27, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216));
-        __ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224));
-        __ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232));
-        __ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240));
-        __ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248));
+      if (UseAVX < 3) {
+        xmm_bypass_limit = xmm_bypass_limit / 2;
       }
-#endif // _LP64
+#endif
+      int offset = 0;
+      for (int n = 0; n < xmm_bypass_limit; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movdbl(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
+        offset += 8;
+      }
     } else if (UseSSE == 1) {
-      // restore XMM registers
-      __ movflt(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0));
-      __ movflt(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8));
-      __ movflt(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
-      __ movflt(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
-      __ movflt(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
-      __ movflt(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
-      __ movflt(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
-      __ movflt(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
+      // restore XMM registers (num MMX == num fpu)
+      int offset = 0;
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movflt(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
+        offset += 8;
+      }
     }
 
     if (UseSSE < 2) {
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -3751,8 +3751,31 @@
 }
 
 void MacroAssembler::pop_FPU_state() {
-  NOT_LP64(frstor(Address(rsp, 0));)
-  LP64_ONLY(fxrstor(Address(rsp, 0));)
+#ifndef _LP64
+  frstor(Address(rsp, 0));
+#else
+  // AVX will continue to use the fxsave area.
+  // EVEX needs to utilize the xsave area, which is under different
+  // management.
+  if(VM_Version::supports_evex()) {
+    // EDX:EAX describe the XSAVE header and
+    // are obtained while fetching info for XCR0 via cpuid.
+    // These two registers make up 64-bits in the header for which bits
+    // 62:10 are currently reserved for future implementations and unused.  Bit 63
+    // is unused for our implementation as we do not utilize
+    // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
+    // the functionality for PKRU state and MSR tracing.
+    // Ergo we are primarily concerned with bits 7..0, which define
+    // which ISA extensions and features are enabled for a given machine and are
+    // defined in XemXcr0Eax and is used to map the XSAVE area
+    // for restoring registers as described via XCR0.
+    movl(rdx,VM_Version::get_xsave_header_upper_segment());
+    movl(rax,VM_Version::get_xsave_header_lower_segment());
+    xrstor(Address(rsp, 0));
+  } else {
+    fxrstor(Address(rsp, 0));
+  }
+#endif
   addptr(rsp, FPUStateSizeInWords * wordSize);
 }
 
@@ -3769,13 +3792,49 @@
   push_FPU_state();
 }
 
+#ifdef _LP64
+#define XSTATE_BV 0x200
+#endif
+
 void MacroAssembler::push_FPU_state() {
   subptr(rsp, FPUStateSizeInWords * wordSize);
 #ifndef _LP64
   fnsave(Address(rsp, 0));
   fwait();
 #else
-  fxsave(Address(rsp, 0));
+  // AVX will continue to use the fxsave area.
+  // EVEX needs to utilize the xsave area, which is under different
+  // management.
+  if(VM_Version::supports_evex()) {
+    // Save a copy of EAX and EDX
+    push(rax);
+    push(rdx);
+    // EDX:EAX describe the XSAVE header and
+    // are obtained while fetching info for XCR0 via cpuid.
+    // These two registers make up 64-bits in the header for which bits
+    // 62:10 are currently reserved for future implementations and unused.  Bit 63
+    // is unused for our implementation as we do not utilize
+    // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
+    // the functionality for PKRU state and MSR tracing.
+    // Ergo we are primarily concerned with bits 7..0, which define
+    // which ISA extensions and features are enabled for a given machine and are
+    // defined in XemXcr0Eax and is used to program XSAVE area
+    // for saving the required registers as defined in XCR0.
+    int xcr0_edx = VM_Version::get_xsave_header_upper_segment();
+    int xcr0_eax = VM_Version::get_xsave_header_lower_segment();
+    movl(rdx,xcr0_edx);
+    movl(rax,xcr0_eax);
+    xsave(Address(rsp, wordSize*2));
+    // now Apply control bits and clear bytes 8..23 in the header
+    pop(rdx);
+    pop(rax);
+    movl(Address(rsp, XSTATE_BV), xcr0_eax);
+    movl(Address(rsp, XSTATE_BV+4), xcr0_edx);
+    andq(Address(rsp, XSTATE_BV+8), 0);
+    andq(Address(rsp, XSTATE_BV+16), 0);
+  } else {
+    fxsave(Address(rsp, 0));
+  }
 #endif // LP64
 }
 
@@ -4082,6 +4141,84 @@
   }
 }
 
+void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  int nds_enc = nds->encoding();
+  int dst_enc = dst->encoding();
+  bool dst_upper_bank = (dst_enc > 15);
+  bool nds_upper_bank = (nds_enc > 15);
+  if (VM_Version::supports_avx512novl() &&
+      (nds_upper_bank || dst_upper_bank)) {
+    if (dst_upper_bank) {
+      subptr(rsp, 64);
+      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+      movflt(xmm0, nds);
+      if (reachable(src)) {
+        vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+      movflt(dst, xmm0);
+      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+      addptr(rsp, 64);
+    } else {
+      movflt(dst, nds);
+      if (reachable(src)) {
+        vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+    }
+  } else {
+    if (reachable(src)) {
+      vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit);
+    } else {
+      lea(rscratch1, src);
+      vxorps(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
+    }
+  }
+}
+
+void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  int nds_enc = nds->encoding();
+  int dst_enc = dst->encoding();
+  bool dst_upper_bank = (dst_enc > 15);
+  bool nds_upper_bank = (nds_enc > 15);
+  if (VM_Version::supports_avx512novl() &&
+      (nds_upper_bank || dst_upper_bank)) {
+    if (dst_upper_bank) {
+      subptr(rsp, 64);
+      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+      movdbl(xmm0, nds);
+      if (reachable(src)) {
+        vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+      movdbl(dst, xmm0);
+      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+      addptr(rsp, 64);
+    } else {
+      movdbl(dst, nds);
+      if (reachable(src)) {
+        vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+    }
+  } else {
+    if (reachable(src)) {
+      vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit);
+    } else {
+      lea(rscratch1, src);
+      vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
+    }
+  }
+}
+
 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
   if (reachable(src)) {
     vxorpd(dst, nds, as_Address(src), vector_len);
@@ -4318,9 +4455,10 @@
 void MacroAssembler::store_check(Register obj) {
   // Does a store check for the oop in register obj. The content of
   // register obj is destroyed afterwards.
-
   BarrierSet* bs = Universe::heap()->barrier_set();
-  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+  assert(bs->kind() == BarrierSet::CardTableForRS ||
+         bs->kind() == BarrierSet::CardTableExtension,
+         "Wrong barrier set kind");
 
   CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
@@ -4570,69 +4708,58 @@
 
   // if we are coming from c1, xmm registers may be live
   int off = 0;
+  int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
+  if (UseAVX > 2) {
+    num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
+  }
+
   if (UseSSE == 1)  {
     subptr(rsp, sizeof(jdouble)*8);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
+    for (int n = 0; n < 8; n++) {
+      movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n));
+    }
   } else if (UseSSE >= 2)  {
     if (UseAVX > 2) {
+      push(rbx);
       movl(rbx, 0xffff);
-#ifdef _LP64
-      kmovql(k1, rbx);
-#else
-      kmovdl(k1, rbx);
-#endif
+      kmovwl(k1, rbx);
+      pop(rbx);
     }
 #ifdef COMPILER2
     if (MaxVectorSize > 16) {
-      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+      if(UseAVX > 2) {
+        // Save upper half of ZMM registers
+        subptr(rsp, 32*num_xmm_regs);
+        for (int n = 0; n < num_xmm_regs; n++) {
+          vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
+        }
+        off = 0;
+      }
+      assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
       // Save upper half of YMM registes
-      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
-      vextractf128h(Address(rsp,  0),xmm0);
-      vextractf128h(Address(rsp, 16),xmm1);
-      vextractf128h(Address(rsp, 32),xmm2);
-      vextractf128h(Address(rsp, 48),xmm3);
-      vextractf128h(Address(rsp, 64),xmm4);
-      vextractf128h(Address(rsp, 80),xmm5);
-      vextractf128h(Address(rsp, 96),xmm6);
-      vextractf128h(Address(rsp,112),xmm7);
-#ifdef _LP64
-      vextractf128h(Address(rsp,128),xmm8);
-      vextractf128h(Address(rsp,144),xmm9);
-      vextractf128h(Address(rsp,160),xmm10);
-      vextractf128h(Address(rsp,176),xmm11);
-      vextractf128h(Address(rsp,192),xmm12);
-      vextractf128h(Address(rsp,208),xmm13);
-      vextractf128h(Address(rsp,224),xmm14);
-      vextractf128h(Address(rsp,240),xmm15);
-#endif
+      subptr(rsp, 16*num_xmm_regs);
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
+      }
     }
 #endif
-    // Save whole 128bit (16 bytes) XMM regiters
-    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
-    movdqu(Address(rsp,off++*16),xmm0);
-    movdqu(Address(rsp,off++*16),xmm1);
-    movdqu(Address(rsp,off++*16),xmm2);
-    movdqu(Address(rsp,off++*16),xmm3);
-    movdqu(Address(rsp,off++*16),xmm4);
-    movdqu(Address(rsp,off++*16),xmm5);
-    movdqu(Address(rsp,off++*16),xmm6);
-    movdqu(Address(rsp,off++*16),xmm7);
+    // Save whole 128bit (16 bytes) XMM registers
+    subptr(rsp, 16*num_xmm_regs);
+    off = 0;
 #ifdef _LP64
-    movdqu(Address(rsp,off++*16),xmm8);
-    movdqu(Address(rsp,off++*16),xmm9);
-    movdqu(Address(rsp,off++*16),xmm10);
-    movdqu(Address(rsp,off++*16),xmm11);
-    movdqu(Address(rsp,off++*16),xmm12);
-    movdqu(Address(rsp,off++*16),xmm13);
-    movdqu(Address(rsp,off++*16),xmm14);
-    movdqu(Address(rsp,off++*16),xmm15);
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0);
+      }
+    } else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        movdqu(Address(rsp, off++*16), as_XMMRegister(n));
+      }
+    }
+#else
+    for (int n = 0; n < num_xmm_regs; n++) {
+      movdqu(Address(rsp, off++*16), as_XMMRegister(n));
+    }
 #endif
   }
 
@@ -4687,7 +4814,7 @@
   movsd(Address(rsp, 0), xmm0);
   fld_d(Address(rsp, 0));
 #endif // _LP64
-  addptr(rsp, sizeof(jdouble) * nb_args);
+  addptr(rsp, sizeof(jdouble)*nb_args);
   if (num_fpu_regs_in_use > 1) {
     // Must save return value to stack and then restore entire FPU
     // stack except incoming arguments
@@ -4697,63 +4824,50 @@
       addptr(rsp, sizeof(jdouble));
     }
     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
-    addptr(rsp, sizeof(jdouble) * nb_args);
+    addptr(rsp, sizeof(jdouble)*nb_args);
   }
 
   off = 0;
   if (UseSSE == 1)  {
-    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+    for (int n = 0; n < 8; n++) {
+      movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble)));
+    }
     addptr(rsp, sizeof(jdouble)*8);
   } else if (UseSSE >= 2)  {
     // Restore whole 128bit (16 bytes) XMM regiters
-    movdqu(xmm0, Address(rsp,off++*16));
-    movdqu(xmm1, Address(rsp,off++*16));
-    movdqu(xmm2, Address(rsp,off++*16));
-    movdqu(xmm3, Address(rsp,off++*16));
-    movdqu(xmm4, Address(rsp,off++*16));
-    movdqu(xmm5, Address(rsp,off++*16));
-    movdqu(xmm6, Address(rsp,off++*16));
-    movdqu(xmm7, Address(rsp,off++*16));
 #ifdef _LP64
-    movdqu(xmm8, Address(rsp,off++*16));
-    movdqu(xmm9, Address(rsp,off++*16));
-    movdqu(xmm10, Address(rsp,off++*16));
-    movdqu(xmm11, Address(rsp,off++*16));
-    movdqu(xmm12, Address(rsp,off++*16));
-    movdqu(xmm13, Address(rsp,off++*16));
-    movdqu(xmm14, Address(rsp,off++*16));
-    movdqu(xmm15, Address(rsp,off++*16));
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0);
+      }
+    }
+    else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        movdqu(as_XMMRegister(n), Address(rsp, off++*16));
+      }
+    }
+#else
+    for (int n = 0; n < num_xmm_regs; n++) {
+      movdqu(as_XMMRegister(n), Address(rsp, off++ * 16));
+    }
 #endif
-    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+    addptr(rsp, 16*num_xmm_regs);
+
 #ifdef COMPILER2
     if (MaxVectorSize > 16) {
       // Restore upper half of YMM registes.
-      vinsertf128h(xmm0, Address(rsp,  0));
-      vinsertf128h(xmm1, Address(rsp, 16));
-      vinsertf128h(xmm2, Address(rsp, 32));
-      vinsertf128h(xmm3, Address(rsp, 48));
-      vinsertf128h(xmm4, Address(rsp, 64));
-      vinsertf128h(xmm5, Address(rsp, 80));
-      vinsertf128h(xmm6, Address(rsp, 96));
-      vinsertf128h(xmm7, Address(rsp,112));
-#ifdef _LP64
-      vinsertf128h(xmm8, Address(rsp,128));
-      vinsertf128h(xmm9, Address(rsp,144));
-      vinsertf128h(xmm10, Address(rsp,160));
-      vinsertf128h(xmm11, Address(rsp,176));
-      vinsertf128h(xmm12, Address(rsp,192));
-      vinsertf128h(xmm13, Address(rsp,208));
-      vinsertf128h(xmm14, Address(rsp,224));
-      vinsertf128h(xmm15, Address(rsp,240));
-#endif
-      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+      off = 0;
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
+      }
+      addptr(rsp, 16*num_xmm_regs);
+      if(UseAVX > 2) {
+        off = 0;
+        for (int n = 0; n < num_xmm_regs; n++) {
+          vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
+        }
+        addptr(rsp, 32*num_xmm_regs);
+      }
     }
 #endif
   }
@@ -7093,11 +7207,7 @@
       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
       if (UseAVX > 2) {
         movl(rtmp, 0xffff);
-#ifdef _LP64
-        kmovql(k1, rtmp);
-#else
-        kmovdl(k1, rtmp);
-#endif
+        kmovwl(k1, rtmp);
       }
       movdl(xtmp, value);
       if (UseAVX > 2 && UseUnalignedLoadStores) {
@@ -7110,7 +7220,7 @@
         align(16);
 
         BIND(L_fill_64_bytes_loop);
-        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
+        evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
         addptr(to, 64);
         subl(count, 16 << shift);
         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
@@ -7118,7 +7228,7 @@
         BIND(L_check_fill_32_bytes);
         addl(count, 8 << shift);
         jccb(Assembler::less, L_check_fill_8_bytes);
-        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
+        evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit);
         addptr(to, 32);
         subl(count, 8 << shift);
 
@@ -8397,6 +8507,14 @@
   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
 
+  // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
+  // context for the registers used, where all instructions below use 128-bit mode.
+  // On EVEX without VL and BW, these instructions will all be AVX.
+  if (VM_Version::supports_avx512vlbw()) {
+    movl(tmp, 0xffff);
+    kmovwl(k1, tmp);
+  }
+
   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
   notl(crc); // ~crc
   cmpl(len, 16);
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1069,6 +1069,9 @@
   void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
+  void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+  void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
   // AVX Vector instructions
 
   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -115,6 +115,7 @@
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words,
                                            int* total_frame_words, bool verify_fpu, bool save_vectors) {
   int vect_words = 0;
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 #ifdef COMPILER2
   if (save_vectors) {
     assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
@@ -173,59 +174,50 @@
     __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
   }
 
+  int off = st0_off;
+  int delta = st1_off - off;
+
   // Save the FPU registers in de-opt-able form
-
-  __ fstp_d(Address(rsp, st0_off*wordSize)); // st(0)
-  __ fstp_d(Address(rsp, st1_off*wordSize)); // st(1)
-  __ fstp_d(Address(rsp, st2_off*wordSize)); // st(2)
-  __ fstp_d(Address(rsp, st3_off*wordSize)); // st(3)
-  __ fstp_d(Address(rsp, st4_off*wordSize)); // st(4)
-  __ fstp_d(Address(rsp, st5_off*wordSize)); // st(5)
-  __ fstp_d(Address(rsp, st6_off*wordSize)); // st(6)
-  __ fstp_d(Address(rsp, st7_off*wordSize)); // st(7)
-
-  if( UseSSE == 1 ) {           // Save the XMM state
-    __ movflt(Address(rsp,xmm0_off*wordSize),xmm0);
-    __ movflt(Address(rsp,xmm1_off*wordSize),xmm1);
-    __ movflt(Address(rsp,xmm2_off*wordSize),xmm2);
-    __ movflt(Address(rsp,xmm3_off*wordSize),xmm3);
-    __ movflt(Address(rsp,xmm4_off*wordSize),xmm4);
-    __ movflt(Address(rsp,xmm5_off*wordSize),xmm5);
-    __ movflt(Address(rsp,xmm6_off*wordSize),xmm6);
-    __ movflt(Address(rsp,xmm7_off*wordSize),xmm7);
-  } else if( UseSSE >= 2 ) {
+  for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
+    __ fstp_d(Address(rsp, off*wordSize));
+    off += delta;
+  }
+
+  off = xmm0_off;
+  delta = xmm1_off - off;
+  if(UseSSE == 1) {           // Save the XMM state
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movflt(Address(rsp, off*wordSize), as_XMMRegister(n));
+      off += delta;
+    }
+  } else if(UseSSE >= 2) {
     // Save whole 128bit (16 bytes) XMM regiters
-    __ movdqu(Address(rsp,xmm0_off*wordSize),xmm0);
-    __ movdqu(Address(rsp,xmm1_off*wordSize),xmm1);
-    __ movdqu(Address(rsp,xmm2_off*wordSize),xmm2);
-    __ movdqu(Address(rsp,xmm3_off*wordSize),xmm3);
-    __ movdqu(Address(rsp,xmm4_off*wordSize),xmm4);
-    __ movdqu(Address(rsp,xmm5_off*wordSize),xmm5);
-    __ movdqu(Address(rsp,xmm6_off*wordSize),xmm6);
-    __ movdqu(Address(rsp,xmm7_off*wordSize),xmm7);
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vextractf32x4h(Address(rsp, off*wordSize), as_XMMRegister(n), 0);
+        off += delta;
+      }
+    } else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ movdqu(Address(rsp, off*wordSize), as_XMMRegister(n));
+        off += delta;
+      }
+    }
   }
 
   if (vect_words > 0) {
     assert(vect_words*wordSize == 128, "");
     __ subptr(rsp, 128); // Save upper half of YMM registes
-    __ vextractf128h(Address(rsp,  0),xmm0);
-    __ vextractf128h(Address(rsp, 16),xmm1);
-    __ vextractf128h(Address(rsp, 32),xmm2);
-    __ vextractf128h(Address(rsp, 48),xmm3);
-    __ vextractf128h(Address(rsp, 64),xmm4);
-    __ vextractf128h(Address(rsp, 80),xmm5);
-    __ vextractf128h(Address(rsp, 96),xmm6);
-    __ vextractf128h(Address(rsp,112),xmm7);
+    off = 0;
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
+    }
     if (UseAVX > 2) {
       __ subptr(rsp, 256); // Save upper half of ZMM registes
-      __ vextractf64x4h(Address(rsp, 0), xmm0);
-      __ vextractf64x4h(Address(rsp, 32), xmm1);
-      __ vextractf64x4h(Address(rsp, 64), xmm2);
-      __ vextractf64x4h(Address(rsp, 96), xmm3);
-      __ vextractf64x4h(Address(rsp, 128), xmm4);
-      __ vextractf64x4h(Address(rsp, 160), xmm5);
-      __ vextractf64x4h(Address(rsp, 192), xmm6);
-      __ vextractf64x4h(Address(rsp, 224), xmm7);
+      off = 0;
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
+      }
     }
   }
 
@@ -238,58 +230,40 @@
   OopMap* map =  new OopMap( frame_words, 0 );
 
 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_words)
-
-  map->set_callee_saved(STACK_OFFSET( rax_off), rax->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rcx_off), rcx->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rdx_off), rdx->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rbx_off), rbx->as_VMReg());
+#define NEXTREG(x) (x)->as_VMReg()->next()
+
+  map->set_callee_saved(STACK_OFFSET(rax_off), rax->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rcx_off), rcx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rdx_off), rdx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rbx_off), rbx->as_VMReg());
   // rbp, location is known implicitly, no oopMap
-  map->set_callee_saved(STACK_OFFSET( rsi_off), rsi->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rdi_off), rdi->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st0_off), as_FloatRegister(0)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st1_off), as_FloatRegister(1)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st2_off), as_FloatRegister(2)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st3_off), as_FloatRegister(3)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st4_off), as_FloatRegister(4)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st5_off), as_FloatRegister(5)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st6_off), as_FloatRegister(6)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st7_off), as_FloatRegister(7)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm0_off), xmm0->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm1_off), xmm1->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm2_off), xmm2->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm3_off), xmm3->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm4_off), xmm4->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm5_off), xmm5->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm6_off), xmm6->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm7_off), xmm7->as_VMReg());
-  // %%% This is really a waste but we'll keep things as they were for now
-  if (true) {
-#define NEXTREG(x) (x)->as_VMReg()->next()
-    map->set_callee_saved(STACK_OFFSET(st0H_off), NEXTREG(as_FloatRegister(0)));
-    map->set_callee_saved(STACK_OFFSET(st1H_off), NEXTREG(as_FloatRegister(1)));
-    map->set_callee_saved(STACK_OFFSET(st2H_off), NEXTREG(as_FloatRegister(2)));
-    map->set_callee_saved(STACK_OFFSET(st3H_off), NEXTREG(as_FloatRegister(3)));
-    map->set_callee_saved(STACK_OFFSET(st4H_off), NEXTREG(as_FloatRegister(4)));
-    map->set_callee_saved(STACK_OFFSET(st5H_off), NEXTREG(as_FloatRegister(5)));
-    map->set_callee_saved(STACK_OFFSET(st6H_off), NEXTREG(as_FloatRegister(6)));
-    map->set_callee_saved(STACK_OFFSET(st7H_off), NEXTREG(as_FloatRegister(7)));
-    map->set_callee_saved(STACK_OFFSET(xmm0H_off), NEXTREG(xmm0));
-    map->set_callee_saved(STACK_OFFSET(xmm1H_off), NEXTREG(xmm1));
-    map->set_callee_saved(STACK_OFFSET(xmm2H_off), NEXTREG(xmm2));
-    map->set_callee_saved(STACK_OFFSET(xmm3H_off), NEXTREG(xmm3));
-    map->set_callee_saved(STACK_OFFSET(xmm4H_off), NEXTREG(xmm4));
-    map->set_callee_saved(STACK_OFFSET(xmm5H_off), NEXTREG(xmm5));
-    map->set_callee_saved(STACK_OFFSET(xmm6H_off), NEXTREG(xmm6));
-    map->set_callee_saved(STACK_OFFSET(xmm7H_off), NEXTREG(xmm7));
+  map->set_callee_saved(STACK_OFFSET(rsi_off), rsi->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rdi_off), rdi->as_VMReg());
+  // %%% This is really a waste but we'll keep things as they were for now for the upper component
+  off = st0_off;
+  delta = st1_off - off;
+  for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
+    FloatRegister freg_name = as_FloatRegister(n);
+    map->set_callee_saved(STACK_OFFSET(off), freg_name->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(freg_name));
+    off += delta;
+  }
+  off = xmm0_off;
+  delta = xmm1_off - off;
+  for (int n = 0; n < num_xmm_regs; n++) {
+    XMMRegister xmm_name = as_XMMRegister(n);
+    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(xmm_name));
+    off += delta;
+  }
 #undef NEXTREG
 #undef STACK_OFFSET
-  }
 
   return map;
-
 }
 
 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
   // Recover XMM & FPU state
   int additional_frame_bytes = 0;
 #ifdef COMPILER2
@@ -301,52 +275,43 @@
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
 #endif
+  int off = xmm0_off;
+  int delta = xmm1_off - off;
+
   if (UseSSE == 1) {
     assert(additional_frame_bytes == 0, "");
-    __ movflt(xmm0,Address(rsp,xmm0_off*wordSize));
-    __ movflt(xmm1,Address(rsp,xmm1_off*wordSize));
-    __ movflt(xmm2,Address(rsp,xmm2_off*wordSize));
-    __ movflt(xmm3,Address(rsp,xmm3_off*wordSize));
-    __ movflt(xmm4,Address(rsp,xmm4_off*wordSize));
-    __ movflt(xmm5,Address(rsp,xmm5_off*wordSize));
-    __ movflt(xmm6,Address(rsp,xmm6_off*wordSize));
-    __ movflt(xmm7,Address(rsp,xmm7_off*wordSize));
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movflt(as_XMMRegister(n), Address(rsp, off*wordSize));
+      off += delta;
+    }
   } else if (UseSSE >= 2) {
-#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes)
-    __ movdqu(xmm0,STACK_ADDRESS(xmm0_off));
-    __ movdqu(xmm1,STACK_ADDRESS(xmm1_off));
-    __ movdqu(xmm2,STACK_ADDRESS(xmm2_off));
-    __ movdqu(xmm3,STACK_ADDRESS(xmm3_off));
-    __ movdqu(xmm4,STACK_ADDRESS(xmm4_off));
-    __ movdqu(xmm5,STACK_ADDRESS(xmm5_off));
-    __ movdqu(xmm6,STACK_ADDRESS(xmm6_off));
-    __ movdqu(xmm7,STACK_ADDRESS(xmm7_off));
-#undef STACK_ADDRESS
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vinsertf32x4h(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes), 0);
+        off += delta;
+      }
+    } else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
+        off += delta;
+      }
+    }
   }
   if (restore_vectors) {
+    if (UseAVX > 2) {
+      off = 0;
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
+      }
+      __ addptr(rsp, additional_frame_bytes*2); // Pop the saved upper halves of the ZMM registers
+    }
     // Restore upper half of YMM registes.
     assert(additional_frame_bytes == 128, "");
-    __ vinsertf128h(xmm0, Address(rsp,  0));
-    __ vinsertf128h(xmm1, Address(rsp, 16));
-    __ vinsertf128h(xmm2, Address(rsp, 32));
-    __ vinsertf128h(xmm3, Address(rsp, 48));
-    __ vinsertf128h(xmm4, Address(rsp, 64));
-    __ vinsertf128h(xmm5, Address(rsp, 80));
-    __ vinsertf128h(xmm6, Address(rsp, 96));
-    __ vinsertf128h(xmm7, Address(rsp,112));
-    __ addptr(rsp, additional_frame_bytes);
-    if (UseAVX > 2) {
-      additional_frame_bytes = 256;
-      __ vinsertf64x4h(xmm0, Address(rsp, 0));
-      __ vinsertf64x4h(xmm1, Address(rsp, 32));
-      __ vinsertf64x4h(xmm2, Address(rsp, 64));
-      __ vinsertf64x4h(xmm3, Address(rsp, 96));
-      __ vinsertf64x4h(xmm4, Address(rsp, 128));
-      __ vinsertf64x4h(xmm5, Address(rsp, 160));
-      __ vinsertf64x4h(xmm6, Address(rsp, 192));
-      __ vinsertf64x4h(xmm7, Address(rsp, 224));
-      __ addptr(rsp, additional_frame_bytes);
+    off = 0;
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
     }
+    __ addptr(rsp, additional_frame_bytes); // Pop the saved upper halves of the YMM registers
   }
   __ pop_FPU_state();
   __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -69,7 +69,9 @@
 class RegisterSaver {
   // Capture info about frame layout.  Layout offsets are in jint
   // units because compiler frame slots are jints.
+#define HALF_ZMM_BANK_WORDS 128
 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
+#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
   enum layout {
     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
     xmm_off       = fpu_state_off + 160/BytesPerInt,            // offset in fxsave save area
@@ -89,23 +91,24 @@
     DEF_XMM_OFFS(13),
     DEF_XMM_OFFS(14),
     DEF_XMM_OFFS(15),
-    DEF_XMM_OFFS(16),
-    DEF_XMM_OFFS(17),
-    DEF_XMM_OFFS(18),
-    DEF_XMM_OFFS(19),
-    DEF_XMM_OFFS(20),
-    DEF_XMM_OFFS(21),
-    DEF_XMM_OFFS(22),
-    DEF_XMM_OFFS(23),
-    DEF_XMM_OFFS(24),
-    DEF_XMM_OFFS(25),
-    DEF_XMM_OFFS(26),
-    DEF_XMM_OFFS(27),
-    DEF_XMM_OFFS(28),
-    DEF_XMM_OFFS(29),
-    DEF_XMM_OFFS(30),
-    DEF_XMM_OFFS(31),
-    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt),
+    zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt),
+    DEF_ZMM_OFFS(16),
+    DEF_ZMM_OFFS(17),
+    DEF_ZMM_OFFS(18),
+    DEF_ZMM_OFFS(19),
+    DEF_ZMM_OFFS(20),
+    DEF_ZMM_OFFS(21),
+    DEF_ZMM_OFFS(22),
+    DEF_ZMM_OFFS(23),
+    DEF_ZMM_OFFS(24),
+    DEF_ZMM_OFFS(25),
+    DEF_ZMM_OFFS(26),
+    DEF_ZMM_OFFS(27),
+    DEF_ZMM_OFFS(28),
+    DEF_ZMM_OFFS(29),
+    DEF_ZMM_OFFS(30),
+    DEF_ZMM_OFFS(31),
+    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
     fpu_stateH_end,
     r15_off, r15H_off,
     r14_off, r14H_off,
@@ -155,9 +158,10 @@
 
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
   int vect_words = 0;
-  int num_xmm_regs = 16;
-  if (UseAVX > 2) {
-    num_xmm_regs = 32;
+  int off = 0;
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
+  if (UseAVX < 3) {
+    num_xmm_regs = num_xmm_regs/2;
   }
 #ifdef COMPILER2
   if (save_vectors) {
@@ -165,9 +169,7 @@
     assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
     // Save upper half of YMM registers
     vect_words = 16 * num_xmm_regs / wordSize;
-    additional_frame_words += vect_words;
-    if (UseAVX > 2) {
-      // Save upper half of ZMM registers as well
+    if (UseAVX < 3) {
       additional_frame_words += vect_words;
     }
   }
@@ -195,77 +197,13 @@
   __ enter();          // rsp becomes 16-byte aligned here
   __ push_CPU_state(); // Push a multiple of 16 bytes
 
-  if (vect_words > 0) {
+  // push_CPU_state() handles this on EVEX-enabled targets
+  if ((vect_words > 0) && (UseAVX < 3)) {
     assert(vect_words*wordSize >= 256, "");
-    __ subptr(rsp, 256); // Save upper half of YMM registes(0..15)
-    __ vextractf128h(Address(rsp, 0), xmm0);
-    __ vextractf128h(Address(rsp, 16), xmm1);
-    __ vextractf128h(Address(rsp, 32), xmm2);
-    __ vextractf128h(Address(rsp, 48), xmm3);
-    __ vextractf128h(Address(rsp, 64), xmm4);
-    __ vextractf128h(Address(rsp, 80), xmm5);
-    __ vextractf128h(Address(rsp, 96), xmm6);
-    __ vextractf128h(Address(rsp, 112), xmm7);
-    __ vextractf128h(Address(rsp, 128), xmm8);
-    __ vextractf128h(Address(rsp, 144), xmm9);
-    __ vextractf128h(Address(rsp, 160), xmm10);
-    __ vextractf128h(Address(rsp, 176), xmm11);
-    __ vextractf128h(Address(rsp, 192), xmm12);
-    __ vextractf128h(Address(rsp, 208), xmm13);
-    __ vextractf128h(Address(rsp, 224), xmm14);
-    __ vextractf128h(Address(rsp, 240), xmm15);
-    if (UseAVX > 2) {
-      __ subptr(rsp, 256); // Save upper half of YMM registes(16..31)
-      __ vextractf128h(Address(rsp, 0), xmm16);
-      __ vextractf128h(Address(rsp, 16), xmm17);
-      __ vextractf128h(Address(rsp, 32), xmm18);
-      __ vextractf128h(Address(rsp, 48), xmm19);
-      __ vextractf128h(Address(rsp, 64), xmm20);
-      __ vextractf128h(Address(rsp, 80), xmm21);
-      __ vextractf128h(Address(rsp, 96), xmm22);
-      __ vextractf128h(Address(rsp, 112), xmm23);
-      __ vextractf128h(Address(rsp, 128), xmm24);
-      __ vextractf128h(Address(rsp, 144), xmm25);
-      __ vextractf128h(Address(rsp, 160), xmm26);
-      __ vextractf128h(Address(rsp, 176), xmm27);
-      __ vextractf128h(Address(rsp, 192), xmm28);
-      __ vextractf128h(Address(rsp, 208), xmm29);
-      __ vextractf128h(Address(rsp, 224), xmm30);
-      __ vextractf128h(Address(rsp, 240), xmm31);
-      // Now handle the ZMM registers (0..31)
-      __ subptr(rsp, 1024); // Save upper half of ZMM registes
-      __ vextractf64x4h(Address(rsp, 0), xmm0);
-      __ vextractf64x4h(Address(rsp, 32), xmm1);
-      __ vextractf64x4h(Address(rsp, 64), xmm2);
-      __ vextractf64x4h(Address(rsp, 96), xmm3);
-      __ vextractf64x4h(Address(rsp, 128), xmm4);
-      __ vextractf64x4h(Address(rsp, 160), xmm5);
-      __ vextractf64x4h(Address(rsp, 192), xmm6);
-      __ vextractf64x4h(Address(rsp, 224), xmm7);
-      __ vextractf64x4h(Address(rsp, 256), xmm8);
-      __ vextractf64x4h(Address(rsp, 288), xmm9);
-      __ vextractf64x4h(Address(rsp, 320), xmm10);
-      __ vextractf64x4h(Address(rsp, 352), xmm11);
-      __ vextractf64x4h(Address(rsp, 384), xmm12);
-      __ vextractf64x4h(Address(rsp, 416), xmm13);
-      __ vextractf64x4h(Address(rsp, 448), xmm14);
-      __ vextractf64x4h(Address(rsp, 480), xmm15);
-      __ vextractf64x4h(Address(rsp, 512), xmm16);
-      __ vextractf64x4h(Address(rsp, 544), xmm17);
-      __ vextractf64x4h(Address(rsp, 576), xmm18);
-      __ vextractf64x4h(Address(rsp, 608), xmm19);
-      __ vextractf64x4h(Address(rsp, 640), xmm20);
-      __ vextractf64x4h(Address(rsp, 672), xmm21);
-      __ vextractf64x4h(Address(rsp, 704), xmm22);
-      __ vextractf64x4h(Address(rsp, 736), xmm23);
-      __ vextractf64x4h(Address(rsp, 768), xmm24);
-      __ vextractf64x4h(Address(rsp, 800), xmm25);
-      __ vextractf64x4h(Address(rsp, 832), xmm26);
-      __ vextractf64x4h(Address(rsp, 864), xmm27);
-      __ vextractf64x4h(Address(rsp, 896), xmm28);
-      __ vextractf64x4h(Address(rsp, 928), xmm29);
-      __ vextractf64x4h(Address(rsp, 960), xmm30);
-      __ vextractf64x4h(Address(rsp, 992), xmm31);
+    // Save upper half of YMM registers (0..num_xmm_regs-1)
+    __ subptr(rsp, num_xmm_regs*16);
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
     }
   }
   if (frame::arg_reg_save_area_bytes != 0) {
@@ -299,39 +237,24 @@
   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
-  if (UseAVX > 2) {
-    map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg());
+  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
+  // on EVEX-enabled targets, it is included in the XSAVE area.
+  off = xmm0_off;
+  int delta = xmm1_off - off;
+  for (int n = 0; n < 16; n++) {
+    XMMRegister xmm_name = as_XMMRegister(n);
+    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+    off += delta;
+  }
+  if(UseAVX > 2) {
+    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
+    off = zmm16_off;
+    delta = zmm17_off - off;
+    for (int n = 16; n < num_xmm_regs; n++) {
+      XMMRegister xmm_name = as_XMMRegister(n);
+      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+      off += delta;
+    }
   }
 
   // %%% These should all be a waste but we'll keep things as they were for now
@@ -351,39 +274,24 @@
     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
+    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
+    // on EVEX-enabled targets, it is included in the XSAVE area.
+    off = xmm0H_off;
+    delta = xmm1H_off - off;
+    for (int n = 0; n < 16; n++) {
+      XMMRegister xmm_name = as_XMMRegister(n);
+      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
+      off += delta;
+    }
     if (UseAVX > 2) {
-      map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next());
+      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
+      off = zmm16H_off;
+      delta = zmm17H_off - off;
+      for (int n = 16; n < num_xmm_regs; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
+        off += delta;
+      }
     }
   }
 
@@ -391,86 +299,25 @@
 }
 
 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
+  if (UseAVX < 3) {
+    num_xmm_regs = num_xmm_regs/2;
+  }
   if (frame::arg_reg_save_area_bytes != 0) {
     // Pop arg register save area
     __ addptr(rsp, frame::arg_reg_save_area_bytes);
   }
 #ifdef COMPILER2
-  if (restore_vectors) {
-    // Restore upper half of YMM registes (0..15)
-    assert(UseAVX > 0, "512bit vectors are supported only with AVX");
-    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
-    __ vinsertf128h(xmm0, Address(rsp,  0));
-    __ vinsertf128h(xmm1, Address(rsp, 16));
-    __ vinsertf128h(xmm2, Address(rsp, 32));
-    __ vinsertf128h(xmm3, Address(rsp, 48));
-    __ vinsertf128h(xmm4, Address(rsp, 64));
-    __ vinsertf128h(xmm5, Address(rsp, 80));
-    __ vinsertf128h(xmm6, Address(rsp, 96));
-    __ vinsertf128h(xmm7, Address(rsp,112));
-    __ vinsertf128h(xmm8, Address(rsp,128));
-    __ vinsertf128h(xmm9, Address(rsp,144));
-    __ vinsertf128h(xmm10, Address(rsp,160));
-    __ vinsertf128h(xmm11, Address(rsp,176));
-    __ vinsertf128h(xmm12, Address(rsp,192));
-    __ vinsertf128h(xmm13, Address(rsp,208));
-    __ vinsertf128h(xmm14, Address(rsp,224));
-    __ vinsertf128h(xmm15, Address(rsp,240));
-    __ addptr(rsp, 256);
-    if (UseAVX > 2) {
-      // Restore upper half of YMM registes (16..31)
-      __ vinsertf128h(xmm16, Address(rsp,  0));
-      __ vinsertf128h(xmm17, Address(rsp, 16));
-      __ vinsertf128h(xmm18, Address(rsp, 32));
-      __ vinsertf128h(xmm19, Address(rsp, 48));
-      __ vinsertf128h(xmm20, Address(rsp, 64));
-      __ vinsertf128h(xmm21, Address(rsp, 80));
-      __ vinsertf128h(xmm22, Address(rsp, 96));
-      __ vinsertf128h(xmm23, Address(rsp,112));
-      __ vinsertf128h(xmm24, Address(rsp,128));
-      __ vinsertf128h(xmm25, Address(rsp,144));
-      __ vinsertf128h(xmm26, Address(rsp,160));
-      __ vinsertf128h(xmm27, Address(rsp,176));
-      __ vinsertf128h(xmm28, Address(rsp,192));
-      __ vinsertf128h(xmm29, Address(rsp,208));
-      __ vinsertf128h(xmm30, Address(rsp,224));
-      __ vinsertf128h(xmm31, Address(rsp,240));
-      __ addptr(rsp, 256);
-      // Restore upper half of ZMM registes.
-      __ vinsertf64x4h(xmm0, Address(rsp, 0));
-      __ vinsertf64x4h(xmm1, Address(rsp, 32));
-      __ vinsertf64x4h(xmm2, Address(rsp, 64));
-      __ vinsertf64x4h(xmm3, Address(rsp, 96));
-      __ vinsertf64x4h(xmm4, Address(rsp, 128));
-      __ vinsertf64x4h(xmm5, Address(rsp, 160));
-      __ vinsertf64x4h(xmm6, Address(rsp, 192));
-      __ vinsertf64x4h(xmm7, Address(rsp, 224));
-      __ vinsertf64x4h(xmm8, Address(rsp, 256));
-      __ vinsertf64x4h(xmm9, Address(rsp, 288));
-      __ vinsertf64x4h(xmm10, Address(rsp, 320));
-      __ vinsertf64x4h(xmm11, Address(rsp, 352));
-      __ vinsertf64x4h(xmm12, Address(rsp, 384));
-      __ vinsertf64x4h(xmm13, Address(rsp, 416));
-      __ vinsertf64x4h(xmm14, Address(rsp, 448));
-      __ vinsertf64x4h(xmm15, Address(rsp, 480));
-      __ vinsertf64x4h(xmm16, Address(rsp, 512));
-      __ vinsertf64x4h(xmm17, Address(rsp, 544));
-      __ vinsertf64x4h(xmm18, Address(rsp, 576));
-      __ vinsertf64x4h(xmm19, Address(rsp, 608));
-      __ vinsertf64x4h(xmm20, Address(rsp, 640));
-      __ vinsertf64x4h(xmm21, Address(rsp, 672));
-      __ vinsertf64x4h(xmm22, Address(rsp, 704));
-      __ vinsertf64x4h(xmm23, Address(rsp, 736));
-      __ vinsertf64x4h(xmm24, Address(rsp, 768));
-      __ vinsertf64x4h(xmm25, Address(rsp, 800));
-      __ vinsertf64x4h(xmm26, Address(rsp, 832));
-      __ vinsertf64x4h(xmm27, Address(rsp, 864));
-      __ vinsertf64x4h(xmm28, Address(rsp, 896));
-      __ vinsertf64x4h(xmm29, Address(rsp, 928));
-      __ vinsertf64x4h(xmm30, Address(rsp, 960));
-      __ vinsertf64x4h(xmm31, Address(rsp, 992));
-      __ addptr(rsp, 1024);
+  // On EVEX enabled targets everything is handled in pop fpu state
+  if ((restore_vectors) && (UseAVX < 3)) {
+    assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 64, "up to 512bit vectors are supported now");
+    int off = 0;
+    // Restore upper half of YMM registers (0..num_xmm_regs-1)
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vinsertf128h(as_XMMRegister(n), Address(rsp,  off++*16));
     }
+    __ addptr(rsp, num_xmm_regs*16);
   }
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -722,7 +722,7 @@
            __ popa();
          }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -754,7 +754,7 @@
         }
         break;
 
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
@@ -795,6 +795,12 @@
   void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
     assert( UseSSE >= 2, "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    if (UseAVX > 2) {
+      __ push(rbx);
+      __ movl(rbx, 0xffff);
+      __ kmovdl(k1, rbx);
+      __ pop(rbx);
+    }
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
     __ align(OptoLoopAlignment);
@@ -802,8 +808,8 @@
 
     if (UseUnalignedLoadStores) {
       if (UseAVX > 2) {
-        __ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit);
-        __ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
+        __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
+        __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(from,  0));
         __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
@@ -2217,6 +2223,15 @@
     const XMMRegister xmm_temp4  = xmm5;
 
     __ enter();   // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     __ movptr(from, from_param);
     __ movptr(key, key_param);
 
@@ -2315,6 +2330,15 @@
     const XMMRegister xmm_temp4  = xmm5;
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     __ movptr(from, from_param);
     __ movptr(key, key_param);
 
@@ -2441,6 +2465,14 @@
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     handleSOERegisters(true /*saving*/);
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     // load registers from incoming parameters
     const Address  from_param(rbp, 8+0);
     const Address  to_param  (rbp, 8+4);
@@ -2602,6 +2634,14 @@
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     handleSOERegisters(true /*saving*/);
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     // load registers from incoming parameters
     const Address  from_param(rbp, 8+0);
     const Address  to_param  (rbp, 8+4);
@@ -2782,6 +2822,14 @@
     __ enter();
     handleSOERegisters(true);  // Save registers
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     __ movptr(state, state_param);
     __ movptr(subkeyH, subkeyH_param);
     __ movptr(data, data_param);
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -269,12 +269,16 @@
       __ kmovql(k1, rbx);
     }
 #ifdef _WIN64
+    int last_reg = 15;
     if (UseAVX > 2) {
-      for (int i = 6; i <= 31; i++) {
-        __ movdqu(xmm_save(i), as_XMMRegister(i));
+      last_reg = 31;
+    }
+    if (VM_Version::supports_avx512novl()) {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
+        __ vextractf32x4h(xmm_save(i), as_XMMRegister(i), 0);
       }
     } else {
-      for (int i = 6; i <= 15; i++) {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
         __ movdqu(xmm_save(i), as_XMMRegister(i));
       }
     }
@@ -367,28 +371,34 @@
 #ifdef ASSERT
     // verify that threads correspond
     {
-      Label L, S;
+     Label L1, L2, L3;
       __ cmpptr(r15_thread, thread);
-      __ jcc(Assembler::notEqual, S);
+      __ jcc(Assembler::equal, L1);
+      __ stop("StubRoutines::call_stub: r15_thread is corrupted");
+      __ bind(L1);
       __ get_thread(rbx);
+      __ cmpptr(r15_thread, thread);
+      __ jcc(Assembler::equal, L2);
+      __ stop("StubRoutines::call_stub: r15_thread is modified by call");
+      __ bind(L2);
       __ cmpptr(r15_thread, rbx);
-      __ jcc(Assembler::equal, L);
-      __ bind(S);
-      __ jcc(Assembler::equal, L);
+      __ jcc(Assembler::equal, L3);
       __ stop("StubRoutines::call_stub: threads must correspond");
-      __ bind(L);
+      __ bind(L3);
     }
 #endif
 
     // restore regs belonging to calling function
 #ifdef _WIN64
-    int xmm_ub = 15;
-    if (UseAVX > 2) {
-      xmm_ub = 31;
-    }
     // emit the restores for xmm regs
-    for (int i = 6; i <= xmm_ub; i++) {
-      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    if (VM_Version::supports_avx512novl()) {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
+        __ vinsertf32x4h(as_XMMRegister(i), xmm_save(i), 0);
+      }
+    } else {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
+        __ movdqu(as_XMMRegister(i), xmm_save(i));
+      }
     }
 #endif
     __ movptr(r15, r15_save);
@@ -450,15 +460,20 @@
 #ifdef ASSERT
     // verify that threads correspond
     {
-      Label L, S;
+      Label L1, L2, L3;
       __ cmpptr(r15_thread, thread);
-      __ jcc(Assembler::notEqual, S);
+      __ jcc(Assembler::equal, L1);
+      __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
+      __ bind(L1);
       __ get_thread(rbx);
+      __ cmpptr(r15_thread, thread);
+      __ jcc(Assembler::equal, L2);
+      __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
+      __ bind(L2);
       __ cmpptr(r15_thread, rbx);
-      __ jcc(Assembler::equal, L);
-      __ bind(S);
+      __ jcc(Assembler::equal, L3);
       __ stop("StubRoutines::catch_exception: threads must correspond");
-      __ bind(L);
+      __ bind(L3);
     }
 #endif
 
@@ -1244,7 +1259,7 @@
            __ popa();
         }
          break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef:
         break;
@@ -1284,7 +1299,7 @@
           __ popa();
         }
         break;
-      case BarrierSet::CardTableModRef:
+      case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
         {
           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
@@ -1333,11 +1348,15 @@
     __ align(OptoLoopAlignment);
     if (UseUnalignedLoadStores) {
       Label L_end;
+      if (UseAVX > 2) {
+        __ movl(to, 0xffff);
+        __ kmovql(k1, to);
+      }
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
       if (UseAVX > 2) {
-        __ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
-        __ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
+        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
+        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
@@ -1413,11 +1432,15 @@
     __ align(OptoLoopAlignment);
     if (UseUnalignedLoadStores) {
       Label L_end;
+      if (UseAVX > 2) {
+        __ movl(to, 0xffff);
+        __ kmovql(k1, to);
+      }
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
       if (UseAVX > 2) {
-        __ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
-        __ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
+        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
+        __ evmovdqul(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
@@ -3097,6 +3120,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 
@@ -3191,6 +3222,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 
@@ -3303,6 +3342,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
 #ifdef _WIN64
     // on win64, fill len_reg from stack position
     __ movl(len_reg, len_mem);
@@ -3499,6 +3546,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
 #ifdef _WIN64
     // on win64, fill len_reg from stack position
     __ movl(len_reg, len_mem);
@@ -3737,6 +3792,14 @@
 
     __ enter();
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
 #ifdef _WIN64
     // save the xmm registers which must be preserved 6-10
     __ subptr(rsp, -rsp_after_call_off * wordSize);
--- a/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -31,7 +31,7 @@
 
 enum platform_dependent_constants {
   code_size1 =  9000,           // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000            // simply increase if too small (assembler will crash if too small)
+  code_size2 = 30000            // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -33,7 +33,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 24000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 32000           // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/src/cpu/x86/vm/templateTable_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/templateTable_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -200,7 +200,7 @@
       }
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       {
         if (val == noreg) {
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -367,16 +367,12 @@
     __ movl(rcx, VM_Version::ymm_test_value());
     __ movdl(xmm0, rcx);
     __ movl(rcx, 0xffff);
+    __ kmovwl(k1, rcx);
+    __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit);
 #ifdef _LP64
-    __ kmovql(k1, rcx);
-#else
-    __ kmovdl(k1, rcx);
-#endif
-    __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
-    __ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit);
-#ifdef _LP64
-    __ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit);
-    __ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(xmm8, xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(xmm31, xmm0, Assembler::AVX_512bit);
 #endif
     VM_Version::clean_cpuFeatures();
     __ jmp(save_restore_except);
@@ -427,11 +423,11 @@
     UseAVX = 3;
     UseSSE = 2;
     __ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset())));
-    __ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
-    __ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
 #ifdef _LP64
-    __ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
-    __ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
 #endif
     VM_Version::clean_cpuFeatures();
     UseAVX = saved_useavx;
@@ -714,6 +710,11 @@
     FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
   }
 
+  if (UseAdler32Intrinsics) {
+    warning("Adler32Intrinsics not available on this CPU.");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
   // Adjust RTM (Restricted Transactional Memory) flags
   if (!supports_rtm() && UseRTMLocking) {
     // Can't continue because UseRTMLocking affects UseBiasedLocking flag
--- a/src/cpu/x86/vm/vm_version_x86.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/vm_version_x86.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -227,14 +227,15 @@
   union XemXcr0Eax {
     uint32_t value;
     struct {
-      uint32_t x87    : 1,
-               sse    : 1,
-               ymm    : 1,
-                      : 2,
-               opmask : 1,
-               zmm512 : 1,
-                zmm32 : 1,
-                      : 24;
+      uint32_t x87     : 1,
+               sse     : 1,
+               ymm     : 1,
+               bndregs : 1,
+               bndcsr  : 1,
+               opmask  : 1,
+               zmm512  : 1,
+               zmm32   : 1,
+                       : 24;
     } bits;
   };
 
@@ -703,6 +704,7 @@
   static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
   static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
   static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
+  static bool supports_avx512novl() { return (supports_evex() && !supports_avx512vl()); }
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
@@ -817,6 +819,12 @@
     intx count = PrefetchFieldsAhead;
     return count >= 0 ? count : 1;
   }
+  static uint32_t get_xsave_header_lower_segment() {
+    return _cpuid_info.xem_xcr0_eax.value;
+  }
+  static uint32_t get_xsave_header_upper_segment() {
+    return _cpuid_info.xem_xcr0_edx;
+  }
 };
 
 #endif // CPU_X86_VM_VM_VERSION_X86_HPP
--- a/src/cpu/x86/vm/x86.ad	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/x86.ad	Mon Sep 14 07:03:04 2015 +0000
@@ -1661,46 +1661,55 @@
   if (!has_match_rule(opcode))
     return false;
 
+  bool ret_value = true;
   switch (opcode) {
     case Op_PopCountI:
     case Op_PopCountL:
       if (!UsePopCountInstruction)
-        return false;
-    break;
+        ret_value = false;
+      break;
     case Op_MulVI:
       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
-        return false;
-    break;
+        ret_value = false;
+      break;
     case Op_MulVL:
     case Op_MulReductionVL:
       if (VM_Version::supports_avx512dq() == false)
-        return false;
+        ret_value = false;
+      break;
     case Op_AddReductionVL:
       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
-        return false;
+        ret_value = false;
+      break;
     case Op_AddReductionVI:
       if (UseSSE < 3) // requires at least SSE3
-        return false;
+        ret_value = false;
+      break;
     case Op_MulReductionVI:
       if (UseSSE < 4) // requires at least SSE4
-        return false;
+        ret_value = false;
+      break;
     case Op_AddReductionVF:
     case Op_AddReductionVD:
     case Op_MulReductionVF:
     case Op_MulReductionVD:
       if (UseSSE < 1) // requires at least SSE
-        return false;
-    break;
+        ret_value = false;
+      break;
+    case Op_SqrtVD:
+      if (UseAVX < 1) // enabled for AVX only
+        ret_value = false;
+      break;
     case Op_CompareAndSwapL:
 #ifdef _LP64
     case Op_CompareAndSwapP:
 #endif
       if (!VM_Version::supports_cx8())
-        return false;
-    break;
+        ret_value = false;
+      break;
   }
 
-  return true;  // Per default match rules are supported.
+  return ret_value;  // Per default match rules are supported.
 }
 
 // Max vector size in bytes. 0 if not supported.
@@ -1721,14 +1730,24 @@
   case T_DOUBLE:
   case T_LONG:
     if (size < 16) return 0;
+    break;
   case T_FLOAT:
   case T_INT:
     if (size < 8) return 0;
+    break;
   case T_BOOLEAN:
+    if (size < 4) return 0;
+    break;
+  case T_CHAR:
+    if (size < 4) return 0;
+    break;
   case T_BYTE:
-  case T_CHAR:
+    if (size < 4) return 0;
+    if ((size > 32) && !VM_Version::supports_avx512bw()) return 0;
+    break;
   case T_SHORT:
     if (size < 4) return 0;
+    if ((size > 16) && !VM_Version::supports_avx512bw()) return 0;
     break;
   default:
     ShouldNotReachHere();
@@ -1800,7 +1819,7 @@
       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
       break;
     case Op_VecZ:
-      __ evmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
+      __ evmovdqul(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
       break;
     default:
       ShouldNotReachHere();
@@ -1855,7 +1874,7 @@
         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
         break;
       case Op_VecZ:
-        __ evmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
+        __ evmovdqul(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
         break;
       default:
         ShouldNotReachHere();
@@ -1875,7 +1894,7 @@
         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
         break;
       case Op_VecZ:
-        __ evmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
+        __ evmovdqul(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
         break;
       default:
         ShouldNotReachHere();
@@ -1929,9 +1948,40 @@
     }
 #endif
   }
-  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
+  bool is_single_byte = false;
+  int vec_len = 0;
+  if ((UseAVX > 2) && (stack_offset != 0)) {
+    switch (ireg) {
+	case Op_VecS:
+    case Op_VecD:
+    case Op_VecX:
+	  break;
+	case Op_VecY:
+	  vec_len = 1;
+	  break;
+    case Op_VecZ:
+	  vec_len = 2;
+	  break;
+    }
+    is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, Assembler::EVEX_FVM, Assembler::EVEX_32bit, 0);
+  }
+  int offset_size = 0;
+  int size = 5;
+  if (UseAVX > 2 ) {
+    if ((VM_Version::supports_avx512vl() == false) && (vec_len == 2)) { 
+      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
+      size += 2; // Need an additional two bytes for EVEX encoding
+    } else if ((VM_Version::supports_avx512vl() == false) && (vec_len < 2)) { 
+      offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
+    } else {
+      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
+      size += 2; // Need an additional two bytes for EVEX encoding
+    }
+  } else {
+    offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
+  }
   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
-  return 5+offset_size;
+  return size+offset_size;
 }
 
 static inline jfloat replicate4_imm(int con, int width) {
@@ -2675,11 +2725,10 @@
   predicate(UseAVX > 0);
   match(Set dst (NegF src));
   ins_cost(150);
-  format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signflip()), vector_len);
+  format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
+  ins_encode %{
+    __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
+                 ExternalAddress(float_signflip()));
   %}
   ins_pipe(pipe_slow);
 %}
@@ -2700,12 +2749,11 @@
   predicate(UseAVX > 0);
   match(Set dst (NegD src));
   ins_cost(150);
+  format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
+  format %{ "vnegatess  $dst, $src, [0x8000000000000000]\t"
             "# neg double by sign flipping" %}
   ins_encode %{
-    int vector_len = 0;
-    __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signflip()), vector_len);
+    __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
+                 ExternalAddress(double_signflip()));
   %}
   ins_pipe(pipe_slow);
 %}
@@ -2838,7 +2886,7 @@
   format %{ "vmovdqu $dst k0,$mem\t! load vector (64 bytes)" %}
   ins_encode %{
     int vector_len = 2;
-    __ evmovdqu($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -2895,7 +2943,7 @@
   format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %}
   ins_encode %{
     int vector_len = 2;
-    __ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len);
+    __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3315,6 +3363,37 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl2F_zero(vecD dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F_zero(vecX dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_zero(vecY dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
+  match(Set dst (ReplicateF zero));
+  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 instruct Repl2D_mem(vecX dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD (LoadD mem)));
@@ -3349,6 +3428,28 @@
   ins_pipe( pipe_slow );
 %}
 
+// Replicate double (8 byte) scalar zero to be vector
+instruct Repl2D_zero(vecX dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
+  match(Set dst (ReplicateD zero));
+  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
+  ins_encode %{
+    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_zero(vecY dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
+  match(Set dst (ReplicateD zero));
+  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // ====================GENERIC REPLICATE==========================================
 
 // Replicate byte scalar to be vector
@@ -3680,38 +3781,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate float (4 byte) scalar zero to be vector
-instruct Repl2F_zero(vecD dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateF zero));
-  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
-  ins_encode %{
-    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4F_zero(vecX dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateF zero));
-  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
-  ins_encode %{
-    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl8F_zero(vecY dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateF zero));
-  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate double (8 bytes) scalar to be vector
 instruct Repl2D(vecX dst, regD src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3723,28 +3792,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate double (8 byte) scalar zero to be vector
-instruct Repl2D_zero(vecX dst, immD0 zero) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateD zero));
-  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
-  ins_encode %{
-    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4D_zero(vecY dst, immD0 zero) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateD zero));
-  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // ====================EVEX REPLICATE=============================================
 
 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
@@ -3814,7 +3861,7 @@
 %}
 
 instruct Repl64B_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB src));
   format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
   ins_encode %{
@@ -3825,7 +3872,7 @@
 %}
 
 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
   ins_encode %{
@@ -3862,7 +3909,7 @@
 %}
 
 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
@@ -3953,7 +4000,7 @@
 %}
 
 instruct Repl32S_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS src));
   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
   ins_encode %{
@@ -3964,7 +4011,7 @@
 %}
 
 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
   ins_encode %{
@@ -4001,7 +4048,7 @@
 %}
 
 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastw $dst,$dst\t! replicate32S" %}
@@ -4318,13 +4365,50 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
   match(Set dst (ReplicateF zero));
-  format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
-  ins_encode %{
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
     int vector_len = 2;
-    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -4373,13 +4457,38 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
+  match(Set dst (ReplicateD zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
+  match(Set dst (ReplicateD zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateD zero));
-  format %{ "vxorpd  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
-  ins_encode %{
+  format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
     int vector_len = 2;
-    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -7474,6 +7583,75 @@
   ins_pipe( pipe_slow );
 %}
 
+// --------------------------------- Sqrt --------------------------------------
+
+// Floating point vector sqrt - double precision only
+instruct vsqrt2D_reg(vecX dst, vecX src) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SqrtVD src));
+  format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt2D_mem(vecX dst, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SqrtVD (LoadVector mem)));
+  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt4D_reg(vecY dst, vecY src) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SqrtVD src));
+  format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt4D_mem(vecY dst, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SqrtVD (LoadVector mem)));
+  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SqrtVD src));
+  format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt8D_mem(vecZ dst, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SqrtVD (LoadVector mem)));
+  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ------------------------------ LeftShift -----------------------------------
 
 // Shorts/Chars vector left shift
--- a/src/cpu/x86/vm/x86_32.ad	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/x86_32.ad	Mon Sep 14 07:03:04 2015 +0000
@@ -1004,10 +1004,10 @@
       __ vmovdqu(Address(rsp, dst_offset), xmm0);
       __ vmovdqu(xmm0, Address(rsp, -32));
     case Op_VecZ:
-      __ evmovdqu(Address(rsp, -64), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, src_offset), 2);
-      __ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, -64), 2);
+      __ evmovdqul(Address(rsp, -64), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
--- a/src/cpu/x86/vm/x86_64.ad	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/cpu/x86/vm/x86_64.ad	Mon Sep 14 07:03:04 2015 +0000
@@ -1075,10 +1075,10 @@
       __ vmovdqu(Address(rsp, dst_offset), xmm0);
       __ vmovdqu(xmm0, Address(rsp, -32));
     case Op_VecZ:
-      __ evmovdqu(Address(rsp, -64), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, src_offset), 2);
-      __ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, -64), 2);
+      __ evmovdqul(Address(rsp, -64), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
--- a/src/os_cpu/linux_sparc/vm/vm_version_linux_sparc.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/os_cpu/linux_sparc/vm/vm_version_linux_sparc.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -53,6 +53,10 @@
   return cpuinfo_field_contains("cpu", "Niagara");
 }
 
+static bool detect_M_family() {
+  return cpuinfo_field_contains("cpu", "SPARC-M");
+}
+
 static bool detect_blkinit() {
   return cpuinfo_field_contains("cpucaps", "blkinit");
 }
@@ -66,6 +70,11 @@
     features = niagara1_m | T_family_m;
   }
 
+  if (detect_M_family()) {
+    NOT_PRODUCT(if (PrintMiscellaneous && Verbose) tty->print_cr("Detected Linux on M family");)
+    features = sun4v_m | generic_v9_m | M_family_m | T_family_m;
+  }
+
   if (detect_blkinit()) {
     features |= blk_init_instructions_m;
   }
--- a/src/share/vm/adlc/Doc/Syntax.doc	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/share/vm/adlc/Doc/Syntax.doc	Mon Sep 14 07:03:04 2015 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 1997, 1998, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -33,7 +33,7 @@
 the architecture of a processor, and is the input to the ADL Compiler.  The
 ADL Compiler compiles an ADL file into code which is incorporated into the
 Optimizing Just In Time Compiler (OJIT) to generate efficient and correct code
-for the target architecture.  The ADL describes three bassic different types
+for the target architecture.  The ADL describes three basic different types
 of architectural features.  It describes the instruction set (and associated
 operands) of the target architecture.  It describes the register set of the
 target architecture along with relevant information for the register allocator.
--- a/src/share/vm/adlc/formssel.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/share/vm/adlc/formssel.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -4143,6 +4143,7 @@
     "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
     "MulVS","MulVI","MulVL","MulVF","MulVD",
     "DivVF","DivVD",
+    "SqrtVD",
     "AndV" ,"XorV" ,"OrV",
     "AddReductionVI", "AddReductionVL",
     "AddReductionVF", "AddReductionVD",
--- a/src/share/vm/c1/c1_Compiler.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/share/vm/c1/c1_Compiler.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -32,7 +32,6 @@
 #include "c1/c1_Runtime1.hpp"
 #include "c1/c1_ValueType.hpp"
 #include "compiler/compileBroker.hpp"
-#include "compiler/compilerOracle.hpp"
 #include "interpreter/linkResolver.hpp"
 #include "memory/allocation.hpp"
 #include "memory/allocation.inline.hpp"
--- a/src/share/vm/c1/c1_GraphBuilder.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/share/vm/c1/c1_GraphBuilder.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -4212,7 +4212,7 @@
   if (!PrintInlining && !compilation()->method()->has_option("PrintInlining")) {
     return;
   }
-  CompileTask::print_inlining(callee, scope()->level(), bci(), msg);
+  CompileTask::print_inlining_tty(callee, scope()->level(), bci(), msg);
   if (success && CIPrintMethodCodes) {
     callee->print_codes();
   }
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1425,7 +1425,7 @@
       G1SATBCardTableModRef_pre_barrier(addr_opr, pre_val, do_load, patch, info);
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       // No pre barriers
       break;
@@ -1445,7 +1445,7 @@
       G1SATBCardTableModRef_post_barrier(addr,  new_val);
       break;
 #endif // INCLUDE_ALL_GCS
-    case BarrierSet::CardTableModRef:
+    case BarrierSet::CardTableForRS:
     case BarrierSet::CardTableExtension:
       CardTableModRef_post_barrier(addr,  new_val);
       break;
--- a/src/share/vm/ci/bcEscapeAnalyzer.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/share/vm/ci/bcEscapeAnalyzer.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -1447,7 +1447,6 @@
 
     if (methodData() == NULL)
       return;
-    bool printit = _method->should_print_assembly();
     if (methodData()->has_escape_info()) {
       TRACE_BCEA(2, tty->print_cr("[EA] Reading previous results for %s.%s",
                                   method->holder()->name()->as_utf8(),
--- a/src/share/vm/classfile/classLoader.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/share/vm/classfile/classLoader.cpp	Mon Sep 14 07:03:04 2015 +0000
@@ -28,8 +28,8 @@
 #include "classfile/classLoader.hpp"
 #include "classfile/classLoaderData.inline.hpp"
 #include "classfile/classLoaderExt.hpp"
-#include "classfile/imageFile.hpp"
 #include "classfile/javaClasses.hpp"
+#include "classfile/jimage.hpp"
 #include "classfile/systemDictionary.hpp"
 #include "classfile/vmSymbols.hpp"
 #include "compiler/compileBroker.hpp"
@@ -58,6 +58,7 @@
 #include "runtime/os.hpp"
 #include "runtime/threadCritical.hpp"
 #include "runtime/timer.hpp"
+#include "runtime/vm_version.hpp"
 #include "services/management.hpp"
 #include "services/threadService.hpp"
 #include "utilities/events.hpp"
@@ -68,7 +69,7 @@
 #include "classfile/sharedPathsMiscInfo.hpp"
 #endif
 
-// Entry points in zip.dll for loading zip/jar file entries and image file entries
+// Entry points in zip.dll for loading zip/jar file entries
 
 typedef void * * (JNICALL *ZipOpen_t)(const char *name, char **pmsg);
 typedef void (JNICALL *ZipClose_t)(jzfile *zip);
@@ -89,6 +90,15 @@
 static ZipInflateFully_t ZipInflateFully    = NULL;
 static Crc32_t           Crc32              = NULL;
 
+// Entry points for jimage.dll for loading jimage file entries
+
+static JImageOpen_t                    JImageOpen                    = NULL;
+static JImageClose_t                   JImageClose                   = NULL;
+static JImagePackageToModule_t         JImagePackageToModule         = NULL;
+static JImageFindResource_t            JImageFindResource            = NULL;
+static JImageGetResource_t             JImageGetResource             = NULL;
+static JImageResourceIterator_t        JImageResourceIterator        = NULL;
+
 // Globals
 
 PerfCounter*    ClassLoader::_perf_accumulated_time = NULL;
@@ -141,6 +151,15 @@
   return (strncmp(str, str_to_find, str_to_find_len) == 0);
 }
 
+static const char* get_jimage_version_string() {
+  static char version_string[10] = "";
+  if (version_string[0] == '\0') {
+    jio_snprintf(version_string, sizeof(version_string), "%d.%d",
+                 Abstract_VM_Version::vm_minor_version(), Abstract_VM_Version::vm_micro_version());
+  }
+  return (const char*)version_string;
+}
+
 bool string_ends_with(const char* str, const char* str_to_find) {
   size_t str_len = strlen(str);
   size_t str_to_find_len = strlen(str_to_find);
@@ -272,98 +291,114 @@
   }
 }
 
-ClassPathImageEntry::ClassPathImageEntry(ImageFileReader* image) :
+ClassPathImageEntry::ClassPathImageEntry(JImageFile* jimage, const char* name) :
   ClassPathEntry(),
-  _image(image),
-  _module_data(NULL) {
-  guarantee(image != NULL, "image file is null");
-
-  char module_data_name[JVM_MAXPATHLEN];
-  ImageModuleData::module_data_name(module_data_name, _image->name());
-  _module_data = new ImageModuleData(_image, module_data_name);
+  _jimage(jimage) {
+  guarantee(jimage != NULL, "jimage file is null");
+  guarantee(name != NULL, "jimage file name is null");
+  size_t len = strlen(name) + 1;
+  _name = NEW_C_HEAP_ARRAY(const char, len, mtClass);
+  strncpy((char *)_name, name, len);
 }
 
 ClassPathImageEntry::~ClassPathImageEntry() {
-  if (_module_data != NULL) {
-    delete _module_data;
-    _module_data = NULL;
+  if (_name != NULL) {
+    FREE_C_HEAP_ARRAY(const char, _name);
+    _name = NULL;
   }
-
-  if (_image != NULL) {
-    ImageFileReader::close(_image);
-    _image = NULL;
+  if (_jimage != NULL) {
+    (*JImageClose)(_jimage);
+    _jimage = NULL;
   }
 }
 
-const char* ClassPathImageEntry::name() {
-  return _image ? _image->name() : "";
+void ClassPathImageEntry::name_to_package(const char* name, char* buffer, int length) {
+  const char *pslash = strrchr(name, '/');
+  if (pslash == NULL) {
+    buffer[0] = '\0';
+    return;
+  }
+  int len = pslash - name;
+#if INCLUDE_CDS
+  if (len <= 0 && DumpSharedSpaces) {
+    buffer[0] = '\0';
+    return;
+  }
+#endif
+  assert(len > 0, "Bad length for package name");
+  if (len >= length) {
+    buffer[0] = '\0';
+    return;
+  }
+  // drop name after last slash (including slash)
+  // Ex., "java/lang/String.class" => "java/lang"
+  strncpy(buffer, name, len);
+  // ensure string termination (strncpy does not guarantee)
+  buffer[len] = '\0';
 }
 
+// For a class in a named module, look it up in the jimage file using this syntax:
+//    /<module-name>/<package-name>/<base-class>
+//
+// Assumptions:
+//     1. There are no unnamed modules in the jimage file.
+//     2. A package is in at most one module in the jimage file.
+//
 ClassFileStream* ClassPathImageEntry::open_stream(const char* name, TRAPS) {
-  ImageLocation location;
-  bool found = _image->find_location(name, location);
+  jlong size;
+  JImageLocationRef location = (*JImageFindResource)(_jimage, "", get_jimage_version_string(), name, &size);
 
-  if (!found) {
-    const char *pslash = strrchr(name, '/');
-    int len = pslash - name;
-
-    // NOTE: IMAGE_MAX_PATH is used here since this path is internal to the jimage
-    // (effectively unlimited.)  There are several JCK tests that use paths over
-    // 1024 characters long, the limit on Windows systems.
-    if (pslash && 0 < len && len < IMAGE_MAX_PATH) {
-
-      char path[IMAGE_MAX_PATH];
-      strncpy(path, name, len);
-      path[len] = '\0';
-      const char* moduleName = _module_data->package_to_module(path);
-
-      if (moduleName != NULL && (len + strlen(moduleName) + 2) < IMAGE_MAX_PATH) {
-        jio_snprintf(path, IMAGE_MAX_PATH - 1, "/%s/%s", moduleName, name);
-        location.clear_data();
-        found = _image->find_location(path, location);
-      }
+  if (location == 0) {
+    char package[JIMAGE_MAX_PATH];
+    name_to_package(name, package, JIMAGE_MAX_PATH);
+    if (package[0] != '\0') {
+        const char* module = (*JImagePackageToModule)(_jimage, package);
+        if (module == NULL) {
+            module = "java.base";
+        }
+        location = (*JImageFindResource)(_jimage, module, get_jimage_version_string(), name, &size);
     }
   }
 
-  if (found) {
-    u8 size = location.get_attribute(ImageLocation::ATTRIBUTE_UNCOMPRESSED);
+  if (location != 0) {
     if (UsePerfData) {
       ClassLoader::perf_sys_classfile_bytes_read()->inc(size);
     }
-    u1* data = NEW_RESOURCE_ARRAY(u1, size);
-    _image->get_resource(location, data);
-    return new ClassFileStream(data, (int)size, _image->name());  // Resource allocated
+    char* data = NEW_RESOURCE_ARRAY(char, size);
+    (*JImageGetResource)(_jimage, location, data, size);
+    return new ClassFileStream((u1*)data, (int)size, _name);  // Resource allocated
   }
 
   return NULL;
 }
 
 #ifndef PRODUCT
+bool ctw_visitor(JImageFile* jimage,
+        const char* module_name, const char* version, const char* package,
+        const char* name, const char* extension, void* arg) {
+  if (strcmp(extension, "class") == 0) {
+    Thread* THREAD = Thread::current();
+    char path[JIMAGE_MAX_PATH];
+    jio_snprintf(path, JIMAGE_MAX_PATH - 1, "%s/%s.class", package, name);
+    ClassLoader::compile_the_world_in(path, *(Handle*)arg, THREAD);
+    return !HAS_PENDING_EXCEPTION;
+  }
+  return true;
+}
+
 void ClassPathImageEntry::compile_the_world(Handle loader, TRAPS) {
   tty->print_cr("CompileTheWorld : Compiling all classes in %s", name());
   tty->cr();
-  const ImageStrings strings = _image->get_strings();
-  // Retrieve each path component string.
-  u4 length = _image->table_length();
-  for (u4 i = 0; i < length; i++) {
-    u1* location_data = _image->get_location_data(i);
-
-    if (location_data != NULL) {
-       ImageLocation location(location_data);
-       char path[IMAGE_MAX_PATH];
-       _image->location_path(location, path, IMAGE_MAX_PATH);
-       ClassLoader::compile_the_world_in(path, loader, CHECK);
+  (*JImageResourceIterator)(_jimage, (JImageResourceVisitor_t)ctw_visitor, (void *)&loader);
+  if (HAS_PENDING_EXCEPTION) {
+    if (PENDING_EXCEPTION->is_a(SystemDictionary::OutOfMemoryError_klass())) {
+      CLEAR_PENDING_EXCEPTION;
+      tty->print_cr("\nCompileTheWorld : Ran out of memory\n");
+      tty->print_cr("Increase class metadata storage if a limit was set");
+    } else {
+      tty->print_cr("\nCompileTheWorld : Unexpected exception occurred\n");
     }
   }
-  if (HAS_PENDING_EXCEPTION) {
-  if (PENDING_EXCEPTION->is_a(SystemDictionary::OutOfMemoryError_klass())) {
-    CLEAR_PENDING_EXCEPTION;
-    tty->print_cr("\nCompileTheWorld : Ran out of memory\n");
-    tty->print_cr("Increase class metadata storage if a limit was set");
-  } else {
-    tty->print_cr("\nCompileTheWorld : Unexpected exception occurred\n");
-  }
-  }
 }
 
 bool ClassPathImageEntry::is_jrt() {
@@ -490,7 +525,7 @@
   JavaThread* thread = JavaThread::current();
   ClassPathEntry* new_entry = NULL;
   if ((st->st_mode & S_IFREG) == S_IFREG) {
-    // Regular file, should be a zip or image file
+    // Regular file, should be a zip or jimage file
     // Canonicalized filename
     char canonical_path[JVM_MAXPATHLEN];
     if (!get_canonical_path(path, canonical_path, JVM_MAXPATHLEN)) {
@@ -501,9 +536,10 @@
         return NULL;
       }
     }
-    ImageFileReader* image = ImageFileReader::open(canonical_path);
-    if (image != NULL) {
-      new_entry = new ClassPathImageEntry(image);
+    jint error;
+    JImageFile* jimage =(*JImageOpen)(canonical_path, &error);
+    if (jimage != NULL) {
+      new_entry = new ClassPathImageEntry(jimage, canonical_path);
     } else {
       char* error_msg = NULL;
       jzfile* zip;
@@ -682,6 +718,35 @@
   // This lookup only works on 1.3. Do not check for non-null here
 }
 
+void ClassLoader::load_jimage_library() {
+  // First make sure native library is loaded
+  os::native_java_library();
+  // Load jimage library
+  char path[JVM_MAXPATHLEN];
+  char ebuf[1024];
+  void* handle = NULL;
+  if (os::dll_build_name(path, sizeof(path), Arguments::get_dll_dir(), "jimage")) {
+    handle = os::dll_load(path, ebuf, sizeof ebuf);
+  }
+  if (handle == NULL) {
+    vm_exit_during_initialization("Unable to load jimage library", path);
+  }
+
+  // Lookup jimage entry points
+  JImageOpen = CAST_TO_FN_PTR(JImageOpen_t, os::dll_lookup(handle, "JIMAGE_Open"));
+  guarantee(JImageOpen != NULL, "function JIMAGE_Open not found");
+  JImageClose = CAST_TO_FN_PTR(JImageClose_t, os::dll_lookup(handle, "JIMAGE_Close"));
+  guarantee(JImageClose != NULL, "function JIMAGE_Close not found");
+  JImagePackageToModule = CAST_TO_FN_PTR(JImagePackageToModule_t, os::dll_lookup(handle, "JIMAGE_PackageToModule"));
+  guarantee(JImagePackageToModule != NULL, "function JIMAGE_PackageToModule not found");
+  JImageFindResource = CAST_TO_FN_PTR(JImageFindResource_t, os::dll_lookup(handle, "JIMAGE_FindResource"));
+  guarantee(JImageFindResource != NULL, "function JIMAGE_FindResource not found");
+  JImageGetResource = CAST_TO_FN_PTR(JImageGetResource_t, os::dll_lookup(handle, "JIMAGE_GetResource"));
+  guarantee(JImageGetResource != NULL, "function JIMAGE_GetResource not found");
+  JImageResourceIterator = CAST_TO_FN_PTR(JImageResourceIterator_t, os::dll_lookup(handle, "JIMAGE_ResourceIterator"));
+  guarantee(JImageResourceIterator != NULL, "function JIMAGE_ResourceIterator not found");
+}
+
 jboolean ClassLoader::decompress(void *in, u8 inSize, void *out, u8 outSize, char **pmsg) {
   return (*ZipInflateFully)(in, inSize, out, outSize, pmsg);
 }
@@ -1086,6 +1151,8 @@
 
   // lookup zip library entry points
   load_zip_library();
+  // lookup jimage library entry points
+  load_jimage_library();
 #if INCLUDE_CDS
   // initialize search path
   if (DumpSharedSpaces) {
--- a/src/share/vm/classfile/classLoader.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ b/src/share/vm/classfile/classLoader.hpp	Mon Sep 14 07:03:04 2015 +0000
@@ -37,8 +37,7 @@
 
 // Class path entry (directory or zip file)
 
-class ImageFileReader;
-class ImageModuleData;
+class JImageFile;
 
 class ClassPathEntry: public CHeapObj<mtClass> {
  private:
@@ -52,7 +51,7 @@
   }
   virtual bool is_jar_file() = 0;
   virtual const char* name() = 0;
-  virtual ImageFileReader* image() = 0;
+  virtual JImageFile* jimage() = 0;
   // Constructor
   ClassPathEntry();
   // Attempt to locate file_name through this class path entry.
@@ -70,7 +69,7 @@
  public:
   bool is_jar_file()       { return false;  }
   const char* name()       { return _dir; }
-  ImageFileReader* image() { return NULL; }
+  JImageFile* jimage()     { return NULL; }
   ClassPathDirEntry(const char* dir);
   ClassFileStream* open_stream(const char* name, TRAPS);
   // Debugging
@@ -100,7 +99,7 @@
  public:
   bool is_jar_file()       { return true;  }
   const char* name()       { return _zip_name; }
-  ImageFileReader* image() { return NULL; }
+  JImageFile* jimage()     { return NULL; }
   ClassPathZipEntry(jzfile* zip, const char* zip_name);
   ~ClassPathZipEntry();
   u1* open_entry(const char* name, jint* filesize, bool nul_terminate, TRAPS);
@@ -115,16 +114,16 @@
 // For java image files
 class ClassPathImageEntry: public ClassPathEntry {
 private:
-  ImageFileReader* _image;
-  ImageModuleData* _module_data;
+  JImageFile* _jimage;
+  const char* _name;
 public:
   bool is_jar_file()  { return false;  }
-  bool is_open()  { return _image != NULL; }
-  const char* name();
-  ImageFileReader* image() { return _image; }
-  ImageModuleData* module_data() { return _module_data; }
-  ClassPathImageEntry(ImageFileReader* image);
+  bool is_open()  { return _jimage != NULL; }
+  const char* name() { return _name == NULL ? "" : _name; }
+  JImageFile* jimage() { return _jimage; }
+  ClassPathImageEntry(JImageFile* jimage, const char* name);
   ~ClassPathImageEntry();
+  static void name_to_package(const char* name, char* buffer, int length);
   ClassFileStream* open_stream(const char* name, TRAPS);
 
   // Debugging
@@ -206,6 +205,7 @@
   static void setup_search_path(const char *class_path);
 
   static void load_zip_library();
+  static void load_jimage_library();
   static ClassPathEntry* create_class_path_entry(const char *path, const struct stat* st,
                                                  bool throw_exception, TRAPS);
 
--- a/src/share/vm/classfile/imageDecompressor.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#include "precompiled.hpp"
-#include "runtime/thread.inline.hpp"
-#include "classfile/imageDecompressor.hpp"
-#include "runtime/thread.hpp"
-#include "utilities/bytes.hpp"
-
-/*
- * Allocate in C Heap not in resource area, otherwise JVM crashes.
- * This array life time is the VM life time. Array is never freed and
- * is not expected to contain more than few references.
- */
-GrowableArray<ImageDecompressor*>* ImageDecompressor::_decompressors =
-  new(ResourceObj::C_HEAP, mtInternal) GrowableArray<ImageDecompressor*>(2, true);
-
-static Symbol* createSymbol(const char* str) {
-  Thread* THREAD = Thread::current();
-  Symbol* sym = SymbolTable::lookup(str, (int) strlen(str), THREAD);
-  if (HAS_PENDING_EXCEPTION) {
-    warning("can't create symbol\n");
-    CLEAR_PENDING_EXCEPTION;
-    return NULL;
-  }
-  return sym;
-}
-
-/*
- * Initialize the array of decompressors.
- */
-bool image_decompressor_init() {
-  Symbol* zipSymbol = createSymbol("zip");
-  if (zipSymbol == NULL) {
-    return false;
-  }
-  ImageDecompressor::add_decompressor(new ZipDecompressor(zipSymbol));
-
-  return true;
-}
-
-/*
- * Decompression entry point. Called from ImageFileReader::get_resource.
- */
-void ImageDecompressor::decompress_resource(u1* compressed, u1* uncompressed,
-        u4 uncompressed_size, const ImageStrings* strings, bool is_C_heap) {
-  bool has_header = false;
-  u1* decompressed_resource = compressed;
-  u1* compressed_resource = compressed;
-
-  // Resource could have been transformed by a stack of decompressors.
-  // Iterate and decompress resources until there is no more header.
-  do {
-    ResourceHeader _header;
-    memcpy(&_header, compressed_resource, sizeof (ResourceHeader));
-    has_header = _header._magic == ResourceHeader::resource_header_magic;
-    if (has_header) {
-      // decompressed_resource array contains the result of decompression
-      // when a resource content is terminal, it means that it is an actual resource,
-      // not an intermediate not fully uncompressed content. In this case
-      // the resource is allocated as an mtClass, otherwise as an mtOther
-      decompressed_resource = is_C_heap && _header._is_terminal ?
-              NEW_C_HEAP_ARRAY(u1, _header._uncompressed_size, mtClass) :
-              NEW_C_HEAP_ARRAY(u1, _header._uncompressed_size, mtOther);
-      // Retrieve the decompressor name
-      const char* decompressor_name = strings->get(_header._decompressor_name_offset);
-      if (decompressor_name == NULL) warning("image decompressor not found\n");
-      guarantee(decompressor_name, "image decompressor not found");
-      // Retrieve the decompressor instance
-      ImageDecompressor* decompressor = get_decompressor(decompressor_name);
-      if (decompressor == NULL) {
-        warning("image decompressor %s not found\n", decompressor_name);
-      }
-      guarantee(decompressor, "image decompressor not found");
-      u1* compressed_resource_base = compressed_resource;
-      compressed_resource += ResourceHeader::resource_header_length;
-      // Ask the decompressor to decompress the compressed content
-      decompressor->decompress_resource(compressed_resource, decompressed_resource,
-        &_header, strings);
-      if (compressed_resource_base != compressed) {
-        FREE_C_HEAP_ARRAY(char, compressed_resource_base);
-      }
-      compressed_resource = decompressed_resource;
-    }
-  } while (has_header);
-  memcpy(uncompressed, decompressed_resource, uncompressed_size);
-}
-
-// Zip decompressor
-
-void ZipDecompressor::decompress_resource(u1* data, u1* uncompressed,
-        ResourceHeader* header, const ImageStrings* strings) {
-  char* msg = NULL;
-  jboolean res = ClassLoader::decompress(data, header->_size, uncompressed,
-          header->_uncompressed_size, &msg);
-  if (!res) warning("decompression failed due to %s\n", msg);
-  guarantee(res, "decompression failed");
-}
-
-// END Zip Decompressor
--- a/src/share/vm/classfile/imageDecompressor.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_VM_CLASSFILE_IMAGEDECOMPRESSOR_HPP
-#define SHARE_VM_CLASSFILE_IMAGEDECOMPRESSOR_HPP
-
-#include "runtime/thread.inline.hpp"
-#include "classfile/classLoader.hpp"
-#include "classfile/imageFile.hpp"
-#include "classfile/symbolTable.hpp"
-#include "oops/symbol.hpp"
-#include "utilities/growableArray.hpp"
-
-/*
- * Compressed resources located in image have an header.
- * This header contains:
- * - _magic: A magic u4, required to retrieved the header in the compressed content
- * - _size: The size of the compressed resource.
- * - _uncompressed_size: The uncompressed size of the compressed resource.
- * - _decompressor_name_offset: The ImageDecompressor instance name StringsTable offset.
- * - _decompressor_config_offset: StringsTable offset of configuration that could be needed by
- *   the decompressor in order to decompress.
- * - _is_terminal: 1: the compressed content is terminal. Uncompressing it would
- *   create the actual resource. 0: the compressed content is not terminal. Uncompressing it
- *   will result in a compressed content to be decompressed (This occurs when a stack of compressors
- *   have been used to compress the resource.
- */
-struct ResourceHeader {
-  /* Length of header, needed to retrieve content offset */
-  static const u1 resource_header_length = 21;
-  /* magic bytes that identifies a compressed resource header*/
-  static const u4 resource_header_magic = 0xCAFEFAFA;
-  u4 _magic; // Resource header
-  u4 _size;  // Resource size
-  u4 _uncompressed_size;  // Expected uncompressed size
-  u4 _decompressor_name_offset;  // Strings table decompressor offset
-  u4 _decompressor_config_offset; // Strings table config offset
-  u1 _is_terminal; // Last decompressor 1, otherwise 0.
-};
-
-/*
- * Resources located in jimage file can be compressed. Compression occurs at
- * jimage file creation time. When compressed a resource is added an header that
- * contains the name of the compressor that compressed it.
- * Various compression strategies can be applied to compress a resource.
- * The same resource can even be compressed multiple time by a stack of compressors.
- * At runtime, a resource is decompressed in a loop until there is no more header
- * meaning that the resource is equivalent to the not compressed resource.
- * In each iteration, the name of the compressor located in the current header
- * is used to retrieve the associated instance of ImageDecompressor.
- * For example “zip” is the name of the compressor that compresses resources
- * using the zip algorithm. The ZipDecompressor class name is also “zip”.
- * ImageDecompressor instances are retrieved from a static array in which
- * they are registered.
- */
-class ImageDecompressor: public CHeapObj<mtClass> {
-
-private:
-  const Symbol* _name;
-
-  /*
-   * Array of concrete decompressors. This array is used to retrieve the decompressor
-   * that can handle resource decompression.
-   */
-  static GrowableArray<ImageDecompressor*>* _decompressors;
-
-  /*
-   * Identifier of a decompressor. This name is the identification key to retrieve
-   * decompressor from a resource header.
-   */
-  inline const Symbol* get_name() const { return _name; }
-
-protected:
-  ImageDecompressor(const Symbol* name) : _name(name) {
-  }
-  virtual void decompress_resource(u1* data, u1* uncompressed,
-    ResourceHeader* header, const ImageStrings* strings) = 0;
-
-public:
-  inline static void add_decompressor(ImageDecompressor* decompressor) {
-    _decompressors->append(decompressor);
-  }
-  inline static ImageDecompressor* get_decompressor(const char * decompressor_name) {
-    Thread* THREAD = Thread::current();
-    TempNewSymbol sym = SymbolTable::new_symbol(decompressor_name,
-            (int) strlen(decompressor_name), CHECK_NULL);
-    if (HAS_PENDING_EXCEPTION) {
-      warning("can't create symbol\n");
-      CLEAR_PENDING_EXCEPTION;
-      return NULL;
-    }
-    for (int i = 0; i < _decompressors->length(); i++) {
-      ImageDecompressor* decompressor = _decompressors->at(i);
-      if (decompressor->get_name()->fast_compare(sym) == 0) {
-        return decompressor;
-      }
-    }
-    guarantee(false, "No decompressor found.");
-    return NULL;
-  }
-  static void decompress_resource(u1* compressed, u1* uncompressed,
-    u4 uncompressed_size, const ImageStrings* strings, bool is_C_heap);
-};
-
-/**
- * Zip decompressor.
- */
-class ZipDecompressor : public ImageDecompressor {
-public:
-  ZipDecompressor(const Symbol* sym) : ImageDecompressor(sym) { }
-  void decompress_resource(u1* data, u1* uncompressed, ResourceHeader* header,
-    const ImageStrings* strings);
-};
-
-#endif // SHARE_VM_CLASSFILE_IMAGEDECOMPRESSOR_HPP
--- a/src/share/vm/classfile/imageFile.cpp	Mon Sep 14 07:02:50 2015 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,546 +0,0 @@
-/*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#include "precompiled.hpp"
-#include "classfile/imageDecompressor.hpp"
-#include "classfile/imageFile.hpp"
-#include "memory/resourceArea.hpp"
-#include "runtime/mutex.hpp"
-#include "runtime/mutexLocker.hpp"
-#include "runtime/os.inline.hpp"
-#include "utilities/endian.hpp"
-#include "utilities/growableArray.hpp"
-
-// Image files are an alternate file format for storing classes and resources. The
-// goal is to supply file access which is faster and smaller than the jar format.
-//
-// (More detailed nodes in the header.)
-//
-
-// Compute the Perfect Hashing hash code for the supplied UTF-8 string.
-s4 ImageStrings::hash_code(const char* string, s4 seed) {
-  // Access bytes as unsigned.
-  u1* bytes = (u1*)string;
-  // Compute hash code.
-  for (u1 byte = *bytes++; byte; byte = *bytes++) {
-    seed = (seed * HASH_MULTIPLIER) ^ byte;
-  }
-  // Ensure the result is not signed.
-  return seed & 0x7FFFFFFF;
-}
-
-// Match up a string in a perfect hash table.  Result still needs validation
-// for precise match (false positive.)
-s4 ImageStrings::find(Endian* endian, const char* name, s4* redirect, u4 length) {
-  // If the table is empty, then short cut.
-  if (redirect == NULL || length == 0) {
-    return NOT_FOUND;
-  }
-  // Compute the basic perfect hash for name.
-  s4 hash_code = ImageStrings::hash_code(name);
-  // Modulo table size.
-  s4 index = hash_code % length;
-  // Get redirect entry.
-  //   value == 0 then not found
-  //   value < 0 then -1 - value is true index
-  //   value > 0 then value is seed for recomputing hash.
-  s4 value = endian->get(redirect[index]);
-  // if recompute is required.
-  if (value > 0) {
-    // Entry collision value, need to recompute hash.
-    hash_code = ImageStrings::hash_code(name, value);
-    // Modulo table size.
-    return hash_code % length;
-  } else if (value < 0) {
-    // Compute direct index.
-    return -1 - value;
-  }
-  // No entry found.
-  return NOT_FOUND;
-}
-
-// Test to see if UTF-8 string begins with the start UTF-8 string.  If so,
-// return non-NULL address of remaining portion of string.  Otherwise, return
-// NULL.  Used to test sections of a path without copying from image string
-// table.
-const char* ImageStrings::starts_with(const char* string, const char* start) {
-  char ch1, ch2;
-  // Match up the strings the best we can.
-  while ((ch1 = *string) && (ch2 = *start)) {
-    if (ch1 != ch2) {
-      // Mismatch, return NULL.
-      return NULL;
-    }
-    // Next characters.
-    string++, start++;
-  }
-  // Return remainder of string.
-  return string;
-}
-
-// Inflates the attribute stream into individual values stored in the long
-// array _attributes. This allows an attribute value to be quickly accessed by
-// direct indexing.  Unspecified values default to zero (from constructor.)
-void ImageLocation::set_data(u1* data) {
-  // Deflate the attribute stream into an array of attributes.
-  u1 byte;
-  // Repeat until end header is found.
-  while ((byte = *data)) {
-    // Extract kind from header byte.
-    u1 kind = attribute_kind(byte);
-    guarantee(kind < ATTRIBUTE_COUNT, "invalid image location attribute");
-    // Extract length of data (in bytes).
-    u1 n = attribute_length(byte);
-    // Read value (most significant first.)
-    _attributes[kind] = attribute_value(data + 1, n);
-    // Position to next attribute by skipping attribute header and data bytes.
-    data += n + 1;
-  }
-}
-
-// Zero all attribute values.
-void ImageLocation::clear_data() {
-  // Set defaults to zero.
-  memset(_attributes, 0, sizeof(_attributes));
-}
-
-// ImageModuleData constructor maps out sub-tables for faster access.
-ImageModuleData::ImageModuleData(const ImageFileReader* image_file,
-        const char* module_data_name) :
-    _image_file(image_file),
-    _endian(image_file->endian()),
-    _strings(image_file->get_strings()) {
-  // Retrieve the resource containing the module data for the image file.
-  ImageLocation location;
-  bool found = image_file->find_location(module_data_name, location);
-  guarantee(found, "missing module data");
-  u8 data_size = location.get_attribute(ImageLocation::ATTRIBUTE_UNCOMPRESSED);
-  _data = (u1*)NEW_C_HEAP_ARRAY(char, data_size, mtClass);
-  _image_file->get_resource(location, _data);
-  // Map out the header.
-  _header = (Header*)_data;
-  // Get the package to module entry count.
-  u4 ptm_count = _header->ptm_count(_endian);
-  // Get the module to package entry count.
-  u4 mtp_count = _header->mtp_count(_endian);
-  // Compute the offset of the package to module perfect hash redirect.
-  u4 ptm_redirect_offset = sizeof(Header);
-  // Compute the offset of the package to module data.
-  u4 ptm_data_offset = ptm_redirect_offset + ptm_count * sizeof(s4);
-  // Compute the offset of the module to package perfect hash redirect.
-  u4 mtp_redirect_offset = ptm_data_offset + ptm_count * sizeof(PTMData);
-  // Compute the offset of the module to package data.
-  u4 mtp_data_offset = mtp_redirect_offset + mtp_count * sizeof(s4);
-  // Compute the offset of the module to package tables.
-  u4 mtp_packages_offset = mtp_data_offset + mtp_count * sizeof(MTPData);
-  // Compute the address of the package to module perfect hash redirect.
-  _ptm_redirect = (s4*)(_data + ptm_redirect_offset);
-  // Compute the address of the package to module data.
-  _ptm_data = (PTMData*)(_data + ptm_data_offset);
-  // Compute the address of the module to package perfect hash redirect.
-  _mtp_redirect = (s4*)(_data + mtp_redirect_offset);
-  // Compute the address of the module to package data.
-  _mtp_data = (MTPData*)(_data + mtp_data_offset);
-  // Compute the address of the module to package tables.
-  _mtp_packages = (s4*)(_data + mtp_packages_offset);
-}
-
-// Release module data resource.
-ImageModuleData::~ImageModuleData() {
-  if (_data != NULL) {
-    FREE_C_HEAP_ARRAY(u1, _data);
-  }
-}
-
-// Return the name of the module data resource.  Ex. "./lib/modules/file.jimage"
-// yields "file.jdata"
-void ImageModuleData::module_data_name(char* buffer, const char* image_file_name) {
-  // Locate the last slash in the file name path.
-  const char* slash = strrchr(image_file_name, os::file_separator()[0]);
-  // Trim the path to name and extension.
-  const char* name = slash != NULL ? slash + 1 : (char *)image_file_name;
-  // Locate the extension period.
-  const char* dot = strrchr(name, '.');
-  guarantee(dot, "missing extension on jimage name");
-  // Trim to only base name.
-  int length = dot - name;
-  strncpy(buffer, name, length);
-  buffer[length] = '\0';
-  // Append extension.
-  strcat(buffer, ".jdata");
-}
-
-// Return the module in which a package resides.  Returns NULL if not found.
-const char* ImageModuleData::package_to_module(const char* package_name) {
-  // Search the package to module table.
-  s4 index = ImageStrings::find(_endian, package_name, _ptm_redirect,
-                                  _header->ptm_count(_endian));
-  // If entry is found.
-  if (index != ImageStrings::NOT_FOUND) {
-    // Retrieve the package to module entry.
-    PTMData* data = _ptm_data + index;
-    // Verify that it is the correct data.
-    if (strcmp(package_name, get_string(data->name_offset(_endian))) != 0) {
-      return NULL;
-    }
-    // Return the module name.
-    return get_string(data->module_name_offset(_endian));
-  }
-  return NULL;
-}
-
-// Returns all the package names in a module.  Returns NULL if module not found.
-GrowableArray<const char*>* ImageModuleData::module_to_packages(const char* module_name) {
-  // Search the module to package table.
-  s4 index = ImageStrings::find(_endian, module_name, _mtp_redirect,
-                                  _header->mtp_count(_endian));
-  // If entry is found.
-  if (index != ImageStrings::NOT_FOUND) {
-    // Retrieve the module to package entry.
-    MTPData* data = _mtp_data + index;
-    // Verify that it is the correct data.
-    if (strcmp(module_name, get_string(data->name_offset(_endian))) != 0) {
-      return NULL;
-    }
-    // Construct an array of all the package entries.
-    GrowableArray<const char*>* packages = new GrowableArray<const char*>();
-    s4 package_offset = data->package_offset(_endian);
-    for (u4 i = 0; i < data->package_count(_endian); i++) {
-      u4 package_name_offset = mtp_package(package_offset + i);
-      const char* package_name = get_string(package_name_offset);
-      packages->append(package_name);
-    }
-    return packages;
-  }
-  return NULL;
-}
-
-// Table to manage multiple opens of an image file.
-GrowableArray<ImageFileReader*>* ImageFileReader::_reader_table =
-  new(ResourceObj::C_HEAP, mtInternal) GrowableArray<ImageFileReader*>(2, true);
-
-// Open an image file, reuse structure if file already open.
-ImageFileReader* ImageFileReader::open(const char* name, bool big_endian) {
-  // Lock out _reader_table.
-  MutexLocker ml(ImageFileReaderTable_lock);
-  ImageFileReader* reader;
-  // Search for an exist image file.
-  for (int i = 0; i < _reader_table->length(); i++) {
-    // Retrieve table entry.
-    reader = _reader_table->at(i);
-    // If name matches, then reuse (bump up use count.)
-    if (strcmp(reader->name(), name) == 0) {
-      reader->inc_use();
-      return reader;
-    }
-  }
-  // Need a new image reader.
-  reader = new ImageFileReader(name, big_endian);
-  bool opened = reader->open();
-  // If failed to open.
-  if (!opened) {
-    delete reader;
-    return NULL;
-  }
-  // Bump use count and add to table.
-  reader->inc_use();
-  _reader_table->append(reader);
-  return reader;
-}
-
-// Close an image file if the file is not in use elsewhere.
-void ImageFileReader::close(ImageFileReader *reader) {
-  // Lock out _reader_table.
-  MutexLocker ml(ImageFileReaderTable_lock);
-  // If last use then remove from table and then close.
-  if (reader->dec_use()) {
-    _reader_table->remove(reader);
-    delete reader;
-  }
-}
-
-// Return an id for the specifed ImageFileReader.
-u8 ImageFileReader::readerToID(ImageFileReader *reader) {
-  // ID is just the cloaked reader address.
-  return (u8)reader;
-}
-
-// Validate the image id.
-bool ImageFileReader::idCheck(u8 id) {
-  // Make sure the ID is a managed (_reader_table) reader.
-  MutexLocker ml(ImageFileReaderTable_lock);
-  return _reader_table->contains((ImageFileReader*)id);
-}
-
-// Return an id for the specifed ImageFileReader.
-ImageFileReader* ImageFileReader::idToReader(u8 id) {
-#ifdef PRODUCT
-  // Fast convert.
-  return (ImageFileReader*)id;
-#else
-  // Do a slow check before fast convert.
-  return idCheck(id) ? (ImageFileReader*)id : NULL;
-#endif
-}
-
-// Constructor intializes to a closed state.
-ImageFileReader::ImageFileReader(const char* name, bool big_endian) {
-  // Copy the image file name.
-  _name = NEW_C_HEAP_ARRAY(char, strlen(name) + 1, mtClass);
-  strcpy(_name, name);
-  // Initialize for a closed file.
-  _fd = -1;
-  _endian = Endian::get_handler(big_endian);
-  _index_data = NULL;
-}
-
-// Close image and free up data structures.
-ImageFileReader::~ImageFileReader() {
-  // Ensure file is closed.
-  close();
-  // Free up name.
-  if (_name != NULL) {
-    FREE_C_HEAP_ARRAY(char, _name);
-    _name = NULL;
-  }
-}
-
-// Open image file for read access.
-bool ImageFileReader::open() {
-  // If file exists open for reading.
-  struct stat st;
-  if (os::stat(_name, &st) != 0 ||
-    (st.st_mode & S_IFREG) != S_IFREG ||
-    (_fd = os::open(_name, 0, O_RDONLY)) == -1) {
-    return false;
-  }
-  // Retrieve the file size.
-  _file_size = (u8)st.st_size;
-  // Read image file header and verify it has a valid header.
-  size_t header_size = sizeof(ImageHeader);
-  if (_file_size < header_size ||
-    !read_at((u1*)&_header, header_size, 0) ||
-    _header.magic(_endian) != IMAGE_MAGIC ||
-    _header.major_version(_endian) != MAJOR_VERSION ||
-    _header.minor_version(_endian) != MINOR_VERSION) {
-    close();
-    return false;
-  }
-  // Size of image index.
-  _index_size = index_size();
-  // Make sure file is large enough to contain the index.
-  if (_file_size < _index_size) {
-    return false;
-  }
-  // Determine how much of the image is memory mapped.
-  off_t map_size = (off_t)(MemoryMapImage ? _file_size : _index_size);
-  // Memory map image (minimally the index.)
-  _index_data = (u1*)os::map_memory(_fd, _name, 0, NULL, map_size, true, false);
-  guarantee(_index_data, "image file not memory mapped");
-  // Retrieve length of index perfect hash table.
-  u4 length = table_length();
-  // Compute offset of the perfect hash table redirect table.
-  u4 redirect_table_offset = (u4)header_size;
-  // Compute offset of index attribute offsets.
-  u4 offsets_table_offset = redirect_table_offset + length * sizeof(s4);
-  // Compute offset of index location attribute data.
-  u4 location_bytes_offset = offsets_table_offset + length * sizeof(u4);
-  // Compute offset of index string table.
-  u4 string_bytes_offset = location_bytes_offset + locations_size();
-  // Compute address of the perfect hash table redirect table.
-  _redirect_table = (s4*)(_index_data + redirect_table_offset);
-  // Compute address of index attribute offsets.
-  _offsets_table = (u4*)(_index_data + offsets_table_offset);
-  // Compute address of index location attribute data.
-  _location_bytes = _index_data + location_bytes_offset;
-  // Compute address of index string table.
-  _string_bytes = _index_data + string_bytes_offset;
-  // Successful open.
-  return true;
-}
-
-// Close image file.
-void ImageFileReader::close() {
-  // Dealllocate the index.
-  if (_index_data != NULL) {
-    os::unmap_memory((char*)_index_data, _index_size);
-    _index_data = NULL;
-  }
-  // Close file.
-  if (_fd != -1) {
-    os::close(_fd);
-    _fd = -1;
-  }
-}
-
-// Read directly from the file.
-bool ImageFileReader::read_at(u1* data, u8 size, u8 offset) const {
-  return os::read_at(_fd, data, size, offset) == size;
-}
-
-// Find the location attributes associated with the path.  Returns true if
-// the location is found, false otherwise.
-bool ImageFileReader::find_location(const char* path, ImageLocation& location) const {
-  // Locate the entry in the index perfect hash table.
-  s4 index = ImageStrings::find(_endian, path, _redirect_table, table_length());
-  // If is found.
-  if (index != ImageStrings::NOT_FOUND) {
-    // Get address of first byte of location attribute stream.
-    u1* data = get_location_data(index);
-    // Expand location attributes.
-    location.set_data(data);
-    // Make sure result is not a false positive.
-    return verify_location(location, path);
-  }
-  return false;
-}
-
-// Assemble the location path from the string fragments indicated in the location attributes.
-void ImageFileReader::location_path(ImageLocation& location, char* path, size_t max) const {
-  // Manage the image string table.
-  ImageStrings strings(_string_bytes, _header.strings_size(_endian));
-  // Position to first character of the path buffer.
-  char* next = path;
-  // Temp for string length.
-  size_t length;
-  // Get module string.
-  const char* module = location.get_attribute(ImageLocation::ATTRIBUTE_MODULE, strings);
-  // If module string is not empty string.
-  if (*module != '\0') {
-    // Get length of module name.
-    length = strlen(module);
-    // Make sure there is no buffer overflow.
-    guarantee(next - path + length + 2 < max, "buffer overflow");
-    // Append '/module/'.
-    *next++ = '/';
-    strcpy(next, module); next += length;
-    *next++ = '/';
-  }
-  // Get parent (package) string.
-  const char* parent = location.get_attribute(ImageLocation::ATTRIBUTE_PARENT, strings);
-  // If parent string is not empty string.
-  if (*parent != '\0') {
-    // Get length of module string.
-    length = strlen(parent);
-    // Make sure there is no buffer overflow.
-    guarantee(next - path + length + 1 < max, "buffer overflow");
-    // Append 'patent/' .
-    strcpy(next, parent); next += length;
-    *next++ = '/';
-  }
-  // Get base name string.
-  const char* base = location.get_attribute(ImageLocation::ATTRIBUTE_BASE, strings);
-  // Get length of base name.
-  length = strlen(base);
-  // Make sure there is no buffer overflow.
-  guarantee(next - path + length < max, "buffer overflow");
-  // Append base name.
-  strcpy(next, base); next += length;
-  // Get extension string.
-  const char* extension = location.get_attribute(ImageLocation::ATTRIBUTE_EXTENSION, strings);
-  // If extension string is not empty string.
-  if (*extension != '\0') {
-    // Get length of extension string.
-    length = strlen(extension);
-    // Make sure there is no buffer overflow.
-    guarantee(next - path + length + 1 < max, "buffer overflow");
-    // Append '.extension' .
-    *next++ = '.';
-    strcpy(next, extension); next += length;
-  }
-  // Make sure there is no buffer overflow.
-  guarantee((size_t)(next - path) < max, "buffer overflow");
-  // Terminate string.
-  *next = '\0';
-}
-
-// Verify that a found location matches the supplied path (without copying.)
-bool ImageFileReader::verify_location(ImageLocation& location, const char* path) const {
-  // Manage the image string table.
-  ImageStrings strings(_string_bytes, _header.strings_size(_endian));
-  // Position to first character of the path string.
-  const char* next = path;
-  // Get module name string.
-  const char* module = location.get_attribute(ImageLocation::ATTRIBUTE_MODULE, strings);
-  // If module string is not empty.
-  if (*module != '\0') {
-    // Compare '/module/' .
-    if (*next++ != '/') return false;
-    if (!(next = ImageStrings::starts_with(next, module))) return false;
-    if (*next++ != '/') return false;
-  }
-  // Get parent (package) string
-  const char* parent = location.get_attribute(ImageLocation::ATTRIBUTE_PARENT, strings);
-  // If parent string is not empty string.
-  if (*parent != '\0') {
-    // Compare 'parent/' .
-    if (!(next = ImageStrings::starts_with(next, parent))) return false;
-    if (*next++ != '/') return false;
-  }
-  // Get base name string.
-  const char* base = location.get_attribute(ImageLocation::ATTRIBUTE_BASE, strings);
-  // Compare with basne name.
-  if (!(next = ImageStrings::starts_with(next, base))) return false;
-  // Get extension string.
-  const char* extension = location.get_attribute(ImageLocation::ATTRIBUTE_EXTENSION, strings);
-  // If extension is not empty.
-  if (*extension != '\0') {
-    // Compare '.extension' .
-    if (*next++ != '.') return false;
-    if (!(next = ImageStrings::starts_with(next, extension))) return false;
-  }
-  // True only if complete match and no more characters.
-  return *next == '\0';
-}
-
-// Return the resource data for the supplied location.
-void ImageFileReader::get_resource(ImageLocation& location, u1* uncompressed_data) const {
-  // Retrieve the byte offset and size of the resource.
-  u8 offset = location.get_attribute(ImageLocation::ATTRIBUTE_OFFSET);
-  u8 uncompressed_size = location.get_attribute(ImageLocation::ATTRIBUTE_UNCOMPRESSED);
-  u8 compressed_size = location.get_attribute(ImageLocation::ATTRIBUTE_COMPRESSED);
-  if (compressed_size != 0) {
-    ResourceMark rm;
-    u1* compressed_data;
-    // If not memory mapped read in bytes.
-    if (!MemoryMapImage) {
-      // Allocate buffer for compression.
-      compressed_data = NEW_RESOURCE_ARRAY(u1, compressed_size);
-      // Read bytes from offset beyond the image index.
-      bool is_read = read_at(compressed_data, compressed_size, _index_size + offset);
-      guarantee(is_read, "error reading from image or short read");
-    } else {
-      compressed_data = get_data_address() + offset;
-    }
-    // Get image string table.
-    const ImageStrings strings = get_strings();
-    // Decompress resource.
-    ImageDecompressor::decompress_resource(compressed_data, uncompressed_data, uncompressed_size,
-            &strings, false);
-  } else {
-    // Read bytes from offset beyond the image index.
-    bool is_read = read_at(uncompressed_data, uncompressed_size, _index_size + offset);
-    guarantee(is_read, "error reading from image or short read");
-  }
-}
--- a/src/share/vm/classfile/imageFile.hpp	Mon Sep 14 07:02:50 2015 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,602 +0,0 @@
-/*
- * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_VM_CLASSFILE_IMAGEFILE_HPP
-#define SHARE_VM_CLASSFILE_IMAGEFILE_HPP
-
-#include "classfile/classLoader.hpp"
-#include "memory/allocation.hpp"
-#include "memory/allocation.inline.hpp"
-#include "utilities/endian.hpp"
-#include "utilities/globalDefinitions.hpp"
-#include "utilities/growableArray.hpp"
-
-// Image files are an alternate file format for storing classes and resources. The
-// goal is to supply file access which is faster and smaller than the jar format.
-// It should be noted that unlike jars, information stored in an image is in native
-// endian format. This allows the image to be mapped into memory without endian
-// translation.  This also means that images are platform dependent.
-//
-// Image files are structured as three sections;
-//
-//         +-----------+
-//         |  Header   |
-//         +-----------+
-//         |           |
-//         |   Index   |
-//         |           |
-//         +-----------+
-//         |           |
-//         |           |
-//         | Resources |
-//         |           |
-//         |           |
-//         +-----------+
-//
-// The header contains information related to identification and description of
-// contents.
-//
-//         +-------------------------+
-//         |   Magic (0xCAFEDADA)    |
-//         +------------+------------+
-//         | Major Vers | Minor Vers |
-//         +------------+------------+
-//         |          Flags          |
-//         +-------------------------+
-//         |      Resource Count     |
-//         +-------------------------+
-//         |       Table Length      |
-//         +-------------------------+
-//         |      Attributes Size    |
-//         +-------------------------+
-//         |       Strings Size      |
-//         +-------------------------+
-//
-// Magic - means of identifying validity of the file.  This avoids requiring a
-//         special file extension.
-// Major vers, minor vers - differences in version numbers indicate structural
-//                          changes in the image.
-// Flags - various image wide flags (future).
-// Resource count - number of resources in the file.
-// Table length - the length of lookup tables used in the index.
-// Attributes size - number of bytes in the region used to store location attribute
-//                   streams.
-// Strings size - the size of the region used to store strings used by the
-//                index and meta data.
-//
-// The index contains information related to resource lookup. The algorithm
-// used for lookup is "A Practical Minimal Perfect Hashing Method"
-// (http://homepages.dcc.ufmg.br/~nivio/papers/wea05.pdf). Given a path string
-// in the form /<module>/<package>/<base>.<extension>  return the resource location
-// information;
-//
-//     redirectIndex = hash(path, DEFAULT_SEED) % table_length;
-//     redirect = redirectTable[redirectIndex];
-//     if (redirect == 0) return not found;
-//     locationIndex = redirect < 0 ? -1 - redirect : hash(path, redirect) % table_length;
-//     location = locationTable[locationIndex];
-//     if (!verify(location, path)) return not found;
-//     return location;
-//
-// Note: The hash function takes an initial seed value.  A different seed value
-// usually returns a different result for strings that would otherwise collide with
-// other seeds. The verify function guarantees the found resource location is
-// indeed the resource we are looking for.
-//
-// The following is the format of the index;
-//
-//         +-------------------+
-//         |   Redirect Table  |
-//         +-------------------+
-//         | Attribute Offsets |
-//         +-------------------+
-//         |   Attribute Data  |
-//         +-------------------+
-//         |      Strings      |
-//         +-------------------+
-//
-// Redirect Table - Array of 32-bit signed values representing actions that
-//                  should take place for hashed strings that map to that
-//                  value.  Negative values indicate no hash collision and can be
-//                  quickly converted to indices into attribute offsets.  Positive
-//                  values represent a new seed for hashing an index into attribute
-//                  offsets.  Zero indicates not found.
-// Attribute Offsets - Array of 32-bit unsigned values representing offsets into
-//                     attribute data.  Attribute offsets can be iterated to do a
-//                     full survey of resources in the image.  Offset of zero
-//                     indicates no attributes.
-// Attribute Data - Bytes representing compact attribute data for locations. (See
-//                  comments in ImageLocation.)
-// Strings - Collection of zero terminated UTF-8 strings used by the index and
-//           image meta data.  Each string is accessed by offset.  Each string is
-//           unique.  Offset zero is reserved for the empty string.
-//
-// Note that the memory mapped index assumes 32 bit alignment of each component
-// in the index.
-//
-// Endianness of an image.
-// An image booted by hotspot is always in native endian.  However, it is possible
-// for the JDK to read an image in the alternate endian format.  This primarily
-// occurs in cross-platform scenarios, e.g. where javac needs to read an embedded
-// image to access classes for cross compilation.
-//
-
-class ImageFileReader; // forward declaration
-
-// Manage image file string table.
-class ImageStrings VALUE_OBJ_CLASS_SPEC {
-private:
-  u1* _data; // Data bytes for strings.
-  u4 _size; // Number of bytes in the string table.
-public:
-  enum {
-    // Not found result from find routine.
-    NOT_FOUND = -1,
-    // Prime used to generate hash for Perfect Hashing.
-    HASH_MULTIPLIER = 0x01000193
-  };
-
-  ImageStrings(u1* data, u4 size) : _data(data), _size(size) {}
-
-  // Return the UTF-8 string beginning at offset.
-  inline const char* get(u4 offset) const {
-    guarantee(offset < _size, "offset exceeds string table size");
-    return (const char*)(_data + offset);
-  }
-
-  // Compute the Perfect Hashing hash code for the supplied UTF-8 string.
-  inline static u4 hash_code(const char* string) {
-    return hash_code(string, HASH_MULTIPLIER);
-  }
-
-  // Compute the Perfect Hashing hash code for the supplied string, starting at seed.
-  static s4 hash_code(const char* string, s4 seed);
-
-  // Match up a string in a perfect hash table.  Result still needs validation
-  // for precise match.
-  static s4 find(Endian* endian, const char* name, s4* redirect, u4 length);
-
-  // Test to see if UTF-8 string begins with the start UTF-8 string.  If so,
-  // return non-NULL address of remaining portion of string.  Otherwise, return
-  // NULL.  Used to test sections of a path without copying from image string
-  // table.
-  static const char* starts_with(const char* string, const char* start);
-
-  // Test to see if UTF-8 string begins with start char.  If so, return non-NULL
-  // address of remaining portion of string.  Otherwise, return NULL.  Used
-  // to test a character of a path without copying.
-  inline static const char* starts_with(const char* string, const char ch) {
-    return *string == ch ? string + 1 : NULL;
-  }
-};
-
-// Manage image file location attribute data.  Within an image, a location's
-// attributes are compressed into a stream of bytes.  An attribute stream is
-// composed of individual attribute sequences.  Each attribute sequence begins with
-// a header byte containing the attribute 'kind' (upper 5 bits of header) and the
-// 'length' less 1 (lower 3 bits of header) of bytes that follow containing the
-// attribute value.  Attribute values present as most significant byte first.
-//
-// Ex. Container offset (ATTRIBUTE_OFFSET) 0x33562 would be represented as 0x2A
-// (kind = 5, length = 3), 0x03, 0x35, 0x62.
-//
-// An attribute stream is terminated with a header kind of ATTRIBUTE_END (header
-// byte of zero.)
-//
-// ImageLocation inflates the stream into individual values stored in the long
-// array _attributes. This allows an attribute value to be quickly accessed by
-// direct indexing. Unspecified values default to zero.
-//
-// Notes:
-//  - Even though ATTRIBUTE_END is used to mark the end of the attribute stream,
-//    streams will contain zero byte values to represent lesser significant bits.
-//    Thus, detecting a zero byte is not sufficient to detect the end of an attribute
-//    stream.
-//  - ATTRIBUTE_OFFSET represents the number of bytes from the beginning of the region
-//    storing the resources.  Thus, in an image this represents the number of bytes
-//    after the index.
-//  - Currently, compressed resources are represented by having a non-zero
-//    ATTRIBUTE_COMPRESSED value.  This represents the number of bytes stored in the
-//    image, and the value of ATTRIBUTE_UNCOMPRESSED represents number of bytes of the
-//    inflated resource in memory. If the ATTRIBUTE_COMPRESSED is zero then the value
-//    of ATTRIBUTE_UNCOMPRESSED represents both the number of bytes in the image and
-//    in memory.  In the future, additional compression techniques will be used and
-//    represented differently.
-//  - Package strings include trailing slash and extensions include prefix period.
-//
-class ImageLocation VALUE_OBJ_CLASS_SPEC {
-public:
-  enum {
-    ATTRIBUTE_END,          // End of attribute stream marker
-    ATTRIBUTE_MODULE,       // String table offset of module name
-    ATTRIBUTE_PARENT,       // String table offset of resource path parent
-    ATTRIBUTE_BASE,         // String table offset of resource path base
-    ATTRIBUTE_EXTENSION,    // String table offset of resource path extension
-    ATTRIBUTE_OFFSET,       // Container byte offset of resource
-    ATTRIBUTE_COMPRESSED,   // In image byte size of the compressed resource
-    ATTRIBUTE_UNCOMPRESSED, // In memory byte size of the uncompressed resource
-    ATTRIBUTE_COUNT         // Number of attribute kinds
-  };
-
-private:
-  // Values of inflated attributes.
-  u8 _attributes[ATTRIBUTE_COUNT];
-
-  // Return the attribute value number of bytes.
-  inline static u1 attribute_length(u1 data) {
-    return (data & 0x7) + 1;
-  }
-
-  // Return the attribute kind.
-  inline static u1 attribute_kind(u1 data) {
-    u1 kind = data >> 3;
-    guarantee(kind < ATTRIBUTE_COUNT, "invalid attribute kind");
-    return kind;
-  }
-
-  // Return the attribute value assembled from the n bytes at data.
-  inline static u8 attribute_value(u1* data, u1 n) {
-    guarantee(0 < n && n <= 8, "invalid attribute value length");
-    u8 value = 0;
-    // Most significant bytes first.
-    for (u1 i = 0; i < n; i++) {
-      value <<= 8;
-      value |= data[i];
-    }
-    return value;
-  }
-
-public:
-  ImageLocation() {
-    clear_data();
-  }
-
-  ImageLocation(u1* data) {
-    clear_data();
-    set_data(data);
-  }
-
-  // Inflates the attribute stream into individual values stored in the long
-  // array _attributes. This allows an attribute value to be quickly accessed by
-  // direct indexing. Unspecified values default to zero.
-  void set_data(u1* data);
-
-  // Zero all attribute values.
-  void clear_data();
-
-  // Retrieve an attribute value from the inflated array.
-  inline u8 get_attribute(u1 kind) const {
-    guarantee(ATTRIBUTE_END < kind && kind < ATTRIBUTE_COUNT, "invalid attribute kind");
-    return _attributes[kind];
-  }
-
-  // Retrieve an attribute string value from the inflated array.
-  inline const char* get_attribute(u4 kind, const ImageStrings& strings) const {
-    return strings.get((u4)get_attribute(kind));
-  }
-};
-
-//
-// NOTE: needs revision.
-// Each loader requires set of module meta data to identify which modules and
-// packages are managed by that loader.  Currently, there is one image file per
-// builtin loader, so only one  module meta data resource per file.
-//
-// Each element in the module meta data is a native endian 4 byte integer.  Note
-// that entries with zero offsets for string table entries should be ignored (
-// padding for hash table lookup.)
-//
-// Format:
-//    Count of package to module entries
-//    Count of module to package entries
-//    Perfect Hash redirect table[Count of package to module entries]
-//    Package to module entries[Count of package to module entries]
-//        Offset to package name in string table
-//        Offset to module name in string table
-//    Perfect Hash redirect table[Count of module to package entries]
-//    Module to package entries[Count of module to package entries]
-//        Offset to module name in string table
-//        Count of packages in module
-//        Offset to first package in packages table
-//    Packages[]
-//        Offset to package name in string table
-//
-// Manage the image module meta data.
-class ImageModuleData : public CHeapObj<mtClass> {
-  class Header VALUE_OBJ_CLASS_SPEC {
-  private:
-    u4 _ptm_count;      // Count of package to module entries
-    u4 _mtp_count;      // Count of module to package entries
-  public:
-    inline u4 ptm_count(Endian* endian) const { return endian->get(_ptm_count); }
-    inline u4 mtp_count(Endian* endian) const { return endian->get(_mtp_count); }
-  };
-
-  // Hashtable entry
-  class HashData VALUE_OBJ_CLASS_SPEC {
-  private:
-    u4 _name_offset;    // Name offset in string table
-  public:
-    inline s4 name_offset(Endian* endian) const { return endian->get(_name_offset); }
-  };
-
-  // Package to module hashtable entry
-  class PTMData : public HashData {
-  private:
-    u4 _module_name_offset; // Module name offset in string table
-  public:
-    inline s4 module_name_offset(Endian* endian) const { return endian->get(_module_name_offset); }
-  };
-
-  // Module to package hashtable entry
-  class MTPData : public HashData {
-  private:
-    u4 _package_count;     // Number of packages in module
-    u4 _package_offset;    // Offset in package list
-  public:
-    inline u4 package_count(Endian* endian)  const { return endian->get(_package_count); }
-    inline u4 package_offset(Endian* endian) const { return endian->get(_package_offset); }
-  };
-
-  const ImageFileReader* _image_file; // Source image file
-  Endian* _endian;                    // Endian handler
-  ImageStrings _strings;              // Image file strings
-  u1* _data;                          // Module data resource data
-  u8 _data_size;                      // Size of resource data
-  Header* _header;                    // Module data header
-  s4* _ptm_redirect;                  // Package to module hashtable redirect
-  PTMData* _ptm_data;                 // Package to module data
-  s4* _mtp_redirect;                  // Module to packages hashtable redirect
-  MTPData* _mtp_data;                 // Module to packages data
-  s4* _mtp_packages;                  // Package data (name offsets)
-
-  // Return a string from the string table.
-  inline const char* get_string(u4 offset) {
-    return _strings.get(offset);
-  }
-
-  inline u4 mtp_package(u4 index) {
-    return _endian->get(_mtp_packages[index]);
-  }
-
-public:
-  ImageModuleData(const ImageFileReader* image_file, const char* module_data_name);
-  ~ImageModuleData();
-
-  // Return the name of the module data resource.
-  static void module_data_name(char* buffer, const char* image_file_name);
-
-  // Return the module in which a package resides.  Returns NULL if not found.
-  const char* package_to_module(const char* package_name);
-
-  // Returns all the package names in a module.  Returns NULL if module not found.
-  GrowableArray<const char*>* module_to_packages(const char* module_name);
-};
-
-// Image file header, starting at offset 0.
-class ImageHeader VALUE_OBJ_CLASS_SPEC {
-private:
-  u4 _magic;           // Image file marker
-  u4 _version;         // Image file version (major << 16 | minor)
-  u4 _flags;           // Image file flags
-  u4 _resource_count;  // Number of resources in file
-  u4 _table_length;    // Number of slots in index tables
-  u4 _locations_size;  // Number of bytes in attribute table
-  u4 _strings_size;    // Number of bytes in string table
-
-public:
-  u4 magic() const { return _magic; }
-  u4 magic(Endian* endian) const { return endian->get(_magic); }
-  void set_magic(Endian* endian, u4 magic) { return endian->set(_magic, magic); }
-
-  u4 major_version(Endian* endian) const { return endian->get(_version) >> 16; }
-  u4 minor_version(Endian* endian) const { return endian->get(_version) & 0xFFFF; }
-  void set_version(Endian* endian, u4 major_version, u4 minor_version) {
-    return endian->set(_version, major_version << 16 | minor_version);
-  }
-
-  u4 flags(Endian* endian) const { return endian->get(_flags); }
-  void set_flags(Endian* endian, u4 value) { return endian->set(_flags, value); }
-
-  u4 resource_count(Endian* endian) const { return endian->get(_resource_count); }
-  void set_resource_count(Endian* endian, u4 count) { return endian->set(_resource_count, count); }
-
-  u4 table_length(Endian* endian) const { return endian->get(_table_length); }
-  void set_table_length(Endian* endian, u4 count) { return endian->set(_table_length, count); }
-
-  u4 locations_size(Endian* endian) const { return endian->get(_locations_size); }
-  void set_locations_size(Endian* endian, u4 size) { return endian->set(_locations_size, size); }
-
-  u4 strings_size(Endian* endian) const { return endian->get(_strings_size); }
-  void set_strings_size(Endian* endian, u4 size) { return endian->set(_strings_size, size); }
-};
-
-// Max path length limit independent of platform.  Windows max path is 1024,
-// other platforms use 4096.  The JCK fails several tests when 1024 is used.
-#define IMAGE_MAX_PATH 4096
-
-// Manage the image file.
-// ImageFileReader manages the content of an image file.
-// Initially, the header of the image file is read for validation.  If valid,
-// values in the header are used to calculate the size of the image index.  The
-// index is then memory mapped to allow load on demand and sharing.  The
-// -XX:+MemoryMapImage flag determines if the entire file is loaded (server use.)
-// An image can be used by Hotspot and multiple reference points in the JDK, thus
-// it is desirable to share a reader.  To accommodate sharing, a share table is
-// defined (see ImageFileReaderTable in imageFile.cpp).  To track the number of
-// uses, ImageFileReader keeps a use count (_use).  Use is incremented when
-// 'opened' by reference point and decremented when 'closed'.  Use of zero
-// leads the ImageFileReader to be actually closed and discarded.
-class ImageFileReader : public CHeapObj<mtClass> {
-private:
-  // Manage a number of image files such that an image can be shared across
-  // multiple uses (ex. loader.)
-  static GrowableArray<ImageFileReader*>* _reader_table;
-
-  char* _name;         // Name of image
-  s4 _use;             // Use count
-  int _fd;             // File descriptor
-  Endian* _endian;     // Endian handler
-  u8 _file_size;       // File size in bytes
-  ImageHeader _header; // Image header
-  size_t _index_size;  // Total size of index
-  u1* _index_data;     // Raw index data
-  s4* _redirect_table; // Perfect hash redirect table
-  u4* _offsets_table;  // Location offset table
-  u1* _location_bytes; // Location attributes
-  u1* _string_bytes;   // String table
-
-  ImageFileReader(const char* name, bool big_endian);
-  ~ImageFileReader();
-
-  // Compute number of bytes in image file index.
-  inline u8 index_size() {
-    return sizeof(ImageHeader) +
-      table_length() * sizeof(u4) * 2 + locations_size() + strings_size();
-  }
-
-public:
-  enum {
-    // Image file marker.
-    IMAGE_MAGIC = 0xCAFEDADA,
-    // Endian inverted Image file marker.
-    IMAGE_MAGIC_INVERT = 0xDADAFECA,
-    // Image file major version number.
-    MAJOR_VERSION = 1,
-    // Image file minor version number.
-    MINOR_VERSION = 0
-  };
-
-  // Open an image file, reuse structure if file already open.
-  static ImageFileReader* open(const char* name, bool big_endian = Endian::is_big_endian());
-
-  // Close an image file if the file is not in use elsewhere.
-  static void close(ImageFileReader *reader);
-
-  // Return an id for the specified ImageFileReader.
-  static u8 readerToID(ImageFileReader *reader);
-
-  // Validate the image id.
-  static bool idCheck(u8 id);
-
-  // Return the ImageFileReader for the specified id.
-  static ImageFileReader* idToReader(u8 id);
-
-  // Open image file for read access.
-  bool open();
-
-  // Close image file.
-  void close();
-
-  // Read directly from the file.
-  bool read_at(u1* data, u8 size, u8 offset) const;
-
-  inline Endian* endian() const { return _endian; }
-
-  // Retrieve name of image file.
-  inline const char* name() const {
-    return _name;
-  }
-
-  // Retrieve size of image file.
-  inline u8 file_size() const {
-    return _file_size;
-  }
-
-  // Return first address of index data.
-  inline u1* get_index_address() const {
-    return _index_data;
-  }
-
-  // Return first address of resource data.
-  inline u1* get_data_address() const {
-    return _index_data + _index_size;
-  }
-
-  // Get the size of the index data.
-  size_t get_index_size() const {
-    return _index_size;
-  }
-
-  inline u4 table_length() const {
-    return _header.table_length(_endian);
-  }
-
-  inline u4 locations_size() const {
-    return _header.locations_size(_endian);
-  }
-
-  inline u4 strings_size()const  {
-    return _header.strings_size(_endian);
-  }
-
-  inline u4* offsets_table() const {
-    return _offsets_table;
-  }
-
-  // Increment use count.
-  inline void inc_use() {
-    _use++;
-  }
-
-  // Decrement use count.
-  inline bool dec_use() {
-    return --_use == 0;
-  }
-
-  // Return a string table accessor.
-  inline const ImageStrings get_strings() const {
-    return ImageStrings(_string_bytes, _header.strings_size(_endian));
-  }
-
-  // Return location attribute stream at offset.
-  inline u1* get_location_offset_data(u4 offset) const {
-    guarantee((u4)offset < _header.locations_size(_endian),
-              "offset exceeds location attributes size");
-    return offset != 0 ? _location_bytes + offset : NULL;
-  }
-
-  // Return location attribute stream for location i.
-  inline u1* get_location_data(u4 index) const {
-    guarantee((u4)index < _header.table_length(_endian),
-              "index exceeds location count");
-    u4 offset = _endian->get(_offsets_table[i