changeset 9326:fbac2a5639dc

Merge
author jwilhelm
date Mon, 21 Sep 2015 17:49:57 +0200
parents 9cd2f42c84c0 1ac336e4e8fe
children 5f8fa6465399 eca671f4c014
files
diffstat 36 files changed, 2196 insertions(+), 1440 deletions(-) [+]
line wrap: on
line diff
--- a/.hgtags	Tue Sep 22 14:24:31 2015 -0400
+++ b/.hgtags	Mon Sep 21 17:49:57 2015 +0200
@@ -483,3 +483,4 @@
 20dc06b04fe5ec373879414d60ef82ac70faef98 jdk9-b78
 e9e63d93bbfe2c6c23447e2c1f5cc71c98671cba jdk9-b79
 8e8377739c06b99b9011c003c77e0bef84c91e09 jdk9-b80
+4142c190cd5ca4fb70ec367b4f97ef936272d8ef jdk9-b81
--- a/src/cpu/aarch64/vm/aarch64.ad	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/aarch64/vm/aarch64.ad	Mon Sep 21 17:49:57 2015 +0200
@@ -3803,81 +3803,37 @@
 
   enc_class aarch64_enc_cmpxchg(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
     MacroAssembler _masm(&cbuf);
-    Register old_reg = as_Register($oldval$$reg);
-    Register new_reg = as_Register($newval$$reg);
-    Register base = as_Register($mem$$base);
-    Register addr_reg;
-    int index = $mem$$index;
-    int scale = $mem$$scale;
-    int disp = $mem$$disp;
-    if (index == -1) {
-       if (disp != 0) {
-        __ lea(rscratch2, Address(base, disp));
-        addr_reg = rscratch2;
-      } else {
-        // TODO
-        // should we ever get anything other than this case?
-        addr_reg = base;
-      }
-    } else {
-      Register index_reg = as_Register(index);
-      if (disp == 0) {
-        __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      } else {
-        __ lea(rscratch2, Address(base, disp));
-        __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      }
-    }
-    Label retry_load, done;
-    __ bind(retry_load);
-    __ ldxr(rscratch1, addr_reg);
-    __ cmp(rscratch1, old_reg);
-    __ br(Assembler::NE, done);
-    __ stlxr(rscratch1, new_reg, addr_reg);
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(done);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldxr, &MacroAssembler::cmp, &Assembler::stlxr);
   %}
 
   enc_class aarch64_enc_cmpxchgw(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
     MacroAssembler _masm(&cbuf);
-    Register old_reg = as_Register($oldval$$reg);
-    Register new_reg = as_Register($newval$$reg);
-    Register base = as_Register($mem$$base);
-    Register addr_reg;
-    int index = $mem$$index;
-    int scale = $mem$$scale;
-    int disp = $mem$$disp;
-    if (index == -1) {
-       if (disp != 0) {
-        __ lea(rscratch2, Address(base, disp));
-        addr_reg = rscratch2;
-      } else {
-        // TODO
-        // should we ever get anything other than this case?
-        addr_reg = base;
-      }
-    } else {
-      Register index_reg = as_Register(index);
-      if (disp == 0) {
-        __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      } else {
-        __ lea(rscratch2, Address(base, disp));
-        __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      }
-    }
-    Label retry_load, done;
-    __ bind(retry_load);
-    __ ldxrw(rscratch1, addr_reg);
-    __ cmpw(rscratch1, old_reg);
-    __ br(Assembler::NE, done);
-    __ stlxrw(rscratch1, new_reg, addr_reg);
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(done);
-  %}
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
+  %}
+
+
+  // The only difference between aarch64_enc_cmpxchg and
+  // aarch64_enc_cmpxchg_acq is that we use load-acquire in the
+  // CompareAndSwap sequence to serve as a barrier on acquiring a
+  // lock.
+  enc_class aarch64_enc_cmpxchg_acq(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
+    MacroAssembler _masm(&cbuf);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldaxr, &MacroAssembler::cmp, &Assembler::stlxr);
+  %}
+
+  enc_class aarch64_enc_cmpxchgw_acq(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
+    MacroAssembler _masm(&cbuf);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldaxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
+  %}
+
 
   // auxiliary used for CompareAndSwapX to set result register
   enc_class aarch64_enc_cset_eq(iRegINoSp res) %{
@@ -4398,13 +4354,10 @@
 
     // Compare object markOop with mark and if equal exchange scratch1
     // with object markOop.
-    // Note that this is simply a CAS: it does not generate any
-    // barriers.  These are separately generated by
-    // membar_acquire_lock().
     {
       Label retry_load;
       __ bind(retry_load);
-      __ ldxr(tmp, oop);
+      __ ldaxr(tmp, oop);
       __ cmp(tmp, disp_hdr);
       __ br(Assembler::NE, cas_failed);
       // use stlxr to ensure update is immediately visible
@@ -4454,7 +4407,7 @@
       {
         Label retry_load, fail;
         __ bind(retry_load);
-        __ ldxr(rscratch1, tmp);
+        __ ldaxr(rscratch1, tmp);
         __ cmp(disp_hdr, rscratch1);
         __ br(Assembler::NE, fail);
         // use stlxr to ensure update is immediately visible
@@ -8017,10 +7970,10 @@
   match(MemBarAcquireLock);
   ins_cost(VOLATILE_REF_COST);
 
-  format %{ "membar_acquire_lock" %}
-
-  ins_encode %{
-    __ membar(Assembler::LoadLoad|Assembler::LoadStore);
+  format %{ "membar_acquire_lock (elided)" %}
+
+  ins_encode %{
+    __ block_comment("membar_acquire_lock (elided)");
   %}
 
   ins_pipe(pipe_serial);
@@ -8080,10 +8033,10 @@
   match(MemBarReleaseLock);
   ins_cost(VOLATILE_REF_COST);
 
-  format %{ "membar_release_lock" %}
-
-  ins_encode %{
-    __ membar(Assembler::LoadStore|Assembler::StoreStore);
+  format %{ "membar_release_lock (elided)" %}
+
+  ins_encode %{
+    __ block_comment("membar_release_lock (elided)");
   %}
 
   ins_pipe(pipe_serial);
@@ -8369,7 +8322,11 @@
   ins_pipe(pipe_serial);
 %}
 
-// this has to be implemented as a CAS
+
+// storeLConditional is used by PhaseMacroExpand::expand_lock_node
+// when attempting to rebias a lock towards the current thread.  We
+// must use the acquire form of cmpxchg in order to guarantee acquire
+// semantics in this case.
 instruct storeLConditional(indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr)
 %{
   match(Set cr (StoreLConditional mem (Binary oldval newval)));
@@ -8381,12 +8338,14 @@
     "cmpw rscratch1, zr\t# EQ on successful write"
   %}
 
-  ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval));
+  ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval));
 
   ins_pipe(pipe_slow);
 %}
 
-// this has to be implemented as a CAS
+// storeIConditional also has acquire semantics, for no better reason
+// than matching storeLConditional.  At the time of writing this
+// comment storeIConditional was not used anywhere by AArch64.
 instruct storeIConditional(indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr)
 %{
   match(Set cr (StoreIConditional mem (Binary oldval newval)));
@@ -8398,7 +8357,7 @@
     "cmpw rscratch1, zr\t# EQ on successful write"
   %}
 
-  ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval));
+  ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval));
 
   ins_pipe(pipe_slow);
 %}
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Sep 21 17:49:57 2015 +0200
@@ -917,6 +917,8 @@
 
   void cmpptr(Register src1, Address src2);
 
+  // Various forms of CAS
+
   void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                   Label &suceed, Label *fail);
 
@@ -938,6 +940,23 @@
     str(rscratch2, adr);
   }
 
+  // A generic CAS; success or failure is in the EQ flag.
+  template <typename T1, typename T2>
+  void cmpxchg(Register addr, Register expected, Register new_val,
+               T1 load_insn,
+               void (MacroAssembler::*cmp_insn)(Register, Register),
+               T2 store_insn,
+               Register tmp = rscratch1) {
+    Label retry_load, done;
+    bind(retry_load);
+    (this->*load_insn)(tmp, addr);
+    (this->*cmp_insn)(tmp, expected);
+    br(Assembler::NE, done);
+    (this->*store_insn)(tmp, new_val, addr);
+    cbnzw(tmp, retry_load);
+    bind(done);
+  }
+
   // Calls
 
   address trampoline_call(Address entry, CodeBuffer *cbuf = NULL);
--- a/src/cpu/x86/vm/assembler_x86.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -394,25 +394,25 @@
   int mod_idx = 0;
   // We will test if the displacement fits the compressed format and if so
   // apply the compression to the displacment iff the result is8bit.
-  if (VM_Version::supports_evex() && is_evex_instruction) {
-    switch (tuple_type) {
+  if (VM_Version::supports_evex() && _is_evex_instruction) {
+    switch (_tuple_type) {
     case EVEX_FV:
-      if ((evex_encoding & VEX_W) == VEX_W) {
-        mod_idx += 2 + ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      if ((_evex_encoding & VEX_W) == VEX_W) {
+        mod_idx += 2 + ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
       } else {
-        mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+        mod_idx = ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
       }
       break;
 
     case EVEX_HV:
-      mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      mod_idx = ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
       break;
 
     case EVEX_FVM:
       break;
 
     case EVEX_T1S:
-      switch (input_size_in_bits) {
+      switch (_input_size_in_bits) {
       case EVEX_8bit:
         break;
 
@@ -433,7 +433,7 @@
     case EVEX_T1F:
     case EVEX_T2:
     case EVEX_T4:
-      mod_idx = (input_size_in_bits == EVEX_64bit) ? 1 : 0;
+      mod_idx = (_input_size_in_bits == EVEX_64bit) ? 1 : 0;
       break;
 
     case EVEX_T8:
@@ -459,8 +459,8 @@
       break;
     }
 
-    if (avx_vector_len >= AVX_128bit && avx_vector_len <= AVX_512bit) {
-      int disp_factor = tuple_table[tuple_type + mod_idx][avx_vector_len];
+    if (_avx_vector_len >= AVX_128bit && _avx_vector_len <= AVX_512bit) {
+      int disp_factor = tuple_table[_tuple_type + mod_idx][_avx_vector_len];
       if ((disp % disp_factor) == 0) {
         int new_disp = disp / disp_factor;
         if (is8bit(new_disp)) {
@@ -591,7 +591,7 @@
       emit_data(disp, rspec, disp32_operand);
     }
   }
-  is_evex_instruction = false;
+  _is_evex_instruction = false;
 }
 
 void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
@@ -1229,8 +1229,8 @@
 void Assembler::addsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x58, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
@@ -1245,8 +1245,8 @@
 void Assembler::addss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }
@@ -1254,16 +1254,16 @@
 void Assembler::aesdec(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDE);
   emit_operand(dst, src);
 }
 
 void Assembler::aesdec(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38,  /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDE);
   emit_int8(0xC0 | encode);
 }
@@ -1271,16 +1271,16 @@
 void Assembler::aesdeclast(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit,  /* legacy_mode */ true);
   emit_int8((unsigned char)0xDF);
   emit_operand(dst, src);
 }
 
 void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38,  /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDF);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1288,16 +1288,16 @@
 void Assembler::aesenc(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDC);
   emit_operand(dst, src);
 }
 
 void Assembler::aesenc(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDC);
   emit_int8(0xC0 | encode);
 }
@@ -1305,21 +1305,20 @@
 void Assembler::aesenclast(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit,  /* legacy_mode */ true);
   emit_int8((unsigned char)0xDD);
   emit_operand(dst, src);
 }
 
 void Assembler::aesenclast(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8((unsigned char)0xDD);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-
 void Assembler::andl(Address dst, int32_t imm32) {
   InstructionMark im(this);
   prefix(dst);
@@ -1347,7 +1346,7 @@
 
 void Assembler::andnl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1355,7 +1354,7 @@
 void Assembler::andnl(Register dst, Register src1, Address src2) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_legacy(dst, src1, src2, false);
+  vex_prefix_0F38_legacy(dst, src1, src2);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
@@ -1382,7 +1381,7 @@
 
 void Assembler::blsil(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1390,14 +1389,14 @@
 void Assembler::blsil(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_legacy(rbx, dst, src, false);
+  vex_prefix_0F38_legacy(rbx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1405,14 +1404,14 @@
 void Assembler::blsmskl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rdx, dst, src, false);
+  vex_prefix_0F38_legacy(rdx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rdx, src);
 }
 
 void Assembler::blsrl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src, false);
+  int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1420,7 +1419,7 @@
 void Assembler::blsrl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_legacy(rcx, dst, src, false);
+  vex_prefix_0F38_legacy(rcx, dst, src);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
@@ -1569,9 +1568,9 @@
   // 0x66 is there. Strangly ucomisd comes out correct
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
   }
@@ -1580,7 +1579,7 @@
 void Assembler::comisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true);
+    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
   }
@@ -1588,16 +1587,16 @@
 
 void Assembler::comiss(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::comiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::cpuid() {
@@ -1607,12 +1606,12 @@
 
 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3);
+  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3, /* no_mask_reg */ false, /* legacy_mode */ true);
 }
 
 void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE);
+  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ true);
 }
 
 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
@@ -1627,8 +1626,8 @@
 void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1F;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1F;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x5A, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
@@ -1637,12 +1636,7 @@
 
 void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = 0;
-  if (VM_Version::supports_evex()) {
-    encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true);
-  } else {
-    encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, false);
-  }
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VM_Version::supports_evex());
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1650,9 +1644,9 @@
 void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-    emit_simd_arith_q(0x2A, dst, src, VEX_SIMD_F2, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+    emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   } else {
     emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
   }
@@ -1660,23 +1654,23 @@
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true);
+  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
 }
 
 void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1688,8 +1682,8 @@
 
 void Assembler::cvtss2sd(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
@@ -1698,14 +1692,14 @@
 
 void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvttss2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1721,8 +1715,8 @@
 void Assembler::divsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
@@ -1740,8 +1734,8 @@
 
 void Assembler::divss(XMMRegister dst, Address src) {
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
@@ -1995,8 +1989,16 @@
 
 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex()) {
-    emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66, true);
+  if (VM_Version::supports_avx512novl()) {
+    int vector_len = AVX_512bit;
+    int dst_enc = dst->encoding();
+    int src_enc = src->encoding();
+    int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F,
+                                       /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+    emit_int8(0x28);
+    emit_int8((unsigned char)(0xC0 | encode));
+  } else if (VM_Version::supports_evex()) {
+    emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66);
   } else {
     emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
   }
@@ -2004,13 +2006,19 @@
 
 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
+  if (VM_Version::supports_avx512novl()) {
+    int vector_len = AVX_512bit;
+    int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, vector_len);
+    emit_int8(0x28);
+    emit_int8((unsigned char)(0xC0 | encode));
+  } else {
+    emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
+  }
 }
 
 void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, true, VEX_OPCODE_0F,
-                                      false, AVX_128bit);
+  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2023,48 +2031,54 @@
   emit_operand(dst, src);
 }
 
-void Assembler::kmovq(KRegister dst, KRegister src) {
+void Assembler::kmovql(KRegister dst, KRegister src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE,
-                                      true, VEX_OPCODE_0F, true);
+                                      /* no_mask_reg */ true, VEX_OPCODE_0F, /* rex_w */ true);
   emit_int8((unsigned char)0x90);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::kmovq(KRegister dst, Address src) {
+void Assembler::kmovql(KRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   int dst_enc = dst->encoding();
   int nds_enc = 0;
   vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_NONE,
-             VEX_OPCODE_0F, true, AVX_128bit, true, true);
+             VEX_OPCODE_0F, /* vex_w */  true, AVX_128bit, /* legacy_mode */ true, /* no_reg_mask */ true);
   emit_int8((unsigned char)0x90);
   emit_operand((Register)dst, src);
 }
 
-void Assembler::kmovq(Address dst, KRegister src) {
+void Assembler::kmovql(Address dst, KRegister src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   int src_enc = src->encoding();
   int nds_enc = 0;
   vex_prefix(dst, nds_enc, src_enc, VEX_SIMD_NONE,
-             VEX_OPCODE_0F, true, AVX_128bit, true, true);
+             VEX_OPCODE_0F, /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_reg_mask */ true);
   emit_int8((unsigned char)0x90);
   emit_operand((Register)src, dst);
 }
 
 void Assembler::kmovql(KRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  bool supports_bw = VM_Version::supports_avx512bw();
-  VexSimdPrefix pre = supports_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true,
-                                      VEX_OPCODE_0F, supports_bw);
+  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* legacy_mode */ !_legacy_mode_bw);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::kmovdl(KRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  VexSimdPrefix pre = VM_Version::supports_avx512bw() ? VEX_SIMD_F2 : VEX_SIMD_NONE;
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true, VEX_OPCODE_0F, false);
+  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, /* no_mask_reg */ true);
+  emit_int8((unsigned char)0x92);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovwl(KRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2088,7 +2102,7 @@
 
 void Assembler::movdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x6E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2096,7 +2110,7 @@
 void Assembler::movdl(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x7E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2104,11 +2118,11 @@
 void Assembler::movdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, true, VEX_OPCODE_0F);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_reg_mask */ true);
   emit_int8(0x6E);
   emit_operand(dst, src);
 }
@@ -2116,58 +2130,61 @@
 void Assembler::movdl(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_reg_mask */ true);
   emit_int8(0x7E);
   emit_operand(src, dst);
 }
 
 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movdqa(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movdqu(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::movdqu(Address dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3, false);
+    _tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ false);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
 
 // Move Unaligned 256bit Vector
 void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
   int vector_len = AVX_256bit;
   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
@@ -2175,67 +2192,100 @@
 }
 
 void Assembler::vmovdqu(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   InstructionMark im(this);
   int vector_len = AVX_256bit;
-  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
+  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
   emit_operand(dst, src);
 }
 
 void Assembler::vmovdqu(Address dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
+    _tuple_type = EVEX_FVM;
   }
   InstructionMark im(this);
   int vector_len = AVX_256bit;
   // swap src<->dst for encoding
   assert(src != xnoreg, "sanity");
-  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
+  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
 
 // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
-void Assembler::evmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
+void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F,
-                                     true, vector_len, false, false);
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evmovdqu(XMMRegister dst, Address src, int vector_len) {
+void Assembler::evmovdqul(XMMRegister dst, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-    vex_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
-  } else {
-    vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
-  }
+    _tuple_type = EVEX_FVM;
+  }
+  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
   emit_operand(dst, src);
 }
 
-void Assembler::evmovdqu(Address dst, XMMRegister src, int vector_len) {
+void Assembler::evmovdqul(Address dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "");
   InstructionMark im(this);
   assert(src != xnoreg, "sanity");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-    // swap src<->dst for encoding
-    vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
-  } else {
-    // swap src<->dst for encoding
-    vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
-  }
+    _tuple_type = EVEX_FVM;
+  }
+  // swap src<->dst for encoding
+  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
+void Assembler::evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(UseAVX > 0, "");
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  emit_int8(0x6F);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evmovdquq(XMMRegister dst, Address src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(UseAVX > 2, "");
+  InstructionMark im(this);
+  _tuple_type = EVEX_FVM;
+  vex_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::evmovdquq(Address dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(UseAVX > 2, "");
+  InstructionMark im(this);
+  assert(src != xnoreg, "sanity");
+  _tuple_type = EVEX_FVM;
+  // swap src<->dst for encoding
+  vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
@@ -2282,10 +2332,12 @@
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x12, dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+    emit_simd_arith_q(0x12, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
+  } else {
+    emit_simd_arith(0x12, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
+  }
 }
 
 void Assembler::movq( MMXRegister dst, Address src ) {
@@ -2312,11 +2364,11 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   } else {
-    simd_prefix(dst, src, VEX_SIMD_F3, true, VEX_OPCODE_0F);
+    simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   }
   emit_int8(0x7E);
   emit_operand(dst, src);
@@ -2326,12 +2378,12 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    simd_prefix(src, xnoreg, dst, VEX_SIMD_66, true,
-                VEX_OPCODE_0F, true, AVX_128bit);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    simd_prefix(src, xnoreg, dst, VEX_SIMD_66, /* no_mask_reg */ true,
+                VEX_OPCODE_0F, /* rex_w */ true);
   } else {
-    simd_prefix(dst, src, VEX_SIMD_66, true);
+    simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   }
   emit_int8((unsigned char)0xD6);
   emit_operand(src, dst);
@@ -2356,7 +2408,7 @@
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, true);
+    emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   } else {
     emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
   }
@@ -2365,9 +2417,9 @@
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    emit_simd_arith_nonds_q(0x10, dst, src, VEX_SIMD_F2, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x10, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
   }
@@ -2377,11 +2429,11 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     simd_prefix_q(src, xnoreg, dst, VEX_SIMD_F2);
   } else {
-    simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, false);
+    simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, /* no_mask_reg */ false);
   }
   emit_int8(0x11);
   emit_operand(src, dst);
@@ -2389,26 +2441,26 @@
 
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, true);
+  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
 }
 
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
 }
 
 void Assembler::movss(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3, false);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ false);
   emit_int8(0x11);
   emit_operand(src, dst);
 }
@@ -2501,8 +2553,8 @@
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x59, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
@@ -2521,8 +2573,8 @@
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
 }
@@ -2831,29 +2883,27 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x67, dst, src, VEX_SIMD_66,
-                  false, (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x67, dst, src, VEX_SIMD_66,
-                  false, (VM_Version::supports_avx512dq() == false));
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "some form of AVX must be enabled");
-  emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len,
-                 false, (VM_Version::supports_avx512dq() == false));
+  emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx2(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_3A, true, vector_len);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ true, vector_len);
   emit_int8(0x00);
   emit_int8(0xC0 | encode);
   emit_int8(imm8);
@@ -2867,8 +2917,8 @@
 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
   InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_3A,
-              false, AVX_128bit, true);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_3A,
+              /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x61);
   emit_operand(dst, src);
   emit_int8(imm8);
@@ -2876,8 +2926,8 @@
 
 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_3A, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x61);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2885,8 +2935,8 @@
 
 void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2894,8 +2944,8 @@
 
 void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */  true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ true, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2903,8 +2953,8 @@
 
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2912,8 +2962,8 @@
 
 void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
-                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ true, AVX_128bit, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2922,17 +2972,17 @@
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_HVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+    _tuple_type = EVEX_HVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_38);
   emit_int8(0x30);
   emit_operand(dst, src);
 }
 
 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_38);
   emit_int8(0x30);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3035,8 +3085,8 @@
 
 void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_ssse3(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38,
-                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x00);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3044,33 +3094,34 @@
 void Assembler::pshufb(XMMRegister dst, Address src) {
   assert(VM_Version::supports_ssse3(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38,
-              false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x00);
   emit_operand(dst, src);
 }
 
 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
+  _instruction_uses_vl = true;
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66);
   emit_int8(mode & 0xFF);
-
 }
 
 void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
+  _instruction_uses_vl = true;
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, false);
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x70);
   emit_operand(dst, src);
   emit_int8(mode & 0xFF);
@@ -3079,8 +3130,7 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, false,
-                        (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(mode & 0xFF);
 }
 
@@ -3089,29 +3139,33 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, false, VEX_OPCODE_0F,
-              false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, /* no_mask_reg */ false,
+              VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x70);
   emit_operand(dst, src);
   emit_int8(mode & 0xFF);
 }
 
 void Assembler::psrldq(XMMRegister dst, int shift) {
-  // Shift 128 bit value in xmm register by number of bytes.
+  // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  // XMM3 is for /3 encoding: 66 0F 73 /3 ib
+  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
 }
 
 void Assembler::pslldq(XMMRegister dst, int shift) {
-  // Shift left 128 bit value in xmm register by number of bytes.
+  // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  // XMM7 is for /7 encoding: 66 0F 73 /7 ib
+  int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
@@ -3121,16 +3175,16 @@
   assert(VM_Version::supports_sse4_1(), "");
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false,
-              VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+              VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3142,7 +3196,8 @@
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len, true, false);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* rex_w */ false,
+             vector_len, /* legacy_mode  */ true, /* no_mask_reg */ false);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
@@ -3150,8 +3205,7 @@
 void Assembler::vptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3160,34 +3214,41 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_vlbw);
 }
 
 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
+  emit_simd_arith(0x60, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_vlbw);
 }
 
 void Assembler::punpckldq(XMMRegister dst, Address src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x6C, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::push(int32_t imm32) {
@@ -3396,8 +3457,8 @@
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x51, dst, src, VEX_SIMD_F2);
   } else {
     emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
@@ -3416,8 +3477,8 @@
 void Assembler::sqrtss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
 }
@@ -3479,10 +3540,14 @@
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-  }
-  emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+  }
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
@@ -3493,8 +3558,8 @@
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
 }
@@ -3553,9 +3618,9 @@
 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
   }
@@ -3564,7 +3629,7 @@
 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true);
+    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   } else {
     emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
   }
@@ -3573,15 +3638,15 @@
 void Assembler::ucomiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true);
 }
 
 void Assembler::xabort(int8_t imm8) {
@@ -3664,8 +3729,8 @@
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3684,8 +3749,8 @@
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3698,8 +3763,8 @@
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3718,8 +3783,8 @@
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3732,8 +3797,8 @@
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3752,8 +3817,8 @@
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3766,8 +3831,8 @@
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
   } else {
     emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
@@ -3786,8 +3851,8 @@
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
@@ -3802,6 +3867,7 @@
 // Float-point vector arithmetic
 
 void Assembler::addpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x58, dst, src, VEX_SIMD_66);
@@ -3811,11 +3877,13 @@
 }
 
 void Assembler::addps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3825,15 +3893,17 @@
 }
 
 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3841,15 +3911,17 @@
 }
 
 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::subpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_66);
@@ -3859,11 +3931,13 @@
 }
 
 void Assembler::subps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3873,15 +3947,17 @@
 }
 
 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3889,15 +3965,17 @@
 }
 
 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x59, dst, src, VEX_SIMD_66);
@@ -3907,11 +3985,13 @@
 }
 
 void Assembler::mulps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3921,15 +4001,17 @@
 }
 
 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3937,15 +4019,17 @@
 }
 
 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::divpd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_66);
@@ -3955,11 +4039,13 @@
 }
 
 void Assembler::divps(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE);
 }
 
 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3969,15 +4055,17 @@
 }
 
 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -3985,164 +4073,178 @@
 }
 
 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
+void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(VM_Version::supports_avx(), "");
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vsqrtpd(XMMRegister dst, Address src, int vector_len) {
+  _instruction_uses_vl = true;
+  assert(VM_Version::supports_avx(), "");
+  if (VM_Version::supports_evex()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len);
+  }
+}
+
 void Assembler::andpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::andps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, false,
-                  (VM_Version::supports_avx512dq() == false));
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::andps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE,
-                  false, (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::andpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  bool legacy_mode = (VM_Version::supports_avx512dq() == false);
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, legacy_mode);
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false,  /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len,
-                 (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::xorps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE,
-                  false, (VM_Version::supports_avx512dq() == false));
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::xorpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66);
   } else {
-    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true);
+    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::xorps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, false,
-                  (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+  if (VM_Version::supports_avx512dq()) {
     emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len,
-                 (VM_Version::supports_avx512dq() == false));
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+  if (VM_Version::supports_avx512dq()) {
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
-    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true);
+    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true);
   }
 }
 
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
-  }
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len,
-                 (VM_Version::supports_avx512dq() == false));
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
 // Integer vector arithmetic
 void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len,
-                                     VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4150,28 +4252,29 @@
 void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len,
-                                     VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::paddb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::paddw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xFD, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xFD, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::paddd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xFE, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::paddq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xD4, dst, src, VEX_SIMD_66);
@@ -4182,38 +4285,38 @@
 
 void Assembler::phaddw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse3(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse3(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4225,33 +4328,35 @@
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len);
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len);
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4260,20 +4365,22 @@
 
 void Assembler::psubb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF8, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xF8, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psubw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF9, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xF9, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psubd(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xFA, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::psubq(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xFB, dst, src, VEX_SIMD_66);
@@ -4284,22 +4391,22 @@
 
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4311,35 +4418,35 @@
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
     emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
   } else {
     emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
@@ -4348,28 +4455,27 @@
 
 void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_sse4_1(), "");
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66,
-                                      false, VEX_OPCODE_0F_38);
+                                      /* no_mask_reg */ false, VEX_OPCODE_0F_38);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4379,8 +4485,8 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4388,22 +4494,23 @@
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FVM;
-  }
-  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len);
+    _tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   InstructionMark im(this);
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66,
-             VEX_OPCODE_0F_38, false, vector_len);
+             VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -4411,13 +4518,14 @@
 void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_64bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_64bit;
   }
   InstructionMark im(this);
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66,
+             VEX_OPCODE_0F_38, /* vex_w */ true, vector_len, /* legacy_mode */ _legacy_mode_dq);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -4426,26 +4534,28 @@
 void Assembler::psllw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
-                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F,
+                                      /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::pslld(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false);
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psllq(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F, /* rex_w */ true);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -4453,16 +4563,17 @@
 
 void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, false,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::pslld(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66);
 }
 
 void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xF3, dst, shift, VEX_SIMD_66);
@@ -4474,12 +4585,12 @@
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
-  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
   emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector_len);
@@ -4487,6 +4598,7 @@
 }
 
 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
   if (VM_Version::supports_evex()) {
@@ -4499,16 +4611,17 @@
 
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xF3, dst, src, shift, VEX_SIMD_66, vector_len);
@@ -4521,33 +4634,31 @@
 void Assembler::psrlw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
-                                      (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psrld(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 72 /2 ib
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false);
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psrlq(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   // Do not confuse it with psrldq SSE2 instruction which
   // shifts 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
-  int encode = 0;
-  if (VM_Version::supports_evex() && VM_Version::supports_avx512bw()) {
-    encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false);
-  } else {
-    encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true);
-  }
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F, /* rex_w */ VM_Version::supports_evex());
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -4555,16 +4666,17 @@
 
 void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, false,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psrld(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66);
 }
 
 void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
     emit_simd_arith_q(0xD3, dst, shift, VEX_SIMD_66);
@@ -4575,20 +4687,21 @@
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
-  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  // XMM2 is for /2 encoding: 66 0F 71 /2 ib
+  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
-  // XMM2 is for /2 encoding: 66 0F 73 /2 ib
+  // XMM2 is for /2 encoding: 66 0F 72 /2 ib
   emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector_len);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
   if (VM_Version::supports_evex()) {
@@ -4601,16 +4714,17 @@
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
     emit_vex_arith_q(0xD3, dst, src, shift, VEX_SIMD_66, vector_len);
@@ -4623,17 +4737,18 @@
 void Assembler::psraw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
-  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
-                                      (VM_Version::supports_avx512bw() == false));
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::psrad(XMMRegister dst, int shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM4 is for /4 encoding: 66 0F 72 /4 ib
-  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false);
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -4641,11 +4756,11 @@
 
 void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66,
-                  (VM_Version::supports_avx512bw() == false));
+  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66);
 }
@@ -4653,12 +4768,12 @@
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
-  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
   emit_int8(shift & 0xFF);
 }
 
 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
   emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector_len);
@@ -4667,11 +4782,11 @@
 
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len,
-                 (VM_Version::supports_avx512bw() == false));
+  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw);
 }
 
 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
@@ -4684,53 +4799,61 @@
 }
 
 void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::por(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xEB, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xEF, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(UseAVX > 0, "requires some form of AVX");
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_FV;
-    input_size_in_bits = EVEX_32bit;
+    _tuple_type = EVEX_FV;
+    _input_size_in_bits = EVEX_32bit;
   }
   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len);
 }
@@ -4739,6 +4862,9 @@
 void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4753,8 +4879,8 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66,
-                                     VEX_OPCODE_0F_3A, true, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x1A);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
@@ -4763,35 +4889,70 @@
 }
 
 void Assembler::vinsertf64x4h(XMMRegister dst, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_64bit;
-  }
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   int vector_len = AVX_512bit;
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, true, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ true, vector_len);
   emit_int8(0x1A);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
   emit_int8(0x01);
 }
 
-void Assembler::vinsertf128h(XMMRegister dst, Address src) {
-  assert(VM_Version::supports_avx(), "");
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+void Assembler::vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  emit_int8(0x18);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vinsertf32x4h(XMMRegister dst, Address src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  int vector_len = AVX_512bit;
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vinsertf128h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
@@ -4801,6 +4962,9 @@
 void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4811,15 +4975,16 @@
 
 void Assembler::vextractf128h(Address dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
+  int vector_len = AVX_256bit;
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
-  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x19);
   emit_operand(src, dst);
   // 0x01 - extract from upper 128 bits
@@ -4829,6 +4994,9 @@
 void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4844,7 +5012,7 @@
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_reg_mask */ false);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
@@ -4854,16 +5022,17 @@
 
 void Assembler::vinserti128h(XMMRegister dst, Address src) {
   assert(VM_Version::supports_avx2(), "");
+  int vector_len = AVX_256bit;
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x38);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
@@ -4873,6 +5042,9 @@
 void Assembler::vextracti128h(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
+  if (VM_Version::supports_evex()) {
+    vector_len = AVX_512bit;
+  }
   int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4883,15 +5055,16 @@
 
 void Assembler::vextracti128h(Address dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
+  int vector_len = AVX_256bit;
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T4;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  int vector_len = AVX_256bit;
+    _tuple_type = EVEX_T4;
+    _input_size_in_bits = EVEX_32bit;
+    vector_len = AVX_512bit;
+  }
+  InstructionMark im(this);
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
-  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
   emit_int8(0x39);
   emit_operand(src, dst);
   // 0x01 - extract from upper 128 bits
@@ -4904,7 +5077,7 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     true, vector_len, false, false);
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x3B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from upper 256 bits
@@ -4916,8 +5089,14 @@
   int vector_len = AVX_512bit;
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
-  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+  int encode;
+  if (VM_Version::supports_avx512dq()) {
+    encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                   /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  } else {
+    encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                   /* vex_w */ false, vector_len, /* legacy_mode */ true, /* no_mask_reg */ false);
+  }
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from bits 255:128
@@ -4932,7 +5111,7 @@
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
   int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x1B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from upper 256 bits
@@ -4940,18 +5119,18 @@
 }
 
 void Assembler::vextractf64x4h(Address dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
-  tuple_type = EVEX_T4;
-  input_size_in_bits = EVEX_64bit;
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   int vector_len = AVX_512bit;
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
   vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-             VM_Version::supports_avx512dq(), vector_len);
+             /* vex_w */ true, vector_len);
   emit_int8(0x1B);
   emit_operand(src, dst);
-  // 0x01 - extract from upper 128 bits
+  // 0x01 - extract from upper 256 bits
   emit_int8(0x01);
 }
 
@@ -4960,8 +5139,42 @@
   int vector_len = AVX_512bit;
   int src_enc = src->encoding();
   int dst_enc = dst->encoding();
-  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66,
-                                     VEX_OPCODE_0F_3A, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vextractf32x4h(Address dst, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  _tuple_type = EVEX_T4;
+  _input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  int vector_len = AVX_512bit;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len);
+  emit_int8(0x19);
+  emit_operand(src, dst);
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ !_legacy_mode_dq, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x01 - extract from bits 255:128
@@ -4970,195 +5183,192 @@
   emit_int8(value & 0x3);
 }
 
-void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) {
-  assert(VM_Version::supports_evex(), "");
-  int vector_len = AVX_512bit;
-  int src_enc = src->encoding();
-  int dst_enc = dst->encoding();
-  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
-                                     VM_Version::supports_avx512dq(), vector_len, false, false);
-  emit_int8(0x19);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x01 - extract from bits 255:128
-  // 0x02 - extract from bits 383:256
-  // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
-}
-
 // duplicate 4-bytes integer data from src into 8 locations in dest
 void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
   int vector_len = AVX_256bit;
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x78);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_8bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_8bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x78);
   emit_operand(dst, src);
 }
 
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_16bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_16bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x79);
   emit_operand(dst, src);
 }
 
 // duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_32bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_32bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
 
 // duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_64bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ true, vector_len);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
 
 // duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_32bit;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_32bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len);
   emit_int8(0x18);
   emit_operand(dst, src);
 }
 
 // duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /*vex_w */ true, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  tuple_type = EVEX_T1S;
-  input_size_in_bits = EVEX_64bit;
+  _instruction_uses_vl = true;
+  assert(UseAVX > 1, "");
+  _tuple_type = EVEX_T1S;
+  _input_size_in_bits = EVEX_64bit;
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ true, vector_len);
   emit_int8(0x19);
   emit_operand(dst, src);
 }
 
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /*vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7B);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
+  _instruction_uses_vl = true;
   assert(VM_Version::supports_evex(), "");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
-                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38,
+                                     /* vex_w */ true, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false);
   emit_int8(0x7C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5166,8 +5376,8 @@
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
-                                      VEX_OPCODE_0F_3A, false, AVX_128bit, true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false,
+                                      VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -5177,8 +5387,7 @@
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
   int vector_len = AVX_128bit;
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_3A, true);
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A, /* legacy_mode */ true);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -5737,7 +5946,7 @@
                             int vector_len, bool no_mask_reg ){
   // EVEX 0x62 prefix
   prefix(EVEX_4bytes);
-  evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0);
+  _evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0);
 
   // P0: byte 2, initialized to RXBR`00mm
   // instead of not'd
@@ -5776,10 +5985,10 @@
   bool vex_r = ((xreg_enc & 8) == 8) ? 1 : 0;
   bool vex_b = adr.base_needs_rex();
   bool vex_x = adr.index_needs_rex();
-  avx_vector_len = vector_len;
-
-  // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit
-  if (VM_Version::supports_avx512vl() == false) {
+  _avx_vector_len = vector_len;
+
+  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
+  if (_legacy_mode_vl && _instruction_uses_vl) {
     switch (vector_len) {
     case AVX_128bit:
     case AVX_256bit:
@@ -5792,11 +6001,12 @@
   {
     bool evex_r = (xreg_enc >= 16);
     bool evex_v = (nds_enc >= 16);
-    is_evex_instruction = true;
+    _is_evex_instruction = true;
     evex_prefix(vex_r, vex_b, vex_x, vex_w, evex_r, evex_v, nds_enc, pre, opc, false, false, vector_len, no_mask_reg);
   } else {
     vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len);
   }
+  _instruction_uses_vl = false;
 }
 
 int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
@@ -5804,10 +6014,10 @@
   bool vex_r = ((dst_enc & 8) == 8) ? 1 : 0;
   bool vex_b = ((src_enc & 8) == 8) ? 1 : 0;
   bool vex_x = false;
-  avx_vector_len = vector_len;
-
-  // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit
-  if (VM_Version::supports_avx512vl() == false) {
+  _avx_vector_len = vector_len;
+
+  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
+  if (_legacy_mode_vl && _instruction_uses_vl) {
     switch (vector_len) {
     case AVX_128bit:
     case AVX_256bit:
@@ -5827,6 +6037,8 @@
     vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len);
   }
 
+  _instruction_uses_vl = false;
+
   // return modrm byte components for operands
   return (((dst_enc & 7) << 3) | (src_enc & 7));
 }
@@ -5915,13 +6127,13 @@
 }
 
 void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) {
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, legacy_mode, AVX_128bit);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, false, AVX_128bit, legacy_mode);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true, AVX_128bit);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5945,7 +6157,7 @@
 
 void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src,
                                VexSimdPrefix pre, int vector_len, bool no_mask_reg, bool legacy_mode) {
-  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, VEX_OPCODE_0F, false, no_mask_reg);
+  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, VEX_OPCODE_0F, legacy_mode, no_mask_reg);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6594,7 +6806,7 @@
 
 void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6602,11 +6814,11 @@
 void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix_q(dst, dst, src, VEX_SIMD_F2, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_operand(dst, src);
 }
@@ -6614,25 +6826,25 @@
 void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_32bit;
-  }
-  InstructionMark im(this);
-  simd_prefix_q(dst, dst, src, VEX_SIMD_F3, true);
+    _tuple_type = EVEX_T1S;
+    _input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true);
   emit_int8(0x2A);
   emit_operand(dst, src);
 }
 
 void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvttss2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, /* no_mask_reg */ true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6668,6 +6880,13 @@
   emit_operand(as_Register(1), src);
 }
 
+void Assembler::xrstor(Address src) {
+  prefixq(src);
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAE);
+  emit_operand(as_Register(5), src);
+}
+
 void Assembler::fxsave(Address dst) {
   prefixq(dst);
   emit_int8(0x0F);
@@ -6675,6 +6894,13 @@
   emit_operand(as_Register(0), dst);
 }
 
+void Assembler::xsave(Address dst) {
+  prefixq(dst);
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAE);
+  emit_operand(as_Register(4), dst);
+}
+
 void Assembler::idivq(Register src) {
   int encode = prefixq_and_encode(src->encoding());
   emit_int8((unsigned char)0xF7);
@@ -6801,7 +7027,7 @@
 void Assembler::movdq(XMMRegister dst, Register src) {
   // table D-1 says MMX/SSE2
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x6E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6810,7 +7036,7 @@
   // table D-1 says MMX/SSE2
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, true);
+  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, /* no_mask_reg */ true);
   emit_int8(0x7E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -6943,8 +7169,8 @@
 
 void Assembler::mulxq(Register dst1, Register dst2, Register src) {
   assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(),
-                                     VEX_SIMD_F2, VEX_OPCODE_0F_38, true, AVX_128bit, true, false);
+  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38,
+                                    /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ false);
   emit_int8((unsigned char)0xF6);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -7106,8 +7332,8 @@
 
 void Assembler::rorxq(Register dst, Register src, int imm8) {
   assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2,
-                                     VEX_OPCODE_0F_3A, true, AVX_128bit, true, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A,
+                                     /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ false);
   emit_int8((unsigned char)0xF0);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
--- a/src/cpu/x86/vm/assembler_x86.hpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Mon Sep 21 17:49:57 2015 +0200
@@ -438,7 +438,9 @@
 
 };
 
-const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);
+// 64-bit refect the fxsave size which is 512 bytes and the new xsave area on EVEX which is another 2176 bytes
+// See fxsave and xsave(EVEX enabled) documentation for layout
+const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);
 
 // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
 // level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
@@ -594,11 +596,16 @@
 
 private:
 
-  int evex_encoding;
-  int input_size_in_bits;
-  int avx_vector_len;
-  int tuple_type;
-  bool is_evex_instruction;
+  int _evex_encoding;
+  int _input_size_in_bits;
+  int _avx_vector_len;
+  int _tuple_type;
+  bool _is_evex_instruction;
+  bool _legacy_mode_bw;
+  bool _legacy_mode_dq;
+  bool _legacy_mode_vl;
+  bool _legacy_mode_vlbw;
+  bool _instruction_uses_vl;
 
   // 64bit prefixes
   int prefix_and_encode(int reg_enc, bool byteinst = false);
@@ -972,11 +979,16 @@
   // belong in macro assembler but there is no need for both varieties to exist
 
   void init_attributes(void) {
-    evex_encoding = 0;
-    input_size_in_bits = 0;
-    avx_vector_len = AVX_NoVec;
-    tuple_type = EVEX_ETUP;
-    is_evex_instruction = false;
+    _evex_encoding = 0;
+    _input_size_in_bits = 0;
+    _avx_vector_len = AVX_NoVec;
+    _tuple_type = EVEX_ETUP;
+    _is_evex_instruction = false;
+    _legacy_mode_bw = (VM_Version::supports_avx512bw() == false);
+    _legacy_mode_dq = (VM_Version::supports_avx512dq() == false);
+    _legacy_mode_vl = (VM_Version::supports_avx512vl() == false);
+    _legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false);
+    _instruction_uses_vl = false;
   }
 
   void lea(Register dst, Address src);
@@ -1344,8 +1356,10 @@
   void fxch(int i = 1);
 
   void fxrstor(Address src);
+  void xrstor(Address src);
 
   void fxsave(Address dst);
+  void xsave(Address dst);
 
   void fyl2x();
   void frndint();
@@ -1479,11 +1493,12 @@
   void movb(Address dst, int imm8);
   void movb(Register dst, Address src);
 
-  void kmovq(KRegister dst, KRegister src);
+  void kmovql(KRegister dst, KRegister src);
   void kmovql(KRegister dst, Register src);
   void kmovdl(KRegister dst, Register src);
-  void kmovq(Address dst, KRegister src);
-  void kmovq(KRegister dst, Address src);
+  void kmovwl(KRegister dst, Register src);
+  void kmovql(Address dst, KRegister src);
+  void kmovql(KRegister dst, Address src);
 
   void movdl(XMMRegister dst, Register src);
   void movdl(Register dst, XMMRegister src);
@@ -1509,9 +1524,12 @@
   void vmovdqu(XMMRegister dst, XMMRegister src);
 
    // Move Unaligned 512bit Vector
-  void evmovdqu(Address dst, XMMRegister src, int vector_len);
-  void evmovdqu(XMMRegister dst, Address src, int vector_len);
-  void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdqul(Address dst, XMMRegister src, int vector_len);
+  void evmovdqul(XMMRegister dst, Address src, int vector_len);
+  void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdquq(Address dst, XMMRegister src, int vector_len);
+  void evmovdquq(XMMRegister dst, Address src, int vector_len);
+  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
 
   // Move lower 64bit to high 64bit in 128bit register
   void movlhps(XMMRegister dst, XMMRegister src);
@@ -1643,6 +1661,7 @@
 
   // Pemutation of 64bit words
   void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
+  void vpermq(XMMRegister dst, XMMRegister src, int imm8);
 
   void pause();
 
@@ -1920,6 +1939,10 @@
   void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  // Sqrt Packed Floating-Point Values - Double precision only
+  void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
+  void vsqrtpd(XMMRegister dst, Address src, int vector_len);
+
   // Bitwise Logical AND of Packed Floating-Point Values
   void andpd(XMMRegister dst, XMMRegister src);
   void andps(XMMRegister dst, XMMRegister src);
@@ -2057,6 +2080,9 @@
   void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
   void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
   void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
+  void vextractf32x4h(Address dst, XMMRegister src, int value);
+  void vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
+  void vinsertf32x4h(XMMRegister dst, Address src, int value);
 
   // duplicate 4-bytes integer data from src into 8 locations in dest
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
--- a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -3798,16 +3798,24 @@
     if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
       __ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
     }
-    __ xorps(dest->as_xmm_float_reg(),
-             ExternalAddress((address)float_signflip_pool));
-
+    if (UseAVX > 1) {
+      __ vnegatess(dest->as_xmm_float_reg(), dest->as_xmm_float_reg(),
+                   ExternalAddress((address)float_signflip_pool));
+    } else {
+      __ xorps(dest->as_xmm_float_reg(),
+               ExternalAddress((address)float_signflip_pool));
+    }
   } else if (dest->is_double_xmm()) {
     if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
       __ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
     }
-    __ xorpd(dest->as_xmm_double_reg(),
-             ExternalAddress((address)double_signflip_pool));
-
+    if (UseAVX > 1) {
+      __ vnegatesd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg(),
+                   ExternalAddress((address)double_signflip_pool));
+    } else {
+      __ xorpd(dest->as_xmm_double_reg(),
+               ExternalAddress((address)double_signflip_pool));
+    }
   } else if (left->is_single_fpu() || left->is_double_fpu()) {
     assert(left->fpu() == 0, "arg must be on TOS");
     assert(dest->fpu() == 0, "dest must be TOS");
--- a/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -401,11 +401,9 @@
 
     } else if (UseSSE == 1) {
       int xmm_off = xmm_regs_as_doubles_off;
-      for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
-        if (n < xmm_bypass_limit) {
-          VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
-          map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
-        }
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
+        map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
         xmm_off += 2;
       }
       assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm registers");
@@ -452,14 +450,11 @@
       __ frstor(Address(rsp, fpu_state_off * VMRegImpl::stack_slot_size));
 
       // Save the FPU registers in de-opt-able form
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
-      __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
+      int offset = 0;
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
+        offset += 8;
+      }
     }
 
     if (UseSSE >= 2) {
@@ -468,52 +463,26 @@
       // so always save them as doubles.
       // note that float values are _not_ converted automatically, so for float values
       // the second word contains only garbage data.
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0), xmm0);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8), xmm1);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
+      int xmm_bypass_limit = FrameMap::nof_xmm_regs;
+      int offset = 0;
 #ifdef _LP64
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64), xmm8);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72), xmm9);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80), xmm10);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88), xmm11);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96), xmm12);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14);
-      __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15);
-      if (UseAVX > 2) {
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), xmm26);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30);
-        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31);
+      if (UseAVX < 3) {
+        xmm_bypass_limit = xmm_bypass_limit / 2;
       }
-#endif // _LP64
+#endif
+      for (int n = 0; n < xmm_bypass_limit; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
+        offset += 8;
+      }
     } else if (UseSSE == 1) {
-      // save XMM registers as float because double not supported without SSE2
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0), xmm0);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8), xmm1);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
-      __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
+      // save XMM registers as float because double not supported without SSE2(num MMX == num fpu)
+      int offset = 0;
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
+        offset += 8;
+      }
     }
   }
 
@@ -528,52 +497,26 @@
   if (restore_fpu_registers) {
     if (UseSSE >= 2) {
       // restore XMM registers
-      __ movdbl(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0));
-      __ movdbl(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8));
-      __ movdbl(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
-      __ movdbl(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
-      __ movdbl(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
-      __ movdbl(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
-      __ movdbl(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
-      __ movdbl(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
+      int xmm_bypass_limit = FrameMap::nof_xmm_regs;
 #ifdef _LP64
-      __ movdbl(xmm8, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64));
-      __ movdbl(xmm9, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72));
-      __ movdbl(xmm10, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80));
-      __ movdbl(xmm11, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88));
-      __ movdbl(xmm12, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96));
-      __ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104));
-      __ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112));
-      __ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120));
-      if (UseAVX > 2) {
-        __ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128));
-        __ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136));
-        __ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144));
-        __ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152));
-        __ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160));
-        __ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168));
-        __ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176));
-        __ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184));
-        __ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192));
-        __ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200));
-        __ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208));
-        __ movdbl(xmm27, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216));
-        __ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224));
-        __ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232));
-        __ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240));
-        __ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248));
+      if (UseAVX < 3) {
+        xmm_bypass_limit = xmm_bypass_limit / 2;
       }
-#endif // _LP64
+#endif
+      int offset = 0;
+      for (int n = 0; n < xmm_bypass_limit; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movdbl(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
+        offset += 8;
+      }
     } else if (UseSSE == 1) {
-      // restore XMM registers
-      __ movflt(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  0));
-      __ movflt(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size +  8));
-      __ movflt(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
-      __ movflt(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
-      __ movflt(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
-      __ movflt(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
-      __ movflt(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
-      __ movflt(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
+      // restore XMM registers(num MMX == num fpu)
+      int offset = 0;
+      for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        __ movflt(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
+        offset += 8;
+      }
     }
 
     if (UseSSE < 2) {
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -3751,8 +3751,31 @@
 }
 
 void MacroAssembler::pop_FPU_state() {
-  NOT_LP64(frstor(Address(rsp, 0));)
-  LP64_ONLY(fxrstor(Address(rsp, 0));)
+#ifndef _LP64
+  frstor(Address(rsp, 0));
+#else
+  // AVX will continue to use the fxsave area.
+  // EVEX needs to utilize the xsave area, which is under different
+  // management.
+  if(VM_Version::supports_evex()) {
+    // EDX:EAX describe the XSAVE header and
+    // are obtained while fetching info for XCR0 via cpuid.
+    // These two registers make up 64-bits in the header for which bits
+    // 62:10 are currently reserved for future implementations and unused.  Bit 63
+    // is unused for our implementation as we do not utilize
+    // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
+    // the functionality for PKRU state and MSR tracing.
+    // Ergo we are primarily concerned with bits 7..0, which define
+    // which ISA extensions and features are enabled for a given machine and are
+    // defined in XemXcr0Eax and is used to map the XSAVE area
+    // for restoring registers as described via XCR0.
+    movl(rdx,VM_Version::get_xsave_header_upper_segment());
+    movl(rax,VM_Version::get_xsave_header_lower_segment());
+    xrstor(Address(rsp, 0));
+  } else {
+    fxrstor(Address(rsp, 0));
+  }
+#endif
   addptr(rsp, FPUStateSizeInWords * wordSize);
 }
 
@@ -3769,13 +3792,49 @@
   push_FPU_state();
 }
 
+#ifdef _LP64
+#define XSTATE_BV 0x200
+#endif
+
 void MacroAssembler::push_FPU_state() {
   subptr(rsp, FPUStateSizeInWords * wordSize);
 #ifndef _LP64
   fnsave(Address(rsp, 0));
   fwait();
 #else
-  fxsave(Address(rsp, 0));
+  // AVX will continue to use the fxsave area.
+  // EVEX needs to utilize the xsave area, which is under different
+  // management.
+  if(VM_Version::supports_evex()) {
+    // Save a copy of EAX and EDX
+    push(rax);
+    push(rdx);
+    // EDX:EAX describe the XSAVE header and
+    // are obtained while fetching info for XCR0 via cpuid.
+    // These two registers make up 64-bits in the header for which bits
+    // 62:10 are currently reserved for future implementations and unused.  Bit 63
+    // is unused for our implementation as we do not utilize
+    // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
+    // the functionality for PKRU state and MSR tracing.
+    // Ergo we are primarily concerned with bits 7..0, which define
+    // which ISA extensions and features are enabled for a given machine and are
+    // defined in XemXcr0Eax and is used to program XSAVE area
+    // for saving the required registers as defined in XCR0.
+    int xcr0_edx = VM_Version::get_xsave_header_upper_segment();
+    int xcr0_eax = VM_Version::get_xsave_header_lower_segment();
+    movl(rdx,xcr0_edx);
+    movl(rax,xcr0_eax);
+    xsave(Address(rsp, wordSize*2));
+    // now Apply control bits and clear bytes 8..23 in the header
+    pop(rdx);
+    pop(rax);
+    movl(Address(rsp, XSTATE_BV), xcr0_eax);
+    movl(Address(rsp, XSTATE_BV+4), xcr0_edx);
+    andq(Address(rsp, XSTATE_BV+8), 0);
+    andq(Address(rsp, XSTATE_BV+16), 0);
+  } else {
+    fxsave(Address(rsp, 0));
+  }
 #endif // LP64
 }
 
@@ -4082,6 +4141,84 @@
   }
 }
 
+void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  int nds_enc = nds->encoding();
+  int dst_enc = dst->encoding();
+  bool dst_upper_bank = (dst_enc > 15);
+  bool nds_upper_bank = (nds_enc > 15);
+  if (VM_Version::supports_avx512novl() &&
+      (nds_upper_bank || dst_upper_bank)) {
+    if (dst_upper_bank) {
+      subptr(rsp, 64);
+      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+      movflt(xmm0, nds);
+      if (reachable(src)) {
+        vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+      movflt(dst, xmm0);
+      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+      addptr(rsp, 64);
+    } else {
+      movflt(dst, nds);
+      if (reachable(src)) {
+        vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+    }
+  } else {
+    if (reachable(src)) {
+      vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit);
+    } else {
+      lea(rscratch1, src);
+      vxorps(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
+    }
+  }
+}
+
+void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  int nds_enc = nds->encoding();
+  int dst_enc = dst->encoding();
+  bool dst_upper_bank = (dst_enc > 15);
+  bool nds_upper_bank = (nds_enc > 15);
+  if (VM_Version::supports_avx512novl() &&
+      (nds_upper_bank || dst_upper_bank)) {
+    if (dst_upper_bank) {
+      subptr(rsp, 64);
+      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+      movdbl(xmm0, nds);
+      if (reachable(src)) {
+        vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+      movdbl(dst, xmm0);
+      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+      addptr(rsp, 64);
+    } else {
+      movdbl(dst, nds);
+      if (reachable(src)) {
+        vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
+      } else {
+        lea(rscratch1, src);
+        vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
+      }
+    }
+  } else {
+    if (reachable(src)) {
+      vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit);
+    } else {
+      lea(rscratch1, src);
+      vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
+    }
+  }
+}
+
 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
   if (reachable(src)) {
     vxorpd(dst, nds, as_Address(src), vector_len);
@@ -4318,7 +4455,6 @@
 void MacroAssembler::store_check(Register obj) {
   // Does a store check for the oop in register obj. The content of
   // register obj is destroyed afterwards.
-
   BarrierSet* bs = Universe::heap()->barrier_set();
   assert(bs->kind() == BarrierSet::CardTableForRS ||
          bs->kind() == BarrierSet::CardTableExtension,
@@ -4572,69 +4708,58 @@
 
   // if we are coming from c1, xmm registers may be live
   int off = 0;
+  int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
+  if (UseAVX > 2) {
+    num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
+  }
+
   if (UseSSE == 1)  {
     subptr(rsp, sizeof(jdouble)*8);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
-    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
+    for (int n = 0; n < 8; n++) {
+      movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n));
+    }
   } else if (UseSSE >= 2)  {
     if (UseAVX > 2) {
+      push(rbx);
       movl(rbx, 0xffff);
-#ifdef _LP64
-      kmovql(k1, rbx);
-#else
-      kmovdl(k1, rbx);
-#endif
+      kmovwl(k1, rbx);
+      pop(rbx);
     }
 #ifdef COMPILER2
     if (MaxVectorSize > 16) {
-      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
+      if(UseAVX > 2) {
+        // Save upper half of ZMM registes
+        subptr(rsp, 32*num_xmm_regs);
+        for (int n = 0; n < num_xmm_regs; n++) {
+          vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
+        }
+        off = 0;
+      }
+      assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
       // Save upper half of YMM registes
-      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
-      vextractf128h(Address(rsp,  0),xmm0);
-      vextractf128h(Address(rsp, 16),xmm1);
-      vextractf128h(Address(rsp, 32),xmm2);
-      vextractf128h(Address(rsp, 48),xmm3);
-      vextractf128h(Address(rsp, 64),xmm4);
-      vextractf128h(Address(rsp, 80),xmm5);
-      vextractf128h(Address(rsp, 96),xmm6);
-      vextractf128h(Address(rsp,112),xmm7);
-#ifdef _LP64
-      vextractf128h(Address(rsp,128),xmm8);
-      vextractf128h(Address(rsp,144),xmm9);
-      vextractf128h(Address(rsp,160),xmm10);
-      vextractf128h(Address(rsp,176),xmm11);
-      vextractf128h(Address(rsp,192),xmm12);
-      vextractf128h(Address(rsp,208),xmm13);
-      vextractf128h(Address(rsp,224),xmm14);
-      vextractf128h(Address(rsp,240),xmm15);
-#endif
+      subptr(rsp, 16*num_xmm_regs);
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
+      }
     }
 #endif
-    // Save whole 128bit (16 bytes) XMM regiters
-    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
-    movdqu(Address(rsp,off++*16),xmm0);
-    movdqu(Address(rsp,off++*16),xmm1);
-    movdqu(Address(rsp,off++*16),xmm2);
-    movdqu(Address(rsp,off++*16),xmm3);
-    movdqu(Address(rsp,off++*16),xmm4);
-    movdqu(Address(rsp,off++*16),xmm5);
-    movdqu(Address(rsp,off++*16),xmm6);
-    movdqu(Address(rsp,off++*16),xmm7);
+    // Save whole 128bit (16 bytes) XMM registers
+    subptr(rsp, 16*num_xmm_regs);
+    off = 0;
 #ifdef _LP64
-    movdqu(Address(rsp,off++*16),xmm8);
-    movdqu(Address(rsp,off++*16),xmm9);
-    movdqu(Address(rsp,off++*16),xmm10);
-    movdqu(Address(rsp,off++*16),xmm11);
-    movdqu(Address(rsp,off++*16),xmm12);
-    movdqu(Address(rsp,off++*16),xmm13);
-    movdqu(Address(rsp,off++*16),xmm14);
-    movdqu(Address(rsp,off++*16),xmm15);
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0);
+      }
+    } else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        movdqu(Address(rsp, off++*16), as_XMMRegister(n));
+      }
+    }
+#else
+    for (int n = 0; n < num_xmm_regs; n++) {
+      movdqu(Address(rsp, off++*16), as_XMMRegister(n));
+    }
 #endif
   }
 
@@ -4689,7 +4814,7 @@
   movsd(Address(rsp, 0), xmm0);
   fld_d(Address(rsp, 0));
 #endif // _LP64
-  addptr(rsp, sizeof(jdouble) * nb_args);
+  addptr(rsp, sizeof(jdouble)*nb_args);
   if (num_fpu_regs_in_use > 1) {
     // Must save return value to stack and then restore entire FPU
     // stack except incoming arguments
@@ -4699,63 +4824,50 @@
       addptr(rsp, sizeof(jdouble));
     }
     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
-    addptr(rsp, sizeof(jdouble) * nb_args);
+    addptr(rsp, sizeof(jdouble)*nb_args);
   }
 
   off = 0;
   if (UseSSE == 1)  {
-    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
-    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+    for (int n = 0; n < 8; n++) {
+      movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble)));
+    }
     addptr(rsp, sizeof(jdouble)*8);
   } else if (UseSSE >= 2)  {
     // Restore whole 128bit (16 bytes) XMM regiters
-    movdqu(xmm0, Address(rsp,off++*16));
-    movdqu(xmm1, Address(rsp,off++*16));
-    movdqu(xmm2, Address(rsp,off++*16));
-    movdqu(xmm3, Address(rsp,off++*16));
-    movdqu(xmm4, Address(rsp,off++*16));
-    movdqu(xmm5, Address(rsp,off++*16));
-    movdqu(xmm6, Address(rsp,off++*16));
-    movdqu(xmm7, Address(rsp,off++*16));
 #ifdef _LP64
-    movdqu(xmm8, Address(rsp,off++*16));
-    movdqu(xmm9, Address(rsp,off++*16));
-    movdqu(xmm10, Address(rsp,off++*16));
-    movdqu(xmm11, Address(rsp,off++*16));
-    movdqu(xmm12, Address(rsp,off++*16));
-    movdqu(xmm13, Address(rsp,off++*16));
-    movdqu(xmm14, Address(rsp,off++*16));
-    movdqu(xmm15, Address(rsp,off++*16));
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0);
+      }
+    }
+    else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        movdqu(as_XMMRegister(n), Address(rsp, off++*16));
+      }
+    }
+#else
+    for (int n = 0; n < num_xmm_regs; n++) {
+      movdqu(as_XMMRegister(n), Address(rsp, off++ * 16));
+    }
 #endif
-    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+    addptr(rsp, 16*num_xmm_regs);
+
 #ifdef COMPILER2
     if (MaxVectorSize > 16) {
       // Restore upper half of YMM registes.
-      vinsertf128h(xmm0, Address(rsp,  0));
-      vinsertf128h(xmm1, Address(rsp, 16));
-      vinsertf128h(xmm2, Address(rsp, 32));
-      vinsertf128h(xmm3, Address(rsp, 48));
-      vinsertf128h(xmm4, Address(rsp, 64));
-      vinsertf128h(xmm5, Address(rsp, 80));
-      vinsertf128h(xmm6, Address(rsp, 96));
-      vinsertf128h(xmm7, Address(rsp,112));
-#ifdef _LP64
-      vinsertf128h(xmm8, Address(rsp,128));
-      vinsertf128h(xmm9, Address(rsp,144));
-      vinsertf128h(xmm10, Address(rsp,160));
-      vinsertf128h(xmm11, Address(rsp,176));
-      vinsertf128h(xmm12, Address(rsp,192));
-      vinsertf128h(xmm13, Address(rsp,208));
-      vinsertf128h(xmm14, Address(rsp,224));
-      vinsertf128h(xmm15, Address(rsp,240));
-#endif
-      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
+      off = 0;
+      for (int n = 0; n < num_xmm_regs; n++) {
+        vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
+      }
+      addptr(rsp, 16*num_xmm_regs);
+      if(UseAVX > 2) {
+        off = 0;
+        for (int n = 0; n < num_xmm_regs; n++) {
+          vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
+        }
+        addptr(rsp, 32*num_xmm_regs);
+      }
     }
 #endif
   }
@@ -7095,11 +7207,7 @@
       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
       if (UseAVX > 2) {
         movl(rtmp, 0xffff);
-#ifdef _LP64
-        kmovql(k1, rtmp);
-#else
-        kmovdl(k1, rtmp);
-#endif
+        kmovwl(k1, rtmp);
       }
       movdl(xtmp, value);
       if (UseAVX > 2 && UseUnalignedLoadStores) {
@@ -7112,7 +7220,7 @@
         align(16);
 
         BIND(L_fill_64_bytes_loop);
-        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
+        evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
         addptr(to, 64);
         subl(count, 16 << shift);
         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
@@ -7120,7 +7228,7 @@
         BIND(L_check_fill_32_bytes);
         addl(count, 8 << shift);
         jccb(Assembler::less, L_check_fill_8_bytes);
-        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
+        evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit);
         addptr(to, 32);
         subl(count, 8 << shift);
 
@@ -8399,6 +8507,14 @@
   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
 
+  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+  // context for the registers used, where all instructions below are using 128-bit mode
+  // On EVEX without VL and BW, these instructions will all be AVX.
+  if (VM_Version::supports_avx512vlbw()) {
+    movl(tmp, 0xffff);
+    kmovwl(k1, tmp);
+  }
+
   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
   notl(crc); // ~crc
   cmpl(len, 16);
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Sep 21 17:49:57 2015 +0200
@@ -1069,6 +1069,9 @@
   void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
+  void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+  void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
   // AVX Vector instructions
 
   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -115,6 +115,7 @@
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words,
                                            int* total_frame_words, bool verify_fpu, bool save_vectors) {
   int vect_words = 0;
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 #ifdef COMPILER2
   if (save_vectors) {
     assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
@@ -173,59 +174,50 @@
     __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
   }
 
+  int off = st0_off;
+  int delta = st1_off - off;
+
   // Save the FPU registers in de-opt-able form
-
-  __ fstp_d(Address(rsp, st0_off*wordSize)); // st(0)
-  __ fstp_d(Address(rsp, st1_off*wordSize)); // st(1)
-  __ fstp_d(Address(rsp, st2_off*wordSize)); // st(2)
-  __ fstp_d(Address(rsp, st3_off*wordSize)); // st(3)
-  __ fstp_d(Address(rsp, st4_off*wordSize)); // st(4)
-  __ fstp_d(Address(rsp, st5_off*wordSize)); // st(5)
-  __ fstp_d(Address(rsp, st6_off*wordSize)); // st(6)
-  __ fstp_d(Address(rsp, st7_off*wordSize)); // st(7)
-
-  if( UseSSE == 1 ) {           // Save the XMM state
-    __ movflt(Address(rsp,xmm0_off*wordSize),xmm0);
-    __ movflt(Address(rsp,xmm1_off*wordSize),xmm1);
-    __ movflt(Address(rsp,xmm2_off*wordSize),xmm2);
-    __ movflt(Address(rsp,xmm3_off*wordSize),xmm3);
-    __ movflt(Address(rsp,xmm4_off*wordSize),xmm4);
-    __ movflt(Address(rsp,xmm5_off*wordSize),xmm5);
-    __ movflt(Address(rsp,xmm6_off*wordSize),xmm6);
-    __ movflt(Address(rsp,xmm7_off*wordSize),xmm7);
-  } else if( UseSSE >= 2 ) {
+  for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
+    __ fstp_d(Address(rsp, off*wordSize));
+    off += delta;
+  }
+
+  off = xmm0_off;
+  delta = xmm1_off - off;
+  if(UseSSE == 1) {           // Save the XMM state
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movflt(Address(rsp, off*wordSize), as_XMMRegister(n));
+      off += delta;
+    }
+  } else if(UseSSE >= 2) {
     // Save whole 128bit (16 bytes) XMM regiters
-    __ movdqu(Address(rsp,xmm0_off*wordSize),xmm0);
-    __ movdqu(Address(rsp,xmm1_off*wordSize),xmm1);
-    __ movdqu(Address(rsp,xmm2_off*wordSize),xmm2);
-    __ movdqu(Address(rsp,xmm3_off*wordSize),xmm3);
-    __ movdqu(Address(rsp,xmm4_off*wordSize),xmm4);
-    __ movdqu(Address(rsp,xmm5_off*wordSize),xmm5);
-    __ movdqu(Address(rsp,xmm6_off*wordSize),xmm6);
-    __ movdqu(Address(rsp,xmm7_off*wordSize),xmm7);
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vextractf32x4h(Address(rsp, off*wordSize), as_XMMRegister(n), 0);
+        off += delta;
+      }
+    } else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ movdqu(Address(rsp, off*wordSize), as_XMMRegister(n));
+        off += delta;
+      }
+    }
   }
 
   if (vect_words > 0) {
     assert(vect_words*wordSize == 128, "");
     __ subptr(rsp, 128); // Save upper half of YMM registes
-    __ vextractf128h(Address(rsp,  0),xmm0);
-    __ vextractf128h(Address(rsp, 16),xmm1);
-    __ vextractf128h(Address(rsp, 32),xmm2);
-    __ vextractf128h(Address(rsp, 48),xmm3);
-    __ vextractf128h(Address(rsp, 64),xmm4);
-    __ vextractf128h(Address(rsp, 80),xmm5);
-    __ vextractf128h(Address(rsp, 96),xmm6);
-    __ vextractf128h(Address(rsp,112),xmm7);
+    off = 0;
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
+    }
     if (UseAVX > 2) {
       __ subptr(rsp, 256); // Save upper half of ZMM registes
-      __ vextractf64x4h(Address(rsp, 0), xmm0);
-      __ vextractf64x4h(Address(rsp, 32), xmm1);
-      __ vextractf64x4h(Address(rsp, 64), xmm2);
-      __ vextractf64x4h(Address(rsp, 96), xmm3);
-      __ vextractf64x4h(Address(rsp, 128), xmm4);
-      __ vextractf64x4h(Address(rsp, 160), xmm5);
-      __ vextractf64x4h(Address(rsp, 192), xmm6);
-      __ vextractf64x4h(Address(rsp, 224), xmm7);
+      off = 0;
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
+      }
     }
   }
 
@@ -238,58 +230,40 @@
   OopMap* map =  new OopMap( frame_words, 0 );
 
 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_words)
-
-  map->set_callee_saved(STACK_OFFSET( rax_off), rax->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rcx_off), rcx->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rdx_off), rdx->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rbx_off), rbx->as_VMReg());
+#define NEXTREG(x) (x)->as_VMReg()->next()
+
+  map->set_callee_saved(STACK_OFFSET(rax_off), rax->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rcx_off), rcx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rdx_off), rdx->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rbx_off), rbx->as_VMReg());
   // rbp, location is known implicitly, no oopMap
-  map->set_callee_saved(STACK_OFFSET( rsi_off), rsi->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET( rdi_off), rdi->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st0_off), as_FloatRegister(0)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st1_off), as_FloatRegister(1)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st2_off), as_FloatRegister(2)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st3_off), as_FloatRegister(3)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st4_off), as_FloatRegister(4)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st5_off), as_FloatRegister(5)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st6_off), as_FloatRegister(6)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(st7_off), as_FloatRegister(7)->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm0_off), xmm0->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm1_off), xmm1->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm2_off), xmm2->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm3_off), xmm3->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm4_off), xmm4->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm5_off), xmm5->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm6_off), xmm6->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm7_off), xmm7->as_VMReg());
-  // %%% This is really a waste but we'll keep things as they were for now
-  if (true) {
-#define NEXTREG(x) (x)->as_VMReg()->next()
-    map->set_callee_saved(STACK_OFFSET(st0H_off), NEXTREG(as_FloatRegister(0)));
-    map->set_callee_saved(STACK_OFFSET(st1H_off), NEXTREG(as_FloatRegister(1)));
-    map->set_callee_saved(STACK_OFFSET(st2H_off), NEXTREG(as_FloatRegister(2)));
-    map->set_callee_saved(STACK_OFFSET(st3H_off), NEXTREG(as_FloatRegister(3)));
-    map->set_callee_saved(STACK_OFFSET(st4H_off), NEXTREG(as_FloatRegister(4)));
-    map->set_callee_saved(STACK_OFFSET(st5H_off), NEXTREG(as_FloatRegister(5)));
-    map->set_callee_saved(STACK_OFFSET(st6H_off), NEXTREG(as_FloatRegister(6)));
-    map->set_callee_saved(STACK_OFFSET(st7H_off), NEXTREG(as_FloatRegister(7)));
-    map->set_callee_saved(STACK_OFFSET(xmm0H_off), NEXTREG(xmm0));
-    map->set_callee_saved(STACK_OFFSET(xmm1H_off), NEXTREG(xmm1));
-    map->set_callee_saved(STACK_OFFSET(xmm2H_off), NEXTREG(xmm2));
-    map->set_callee_saved(STACK_OFFSET(xmm3H_off), NEXTREG(xmm3));
-    map->set_callee_saved(STACK_OFFSET(xmm4H_off), NEXTREG(xmm4));
-    map->set_callee_saved(STACK_OFFSET(xmm5H_off), NEXTREG(xmm5));
-    map->set_callee_saved(STACK_OFFSET(xmm6H_off), NEXTREG(xmm6));
-    map->set_callee_saved(STACK_OFFSET(xmm7H_off), NEXTREG(xmm7));
+  map->set_callee_saved(STACK_OFFSET(rsi_off), rsi->as_VMReg());
+  map->set_callee_saved(STACK_OFFSET(rdi_off), rdi->as_VMReg());
+  // %%% This is really a waste but we'll keep things as they were for now for the upper component
+  off = st0_off;
+  delta = st1_off - off;
+  for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
+    FloatRegister freg_name = as_FloatRegister(n);
+    map->set_callee_saved(STACK_OFFSET(off), freg_name->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(freg_name));
+    off += delta;
+  }
+  off = xmm0_off;
+  delta = xmm1_off - off;
+  for (int n = 0; n < num_xmm_regs; n++) {
+    XMMRegister xmm_name = as_XMMRegister(n);
+    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(xmm_name));
+    off += delta;
+  }
 #undef NEXTREG
 #undef STACK_OFFSET
-  }
 
   return map;
-
 }
 
 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
   // Recover XMM & FPU state
   int additional_frame_bytes = 0;
 #ifdef COMPILER2
@@ -301,52 +275,43 @@
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
 #endif
+  int off = xmm0_off;
+  int delta = xmm1_off - off;
+
   if (UseSSE == 1) {
     assert(additional_frame_bytes == 0, "");
-    __ movflt(xmm0,Address(rsp,xmm0_off*wordSize));
-    __ movflt(xmm1,Address(rsp,xmm1_off*wordSize));
-    __ movflt(xmm2,Address(rsp,xmm2_off*wordSize));
-    __ movflt(xmm3,Address(rsp,xmm3_off*wordSize));
-    __ movflt(xmm4,Address(rsp,xmm4_off*wordSize));
-    __ movflt(xmm5,Address(rsp,xmm5_off*wordSize));
-    __ movflt(xmm6,Address(rsp,xmm6_off*wordSize));
-    __ movflt(xmm7,Address(rsp,xmm7_off*wordSize));
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movflt(as_XMMRegister(n), Address(rsp, off*wordSize));
+      off += delta;
+    }
   } else if (UseSSE >= 2) {
-#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes)
-    __ movdqu(xmm0,STACK_ADDRESS(xmm0_off));
-    __ movdqu(xmm1,STACK_ADDRESS(xmm1_off));
-    __ movdqu(xmm2,STACK_ADDRESS(xmm2_off));
-    __ movdqu(xmm3,STACK_ADDRESS(xmm3_off));
-    __ movdqu(xmm4,STACK_ADDRESS(xmm4_off));
-    __ movdqu(xmm5,STACK_ADDRESS(xmm5_off));
-    __ movdqu(xmm6,STACK_ADDRESS(xmm6_off));
-    __ movdqu(xmm7,STACK_ADDRESS(xmm7_off));
-#undef STACK_ADDRESS
+    if (VM_Version::supports_avx512novl()) {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vinsertf32x4h(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes), 0);
+        off += delta;
+      }
+    } else {
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
+        off += delta;
+      }
+    }
   }
   if (restore_vectors) {
+    if (UseAVX > 2) {
+      off = 0;
+      for (int n = 0; n < num_xmm_regs; n++) {
+        __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
+      }
+      __ addptr(rsp, additional_frame_bytes*2); // Save upper half of ZMM registes
+    }
     // Restore upper half of YMM registes.
     assert(additional_frame_bytes == 128, "");
-    __ vinsertf128h(xmm0, Address(rsp,  0));
-    __ vinsertf128h(xmm1, Address(rsp, 16));
-    __ vinsertf128h(xmm2, Address(rsp, 32));
-    __ vinsertf128h(xmm3, Address(rsp, 48));
-    __ vinsertf128h(xmm4, Address(rsp, 64));
-    __ vinsertf128h(xmm5, Address(rsp, 80));
-    __ vinsertf128h(xmm6, Address(rsp, 96));
-    __ vinsertf128h(xmm7, Address(rsp,112));
-    __ addptr(rsp, additional_frame_bytes);
-    if (UseAVX > 2) {
-      additional_frame_bytes = 256;
-      __ vinsertf64x4h(xmm0, Address(rsp, 0));
-      __ vinsertf64x4h(xmm1, Address(rsp, 32));
-      __ vinsertf64x4h(xmm2, Address(rsp, 64));
-      __ vinsertf64x4h(xmm3, Address(rsp, 96));
-      __ vinsertf64x4h(xmm4, Address(rsp, 128));
-      __ vinsertf64x4h(xmm5, Address(rsp, 160));
-      __ vinsertf64x4h(xmm6, Address(rsp, 192));
-      __ vinsertf64x4h(xmm7, Address(rsp, 224));
-      __ addptr(rsp, additional_frame_bytes);
+    off = 0;
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
     }
+    __ addptr(rsp, additional_frame_bytes); // Save upper half of YMM registes
   }
   __ pop_FPU_state();
   __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -69,7 +69,9 @@
 class RegisterSaver {
   // Capture info about frame layout.  Layout offsets are in jint
   // units because compiler frame slots are jints.
+#define HALF_ZMM_BANK_WORDS 128
 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
+#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
   enum layout {
     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
     xmm_off       = fpu_state_off + 160/BytesPerInt,            // offset in fxsave save area
@@ -89,23 +91,24 @@
     DEF_XMM_OFFS(13),
     DEF_XMM_OFFS(14),
     DEF_XMM_OFFS(15),
-    DEF_XMM_OFFS(16),
-    DEF_XMM_OFFS(17),
-    DEF_XMM_OFFS(18),
-    DEF_XMM_OFFS(19),
-    DEF_XMM_OFFS(20),
-    DEF_XMM_OFFS(21),
-    DEF_XMM_OFFS(22),
-    DEF_XMM_OFFS(23),
-    DEF_XMM_OFFS(24),
-    DEF_XMM_OFFS(25),
-    DEF_XMM_OFFS(26),
-    DEF_XMM_OFFS(27),
-    DEF_XMM_OFFS(28),
-    DEF_XMM_OFFS(29),
-    DEF_XMM_OFFS(30),
-    DEF_XMM_OFFS(31),
-    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt),
+    zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt),
+    DEF_ZMM_OFFS(16),
+    DEF_ZMM_OFFS(17),
+    DEF_ZMM_OFFS(18),
+    DEF_ZMM_OFFS(19),
+    DEF_ZMM_OFFS(20),
+    DEF_ZMM_OFFS(21),
+    DEF_ZMM_OFFS(22),
+    DEF_ZMM_OFFS(23),
+    DEF_ZMM_OFFS(24),
+    DEF_ZMM_OFFS(25),
+    DEF_ZMM_OFFS(26),
+    DEF_ZMM_OFFS(27),
+    DEF_ZMM_OFFS(28),
+    DEF_ZMM_OFFS(29),
+    DEF_ZMM_OFFS(30),
+    DEF_ZMM_OFFS(31),
+    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
     fpu_stateH_end,
     r15_off, r15H_off,
     r14_off, r14H_off,
@@ -155,9 +158,10 @@
 
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
   int vect_words = 0;
-  int num_xmm_regs = 16;
-  if (UseAVX > 2) {
-    num_xmm_regs = 32;
+  int off = 0;
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
+  if (UseAVX < 3) {
+    num_xmm_regs = num_xmm_regs/2;
   }
 #ifdef COMPILER2
   if (save_vectors) {
@@ -165,9 +169,7 @@
     assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
     // Save upper half of YMM registers
     vect_words = 16 * num_xmm_regs / wordSize;
-    additional_frame_words += vect_words;
-    if (UseAVX > 2) {
-      // Save upper half of ZMM registers as well
+    if (UseAVX < 3) {
       additional_frame_words += vect_words;
     }
   }
@@ -195,77 +197,13 @@
   __ enter();          // rsp becomes 16-byte aligned here
   __ push_CPU_state(); // Push a multiple of 16 bytes
 
-  if (vect_words > 0) {
+  // push cpu state handles this on EVEX enabled targets
+  if ((vect_words > 0) && (UseAVX < 3)) {
     assert(vect_words*wordSize >= 256, "");
-    __ subptr(rsp, 256); // Save upper half of YMM registes(0..15)
-    __ vextractf128h(Address(rsp, 0), xmm0);
-    __ vextractf128h(Address(rsp, 16), xmm1);
-    __ vextractf128h(Address(rsp, 32), xmm2);
-    __ vextractf128h(Address(rsp, 48), xmm3);
-    __ vextractf128h(Address(rsp, 64), xmm4);
-    __ vextractf128h(Address(rsp, 80), xmm5);
-    __ vextractf128h(Address(rsp, 96), xmm6);
-    __ vextractf128h(Address(rsp, 112), xmm7);
-    __ vextractf128h(Address(rsp, 128), xmm8);
-    __ vextractf128h(Address(rsp, 144), xmm9);
-    __ vextractf128h(Address(rsp, 160), xmm10);
-    __ vextractf128h(Address(rsp, 176), xmm11);
-    __ vextractf128h(Address(rsp, 192), xmm12);
-    __ vextractf128h(Address(rsp, 208), xmm13);
-    __ vextractf128h(Address(rsp, 224), xmm14);
-    __ vextractf128h(Address(rsp, 240), xmm15);
-    if (UseAVX > 2) {
-      __ subptr(rsp, 256); // Save upper half of YMM registes(16..31)
-      __ vextractf128h(Address(rsp, 0), xmm16);
-      __ vextractf128h(Address(rsp, 16), xmm17);
-      __ vextractf128h(Address(rsp, 32), xmm18);
-      __ vextractf128h(Address(rsp, 48), xmm19);
-      __ vextractf128h(Address(rsp, 64), xmm20);
-      __ vextractf128h(Address(rsp, 80), xmm21);
-      __ vextractf128h(Address(rsp, 96), xmm22);
-      __ vextractf128h(Address(rsp, 112), xmm23);
-      __ vextractf128h(Address(rsp, 128), xmm24);
-      __ vextractf128h(Address(rsp, 144), xmm25);
-      __ vextractf128h(Address(rsp, 160), xmm26);
-      __ vextractf128h(Address(rsp, 176), xmm27);
-      __ vextractf128h(Address(rsp, 192), xmm28);
-      __ vextractf128h(Address(rsp, 208), xmm29);
-      __ vextractf128h(Address(rsp, 224), xmm30);
-      __ vextractf128h(Address(rsp, 240), xmm31);
-      // Now handle the ZMM registers (0..31)
-      __ subptr(rsp, 1024); // Save upper half of ZMM registes
-      __ vextractf64x4h(Address(rsp, 0), xmm0);
-      __ vextractf64x4h(Address(rsp, 32), xmm1);
-      __ vextractf64x4h(Address(rsp, 64), xmm2);
-      __ vextractf64x4h(Address(rsp, 96), xmm3);
-      __ vextractf64x4h(Address(rsp, 128), xmm4);
-      __ vextractf64x4h(Address(rsp, 160), xmm5);
-      __ vextractf64x4h(Address(rsp, 192), xmm6);
-      __ vextractf64x4h(Address(rsp, 224), xmm7);
-      __ vextractf64x4h(Address(rsp, 256), xmm8);
-      __ vextractf64x4h(Address(rsp, 288), xmm9);
-      __ vextractf64x4h(Address(rsp, 320), xmm10);
-      __ vextractf64x4h(Address(rsp, 352), xmm11);
-      __ vextractf64x4h(Address(rsp, 384), xmm12);
-      __ vextractf64x4h(Address(rsp, 416), xmm13);
-      __ vextractf64x4h(Address(rsp, 448), xmm14);
-      __ vextractf64x4h(Address(rsp, 480), xmm15);
-      __ vextractf64x4h(Address(rsp, 512), xmm16);
-      __ vextractf64x4h(Address(rsp, 544), xmm17);
-      __ vextractf64x4h(Address(rsp, 576), xmm18);
-      __ vextractf64x4h(Address(rsp, 608), xmm19);
-      __ vextractf64x4h(Address(rsp, 640), xmm20);
-      __ vextractf64x4h(Address(rsp, 672), xmm21);
-      __ vextractf64x4h(Address(rsp, 704), xmm22);
-      __ vextractf64x4h(Address(rsp, 736), xmm23);
-      __ vextractf64x4h(Address(rsp, 768), xmm24);
-      __ vextractf64x4h(Address(rsp, 800), xmm25);
-      __ vextractf64x4h(Address(rsp, 832), xmm26);
-      __ vextractf64x4h(Address(rsp, 864), xmm27);
-      __ vextractf64x4h(Address(rsp, 896), xmm28);
-      __ vextractf64x4h(Address(rsp, 928), xmm29);
-      __ vextractf64x4h(Address(rsp, 960), xmm30);
-      __ vextractf64x4h(Address(rsp, 992), xmm31);
+    // Save upper half of YMM registes(0..num_xmm_regs)
+    __ subptr(rsp, num_xmm_regs*16);
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
     }
   }
   if (frame::arg_reg_save_area_bytes != 0) {
@@ -299,39 +237,24 @@
   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
-  map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
-  if (UseAVX > 2) {
-    map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg());
+  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
+  // on EVEX enabled targets, we get it included in the xsave area
+  off = xmm0_off;
+  int delta = xmm1_off - off;
+  for (int n = 0; n < 16; n++) {
+    XMMRegister xmm_name = as_XMMRegister(n);
+    map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+    off += delta;
+  }
+  if(UseAVX > 2) {
+    // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
+    off = zmm16_off;
+    delta = zmm17_off - off;
+    for (int n = 16; n < num_xmm_regs; n++) {
+      XMMRegister xmm_name = as_XMMRegister(n);
+      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+      off += delta;
+    }
   }
 
   // %%% These should all be a waste but we'll keep things as they were for now
@@ -351,39 +274,24 @@
     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
-    map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
+    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
+    // on EVEX enabled targets, we get it included in the xsave area
+    off = xmm0H_off;
+    delta = xmm1H_off - off;
+    for (int n = 0; n < 16; n++) {
+      XMMRegister xmm_name = as_XMMRegister(n);
+      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
+      off += delta;
+    }
     if (UseAVX > 2) {
-      map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next());
-      map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next());
+      // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
+      off = zmm16H_off;
+      delta = zmm17H_off - off;
+      for (int n = 16; n < num_xmm_regs; n++) {
+        XMMRegister xmm_name = as_XMMRegister(n);
+        map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
+        off += delta;
+      }
     }
   }
 
@@ -391,86 +299,25 @@
 }
 
 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
+  int num_xmm_regs = XMMRegisterImpl::number_of_registers;
+  if (UseAVX < 3) {
+    num_xmm_regs = num_xmm_regs/2;
+  }
   if (frame::arg_reg_save_area_bytes != 0) {
     // Pop arg register save area
     __ addptr(rsp, frame::arg_reg_save_area_bytes);
   }
 #ifdef COMPILER2
-  if (restore_vectors) {
-    // Restore upper half of YMM registes (0..15)
-    assert(UseAVX > 0, "512bit vectors are supported only with AVX");
-    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
-    __ vinsertf128h(xmm0, Address(rsp,  0));
-    __ vinsertf128h(xmm1, Address(rsp, 16));
-    __ vinsertf128h(xmm2, Address(rsp, 32));
-    __ vinsertf128h(xmm3, Address(rsp, 48));
-    __ vinsertf128h(xmm4, Address(rsp, 64));
-    __ vinsertf128h(xmm5, Address(rsp, 80));
-    __ vinsertf128h(xmm6, Address(rsp, 96));
-    __ vinsertf128h(xmm7, Address(rsp,112));
-    __ vinsertf128h(xmm8, Address(rsp,128));
-    __ vinsertf128h(xmm9, Address(rsp,144));
-    __ vinsertf128h(xmm10, Address(rsp,160));
-    __ vinsertf128h(xmm11, Address(rsp,176));
-    __ vinsertf128h(xmm12, Address(rsp,192));
-    __ vinsertf128h(xmm13, Address(rsp,208));
-    __ vinsertf128h(xmm14, Address(rsp,224));
-    __ vinsertf128h(xmm15, Address(rsp,240));
-    __ addptr(rsp, 256);
-    if (UseAVX > 2) {
-      // Restore upper half of YMM registes (16..31)
-      __ vinsertf128h(xmm16, Address(rsp,  0));
-      __ vinsertf128h(xmm17, Address(rsp, 16));
-      __ vinsertf128h(xmm18, Address(rsp, 32));
-      __ vinsertf128h(xmm19, Address(rsp, 48));
-      __ vinsertf128h(xmm20, Address(rsp, 64));
-      __ vinsertf128h(xmm21, Address(rsp, 80));
-      __ vinsertf128h(xmm22, Address(rsp, 96));
-      __ vinsertf128h(xmm23, Address(rsp,112));
-      __ vinsertf128h(xmm24, Address(rsp,128));
-      __ vinsertf128h(xmm25, Address(rsp,144));
-      __ vinsertf128h(xmm26, Address(rsp,160));
-      __ vinsertf128h(xmm27, Address(rsp,176));
-      __ vinsertf128h(xmm28, Address(rsp,192));
-      __ vinsertf128h(xmm29, Address(rsp,208));
-      __ vinsertf128h(xmm30, Address(rsp,224));
-      __ vinsertf128h(xmm31, Address(rsp,240));
-      __ addptr(rsp, 256);
-      // Restore upper half of ZMM registes.
-      __ vinsertf64x4h(xmm0, Address(rsp, 0));
-      __ vinsertf64x4h(xmm1, Address(rsp, 32));
-      __ vinsertf64x4h(xmm2, Address(rsp, 64));
-      __ vinsertf64x4h(xmm3, Address(rsp, 96));
-      __ vinsertf64x4h(xmm4, Address(rsp, 128));
-      __ vinsertf64x4h(xmm5, Address(rsp, 160));
-      __ vinsertf64x4h(xmm6, Address(rsp, 192));
-      __ vinsertf64x4h(xmm7, Address(rsp, 224));
-      __ vinsertf64x4h(xmm8, Address(rsp, 256));
-      __ vinsertf64x4h(xmm9, Address(rsp, 288));
-      __ vinsertf64x4h(xmm10, Address(rsp, 320));
-      __ vinsertf64x4h(xmm11, Address(rsp, 352));
-      __ vinsertf64x4h(xmm12, Address(rsp, 384));
-      __ vinsertf64x4h(xmm13, Address(rsp, 416));
-      __ vinsertf64x4h(xmm14, Address(rsp, 448));
-      __ vinsertf64x4h(xmm15, Address(rsp, 480));
-      __ vinsertf64x4h(xmm16, Address(rsp, 512));
-      __ vinsertf64x4h(xmm17, Address(rsp, 544));
-      __ vinsertf64x4h(xmm18, Address(rsp, 576));
-      __ vinsertf64x4h(xmm19, Address(rsp, 608));
-      __ vinsertf64x4h(xmm20, Address(rsp, 640));
-      __ vinsertf64x4h(xmm21, Address(rsp, 672));
-      __ vinsertf64x4h(xmm22, Address(rsp, 704));
-      __ vinsertf64x4h(xmm23, Address(rsp, 736));
-      __ vinsertf64x4h(xmm24, Address(rsp, 768));
-      __ vinsertf64x4h(xmm25, Address(rsp, 800));
-      __ vinsertf64x4h(xmm26, Address(rsp, 832));
-      __ vinsertf64x4h(xmm27, Address(rsp, 864));
-      __ vinsertf64x4h(xmm28, Address(rsp, 896));
-      __ vinsertf64x4h(xmm29, Address(rsp, 928));
-      __ vinsertf64x4h(xmm30, Address(rsp, 960));
-      __ vinsertf64x4h(xmm31, Address(rsp, 992));
-      __ addptr(rsp, 1024);
+  // On EVEX enabled targets everything is handled in pop fpu state
+  if ((restore_vectors) && (UseAVX < 3)) {
+    assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX");
+    assert(MaxVectorSize == 64, "up to 512bit vectors are supported now");
+    int off = 0;
+    // Restore upper half of YMM registes (0..num_xmm_regs)
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ vinsertf128h(as_XMMRegister(n), Address(rsp,  off++*16));
     }
+    __ addptr(rsp, num_xmm_regs*16);
   }
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -795,6 +795,12 @@
   void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
     assert( UseSSE >= 2, "supported cpu only" );
     Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+    if (UseAVX > 2) {
+      __ push(rbx);
+      __ movl(rbx, 0xffff);
+      __ kmovdl(k1, rbx);
+      __ pop(rbx);
+    }
     // Copy 64-byte chunks
     __ jmpb(L_copy_64_bytes);
     __ align(OptoLoopAlignment);
@@ -802,8 +808,8 @@
 
     if (UseUnalignedLoadStores) {
       if (UseAVX > 2) {
-        __ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit);
-        __ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
+        __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
+        __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(from,  0));
         __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
@@ -2217,6 +2223,15 @@
     const XMMRegister xmm_temp4  = xmm5;
 
     __ enter();   // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     __ movptr(from, from_param);
     __ movptr(key, key_param);
 
@@ -2315,6 +2330,15 @@
     const XMMRegister xmm_temp4  = xmm5;
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     __ movptr(from, from_param);
     __ movptr(key, key_param);
 
@@ -2441,6 +2465,14 @@
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     handleSOERegisters(true /*saving*/);
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     // load registers from incoming parameters
     const Address  from_param(rbp, 8+0);
     const Address  to_param  (rbp, 8+4);
@@ -2602,6 +2634,14 @@
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     handleSOERegisters(true /*saving*/);
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     // load registers from incoming parameters
     const Address  from_param(rbp, 8+0);
     const Address  to_param  (rbp, 8+4);
@@ -2782,6 +2822,14 @@
     __ enter();
     handleSOERegisters(true);  // Save registers
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
     __ movptr(state, state_param);
     __ movptr(subkeyH, subkeyH_param);
     __ movptr(data, data_param);
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -269,12 +269,16 @@
       __ kmovql(k1, rbx);
     }
 #ifdef _WIN64
+    int last_reg = 15;
     if (UseAVX > 2) {
-      for (int i = 6; i <= 31; i++) {
-        __ movdqu(xmm_save(i), as_XMMRegister(i));
+      last_reg = 31;
+    }
+    if (VM_Version::supports_avx512novl()) {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
+        __ vextractf32x4h(xmm_save(i), as_XMMRegister(i), 0);
       }
     } else {
-      for (int i = 6; i <= 15; i++) {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
         __ movdqu(xmm_save(i), as_XMMRegister(i));
       }
     }
@@ -386,13 +390,15 @@
 
     // restore regs belonging to calling function
 #ifdef _WIN64
-    int xmm_ub = 15;
-    if (UseAVX > 2) {
-      xmm_ub = 31;
-    }
     // emit the restores for xmm regs
-    for (int i = 6; i <= xmm_ub; i++) {
-      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    if (VM_Version::supports_avx512novl()) {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
+        __ vinsertf32x4h(as_XMMRegister(i), xmm_save(i), 0);
+      }
+    } else {
+      for (int i = xmm_save_first; i <= last_reg; i++) {
+        __ movdqu(as_XMMRegister(i), xmm_save(i));
+      }
     }
 #endif
     __ movptr(r15, r15_save);
@@ -1342,11 +1348,15 @@
     __ align(OptoLoopAlignment);
     if (UseUnalignedLoadStores) {
       Label L_end;
+      if (UseAVX > 2) {
+        __ movl(to, 0xffff);
+        __ kmovql(k1, to);
+      }
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
       if (UseAVX > 2) {
-        __ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
-        __ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
+        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
+        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
@@ -1422,11 +1432,15 @@
     __ align(OptoLoopAlignment);
     if (UseUnalignedLoadStores) {
       Label L_end;
+      if (UseAVX > 2) {
+        __ movl(to, 0xffff);
+        __ kmovql(k1, to);
+      }
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
       if (UseAVX > 2) {
-        __ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
-        __ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
+        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
+        __ evmovdqul(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
@@ -3106,6 +3120,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 
@@ -3200,6 +3222,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 
@@ -3312,6 +3342,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
 #ifdef _WIN64
     // on win64, fill len_reg from stack position
     __ movl(len_reg, len_mem);
@@ -3508,6 +3546,14 @@
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
 #ifdef _WIN64
     // on win64, fill len_reg from stack position
     __ movl(len_reg, len_mem);
@@ -3746,6 +3792,14 @@
 
     __ enter();
 
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
 #ifdef _WIN64
     // save the xmm registers which must be preserved 6-10
     __ subptr(rsp, -rsp_after_call_off * wordSize);
--- a/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Mon Sep 21 17:49:57 2015 +0200
@@ -31,7 +31,7 @@
 
 enum platform_dependent_constants {
   code_size1 =  9000,           // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000            // simply increase if too small (assembler will crash if too small)
+  code_size2 = 30000            // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Sep 21 17:49:57 2015 +0200
@@ -33,7 +33,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 24000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 32000           // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -367,16 +367,12 @@
     __ movl(rcx, VM_Version::ymm_test_value());
     __ movdl(xmm0, rcx);
     __ movl(rcx, 0xffff);
+    __ kmovwl(k1, rcx);
+    __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit);
 #ifdef _LP64
-    __ kmovql(k1, rcx);
-#else
-    __ kmovdl(k1, rcx);
-#endif
-    __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
-    __ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit);
-#ifdef _LP64
-    __ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit);
-    __ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(xmm8, xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(xmm31, xmm0, Assembler::AVX_512bit);
 #endif
     VM_Version::clean_cpuFeatures();
     __ jmp(save_restore_except);
@@ -427,11 +423,11 @@
     UseAVX = 3;
     UseSSE = 2;
     __ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset())));
-    __ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
-    __ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
 #ifdef _LP64
-    __ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
-    __ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
+    __ evmovdqul(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
 #endif
     VM_Version::clean_cpuFeatures();
     UseAVX = saved_useavx;
--- a/src/cpu/x86/vm/vm_version_x86.hpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/vm_version_x86.hpp	Mon Sep 21 17:49:57 2015 +0200
@@ -227,14 +227,15 @@
   union XemXcr0Eax {
     uint32_t value;
     struct {
-      uint32_t x87    : 1,
-               sse    : 1,
-               ymm    : 1,
-                      : 2,
-               opmask : 1,
-               zmm512 : 1,
-                zmm32 : 1,
-                      : 24;
+      uint32_t x87     : 1,
+               sse     : 1,
+               ymm     : 1,
+               bndregs : 1,
+               bndcsr  : 1,
+               opmask  : 1,
+               zmm512  : 1,
+               zmm32   : 1,
+                       : 24;
     } bits;
   };
 
@@ -703,6 +704,7 @@
   static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
   static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
   static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
+  static bool supports_avx512novl() { return (supports_evex() && !supports_avx512vl()); }
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
@@ -817,6 +819,12 @@
     intx count = PrefetchFieldsAhead;
     return count >= 0 ? count : 1;
   }
+  static uint32_t get_xsave_header_lower_segment() {
+    return _cpuid_info.xem_xcr0_eax.value;
+  }
+  static uint32_t get_xsave_header_upper_segment() {
+    return _cpuid_info.xem_xcr0_edx;
+  }
 };
 
 #endif // CPU_X86_VM_VM_VERSION_X86_HPP
--- a/src/cpu/x86/vm/x86.ad	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/x86.ad	Mon Sep 21 17:49:57 2015 +0200
@@ -1661,46 +1661,55 @@
   if (!has_match_rule(opcode))
     return false;
 
+  bool ret_value = true;
   switch (opcode) {
     case Op_PopCountI:
     case Op_PopCountL:
       if (!UsePopCountInstruction)
-        return false;
-    break;
+        ret_value = false;
+      break;
     case Op_MulVI:
       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
-        return false;
-    break;
+        ret_value = false;
+      break;
     case Op_MulVL:
     case Op_MulReductionVL:
       if (VM_Version::supports_avx512dq() == false)
-        return false;
+        ret_value = false;
+      break;
     case Op_AddReductionVL:
       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
-        return false;
+        ret_value = false;
+      break;
     case Op_AddReductionVI:
       if (UseSSE < 3) // requires at least SSE3
-        return false;
+        ret_value = false;
+      break;
     case Op_MulReductionVI:
       if (UseSSE < 4) // requires at least SSE4
-        return false;
+        ret_value = false;
+      break;
     case Op_AddReductionVF:
     case Op_AddReductionVD:
     case Op_MulReductionVF:
     case Op_MulReductionVD:
       if (UseSSE < 1) // requires at least SSE
-        return false;
-    break;
+        ret_value = false;
+      break;
+    case Op_SqrtVD:
+      if (UseAVX < 1) // enabled for AVX only
+        ret_value = false;
+      break;
     case Op_CompareAndSwapL:
 #ifdef _LP64
     case Op_CompareAndSwapP:
 #endif
       if (!VM_Version::supports_cx8())
-        return false;
-    break;
+        ret_value = false;
+      break;
   }
 
-  return true;  // Per default match rules are supported.
+  return ret_value;  // Per default match rules are supported.
 }
 
 // Max vector size in bytes. 0 if not supported.
@@ -1721,14 +1730,24 @@
   case T_DOUBLE:
   case T_LONG:
     if (size < 16) return 0;
+    break;
   case T_FLOAT:
   case T_INT:
     if (size < 8) return 0;
+    break;
   case T_BOOLEAN:
+    if (size < 4) return 0;
+    break;
+  case T_CHAR:
+    if (size < 4) return 0;
+    break;
   case T_BYTE:
-  case T_CHAR:
+    if (size < 4) return 0;
+    if ((size > 32) && !VM_Version::supports_avx512bw()) return 0;
+    break;
   case T_SHORT:
     if (size < 4) return 0;
+    if ((size > 16) && !VM_Version::supports_avx512bw()) return 0;
     break;
   default:
     ShouldNotReachHere();
@@ -1800,7 +1819,7 @@
       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
       break;
     case Op_VecZ:
-      __ evmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
+      __ evmovdqul(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
       break;
     default:
       ShouldNotReachHere();
@@ -1855,7 +1874,7 @@
         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
         break;
       case Op_VecZ:
-        __ evmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
+        __ evmovdqul(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
         break;
       default:
         ShouldNotReachHere();
@@ -1875,7 +1894,7 @@
         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
         break;
       case Op_VecZ:
-        __ evmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
+        __ evmovdqul(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
         break;
       default:
         ShouldNotReachHere();
@@ -1929,9 +1948,40 @@
     }
 #endif
   }
-  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
+  bool is_single_byte = false;
+  int vec_len = 0;
+  if ((UseAVX > 2) && (stack_offset != 0)) {
+    switch (ireg) {
+	case Op_VecS:
+    case Op_VecD:
+    case Op_VecX:
+	  break;
+	case Op_VecY:
+	  vec_len = 1;
+	  break;
+    case Op_VecZ:
+	  vec_len = 2;
+	  break;
+    }
+    is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, Assembler::EVEX_FVM, Assembler::EVEX_32bit, 0);
+  }
+  int offset_size = 0;
+  int size = 5;
+  if (UseAVX > 2 ) {
+    if ((VM_Version::supports_avx512vl() == false) && (vec_len == 2)) { 
+      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
+      size += 2; // Need an additional two bytes for EVEX encoding
+    } else if ((VM_Version::supports_avx512vl() == false) && (vec_len < 2)) { 
+      offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
+    } else {
+      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
+      size += 2; // Need an additional two bytes for EVEX encodding
+    }
+  } else {
+    offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
+  }
   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
-  return 5+offset_size;
+  return size+offset_size;
 }
 
 static inline jfloat replicate4_imm(int con, int width) {
@@ -2675,11 +2725,10 @@
   predicate(UseAVX > 0);
   match(Set dst (NegF src));
   ins_cost(150);
-  format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signflip()), vector_len);
+  format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
+  ins_encode %{
+    __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
+                 ExternalAddress(float_signflip()));
   %}
   ins_pipe(pipe_slow);
 %}
@@ -2700,12 +2749,11 @@
   predicate(UseAVX > 0);
   match(Set dst (NegD src));
   ins_cost(150);
-  format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
+  format %{ "vnegatess  $dst, $src, [0x8000000000000000]\t"
             "# neg double by sign flipping" %}
   ins_encode %{
-    int vector_len = 0;
-    __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signflip()), vector_len);
+    __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
+                 ExternalAddress(double_signflip()));
   %}
   ins_pipe(pipe_slow);
 %}
@@ -2838,7 +2886,7 @@
   format %{ "vmovdqu $dst k0,$mem\t! load vector (64 bytes)" %}
   ins_encode %{
     int vector_len = 2;
-    __ evmovdqu($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -2895,7 +2943,7 @@
   format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %}
   ins_encode %{
     int vector_len = 2;
-    __ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len);
+    __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3315,6 +3363,37 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl2F_zero(vecD dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F_zero(vecX dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_zero(vecY dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
+  match(Set dst (ReplicateF zero));
+  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 instruct Repl2D_mem(vecX dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD (LoadD mem)));
@@ -3349,6 +3428,28 @@
   ins_pipe( pipe_slow );
 %}
 
+// Replicate double (8 byte) scalar zero to be vector
+instruct Repl2D_zero(vecX dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
+  match(Set dst (ReplicateD zero));
+  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
+  ins_encode %{
+    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_zero(vecY dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
+  match(Set dst (ReplicateD zero));
+  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // ====================GENERIC REPLICATE==========================================
 
 // Replicate byte scalar to be vector
@@ -3680,38 +3781,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate float (4 byte) scalar zero to be vector
-instruct Repl2F_zero(vecD dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateF zero));
-  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
-  ins_encode %{
-    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4F_zero(vecX dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateF zero));
-  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
-  ins_encode %{
-    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl8F_zero(vecY dst, immF0 zero) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateF zero));
-  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // Replicate double (8 bytes) scalar to be vector
 instruct Repl2D(vecX dst, regD src) %{
   predicate(n->as_Vector()->length() == 2);
@@ -3723,28 +3792,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate double (8 byte) scalar zero to be vector
-instruct Repl2D_zero(vecX dst, immD0 zero) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateD zero));
-  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
-  ins_encode %{
-    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4D_zero(vecY dst, immD0 zero) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateD zero));
-  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // ====================EVEX REPLICATE=============================================
 
 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
@@ -3814,7 +3861,7 @@
 %}
 
 instruct Repl64B_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB src));
   format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
   ins_encode %{
@@ -3825,7 +3872,7 @@
 %}
 
 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
   ins_encode %{
@@ -3862,7 +3909,7 @@
 %}
 
 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
@@ -3953,7 +4000,7 @@
 %}
 
 instruct Repl32S_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS src));
   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
   ins_encode %{
@@ -3964,7 +4011,7 @@
 %}
 
 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
   ins_encode %{
@@ -4001,7 +4048,7 @@
 %}
 
 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastw $dst,$dst\t! replicate32S" %}
@@ -4318,13 +4365,50 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
   match(Set dst (ReplicateF zero));
-  format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
-  ins_encode %{
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
     int vector_len = 2;
-    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -4373,13 +4457,38 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
+  match(Set dst (ReplicateD zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
+  match(Set dst (ReplicateD zero));
+  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateD zero));
-  format %{ "vxorpd  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
-  ins_encode %{
+  format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
+  ins_encode %{
+    // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
     int vector_len = 2;
-    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -7474,6 +7583,75 @@
   ins_pipe( pipe_slow );
 %}
 
+// --------------------------------- Sqrt --------------------------------------
+
+// Floating point vector sqrt - double precision only
+instruct vsqrt2D_reg(vecX dst, vecX src) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SqrtVD src));
+  format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt2D_mem(vecX dst, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (SqrtVD (LoadVector mem)));
+  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt4D_reg(vecY dst, vecY src) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SqrtVD src));
+  format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt4D_mem(vecY dst, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (SqrtVD (LoadVector mem)));
+  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SqrtVD src));
+  format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsqrt8D_mem(vecZ dst, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SqrtVD (LoadVector mem)));
+  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ------------------------------ LeftShift -----------------------------------
 
 // Shorts/Chars vector left shift
--- a/src/cpu/x86/vm/x86_32.ad	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/x86_32.ad	Mon Sep 21 17:49:57 2015 +0200
@@ -1004,10 +1004,10 @@
       __ vmovdqu(Address(rsp, dst_offset), xmm0);
       __ vmovdqu(xmm0, Address(rsp, -32));
     case Op_VecZ:
-      __ evmovdqu(Address(rsp, -64), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, src_offset), 2);
-      __ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, -64), 2);
+      __ evmovdqul(Address(rsp, -64), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
--- a/src/cpu/x86/vm/x86_64.ad	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/cpu/x86/vm/x86_64.ad	Mon Sep 21 17:49:57 2015 +0200
@@ -1075,10 +1075,10 @@
       __ vmovdqu(Address(rsp, dst_offset), xmm0);
       __ vmovdqu(xmm0, Address(rsp, -32));
     case Op_VecZ:
-      __ evmovdqu(Address(rsp, -64), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, src_offset), 2);
-      __ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
-      __ evmovdqu(xmm0, Address(rsp, -64), 2);
+      __ evmovdqul(Address(rsp, -64), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdqul(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
--- a/src/share/vm/adlc/formssel.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/adlc/formssel.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -4143,6 +4143,7 @@
     "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
     "MulVS","MulVI","MulVL","MulVF","MulVD",
     "DivVF","DivVD",
+    "SqrtVD",
     "AndV" ,"XorV" ,"OrV",
     "AddReductionVI", "AddReductionVL",
     "AddReductionVF", "AddReductionVD",
--- a/src/share/vm/opto/classes.hpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/opto/classes.hpp	Mon Sep 21 17:49:57 2015 +0200
@@ -290,6 +290,7 @@
 macro(MulReductionVD)
 macro(DivVF)
 macro(DivVD)
+macro(SqrtVD)
 macro(LShiftCntV)
 macro(RShiftCntV)
 macro(LShiftVB)
--- a/src/share/vm/opto/ifnode.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/opto/ifnode.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -858,18 +858,29 @@
     // this_bool = <=
     //   dom_bool = >= (proj = True) or dom_bool = < (proj = False)
     //     x in [a, b] on the fail (= True) projection, b+1 > a-1:
-    //     lo = a, hi = b, adjusted_lim = b-a, cond = <=u
+    //     lo = a, hi = b, adjusted_lim = b-a+1, cond = <u
+    //     lo = a, hi = b, adjusted_lim = b-a, cond = <=u doesn't work because b = a - 1 is possible, then b-a = -1
     //   dom_bool = > (proj = True) or dom_bool = <= (proj = False)
     //     x in ]a, b] on the fail (= True) projection b+1 > a:
     //     lo = a+1, hi = b, adjusted_lim = b-a, cond = <u
-    //     lo = a+1, hi = b, adjusted_lim = b-a-1, cond = <=u doesn't work because a = b is possible, then hi-lo = -1
+    //     lo = a+1, hi = b, adjusted_lim = b-a-1, cond = <=u doesn't work because a = b is possible, then b-a-1 = -1
 
-    if (lo_test == BoolTest::gt || lo_test == BoolTest::le) {
-      if (hi_test == BoolTest::le) {
+    if (hi_test == BoolTest::lt) {
+      if (lo_test == BoolTest::gt || lo_test == BoolTest::le) {
+        lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
+      }
+    } else {
+      assert(hi_test == BoolTest::le, "bad test");
+      if (lo_test == BoolTest::ge || lo_test == BoolTest::lt) {
         adjusted_lim = igvn->transform(new SubINode(hi, lo));
+        adjusted_lim = igvn->transform(new AddINode(adjusted_lim, igvn->intcon(1)));
+        cond = BoolTest::lt;
+      } else {
+        assert(lo_test == BoolTest::gt || lo_test == BoolTest::le, "bad test");
+        adjusted_lim = igvn->transform(new SubINode(hi, lo));
+        lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
         cond = BoolTest::lt;
       }
-      lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
     }
   } else if (lo_type->_lo > hi_type->_hi && lo_type->_hi == max_jint && hi_type->_lo == min_jint) {
 
@@ -879,7 +890,8 @@
     //     lo = b, hi = a, adjusted_lim = a-b, cond = >=u
     //   dom_bool = <= (proj = True) or dom_bool = > (proj = False)
     //     x in [b, a] on the fail (= False) projection, a+1 > b-1:
-    //     lo = b, hi = a, adjusted_lim = a-b, cond = >u
+    //     lo = b, hi = a, adjusted_lim = a-b+1, cond = >=u
+    //     lo = b, hi = a, adjusted_lim = a-b, cond = >u doesn't work because a = b - 1 is possible, then b-a = -1
     // this_bool = <=
     //   dom_bool = < (proj = True) or dom_bool = >= (proj = False)
     //     x in ]b, a[ on the fail (= False) projection, a > b:
@@ -887,7 +899,7 @@
     //   dom_bool = <= (proj = True) or dom_bool = > (proj = False)
     //     x in ]b, a] on the fail (= False) projection, a+1 > b:
     //     lo = b+1, hi = a, adjusted_lim = a-b, cond = >=u
-    //     lo = b+1, hi = a, adjusted_lim = a-b-1, cond = >u doesn't work because a = b is possible, then hi-lo = -1
+    //     lo = b+1, hi = a, adjusted_lim = a-b-1, cond = >u doesn't work because a = b is possible, then b-a-1 = -1
 
     swap(lo, hi);
     swap(lo_type, hi_type);
@@ -900,14 +912,26 @@
 
     cond = (hi_test == BoolTest::le || hi_test == BoolTest::gt) ? BoolTest::gt : BoolTest::ge;
 
-    if (lo_test == BoolTest::le) {
-      if (cond == BoolTest::gt) {
+    if (lo_test == BoolTest::lt) {
+      if (hi_test == BoolTest::lt || hi_test == BoolTest::ge) {
+        cond = BoolTest::ge;
+      } else {
+        assert(hi_test == BoolTest::le || hi_test == BoolTest::gt, "bad test");
         adjusted_lim = igvn->transform(new SubINode(hi, lo));
+        adjusted_lim = igvn->transform(new AddINode(adjusted_lim, igvn->intcon(1)));
         cond = BoolTest::ge;
       }
-      lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
+    } else if (lo_test == BoolTest::le) {
+      if (hi_test == BoolTest::lt || hi_test == BoolTest::ge) {
+        lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
+        cond = BoolTest::ge;
+      } else {
+        assert(hi_test == BoolTest::le || hi_test == BoolTest::gt, "bad test");
+        adjusted_lim = igvn->transform(new SubINode(hi, lo));
+        lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
+        cond = BoolTest::ge;
+      }
     }
-
   } else {
     const TypeInt* failtype  = filtered_int_type(igvn, n, proj);
     if (failtype != NULL) {
--- a/src/share/vm/opto/loopPredicate.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/opto/loopPredicate.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -112,6 +112,13 @@
     if (_idom != NULL) {
       set_idom(call, rgn, dom_depth(rgn));
     }
+    for (DUIterator_Fast imax, i = uncommon_proj->fast_outs(imax); i < imax; i++) {
+      Node* n = uncommon_proj->fast_out(i);
+      if (n->is_Load() || n->is_Store()) {
+        _igvn.replace_input_of(n, 0, rgn);
+        --i; --imax;
+      }
+    }
   } else {
     // Find region's edge corresponding to uncommon_proj
     for (; proj_index < rgn->req(); proj_index++)
--- a/src/share/vm/opto/loopnode.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/opto/loopnode.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -1901,7 +1901,7 @@
     if (stride_con > 0) tty->print("+");
     tty->print("%d", stride_con);
 
-    tty->print(" (%d iters) ", (int)cl->profile_trip_cnt());
+    tty->print(" (%0.f iters) ", cl->profile_trip_cnt());
 
     if (cl->is_pre_loop ()) tty->print(" pre" );
     if (cl->is_main_loop()) tty->print(" main");
--- a/src/share/vm/opto/superword.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/opto/superword.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -1858,6 +1858,11 @@
           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
         }
+      } else if (opc == Op_SqrtD) {
+        // Promote operand to vector (Sqrt is a 2 address instruction)
+        Node* in = vector_opd(p, 1);
+        vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
+        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
       } else {
         ShouldNotReachHere();
       }
--- a/src/share/vm/opto/vectornode.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/opto/vectornode.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -92,6 +92,9 @@
   case Op_DivD:
     assert(bt == T_DOUBLE, "must be");
     return Op_DivVD;
+  case Op_SqrtD:
+    assert(bt == T_DOUBLE, "must be");
+    return Op_SqrtVD;
   case Op_LShiftI:
     switch (bt) {
     case T_BOOLEAN:
@@ -277,6 +280,9 @@
   case Op_DivVF: return new DivVFNode(n1, n2, vt);
   case Op_DivVD: return new DivVDNode(n1, n2, vt);
 
+  // Currently only supports double precision sqrt
+  case Op_SqrtVD: return new SqrtVDNode(n1, vt);
+
   case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt);
   case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt);
   case Op_LShiftVI: return new LShiftVINode(n1, n2, vt);
--- a/src/share/vm/opto/vectornode.hpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/opto/vectornode.hpp	Mon Sep 21 17:49:57 2015 +0200
@@ -309,6 +309,14 @@
   virtual int Opcode() const;
 };
 
+//------------------------------SqrtVDNode--------------------------------------
+// Vector Sqrt double
+class SqrtVDNode : public VectorNode {
+ public:
+  SqrtVDNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
+  virtual int Opcode() const;
+};
+
 //------------------------------LShiftVBNode-----------------------------------
 // Vector left shift bytes
 class LShiftVBNode : public VectorNode {
--- a/src/share/vm/prims/whitebox.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/prims/whitebox.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -1041,11 +1041,18 @@
 }
 
 WB_ENTRY(jlong, WB_AllocateCodeBlob(JNIEnv* env, jobject o, jint size, jint blob_type))
-    return (jlong) WhiteBox::allocate_code_blob(size, blob_type);
+  if (size < 0) {
+    THROW_MSG_0(vmSymbols::java_lang_IllegalArgumentException(),
+      err_msg("WB_AllocateCodeBlob: size is negative: " INT32_FORMAT, size));
+  }
+  return (jlong) WhiteBox::allocate_code_blob(size, blob_type);
 WB_END
 
 WB_ENTRY(void, WB_FreeCodeBlob(JNIEnv* env, jobject o, jlong addr))
-    BufferBlob::free((BufferBlob*) addr);
+  if (addr == 0) {
+    return;
+  }
+  BufferBlob::free((BufferBlob*) addr);
 WB_END
 
 WB_ENTRY(jobjectArray, WB_GetCodeHeapEntries(JNIEnv* env, jobject o, jint blob_type))
@@ -1090,9 +1097,13 @@
 WB_END
 
 WB_ENTRY(jobjectArray, WB_GetCodeBlob(JNIEnv* env, jobject o, jlong addr))
-    ThreadToNativeFromVM ttn(thread);
-    CodeBlobStub stub((CodeBlob*) addr);
-    return codeBlob2objectArray(thread, env, &stub);
+  if (addr == 0) {
+    THROW_MSG_NULL(vmSymbols::java_lang_NullPointerException(),
+      "WB_GetCodeBlob: addr is null");
+  }
+  ThreadToNativeFromVM ttn(thread);
+  CodeBlobStub stub((CodeBlob*) addr);
+  return codeBlob2objectArray(thread, env, &stub);
 WB_END
 
 WB_ENTRY(jlong, WB_GetThreadStackSize(JNIEnv* env, jobject o))
--- a/src/share/vm/services/lowMemoryDetector.cpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/services/lowMemoryDetector.cpp	Mon Sep 21 17:49:57 2015 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -200,9 +200,10 @@
 // any clears unless the usage becomes greater than or equal
 // to the high threshold.
 //
-// If the current level is between high and low threhsold, no change.
+// If the current level is between high and low threshold, no change.
 //
 void SensorInfo::set_gauge_sensor_level(MemoryUsage usage, ThresholdSupport* high_low_threshold) {
+  assert(Service_lock->owned_by_self(), "Must own Service_lock");
   assert(high_low_threshold->is_high_threshold_supported(), "just checking");
 
   bool is_over_high = high_low_threshold->is_high_threshold_crossed(usage);
@@ -257,6 +258,7 @@
 //      the sensor will be on (i.e. sensor is currently off
 //      and has pending trigger requests).
 void SensorInfo::set_counter_sensor_level(MemoryUsage usage, ThresholdSupport* counter_threshold) {
+  assert(Service_lock->owned_by_self(), "Must own Service_lock");
   assert(counter_threshold->is_high_threshold_supported(), "just checking");
 
   bool is_over_high = counter_threshold->is_high_threshold_crossed(usage);
@@ -278,9 +280,7 @@
 }
 
 void SensorInfo::process_pending_requests(TRAPS) {
-  if (!has_pending_requests()) {
-    return;
-  }
+  assert(has_pending_requests(), "Must have pending request");
 
   int pending_count = pending_trigger_count();
   if (pending_clear_count() > 0) {
@@ -293,7 +293,6 @@
 
 void SensorInfo::trigger(int count, TRAPS) {
   assert(count <= _pending_trigger_count, "just checking");
-
   if (_sensor_obj != NULL) {
     Klass* k = Management::sun_management_Sensor_klass(CHECK);
     instanceKlassHandle sensorKlass (THREAD, k);
@@ -316,6 +315,7 @@
   {
     // Holds Service_lock and update the sensor state
     MutexLockerEx ml(Service_lock, Mutex::_no_safepoint_check_flag);
+    assert(_pending_trigger_count > 0, "Must have pending trigger");
     _sensor_on = true;
     _sensor_count += count;
     _pending_trigger_count = _pending_trigger_count - count;
@@ -323,6 +323,20 @@
 }
 
 void SensorInfo::clear(int count, TRAPS) {
+  {
+    // Holds Service_lock and update the sensor state
+    MutexLockerEx ml(Service_lock, Mutex::_no_safepoint_check_flag);
+    if (_pending_clear_count == 0) {
+      // Bail out if we lost a race to set_*_sensor_level() which may have
+      // reactivated the sensor in the meantime because it was triggered again.
+      return;
+    }
+    _sensor_on = false;
+    _sensor_count += count;
+    _pending_clear_count = 0;
+    _pending_trigger_count = _pending_trigger_count - count;
+  }
+
   if (_sensor_obj != NULL) {
     Klass* k = Management::sun_management_Sensor_klass(CHECK);
     instanceKlassHandle sensorKlass (THREAD, k);
@@ -338,14 +352,6 @@
                             &args,
                             CHECK);
   }
-
-  {
-    // Holds Service_lock and update the sensor state
-    MutexLockerEx ml(Service_lock, Mutex::_no_safepoint_check_flag);
-    _sensor_on = false;
-    _pending_clear_count = 0;
-    _pending_trigger_count = _pending_trigger_count - count;
-  }
 }
 
 //--------------------------------------------------------------
--- a/src/share/vm/services/lowMemoryDetector.hpp	Tue Sep 22 14:24:31 2015 -0400
+++ b/src/share/vm/services/lowMemoryDetector.hpp	Mon Sep 21 17:49:57 2015 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -180,7 +180,7 @@
   // any clears unless the usage becomes greater than or equal
   // to the high threshold.
   //
-  // If the current level is between high and low threhsold, no change.
+  // If the current level is between high and low threshold, no change.
   //
   void set_gauge_sensor_level(MemoryUsage usage, ThresholdSupport* high_low_threshold);
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/arraycopy/TestEliminatedArrayLoopPredicateCopyDeopt.java	Mon Sep 21 17:49:57 2015 +0200
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8134974
+ * @summary Cannot pin eliminated arraycopy loads for deopt state in uncommon trap path if it is a loop predicate unc
+ * @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestEliminatedArrayLoopPredicateCopyDeopt
+ *
+ */
+
+public class TestEliminatedArrayLoopPredicateCopyDeopt {
+
+    static boolean test(int[] array_src) {
+        int[] array_dst = new int[10];
+        System.arraycopy(array_src, 0, array_dst, 0, 10);
+
+        for (int i = 0; i < 100; i++) {
+            array_src[i] = i;
+        }
+        if (array_dst[0] == 0) {
+            return true;
+        }
+        return false;
+    }
+
+    static public void main(String[] args) {
+        int[] array_src = new int[100];
+        for (int i = 0; i < 20000; i++) {
+            test(array_src);
+        }
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/loopopts/superword/SumRedSqrt_Double.java	Mon Sep 21 17:49:57 2015 +0200
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+* @test
+* @summary Add C2 x86 Superword support for scalar sum reduction optimizations : double sqrt test
+*
+* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
+* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
+*
+* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
+* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
+*
+* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
+* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
+*
+* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
+* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
+*/
+
+public class SumRedSqrt_Double
+{
+  public static void main(String[] args) throws Exception {
+    double[] a = new double[256*1024];
+    double[] b = new double[256*1024];
+    double[] c = new double[256*1024];
+    double[] d = new double[256*1024];
+    sumReductionInit(a,b,c);
+    double total = 0;
+    double valid = 2.06157643776E14;
+    for(int j = 0; j < 2000; j++) {
+      total = sumReductionImplement(a,b,c,d,total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  public static void sumReductionInit(
+    double[] a,
+    double[] b,
+    double[] c)
+  {
+    for(int j = 0; j < 1; j++)
+    {
+      for(int i = 0; i < a.length; i++)
+      {
+        a[i] = i * 1 + j;
+        b[i] = i * 1 - j;
+        c[i] = i + j;
+      }
+    }
+  }
+
+  public static double sumReductionImplement(
+    double[] a,
+    double[] b,
+    double[] c,
+    double[] d,
+    double total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      d[i]= Math.sqrt(a[i] * b[i]) + Math.sqrt(a[i] * c[i]) + Math.sqrt(b[i] * c[i]);
+      total += d[i];
+    }
+    return total;
+  }
+
+}
--- a/test/compiler/rangechecks/TestBadFoldCompare.java	Tue Sep 22 14:24:31 2015 -0400
+++ b/test/compiler/rangechecks/TestBadFoldCompare.java	Mon Sep 21 17:49:57 2015 +0200
@@ -24,7 +24,8 @@
 /*
  * @test
  * @bug 8085832
- * @summary x <= 0 || x > 0 wrongly folded as (x-1) >u -1
+ * @bug 8135069
+ * @summary x <= 0 || x > 0 wrongly folded as (x-1) >u -1 and x < 0 || x > -1 wrongly folded as x >u -1
  * @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestBadFoldCompare
  */
 
@@ -58,6 +59,34 @@
         helper2(i, 0, 0, flag);
     }
 
+    static boolean test3_taken;
+
+    static void helper3(int i, int a, int b, boolean flag) {
+        if (flag) {
+            if (i < a || i > b - 1) {
+                test3_taken = true;
+            }
+        }
+    }
+
+    static void test3(int i, boolean flag) {
+        helper3(i, 0, 0, flag);
+    }
+
+    static boolean test4_taken;
+
+    static void helper4(int i, int a, int b, boolean flag) {
+        if (flag) {
+            if (i > b - 1 || i < a) {
+                test4_taken = true;
+            }
+        }
+    }
+
+    static void test4(int i, boolean flag) {
+        helper4(i, 0, 0, flag);
+    }
+
     static public void main(String[] args) {
         boolean success = true;
 
@@ -87,6 +116,35 @@
             System.out.println("Test2 failed");
             success = false;
         }
+
+        for (int i = 0; i < 20000; i++) {
+            helper3(5, 0,  10, (i%2)==0);
+            helper3(-1, 0,  10, (i%2)==0);
+            helper3(15, 0,  10, (i%2)==0);
+            test3(0, false);
+        }
+        test3_taken = false;
+        test3(0, true);
+
+        if (!test3_taken) {
+            System.out.println("Test3 failed");
+            success = false;
+        }
+
+        for (int i = 0; i < 20000; i++) {
+            helper4(5, 0,  10, (i%2)==0);
+            helper4(-1, 0,  10, (i%2)==0);
+            helper4(15, 0,  10, (i%2)==0);
+            test4(0, false);
+        }
+        test4_taken = false;
+        test4(0, true);
+
+        if (!test4_taken) {
+            System.out.println("Test4 failed");
+            success = false;
+        }
+
         if (!success) {
             throw new RuntimeException("Some tests failed");
         }
--- a/test/testlibrary/jdk/test/lib/Utils.java	Tue Sep 22 14:24:31 2015 -0400
+++ b/test/testlibrary/jdk/test/lib/Utils.java	Mon Sep 21 17:49:57 2015 +0200
@@ -428,4 +428,28 @@
     public static long adjustTimeout(long tOut) {
         return Math.round(tOut * Utils.TIMEOUT_FACTOR);
     }
+
+    /**
+     * Runs runnable and checks that it throws expected exception. If exceptionException is null it means
+     * that we expect no exception to be thrown.
+     * @param runnable what we run
+     * @param expectedException expected exception
+     */
+    public static void runAndCheckException(Runnable runnable, Class<? extends Throwable> expectedException) {
+        try {
+            runnable.run();
+            if (expectedException != null) {
+                throw new AssertionError("Didn't get expected exception " + expectedException.getSimpleName());
+            }
+        } catch (Throwable t) {
+            if (expectedException == null) {
+                throw new AssertionError("Got unexpected exception ", t);
+            }
+            if (!expectedException.isAssignableFrom(t.getClass())) {
+                throw new AssertionError(String.format("Got unexpected exception %s instead of %s",
+                        t.getClass().getSimpleName(), expectedException.getSimpleName()), t);
+            }
+        }
+    }
+
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/testlibrary_tests/whitebox/BlobSanityTest.java	Mon Sep 21 17:49:57 2015 +0200
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test BlobSanityTest
+ * @bug 8132980
+ * @library /testlibrary /../../test/lib
+ * @modules java.management/sun.management
+ * @build BlobSanityTest
+ * @run main ClassFileInstaller sun.hotspot.WhiteBox
+ *                              sun.hotspot.WhiteBox$WhiteBoxPermission
+ * @run main/othervm -Xbootclasspath/a:. -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI BlobSanityTest
+ * @summary sanity testing of allocateCodeBlob, freeCodeBlob and getCodeBlob
+ */
+
+
+import sun.hotspot.WhiteBox;
+
+import java.util.function.Consumer;
+import jdk.test.lib.Utils;
+
+public class BlobSanityTest {
+
+    private static void runTest(Consumer<Integer> consumer, int val, String testCaseName, Class<? extends Throwable>
+            expectedException) {
+            System.out.println("Calling " + testCaseName);
+            Utils.runAndCheckException(() -> consumer.accept(val), expectedException);
+            System.out.println("Looks ok");
+    }
+
+    public static void main(String[] args) throws Exception {
+        System.out.println("Crash means that sanity check failed");
+
+        WhiteBox wb = WhiteBox.getWhiteBox();
+
+        runTest(wb::freeCodeBlob, 0, "wb::freeCodeBlob(0)", null);
+        runTest(wb::getCodeBlob, 0, "wb::getCodeBlob(0)", NullPointerException.class);
+        runTest(x -> wb.allocateCodeBlob(x, 0), -1, "wb::allocateCodeBlob(-1,0)", IllegalArgumentException.class);
+    }
+}