changeset 38049:e8541793960f

8153998: Masked vector post loops Summary: Masked vectorization for post loops to execute in a single iteration in place of fixup scalar loops which used to take many iterations to complete work for user loops. Reviewed-by: twisti, kvn
author mcberg
date Mon, 18 Apr 2016 15:18:14 -0700
parents 21720d6174a2
children 8fc8bec6e8a7
files hotspot/src/cpu/aarch64/vm/aarch64.ad hotspot/src/cpu/ppc/vm/ppc.ad hotspot/src/cpu/sparc/vm/sparc.ad hotspot/src/cpu/x86/vm/assembler_x86.cpp hotspot/src/cpu/x86/vm/assembler_x86.hpp hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp hotspot/src/cpu/x86/vm/x86.ad hotspot/src/cpu/x86/vm/x86_32.ad hotspot/src/cpu/x86/vm/x86_64.ad hotspot/src/share/vm/opto/classes.hpp hotspot/src/share/vm/opto/loopUnswitch.cpp hotspot/src/share/vm/opto/loopnode.cpp hotspot/src/share/vm/opto/matcher.hpp hotspot/src/share/vm/opto/node.hpp hotspot/src/share/vm/opto/superword.cpp hotspot/src/share/vm/opto/superword.hpp hotspot/src/share/vm/opto/vectornode.hpp hotspot/src/share/vm/runtime/vmStructs.cpp
diffstat 19 files changed, 641 insertions(+), 130 deletions(-) [+]
line wrap: on
line diff
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad	Mon Apr 18 15:18:14 2016 -0700
@@ -3346,6 +3346,10 @@
   return ret_value;  // Per default match rules are supported.
 }
 
+const bool Matcher::has_predicated_vectors(void) {  // Matcher query for predicated-vector support, as answered by aarch64.ad
+  return false;  // unconditionally false in this port's definition
+}
+
 const int Matcher::float_pressure(int default_pressure_threshold) {
   return default_pressure_threshold;
 }
--- a/hotspot/src/cpu/ppc/vm/ppc.ad	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/ppc/vm/ppc.ad	Mon Apr 18 15:18:14 2016 -0700
@@ -2047,6 +2047,10 @@
   return ret_value;  // Per default match rules are supported.
 }
 
+const bool Matcher::has_predicated_vectors(void) {  // Matcher query for predicated-vector support, as answered by ppc.ad
+  return false;  // unconditionally false in this port's definition
+}
+
 const int Matcher::float_pressure(int default_pressure_threshold) {
   return default_pressure_threshold;
 }
--- a/hotspot/src/cpu/sparc/vm/sparc.ad	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad	Mon Apr 18 15:18:14 2016 -0700
@@ -1904,6 +1904,10 @@
   return ret_value;  // Per default match rules are supported.
 }
 
+const bool Matcher::has_predicated_vectors(void) {  // Matcher query for predicated-vector support, as answered by sparc.ad
+  return false;  // unconditionally false in this port's definition
+}
+
 const int Matcher::float_pressure(int default_pressure_threshold) {
   return default_pressure_threshold;
 }
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Apr 18 15:18:14 2016 -0700
@@ -1240,6 +1240,7 @@
 void Assembler::addsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -1250,6 +1251,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
@@ -1599,6 +1601,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);;
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x2F);
   emit_operand(dst, src);
@@ -1607,6 +1610,7 @@
 void Assembler::comisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x2F);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -1733,6 +1737,7 @@
 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5A);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -1743,6 +1748,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5A);
   emit_operand(dst, src);
@@ -1849,6 +1855,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
@@ -1857,6 +1864,7 @@
 void Assembler::divsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2131,6 +2139,7 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_128bit;
   InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x28);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2165,6 +2174,7 @@
   NOT_LP64(assert(VM_Version::supports_sse3(), ""));
   int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_128bit;
   InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x12);
   emit_int8(0xC0 | encode);
@@ -2202,6 +2212,15 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::kmovwl(KRegister dst, Address src) {  // Emits the memory-source form: dst is a KRegister, src is an Address (presumably KMOVW k, m16 - confirm against the Intel SDM)
+  assert(VM_Version::supports_evex(), "");  // guard: only valid when VM_Version reports EVEX support
+  InstructionMark im(this);  // scoped marker, as in the other Address-operand emitters in this file
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);  // 128-bit length, W clear, legacy_mode set, no mask register, no vector-length use
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);  // prefix for the Address operand: nds = 0, no SIMD prefix, 0F opcode map
+  emit_int8((unsigned char)0x90);  // opcode byte
+  emit_operand((Register)dst, src);  // operand bytes for dst/src; the KRegister is cast to Register, presumably to reuse the (Register, Address) overload
+}
+
 void Assembler::kmovdl(KRegister dst, Register src) {
   assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -2260,6 +2279,14 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::knotwl(KRegister dst, KRegister src) {  // Emits the register-to-register form on two KRegisters (presumably KNOTW k, k - confirm against the Intel SDM)
+  assert(VM_Version::supports_evex(), "");  // guard: only valid when VM_Version reports EVEX support
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);  // 128-bit length, W clear, legacy_mode set, no mask register, no vector-length use
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);  // emits the prefix (nds = 0, no SIMD prefix, 0F map) and returns the register-field bits
+  emit_int8((unsigned char)0x44);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // byte after the opcode: top two bits set (register-direct form in x86 ModRM encoding) combined with the returned bits
+}
+
 // This instruction produces ZF or CF flags
 void Assembler::kortestbl(KRegister src1, KRegister src2) {
   assert(VM_Version::supports_avx512dq(), "");
@@ -2432,6 +2459,7 @@
 void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int prefix = (_legacy_mode_bw) ? VEX_SIMD_F2 : VEX_SIMD_F3;
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2444,6 +2472,7 @@
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int prefix = (_legacy_mode_bw) ? VEX_SIMD_F2 : VEX_SIMD_F3;
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_is_evex_instruction();
   vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
   emit_operand(dst, src);
@@ -2456,6 +2485,7 @@
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int prefix = (_legacy_mode_bw) ? VEX_SIMD_F2 : VEX_SIMD_F3;
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
   emit_int8(0x7F);
   emit_operand(src, dst);
@@ -2464,6 +2494,7 @@
 void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int prefix = (_legacy_mode_bw) ? VEX_SIMD_F2 : VEX_SIMD_F3;
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2475,6 +2506,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_is_evex_instruction();
   int prefix = (_legacy_mode_bw) ? VEX_SIMD_F2 : VEX_SIMD_F3;
   vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2487,6 +2519,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_is_evex_instruction();
   int prefix = (_legacy_mode_bw) ? VEX_SIMD_F2 : VEX_SIMD_F3;
   vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
   emit_int8(0x7F);
@@ -2518,8 +2551,8 @@
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
-  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x7F);
   emit_operand(src, dst);
@@ -2538,8 +2571,8 @@
   assert(VM_Version::supports_evex(), "");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
-  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
   emit_operand(dst, src);
@@ -2550,8 +2583,8 @@
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
-  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x7F);
   emit_operand(src, dst);
@@ -2601,6 +2634,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x12);
   emit_operand(dst, src);
@@ -2631,6 +2665,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x7E);
   emit_operand(dst, src);
@@ -2641,6 +2676,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD6);
   emit_operand(src, dst);
@@ -2665,6 +2701,7 @@
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2675,6 +2712,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
   emit_operand(dst, src);
@@ -2685,6 +2723,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x11);
   emit_operand(src, dst);
@@ -2808,6 +2847,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
@@ -2816,6 +2856,7 @@
 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3795,6 +3836,7 @@
 void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4090,6 +4132,7 @@
 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4100,6 +4143,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_operand(dst, src);
@@ -4175,6 +4219,7 @@
 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4185,6 +4230,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
@@ -4272,6 +4318,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x2E);
   emit_operand(dst, src);
@@ -4280,6 +4327,7 @@
 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x2E);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4391,6 +4439,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
@@ -4399,6 +4448,7 @@
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4427,6 +4477,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
@@ -4435,6 +4486,7 @@
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4463,6 +4515,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
@@ -4471,6 +4524,7 @@
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4499,6 +4553,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
@@ -4507,6 +4562,7 @@
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4537,6 +4593,7 @@
 void Assembler::addpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4546,6 +4603,7 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
@@ -4564,6 +4622,7 @@
 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4582,6 +4641,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
@@ -4600,6 +4660,7 @@
 void Assembler::subpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4616,6 +4677,7 @@
 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4634,6 +4696,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
@@ -4652,6 +4715,7 @@
 void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4662,6 +4726,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
@@ -4678,6 +4743,7 @@
 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4696,6 +4762,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
@@ -4714,6 +4781,7 @@
 void Assembler::divpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4730,6 +4798,7 @@
 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4748,6 +4817,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
@@ -4766,6 +4836,7 @@
 void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4776,6 +4847,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_operand(dst, src);
@@ -4784,6 +4856,7 @@
 void Assembler::andpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4812,6 +4885,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_operand(dst, src);
@@ -4820,6 +4894,7 @@
 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4838,6 +4913,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_operand(dst, src);
@@ -4856,6 +4932,7 @@
 void Assembler::unpckhpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x15);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4864,6 +4941,7 @@
 void Assembler::unpcklpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x14);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4872,6 +4950,7 @@
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4890,6 +4969,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_operand(dst, src);
@@ -4908,6 +4988,7 @@
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4926,6 +5007,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_operand(dst, src);
@@ -4996,6 +5078,7 @@
 void Assembler::paddq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD4);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5044,6 +5127,7 @@
 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD4);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5084,6 +5168,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD4);
   emit_operand(dst, src);
@@ -5115,6 +5200,7 @@
 void Assembler::psubq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFB);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5147,6 +5233,7 @@
 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFB);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5187,6 +5274,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFB);
   emit_operand(dst, src);
@@ -5225,8 +5313,9 @@
 }
 
 void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  assert(UseAVX > 2, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
+  assert(UseAVX > 2, "requires some form of EVEX");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5253,10 +5342,11 @@
 }
 
 void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  assert(UseAVX > 0, "requires some form of AVX");
+  assert(UseAVX > 2, "requires some form of EVEX");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_is_evex_instruction();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_operand(dst, src);
@@ -5312,6 +5402,7 @@
 void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5341,6 +5432,7 @@
 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
   int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x73);
@@ -5367,6 +5459,7 @@
 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5398,6 +5491,7 @@
   // shifts 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x73);
@@ -5424,6 +5518,7 @@
 void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD3);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5452,6 +5547,7 @@
 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
   int encode = vex_prefix_and_encode(xmm2->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x73);
@@ -5478,6 +5574,7 @@
 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD3);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5587,6 +5684,7 @@
 void Assembler::pandn(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xDF);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5876,9 +5974,9 @@
 }
 
 void Assembler::vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_avx512dq(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5966,9 +6064,9 @@
 }
 
 void Assembler::vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_avx512dq(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6093,7 +6191,8 @@
 // duplicate 8-byte integer data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6103,7 +6202,8 @@
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   // swap src<->dst for encoding
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6138,7 +6238,8 @@
 // duplicate double precision data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6148,8 +6249,9 @@
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_rex_vex_w_reverted();
   // swap src<->dst for encoding
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x19);
@@ -6163,12 +6265,9 @@
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();  // flag the instruction as EVEX before the prefix is generated
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  if (attributes.is_evex_instruction()) {
-    emit_int8(0x7A);
-  } else {
-    emit_int8(0x78);
-  }
+  emit_int8(0x7A);  // opcode previously used only on the is_evex_instruction() path; the 0x78 fallback is dropped
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -6176,12 +6275,9 @@
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();  // flag the instruction as EVEX before the prefix is generated
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  if (attributes.is_evex_instruction()) {
-    emit_int8(0x7B);
-  } else {
-    emit_int8(0x79);
-  }
+  emit_int8(0x7B);  // opcode previously used only on the is_evex_instruction() path; the 0x79 fallback is dropped
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -6189,12 +6285,9 @@
 void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();  // flag the instruction as EVEX before the prefix is generated
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  if (attributes.is_evex_instruction()) {
-    emit_int8(0x7C);
-  } else {
-    emit_int8(0x58);
-  }
+  emit_int8(0x7C);  // opcode previously used only on the is_evex_instruction() path; the 0x58 fallback is dropped
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -6202,12 +6295,9 @@
 void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();  // flag the instruction as EVEX before the prefix is generated
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  if (attributes.is_evex_instruction()) {
-    emit_int8(0x7C);
-  } else {
-    emit_int8(0x59);
-  }
+  emit_int8(0x7C);  // opcode previously used only on the is_evex_instruction() path; the 0x59 fallback is dropped
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -6862,6 +6952,9 @@
     attributes->set_is_evex_instruction();
     evex_prefix(vex_r, vex_b, vex_x, evex_r, evex_v, nds_enc, pre, opc);
   } else {
+    if (UseAVX > 2 && attributes->is_rex_vex_w_reverted()) {
+      attributes->set_rex_vex_w(false);
+    }
     vex_prefix(vex_r, vex_b, vex_x, nds_enc, pre, opc);
   }
 }
@@ -6921,6 +7014,9 @@
     attributes->set_is_evex_instruction();
     evex_prefix(vex_r, vex_b, vex_x, evex_r, evex_v, nds_enc, pre, opc);
   } else {
+    if (UseAVX > 2 && attributes->is_rex_vex_w_reverted()) {
+      attributes->set_rex_vex_w(false);
+    }
     vex_prefix(vex_r, vex_b, vex_x, nds_enc, pre, opc);
   }
 
@@ -6975,6 +7071,21 @@
   emit_int8((unsigned char)(0xF0 & src2_enc<<4));
 }
 
+void Assembler::shlxl(Register dst, Register src1, Register src2) {  // emit BMI2 SHLX, vex_w = false variant
+  assert(VM_Version::supports_bmi2(), "");  // SHLX is only emitted when BMI2 is reported
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);  // src2 goes in the nds slot, src1 in the trailing src slot
+  emit_int8((unsigned char)0xF7);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // register-to-register ModRM byte built from the returned encoding
+}
+
+void Assembler::shlxq(Register dst, Register src1, Register src2) {  // emit BMI2 SHLX, vex_w = true variant
+  assert(VM_Version::supports_bmi2(), "");  // SHLX is only emitted when BMI2 is reported
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);  // src2 goes in the nds slot, src1 in the trailing src slot
+  emit_int8((unsigned char)0xF7);  // same opcode byte as shlxl; only vex_w differs
+  emit_int8((unsigned char)(0xC0 | encode));  // register-to-register ModRM byte built from the returned encoding
+}
 
 #ifndef _LP64
 
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Apr 18 15:18:14 2016 -0700
@@ -1337,6 +1337,7 @@
   void kmovbl(KRegister dst, Register src);
   void kmovbl(Register dst, KRegister src);
   void kmovwl(KRegister dst, Register src);
+  void kmovwl(KRegister dst, Address src);
   void kmovwl(Register dst, KRegister src);
   void kmovdl(KRegister dst, Register src);
   void kmovdl(Register dst, KRegister src);
@@ -1346,6 +1347,8 @@
   void kmovql(KRegister dst, Register src);
   void kmovql(Register dst, KRegister src);
 
+  void knotwl(KRegister dst, KRegister src);
+
   void kortestbl(KRegister dst, KRegister src);
   void kortestwl(KRegister dst, KRegister src);
   void kortestdl(KRegister dst, KRegister src);
@@ -2052,6 +2055,8 @@
   void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
   void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
 
+  void shlxl(Register dst, Register src1, Register src2);
+  void shlxq(Register dst, Register src1, Register src2);
 
  protected:
   // Next instructions require address alignment 16 bytes SSE mode.
@@ -2077,6 +2082,7 @@
     :
       _avx_vector_len(vector_len),
       _rex_vex_w(rex_vex_w),
+      _rex_vex_w_reverted(false),
       _legacy_mode(legacy_mode),
       _no_reg_mask(no_reg_mask),
       _uses_vl(uses_vl),
@@ -2100,6 +2106,7 @@
 private:
   int  _avx_vector_len;
   bool _rex_vex_w;
+  bool _rex_vex_w_reverted;
   bool _legacy_mode;
   bool _no_reg_mask;
   bool _uses_vl;
@@ -2116,6 +2123,7 @@
   // query functions for field accessors
   int  get_vector_len(void) const { return _avx_vector_len; }
   bool is_rex_vex_w(void) const { return _rex_vex_w; }
+  bool is_rex_vex_w_reverted(void) const { return _rex_vex_w_reverted; }  // const like the sibling query accessors
   bool is_legacy_mode(void) const { return _legacy_mode; }
   bool is_no_reg_mask(void) const { return _no_reg_mask; }
   bool uses_vl(void) const { return _uses_vl; }
@@ -2129,6 +2137,12 @@
   // Set the vector len manually
   void set_vector_len(int vector_len) { _avx_vector_len = vector_len; }
 
+  // Request that rex_vex_w be cleared if the instruction ends up emitted with the AVX (VEX) prefix
+  void set_rex_vex_w_reverted(void) { _rex_vex_w_reverted = true; }
+
+  // Overwrite rex_vex_w with the given state
+  void set_rex_vex_w(bool state) { _rex_vex_w = state; }
+
   // Set the instruction to be encoded in AVX mode
   void set_is_legacy_mode(void) { _legacy_mode = true; }
 
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Apr 18 15:18:14 2016 -0700
@@ -3399,6 +3399,18 @@
   }
 }
 
+void MacroAssembler::setvectmask(Register dst, Register src) {  // load k1 with a mask derived from src; dst is scratch until the final copy
+  Assembler::movl(dst, 1);  // dst = 1
+  Assembler::shlxl(dst, dst, src);  // dst = 1 << src; NOTE(review): SHLX masks a 32-bit count to 5 bits, so src >= 32 would wrap - confirm callers stay below that
+  Assembler::decl(dst);  // dst = (1 << src) - 1, i.e. the low src bits set
+  Assembler::kmovdl(k1, dst);  // install the mask in opmask register k1
+  Assembler::movl(dst, src);  // leave a copy of src in dst as the result value
+}
+
+void MacroAssembler::restorevectmask() {  // overwrite k1 (the register setvectmask loads) with the complement of k0
+  Assembler::knotwl(k1, k0);  // NOTE(review): gives an all-ones word mask only if k0 holds zero - confirm k0 is never written
+}
+
 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
     if (UseXmmLoadAndClearUpper) {
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Apr 18 15:18:14 2016 -0700
@@ -156,6 +156,10 @@
   void incrementq(Register reg, int value = 1);
   void incrementq(Address dst, int value = 1);
 
+  // special instructions for EVEX
+  void setvectmask(Register dst, Register src);
+  void restorevectmask();
+
   // Support optimal SSE move instructions.
   void movflt(XMMRegister dst, XMMRegister src) {
     if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
--- a/hotspot/src/cpu/x86/vm/x86.ad	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/x86.ad	Mon Apr 18 15:18:14 2016 -0700
@@ -1758,6 +1758,15 @@
   return ret_value;  // Per default match rules are supported.
 }
 
+const bool Matcher::has_predicated_vectors(void) {  // reports whether masked (predicated) vector rules may be used
+  bool ret_value = false;  // default: not available
+  if (UseAVX > 2) {  // only considered at the UseAVX level the assembler treats as EVEX
+    ret_value = VM_Version::supports_avx512vl();  // and then only when AVX512VL is reported
+  }
+
+  return ret_value;
+}
+
 const int Matcher::float_pressure(int default_pressure_threshold) {
   int float_pressure_threshold = default_pressure_threshold;
 #ifdef _LP64
@@ -1875,7 +1884,7 @@
       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
       break;
     case Op_VecZ:
-      __ evmovdqul(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
+      __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
       break;
     default:
       ShouldNotReachHere();
@@ -1930,7 +1939,7 @@
         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
         break;
       case Op_VecZ:
-        __ evmovdqul(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
+        __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
         break;
       default:
         ShouldNotReachHere();
@@ -1950,7 +1959,7 @@
         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
         break;
       case Op_VecZ:
-        __ evmovdqul(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
+        __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
         break;
       default:
         ShouldNotReachHere();
@@ -2176,6 +2185,19 @@
   ins_pipe(pipe_slow);
 %}
 
+// =================================EVEX special===============================
+
+instruct setMask(rRegI dst, rRegI src) %{
+  predicate(Matcher::has_predicated_vectors());  // rule is only selectable when predicated vectors are available
+  match(Set dst (SetVectMaskI  src));
+  effect(TEMP dst);  // dst is written before src is last read in setvectmask, so it is declared TEMP
+  format %{ "setvectmask   $dst, $src" %}
+  ins_encode %{
+    __ setvectmask($dst$$Register, $src$$Register);  // loads k1 from src and leaves a copy of src in dst
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // ============================================================================
 
 instruct addF_reg(regF dst, regF src) %{
@@ -3069,11 +3091,11 @@
 %}
 
 // Load vectors (64 bytes long)
-instruct loadV64(vecZ dst, memory mem) %{
-  predicate(n->as_LoadVector()->memory_size() == 64);
+instruct loadV64_dword(vecZ dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
   match(Set dst (LoadVector mem));
   ins_cost(125);
-  format %{ "vmovdqu $dst k0,$mem\t! load vector (64 bytes)" %}
+  format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
   ins_encode %{
     int vector_len = 2;
     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
@@ -3081,6 +3103,19 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load vectors (64 bytes long) whose elements are wider than 4 bytes
+instruct loadV64_qword(vecZ dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);  // complements loadV64_dword (element_size() <= 4)
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;  // same literal the Op_VecZ moves pass; presumably the 512-bit length - confirm
+    __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);  // quadword form of the move, unlike loadV64_dword's evmovdqul
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Store vectors
 instruct storeV4(memory mem, vecS src) %{
   predicate(n->as_StoreVector()->memory_size() == 4);
@@ -3126,11 +3161,11 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct storeV64(memory mem, vecZ src) %{
-  predicate(n->as_StoreVector()->memory_size() == 64);
+instruct storeV64_dword(memory mem, vecZ src) %{
+  predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
   match(Set mem (StoreVector mem src));
   ins_cost(145);
-  format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %}
+  format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
   ins_encode %{
     int vector_len = 2;
     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
@@ -3138,6 +3173,18 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct storeV64_qword(memory mem, vecZ src) %{
+  predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);  // 64-byte stores with elements wider than 4 bytes; complements storeV64_dword
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;  // same literal the Op_VecZ moves pass; presumably the 512-bit length - confirm
+    __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);  // quadword form of the move, unlike storeV64_dword's evmovdqul
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ====================LEGACY REPLICATE=======================================
 
 instruct Repl4B_mem(vecS dst, memory mem) %{
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Mon Apr 18 15:18:14 2016 -0700
@@ -1021,10 +1021,10 @@
       __ vmovdqu(xmm0, Address(rsp, -32));
       break;
     case Op_VecZ:
-      __ evmovdqul(Address(rsp, -64), xmm0, 2);
-      __ evmovdqul(xmm0, Address(rsp, src_offset), 2);
-      __ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
-      __ evmovdqul(xmm0, Address(rsp, -64), 2);
+      __ evmovdquq(Address(rsp, -64), xmm0, 2);
+      __ evmovdquq(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdquq(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdquq(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
@@ -12047,6 +12047,7 @@
 
 // Jump Direct Conditional - Label defines a relative address from Jcc+1
 instruct jmpLoopEnd(cmpOp cop, eFlagsReg cr, label labl) %{
+  predicate(!n->has_vector_mask_set());
   match(CountedLoopEnd cop cr);
   effect(USE labl);
 
@@ -12062,6 +12063,7 @@
 
 // Jump Direct Conditional - Label defines a relative address from Jcc+1
 instruct jmpLoopEndU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
+  predicate(!n->has_vector_mask_set());
   match(CountedLoopEnd cop cmp);
   effect(USE labl);
 
@@ -12076,6 +12078,7 @@
 %}
 
 instruct jmpLoopEndUCF(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
+  predicate(!n->has_vector_mask_set());
   match(CountedLoopEnd cop cmp);
   effect(USE labl);
 
@@ -12089,6 +12092,60 @@
   ins_pipe( pipe_jcc );
 %}
 
+// mask version: matched instead of jmpLoopEnd when the CountedLoopEnd has the vector mask flag set
+// Jump Direct Conditional - Label defines a relative address from Jcc+1
+instruct jmpLoopEnd_and_restoreMask(cmpOp cop, eFlagsReg cr, label labl) %{
+  predicate(n->has_vector_mask_set()); // jmpLoopEnd carries the negated predicate, so exactly one of the two rules applies
+  match(CountedLoopEnd cop cr);
+  effect(USE labl);
+
+  ins_cost(400);
+  format %{ "J$cop    $labl\t# Loop end\n\t"
+            "restorevectmask \t# vector mask restore for loops" %}
+  size(10); // declared size of the sequence emitted below (long jcc followed by restorevectmask)
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+    __ restorevectmask(); // emitted after the jcc, so it is reached only when the branch to labl is not taken
+  %}
+  ins_pipe( pipe_jcc );
+%}
+
+// Jump Direct Conditional - Label defines a relative address from Jcc+1 (unsigned compare, mask-restoring variant)
+instruct jmpLoopEndU_and_restoreMask(cmpOpU cop, eFlagsRegU cmp, label labl) %{
+  predicate(n->has_vector_mask_set()); // jmpLoopEndU carries the negated predicate
+  match(CountedLoopEnd cop cmp);
+  effect(USE labl);
+
+  ins_cost(400);
+  format %{ "J$cop,u  $labl\t# Loop end\n\t"
+            "restorevectmask \t# vector mask restore for loops" %}
+  size(10); // declared size of the sequence emitted below (long jcc followed by restorevectmask)
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+    __ restorevectmask(); // emitted after the jcc, so it is reached only when the branch to labl is not taken
+  %}
+  ins_pipe( pipe_jcc );
+%}
+
+instruct jmpLoopEndUCF_and_restoreMask(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
+  predicate(n->has_vector_mask_set()); // jmpLoopEndUCF carries the negated predicate
+  match(CountedLoopEnd cop cmp);
+  effect(USE labl);
+
+  ins_cost(300); // lower than the 400 used by the cmpOp/cmpOpU mask-restoring variants
+  format %{ "J$cop,u  $labl\t# Loop end\n\t"
+            "restorevectmask \t# vector mask restore for loops" %}
+  size(10); // declared size of the sequence emitted below (long jcc followed by restorevectmask)
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+    __ restorevectmask(); // emitted after the jcc, so it is reached only when the branch to labl is not taken
+  %}
+  ins_pipe( pipe_jcc );
+%}
+
 // Jump Direct Conditional - using unsigned comparison
 instruct jmpConU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
   match(If cop cmp);
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Mon Apr 18 15:18:14 2016 -0700
@@ -1081,10 +1081,10 @@
       __ vmovdqu(xmm0, Address(rsp, -32));
       break;
     case Op_VecZ:
-      __ evmovdqul(Address(rsp, -64), xmm0, 2);
-      __ evmovdqul(xmm0, Address(rsp, src_offset), 2);
-      __ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
-      __ evmovdqul(xmm0, Address(rsp, -64), 2);
+      __ evmovdquq(Address(rsp, -64), xmm0, 2);
+      __ evmovdquq(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdquq(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdquq(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
@@ -11443,6 +11443,7 @@
 // Jump Direct Conditional - Label defines a relative address from Jcc+1
 instruct jmpLoopEnd(cmpOp cop, rFlagsReg cr, label labl)
 %{
+  predicate(!n->has_vector_mask_set());
   match(CountedLoopEnd cop cr);
   effect(USE labl);
 
@@ -11458,6 +11459,7 @@
 
 // Jump Direct Conditional - Label defines a relative address from Jcc+1
 instruct jmpLoopEndU(cmpOpU cop, rFlagsRegU cmp, label labl) %{
+  predicate(!n->has_vector_mask_set());
   match(CountedLoopEnd cop cmp);
   effect(USE labl);
 
@@ -11472,6 +11474,7 @@
 %}
 
 instruct jmpLoopEndUCF(cmpOpUCF cop, rFlagsRegUCF cmp, label labl) %{
+  predicate(!n->has_vector_mask_set());
   match(CountedLoopEnd cop cmp);
   effect(USE labl);
 
@@ -11485,6 +11488,61 @@
   ins_pipe(pipe_jcc);
 %}
 
+// mask version: matched instead of jmpLoopEnd when the CountedLoopEnd has the vector mask flag set
+// Jump Direct Conditional - Label defines a relative address from Jcc+1
+instruct jmpLoopEnd_and_restoreMask(cmpOp cop, rFlagsReg cr, label labl)
+%{
+  predicate(n->has_vector_mask_set()); // jmpLoopEnd carries the negated predicate, so exactly one of the two rules applies
+  match(CountedLoopEnd cop cr);
+  effect(USE labl);
+
+  ins_cost(400);
+  format %{ "j$cop     $labl\t# loop end\n\t"
+            "restorevectmask \t# vector mask restore for loops" %}
+  size(10); // declared size of the sequence emitted below (long jcc followed by restorevectmask)
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+    __ restorevectmask(); // emitted after the jcc, so it is reached only when the branch to labl is not taken
+  %}
+  ins_pipe(pipe_jcc);
+%}
+
+// Jump Direct Conditional - Label defines a relative address from Jcc+1 (unsigned compare, mask-restoring variant)
+instruct jmpLoopEndU_and_restoreMask(cmpOpU cop, rFlagsRegU cmp, label labl) %{
+  predicate(n->has_vector_mask_set()); // jmpLoopEndU carries the negated predicate
+  match(CountedLoopEnd cop cmp);
+  effect(USE labl);
+
+  ins_cost(400);
+  format %{ "j$cop,u   $labl\t# loop end\n\t"
+            "restorevectmask \t# vector mask restore for loops" %}
+  size(10); // declared size of the sequence emitted below (long jcc followed by restorevectmask)
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+    __ restorevectmask(); // emitted after the jcc, so it is reached only when the branch to labl is not taken
+  %}
+  ins_pipe(pipe_jcc);
+%}
+
+instruct jmpLoopEndUCF_and_restoreMask(cmpOpUCF cop, rFlagsRegUCF cmp, label labl) %{
+  predicate(n->has_vector_mask_set()); // jmpLoopEndUCF carries the negated predicate
+  match(CountedLoopEnd cop cmp);
+  effect(USE labl);
+
+  ins_cost(300); // lower than the 400 used by the cmpOp/cmpOpU mask-restoring variants
+  format %{ "j$cop,u   $labl\t# loop end\n\t"
+            "restorevectmask \t# vector mask restore for loops" %}
+  size(10); // declared size of the sequence emitted below (long jcc followed by restorevectmask)
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+    __ restorevectmask(); // emitted after the jcc, so it is reached only when the branch to labl is not taken
+  %}
+  ins_pipe(pipe_jcc);
+%}
+
 // Jump Direct Conditional - using unsigned comparison
 instruct jmpConU(cmpOpU cop, rFlagsRegU cmp, label labl) %{
   match(If cop cmp);
--- a/hotspot/src/share/vm/opto/classes.hpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/share/vm/opto/classes.hpp	Mon Apr 18 15:18:14 2016 -0700
@@ -178,6 +178,7 @@
 macro(MaxI)
 macro(MemBarAcquire)
 macro(LoadFence)
+macro(SetVectMaskI)
 macro(MemBarAcquireLock)
 macro(MemBarCPUOrder)
 macro(MemBarRelease)
--- a/hotspot/src/share/vm/opto/loopUnswitch.cpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/share/vm/opto/loopUnswitch.cpp	Mon Apr 18 15:18:14 2016 -0700
@@ -374,10 +374,17 @@
     return false; // skip malformed counted loop
   }
   if (!cl->is_main_loop()) {
-    if (TraceLoopOpts) {
-      tty->print_cr("CountedLoopReserveKit::create_reserve: %d not main loop", cl->_idx);
+    bool loop_not_canonical = true;
+    if (cl->is_post_loop() && (cl->slp_max_unroll() > 0)) {
+      loop_not_canonical = false;
     }
-    return false; // skip normal, pre, and post loops
+    // only reject some loop forms
+    if (loop_not_canonical) {
+      if (TraceLoopOpts) {
+        tty->print_cr("CountedLoopReserveKit::create_reserve: %d not canonical loop", cl->_idx);
+      }
+      return false; // skip normal, pre, and post (conditionally) loops
+    }
   }
 
   _lp = _lpt->_head->as_Loop();
--- a/hotspot/src/share/vm/opto/loopnode.cpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/share/vm/opto/loopnode.cpp	Mon Apr 18 15:18:14 2016 -0700
@@ -2369,11 +2369,13 @@
                   if (multi_version_post_loops(lpt, lpt_next) == false) {
                     // Cause the rce loop to be optimized away if we fail
                     cl->mark_is_multiversioned();
+                    cl->set_slp_max_unroll(0);
                     poison_rce_post_loop(lpt);
                   }
                 }
               }
             }
+            sw.transform_loop(lpt, true);
           }
         } else if (cl->is_main_loop()) {
           sw.transform_loop(lpt, true);
--- a/hotspot/src/share/vm/opto/matcher.hpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/share/vm/opto/matcher.hpp	Mon Apr 18 15:18:14 2016 -0700
@@ -273,6 +273,9 @@
   // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
   static const bool match_rule_supported_vector(int opcode, int vlen);
 
+  // Some microarchitectures have mask registers used on vectors
+  static const bool has_predicated_vectors(void);
+
   // Some uarchs have different sized float register resources
   static const int float_pressure(int default_pressure_threshold);
 
--- a/hotspot/src/share/vm/opto/node.hpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/share/vm/opto/node.hpp	Mon Apr 18 15:18:14 2016 -0700
@@ -722,8 +722,9 @@
     Flag_avoid_back_to_back_after    = Flag_avoid_back_to_back_before << 1,
     Flag_has_call                    = Flag_avoid_back_to_back_after << 1,
     Flag_is_reduction                = Flag_has_call << 1,
-    Flag_is_scheduled                = Flag_is_reduction,
-    Flag_is_expensive                = Flag_is_scheduled << 1,
+    Flag_is_scheduled                = Flag_is_reduction << 1,
+    Flag_has_vector_mask_set         = Flag_is_scheduled << 1,
+    Flag_is_expensive                = Flag_has_vector_mask_set << 1,
     _max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination
   };
 
@@ -912,6 +913,9 @@
   // It must have the loop's phi as input and provide a def to the phi.
   bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
 
+  // The node is a CountedLoopEnd with a mask annotation so as to emit a restore context
+  bool has_vector_mask_set() const { return (_flags & Flag_has_vector_mask_set) != 0; }
+
   // Used in lcm to mark nodes that have scheduled
   bool is_scheduled() const { return (_flags & Flag_is_scheduled) != 0; }
 
--- a/hotspot/src/share/vm/opto/superword.cpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/share/vm/opto/superword.cpp	Mon Apr 18 15:18:14 2016 -0700
@@ -52,6 +52,7 @@
   _packset(arena(), 8,  0, NULL),         // packs for the current block
   _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
   _block(arena(), 8,  0, NULL),           // nodes in current block
+  _post_block(arena(), 8, 0, NULL),       // nodes common to current block which are marked as post loop vectorizable
   _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
@@ -100,10 +101,30 @@
 
   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
 
-  if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
+  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
+  if (post_loop_allowed) {
+    if (cl->is_reduction_loop()) return; // no predication mapping
+    Node *limit = cl->limit();
+    if (limit->is_Con()) return; // non constant limits only
+    // Now check the limit for expressions we do not handle
+    if (limit->is_Add()) {
+      Node *in2 = limit->in(2);
+      if (in2->is_Con()) {
+        int val = in2->get_int();
+        // should not try to program these cases
+        if (val < 0) return;
+      }
+    }
+  }
+
+  // skip any loop that has not been assigned max unroll by analysis
+  if (do_optimization) {
+    if (cl->slp_max_unroll() == 0) return;
+  }
+
   // Check for no control flow in body (other than exit)
   Node *cl_exit = cl->loopexit();
-  if (cl_exit->in(0) != lpt->_head) {
+  if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
     #ifndef PRODUCT
       if (TraceSuperWord) {
         tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
@@ -121,15 +142,16 @@
     return;
   }
 
-  // We only re-enter slp when we vector mapped a queried loop and we want to
-  // continue unrolling, in this case, slp is not subsequently done.
-  if (cl->do_unroll_only()) return;
-
-  // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
-  CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
-  if (pre_end == NULL) return;
-  Node *pre_opaq1 = pre_end->limit();
-  if (pre_opaq1->Opcode() != Op_Opaque1) return;
+  // Skip any loops already optimized by slp
+  if (cl->is_vectorized_loop()) return;
+
+  if (cl->is_main_loop()) {
+    // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
+    CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
+    if (pre_end == NULL) return;
+    Node *pre_opaq1 = pre_end->limit();
+    if (pre_opaq1->Opcode() != Op_Opaque1) return;
+  }
 
   init(); // initialize data structures
 
@@ -142,6 +164,19 @@
   if (do_optimization) {
     assert(_packset.length() == 0, "packset must be empty");
     SLP_extract();
+    if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) {
+      if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
+        IdealLoopTree *lpt_next = lpt->_next;
+        CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
+        _phase->has_range_checks(lpt_next);
+        if (cl_next->is_post_loop() && !cl_next->range_checks_present()) {
+          if (!cl_next->is_vectorized_loop()) {
+            int slp_max_unroll_factor = cl->slp_max_unroll();
+            cl_next->set_slp_max_unroll(slp_max_unroll_factor);
+          }
+        }
+      }
+    }
   }
 }
 
@@ -154,6 +189,9 @@
   Node_Stack nstack((int)ignored_size);
   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
   Node *cl_exit = cl->loopexit();
+  int rpo_idx = _post_block.length();
+
+  assert(rpo_idx == 0, "post loop block is empty");
 
   // First clear the entries
   for (uint i = 0; i < lpt()->_body.size(); i++) {
@@ -161,6 +199,7 @@
   }
 
   int max_vector = Matcher::max_vector_size(T_INT);
+  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 
   // Process the loop, some/all of the stack entries will not be in order, ergo
   // need to preprocess the ignored initial state before we process the loop
@@ -259,6 +298,7 @@
   if (is_slp) {
     // Now we try to find the maximum supported consistent vector which the machine
     // description can use
+    bool small_basic_type = false;
     for (uint i = 0; i < lpt()->_body.size(); i++) {
       if (ignored_loop_nodes[i] != -1) continue;
 
@@ -269,6 +309,26 @@
       } else {
         bt = n->bottom_type()->basic_type();
       }
+
+      if (post_loop_allowed) {
+        if (!small_basic_type) {
+          switch (bt) {
+          case T_CHAR:
+          case T_BYTE:
+          case T_SHORT:
+            small_basic_type = true;
+            break;
+
+          case T_LONG:
+            // TODO: Remove when support completed for mask context with LONG.
+            //       Support needs to be augmented for logical qword operations, currently we map to dword
+            //       buckets for vectors on logicals as these were legacy.
+            small_basic_type = true;
+            break;
+          }
+        }
+      }
+
       if (is_java_primitive(bt) == false) continue;
 
       int cur_max_vector = Matcher::max_vector_size(bt);
@@ -288,6 +348,12 @@
         if (cur_max_vector < max_vector) {
           max_vector = cur_max_vector;
         }
+
+        // We only process post loops on predicated targets where we want to
+        // mask map the loop to a single iteration
+        if (post_loop_allowed) {
+          _post_block.at_put_grow(rpo_idx++, n);
+        }
       }
     }
     if (is_slp) {
@@ -295,7 +361,14 @@
       cl->mark_passed_slp();
     }
     cl->mark_was_slp();
-    cl->set_slp_max_unroll(local_loop_unroll_factor);
+    if (cl->is_main_loop()) {
+      cl->set_slp_max_unroll(local_loop_unroll_factor);
+    } else if (post_loop_allowed) {
+      if (!small_basic_type) {
+        // avoid replication context for small basic types in programmable masked loops
+        cl->set_slp_max_unroll(local_loop_unroll_factor);
+      }
+    }
   }
 }
 
@@ -350,67 +423,104 @@
   if (!construct_bb()) {
     return; // Exit if no interesting nodes or complex graph.
   }
+
   // build    _dg, _disjoint_ptrs
   dependence_graph();
 
   // compute function depth(Node*)
   compute_max_depth();
 
-  if (_do_vector_loop) {
-    if (mark_generations() != -1) {
-      hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
-
-      if (!construct_bb()) {
-        return; // Exit if no interesting nodes or complex graph.
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
+  if (cl->is_main_loop()) {
+    if (_do_vector_loop) {
+      if (mark_generations() != -1) {
+        hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
+
+        if (!construct_bb()) {
+          return; // Exit if no interesting nodes or complex graph.
+        }
+        dependence_graph();
+        compute_max_depth();
       }
-      dependence_graph();
-      compute_max_depth();
+
+#ifndef PRODUCT
+      if (TraceSuperWord) {
+        tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
+        _lpt->dump_head();
+        for (int j = 0; j < _block.length(); j++) {
+          Node* n = _block.at(j);
+          int d = depth(n);
+          for (int i = 0; i < d; i++) tty->print("%s", "  ");
+          tty->print("%d :", d);
+          n->dump();
+        }
+      }
+#endif
     }
 
-#ifndef PRODUCT
-    if (TraceSuperWord) {
-      tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
-      _lpt->dump_head();
-      for (int j = 0; j < _block.length(); j++) {
-        Node* n = _block.at(j);
-        int d = depth(n);
-        for (int i = 0;  i < d; i++) tty->print("%s", "  ");
-        tty->print("%d :", d);
-        n->dump();
+    compute_vector_element_type();
+
+    // Attempt vectorization
+
+    find_adjacent_refs();
+
+    extend_packlist();
+
+    if (_do_vector_loop) {
+      if (_packset.length() == 0) {
+        if (TraceSuperWord) {
+          tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
+        }
+        pack_parallel();
       }
     }
-#endif
-  }
-
-  compute_vector_element_type();
-
-  // Attempt vectorization
-
-  find_adjacent_refs();
-
-  extend_packlist();
-
-  if (_do_vector_loop) {
-    if (_packset.length() == 0) {
-      if (TraceSuperWord) {
-        tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
+
+    combine_packs();
+
+    construct_my_pack_map();
+
+    if (_do_vector_loop) {
+      merge_packs_to_cmovd();
+    }
+
+    filter_packs();
+
+    schedule();
+  } else if (post_loop_allowed) {
+    int saved_mapped_unroll_factor = cl->slp_max_unroll();
+    if (saved_mapped_unroll_factor) {
+      int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
+
+      // now reset the slp_unroll_factor so that we can check whether the analysis
+      // maps the post loop to the same factor the vector loop was mapped to
+      cl->set_slp_max_unroll(0);
+
+      // do the analysis on the post loop
+      unrolling_analysis(vector_mapped_unroll_factor);
+
+      // if our analyzed loop is a canonical fit, start processing it
+      if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
+        // now add the vector nodes to packsets
+        for (int i = 0; i < _post_block.length(); i++) {
+          Node* n = _post_block.at(i);
+          Node_List* singleton = new Node_List();
+          singleton->push(n);
+          _packset.append(singleton);
+          set_my_pack(n, singleton);
+        }
+
+        // map base types for vector usage
+        compute_vector_element_type();
+      } else {
+        return;
       }
-      pack_parallel();
+    } else {
+      // for some reason we could not map the slp analysis state of the vectorized loop
+      return;
     }
   }
 
-  combine_packs();
-
-  construct_my_pack_map();
-
-  if (_do_vector_loop) {
-    merge_packs_to_cmovd();
-  }
-
-  filter_packs();
-
-  schedule();
-
   output();
 }
 
@@ -811,6 +921,7 @@
 // Add dependence edges to load/store nodes for memory dependence
 //    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
 void SuperWord::dependence_graph() {
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
   // First, assign a dependence node to each memory node
   for (int i = 0; i < _block.length(); i++ ) {
     Node *n = _block.at(i);
@@ -825,7 +936,9 @@
     Node* n_tail = _mem_slice_tail.at(i);
 
     // Get slice in predecessor order (last is first)
-    mem_slice_preds(n_tail, n, _nlist);
+    if (cl->is_main_loop()) {
+      mem_slice_preds(n_tail, n, _nlist);
+    }
 
 #ifndef PRODUCT
     if(TraceSuperWord && Verbose) {
@@ -2029,20 +2142,23 @@
   }
 #endif
 
-  // MUST ENSURE main loop's initial value is properly aligned:
-  //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
-
-  align_initial_loop_index(align_to_ref());
-
-  // Insert extract (unpack) operations for scalar uses
-  for (int i = 0; i < _packset.length(); i++) {
-    insert_extracts(_packset.at(i));
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+  if (cl->is_main_loop()) {
+    // MUST ENSURE main loop's initial value is properly aligned:
+    //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
+
+    align_initial_loop_index(align_to_ref());
+
+    // Insert extract (unpack) operations for scalar uses
+    for (int i = 0; i < _packset.length(); i++) {
+      insert_extracts(_packset.at(i));
+    }
   }
 
   Compile* C = _phase->C;
-  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
   uint max_vlen_in_bytes = 0;
   uint max_vlen = 0;
+  bool can_process_post_loop = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 
   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
 
@@ -2064,6 +2180,10 @@
       Node* vn = NULL;
       Node* low_adr = p->at(0);
       Node* first   = executed_first(p);
+      if (can_process_post_loop) {
+        // override vlen with the main loop's vector length
+        vlen = cl->slp_max_unroll();
+      }
       NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
       int   opc = n->Opcode();
       if (n->is_Load()) {
@@ -2153,6 +2273,10 @@
         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
       } else if (is_cmov_pack(p)) {
+        if (can_process_post_loop) {
+          // do not refactor control flow in post loop context
+          return;
+        }
         if (!n->is_CMove()) {
           continue;
         }
@@ -2217,6 +2341,7 @@
         ShouldNotReachHere();
       }
 
+      _block.at_put(i, vn);
       _igvn.register_new_node_with_optimizer(vn);
       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
       for (uint j = 0; j < p->size(); j++) {
@@ -2225,6 +2350,14 @@
       }
       _igvn._worklist.push(vn);
 
+      if (can_process_post_loop) {
+        // first check if the vector size is the maximum vector which we can use on the machine,
+        // other vector sizes have reduced values for predicated data mapping.
+        if (vlen_in_bytes != (uint)MaxVectorSize) {
+          return;
+        }
+      }
+
       if (vlen_in_bytes > max_vlen_in_bytes) {
         max_vlen = vlen;
         max_vlen_in_bytes = vlen_in_bytes;
@@ -2247,15 +2380,38 @@
         if (TraceSuperWordLoopUnrollAnalysis) {
           tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
         }
-        // For atomic unrolled loops which are vector mapped, instigate more unrolling.
+
+        // For atomic unrolled loops which are vector mapped, instigate more unrolling
         cl->set_notpassed_slp();
-        // if vector resources are limited, do not allow additional unrolling
-        if (FLOATPRESSURE > 8) {
-          C->set_major_progress();
+        if (cl->is_main_loop()) {
+          // if vector resources are limited, do not allow additional unrolling, also
+          // do not unroll more on pure vector loops which were not reduced so that we can
+          // program the post loop to single iteration execution.
+          if (FLOATPRESSURE > 8) {
+            C->set_major_progress();
+            cl->mark_do_unroll_only();
+          }
         }
-        cl->mark_do_unroll_only();
+
         if (do_reserve_copy()) {
           cl->mark_loop_vectorized();
+          if (can_process_post_loop) {
+            // Now create the difference of trip and limit and use it as our mask index.
+            // Note: We limited the unroll of the vectorized loop so that
+            //       only vlen-1 size iterations can remain to be mask programmed.
+            Node *incr = cl->incr();
+            SubINode *index = new SubINode(cl->limit(), cl->init_trip());
+            _igvn.register_new_node_with_optimizer(index);
+            SetVectMaskINode  *mask = new SetVectMaskINode(_phase->get_ctrl(cl->init_trip()), index);
+            _igvn.register_new_node_with_optimizer(mask);
+            // make this a single iteration loop
+            AddINode *new_incr = new AddINode(incr->in(1), mask);
+            _igvn.register_new_node_with_optimizer(new_incr);
+            _phase->set_ctrl(new_incr, _phase->get_ctrl(incr));
+            _igvn.replace_node(incr, new_incr);
+            cl->mark_is_multiversioned();
+            cl->loopexit()->add_flag(Node::Flag_has_vector_mask_set);
+          }
         }
       }
     }
@@ -2274,6 +2430,12 @@
   Node* p0 = p->at(0);
   uint vlen = p->size();
   Node* opd = p0->in(opd_idx);
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+
+  if (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()) {
+    // override vlen with the main loop's vector length
+    vlen = cl->slp_max_unroll();
+  }
 
   if (same_inputs(p, opd_idx)) {
     if (opd->is_Vector() || opd->is_LoadVector()) {
@@ -3090,13 +3252,13 @@
   return pre_end;
 }
 
-
 //------------------------------init---------------------------
 void SuperWord::init() {
   _dg.init();
   _packset.clear();
   _disjoint_ptrs.clear();
   _block.clear();
+  _post_block.clear();
   _data_entry.clear();
   _mem_slice_head.clear();
   _mem_slice_tail.clear();
@@ -3120,6 +3282,7 @@
   _packset.clear();
   _disjoint_ptrs.clear();
   _block.clear();
+  _post_block.clear();
   _data_entry.clear();
   _mem_slice_head.clear();
   _mem_slice_tail.clear();
--- a/hotspot/src/share/vm/opto/superword.hpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/share/vm/opto/superword.hpp	Mon Apr 18 15:18:14 2016 -0700
@@ -261,6 +261,7 @@
   GrowableArray<int> _bb_idx;            // Map from Node _idx to index within block
 
   GrowableArray<Node*> _block;           // Nodes in current block
+  GrowableArray<Node*> _post_block;      // Nodes in post loop block
   GrowableArray<Node*> _data_entry;      // Nodes with all inputs from outside
   GrowableArray<Node*> _mem_slice_head;  // Memory slice head nodes
   GrowableArray<Node*> _mem_slice_tail;  // Memory slice tail nodes
--- a/hotspot/src/share/vm/opto/vectornode.hpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/share/vm/opto/vectornode.hpp	Mon Apr 18 15:18:14 2016 -0700
@@ -529,6 +529,7 @@
                               Node* adr, const TypePtr* atyp,
                               uint vlen, BasicType bt,
                               ControlDependency control_dependency = LoadNode::DependsOnlyOnTest);
+  uint element_size(void) { return type2aelembytes(vect_type()->element_basic_type()); } // type2aelembytes() of the vector's element basic type; the x86.ad 64-byte load rules compare it against 4
 };
 
 //------------------------------StoreVectorNode--------------------------------
@@ -553,6 +554,8 @@
   static StoreVectorNode* make(int opc, Node* ctl, Node* mem,
                                Node* adr, const TypePtr* atyp, Node* val,
                                uint vlen);
+
+  uint element_size(void) { return type2aelembytes(vect_type()->element_basic_type()); } // type2aelembytes() of the vector's element basic type; the x86.ad 64-byte store rules compare it against 4
 };
 
 
@@ -791,4 +794,15 @@
   virtual uint ideal_reg() const { return Op_RegD; }
 };
 
+//------------------------------SetVectMaskINode-------------------------------
+// Provide a mask for a vector predicate machine; a plain Node with a control input and one data input, typed as int
+class SetVectMaskINode : public Node {
+public:
+  SetVectMaskINode(Node *c, Node *in1) : Node(c, in1) {} // c: control; in1: SuperWord::output passes (limit - init_trip) of the post loop
+  virtual int Opcode() const;
+  const Type *bottom_type() const { return TypeInt::INT; }
+  virtual uint ideal_reg() const { return Op_RegI; } // result is placed in an int register class
+  virtual const Type *Value(PhaseGVN *phase) const { return TypeInt::INT; } // always TypeInt::INT, independent of the input's type
+};
+
 #endif // SHARE_VM_OPTO_VECTORNODE_HPP
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp	Mon Apr 18 20:57:16 2016 +0000
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp	Mon Apr 18 15:18:14 2016 -0700
@@ -1923,6 +1923,7 @@
   declare_c2_type(ConvL2INode, Node)                                      \
   declare_c2_type(CastX2PNode, Node)                                      \
   declare_c2_type(CastP2XNode, Node)                                      \
+  declare_c2_type(SetVectMaskINode, Node)                                 \
   declare_c2_type(MemBarNode, MultiNode)                                  \
   declare_c2_type(MemBarAcquireNode, MemBarNode)                          \
   declare_c2_type(MemBarReleaseNode, MemBarNode)                          \