changeset 51857:9978fea8a371

8210764: Update avx512 implementation Reviewed-by: kvn Contributed-by: sandhya.viswanathan@intel.com
author kvn
date Mon, 24 Sep 2018 16:37:28 -0700
parents 11b9d3a6f31c
children cef2c1ea2f60
files src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp src/hotspot/cpu/arm/c1_LIRAssembler_arm.cpp src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp src/hotspot/cpu/s390/c1_LIRAssembler_s390.cpp src/hotspot/cpu/sparc/c1_LIRAssembler_sparc.cpp src/hotspot/cpu/x86/assembler_x86.cpp src/hotspot/cpu/x86/assembler_x86.hpp src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp src/hotspot/cpu/x86/globals_x86.hpp src/hotspot/cpu/x86/macroAssembler_x86.cpp src/hotspot/cpu/x86/macroAssembler_x86.hpp src/hotspot/cpu/x86/vm_version_x86.cpp src/hotspot/cpu/x86/vm_version_x86.hpp src/hotspot/cpu/x86/x86.ad src/hotspot/cpu/x86/x86_32.ad src/hotspot/cpu/x86/x86_64.ad src/hotspot/share/c1/c1_LIR.cpp src/hotspot/share/c1/c1_LIR.hpp src/hotspot/share/c1/c1_LIRAssembler.cpp src/hotspot/share/c1/c1_LIRAssembler.hpp
diffstat 21 files changed, 1448 insertions(+), 3183 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -2792,7 +2792,10 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
+
   if (left->is_single_cpu()) {
     assert(dest->is_single_cpu(), "expect single result reg");
     __ negw(dest->as_register(), left->as_register());
--- a/src/hotspot/cpu/arm/c1_LIRAssembler_arm.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/arm/c1_LIRAssembler_arm.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -3265,7 +3265,9 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
 
   if (left->is_single_cpu()) {
     assert (dest->type() == T_INT, "unexpected result type");
--- a/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -2840,7 +2840,9 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
   assert(left->is_register(), "can only handle registers");
 
   if (left->is_single_cpu()) {
--- a/src/hotspot/cpu/s390/c1_LIRAssembler_s390.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/s390/c1_LIRAssembler_s390.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -2850,7 +2850,9 @@
   ShouldNotCallThis(); // There are no delay slots on ZARCH_64.
 }
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
   assert(left->is_register(), "can only handle registers");
 
   if (left->is_single_cpu()) {
--- a/src/hotspot/cpu/sparc/c1_LIRAssembler_sparc.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/sparc/c1_LIRAssembler_sparc.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -3024,7 +3024,9 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
   assert(left->is_register(), "can only handle registers");
 
   if (left->is_single_cpu()) {
--- a/src/hotspot/cpu/x86/assembler_x86.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -2199,7 +2199,7 @@
 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_128bit;
-  InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x28);
@@ -2209,7 +2209,7 @@
 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_128bit;
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x28);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2217,7 +2217,7 @@
 
 void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2465,8 +2465,7 @@
 
 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_128bit;
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2583,7 +2582,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(is_vector_masking(), "");    // For stub code use only
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_embedded_opmask_register_specifier(mask);
   attributes.set_is_evex_instruction();
@@ -2608,7 +2607,7 @@
   assert(is_vector_masking(), "");
   assert(VM_Version::supports_avx512vlbw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_embedded_opmask_register_specifier(mask);
   attributes.set_is_evex_instruction();
@@ -2752,7 +2751,7 @@
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -3512,7 +3511,7 @@
 void Assembler::evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx512vlbw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
   int dst_enc = kdst->encoding();
@@ -3525,7 +3524,7 @@
   assert(is_vector_masking(), "");
   assert(VM_Version::supports_avx512vlbw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.reset_is_clear_context();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -3538,7 +3537,7 @@
 
 void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
   assert(VM_Version::supports_avx512vlbw(), "");
-  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x3E);
@@ -3549,7 +3548,7 @@
 void Assembler::evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
   assert(is_vector_masking(), "");
   assert(VM_Version::supports_avx512vlbw(), "");
-  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.reset_is_clear_context();
   attributes.set_embedded_opmask_register_specifier(mask);
   attributes.set_is_evex_instruction();
@@ -3562,7 +3561,7 @@
 void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len) {
   assert(VM_Version::supports_avx512vlbw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
   int dst_enc = kdst->encoding();
@@ -3575,7 +3574,7 @@
 void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx512bw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int dst_enc = kdst->encoding();
@@ -3588,7 +3587,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(is_vector_masking(), "");    // For stub code use only
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_reg_mask */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_reg_mask */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.reset_is_clear_context();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -3741,7 +3740,7 @@
 
 void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3750,7 +3749,7 @@
 
 void Assembler::pextrd(Address dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
@@ -3760,7 +3759,7 @@
 
 void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3769,7 +3768,7 @@
 
 void Assembler::pextrq(Address dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
@@ -3779,7 +3778,7 @@
 
 void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse2(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC5);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3788,7 +3787,7 @@
 
 void Assembler::pextrw(Address dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
   simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x15);
@@ -3798,7 +3797,7 @@
 
 void Assembler::pextrb(Address dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
   simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x14);
@@ -3808,7 +3807,7 @@
 
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3817,7 +3816,7 @@
 
 void Assembler::pinsrd(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x22);
@@ -3827,7 +3826,7 @@
 
 void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3836,7 +3835,7 @@
 
 void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x22);
@@ -3846,7 +3845,7 @@
 
 void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse2(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC4);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3855,7 +3854,7 @@
 
 void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse2(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC4);
@@ -3865,7 +3864,7 @@
 
 void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x20);
@@ -3876,7 +3875,7 @@
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
@@ -3885,7 +3884,7 @@
 
 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3895,7 +3894,7 @@
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
@@ -3906,7 +3905,7 @@
   assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
   vector_len == AVX_256bit? VM_Version::supports_avx2() :
   vector_len == AVX_512bit? VM_Version::supports_avx512bw() : 0, "");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
   emit_int8((unsigned char) (0xC0 | encode));
@@ -3918,7 +3917,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_embedded_opmask_register_specifier(mask);
   attributes.set_is_evex_instruction();
@@ -3930,7 +3929,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
@@ -3943,7 +3942,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.reset_is_clear_context();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -3957,7 +3956,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_QVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
@@ -3969,7 +3968,7 @@
   assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
   vector_len == AVX_256bit? VM_Version::supports_avx2() :
   vector_len == AVX_512bit? VM_Version::supports_evex() : 0, " ");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x33);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4082,7 +4081,7 @@
 
 void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_ssse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x00);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4101,7 +4100,7 @@
 void Assembler::pshufb(XMMRegister dst, Address src) {
   assert(VM_Version::supports_ssse3(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x00);
@@ -4147,7 +4146,7 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x70);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4159,7 +4158,7 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x70);
@@ -4180,7 +4179,7 @@
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4190,7 +4189,7 @@
 void Assembler::pslldq(XMMRegister dst, int shift) {
   // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM7 is for /7 encoding: 66 0F 73 /7 ib
   int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x73);
@@ -4456,7 +4455,7 @@
 
 void Assembler::palignr(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_ssse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x0F);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4477,6 +4476,7 @@
 void Assembler::evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x3);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5591,7 +5591,7 @@
 void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5600,7 +5600,7 @@
 void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5650,7 +5650,7 @@
 
 void Assembler::phaddw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5658,7 +5658,7 @@
 
 void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6281,6 +6281,15 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::vpandn(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xDF);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+
 void Assembler::por(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6369,8 +6378,7 @@
 void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx2(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6383,9 +6391,8 @@
   assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x38);
@@ -6398,7 +6405,8 @@
 void Assembler::vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6413,10 +6421,10 @@
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  attributes.set_is_evex_instruction();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_operand(dst, src);
@@ -6430,9 +6438,10 @@
 void Assembler::vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x38);
+  emit_int8(0x3A);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
   // 0x01 - insert into upper 256 bits
@@ -6445,8 +6454,7 @@
 void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6459,9 +6467,8 @@
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
@@ -6472,16 +6479,16 @@
 }
 
 void Assembler::vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_avx2(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into q0 128 bits (0..127)
   // 0x01 - insert into q1 128 bits (128..255)
-  // 0x02 - insert into q2 128 bits (256..383)
-  // 0x03 - insert into q3 128 bits (384..511)
+  // 0x02 - insert into q0 128 bits (256..383)
+  // 0x03 - insert into q1 128 bits (384..512)
   emit_int8(imm8 & 0x03);
 }
 
@@ -6489,24 +6496,24 @@
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x00 - insert into q0 128 bits (0..127)
   // 0x01 - insert into q1 128 bits (128..255)
-  // 0x02 - insert into q2 128 bits (256..383)
-  // 0x03 - insert into q3 128 bits (384..511)
+  // 0x02 - insert into q0 128 bits (256..383)
+  // 0x03 - insert into q1 128 bits (384..512)
   emit_int8(imm8 & 0x03);
 }
 
 void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1A);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6520,8 +6527,9 @@
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_is_evex_instruction();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1A);
   emit_operand(dst, src);
@@ -6534,10 +6542,9 @@
 // vextracti forms
 
 void Assembler::vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
+  assert(VM_Version::supports_avx2(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6550,9 +6557,8 @@
   assert(VM_Version::supports_avx2(), "");
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   attributes.reset_is_clear_context();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -6564,10 +6570,10 @@
 }
 
 void Assembler::vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
+  assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6583,9 +6589,10 @@
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   attributes.reset_is_clear_context();
+  attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_operand(src, dst);
@@ -6599,7 +6606,8 @@
 void Assembler::vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx512dq(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6613,7 +6621,8 @@
 void Assembler::vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x3B);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6622,14 +6631,28 @@
   emit_int8(imm8 & 0x01);
 }
 
-
+void Assembler::vextracti64x4(Address dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
+  attributes.reset_is_clear_context();
+  attributes.set_is_evex_instruction();
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x38);
+  emit_operand(src, dst);
+  // 0x00 - extract from lower 256 bits
+  // 0x01 - extract from upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
 // vextractf forms
 
 void Assembler::vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6642,9 +6665,8 @@
   assert(VM_Version::supports_avx(), "");
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   attributes.reset_is_clear_context();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -6656,10 +6678,10 @@
 }
 
 void Assembler::vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
+  assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6675,9 +6697,10 @@
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   attributes.reset_is_clear_context();
+  attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_operand(src, dst);
@@ -6691,7 +6714,8 @@
 void Assembler::vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx512dq(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6705,7 +6729,8 @@
 void Assembler::vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1B);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6719,9 +6744,10 @@
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4,/* input_size_in_bits */  EVEX_64bit);
   attributes.reset_is_clear_context();
+  attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1B);
   emit_operand(src, dst);
@@ -6730,38 +6756,17 @@
   emit_int8(imm8 & 0x01);
 }
 
-
-// legacy word/dword replicate
-void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
+// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
+void Assembler::vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  emit_int8(0x79);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
-void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  emit_int8(0x58);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
-
-// xmm/mem sourced byte/word/dword/qword replicate
-
-// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
-void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x78);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastb(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6773,16 +6778,16 @@
 }
 
 // duplicate 2-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
-void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastw(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6793,17 +6798,19 @@
   emit_operand(dst, src);
 }
 
+// xmm/mem sourced byte/word/dword/qword replicate
+
 // duplicate 4-byte integer data from src into programmed locations in dest : requires AVX512VL
-void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(UseAVX >= 2, "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6815,8 +6822,8 @@
 }
 
 // duplicate 8-byte integer data from src into programmed locations in dest : requires AVX512VL
-void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6824,8 +6831,8 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastq(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6863,16 +6870,16 @@
 // scalar single/double precision replicate
 
 // duplicate single precision data from src into programmed locations in dest : requires AVX512VL
-void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastss(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6884,8 +6891,8 @@
 }
 
 // duplicate double precision data from src into programmed locations in dest : requires AVX512VL
-void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6893,8 +6900,8 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6911,7 +6918,7 @@
 
 // duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6921,7 +6928,7 @@
 
 // duplicate 2-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6967,7 +6974,7 @@
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6977,7 +6984,7 @@
 // Carry-Less Multiplication Quadword
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7597,33 +7604,23 @@
   set_attributes(attributes);
   attributes->set_current_assembler(this);
 
-  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
-  if (UseAVX > 2 && _legacy_mode_vl && attributes->uses_vl()) {
-    switch (attributes->get_vector_len()) {
-    case AVX_128bit:
-    case AVX_256bit:
-      attributes->set_is_legacy_mode();
-      break;
+  // For EVEX instruction (which is not marked as pure EVEX instruction) check and see if this instruction
+  // is allowed in legacy mode and has resources which will fit in it.
+  // Pure EVEX instructions will have is_evex_instruction set in their definition.
+  if (!attributes->is_legacy_mode()) {
+    if (UseAVX > 2 && !attributes->is_evex_instruction() && !_is_managed) {
+      if ((attributes->get_vector_len() != AVX_512bit) && (nds_enc < 16) && (xreg_enc < 16)) {
+          attributes->set_is_legacy_mode();
+      }
     }
   }
 
-  // For pure EVEX check and see if this instruction
-  // is allowed in legacy mode and has resources which will
-  // fit in it.  Pure EVEX instructions will use set_is_evex_instruction in their definition,
-  // else that field is set when we encode to EVEX
-  if (UseAVX > 2 && !attributes->is_legacy_mode() &&
-      !_is_managed && !attributes->is_evex_instruction()) {
-    if (!_legacy_mode_vl && attributes->get_vector_len() != AVX_512bit) {
-      bool check_register_bank = NOT_IA32(true) IA32_ONLY(false);
-      if (check_register_bank) {
-        // check nds_enc and xreg_enc for upper bank usage
-        if (nds_enc < 16 && xreg_enc < 16) {
-          attributes->set_is_legacy_mode();
-        }
-      } else {
-        attributes->set_is_legacy_mode();
-      }
-    }
+  if (UseAVX > 2) {
+    assert(((!attributes->uses_vl()) ||
+            (attributes->get_vector_len() == AVX_512bit) ||
+            (!_legacy_mode_vl) ||
+            (attributes->is_legacy_mode())),"XMM register should be 0-15");
+    assert(((nds_enc < 16 && xreg_enc < 16) || (!attributes->is_legacy_mode())),"XMM register should be 0-15");
   }
 
   _is_managed = false;
@@ -7653,43 +7650,31 @@
   bool vex_x = false;
   set_attributes(attributes);
   attributes->set_current_assembler(this);
-  bool check_register_bank = NOT_IA32(true) IA32_ONLY(false);
-
-  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
-  if (UseAVX > 2 && _legacy_mode_vl && attributes->uses_vl()) {
-    switch (attributes->get_vector_len()) {
-    case AVX_128bit:
-    case AVX_256bit:
-      if (check_register_bank) {
-        if (dst_enc >= 16 || nds_enc >= 16 || src_enc >= 16) {
-          // up propagate arithmetic instructions to meet RA requirements
-          attributes->set_vector_len(AVX_512bit);
-        } else {
+
+  // For EVEX instruction (which is not marked as pure EVEX instruction) check and see if this instruction
+  // is allowed in legacy mode and has resources which will fit in it.
+  // Pure EVEX instructions will have is_evex_instruction set in their definition.
+  if (!attributes->is_legacy_mode()) {
+    if (UseAVX > 2 && !attributes->is_evex_instruction() && !_is_managed) {
+      if ((!attributes->uses_vl() || (attributes->get_vector_len() != AVX_512bit)) &&
+          (dst_enc < 16) && (nds_enc < 16) && (src_enc < 16)) {
           attributes->set_is_legacy_mode();
-        }
-      } else {
-        attributes->set_is_legacy_mode();
       }
-      break;
     }
   }
 
-  // For pure EVEX check and see if this instruction
-  // is allowed in legacy mode and has resources which will
-  // fit in it.  Pure EVEX instructions will use set_is_evex_instruction in their definition,
-  // else that field is set when we encode to EVEX
-  if (UseAVX > 2 && !attributes->is_legacy_mode() &&
-      !_is_managed && !attributes->is_evex_instruction()) {
-    if (!_legacy_mode_vl && attributes->get_vector_len() != AVX_512bit) {
-      if (check_register_bank) {
-        // check dst_enc, nds_enc and src_enc for upper bank usage
-        if (dst_enc < 16 && nds_enc < 16 && src_enc < 16) {
-          attributes->set_is_legacy_mode();
-        }
-      } else {
-        attributes->set_is_legacy_mode();
-      }
-    }
+  if (UseAVX > 2) {
+    // All the scalar fp instructions (with uses_vl as false) can have legacy_mode as false
+    // Instruction with uses_vl true are vector instructions
+    // All the vector instructions with AVX_512bit length can have legacy_mode as false
+    // All the vector instructions with < AVX_512bit length can have legacy_mode as false if AVX512vl() is supported
+    // Rest all should have legacy_mode set as true
+    assert(((!attributes->uses_vl()) ||
+            (attributes->get_vector_len() == AVX_512bit) ||
+            (!_legacy_mode_vl) ||
+            (attributes->is_legacy_mode())),"XMM register should be 0-15");
+    // Instruction with legacy_mode true should have dst, nds and src < 15
+    assert(((dst_enc < 16 && nds_enc < 16 && src_enc < 16) || (!attributes->is_legacy_mode())),"XMM register should be 0-15");
   }
 
   _is_managed = false;
@@ -7741,7 +7726,7 @@
 void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC2);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7751,7 +7736,7 @@
 void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x4B);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7762,7 +7747,7 @@
 void Assembler::cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC2);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7772,7 +7757,7 @@
 void Assembler::blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x4A);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7782,7 +7767,7 @@
 
 void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
   assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x02);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7791,7 +7776,7 @@
 
 void Assembler::shlxl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi2(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8((unsigned char)0xF7);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7799,7 +7784,7 @@
 
 void Assembler::shlxq(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi2(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8((unsigned char)0xF7);
   emit_int8((unsigned char)(0xC0 | encode));
--- a/src/hotspot/cpu/x86/assembler_x86.hpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp	Mon Sep 24 16:37:28 2018 -0700
@@ -2097,6 +2097,7 @@
 
   // Andn packed integers
   void pandn(XMMRegister dst, XMMRegister src);
+  void vpandn(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
   // Or packed integers
   void por(XMMRegister dst, XMMRegister src);
@@ -2134,6 +2135,7 @@
   void vextracti32x4(Address dst, XMMRegister src, uint8_t imm8);
   void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
   void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti64x4(Address dst, XMMRegister src, uint8_t imm8);
 
   // vextractf forms
   void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
@@ -2144,28 +2146,24 @@
   void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
   void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);
 
-  // legacy xmm sourced word/dword replicate
-  void vpbroadcastw(XMMRegister dst, XMMRegister src);
-  void vpbroadcastd(XMMRegister dst, XMMRegister src);
-
   // xmm/mem sourced byte/word/dword/qword replicate
-  void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
-  void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
-  void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
-  void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastb(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastw(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastd(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastq(XMMRegister dst, Address src, int vector_len);
 
   void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
   void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
 
   // scalar single/double precision replicate
-  void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
-  void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastss(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastsd(XMMRegister dst, Address src, int vector_len);
 
   // gpr sourced byte/word/dword/qword replicate
   void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
--- a/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -68,7 +68,6 @@
 static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], (jlong)UCONST64(0x8000000000000000), (jlong)UCONST64(0x8000000000000000));
 
 
-
 NEEDS_CLEANUP // remove this definitions ?
 const Register IC_Klass    = rax;   // where the IC klass is cached
 const Register SYNC_header = rax;   // synchronization header
@@ -650,7 +649,7 @@
 
     case T_FLOAT: {
       if (dest->is_single_xmm()) {
-        if (c->is_zero_float()) {
+        if (LP64_ONLY(UseAVX < 2 &&) c->is_zero_float()) {
           __ xorps(dest->as_xmm_float_reg(), dest->as_xmm_float_reg());
         } else {
           __ movflt(dest->as_xmm_float_reg(),
@@ -672,7 +671,7 @@
 
     case T_DOUBLE: {
       if (dest->is_double_xmm()) {
-        if (c->is_zero_double()) {
+        if (LP64_ONLY(UseAVX < 2 &&) c->is_zero_double()) {
           __ xorpd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg());
         } else {
           __ movdbl(dest->as_xmm_double_reg(),
@@ -2395,16 +2394,24 @@
 }
 
 
-void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, LIR_Opr dest, LIR_Op* op) {
+void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr tmp, LIR_Opr dest, LIR_Op* op) {
   if (value->is_double_xmm()) {
     switch(code) {
       case lir_abs :
         {
-          if (dest->as_xmm_double_reg() != value->as_xmm_double_reg()) {
-            __ movdbl(dest->as_xmm_double_reg(), value->as_xmm_double_reg());
+#ifdef _LP64
+          if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+            assert(tmp->is_valid(), "need temporary");
+            __ vpandn(dest->as_xmm_double_reg(), tmp->as_xmm_double_reg(), value->as_xmm_double_reg(), 2);
+          } else {
+#endif
+            if (dest->as_xmm_double_reg() != value->as_xmm_double_reg()) {
+              __ movdbl(dest->as_xmm_double_reg(), value->as_xmm_double_reg());
+            }
+            assert(!tmp->is_valid(), "do not need temporary");
+            __ andpd(dest->as_xmm_double_reg(),
+                     ExternalAddress((address)double_signmask_pool));
           }
-          __ andpd(dest->as_xmm_double_reg(),
-                    ExternalAddress((address)double_signmask_pool));
         }
         break;
 
@@ -3734,7 +3741,7 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
   if (left->is_single_cpu()) {
     __ negl(left->as_register());
     move_regs(left->as_register(), dest->as_register());
@@ -3759,24 +3766,36 @@
 #endif // _LP64
 
   } else if (dest->is_single_xmm()) {
-    if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
-      __ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
+#ifdef _LP64
+    if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+      assert(tmp->is_valid(), "need temporary");
+      assert_different_registers(left->as_xmm_float_reg(), tmp->as_xmm_float_reg());
+      __ vpxor(dest->as_xmm_float_reg(), tmp->as_xmm_float_reg(), left->as_xmm_float_reg(), 2);
     }
-    if (UseAVX > 0) {
-      __ vnegatess(dest->as_xmm_float_reg(), dest->as_xmm_float_reg(),
-                   ExternalAddress((address)float_signflip_pool));
-    } else {
+    else
+#endif
+    {
+      assert(!tmp->is_valid(), "do not need temporary");
+      if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
+        __ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
+      }
       __ xorps(dest->as_xmm_float_reg(),
                ExternalAddress((address)float_signflip_pool));
     }
   } else if (dest->is_double_xmm()) {
-    if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
-      __ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
+#ifdef _LP64
+    if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+      assert(tmp->is_valid(), "need temporary");
+      assert_different_registers(left->as_xmm_double_reg(), tmp->as_xmm_double_reg());
+      __ vpxor(dest->as_xmm_double_reg(), tmp->as_xmm_double_reg(), left->as_xmm_double_reg(), 2);
     }
-    if (UseAVX > 0) {
-      __ vnegatesd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg(),
-                   ExternalAddress((address)double_signflip_pool));
-    } else {
+    else
+#endif
+    {
+      assert(!tmp->is_valid(), "do not need temporary");
+      if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
+        __ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
+      }
       __ xorpd(dest->as_xmm_double_reg(),
                ExternalAddress((address)double_signflip_pool));
     }
--- a/src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -320,7 +320,21 @@
   value.set_destroys_register();
   value.load_item();
   LIR_Opr reg = rlock(x);
-  __ negate(value.result(), reg);
+
+  LIR_Opr tmp = LIR_OprFact::illegalOpr;
+#ifdef _LP64
+  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+    if (x->type()->tag() == doubleTag) {
+      tmp = new_register(T_DOUBLE);
+      __ move(LIR_OprFact::doubleConst(-0.0), tmp);
+    }
+    else if (x->type()->tag() == floatTag) {
+      tmp = new_register(T_FLOAT);
+      __ move(LIR_OprFact::floatConst(-0.0), tmp);
+    }
+  }
+#endif
+  __ negate(value.result(), reg, tmp);
 
   set_result(x, round_item(reg));
 }
@@ -748,8 +762,17 @@
   LIR_Opr calc_input = value.result();
   LIR_Opr calc_result = rlock_result(x);
 
+  LIR_Opr tmp = LIR_OprFact::illegalOpr;
+#ifdef _LP64
+  if (UseAVX > 2 && (!VM_Version::supports_avx512vl()) &&
+      (x->id() == vmIntrinsics::_dabs)) {
+    tmp = new_register(T_DOUBLE);
+    __ move(LIR_OprFact::doubleConst(-0.0), tmp);
+  }
+#endif
+
   switch(x->id()) {
-    case vmIntrinsics::_dabs:   __ abs  (calc_input, calc_result, LIR_OprFact::illegalOpr); break;
+    case vmIntrinsics::_dabs:   __ abs  (calc_input, calc_result, tmp); break;
     case vmIntrinsics::_dsqrt:  __ sqrt (calc_input, calc_result, LIR_OprFact::illegalOpr); break;
     default:                    ShouldNotReachHere();
   }
--- a/src/hotspot/cpu/x86/globals_x86.hpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/globals_x86.hpp	Mon Sep 24 16:37:28 2018 -0700
@@ -119,7 +119,7 @@
   product(bool, UseStoreImmI16, true,                                       \
           "Use store immediate 16-bits value instruction on x86")           \
                                                                             \
-  product(intx, UseAVX, 2,                                                  \
+  product(intx, UseAVX, 3,                                                  \
           "Highest supported AVX instructions set on x86/x64")              \
           range(0, 99)                                                      \
                                                                             \
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -2942,16 +2942,6 @@
   }
 }
 
-void MacroAssembler::push_zmm(XMMRegister reg) {
-  lea(rsp, Address(rsp, -64)); // Use lea to not affect flags
-  evmovdqul(Address(rsp, 0), reg, Assembler::AVX_512bit);
-}
-
-void MacroAssembler::pop_zmm(XMMRegister reg) {
-  evmovdqul(reg, Address(rsp, 0), Assembler::AVX_512bit);
-  lea(rsp, Address(rsp, 64)); // Use lea to not affect flags
-}
-
 void MacroAssembler::fremr(Register tmp) {
   save_rax(tmp);
   { Label L;
@@ -3332,27 +3322,18 @@
 }
 
 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
-    Assembler::vextractf32x4(dst, src, 0);
-  } else {
+    assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::movdqu(dst, src);
-  }
 }
 
 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
-    Assembler::vinsertf32x4(dst, dst, src, 0);
-  } else {
+    assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::movdqu(dst, src);
-  }
 }
 
 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
-    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else {
+    assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::movdqu(dst, src);
-  }
 }
 
 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
@@ -3365,28 +3346,18 @@
 }
 
 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
-    vextractf64x4_low(dst, src);
-  } else {
+    assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::vmovdqu(dst, src);
-  }
 }
 
 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
-    vinsertf64x4_low(dst, src);
-  } else {
+    assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::vmovdqu(dst, src);
-  }
 }
 
 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
-    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
-  }
-  else {
+    assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::vmovdqu(dst, src);
-  }
 }
 
 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
@@ -3670,187 +3641,43 @@
 }
 
 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::pcmpeqb(dst, src);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::pcmpeqb(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pcmpeqb(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pcmpeqb(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::pcmpeqb(xmm1, xmm0);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pcmpeqb(dst, src);
 }
 
 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::pcmpeqw(dst, src);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::pcmpeqw(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pcmpeqw(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pcmpeqw(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::pcmpeqw(xmm1, xmm0);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pcmpeqw(dst, src);
 }
 
 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
-  int dst_enc = dst->encoding();
-  if (dst_enc < 16) {
-    Assembler::pcmpestri(dst, src, imm8);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pcmpestri(xmm0, src, imm8);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert((dst->encoding() < 16),"XMM register should be 0-15");
+  Assembler::pcmpestri(dst, src, imm8);
 }
 
 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::pcmpestri(dst, src, imm8);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pcmpestri(xmm0, src, imm8);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pcmpestri(dst, xmm0, imm8);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::pcmpestri(xmm1, xmm0, imm8);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::pcmpestri(dst, src, imm8);
 }
 
 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::pmovzxbw(dst, src);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::pmovzxbw(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pmovzxbw(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pmovzxbw(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::pmovzxbw(xmm1, xmm0);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pmovzxbw(dst, src);
 }
 
 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
-  int dst_enc = dst->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::pmovzxbw(dst, src);
-  } else if (dst_enc < 16) {
-    Assembler::pmovzxbw(dst, src);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pmovzxbw(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pmovzxbw(dst, src);
 }
 
 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
-  int src_enc = src->encoding();
-  if (src_enc < 16) {
-    Assembler::pmovmskb(dst, src);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pmovmskb(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert((src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::pmovmskb(dst, src);
 }
 
 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::ptest(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::ptest(xmm0, src);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::ptest(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::ptest(xmm1, xmm0);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::ptest(dst, src);
 }
 
 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
@@ -3979,194 +3806,33 @@
 }
 
 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (nds_enc < 16)) {
-    vandps(dst, nds, negate_field, vector_len);
-  } else if ((src_enc < 16) && (dst_enc < 16)) {
-    // Use src scratch register
-    evmovdqul(src, nds, Assembler::AVX_512bit);
-    vandps(dst, src, negate_field, vector_len);
-  } else if (dst_enc < 16) {
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-    vandps(dst, dst, negate_field, vector_len);
-  } else if (nds_enc < 16) {
-    vandps(nds, nds, negate_field, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (src_enc < 16) {
-    evmovdqul(src, nds, Assembler::AVX_512bit);
-    vandps(src, src, negate_field, vector_len);
-    evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else {
-    if (src_enc != dst_enc) {
-      // Use src scratch register
-      evmovdqul(src, xmm0, Assembler::AVX_512bit);
-      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-      vandps(xmm0, xmm0, negate_field, vector_len);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    } else {
-      push_zmm(xmm0);
-      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-      vandps(xmm0, xmm0, negate_field, vector_len);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      pop_zmm(xmm0);
-    }
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
+  vandps(dst, nds, negate_field, vector_len);
 }
 
 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (nds_enc < 16)) {
-    vandpd(dst, nds, negate_field, vector_len);
-  } else if ((src_enc < 16) && (dst_enc < 16)) {
-    // Use src scratch register
-    evmovdqul(src, nds, Assembler::AVX_512bit);
-    vandpd(dst, src, negate_field, vector_len);
-  } else if (dst_enc < 16) {
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-    vandpd(dst, dst, negate_field, vector_len);
-  } else if (nds_enc < 16) {
-    vandpd(nds, nds, negate_field, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (src_enc < 16) {
-    evmovdqul(src, nds, Assembler::AVX_512bit);
-    vandpd(src, src, negate_field, vector_len);
-    evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else {
-    if (src_enc != dst_enc) {
-      evmovdqul(src, xmm0, Assembler::AVX_512bit);
-      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-      vandpd(xmm0, xmm0, negate_field, vector_len);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    } else {
-      push_zmm(xmm0);
-      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-      vandpd(xmm0, xmm0, negate_field, vector_len);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      pop_zmm(xmm0);
-    }
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
+  vandpd(dst, nds, negate_field, vector_len);
 }
 
 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpaddb(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpaddb(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpaddb(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpaddb(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpaddb(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpaddb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpaddb(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpaddb(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpaddb(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpaddb(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpaddb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpaddw(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpaddw(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpaddw(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpaddw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpaddw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpaddw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpaddw(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpaddw(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with nds as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpaddw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpaddw(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpaddw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
@@ -4178,627 +3844,109 @@
   }
 }
 
-void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpbroadcastw(dst, src);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpbroadcastw(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpbroadcastw(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpbroadcastw(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::vpbroadcastw(xmm1, xmm0);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpbroadcastw(dst, src, vector_len);
 }
 
 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  assert(dst_enc == nds_enc, "");
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpcmpeqb(dst, nds, src, vector_len);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpcmpeqb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  assert(dst_enc == nds_enc, "");
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpcmpeqw(dst, nds, src, vector_len);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpcmpeqw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpmovzxbw(dst, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpmovzxbw(dst, src, vector_len);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpmovzxbw(xmm0, src, vector_len);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpmovzxbw(dst, src, vector_len);
 }
 
 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
-  int src_enc = src->encoding();
-  if (src_enc < 16) {
-    Assembler::vpmovmskb(dst, src);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpmovmskb(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert((src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::vpmovmskb(dst, src);
 }
 
 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpmullw(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpmullw(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpmullw(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpmullw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpmullw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpmullw(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpmullw(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpmullw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsubb(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpsubb(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpsubb(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsubb(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpsubb(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsubb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsubb(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsubb(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsubb(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubb(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsubb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsubw(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpsubw(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpsubw(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpsubw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsubw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsubw(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsubw(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsubw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int shift_enc = shift->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsraw(dst, nds, shift, vector_len);
-  } else if ((dst_enc < 16) && (shift_enc < 16)) {
-    Assembler::vpsraw(dst, dst, shift, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with shift
-    evmovdqul(nds, shift, Assembler::AVX_512bit);
-    Assembler::vpsraw(dst, dst, nds, vector_len);
-  } else if ((shift_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsraw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds to save a copy of xmm0 and hold shift
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsraw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else if (nds_enc < 16) {
-    // use nds and dst as temps
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsraw(nds, nds, xmm0, vector_len);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsraw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsraw(dst, nds, shift, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsraw(dst, dst, shift, vector_len);
-  } else if (nds_enc < 16) {
-    // use nds as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsraw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // use nds as scratch for xmm0
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsraw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int shift_enc = shift->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsrlw(dst, nds, shift, vector_len);
-  } else if ((dst_enc < 16) && (shift_enc < 16)) {
-    Assembler::vpsrlw(dst, dst, shift, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with shift
-    evmovdqul(nds, shift, Assembler::AVX_512bit);
-    Assembler::vpsrlw(dst, dst, nds, vector_len);
-  } else if ((shift_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsrlw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds to save a copy of xmm0 and hold shift
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsrlw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else if (nds_enc < 16) {
-    // use nds and dst as temps
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsrlw(nds, nds, xmm0, vector_len);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsrlw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsrlw(dst, nds, shift, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsrlw(dst, dst, shift, vector_len);
-  } else if (nds_enc < 16) {
-    // use nds as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsrlw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // use nds as scratch for xmm0
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsrlw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int shift_enc = shift->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsllw(dst, nds, shift, vector_len);
-  } else if ((dst_enc < 16) && (shift_enc < 16)) {
-    Assembler::vpsllw(dst, dst, shift, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with shift
-    evmovdqul(nds, shift, Assembler::AVX_512bit);
-    Assembler::vpsllw(dst, dst, nds, vector_len);
-  } else if ((shift_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds to save a copy of xmm0 and hold shift
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsllw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else if (nds_enc < 16) {
-    // use nds and dst as temps
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsllw(nds, nds, xmm0, vector_len);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsllw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsllw(dst, nds, shift, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsllw(dst, dst, shift, vector_len);
-  } else if (nds_enc < 16) {
-    // use nds as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // use nds as scratch for xmm0
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsllw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vptest(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vptest(xmm0, src);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vptest(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::vptest(xmm1, xmm0);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
-}
-
-// This instruction exists within macros, ergo we cannot control its input
-// when emitted through those patterns.
+  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::vptest(dst, src);
+}
+
 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
-  if (VM_Version::supports_avx512nobw()) {
-    int dst_enc = dst->encoding();
-    int src_enc = src->encoding();
-    if (dst_enc == src_enc) {
-      if (dst_enc < 16) {
-        Assembler::punpcklbw(dst, src);
-      } else {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        Assembler::punpcklbw(xmm0, xmm0);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm0);
-      }
-    } else {
-      if ((src_enc < 16) && (dst_enc < 16)) {
-        Assembler::punpcklbw(dst, src);
-      } else if (src_enc < 16) {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        Assembler::punpcklbw(xmm0, src);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm0);
-      } else if (dst_enc < 16) {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, src, Assembler::AVX_512bit);
-        Assembler::punpcklbw(dst, xmm0);
-        pop_zmm(xmm0);
-      } else {
-        push_zmm(xmm0);
-        push_zmm(xmm1);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        evmovdqul(xmm1, src, Assembler::AVX_512bit);
-        Assembler::punpcklbw(xmm0, xmm1);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm1);
-        pop_zmm(xmm0);
-      }
-    }
-  } else {
-    Assembler::punpcklbw(dst, src);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::punpcklbw(dst, src);
 }
 
 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
-  if (VM_Version::supports_avx512vl()) {
-    Assembler::pshufd(dst, src, mode);
-  } else {
-    int dst_enc = dst->encoding();
-    if (dst_enc < 16) {
-      Assembler::pshufd(dst, src, mode);
-    } else {
-      push_zmm(xmm0);
-      Assembler::pshufd(xmm0, src, mode);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      pop_zmm(xmm0);
-    }
-  }
-}
-
-// This instruction exists within macros, ergo we cannot control its input
-// when emitted through those patterns.
+  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
+  Assembler::pshufd(dst, src, mode);
+}
+
 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
-  if (VM_Version::supports_avx512nobw()) {
-    int dst_enc = dst->encoding();
-    int src_enc = src->encoding();
-    if (dst_enc == src_enc) {
-      if (dst_enc < 16) {
-        Assembler::pshuflw(dst, src, mode);
-      } else {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        Assembler::pshuflw(xmm0, xmm0, mode);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm0);
-      }
-    } else {
-      if ((src_enc < 16) && (dst_enc < 16)) {
-        Assembler::pshuflw(dst, src, mode);
-      } else if (src_enc < 16) {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        Assembler::pshuflw(xmm0, src, mode);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm0);
-      } else if (dst_enc < 16) {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, src, Assembler::AVX_512bit);
-        Assembler::pshuflw(dst, xmm0, mode);
-        pop_zmm(xmm0);
-      } else {
-        push_zmm(xmm0);
-        push_zmm(xmm1);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        evmovdqul(xmm1, src, Assembler::AVX_512bit);
-        Assembler::pshuflw(xmm0, xmm1, mode);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm1);
-        pop_zmm(xmm0);
-      }
-    }
-  } else {
-    Assembler::pshuflw(dst, src, mode);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pshuflw(dst, src, mode);
 }
 
 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
@@ -4874,47 +4022,13 @@
 }
 
 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
-  int nds_enc = nds->encoding();
-  int dst_enc = dst->encoding();
-  bool dst_upper_bank = (dst_enc > 15);
-  bool nds_upper_bank = (nds_enc > 15);
-  if (VM_Version::supports_avx512novl() &&
-      (nds_upper_bank || dst_upper_bank)) {
-    if (dst_upper_bank) {
-      push_zmm(xmm0);
-      movflt(xmm0, nds);
-      vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
-      movflt(dst, xmm0);
-      pop_zmm(xmm0);
-    } else {
-      movflt(dst, nds);
-      vxorps(dst, dst, src, Assembler::AVX_128bit);
-    }
-  } else {
-    vxorps(dst, nds, src, Assembler::AVX_128bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
+  vxorps(dst, nds, src, Assembler::AVX_128bit);
 }
 
 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
-  int nds_enc = nds->encoding();
-  int dst_enc = dst->encoding();
-  bool dst_upper_bank = (dst_enc > 15);
-  bool nds_upper_bank = (nds_enc > 15);
-  if (VM_Version::supports_avx512novl() &&
-      (nds_upper_bank || dst_upper_bank)) {
-    if (dst_upper_bank) {
-      push_zmm(xmm0);
-      movdbl(xmm0, nds);
-      vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
-      movdbl(dst, xmm0);
-      pop_zmm(xmm0);
-    } else {
-      movdbl(dst, nds);
-      vxorpd(dst, dst, src, Assembler::AVX_128bit);
-    }
-  } else {
-    vxorpd(dst, nds, src, Assembler::AVX_128bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
+  vxorpd(dst, nds, src, Assembler::AVX_128bit);
 }
 
 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
@@ -7064,7 +6178,7 @@
     cmpl(cnt1, 2*stride);
     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
     movdl(vec1, ch);
-    vpbroadcastw(vec1, vec1);
+    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
     vpxor(vec2, vec2);
     movl(tmp, cnt1);
     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
@@ -7659,7 +6773,7 @@
 
       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
       movdl(vec2, tmp1);
-      vpbroadcastd(vec2, vec2);
+      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
 
       bind(COMPARE_WIDE_VECTORS);
       vmovdqu(vec1, Address(ary1, len, Address::times_1));
@@ -8091,7 +7205,7 @@
       if (UseAVX > 2 && UseUnalignedLoadStores) {
         // Fill 64-byte chunks
         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
-        evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
+        vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
 
         subl(count, 16 << shift);
         jcc(Assembler::less, L_check_fill_32_bytes);
@@ -8114,7 +7228,7 @@
       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
         // Fill 64-byte chunks
         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
-        vpbroadcastd(xtmp, xtmp);
+        vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
 
         subl(count, 16 << shift);
         jcc(Assembler::less, L_check_fill_32_bytes);
@@ -8256,7 +7370,7 @@
       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
       movdl(tmp1Reg, tmp5);
-      vpbroadcastd(tmp1Reg, tmp1Reg);
+      vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
       jmp(L_chars_32_check);
 
       bind(L_copy_32_chars);
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Mon Sep 24 16:37:28 2018 -0700
@@ -482,10 +482,6 @@
   // from register xmm0. Otherwise, the value is stored from the FPU stack.
   void store_double(Address dst);
 
-  // Save/restore ZMM (512bit) register on stack.
-  void push_zmm(XMMRegister reg);
-  void pop_zmm(XMMRegister reg);
-
   // pushes double TOS element of FPU stack on CPU stack; pops from FPU stack
   void push_fTOS();
 
@@ -1214,9 +1210,11 @@
   void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
 
-  void vpbroadcastw(XMMRegister dst, XMMRegister src);
+  void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastw(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastw(dst, src, vector_len); }
 
   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+
   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
   void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -403,7 +403,7 @@
       __ movdl(xmm0, rcx);
       __ movl(rcx, 0xffff);
       __ kmovwl(k1, rcx);
-      __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
+      __ vpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
       __ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit);
 #ifdef _LP64
       __ evmovdqul(xmm8, xmm0, Assembler::AVX_512bit);
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp	Mon Sep 24 16:37:28 2018 -0700
@@ -816,7 +816,10 @@
   static bool supports_avx512cd() { return (_features & CPU_AVX512CD) != 0; }
   static bool supports_avx512bw() { return (_features & CPU_AVX512BW) != 0; }
   static bool supports_avx512vl() { return (_features & CPU_AVX512VL) != 0; }
-  static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
+  static bool supports_avx512vlbw() { return (supports_evex() && supports_avx512bw() && supports_avx512vl()); }
+  static bool supports_avx512vldq() { return (supports_evex() && supports_avx512dq() && supports_avx512vl()); }
+  static bool supports_avx512vlbwdq() { return (supports_evex() && supports_avx512vl() &&
+                                                supports_avx512bw() && supports_avx512dq()); }
   static bool supports_avx512novl() { return (supports_evex() && !supports_avx512vl()); }
   static bool supports_avx512nobw() { return (supports_evex() && !supports_avx512bw()); }
   static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
--- a/src/hotspot/cpu/x86/x86.ad	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/x86.ad	Mon Sep 24 16:37:28 2018 -0700
@@ -729,6 +729,7 @@
                     );
 
 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 
 // Class for pre evex double registers
 reg_class double_reg_legacy(XMM0,  XMM0b,
@@ -789,6 +790,7 @@
                      );
 
 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 
 // Class for pre evex 32bit vector registers
 reg_class vectors_reg_legacy(XMM0,
@@ -849,6 +851,7 @@
                       );
 
 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 
 // Class for all 64bit vector registers
 reg_class vectord_reg_legacy(XMM0,  XMM0b,
@@ -909,6 +912,7 @@
                       );
 
 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 
 // Class for all 128bit vector registers
 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
@@ -969,6 +973,7 @@
                       );
 
 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 
 // Class for all 256bit vector registers
 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
@@ -1029,9 +1034,10 @@
                       );
 
 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 
 // Class for all 512bit vector registers
-reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
+reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
@@ -1067,6 +1073,30 @@
 #endif
                       );
 
+// Class for restricted 512bit vector registers
+reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
+                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
+                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
+                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
+                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
+                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
+                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
+#endif
+                      );
+
+reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
+
 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
@@ -1487,6 +1517,8 @@
   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
     size = (UseAVX > 2) ? 64 : 32;
+  if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
+    size = (VM_Version::supports_avx512bw()) ? 64 : 32;
   // Use flag to limit vector size.
   size = MIN2(size,(int)MaxVectorSize);
   // Minimum 2 values in vector (or 4 for bytes).
@@ -1528,7 +1560,7 @@
   return MIN2(size,max_size);
 }
 
-// Vector ideal reg corresponding to specidied size in bytes
+// Vector ideal reg corresponding to specified size in bytes
 const uint Matcher::vector_ideal_reg(int size) {
   assert(MaxVectorSize >= size, "");
   switch(size) {
@@ -1648,10 +1680,28 @@
     case Op_VecS: // copy whole register
     case Op_VecD:
     case Op_VecX:
+#ifndef LP64
       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+#else
+      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+        __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      } else {
+        __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
+        __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
+     }
+#endif
       break;
     case Op_VecY:
+#ifndef LP64
       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+#else
+      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      } else {
+        __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
+        __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
+     }
+#endif
       break;
     case Op_VecZ:
       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
@@ -1703,10 +1753,28 @@
         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
         break;
       case Op_VecX:
+#ifndef LP64
         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+#else
+        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+          __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        } else {
+          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
+          __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
+        }
+#endif
         break;
       case Op_VecY:
+#ifndef LP64
         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+#else
+        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+          __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        } else {
+          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
+          __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
+        }
+#endif
         break;
       case Op_VecZ:
         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
@@ -1723,10 +1791,28 @@
         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
         break;
       case Op_VecX:
+#ifndef LP64
         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+#else
+        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+          __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        }
+        else {
+          __ vextracti32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
+        }
+#endif
         break;
       case Op_VecY:
+#ifndef LP64
         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+#else
+        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+          __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        }
+        else {
+          __ vextracti64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
+        }
+#endif
         break;
       case Op_VecZ:
         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
@@ -1908,7 +1994,6 @@
 // in the ADLC because operands constitute user defined types which are used in
 // instruction definitions.
 
-// This one generically applies only for evex, so only one version
 operand vecZ() %{
   constraint(ALLOC_IN_RC(vectorz_reg));
   match(VecZ);
@@ -1917,6 +2002,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecZ() %{
+  constraint(ALLOC_IN_RC(vectorz_reg_vl));
+  match(VecZ);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 // Comparison Code for FP conditional move
 operand cmpOp_vcmppd() %{
   match(Bool);
@@ -2547,8 +2640,8 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct absF_reg_reg(regF dst, regF src) %{
-  predicate(VM_Version::supports_avxonly());
+instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
+  predicate(UseAVX > 0);
   match(Set dst (AbsF src));
   ins_cost(150);
   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
@@ -2560,48 +2653,6 @@
   ins_pipe(pipe_slow);
 %}
 
-#ifdef _LP64
-instruct absF_reg_reg_evex(regF dst, regF src) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
-  match(Set dst (AbsF src));
-  ins_cost(150);
-  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
-  predicate(VM_Version::supports_avx512novl());
-  match(Set dst (AbsF src1));
-  effect(TEMP src2);
-  ins_cost(150);
-  format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
-              ExternalAddress(float_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-#else // _LP64
-instruct absF_reg_reg_evex(regF dst, regF src) %{
-  predicate(UseAVX > 2);
-  match(Set dst (AbsF src));
-  ins_cost(150);
-  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-#endif
-
 instruct absD_reg(regD dst) %{
   predicate((UseSSE>=2) && (UseAVX == 0));
   match(Set dst (AbsD dst));
@@ -2614,8 +2665,8 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct absD_reg_reg(regD dst, regD src) %{
-  predicate(VM_Version::supports_avxonly());
+instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
+  predicate(UseAVX > 0);
   match(Set dst (AbsD src));
   ins_cost(150);
   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
@@ -2628,50 +2679,6 @@
   ins_pipe(pipe_slow);
 %}
 
-#ifdef _LP64
-instruct absD_reg_reg_evex(regD dst, regD src) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
-  match(Set dst (AbsD src));
-  ins_cost(150);
-  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
-            "# abs double by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
-  predicate(VM_Version::supports_avx512novl());
-  match(Set dst (AbsD src1));
-  effect(TEMP src2);
-  ins_cost(150);
-  format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs float by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
-              ExternalAddress(double_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-#else // _LP64
-instruct absD_reg_reg_evex(regD dst, regD src) %{
-  predicate(UseAVX > 2);
-  match(Set dst (AbsD src));
-  ins_cost(150);
-  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
-            "# abs double by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-#endif
-
 instruct negF_reg(regF dst) %{
   predicate((UseSSE>=1) && (UseAVX == 0));
   match(Set dst (NegF dst));
@@ -2683,7 +2690,7 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct negF_reg_reg(regF dst, regF src) %{
+instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
   predicate(UseAVX > 0);
   match(Set dst (NegF src));
   ins_cost(150);
@@ -2707,11 +2714,11 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct negD_reg_reg(regD dst, regD src) %{
+instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
   predicate(UseAVX > 0);
   match(Set dst (NegD src));
   ins_cost(150);
-  format %{ "vnegatess  $dst, $src, [0x8000000000000000]\t"
+  format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
             "# neg double by sign flipping" %}
   ins_encode %{
     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
@@ -2835,6 +2842,7 @@
 
 // ====================VECTOR INSTRUCTIONS=====================================
 
+
 // Load vectors (4 bytes long)
 instruct loadV4(vecS dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 4);
@@ -2847,6 +2855,26 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load vectors (4 bytes long)
+instruct MoveVecS2Leg(legVecS dst, vecS src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load vectors (4 bytes long)
+instruct MoveLeg2VecS(vecS dst, legVecS src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load vectors (8 bytes long)
 instruct loadV8(vecD dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 8);
@@ -2859,6 +2887,26 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load vectors (8 bytes long)
+instruct MoveVecD2Leg(legVecD dst, vecD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load vectors (8 bytes long)
+instruct MoveLeg2VecD(vecD dst, legVecD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load vectors (16 bytes long)
 instruct loadV16(vecX dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 16);
@@ -2871,6 +2919,36 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load vectors (16 bytes long)
+instruct MoveVecX2Leg(legVecX dst, vecX src) %{
+  match(Set dst src);
+  format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
+  ins_encode %{
+    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
+      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
+    } else {
+      int vector_len = 2;
+      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    }
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load vectors (16 bytes long)
+instruct MoveLeg2VecX(vecX dst, legVecX src) %{
+  match(Set dst src);
+  format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
+  ins_encode %{
+    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
+      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
+    } else {
+      int vector_len = 2;
+      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    }
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load vectors (32 bytes long)
 instruct loadV32(vecY dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 32);
@@ -2883,6 +2961,36 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load vectors (32 bytes long)
+instruct MoveVecY2Leg(legVecY dst, vecY src) %{
+  match(Set dst src);
+  format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
+  ins_encode %{
+    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
+      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
+    } else {
+      int vector_len = 2;
+      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    }
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load vectors (32 bytes long)
+instruct MoveLeg2VecY(vecY dst, legVecY src) %{
+  match(Set dst src);
+  format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
+  ins_encode %{
+    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
+      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
+    } else {
+      int vector_len = 2;
+      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    }
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load vectors (64 bytes long)
 instruct loadV64_dword(vecZ dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
@@ -2909,6 +3017,26 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
+  match(Set dst src);
+  format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
+  match(Set dst src);
+  format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Store vectors
 instruct storeV4(memory mem, vecS src) %{
   predicate(n->as_StoreVector()->memory_size() == 4);
@@ -3068,6 +3196,44 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl64B(legVecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl16B_imm(vecX dst, immI con) %{
   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB con));
@@ -3094,6 +3260,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl64B_imm(legVecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl4S(vecD dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS src));
@@ -3198,6 +3380,56 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl32S(legVecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "pshuflw $dst,$mem,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_imm(legVecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl4I(vecX dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI src));
@@ -3246,6 +3478,36 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16I(legVecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "pshufd  $dst,$mem,0x00\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl4I_imm(vecX dst, immI con) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI con));
@@ -3272,6 +3534,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16I_imm(legVecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Long could be loaded into xmm register directly from memory.
 instruct Repl2L_mem(vecX dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
@@ -3300,8 +3578,24 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+instruct Repl8L(legVecZ dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
 #else // _LP64
-instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL src));
   effect(TEMP dst, USE src, TEMP tmp);
@@ -3319,6 +3613,27 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
 #endif // _LP64
 
 instruct Repl4L_imm(vecY dst, immL con) %{
@@ -3335,6 +3650,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl8L_imm(legVecZ dst, immL con) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl4L_mem(vecY dst, memory mem) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL (LoadL mem)));
@@ -3349,6 +3680,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl8L_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl2F_mem(vecD dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF (LoadF mem)));
@@ -3369,8 +3716,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl8F(vecY dst, regF src) %{
-  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+instruct Repl8F(vecY dst, vlRegF src) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF src));
   format %{ "pshufd  $dst,$src,0x00\n\t"
             "vinsertf128_high $dst,$dst\t! replicate8F" %}
@@ -3393,6 +3740,34 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16F(legVecZ dst, vlRegF src) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$src,0x00\n\t"
+            "vinsertf128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd  $dst,$mem,0x00\n\t"
+            "vinsertf128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl2F_zero(vecD dst, immF0 zero) %{
   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
   match(Set dst (ReplicateF zero));
@@ -3434,8 +3809,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl4D(vecY dst, regD src) %{
-  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+instruct Repl4D(vecY dst, vlRegD src) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD src));
   format %{ "pshufd  $dst,$src,0x44\n\t"
             "vinsertf128_high $dst,$dst\t! replicate4D" %}
@@ -3458,6 +3833,34 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl8D(legVecZ dst, vlRegD src) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd  $dst,$src,0x44\n\t"
+            "vinsertf128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "pshufd  $dst,$mem,0x44\n\t"
+            "vinsertf128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate double (8 byte) scalar zero to be vector
 instruct Repl2D_zero(vecX dst, immD0 zero) %{
   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
@@ -3736,7 +4139,7 @@
   ins_pipe( pipe_slow );
 %}
 #else // _LP64
-instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
+instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
   predicate(n->as_Vector()->length() == 2);
   match(Set dst (ReplicateL src));
   effect(TEMP dst, USE src, TEMP tmp);
@@ -3791,7 +4194,7 @@
 %}
 
 // Replicate float (4 byte) scalar to be vector
-instruct Repl2F(vecD dst, regF src) %{
+instruct Repl2F(vecD dst, vlRegF src) %{
   predicate(n->as_Vector()->length() == 2);
   match(Set dst (ReplicateF src));
   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
@@ -3801,7 +4204,7 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl4F(vecX dst, regF src) %{
+instruct Repl4F(vecX dst, vlRegF src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (ReplicateF src));
   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
@@ -3812,7 +4215,7 @@
 %}
 
 // Replicate double (8 bytes) scalar to be vector
-instruct Repl2D(vecX dst, regD src) %{
+instruct Repl2D(vecX dst, vlRegD src) %{
   predicate(n->as_Vector()->length() == 2);
   match(Set dst (ReplicateD src));
   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
@@ -3825,31 +4228,31 @@
 // ====================EVEX REPLICATE=============================================
 
 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl16B_evex(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB src));
-  format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
+  format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
   ins_encode %{
    int vector_len = 0;
     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3858,20 +4261,20 @@
 %}
 
 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl32B_evex(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB src));
-  format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
+  format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
   ins_encode %{
    int vector_len = 1;
     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3880,20 +4283,20 @@
 %}
 
 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl64B_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB src));
-  format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
+  format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
   ins_encode %{
    int vector_len = 2;
     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3902,51 +4305,51 @@
 %}
 
 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl16B_imm_evex(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastb $dst,$dst\t! replicate16B" %}
   ins_encode %{
    int vector_len = 0;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl32B_imm_evex(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastb $dst,$dst\t! replicate32B" %}
   ins_encode %{
    int vector_len = 1;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
   ins_encode %{
    int vector_len = 2;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3964,9 +4367,9 @@
 %}
 
 instruct Repl4S_evex(vecD dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS src));
-  format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
+  format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
   ins_encode %{
    int vector_len = 0;
     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3975,20 +4378,20 @@
 %}
 
 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8S_evex(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS src));
-  format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
+  format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
   ins_encode %{
    int vector_len = 0;
     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3997,20 +4400,20 @@
 %}
 
 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl16S_evex(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS src));
-  format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
+  format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
   ins_encode %{
    int vector_len = 1;
     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4019,20 +4422,20 @@
 %}
 
 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl32S_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS src));
-  format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
+  format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
   ins_encode %{
    int vector_len = 2;
     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4041,51 +4444,51 @@
 %}
 
 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8S_imm_evex(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastw $dst,$dst\t! replicate8S" %}
   ins_encode %{
    int vector_len = 0;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl16S_imm_evex(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastw $dst,$dst\t! replicate16S" %}
   ins_encode %{
    int vector_len = 1;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastw $dst,$dst\t! replicate32S" %}
   ins_encode %{
    int vector_len = 2;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4103,9 +4506,9 @@
 %}
 
 instruct Repl4I_evex(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI src));
-  format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
+  format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
   ins_encode %{
     int vector_len = 0;
     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4114,20 +4517,20 @@
 %}
 
 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI (LoadI mem)));
   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8I_evex(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI src));
-  format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
+  format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
   ins_encode %{
     int vector_len = 1;
     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4136,12 +4539,12 @@
 %}
 
 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI (LoadI mem)));
   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4149,7 +4552,7 @@
 instruct Repl16I_evex(vecZ dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
   match(Set dst (ReplicateI src));
-  format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
+  format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
   ins_encode %{
     int vector_len = 2;
     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4163,33 +4566,33 @@
   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl4I_imm_evex(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI con));
   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
   ins_encode %{
     int vector_len = 0;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8I_imm_evex(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI con));
   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
   ins_encode %{
     int vector_len = 1;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4202,7 +4605,7 @@
   ins_encode %{
     int vector_len = 2;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4222,9 +4625,9 @@
 // Replicate long (8 byte) scalar to be vector
 #ifdef _LP64
 instruct Repl4L_evex(vecY dst, rRegL src) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL src));
-  format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
+  format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
   ins_encode %{
     int vector_len = 1;
     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4235,7 +4638,7 @@
 instruct Repl8L_evex(vecZ dst, rRegL src) %{
   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateL src));
-  format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
+  format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
   ins_encode %{
     int vector_len = 2;
     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4244,7 +4647,7 @@
 %}
 #else // _LP64
 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL src));
   effect(TEMP dst, USE src, TEMP tmp);
   format %{ "movdl   $dst,$src.lo\n\t"
@@ -4256,12 +4659,12 @@
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
-    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
+    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateL src));
   effect(TEMP dst, USE src, TEMP tmp);
@@ -4274,21 +4677,21 @@
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
-    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 #endif // _LP64
 
 instruct Repl4L_imm_evex(vecY dst, immL con) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
   ins_encode %{
     int vector_len = 1;
     __ movq($dst$$XMMRegister, $constantaddress($con));
-    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4301,29 +4704,29 @@
   ins_encode %{
     int vector_len = 2;
     __ movq($dst$$XMMRegister, $constantaddress($con));
-    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL (LoadL mem)));
   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL (LoadL mem)));
   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4334,7 +4737,7 @@
   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4352,23 +4755,23 @@
 %}
 
 instruct Repl8F_evex(vecY dst, regF src) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF src));
-  format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
+  format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF (LoadF mem)));
   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4376,10 +4779,10 @@
 instruct Repl16F_evex(vecZ dst, regF src) %{
   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
   match(Set dst (ReplicateF src));
-  format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
+  format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4390,7 +4793,7 @@
   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4444,23 +4847,23 @@
 %}
 
 instruct Repl4D_evex(vecY dst, regD src) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD src));
-  format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
+  format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD (LoadD mem)));
   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4468,10 +4871,10 @@
 instruct Repl8D_evex(vecZ dst, regD src) %{
   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateD src));
-  format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
+  format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4482,7 +4885,7 @@
   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4525,7 +4928,7 @@
 
 // ====================REDUCTION ARITHMETIC=======================================
 
-instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(UseSSE > 2 && UseAVX == 0);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp2, TEMP tmp);
@@ -4544,7 +4947,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(VM_Version::supports_avxonly());
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4562,7 +4965,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4582,7 +4985,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseSSE > 2 && UseAVX == 0);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4603,7 +5006,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(VM_Version::supports_avxonly());
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4623,7 +5026,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4647,7 +5050,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
+instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(VM_Version::supports_avxonly());
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4671,7 +5074,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
+instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4699,7 +5102,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
+instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
@@ -4731,7 +5134,7 @@
 %}
 
 #ifdef _LP64
-instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4750,7 +5153,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
+instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4773,7 +5176,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
+instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4801,7 +5204,7 @@
 %}
 #endif
 
-instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
+instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -4816,7 +5219,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
+instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -4831,7 +5234,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
+instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -4854,7 +5257,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
+instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -4877,7 +5280,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
+instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -4916,7 +5319,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
+instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -4987,7 +5390,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
+instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (AddReductionVD dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5002,7 +5405,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
+instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVD dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5017,14 +5420,14 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
+instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVD dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
   format %{ "vaddsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2,0x1\n\t"
+            "vextractf128  $tmp2,$src2,0x1\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
@@ -5032,7 +5435,7 @@
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5040,7 +5443,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
+instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVD dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5079,7 +5482,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(UseSSE > 3 && UseAVX == 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5098,7 +5501,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5118,7 +5521,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseSSE > 3 && UseAVX == 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5141,7 +5544,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5165,8 +5568,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
-  predicate(UseAVX > 0);
+instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
+  predicate(UseAVX > 1);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
   format %{ "vextracti128_high  $tmp,$src2\n\t"
@@ -5193,7 +5596,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
+instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
   predicate(UseAVX > 2);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
@@ -5225,7 +5628,7 @@
 %}
 
 #ifdef _LP64
-instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5244,7 +5647,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
+instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5267,7 +5670,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
+instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5295,7 +5698,7 @@
 %}
 #endif
 
-instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
+instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -5310,7 +5713,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
+instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5325,7 +5728,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
+instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -5348,7 +5751,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
+instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5371,7 +5774,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
+instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5410,7 +5813,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
+instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5481,7 +5884,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
+instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (MulReductionVD dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -5496,7 +5899,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
+instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVD dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5511,7 +5914,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
+instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVD dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5534,7 +5937,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
+instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (MulReductionVD dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5588,8 +5991,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
   ins_encode %{
@@ -5599,31 +6002,9 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVB src1 src2));
-  format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+
+instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
   ins_encode %{
@@ -5633,29 +6014,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVB src (LoadVector mem)));
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd8B(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVB dst src));
@@ -5666,8 +6024,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
   ins_encode %{
@@ -5677,31 +6035,9 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVB src1 src2));
-  format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+
+instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
   ins_encode %{
@@ -5711,29 +6047,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVB src (LoadVector mem)));
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd16B(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
   match(Set dst (AddVB dst src));
@@ -5744,8 +6057,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
+instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
   ins_encode %{
@@ -5755,31 +6068,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVB src1 src2));
-  format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
+instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
   ins_encode %{
@@ -5789,31 +6079,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVB src (LoadVector mem)));
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
+instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
   ins_encode %{
@@ -5823,31 +6090,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
-  match(Set dst (AddVB src1 src2));
-  format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
-  match(Set dst (AddVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
+instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
   ins_encode %{
@@ -5857,31 +6101,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
-  match(Set dst (AddVB src (LoadVector mem)));
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
-  match(Set dst (AddVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
   ins_encode %{
@@ -5892,7 +6113,7 @@
 %}
 
 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
   ins_encode %{
@@ -5913,8 +6134,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
   ins_encode %{
@@ -5924,31 +6145,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (AddVS src1 src2));
-  format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (AddVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
   ins_encode %{
@@ -5958,29 +6156,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (AddVS src (LoadVector mem)));
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (AddVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd4S(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVS dst src));
@@ -5991,8 +6166,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
   ins_encode %{
@@ -6002,31 +6177,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVS src1 src2));
-  format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
   ins_encode %{
@@ -6036,29 +6188,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVS src (LoadVector mem)));
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd8S(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVS dst src));
@@ -6069,8 +6198,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
   ins_encode %{
@@ -6080,31 +6209,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVS src1 src2));
-  format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
   ins_encode %{
@@ -6114,31 +6220,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVS src (LoadVector mem)));
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
   ins_encode %{
@@ -6148,31 +6231,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVS src1 src2));
-  format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
   ins_encode %{
@@ -6182,31 +6242,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVS src (LoadVector mem)));
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
   ins_encode %{
@@ -6217,7 +6254,7 @@
 %}
 
 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
   ins_encode %{
@@ -6229,7 +6266,7 @@
 
 // Integers vector add
 instruct vadd2I(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVI dst src));
   format %{ "paddd   $dst,$src\t! add packed2I" %}
   ins_encode %{
@@ -6261,7 +6298,7 @@
 %}
 
 instruct vadd4I(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVI dst src));
   format %{ "paddd   $dst,$src\t! add packed4I" %}
   ins_encode %{
@@ -6338,7 +6375,7 @@
 
 // Longs vector add
 instruct vadd2L(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVL dst src));
   format %{ "paddq   $dst,$src\t! add packed2L" %}
   ins_encode %{
@@ -6415,7 +6452,7 @@
 
 // Floats vector add
 instruct vadd2F(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVF dst src));
   format %{ "addps   $dst,$src\t! add packed2F" %}
   ins_encode %{
@@ -6447,7 +6484,7 @@
 %}
 
 instruct vadd4F(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVF dst src));
   format %{ "addps   $dst,$src\t! add packed4F" %}
   ins_encode %{
@@ -6524,7 +6561,7 @@
 
 // Doubles vector add
 instruct vadd2D(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVD dst src));
   format %{ "addpd   $dst,$src\t! add packed2D" %}
   ins_encode %{
@@ -6612,8 +6649,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
   ins_encode %{
@@ -6623,31 +6660,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVB src1 src2));
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4B_reg_exex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
   ins_encode %{
@@ -6657,29 +6671,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVB src (LoadVector mem)));
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub8B(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVB dst src));
@@ -6690,8 +6681,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
   ins_encode %{
@@ -6701,31 +6692,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVB src1 src2));
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
   ins_encode %{
@@ -6735,29 +6703,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVB src (LoadVector mem)));
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub16B(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
   match(Set dst (SubVB dst src));
@@ -6768,8 +6713,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
+instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
   ins_encode %{
@@ -6779,31 +6724,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVB src1 src2));
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
+instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
   ins_encode %{
@@ -6813,31 +6735,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVB src (LoadVector mem)));
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
+instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
   ins_encode %{
@@ -6847,31 +6746,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
-  match(Set dst (SubVB src1 src2));
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
-  match(Set dst (SubVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
+instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
   ins_encode %{
@@ -6881,31 +6757,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
-  match(Set dst (SubVB src (LoadVector mem)));
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
-  match(Set dst (SubVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
   ins_encode %{
@@ -6916,7 +6769,7 @@
 %}
 
 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
   ins_encode %{
@@ -6937,8 +6790,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
   ins_encode %{
@@ -6948,31 +6801,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (SubVS src1 src2));
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (SubVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
   ins_encode %{
@@ -6982,29 +6812,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (SubVS src (LoadVector mem)));
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (SubVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub4S(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVS dst src));
@@ -7015,8 +6822,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
   ins_encode %{
@@ -7026,31 +6833,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVS src1 src2));
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
   ins_encode %{
@@ -7060,29 +6844,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVS src (LoadVector mem)));
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub8S(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVS dst src));
@@ -7093,8 +6854,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
   ins_encode %{
@@ -7104,31 +6865,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVS src1 src2));
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
   ins_encode %{
@@ -7138,31 +6876,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVS src (LoadVector mem)));
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
   ins_encode %{
@@ -7172,31 +6887,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVS src1 src2));
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
   ins_encode %{
@@ -7206,31 +6898,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVS src (LoadVector mem)));
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVS dst (LoadVector mem)));
-   effect(TEMP src);
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
   ins_encode %{
@@ -7241,7 +6910,7 @@
 %}
 
 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
   ins_encode %{
@@ -7253,7 +6922,7 @@
 
 // Integers vector sub
 instruct vsub2I(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVI dst src));
   format %{ "psubd   $dst,$src\t! sub packed2I" %}
   ins_encode %{
@@ -7285,7 +6954,7 @@
 %}
 
 instruct vsub4I(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVI dst src));
   format %{ "psubd   $dst,$src\t! sub packed4I" %}
   ins_encode %{
@@ -7362,7 +7031,7 @@
 
 // Longs vector sub
 instruct vsub2L(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVL dst src));
   format %{ "psubq   $dst,$src\t! sub packed2L" %}
   ins_encode %{
@@ -7439,7 +7108,7 @@
 
 // Floats vector sub
 instruct vsub2F(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVF dst src));
   format %{ "subps   $dst,$src\t! sub packed2F" %}
   ins_encode %{
@@ -7471,7 +7140,7 @@
 %}
 
 instruct vsub4F(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVF dst src));
   format %{ "subps   $dst,$src\t! sub packed4F" %}
   ins_encode %{
@@ -7548,7 +7217,7 @@
 
 // Doubles vector sub
 instruct vsub2D(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVD dst src));
   format %{ "subpd   $dst,$src\t! sub packed2D" %}
   ins_encode %{
@@ -7636,8 +7305,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
   ins_encode %{
@@ -7647,31 +7316,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (MulVS src1 src2));
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul2S_evex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (MulVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
   ins_encode %{
@@ -7681,29 +7327,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (MulVS src (LoadVector mem)));
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (MulVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vmul4S(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (MulVS dst src));
@@ -7714,8 +7337,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
   ins_encode %{
@@ -7725,31 +7348,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (MulVS src1 src2));
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (MulVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
   ins_encode %{
@@ -7759,29 +7359,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (MulVS src (LoadVector mem)));
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (MulVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vmul8S(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (MulVS dst src));
@@ -7792,8 +7369,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
   ins_encode %{
@@ -7803,31 +7380,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (MulVS src1 src2));
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (MulVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
   ins_encode %{
@@ -7837,31 +7391,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (MulVS src (LoadVector mem)));
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (MulVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
   ins_encode %{
@@ -7871,31 +7402,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (MulVS src1 src2));
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (MulVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
   ins_encode %{
@@ -7905,31 +7413,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (MulVS src (LoadVector mem)));
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (MulVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
   ins_encode %{
@@ -7940,7 +7425,7 @@
 %}
 
 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
   ins_encode %{
@@ -8127,7 +7612,7 @@
 
 // Floats vector mul
 instruct vmul2F(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (MulVF dst src));
   format %{ "mulps   $dst,$src\t! mul packed2F" %}
   ins_encode %{
@@ -8159,7 +7644,7 @@
 %}
 
 instruct vmul4F(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (MulVF dst src));
   format %{ "mulps   $dst,$src\t! mul packed4F" %}
   ins_encode %{
@@ -8236,7 +7721,7 @@
 
 // Doubles vector mul
 instruct vmul2D(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (MulVD dst src));
   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
   ins_encode %{
@@ -8311,8 +7796,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vcmov8F_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
-  predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 8);
+instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
   effect(TEMP dst, USE src1, USE src2);
   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
@@ -8327,8 +7812,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
-  predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
+instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
   effect(TEMP dst, USE src1, USE src2);
   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
@@ -8347,7 +7832,7 @@
 
 // Floats vector div
 instruct vdiv2F(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (DivVF dst src));
   format %{ "divps   $dst,$src\t! div packed2F" %}
   ins_encode %{
@@ -8379,7 +7864,7 @@
 %}
 
 instruct vdiv4F(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (DivVF dst src));
   format %{ "divps   $dst,$src\t! div packed4F" %}
   ins_encode %{
@@ -8456,7 +7941,7 @@
 
 // Doubles vector div
 instruct vdiv2D(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (DivVD dst src));
   format %{ "divpd   $dst,$src\t! div packed2D" %}
   ins_encode %{
@@ -8725,8 +8210,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
   ins_encode %{
@@ -8736,58 +8221,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
+instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -8813,8 +8252,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
   ins_encode %{
@@ -8824,58 +8263,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
+instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -8901,8 +8294,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
   ins_encode %{
@@ -8912,65 +8305,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
+instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
   ins_encode %{
@@ -8980,65 +8327,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
+instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
   ins_encode %{
     int vector_len = 1;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
   ins_encode %{
@@ -9049,7 +8350,7 @@
 %}
 
 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
   ins_encode %{
@@ -9061,7 +8362,7 @@
 
 // Integers vector left shift
 instruct vsll2I(vecD dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVI dst shift));
   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
   ins_encode %{
@@ -9071,7 +8372,7 @@
 %}
 
 instruct vsll2I_imm(vecD dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVI dst shift));
   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
   ins_encode %{
@@ -9103,7 +8404,7 @@
 %}
 
 instruct vsll4I(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (LShiftVI dst shift));
   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
   ins_encode %{
@@ -9113,7 +8414,7 @@
 %}
 
 instruct vsll4I_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (LShiftVI dst shift));
   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
   ins_encode %{
@@ -9190,7 +8491,7 @@
 
 // Longs vector left shift
 instruct vsll2L(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVL dst shift));
   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
   ins_encode %{
@@ -9200,7 +8501,7 @@
 %}
 
 instruct vsll2L_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVL dst shift));
   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
   ins_encode %{
@@ -9302,8 +8603,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
   ins_encode %{
@@ -9313,58 +8614,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
+instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -9390,8 +8645,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
   ins_encode %{
@@ -9401,58 +8656,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
+instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -9478,8 +8687,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
   ins_encode %{
@@ -9489,65 +8698,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
+instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
   ins_encode %{
@@ -9557,65 +8720,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
+instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
   ins_encode %{
     int vector_len = 1;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
   ins_encode %{
@@ -9626,7 +8743,7 @@
 %}
 
 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
   ins_encode %{
@@ -9638,7 +8755,7 @@
 
 // Integers vector logical right shift
 instruct vsrl2I(vecD dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVI dst shift));
   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
   ins_encode %{
@@ -9648,7 +8765,7 @@
 %}
 
 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVI dst shift));
   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
   ins_encode %{
@@ -9680,7 +8797,7 @@
 %}
 
 instruct vsrl4I(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (URShiftVI dst shift));
   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
   ins_encode %{
@@ -9690,7 +8807,7 @@
 %}
 
 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (URShiftVI dst shift));
   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
   ins_encode %{
@@ -9767,7 +8884,7 @@
 
 // Longs vector logical right shift
 instruct vsrl2L(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVL dst shift));
   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
   ins_encode %{
@@ -9777,7 +8894,7 @@
 %}
 
 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVL dst shift));
   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
   ins_encode %{
@@ -9866,7 +8983,7 @@
 %}
 
 instruct vsra2S_imm(vecS dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVS dst shift));
   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
   ins_encode %{
@@ -9875,8 +8992,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
   ins_encode %{
@@ -9886,58 +9003,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
+instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -9963,8 +9034,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
   ins_encode %{
@@ -9974,58 +9045,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
+instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -10051,8 +9076,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
   ins_encode %{
@@ -10062,65 +9087,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
+instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
   ins_encode %{
@@ -10130,65 +9109,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
+instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
   ins_encode %{
     int vector_len = 1;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
   ins_encode %{
@@ -10199,7 +9132,7 @@
 %}
 
 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
   ins_encode %{
@@ -10211,7 +9144,7 @@
 
 // Integers vector arithmetic right shift
 instruct vsra2I(vecD dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVI dst shift));
   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
   ins_encode %{
@@ -10221,7 +9154,7 @@
 %}
 
 instruct vsra2I_imm(vecD dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVI dst shift));
   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
   ins_encode %{
@@ -10253,7 +9186,7 @@
 %}
 
 instruct vsra4I(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (RShiftVI dst shift));
   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
   ins_encode %{
@@ -10263,7 +9196,7 @@
 %}
 
 instruct vsra4I_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (RShiftVI dst shift));
   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
   ins_encode %{
@@ -10344,7 +9277,7 @@
 // --------------------------------- AND --------------------------------------
 
 instruct vand4B(vecS dst, vecS src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
   match(Set dst (AndV dst src));
   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
   ins_encode %{
@@ -10376,7 +9309,7 @@
 %}
 
 instruct vand8B(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 8);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (AndV dst src));
   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
   ins_encode %{
@@ -10408,7 +9341,7 @@
 %}
 
 instruct vand16B(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 16);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (AndV dst src));
   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
   ins_encode %{
@@ -10486,7 +9419,7 @@
 // --------------------------------- OR ---------------------------------------
 
 instruct vor4B(vecS dst, vecS src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
   match(Set dst (OrV dst src));
   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
   ins_encode %{
@@ -10518,7 +9451,7 @@
 %}
 
 instruct vor8B(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 8);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (OrV dst src));
   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
   ins_encode %{
@@ -10550,7 +9483,7 @@
 %}
 
 instruct vor16B(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 16);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (OrV dst src));
   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
   ins_encode %{
@@ -10628,7 +9561,7 @@
 // --------------------------------- XOR --------------------------------------
 
 instruct vxor4B(vecS dst, vecS src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
   match(Set dst (XorV dst src));
   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
   ins_encode %{
@@ -10660,7 +9593,7 @@
 %}
 
 instruct vxor8B(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 8);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (XorV dst src));
   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
   ins_encode %{
@@ -10692,7 +9625,7 @@
 %}
 
 instruct vxor16B(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 16);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (XorV dst src));
   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
   ins_encode %{
--- a/src/hotspot/cpu/x86/x86_32.ad	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/x86_32.ad	Mon Sep 24 16:37:28 2018 -0700
@@ -4101,6 +4101,15 @@
   interface(REG_INTER);
 %}
 
+// Float register operands
+operand vlRegF() %{
+   constraint(ALLOC_IN_RC(float_reg_vl));
+   match(RegF);
+
+   format %{ %}
+   interface(REG_INTER);
+%}
+
 // XMM Double register operands
 operand regD() %{
   predicate( UseSSE>=2 );
@@ -4110,6 +4119,15 @@
   interface(REG_INTER);
 %}
 
+// Double register operands
+operand vlRegD() %{
+   constraint(ALLOC_IN_RC(double_reg_vl));
+   match(RegD);
+
+   format %{ %}
+   interface(REG_INTER);
+%}
+
 // Vectors : note, we use legacy registers to avoid extra (unneeded in 32-bit VM)
 // runtime code generation via reg_class_dynamic.
 operand vecS() %{
@@ -4120,6 +4138,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecS() %{
+  constraint(ALLOC_IN_RC(vectors_reg_legacy));
+  match(VecS);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecD() %{
   constraint(ALLOC_IN_RC(vectord_reg_legacy));
   match(VecD);
@@ -4128,6 +4154,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg_legacy));
+  match(VecD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecX() %{
   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
   match(VecX);
@@ -4136,6 +4170,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg_legacy));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecY() %{
   constraint(ALLOC_IN_RC(vectory_reg_legacy));
   match(VecY);
@@ -4144,6 +4186,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecY() %{
+  constraint(ALLOC_IN_RC(vectory_reg_legacy));
+  match(VecY);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 //----------Memory Operands----------------------------------------------------
 // Direct Memory Operand
 operand direct(immP addr) %{
@@ -6515,6 +6565,26 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load Double
+instruct MoveD2VL(vlRegD dst, regD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load double (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load Double
+instruct MoveVL2D(regD dst, vlRegD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load double (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Store XMM register to memory (single-precision floating point)
 // MOVSS instruction
 instruct storeF(memory mem, regF src) %{
@@ -6528,6 +6598,26 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load Float
+instruct MoveF2VL(vlRegF dst, regF src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load float (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load Float
+instruct MoveVL2F(regF dst, vlRegF src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load float (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Store Float
 instruct storeFPR( memory mem, regFPR1 src) %{
   predicate(UseSSE==0);
--- a/src/hotspot/cpu/x86/x86_64.ad	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/cpu/x86/x86_64.ad	Mon Sep 24 16:37:28 2018 -0700
@@ -3656,6 +3656,15 @@
    interface(REG_INTER);
 %}
 
+// Float register operands
+operand vlRegF() %{
+   constraint(ALLOC_IN_RC(float_reg_vl));
+   match(RegF);
+
+   format %{ %}
+   interface(REG_INTER);
+%}
+
 // Double register operands
 operand regD() %{
    constraint(ALLOC_IN_RC(double_reg));
@@ -3665,33 +3674,75 @@
    interface(REG_INTER);
 %}
 
+// Double register operands
+operand vlRegD() %{
+   constraint(ALLOC_IN_RC(double_reg_vl));
+   match(RegD);
+
+   format %{ %}
+   interface(REG_INTER);
+%}
+
 // Vectors
 operand vecS() %{
-  constraint(ALLOC_IN_RC(vectors_reg));
+  constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
   match(VecS);
 
   format %{ %}
   interface(REG_INTER);
 %}
 
-operand vecD() %{
-  constraint(ALLOC_IN_RC(vectord_reg));
-  match(VecD);
+// Vectors
+operand legVecS() %{
+  constraint(ALLOC_IN_RC(vectors_reg_legacy));
+  match(VecS);
 
   format %{ %}
   interface(REG_INTER);
 %}
 
-operand vecX() %{
-  constraint(ALLOC_IN_RC(vectorx_reg));
-  match(VecX);
+operand vecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
+  match(VecD);
 
   format %{ %}
   interface(REG_INTER);
 %}
 
+operand legVecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg_legacy));
+  match(VecD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand legVecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg_legacy));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecY() %{
-  constraint(ALLOC_IN_RC(vectory_reg));
+  constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
+  match(VecY);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand legVecY() %{
+  constraint(ALLOC_IN_RC(vectory_reg_legacy));
   match(VecY);
 
   format %{ %}
@@ -5287,6 +5338,26 @@
   ins_pipe(pipe_slow); // XXX
 %}
 
+// Load Float
+instruct MoveF2VL(vlRegF dst, regF src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load float (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load Float
+instruct MoveVL2F(regF dst, vlRegF src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load float (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load Double
 instruct loadD_partial(regD dst, memory mem)
 %{
@@ -5314,6 +5385,26 @@
   ins_pipe(pipe_slow); // XXX
 %}
 
+// Load Double
+instruct MoveD2VL(vlRegD dst, regD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load double (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load Double
+instruct MoveVL2D(regD dst, vlRegD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load double (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load Effective Address
 instruct leaP8(rRegP dst, indOffset8 mem)
 %{
@@ -10858,7 +10949,7 @@
 %}
 
 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
-                         rax_RegI result, regD tmp1, rFlagsReg cr)
+                         rax_RegI result, legVecS tmp1, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -10874,7 +10965,7 @@
 %}
 
 instruct string_compareU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
-                         rax_RegI result, regD tmp1, rFlagsReg cr)
+                         rax_RegI result, legVecS tmp1, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -10890,7 +10981,7 @@
 %}
 
 instruct string_compareLU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
-                          rax_RegI result, regD tmp1, rFlagsReg cr)
+                          rax_RegI result, legVecS tmp1, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -10906,7 +10997,7 @@
 %}
 
 instruct string_compareUL(rsi_RegP str1, rdx_RegI cnt1, rdi_RegP str2, rcx_RegI cnt2,
-                          rax_RegI result, regD tmp1, rFlagsReg cr)
+                          rax_RegI result, legVecS tmp1, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -10923,7 +11014,7 @@
 
 // fast search of substring with known size.
 instruct string_indexof_conL(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, immI int_cnt2,
-                             rbx_RegI result, regD vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
+                             rbx_RegI result, legVecS vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
@@ -10952,7 +11043,7 @@
 
 // fast search of substring with known size.
 instruct string_indexof_conU(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, immI int_cnt2,
-                             rbx_RegI result, regD vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
+                             rbx_RegI result, legVecS vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
@@ -10981,7 +11072,7 @@
 
 // fast search of substring with known size.
 instruct string_indexof_conUL(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, immI int_cnt2,
-                             rbx_RegI result, regD vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
+                             rbx_RegI result, legVecS vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
@@ -11009,7 +11100,7 @@
 %}
 
 instruct string_indexofL(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, rax_RegI cnt2,
-                         rbx_RegI result, regD vec, rcx_RegI tmp, rFlagsReg cr)
+                         rbx_RegI result, legVecS vec, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -11026,7 +11117,7 @@
 %}
 
 instruct string_indexofU(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, rax_RegI cnt2,
-                         rbx_RegI result, regD vec, rcx_RegI tmp, rFlagsReg cr)
+                         rbx_RegI result, legVecS vec, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -11043,7 +11134,7 @@
 %}
 
 instruct string_indexofUL(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, rax_RegI cnt2,
-                         rbx_RegI result, regD vec, rcx_RegI tmp, rFlagsReg cr)
+                         rbx_RegI result, legVecS vec, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -11060,7 +11151,7 @@
 %}
 
 instruct string_indexofU_char(rdi_RegP str1, rdx_RegI cnt1, rax_RegI ch,
-                              rbx_RegI result, regD vec1, regD vec2, regD vec3, rcx_RegI tmp, rFlagsReg cr)
+                              rbx_RegI result, legVecS vec1, legVecS vec2, legVecS vec3, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics);
   match(Set result (StrIndexOfChar (Binary str1 cnt1) ch));
@@ -11075,7 +11166,7 @@
 
 // fast string equals
 instruct string_equals(rdi_RegP str1, rsi_RegP str2, rcx_RegI cnt, rax_RegI result,
-                       regD tmp1, regD tmp2, rbx_RegI tmp3, rFlagsReg cr)
+                       legVecS tmp1, legVecS tmp2, rbx_RegI tmp3, rFlagsReg cr)
 %{
   match(Set result (StrEquals (Binary str1 str2) cnt));
   effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL tmp3, KILL cr);
@@ -11091,7 +11182,7 @@
 
 // fast array equals
 instruct array_equalsB(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result,
-                       regD tmp1, regD tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr)
+                       legVecS tmp1, legVecS tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr)
 %{
   predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (AryEq ary1 ary2));
@@ -11107,7 +11198,7 @@
 %}
 
 instruct array_equalsC(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result,
-                      regD tmp1, regD tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr)
+                      legVecS tmp1, legVecS tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr)
 %{
   predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (AryEq ary1 ary2));
@@ -11123,7 +11214,7 @@
 %}
 
 instruct has_negatives(rsi_RegP ary1, rcx_RegI len, rax_RegI result,
-                      regD tmp1, regD tmp2, rbx_RegI tmp3, rFlagsReg cr)
+                      legVecS tmp1, legVecS tmp2, rbx_RegI tmp3, rFlagsReg cr)
 %{
   match(Set result (HasNegatives ary1 len));
   effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL len, KILL tmp3, KILL cr);
@@ -11138,7 +11229,7 @@
 %}
 
 // fast char[] to byte[] compression
-instruct string_compress(rsi_RegP src, rdi_RegP dst, rdx_RegI len, regD tmp1, regD tmp2, regD tmp3, regD tmp4,
+instruct string_compress(rsi_RegP src, rdi_RegP dst, rdx_RegI len, legVecS tmp1, legVecS tmp2, legVecS tmp3, legVecS tmp4,
                          rcx_RegI tmp5, rax_RegI result, rFlagsReg cr) %{
   match(Set result (StrCompressedCopy src (Binary dst len)));
   effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL tmp5, KILL cr);
@@ -11154,7 +11245,7 @@
 
 // fast byte[] to char[] inflation
 instruct string_inflate(Universe dummy, rsi_RegP src, rdi_RegP dst, rdx_RegI len,
-                        regD tmp1, rcx_RegI tmp2, rFlagsReg cr) %{
+                        legVecS tmp1, rcx_RegI tmp2, rFlagsReg cr) %{
   match(Set dummy (StrInflatedCopy src (Binary dst len)));
   effect(TEMP tmp1, TEMP tmp2, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);
 
@@ -11168,7 +11259,7 @@
 
 // encode char[] to byte[] in ISO_8859_1
 instruct encode_iso_array(rsi_RegP src, rdi_RegP dst, rdx_RegI len,
-                          regD tmp1, regD tmp2, regD tmp3, regD tmp4,
+                          legVecS tmp1, legVecS tmp2, legVecS tmp3, legVecS tmp4,
                           rcx_RegI tmp5, rax_RegI result, rFlagsReg cr) %{
   match(Set result (EncodeISOArray src (Binary dst len)));
   effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL tmp5, KILL cr);
--- a/src/hotspot/share/c1/c1_LIR.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/share/c1/c1_LIR.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -472,7 +472,6 @@
     case lir_pop:            // input always valid, result and info always invalid
     case lir_return:         // input always valid, result and info always invalid
     case lir_leal:           // input and result always valid, info always invalid
-    case lir_neg:            // input and result always valid, info always invalid
     case lir_monaddr:        // input and result always valid, info always invalid
     case lir_null_check:     // input and info always valid, result always invalid
     case lir_move:           // input and result always valid, may have info
@@ -580,6 +579,7 @@
     case lir_rem:
     case lir_sqrt:
     case lir_abs:
+    case lir_neg:
     case lir_logic_and:
     case lir_logic_or:
     case lir_logic_xor:
@@ -1662,7 +1662,6 @@
      case lir_null_check:            s = "null_check";    break;
      case lir_return:                s = "return";        break;
      case lir_safepoint:             s = "safepoint";     break;
-     case lir_neg:                   s = "neg";           break;
      case lir_leal:                  s = "leal";          break;
      case lir_branch:                s = "branch";        break;
      case lir_cond_float_branch:     s = "flt_cond_br";   break;
@@ -1690,6 +1689,7 @@
      case lir_div_strictfp:          s = "div_strictfp";  break;
      case lir_rem:                   s = "rem";           break;
      case lir_abs:                   s = "abs";           break;
+     case lir_neg:                   s = "neg";           break;
      case lir_sqrt:                  s = "sqrt";          break;
      case lir_logic_and:             s = "logic_and";     break;
      case lir_logic_or:              s = "logic_or";      break;
--- a/src/hotspot/share/c1/c1_LIR.hpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/share/c1/c1_LIR.hpp	Mon Sep 24 16:37:28 2018 -0700
@@ -911,7 +911,6 @@
       , lir_null_check
       , lir_return
       , lir_leal
-      , lir_neg
       , lir_branch
       , lir_cond_float_branch
       , lir_move
@@ -939,6 +938,7 @@
       , lir_rem
       , lir_sqrt
       , lir_abs
+      , lir_neg
       , lir_tan
       , lir_log10
       , lir_logic_and
@@ -2075,7 +2075,6 @@
 
   void branch_destination(Label* lbl)            { append(new LIR_OpLabel(lbl)); }
 
-  void negate(LIR_Opr from, LIR_Opr to)          { append(new LIR_Op1(lir_neg, from, to)); }
   void leal(LIR_Opr from, LIR_Opr result_reg, LIR_PatchCode patch_code = lir_patch_none, CodeEmitInfo* info = NULL) { append(new LIR_Op1(lir_leal, from, result_reg, T_ILLEGAL, patch_code, info)); }
 
   // result is a stack location for old backend and vreg for UseLinearScan
@@ -2159,6 +2158,7 @@
                LIR_Opr t1, LIR_Opr t2, LIR_Opr result = LIR_OprFact::illegalOpr);
 
   void abs (LIR_Opr from, LIR_Opr to, LIR_Opr tmp)                { append(new LIR_Op2(lir_abs , from, tmp, to)); }
+  void negate(LIR_Opr from, LIR_Opr to, LIR_Opr tmp = LIR_OprFact::illegalOpr)              { append(new LIR_Op2(lir_neg, from, tmp, to)); }
   void sqrt(LIR_Opr from, LIR_Opr to, LIR_Opr tmp)                { append(new LIR_Op2(lir_sqrt, from, tmp, to)); }
   void fmad(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmad, from, from1, from2, to)); }
   void fmaf(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmaf, from, from1, from2, to)); }
--- a/src/hotspot/share/c1/c1_LIRAssembler.cpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/share/c1/c1_LIRAssembler.cpp	Mon Sep 24 16:37:28 2018 -0700
@@ -554,10 +554,6 @@
       pop(op->in_opr());
       break;
 
-    case lir_neg:
-      negate(op->in_opr(), op->result_opr());
-      break;
-
     case lir_leal:
       leal(op->in_opr(), op->result_opr(), op->patch_code(), op->info());
       break;
@@ -750,6 +746,10 @@
       intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op);
       break;
 
+    case lir_neg:
+      negate(op->in_opr1(), op->result_opr(), op->in_opr2());
+      break;
+
     case lir_logic_and:
     case lir_logic_or:
     case lir_logic_xor:
--- a/src/hotspot/share/c1/c1_LIRAssembler.hpp	Mon Sep 24 13:51:22 2018 -0700
+++ b/src/hotspot/share/c1/c1_LIRAssembler.hpp	Mon Sep 24 16:37:28 2018 -0700
@@ -239,7 +239,7 @@
   void align_backward_branch_target();
   void align_call(LIR_Code code);
 
-  void negate(LIR_Opr left, LIR_Opr dest);
+  void negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp = LIR_OprFact::illegalOpr);
   void leal(LIR_Opr src, LIR_Opr dest, LIR_PatchCode patch_code, CodeEmitInfo* info);
 
   void rt_call(LIR_Opr result, address dest, const LIR_OprList* args, LIR_Opr tmp, CodeEmitInfo* info);