changeset 51938:d3e0d57cd3ff

Merge
author prr
date Thu, 27 Sep 2018 10:49:10 -0700
parents c3fc25df8f5a 11fd6c8188d9
children afb3c0884bf1
files test/jdk/ProblemList.txt test/jdk/TEST.groups test/langtools/tools/javadoc/api/basic/IsSupportedOptionTest.java
diffstat 196 files changed, 5314 insertions(+), 5624 deletions(-)
--- a/.hgtags	Wed Sep 26 18:36:55 2018 +0100
+++ b/.hgtags	Thu Sep 27 10:49:10 2018 -0700
@@ -513,3 +513,4 @@
 8f594f75e0547d4ca16649cb3501659e3155e81b jdk-12+10
 f0f5d23449d31f1b3580c8a73313918cafeaefd7 jdk-12+11
 15094d12a632f452a2064318a4e416d0c7a9ce0c jdk-12+12
+511a9946f83e3e3c7b9dbe1840367063fb39b4e1 jdk-12+13
--- a/make/RunTests.gmk	Wed Sep 26 18:36:55 2018 +0100
+++ b/make/RunTests.gmk	Thu Sep 27 10:49:10 2018 -0700
@@ -531,8 +531,8 @@
     $1_JTREG_BASIC_OPTIONS += $$(addprefix -exclude:, $$($1_JTREG_PROBLEM_LIST))
   endif
 
-  ifneq ($$(JIB_JAR), )
-    $1_JTREG_BASIC_OPTIONS += -cpa:$$(JIB_JAR)
+  ifneq ($$(JIB_HOME), )
+    $1_JTREG_BASIC_OPTIONS += -e:JIB_HOME=$$(JIB_HOME)
   endif
 
   $1_JTREG_BASIC_OPTIONS += -e:TEST_IMAGE_GRAAL_DIR=${TEST_IMAGE_DIR}/hotspot/jtreg/graal
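
The hunk above, together with the spec.gmk.in and toolchain.m4 hunks below, switches the build from putting jib.jar on the jtreg classpath (-cpa:) to exporting the jib install directory to tests as an environment variable (-e:JIB_HOME=...). A minimal C++ sketch of how a test-support helper might consume that variable; the launcher path is an assumed layout, not taken from this changeset:

    #include <cstdlib>
    #include <string>

    // Hypothetical consumer of JIB_HOME, which RunTests.gmk now forwards
    // to jtreg via -e:JIB_HOME=$(JIB_HOME).
    std::string jib_launcher() {
      const char* home = std::getenv("JIB_HOME");  // set by the -e: option
      if (home == nullptr) return "";              // jib not configured
      return std::string(home) + "/bin/jib.sh";    // assumed layout
    }
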
--- a/make/autoconf/spec.gmk.in	Wed Sep 26 18:36:55 2018 +0100
+++ b/make/autoconf/spec.gmk.in	Thu Sep 27 10:49:10 2018 -0700
@@ -723,7 +723,7 @@
 XATTR:=@XATTR@
 JT_HOME:=@JT_HOME@
 JTREGEXE:=@JTREGEXE@
-JIB_JAR:=@JIB_JAR@
+JIB_HOME:=@JIB_HOME@
 XCODEBUILD=@XCODEBUILD@
 DTRACE := @DTRACE@
 FIXPATH:=@FIXPATH@
--- a/make/autoconf/toolchain.m4	Wed Sep 26 18:36:55 2018 +0100
+++ b/make/autoconf/toolchain.m4	Thu Sep 27 10:49:10 2018 -0700
@@ -1144,5 +1144,5 @@
     fi
   fi
 
-  AC_SUBST(JIB_JAR)
+  AC_SUBST(JIB_HOME)
 ])
--- a/make/conf/jib-profiles.js	Wed Sep 26 18:36:55 2018 +0100
+++ b/make/conf/jib-profiles.js	Thu Sep 27 10:49:10 2018 -0700
@@ -840,7 +840,7 @@
         linux_x64: "gcc7.3.0-OEL6.4+1.0",
         macosx_x64: "Xcode9.4-MacOSX10.13+1.0",
         solaris_x64: "SS12u4-Solaris11u1+1.0",
-        solaris_sparcv9: "SS12u4-Solaris11u1+1.1",
+        solaris_sparcv9: "SS12u6-Solaris11u3+1.0",
         windows_x64: "VS2017-15.5.5+1.0",
         linux_aarch64: (input.profile != null && input.profile.indexOf("arm64") >= 0
                     ? "gcc-linaro-aarch64-linux-gnu-4.8-2013.11_linux+1.0"
@@ -961,9 +961,9 @@
             ext: "zip",
             classifier: "distribution",
             revision: "3.0-SNAPSHOT",
-            environment_name: "JIB_JAR",
+            environment_name: "JIB_HOME",
             environment_value: input.get("jib", "install_path")
-                + "/jib-3.0-SNAPSHOT-distribution/lib/jib-3.0-SNAPSHOT.jar"
+                + "/jib-3.0-SNAPSHOT-distribution"
         },
 
         ant: {
--- a/make/devkit/createSolarisDevkit12.6.sh	Wed Sep 26 18:36:55 2018 +0100
+++ b/make/devkit/createSolarisDevkit12.6.sh	Thu Sep 27 10:49:10 2018 -0700
@@ -34,18 +34,19 @@
 # install in a separate temporary image.
 #
 # The Solaris Studio installation must contain at least these packages:
-# developer/developerstudio-126/backend               12.6-1.0.0.0               i--
-# developer/developerstudio-126/c++                   12.6-1.0.0.0               i--
-# developer/developerstudio-126/cc                    12.6-1.0.0.0               i--
-# developer/developerstudio-126/dbx (solarisstudio)   12.6-1.0.0.0               i--
-# developer/developerstudio-126/library/c++-libs      12.6-1.0.0.0               i--
-# developer/developerstudio-126/library/math-libs     12.6-1.0.0.0               i--
-# developer/developerstudio-126/library/c-libs        12.6-1.0.0.0               i--
-# developer/developerstudio-126/library/studio-gccrt  12.6-1.0.0.0               i--
-# developer/developerstudio-126/studio-common         12.6-1.0.0.0               i--
-# developer/developerstudio-126/studio-ja             12.6-1.0.0.0               i--
-# developer/developerstudio-126/studio-legal          12.6-1.0.0.0               i--
-# developer/developerstudio-126/studio-zhCN           12.6-1.0.0.0               i--
+#developer/developerstudio-126/backend                12.6-1.0.0.1
+#developer/developerstudio-126/c++                    12.6-1.0.2.0
+#developer/developerstudio-126/cc                     12.6-1.0.1.0
+#developer/developerstudio-126/dbx                    12.6-1.0.0.1
+#developer/developerstudio-126/library/c++-libs       12.6-1.0.2.0
+#developer/developerstudio-126/library/c-libs         12.6-1.0.0.1
+#developer/developerstudio-126/library/f90-libs       12.6-1.0.0.1
+#developer/developerstudio-126/library/math-libs      12.6-1.0.0.1
+#developer/developerstudio-126/library/studio-gccrt   12.6-1.0.0.1
+#developer/developerstudio-126/studio-common          12.6-1.0.0.1
+#developer/developerstudio-126/studio-ja              12.6-1.0.0.1
+#developer/developerstudio-126/studio-legal           12.6-1.0.0.1
+#developer/developerstudio-126/studio-zhCN            12.6-1.0.0.1
 #
 # erik.joelsson@oracle.com
 
@@ -93,7 +94,7 @@
   pkg -R $INSTALL_ROOT set-publisher -P -g ${PUBLISHER_URI} solaris
   sudo pkg -R $INSTALL_ROOT install --accept entire@$SOLARIS_ENTIRE_VERSION \
       system/install developer/gnu-binutils system/library/mmheap system/picl \
-      developer/assembler
+      developer/assembler system/library/freetype-2
 else
   echo "Skipping installing packages"
 fi
--- a/make/launcher/LauncherCommon.gmk	Wed Sep 26 18:36:55 2018 +0100
+++ b/make/launcher/LauncherCommon.gmk	Thu Sep 27 10:49:10 2018 -0700
@@ -36,9 +36,7 @@
   LAUNCHER_CFLAGS += -fvisibility=hidden
   LDFLAGS_JDKEXE += -Wl,--exclude-libs,ALL
 else ifeq ($(TOOLCHAIN_TYPE), clang)
-  ifneq ($(OPENJDK_TARGET_OS), macosx)
-    LAUNCHER_CFLAGS += -fvisibility=hidden
-  endif
+  LAUNCHER_CFLAGS += -fvisibility=hidden
 else ifeq ($(TOOLCHAIN_TYPE), solstudio)
   LAUNCHER_CFLAGS += -xldscope=hidden
 else ifeq ($(TOOLCHAIN_TYPE), xlc)
--- a/make/lib/CoreLibraries.gmk	Wed Sep 26 18:36:55 2018 +0100
+++ b/make/lib/CoreLibraries.gmk	Thu Sep 27 10:49:10 2018 -0700
@@ -244,7 +244,7 @@
       EXCLUDE_FILES := $(LIBJLI_EXCLUDE_FILES), \
       EXTRA_FILES := $(LIBJLI_EXTRA_FILES), \
       OPTIMIZATION := HIGH, \
-      CFLAGS := $(STATIC_LIBRARY_FLAGS) $(LIBJLI_CFLAGS_JDKLIB) $(LIBJLI_CFLAGS) \
+      CFLAGS := $(STATIC_LIBRARY_FLAGS) $(CFLAGS_JDKLIB) $(LIBJLI_CFLAGS) \
           $(addprefix -I, $(LIBJLI_SRC_DIRS)), \
       ARFLAGS := $(ARFLAGS), \
       OBJECT_DIR := $(SUPPORT_OUTPUTDIR)/native/$(MODULE)/libjli_static))
--- a/make/lib/LibCommon.gmk	Wed Sep 26 18:36:55 2018 +0100
+++ b/make/lib/LibCommon.gmk	Thu Sep 27 10:49:10 2018 -0700
@@ -46,11 +46,9 @@
   LDFLAGS_JDKLIB += -Wl,--exclude-libs,ALL
   EXPORT_ALL_SYMBOLS := -fvisibility=default
 else ifeq ($(TOOLCHAIN_TYPE), clang)
-  ifneq ($(OPENJDK_TARGET_OS), macosx)
-    CFLAGS_JDKLIB += -fvisibility=hidden
-    CXXFLAGS_JDKLIB += -fvisibility=hidden
-    EXPORT_ALL_SYMBOLS := -fvisibility=default
-  endif
+  CFLAGS_JDKLIB += -fvisibility=hidden
+  CXXFLAGS_JDKLIB += -fvisibility=hidden
+  EXPORT_ALL_SYMBOLS := -fvisibility=default
 else ifeq ($(TOOLCHAIN_TYPE), solstudio)
   CFLAGS_JDKLIB += -xldscope=hidden
   CXXFLAGS_JDKLIB += -xldscope=hidden
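
Both this hunk and the LauncherCommon.gmk hunk above drop the macosx exception, so clang builds now also compile JDK libraries and launchers with -fvisibility=hidden; EXPORT_ALL_SYMBOLS re-exports individual symbols where a library needs it. A minimal sketch of the per-symbol override, assuming gcc/clang attribute syntax:

    // Under -fvisibility=hidden every symbol is DSO-local by default.
    __attribute__((visibility("default")))
    int exported_entry() { return 42; }  // kept in the dynamic symbol table

    int internal_helper() { return 7; }  // hidden: not exported from the library
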
--- a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -1025,37 +1025,17 @@
   return exact_log2(elem_size);
 }
 
-void LIR_Assembler::arithmetic_idiv(LIR_Op3* op, bool is_irem) {
-  Register Rdividend = op->in_opr1()->as_register();
-  Register Rdivisor  = op->in_opr2()->as_register();
-  Register Rscratch  = op->in_opr3()->as_register();
-  Register Rresult   = op->result_opr()->as_register();
-  int divisor = -1;
-
-  /*
-  TODO: For some reason, using the Rscratch that gets passed in is
-  not possible because the register allocator does not see the tmp reg
-  as used, and assignes it the same register as Rdividend. We use rscratch1
-   instead.
-
-  assert(Rdividend != Rscratch, "");
-  assert(Rdivisor  != Rscratch, "");
-  */
-
-  if (Rdivisor == noreg && is_power_of_2(divisor)) {
-    // convert division by a power of two into some shifts and logical operations
-  }
-
-  __ corrected_idivl(Rresult, Rdividend, Rdivisor, is_irem, rscratch1);
-}
 
 void LIR_Assembler::emit_op3(LIR_Op3* op) {
   switch (op->code()) {
   case lir_idiv:
-    arithmetic_idiv(op, false);
-    break;
   case lir_irem:
-    arithmetic_idiv(op, true);
+    arithmetic_idiv(op->code(),
+                    op->in_opr1(),
+                    op->in_opr2(),
+                    op->in_opr3(),
+                    op->result_opr(),
+                    op->info());
     break;
   case lir_fmad:
     __ fmaddd(op->result_opr()->as_double_reg(),
@@ -1752,16 +1732,43 @@
       }
 
     } else if (right->is_constant()) {
-      jlong c = right->as_constant_ptr()->as_jlong_bits();
+      jlong c = right->as_constant_ptr()->as_jlong();
       Register dreg = as_reg(dest);
-      assert(code == lir_add || code == lir_sub, "mismatched arithmetic op");
-      if (c == 0 && dreg == lreg_lo) {
-        COMMENT("effective nop elided");
-        return;
-      }
       switch (code) {
-        case lir_add: __ add(dreg, lreg_lo, c); break;
-        case lir_sub: __ sub(dreg, lreg_lo, c); break;
+        case lir_add:
+        case lir_sub:
+          if (c == 0 && dreg == lreg_lo) {
+            COMMENT("effective nop elided");
+            return;
+          }
+          code == lir_add ? __ add(dreg, lreg_lo, c) : __ sub(dreg, lreg_lo, c);
+          break;
+        case lir_div:
+          assert(c > 0 && is_power_of_2_long(c), "divisor must be power-of-2 constant");
+          if (c == 1) {
+            // move lreg_lo to dreg if divisor is 1
+            __ mov(dreg, lreg_lo);
+          } else {
+            unsigned int shift = exact_log2_long(c);
+            // use rscratch1 as intermediate result register
+            __ asr(rscratch1, lreg_lo, 63);
+            __ add(rscratch1, lreg_lo, rscratch1, Assembler::LSR, 64 - shift);
+            __ asr(dreg, rscratch1, shift);
+          }
+          break;
+        case lir_rem:
+          assert(c > 0 && is_power_of_2_long(c), "divisor must be power-of-2 constant");
+          if (c == 1) {
+            // move 0 to dreg if divisor is 1
+            __ mov(dreg, zr);
+          } else {
+            // use rscratch1 as intermediate result register
+            __ negs(rscratch1, lreg_lo);
+            __ andr(dreg, lreg_lo, c - 1);
+            __ andr(rscratch1, rscratch1, c - 1);
+            __ csneg(dreg, dreg, rscratch1, Assembler::MI);
+          }
+          break;
         default:
           ShouldNotReachHere();
       }
@@ -1862,7 +1869,51 @@
 
 
 
-void LIR_Assembler::arithmetic_idiv(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr temp, LIR_Opr result, CodeEmitInfo* info) { Unimplemented(); }
+void LIR_Assembler::arithmetic_idiv(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr illegal, LIR_Opr result, CodeEmitInfo* info) {
+
+  // opcode check
+  assert((code == lir_idiv) || (code == lir_irem), "opcode must be idiv or irem");
+  bool is_irem = (code == lir_irem);
+
+  // operand check
+  assert(left->is_single_cpu(),   "left must be register");
+  assert(right->is_single_cpu() || right->is_constant(),  "right must be register or constant");
+  assert(result->is_single_cpu(), "result must be register");
+  Register lreg = left->as_register();
+  Register dreg = result->as_register();
+
+  // power-of-2 constant check and codegen
+  if (right->is_constant()) {
+    int c = right->as_constant_ptr()->as_jint();
+    assert(c > 0 && is_power_of_2(c), "divisor must be power-of-2 constant");
+    if (is_irem) {
+      if (c == 1) {
+        // move 0 to dreg if divisor is 1
+        __ movw(dreg, zr);
+      } else {
+        // use rscratch1 as intermediate result register
+        __ negsw(rscratch1, lreg);
+        __ andw(dreg, lreg, c - 1);
+        __ andw(rscratch1, rscratch1, c - 1);
+        __ csnegw(dreg, dreg, rscratch1, Assembler::MI);
+      }
+    } else {
+      if (c == 1) {
+        // move lreg to dreg if divisor is 1
+        __ movw(dreg, lreg);
+      } else {
+        unsigned int shift = exact_log2(c);
+        // use rscratch1 as intermediate result register
+        __ asrw(rscratch1, lreg, 31);
+        __ addw(rscratch1, lreg, rscratch1, Assembler::LSR, 32 - shift);
+        __ asrw(dreg, rscratch1, shift);
+      }
+    }
+  } else {
+    Register rreg = right->as_register();
+    __ corrected_idivl(dreg, lreg, rreg, is_irem, rscratch1);
+  }
+}
 
 
 void LIR_Assembler::comp_op(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Op2* op) {
@@ -2792,7 +2843,10 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
+
   if (left->is_single_cpu()) {
     assert(dest->is_single_cpu(), "expect single result reg");
     __ negw(dest->as_register(), left->as_register());
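
The rewritten arithmetic paths above strength-reduce division and remainder by a power-of-2 constant instead of issuing a hardware divide; the 64-bit (ldiv/lrem) case in arith_op and the 32-bit case in arithmetic_idiv use the same idea with shift amounts 63/64 and 31/32. A minimal C++ sketch of what the emitted sequences compute, assuming 1 <= s <= 31 (the c == 1 case is a plain move in the code above):

    #include <cassert>
    #include <cstdint>

    // asrw / add ..., LSR #(32-s) / asrw: truncating idiv by c == 1 << s.
    int32_t idiv_pow2(int32_t x, unsigned s) {
      uint32_t sign = uint32_t(x >> 31);        // asrw: 0 or 0xffffffff
      uint32_t bias = sign >> (32 - s);         // c - 1 exactly when x < 0
      return int32_t(uint32_t(x) + bias) >> s;  // final arithmetic shift
    }

    // negsw / andw / andw / csnegw: irem by c == 1 << s, where the
    // remainder takes the dividend's sign.
    int32_t irem_pow2(int32_t x, unsigned s) {
      uint32_t m = (1u << s) - 1;
      uint32_t pos = uint32_t(x) & m;              // andw(dreg, lreg, c - 1)
      uint32_t neg = (0u - uint32_t(x)) & m;       // negsw + andw on rscratch1
      return x < 0 ? -int32_t(neg) : int32_t(pos); // csnegw on MI
    }

    int main() {
      assert(idiv_pow2(-7, 1) == -3 && idiv_pow2(7, 1) == 3);
      assert(irem_pow2(-7, 3) == -7 && irem_pow2(9, 3) == 1);
      return 0;
    }
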
--- a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -75,8 +75,6 @@
     _deopt_handler_size = 7 * NativeInstruction::instruction_size
   };
 
-  void arithmetic_idiv(LIR_Op3* op, bool is_irem);
-
 public:
 
   void store_parameter(Register r, int offset_from_esp_in_words);
--- a/src/hotspot/cpu/aarch64/c1_LIRGenerator_aarch64.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/aarch64/c1_LIRGenerator_aarch64.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -440,17 +440,26 @@
 
   if (x->op() == Bytecodes::_ldiv || x->op() == Bytecodes::_lrem) {
 
-    // the check for division by zero destroys the right operand
-    right.set_destroys_register();
-
-    // check for division by zero (destroys registers of right operand!)
-    CodeEmitInfo* info = state_for(x);
-
     left.load_item();
-    right.load_item();
-
-    __ cmp(lir_cond_equal, right.result(), LIR_OprFact::longConst(0));
-    __ branch(lir_cond_equal, T_LONG, new DivByZeroStub(info));
+    bool need_zero_check = true;
+    if (right.is_constant()) {
+      jlong c = right.get_jlong_constant();
+      // no need to do div-by-zero check if the divisor is a non-zero constant
+      if (c != 0) need_zero_check = false;
+      // do not load right if the divisor is a power-of-2 constant
+      if (c > 0 && is_power_of_2_long(c)) {
+        right.dont_load_item();
+      } else {
+        right.load_item();
+      }
+    } else {
+      right.load_item();
+    }
+    if (need_zero_check) {
+      CodeEmitInfo* info = state_for(x);
+      __ cmp(lir_cond_equal, right.result(), LIR_OprFact::longConst(0));
+      __ branch(lir_cond_equal, T_LONG, new DivByZeroStub(info));
+    }
 
     rlock_result(x);
     switch (x->op()) {
@@ -506,19 +515,32 @@
   // do not need to load right, as we can handle stack and constants
   if (x->op() == Bytecodes::_idiv || x->op() == Bytecodes::_irem) {
 
-    right_arg->load_item();
     rlock_result(x);
+    bool need_zero_check = true;
+    if (right.is_constant()) {
+      jint c = right.get_jint_constant();
+      // no need to do div-by-zero check if the divisor is a non-zero constant
+      if (c != 0) need_zero_check = false;
+      // do not load right if the divisor is a power-of-2 constant
+      if (c > 0 && is_power_of_2(c)) {
+        right_arg->dont_load_item();
+      } else {
+        right_arg->load_item();
+      }
+    } else {
+      right_arg->load_item();
+    }
+    if (need_zero_check) {
+      CodeEmitInfo* info = state_for(x);
+      __ cmp(lir_cond_equal, right_arg->result(), LIR_OprFact::longConst(0));
+      __ branch(lir_cond_equal, T_INT, new DivByZeroStub(info));
+    }
 
-    CodeEmitInfo* info = state_for(x);
-    LIR_Opr tmp = new_register(T_INT);
-    __ cmp(lir_cond_equal, right_arg->result(), LIR_OprFact::longConst(0));
-    __ branch(lir_cond_equal, T_INT, new DivByZeroStub(info));
-    info = state_for(x);
-
+    LIR_Opr ill = LIR_OprFact::illegalOpr;
     if (x->op() == Bytecodes::_irem) {
-      __ irem(left_arg->result(), right_arg->result(), x->operand(), tmp, NULL);
+      __ irem(left_arg->result(), right_arg->result(), x->operand(), ill, NULL);
     } else if (x->op() == Bytecodes::_idiv) {
-      __ idiv(left_arg->result(), right_arg->result(), x->operand(), tmp, NULL);
+      __ idiv(left_arg->result(), right_arg->result(), x->operand(), ill, NULL);
     }
 
   } else if (x->op() == Bytecodes::_iadd || x->op() == Bytecodes::_isub) {
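
The generator changes above avoid both the explicit zero check and loading the divisor when they are provably unnecessary. A simplified paraphrase of the decision (hypothetical helper, not the HotSpot API):

    // A non-zero constant divisor can never trap, so no DivByZeroStub is
    // needed; a positive power-of-2 constant stays unloaded so the
    // assembler path above can strength-reduce it.
    struct DivisorPlan { bool need_zero_check; bool load_divisor; };

    DivisorPlan plan_divisor(bool is_constant, long c) {
      if (!is_constant) return {true, true};
      bool pow2 = c > 0 && (c & (c - 1)) == 0;
      return {c == 0, !pow2};
    }
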
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -822,6 +822,15 @@
   return stub_start_addr;
 }
 
+void MacroAssembler::c2bool(Register x) {
+  // implements x == 0 ? 0 : 1
+  // note: must only look at least-significant byte of x
+  //       since C-style booleans are stored in one byte
+  //       only! (was bug)
+  tst(x, 0xff);
+  cset(x, Assembler::NE);
+}
+
 address MacroAssembler::ic_call(address entry, jint method_index) {
   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
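
The new macro normalizes a C 'boolean' return value to Java's 0/1 while trusting only the least-significant byte, since the upper bits of the register are undefined for a one-byte return type. Its semantics as plain C++:

    // tst x, #0xff ; cset x, ne : any non-zero low byte becomes 1.
    static inline long c2bool_sem(long x) {
      return (x & 0xff) != 0;
    }
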
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -782,6 +782,9 @@
 
   void resolve_jobject(Register value, Register thread, Register tmp);
 
+  // C 'boolean' to Java boolean: x == 0 ? 0 : 1
+  void c2bool(Register x);
+
   // oop manipulations
   void load_klass(Register dst, Register src);
   void store_klass(Register dst, Register src);
--- a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -1924,7 +1924,7 @@
 
   // Unpack native results.
   switch (ret_type) {
-  case T_BOOLEAN: __ ubfx(r0, r0, 0, 8);             break;
+  case T_BOOLEAN: __ c2bool(r0);                     break;
   case T_CHAR   : __ ubfx(r0, r0, 0, 16);            break;
   case T_BYTE   : __ sbfx(r0, r0, 0, 8);             break;
   case T_SHORT  : __ sbfx(r0, r0, 0, 16);            break;
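
The T_BOOLEAN case above is where the old code mattered: ubfx(r0, r0, 0, 8) merely zero-extended the low byte, so a native method whose C bool happened to contain, say, 2 reached Java as 2. A sketch of the difference:

    long old_unpack(long r0) { return r0 & 0xff; }         // ubfx r0, r0, #0, #8
    long new_unpack(long r0) { return (r0 & 0xff) != 0; }  // c2bool(r0)
    // old_unpack(2) == 2 (not a valid Java boolean); new_unpack(2) == 1.
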
--- a/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -557,7 +557,7 @@
         BasicType type) {
     address entry = __ pc();
   switch (type) {
-  case T_BOOLEAN: __ uxtb(r0, r0);        break;
+  case T_BOOLEAN: __ c2bool(r0);         break;
   case T_CHAR   : __ uxth(r0, r0);       break;
   case T_BYTE   : __ sxtb(r0, r0);        break;
   case T_SHORT  : __ sxth(r0, r0);        break;
--- a/src/hotspot/cpu/arm/c1_LIRAssembler_arm.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/arm/c1_LIRAssembler_arm.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -3265,7 +3265,9 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
 
   if (left->is_single_cpu()) {
     assert (dest->type() == T_INT, "unexpected result type");
--- a/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -2840,7 +2840,9 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
   assert(left->is_register(), "can only handle registers");
 
   if (left->is_single_cpu()) {
--- a/src/hotspot/cpu/s390/c1_LIRAssembler_s390.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/s390/c1_LIRAssembler_s390.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -2850,7 +2850,9 @@
   ShouldNotCallThis(); // There are no delay slots on ZARCH_64.
 }
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
   assert(left->is_register(), "can only handle registers");
 
   if (left->is_single_cpu()) {
--- a/src/hotspot/cpu/sparc/c1_LIRAssembler_sparc.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/sparc/c1_LIRAssembler_sparc.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -3024,7 +3024,9 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
+  // tmp must be unused
+  assert(tmp->is_illegal(), "wasting a register if tmp is allocated");
   assert(left->is_register(), "can only handle registers");
 
   if (left->is_single_cpu()) {
--- a/src/hotspot/cpu/x86/assembler_x86.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -2199,7 +2199,7 @@
 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_128bit;
-  InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_rex_vex_w_reverted();
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x28);
@@ -2209,7 +2209,7 @@
 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_128bit;
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x28);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2217,7 +2217,7 @@
 
 void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
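
The recurring edit through the rest of this file flips the uses_vl attribute to true for instructions whose EVEX form encodes a vector length, so the assembler handles AVX-512 parts without the VL extension correctly at 128/256-bit widths. A hypothetical paraphrase of the constraint this flag feeds (not the HotSpot API):

    // An instruction that uses vector length cannot be EVEX-encoded at
    // sub-512-bit widths unless the CPU supports AVX512VL.
    bool evex_ok(bool uses_vl, bool has_avx512vl, int vector_len_bits) {
      return !uses_vl || has_avx512vl || vector_len_bits == 512;
    }
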
@@ -2465,8 +2465,7 @@
 
 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_128bit;
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2583,7 +2582,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(is_vector_masking(), "");    // For stub code use only
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_embedded_opmask_register_specifier(mask);
   attributes.set_is_evex_instruction();
@@ -2608,7 +2607,7 @@
   assert(is_vector_masking(), "");
   assert(VM_Version::supports_avx512vlbw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_embedded_opmask_register_specifier(mask);
   attributes.set_is_evex_instruction();
@@ -2752,7 +2751,7 @@
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   attributes.set_rex_vex_w_reverted();
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -3512,7 +3511,7 @@
 void Assembler::evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx512vlbw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
   int dst_enc = kdst->encoding();
@@ -3525,7 +3524,7 @@
   assert(is_vector_masking(), "");
   assert(VM_Version::supports_avx512vlbw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.reset_is_clear_context();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -3538,7 +3537,7 @@
 
 void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
   assert(VM_Version::supports_avx512vlbw(), "");
-  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x3E);
@@ -3549,7 +3548,7 @@
 void Assembler::evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
   assert(is_vector_masking(), "");
   assert(VM_Version::supports_avx512vlbw(), "");
-  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.reset_is_clear_context();
   attributes.set_embedded_opmask_register_specifier(mask);
   attributes.set_is_evex_instruction();
@@ -3562,7 +3561,7 @@
 void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len) {
   assert(VM_Version::supports_avx512vlbw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
   int dst_enc = kdst->encoding();
@@ -3575,7 +3574,7 @@
 void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx512bw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int dst_enc = kdst->encoding();
@@ -3588,7 +3587,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(is_vector_masking(), "");    // For stub code use only
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_reg_mask */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_reg_mask */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.reset_is_clear_context();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -3741,7 +3740,7 @@
 
 void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3750,7 +3749,7 @@
 
 void Assembler::pextrd(Address dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
@@ -3760,7 +3759,7 @@
 
 void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3769,7 +3768,7 @@
 
 void Assembler::pextrq(Address dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
@@ -3779,7 +3778,7 @@
 
 void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse2(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC5);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3788,7 +3787,7 @@
 
 void Assembler::pextrw(Address dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
   simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x15);
@@ -3798,7 +3797,7 @@
 
 void Assembler::pextrb(Address dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
   simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x14);
@@ -3808,7 +3807,7 @@
 
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3817,7 +3816,7 @@
 
 void Assembler::pinsrd(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x22);
@@ -3827,7 +3826,7 @@
 
 void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3836,7 +3835,7 @@
 
 void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x22);
@@ -3846,7 +3845,7 @@
 
 void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse2(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC4);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3855,7 +3854,7 @@
 
 void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse2(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC4);
@@ -3865,7 +3864,7 @@
 
 void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x20);
@@ -3876,7 +3875,7 @@
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
@@ -3885,7 +3884,7 @@
 
 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3895,7 +3894,7 @@
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
@@ -3906,7 +3905,7 @@
   assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
   vector_len == AVX_256bit? VM_Version::supports_avx2() :
   vector_len == AVX_512bit? VM_Version::supports_avx512bw() : 0, "");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
   emit_int8((unsigned char) (0xC0 | encode));
@@ -3918,7 +3917,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_embedded_opmask_register_specifier(mask);
   attributes.set_is_evex_instruction();
@@ -3930,7 +3929,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
@@ -3943,7 +3942,7 @@
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.reset_is_clear_context();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -3957,7 +3956,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_QVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
@@ -3969,7 +3968,7 @@
   assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
   vector_len == AVX_256bit? VM_Version::supports_avx2() :
   vector_len == AVX_512bit? VM_Version::supports_evex() : 0, " ");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x33);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4082,7 +4081,7 @@
 
 void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_ssse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x00);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4101,7 +4100,7 @@
 void Assembler::pshufb(XMMRegister dst, Address src) {
   assert(VM_Version::supports_ssse3(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x00);
@@ -4147,7 +4146,7 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x70);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4159,7 +4158,7 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x70);
@@ -4180,7 +4179,7 @@
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4190,7 +4189,7 @@
 void Assembler::pslldq(XMMRegister dst, int shift) {
   // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM7 is for /7 encoding: 66 0F 73 /7 ib
   int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x73);
@@ -4456,7 +4455,7 @@
 
 void Assembler::palignr(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_ssse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x0F);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4477,6 +4476,7 @@
 void Assembler::evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x3);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5591,7 +5591,7 @@
 void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5600,7 +5600,7 @@
 void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5650,7 +5650,7 @@
 
 void Assembler::phaddw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5658,7 +5658,7 @@
 
 void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6281,6 +6281,15 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::vpandn(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xDF);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+
 void Assembler::por(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
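
The newly added vpandn emitter (opcode 0xDF) produces the three-operand VEX/EVEX form whose lanes compute NOT(first source) AND second source. A scalar sketch of one 64-bit lane:

    #include <cstdint>

    // dst = ~nds & src, per the AVX PANDN lane semantics.
    uint64_t pandn_lane(uint64_t nds, uint64_t src) {
      return ~nds & src;
    }
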
@@ -6369,8 +6378,7 @@
 void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx2(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6383,9 +6391,8 @@
   assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x38);
@@ -6398,7 +6405,8 @@
 void Assembler::vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6413,10 +6421,10 @@
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  attributes.set_is_evex_instruction();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_operand(dst, src);
@@ -6430,9 +6438,10 @@
 void Assembler::vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x38);
+  emit_int8(0x3A);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
   // 0x01 - insert into upper 256 bits
@@ -6445,8 +6454,7 @@
 void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6459,9 +6467,8 @@
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
@@ -6472,16 +6479,16 @@
 }
 
 void Assembler::vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_avx2(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into q0 128 bits (0..127)
   // 0x01 - insert into q1 128 bits (128..255)
-  // 0x02 - insert into q2 128 bits (256..383)
-  // 0x03 - insert into q3 128 bits (384..511)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
   emit_int8(imm8 & 0x03);
 }
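// Reviewer sketch, not part of the patch: for the 32x4 insert/extract forms the
// immediate selects a 128-bit lane, with lane q{n} covering bits n*128 through
// n*128+127 of the 512-bit register (so imm8 == 0x03 is q3, bits 384..511):
#include <cstdint>
constexpr int lane_lo_bit(uint8_t imm8) { return (imm8 & 0x03) * 128; }
constexpr int lane_hi_bit(uint8_t imm8) { return lane_lo_bit(imm8) + 127; }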
 
@@ -6489,24 +6496,24 @@
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x00 - insert into q0 128 bits (0..127)
   // 0x01 - insert into q1 128 bits (128..255)
-  // 0x02 - insert into q2 128 bits (256..383)
-  // 0x03 - insert into q3 128 bits (384..511)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
   emit_int8(imm8 & 0x03);
 }
 
 void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1A);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6520,8 +6527,9 @@
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
+  attributes.set_is_evex_instruction();
   vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1A);
   emit_operand(dst, src);
@@ -6534,10 +6542,9 @@
 // vextracti forms
 
 void Assembler::vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
+  assert(VM_Version::supports_avx2(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6550,9 +6557,8 @@
   assert(VM_Version::supports_avx2(), "");
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   attributes.reset_is_clear_context();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -6564,10 +6570,10 @@
 }
 
 void Assembler::vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
+  assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6583,9 +6589,10 @@
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   attributes.reset_is_clear_context();
+  attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_operand(src, dst);
@@ -6599,7 +6606,8 @@
 void Assembler::vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx512dq(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6613,7 +6621,8 @@
 void Assembler::vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x3B);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6622,14 +6631,28 @@
   emit_int8(imm8 & 0x01);
 }
 
-
+void Assembler::vextracti64x4(Address dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
+  attributes.reset_is_clear_context();
+  attributes.set_is_evex_instruction();
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x3B);
+  emit_operand(src, dst);
+  // 0x00 - extract from lower 256 bits
+  // 0x01 - extract from upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
 // vextractf forms
 
 void Assembler::vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
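// Reviewer note, not part of the patch: the 0F 3A opcodes these forms emit come
// in insert/extract pairs (per the Intel SDM), which is why the new memory form
// of vextracti64x4 above must emit 0x3B like its register form, not 0x38:
enum InsertExtractOpcode : int {
  kVinsertF128_F32x4 = 0x18, kVextractF128_F32x4 = 0x19,
  kVinsertF64x4      = 0x1A, kVextractF64x4      = 0x1B,
  kVinsertI128_I32x4 = 0x38, kVextractI128_I32x4 = 0x39,
  kVinsertI64x4      = 0x3A, kVextractI64x4      = 0x3B,
};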
@@ -6642,9 +6665,8 @@
   assert(VM_Version::supports_avx(), "");
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   attributes.reset_is_clear_context();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -6656,10 +6678,10 @@
 }
 
 void Assembler::vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
+  assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6675,9 +6697,10 @@
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   attributes.reset_is_clear_context();
+  attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_operand(src, dst);
@@ -6691,7 +6714,8 @@
 void Assembler::vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx512dq(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6705,7 +6729,8 @@
 void Assembler::vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1B);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6719,9 +6744,10 @@
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4,/* input_size_in_bits */  EVEX_64bit);
   attributes.reset_is_clear_context();
+  attributes.set_is_evex_instruction();
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1B);
   emit_operand(src, dst);
@@ -6730,38 +6756,17 @@
   emit_int8(imm8 & 0x01);
 }
 
-
-// legacy word/dword replicate
-void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
+// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
+void Assembler::vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  emit_int8(0x79);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
-void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  emit_int8(0x58);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
-
-// xmm/mem sourced byte/word/dword/qword replicate
-
-// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
-void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x78);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastb(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6773,16 +6778,16 @@
 }
 
 // duplicate 2-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
-void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastw(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6793,17 +6798,19 @@
   emit_operand(dst, src);
 }
 
+// xmm/mem sourced byte/word/dword/qword replicate
+
 // duplicate 4-byte integer data from src into programmed locations in dest : requires AVX512VL
-void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(UseAVX >= 2, "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6815,8 +6822,8 @@
 }
 
 // duplicate 8-byte integer data from src into programmed locations in dest : requires AVX512VL
-void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6824,8 +6831,8 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastq(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
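// Reviewer note, not part of the patch: with the ev- prefixes dropped, callers
// state the vector length explicitly and let the encoder choose VEX vs. EVEX.
// A hypothetical call site:
//   vpbroadcastd(xmm1, xmm0, Assembler::AVX_256bit);  // VEX.256 on plain AVX2
//   vpbroadcastd(xmm3, xmm2, Assembler::AVX_512bit);  // EVEX.512 when UseAVX > 2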
@@ -6863,16 +6870,16 @@
 // scalar single/double precision replicate
 
 // duplicate single precision data from src into programmed locations in dest : requires AVX512VL
-void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastss(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6884,8 +6891,8 @@
 }
 
 // duplicate double precision data from src into programmed locations in dest : requires AVX512VL
-void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_rex_vex_w_reverted();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6893,8 +6900,8 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6911,7 +6918,7 @@
 
 // duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6921,7 +6928,7 @@
 
 // duplicate 2-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
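// Reviewer note, not part of the patch: the GPR-sourced broadcasts above keep
// their ev- names and now assert avx512bw rather than bare evex. VPBROADCASTB/W
// from a general register exist only as EVEX forms (AVX512BW, plus AVX512VL for
// the 128/256-bit lengths), so unlike the xmm/mem forms there is no VEX
// encoding to demote to.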
@@ -6967,7 +6974,7 @@
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6977,7 +6984,7 @@
 // Carry-Less Multiplication Quadword
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7597,33 +7604,23 @@
   set_attributes(attributes);
   attributes->set_current_assembler(this);
 
-  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
-  if (UseAVX > 2 && _legacy_mode_vl && attributes->uses_vl()) {
-    switch (attributes->get_vector_len()) {
-    case AVX_128bit:
-    case AVX_256bit:
-      attributes->set_is_legacy_mode();
-      break;
+  // For an EVEX-capable instruction that is not marked as a pure EVEX instruction, check
+  // whether it is allowed in legacy mode and whether its resources fit there.
+  // Pure EVEX instructions will have is_evex_instruction set in their definition.
+  if (!attributes->is_legacy_mode()) {
+    if (UseAVX > 2 && !attributes->is_evex_instruction() && !_is_managed) {
+      if ((attributes->get_vector_len() != AVX_512bit) && (nds_enc < 16) && (xreg_enc < 16)) {
+          attributes->set_is_legacy_mode();
+      }
     }
   }
 
-  // For pure EVEX check and see if this instruction
-  // is allowed in legacy mode and has resources which will
-  // fit in it.  Pure EVEX instructions will use set_is_evex_instruction in their definition,
-  // else that field is set when we encode to EVEX
-  if (UseAVX > 2 && !attributes->is_legacy_mode() &&
-      !_is_managed && !attributes->is_evex_instruction()) {
-    if (!_legacy_mode_vl && attributes->get_vector_len() != AVX_512bit) {
-      bool check_register_bank = NOT_IA32(true) IA32_ONLY(false);
-      if (check_register_bank) {
-        // check nds_enc and xreg_enc for upper bank usage
-        if (nds_enc < 16 && xreg_enc < 16) {
-          attributes->set_is_legacy_mode();
-        }
-      } else {
-        attributes->set_is_legacy_mode();
-      }
-    }
+  if (UseAVX > 2) {
+    assert(((!attributes->uses_vl()) ||
+            (attributes->get_vector_len() == AVX_512bit) ||
+            (!_legacy_mode_vl) ||
+            (attributes->is_legacy_mode())),"XMM register should be 0-15");
+    assert(((nds_enc < 16 && xreg_enc < 16) || (!attributes->is_legacy_mode())),"XMM register should be 0-15");
   }
 
   _is_managed = false;
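// Reviewer sketch, not part of the patch: the new asserts replace the old
// fall-through demotion with an explicit contract. A standalone restatement
// (names hypothetical; avx512vl stands in for !_legacy_mode_vl):
static bool prefix_contract_ok(bool uses_vl, int len_bits, bool avx512vl,
                               bool legacy, int max_reg_enc) {
  bool vl_ok  = !uses_vl || len_bits == 512 || avx512vl || legacy;
  bool reg_ok = !legacy || max_reg_enc < 16;  // legacy VEX cannot reach xmm16-31
  return vl_ok && reg_ok;
}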
@@ -7653,43 +7650,31 @@
   bool vex_x = false;
   set_attributes(attributes);
   attributes->set_current_assembler(this);
-  bool check_register_bank = NOT_IA32(true) IA32_ONLY(false);
-
-  // if vector length is turned off, revert to AVX for vectors smaller than 512-bit
-  if (UseAVX > 2 && _legacy_mode_vl && attributes->uses_vl()) {
-    switch (attributes->get_vector_len()) {
-    case AVX_128bit:
-    case AVX_256bit:
-      if (check_register_bank) {
-        if (dst_enc >= 16 || nds_enc >= 16 || src_enc >= 16) {
-          // up propagate arithmetic instructions to meet RA requirements
-          attributes->set_vector_len(AVX_512bit);
-        } else {
+
+  // For an EVEX-capable instruction that is not marked as a pure EVEX instruction, check
+  // whether it is allowed in legacy mode and whether its resources fit there.
+  // Pure EVEX instructions will have is_evex_instruction set in their definition.
+  if (!attributes->is_legacy_mode()) {
+    if (UseAVX > 2 && !attributes->is_evex_instruction() && !_is_managed) {
+      if ((!attributes->uses_vl() || (attributes->get_vector_len() != AVX_512bit)) &&
+          (dst_enc < 16) && (nds_enc < 16) && (src_enc < 16)) {
           attributes->set_is_legacy_mode();
-        }
-      } else {
-        attributes->set_is_legacy_mode();
       }
-      break;
     }
   }
 
-  // For pure EVEX check and see if this instruction
-  // is allowed in legacy mode and has resources which will
-  // fit in it.  Pure EVEX instructions will use set_is_evex_instruction in their definition,
-  // else that field is set when we encode to EVEX
-  if (UseAVX > 2 && !attributes->is_legacy_mode() &&
-      !_is_managed && !attributes->is_evex_instruction()) {
-    if (!_legacy_mode_vl && attributes->get_vector_len() != AVX_512bit) {
-      if (check_register_bank) {
-        // check dst_enc, nds_enc and src_enc for upper bank usage
-        if (dst_enc < 16 && nds_enc < 16 && src_enc < 16) {
-          attributes->set_is_legacy_mode();
-        }
-      } else {
-        attributes->set_is_legacy_mode();
-      }
-    }
+  if (UseAVX > 2) {
+    // All the scalar fp instructions (with uses_vl as false) can have legacy_mode as false
+    // Instructions with uses_vl true are vector instructions
+    // All the vector instructions with AVX_512bit length can have legacy_mode as false
+    // All the vector instructions with < AVX_512bit length can have legacy_mode as false if AVX512vl() is supported
+    // Everything else must have legacy_mode set to true
+    assert(((!attributes->uses_vl()) ||
+            (attributes->get_vector_len() == AVX_512bit) ||
+            (!_legacy_mode_vl) ||
+            (attributes->is_legacy_mode())),"XMM register should be 0-15");
+    // Instructions with legacy_mode true must have dst, nds and src encodings < 16
+    assert(((dst_enc < 16 && nds_enc < 16 && src_enc < 16) || (!attributes->is_legacy_mode())),"XMM register should be 0-15");
   }
 
   _is_managed = false;
@@ -7741,7 +7726,7 @@
 void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC2);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7751,7 +7736,7 @@
 void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x4B);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7762,7 +7747,7 @@
 void Assembler::cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xC2);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7772,7 +7757,7 @@
 void Assembler::blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x4A);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7782,7 +7767,7 @@
 
 void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
   assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x02);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7791,7 +7776,7 @@
 
 void Assembler::shlxl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi2(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8((unsigned char)0xF7);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -7799,7 +7784,7 @@
 
 void Assembler::shlxq(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi2(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8((unsigned char)0xF7);
   emit_int8((unsigned char)(0xC0 | encode));
--- a/src/hotspot/cpu/x86/assembler_x86.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -2097,6 +2097,7 @@
 
   // Andn packed integers
   void pandn(XMMRegister dst, XMMRegister src);
+  void vpandn(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
   // Or packed integers
   void por(XMMRegister dst, XMMRegister src);
@@ -2134,6 +2135,7 @@
   void vextracti32x4(Address dst, XMMRegister src, uint8_t imm8);
   void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
   void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti64x4(Address dst, XMMRegister src, uint8_t imm8);
 
   // vextractf forms
   void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
@@ -2144,28 +2146,24 @@
   void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
   void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);
 
-  // legacy xmm sourced word/dword replicate
-  void vpbroadcastw(XMMRegister dst, XMMRegister src);
-  void vpbroadcastd(XMMRegister dst, XMMRegister src);
-
   // xmm/mem sourced byte/word/dword/qword replicate
-  void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
-  void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
-  void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
-  void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastb(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastw(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastd(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastq(XMMRegister dst, Address src, int vector_len);
 
   void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
   void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
 
   // scalar single/double precision replicate
-  void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
-  void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
-  void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastss(XMMRegister dst, Address src, int vector_len);
+  void vpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastsd(XMMRegister dst, Address src, int vector_len);
 
   // gpr sourced byte/word/dword/qword replicate
   void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
--- a/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -68,7 +68,6 @@
 static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], (jlong)UCONST64(0x8000000000000000), (jlong)UCONST64(0x8000000000000000));
 
 
-
 NEEDS_CLEANUP // remove this definitions ?
 const Register IC_Klass    = rax;   // where the IC klass is cached
 const Register SYNC_header = rax;   // synchronization header
@@ -650,7 +649,7 @@
 
     case T_FLOAT: {
       if (dest->is_single_xmm()) {
-        if (c->is_zero_float()) {
+        if (LP64_ONLY(UseAVX < 2 &&) c->is_zero_float()) {
           __ xorps(dest->as_xmm_float_reg(), dest->as_xmm_float_reg());
         } else {
           __ movflt(dest->as_xmm_float_reg(),
@@ -672,7 +671,7 @@
 
     case T_DOUBLE: {
       if (dest->is_double_xmm()) {
-        if (c->is_zero_double()) {
+        if (LP64_ONLY(UseAVX < 2 &&) c->is_zero_double()) {
           __ xorpd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg());
         } else {
           __ movdbl(dest->as_xmm_double_reg(),
@@ -2395,16 +2394,24 @@
 }
 
 
-void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, LIR_Opr dest, LIR_Op* op) {
+void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr tmp, LIR_Opr dest, LIR_Op* op) {
   if (value->is_double_xmm()) {
     switch(code) {
       case lir_abs :
         {
-          if (dest->as_xmm_double_reg() != value->as_xmm_double_reg()) {
-            __ movdbl(dest->as_xmm_double_reg(), value->as_xmm_double_reg());
+#ifdef _LP64
+          if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+            assert(tmp->is_valid(), "need temporary");
+            __ vpandn(dest->as_xmm_double_reg(), tmp->as_xmm_double_reg(), value->as_xmm_double_reg(), 2);
+          } else {
+#endif
+            if (dest->as_xmm_double_reg() != value->as_xmm_double_reg()) {
+              __ movdbl(dest->as_xmm_double_reg(), value->as_xmm_double_reg());
+            }
+            assert(!tmp->is_valid(), "do not need temporary");
+            __ andpd(dest->as_xmm_double_reg(),
+                     ExternalAddress((address)double_signmask_pool));
           }
-          __ andpd(dest->as_xmm_double_reg(),
-                    ExternalAddress((address)double_signmask_pool));
         }
         break;
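// Reviewer sketch, not part of the patch: why the AVX512-without-VL path above
// loads a temp with -0.0 and uses vpandn. -0.0 is exactly the sign bit, and
// vpandn(dst, tmp, value) computes ~tmp & value, which clears it; the AVX_512bit
// length (the literal 2) is used because without AVX512VL only full-width EVEX
// operations can touch xmm16-31. A scalar model of the bit trick:
#include <cstdint>
#include <cstring>
static double abs_like_vpandn(double value) {
  uint64_t v, mask;
  double neg_zero = -0.0;
  std::memcpy(&v, &value, sizeof v);
  std::memcpy(&mask, &neg_zero, sizeof mask);  // 0x8000000000000000
  v &= ~mask;                                  // drop the sign bit
  std::memcpy(&value, &v, sizeof value);
  return value;
}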
 
@@ -3734,7 +3741,7 @@
 }
 
 
-void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
   if (left->is_single_cpu()) {
     __ negl(left->as_register());
     move_regs(left->as_register(), dest->as_register());
@@ -3759,24 +3766,36 @@
 #endif // _LP64
 
   } else if (dest->is_single_xmm()) {
-    if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
-      __ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
+#ifdef _LP64
+    if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+      assert(tmp->is_valid(), "need temporary");
+      assert_different_registers(left->as_xmm_float_reg(), tmp->as_xmm_float_reg());
+      __ vpxor(dest->as_xmm_float_reg(), tmp->as_xmm_float_reg(), left->as_xmm_float_reg(), 2);
     }
-    if (UseAVX > 0) {
-      __ vnegatess(dest->as_xmm_float_reg(), dest->as_xmm_float_reg(),
-                   ExternalAddress((address)float_signflip_pool));
-    } else {
+    else
+#endif
+    {
+      assert(!tmp->is_valid(), "do not need temporary");
+      if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
+        __ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
+      }
       __ xorps(dest->as_xmm_float_reg(),
                ExternalAddress((address)float_signflip_pool));
     }
   } else if (dest->is_double_xmm()) {
-    if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
-      __ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
+#ifdef _LP64
+    if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+      assert(tmp->is_valid(), "need temporary");
+      assert_different_registers(left->as_xmm_double_reg(), tmp->as_xmm_double_reg());
+      __ vpxor(dest->as_xmm_double_reg(), tmp->as_xmm_double_reg(), left->as_xmm_double_reg(), 2);
     }
-    if (UseAVX > 0) {
-      __ vnegatesd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg(),
-                   ExternalAddress((address)double_signflip_pool));
-    } else {
+    else
+#endif
+    {
+      assert(!tmp->is_valid(), "do not need temporary");
+      if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
+        __ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
+      }
       __ xorpd(dest->as_xmm_double_reg(),
                ExternalAddress((address)double_signflip_pool));
     }
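// Reviewer note, not part of the patch: the negate paths use the same -0.0 temp
// but with vpxor instead of vpandn, since x ^ -0.0 flips the sign bit. Again the
// AVX_512bit length is deliberate: sub-512-bit EVEX forms, and the legacy
// xorps/xorpd reg-mem forms, cannot encode xmm16-31 without AVX512VL.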
--- a/src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -320,7 +320,21 @@
   value.set_destroys_register();
   value.load_item();
   LIR_Opr reg = rlock(x);
-  __ negate(value.result(), reg);
+
+  LIR_Opr tmp = LIR_OprFact::illegalOpr;
+#ifdef _LP64
+  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+    if (x->type()->tag() == doubleTag) {
+      tmp = new_register(T_DOUBLE);
+      __ move(LIR_OprFact::doubleConst(-0.0), tmp);
+    }
+    else if (x->type()->tag() == floatTag) {
+      tmp = new_register(T_FLOAT);
+      __ move(LIR_OprFact::floatConst(-0.0), tmp);
+    }
+  }
+#endif
+  __ negate(value.result(), reg, tmp);
 
   set_result(x, round_item(reg));
 }
@@ -748,8 +762,17 @@
   LIR_Opr calc_input = value.result();
   LIR_Opr calc_result = rlock_result(x);
 
+  LIR_Opr tmp = LIR_OprFact::illegalOpr;
+#ifdef _LP64
+  if (UseAVX > 2 && (!VM_Version::supports_avx512vl()) &&
+      (x->id() == vmIntrinsics::_dabs)) {
+    tmp = new_register(T_DOUBLE);
+    __ move(LIR_OprFact::doubleConst(-0.0), tmp);
+  }
+#endif
+
   switch(x->id()) {
-    case vmIntrinsics::_dabs:   __ abs  (calc_input, calc_result, LIR_OprFact::illegalOpr); break;
+    case vmIntrinsics::_dabs:   __ abs  (calc_input, calc_result, tmp); break;
     case vmIntrinsics::_dsqrt:  __ sqrt (calc_input, calc_result, LIR_OprFact::illegalOpr); break;
     default:                    ShouldNotReachHere();
   }
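// Reviewer note, not part of the patch: only _dabs receives the materialized
// -0.0 temp; _dsqrt needs no sign-mask constant, so it keeps passing
// LIR_OprFact::illegalOpr, and the assert(!tmp->is_valid(), ...) checks in the
// assembler's fallback paths keep temp usage consistent between the two.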
--- a/src/hotspot/cpu/x86/globals_x86.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/globals_x86.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -119,7 +119,7 @@
   product(bool, UseStoreImmI16, true,                                       \
           "Use store immediate 16-bits value instruction on x86")           \
                                                                             \
-  product(intx, UseAVX, 2,                                                  \
+  product(intx, UseAVX, 3,                                                  \
           "Highest supported AVX instructions set on x86/x64")              \
           range(0, 99)                                                      \
                                                                             \
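// Reviewer note, not part of the patch: raising the default to 3 turns the EVEX
// code paths on wherever the CPU reports AVX-512 support; the previous behavior
// stays one flag away, e.g.:
//   java -XX:UseAVX=2 ...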
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -2942,16 +2942,6 @@
   }
 }
 
-void MacroAssembler::push_zmm(XMMRegister reg) {
-  lea(rsp, Address(rsp, -64)); // Use lea to not affect flags
-  evmovdqul(Address(rsp, 0), reg, Assembler::AVX_512bit);
-}
-
-void MacroAssembler::pop_zmm(XMMRegister reg) {
-  evmovdqul(reg, Address(rsp, 0), Assembler::AVX_512bit);
-  lea(rsp, Address(rsp, 64)); // Use lea to not affect flags
-}
-
 void MacroAssembler::fremr(Register tmp) {
   save_rax(tmp);
   { Label L;
@@ -3332,27 +3322,18 @@
 }
 
 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
-    Assembler::vextractf32x4(dst, src, 0);
-  } else {
+    assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::movdqu(dst, src);
-  }
 }
 
 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
-    Assembler::vinsertf32x4(dst, dst, src, 0);
-  } else {
+    assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::movdqu(dst, src);
-  }
 }
 
 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
-    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else {
+    assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::movdqu(dst, src);
-  }
 }
 
 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
@@ -3365,28 +3346,18 @@
 }
 
 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
-    vextractf64x4_low(dst, src);
-  } else {
+    assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::vmovdqu(dst, src);
-  }
 }
 
 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
-    vinsertf64x4_low(dst, src);
-  } else {
+    assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::vmovdqu(dst, src);
-  }
 }
 
 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
-  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
-    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
-  }
-  else {
+    assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
     Assembler::vmovdqu(dst, src);
-  }
 }
 
 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
@@ -3670,187 +3641,43 @@
 }
 
 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::pcmpeqb(dst, src);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::pcmpeqb(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pcmpeqb(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pcmpeqb(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::pcmpeqb(xmm1, xmm0);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pcmpeqb(dst, src);
 }
 
 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::pcmpeqw(dst, src);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::pcmpeqw(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pcmpeqw(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pcmpeqw(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::pcmpeqw(xmm1, xmm0);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pcmpeqw(dst, src);
 }
 
 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
-  int dst_enc = dst->encoding();
-  if (dst_enc < 16) {
-    Assembler::pcmpestri(dst, src, imm8);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pcmpestri(xmm0, src, imm8);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert((dst->encoding() < 16),"XMM register should be 0-15");
+  Assembler::pcmpestri(dst, src, imm8);
 }
 
 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::pcmpestri(dst, src, imm8);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pcmpestri(xmm0, src, imm8);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pcmpestri(dst, xmm0, imm8);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::pcmpestri(xmm1, xmm0, imm8);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::pcmpestri(dst, src, imm8);
 }
 
 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::pmovzxbw(dst, src);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::pmovzxbw(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pmovzxbw(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pmovzxbw(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::pmovzxbw(xmm1, xmm0);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pmovzxbw(dst, src);
 }
 
 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
-  int dst_enc = dst->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::pmovzxbw(dst, src);
-  } else if (dst_enc < 16) {
-    Assembler::pmovzxbw(dst, src);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::pmovzxbw(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pmovzxbw(dst, src);
 }
 
 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
-  int src_enc = src->encoding();
-  if (src_enc < 16) {
-    Assembler::pmovmskb(dst, src);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::pmovmskb(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert((src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::pmovmskb(dst, src);
 }
 
 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::ptest(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::ptest(xmm0, src);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::ptest(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::ptest(xmm1, xmm0);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::ptest(dst, src);
 }
 
 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
@@ -3979,194 +3806,33 @@
 }
 
 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (nds_enc < 16)) {
-    vandps(dst, nds, negate_field, vector_len);
-  } else if ((src_enc < 16) && (dst_enc < 16)) {
-    // Use src scratch register
-    evmovdqul(src, nds, Assembler::AVX_512bit);
-    vandps(dst, src, negate_field, vector_len);
-  } else if (dst_enc < 16) {
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-    vandps(dst, dst, negate_field, vector_len);
-  } else if (nds_enc < 16) {
-    vandps(nds, nds, negate_field, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (src_enc < 16) {
-    evmovdqul(src, nds, Assembler::AVX_512bit);
-    vandps(src, src, negate_field, vector_len);
-    evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else {
-    if (src_enc != dst_enc) {
-      // Use src scratch register
-      evmovdqul(src, xmm0, Assembler::AVX_512bit);
-      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-      vandps(xmm0, xmm0, negate_field, vector_len);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    } else {
-      push_zmm(xmm0);
-      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-      vandps(xmm0, xmm0, negate_field, vector_len);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      pop_zmm(xmm0);
-    }
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
+  vandps(dst, nds, negate_field, vector_len);
 }
 
 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (nds_enc < 16)) {
-    vandpd(dst, nds, negate_field, vector_len);
-  } else if ((src_enc < 16) && (dst_enc < 16)) {
-    // Use src scratch register
-    evmovdqul(src, nds, Assembler::AVX_512bit);
-    vandpd(dst, src, negate_field, vector_len);
-  } else if (dst_enc < 16) {
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-    vandpd(dst, dst, negate_field, vector_len);
-  } else if (nds_enc < 16) {
-    vandpd(nds, nds, negate_field, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (src_enc < 16) {
-    evmovdqul(src, nds, Assembler::AVX_512bit);
-    vandpd(src, src, negate_field, vector_len);
-    evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else {
-    if (src_enc != dst_enc) {
-      evmovdqul(src, xmm0, Assembler::AVX_512bit);
-      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-      vandpd(xmm0, xmm0, negate_field, vector_len);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    } else {
-      push_zmm(xmm0);
-      evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-      vandpd(xmm0, xmm0, negate_field, vector_len);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      pop_zmm(xmm0);
-    }
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
+  vandpd(dst, nds, negate_field, vector_len);
 }
 
 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpaddb(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpaddb(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpaddb(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpaddb(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpaddb(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpaddb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpaddb(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpaddb(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpaddb(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpaddb(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpaddb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpaddw(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpaddw(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpaddw(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpaddw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpaddw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpaddw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpaddw(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpaddw(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with nds as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpaddw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpaddw(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpaddw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
@@ -4178,627 +3844,109 @@
   }
 }
 
-void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpbroadcastw(dst, src);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpbroadcastw(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpbroadcastw(xmm0, src);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpbroadcastw(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::vpbroadcastw(xmm1, xmm0);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpbroadcastw(dst, src, vector_len);
 }
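
vpbroadcastw now takes the vector width as an explicit parameter instead of a baked-in value, matching the Address overload added in the header further down; the call sites later in this patch pass Assembler::AVX_256bit or AVX_512bit. A small sketch of what the parameter selects (the enum values mirror Assembler's AvxVectorLen on the assumption that they are 0, 1, 2):

enum VectorLen { AVX_128bit = 0, AVX_256bit = 1, AVX_512bit = 2 };

// vpbroadcastw replicates one 16-bit element; vector_len only decides
// how many destination lanes receive the copy.
static int broadcast_w_lanes(VectorLen vl) {
  switch (vl) {
    case AVX_128bit: return 128 / 16;  // 8 words
    case AVX_256bit: return 256 / 16;  // 16 words
    default:         return 512 / 16;  // 32 words
  }
}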
 
 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  assert(dst_enc == nds_enc, "");
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpcmpeqb(dst, nds, src, vector_len);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpcmpeqb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  assert(dst_enc == nds_enc, "");
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpcmpeqw(dst, nds, src, vector_len);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
-    movdqu(dst, xmm1);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpcmpeqw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpmovzxbw(dst, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpmovzxbw(dst, src, vector_len);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpmovzxbw(xmm0, src, vector_len);
-    movdqu(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpmovzxbw(dst, src, vector_len);
 }
 
 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
-  int src_enc = src->encoding();
-  if (src_enc < 16) {
-    Assembler::vpmovmskb(dst, src);
-  } else {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpmovmskb(dst, xmm0);
-    pop_zmm(xmm0);
-  }
+  assert((src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::vpmovmskb(dst, src);
 }
 
 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpmullw(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpmullw(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpmullw(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpmullw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpmullw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpmullw(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpmullw(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpmullw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsubb(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpsubb(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpsubb(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsubb(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpsubb(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsubb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsubb(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsubb(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsubb(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubb(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsubb(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsubw(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpsubw(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpsubw(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpsubw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsubw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsubw(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsubw(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(xmm0, xmm0, src, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsubw(dst, nds, src, vector_len);
 }
 
 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int shift_enc = shift->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsraw(dst, nds, shift, vector_len);
-  } else if ((dst_enc < 16) && (shift_enc < 16)) {
-    Assembler::vpsraw(dst, dst, shift, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with shift
-    evmovdqul(nds, shift, Assembler::AVX_512bit);
-    Assembler::vpsraw(dst, dst, nds, vector_len);
-  } else if ((shift_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsraw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds to save a copy of xmm0 and hold shift
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsraw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else if (nds_enc < 16) {
-    // use nds and dst as temps
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsraw(nds, nds, xmm0, vector_len);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsraw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsraw(dst, nds, shift, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsraw(dst, dst, shift, vector_len);
-  } else if (nds_enc < 16) {
-    // use nds as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsraw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // use nds as scratch for xmm0
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsraw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int shift_enc = shift->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsrlw(dst, nds, shift, vector_len);
-  } else if ((dst_enc < 16) && (shift_enc < 16)) {
-    Assembler::vpsrlw(dst, dst, shift, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with shift
-    evmovdqul(nds, shift, Assembler::AVX_512bit);
-    Assembler::vpsrlw(dst, dst, nds, vector_len);
-  } else if ((shift_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsrlw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds to save a copy of xmm0 and hold shift
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsrlw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else if (nds_enc < 16) {
-    // use nds and dst as temps
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsrlw(nds, nds, xmm0, vector_len);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsrlw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsrlw(dst, nds, shift, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsrlw(dst, dst, shift, vector_len);
-  } else if (nds_enc < 16) {
-    // use nds as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsrlw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // use nds as scratch for xmm0
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsrlw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int shift_enc = shift->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsllw(dst, nds, shift, vector_len);
-  } else if ((dst_enc < 16) && (shift_enc < 16)) {
-    Assembler::vpsllw(dst, dst, shift, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with shift
-    evmovdqul(nds, shift, Assembler::AVX_512bit);
-    Assembler::vpsllw(dst, dst, nds, vector_len);
-  } else if ((shift_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch with dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds to save a copy of xmm0 and hold shift
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsllw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else if (nds_enc < 16) {
-    // use nds and dst as temps
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
-    Assembler::vpsllw(nds, nds, xmm0, vector_len);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    push_zmm(xmm1);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    pop_zmm(xmm1);
-  }
+  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsllw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpsllw(dst, nds, shift, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpsllw(dst, dst, shift, vector_len);
-  } else if (nds_enc < 16) {
-    // use nds as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(nds, nds, shift, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // use nds as scratch for xmm0
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::vpsllw(dst, nds, shift, vector_len);
 }
 
 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
-  int dst_enc = dst->encoding();
-  int src_enc = src->encoding();
-  if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vptest(dst, src);
-  } else if (src_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vptest(xmm0, src);
-    pop_zmm(xmm0);
-  } else if (dst_enc < 16) {
-    push_zmm(xmm0);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vptest(dst, xmm0);
-    pop_zmm(xmm0);
-  } else {
-    push_zmm(xmm0);
-    push_zmm(xmm1);
-    movdqu(xmm0, src);
-    movdqu(xmm1, dst);
-    Assembler::vptest(xmm1, xmm0);
-    pop_zmm(xmm1);
-    pop_zmm(xmm0);
-  }
-}
-
-// This instruction exists within macros, ergo we cannot control its input
-// when emitted through those patterns.
+  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
+  Assembler::vptest(dst, src);
+}
+
 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
-  if (VM_Version::supports_avx512nobw()) {
-    int dst_enc = dst->encoding();
-    int src_enc = src->encoding();
-    if (dst_enc == src_enc) {
-      if (dst_enc < 16) {
-        Assembler::punpcklbw(dst, src);
-      } else {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        Assembler::punpcklbw(xmm0, xmm0);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm0);
-      }
-    } else {
-      if ((src_enc < 16) && (dst_enc < 16)) {
-        Assembler::punpcklbw(dst, src);
-      } else if (src_enc < 16) {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        Assembler::punpcklbw(xmm0, src);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm0);
-      } else if (dst_enc < 16) {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, src, Assembler::AVX_512bit);
-        Assembler::punpcklbw(dst, xmm0);
-        pop_zmm(xmm0);
-      } else {
-        push_zmm(xmm0);
-        push_zmm(xmm1);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        evmovdqul(xmm1, src, Assembler::AVX_512bit);
-        Assembler::punpcklbw(xmm0, xmm1);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm1);
-        pop_zmm(xmm0);
-      }
-    }
-  } else {
-    Assembler::punpcklbw(dst, src);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::punpcklbw(dst, src);
 }
 
 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
-  if (VM_Version::supports_avx512vl()) {
-    Assembler::pshufd(dst, src, mode);
-  } else {
-    int dst_enc = dst->encoding();
-    if (dst_enc < 16) {
-      Assembler::pshufd(dst, src, mode);
-    } else {
-      push_zmm(xmm0);
-      Assembler::pshufd(xmm0, src, mode);
-      evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-      pop_zmm(xmm0);
-    }
-  }
-}
-
-// This instruction exists within macros, ergo we cannot control its input
-// when emitted through those patterns.
+  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
+  Assembler::pshufd(dst, src, mode);
+}
+
 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
-  if (VM_Version::supports_avx512nobw()) {
-    int dst_enc = dst->encoding();
-    int src_enc = src->encoding();
-    if (dst_enc == src_enc) {
-      if (dst_enc < 16) {
-        Assembler::pshuflw(dst, src, mode);
-      } else {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        Assembler::pshuflw(xmm0, xmm0, mode);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm0);
-      }
-    } else {
-      if ((src_enc < 16) && (dst_enc < 16)) {
-        Assembler::pshuflw(dst, src, mode);
-      } else if (src_enc < 16) {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        Assembler::pshuflw(xmm0, src, mode);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm0);
-      } else if (dst_enc < 16) {
-        push_zmm(xmm0);
-        evmovdqul(xmm0, src, Assembler::AVX_512bit);
-        Assembler::pshuflw(dst, xmm0, mode);
-        pop_zmm(xmm0);
-      } else {
-        push_zmm(xmm0);
-        push_zmm(xmm1);
-        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-        evmovdqul(xmm1, src, Assembler::AVX_512bit);
-        Assembler::pshuflw(xmm0, xmm1, mode);
-        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-        pop_zmm(xmm1);
-        pop_zmm(xmm0);
-      }
-    }
-  } else {
-    Assembler::pshuflw(dst, src, mode);
-  }
+  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
+  Assembler::pshuflw(dst, src, mode);
 }
 
 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
@@ -4874,47 +4022,13 @@
 }
 
 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
-  int nds_enc = nds->encoding();
-  int dst_enc = dst->encoding();
-  bool dst_upper_bank = (dst_enc > 15);
-  bool nds_upper_bank = (nds_enc > 15);
-  if (VM_Version::supports_avx512novl() &&
-      (nds_upper_bank || dst_upper_bank)) {
-    if (dst_upper_bank) {
-      push_zmm(xmm0);
-      movflt(xmm0, nds);
-      vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
-      movflt(dst, xmm0);
-      pop_zmm(xmm0);
-    } else {
-      movflt(dst, nds);
-      vxorps(dst, dst, src, Assembler::AVX_128bit);
-    }
-  } else {
-    vxorps(dst, nds, src, Assembler::AVX_128bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
+  vxorps(dst, nds, src, Assembler::AVX_128bit);
 }
 
 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
-  int nds_enc = nds->encoding();
-  int dst_enc = dst->encoding();
-  bool dst_upper_bank = (dst_enc > 15);
-  bool nds_upper_bank = (nds_enc > 15);
-  if (VM_Version::supports_avx512novl() &&
-      (nds_upper_bank || dst_upper_bank)) {
-    if (dst_upper_bank) {
-      push_zmm(xmm0);
-      movdbl(xmm0, nds);
-      vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
-      movdbl(dst, xmm0);
-      pop_zmm(xmm0);
-    } else {
-      movdbl(dst, nds);
-      vxorpd(dst, dst, src, Assembler::AVX_128bit);
-    }
-  } else {
-    vxorpd(dst, nds, src, Assembler::AVX_128bit);
-  }
+  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
+  vxorpd(dst, nds, src, Assembler::AVX_128bit);
 }
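
With the upper-bank fallbacks gone, both negate helpers reduce to a single XOR against a sign-mask constant. The underlying bit trick, shown standalone:

#include <cassert>
#include <cstdint>
#include <cstring>

// Flipping the IEEE-754 sign bit negates a float without touching the
// exponent or mantissa, which is what vxorps with the sign-flip mask
// does lane-wise.
static float negate_via_xor(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  bits ^= 0x80000000u;
  float r;
  std::memcpy(&r, &bits, sizeof r);
  return r;
}

int main() {
  assert(negate_via_xor(1.5f) == -1.5f);
  assert(negate_via_xor(-0.25f) == 0.25f);
}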
 
 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
@@ -7064,7 +6178,7 @@
     cmpl(cnt1, 2*stride);
     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
     movdl(vec1, ch);
-    vpbroadcastw(vec1, vec1);
+    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
     vpxor(vec2, vec2);
     movl(tmp, cnt1);
     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
@@ -7659,7 +6773,7 @@
 
       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
       movdl(vec2, tmp1);
-      vpbroadcastd(vec2, vec2);
+      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
 
       bind(COMPARE_WIDE_VECTORS);
       vmovdqu(vec1, Address(ary1, len, Address::times_1));
@@ -8091,7 +7205,7 @@
       if (UseAVX > 2 && UseUnalignedLoadStores) {
         // Fill 64-byte chunks
         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
-        evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
+        vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
 
         subl(count, 16 << shift);
         jcc(Assembler::less, L_check_fill_32_bytes);
@@ -8114,7 +7228,7 @@
       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
         // Fill 64-byte chunks
         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
-        vpbroadcastd(xtmp, xtmp);
+        vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
 
         subl(count, 16 << shift);
         jcc(Assembler::less, L_check_fill_32_bytes);
@@ -8256,7 +7370,7 @@
       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
       movdl(tmp1Reg, tmp5);
-      vpbroadcastd(tmp1Reg, tmp1Reg);
+      vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
       jmp(L_chars_32_check);
 
       bind(L_copy_32_chars);
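
These call sites drop the separate evpbroadcastd spelling in favor of vpbroadcastd with an explicit width. A hypothetical sketch of why that is safe (an assumption about the dispatch, not the assembler's real code):

// Only the 512-bit broadcast is inexpressible without EVEX; narrower
// forms can use the classic VEX encoding, though they may still pick
// EVEX when upper-bank registers require it.
static const char* broadcast_form(int vector_len /* Assembler::AVX_*bit */) {
  return (vector_len == 2) ? "EVEX vpbroadcastd" : "VEX or EVEX vpbroadcastd";
}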
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -482,10 +482,6 @@
   // from register xmm0. Otherwise, the value is stored from the FPU stack.
   void store_double(Address dst);
 
-  // Save/restore ZMM (512bit) register on stack.
-  void push_zmm(XMMRegister reg);
-  void pop_zmm(XMMRegister reg);
-
   // pushes double TOS element of FPU stack on CPU stack; pops from FPU stack
   void push_fTOS();
 
@@ -1214,9 +1210,11 @@
   void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
   void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
 
-  void vpbroadcastw(XMMRegister dst, XMMRegister src);
+  void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpbroadcastw(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastw(dst, src, vector_len); }
 
   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+
   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
   void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -403,7 +403,7 @@
       __ movdl(xmm0, rcx);
       __ movl(rcx, 0xffff);
       __ kmovwl(k1, rcx);
-      __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
+      __ vpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
       __ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit);
 #ifdef _LP64
       __ evmovdqul(xmm8, xmm0, Assembler::AVX_512bit);
@@ -885,7 +885,7 @@
     FLAG_SET_DEFAULT(UseSHA, false);
   }
 
-  if (supports_sha() && UseSHA) {
+  if (supports_sha() && supports_sse4_1() && UseSHA) {
     if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) {
       FLAG_SET_DEFAULT(UseSHA1Intrinsics, true);
     }
@@ -894,7 +894,7 @@
     FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
   }
 
-  if (UseSHA) {
+  if (supports_sse4_1() && UseSHA) {
     if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
       FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
     }
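
Both SHA gates gain a supports_sse4_1() term; presumably the stubs lean on SSE4.1-level instructions even when the SHA extension itself is present. A minimal model of the tightened checks (the function names here are assumptions):

static bool enable_sha1_intrinsics(bool has_sha, bool has_sse41, bool use_sha) {
  return has_sha && has_sse41 && use_sha;
}

// Note that the SHA-256 gate in this hunk does not test SHA-NI itself;
// only the SSE4.1 floor and the UseSHA flag are checked here.
static bool enable_sha256_intrinsics(bool has_sse41, bool use_sha) {
  return has_sse41 && use_sha;
}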
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -816,7 +816,10 @@
   static bool supports_avx512cd() { return (_features & CPU_AVX512CD) != 0; }
   static bool supports_avx512bw() { return (_features & CPU_AVX512BW) != 0; }
   static bool supports_avx512vl() { return (_features & CPU_AVX512VL) != 0; }
-  static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
+  static bool supports_avx512vlbw() { return (supports_evex() && supports_avx512bw() && supports_avx512vl()); }
+  static bool supports_avx512vldq() { return (supports_evex() && supports_avx512dq() && supports_avx512vl()); }
+  static bool supports_avx512vlbwdq() { return (supports_evex() && supports_avx512vl() &&
+                                                supports_avx512bw() && supports_avx512dq()); }
   static bool supports_avx512novl() { return (supports_evex() && !supports_avx512vl()); }
   static bool supports_avx512nobw() { return (supports_evex() && !supports_avx512bw()); }
   static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
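
The new composite predicates are plain conjunctions over the feature word, with supports_evex() as the common baseline. Modeled with illustrative bit values (the real CPU_* constants are defined elsewhere in this header):

#include <cstdint>

enum : uint64_t {              // illustrative positions only
  CPU_EVEX_BASE = 1u << 0,
  CPU_AVX512BW  = 1u << 1,
  CPU_AVX512DQ  = 1u << 2,
  CPU_AVX512VL  = 1u << 3,
};

static bool has_all(uint64_t features, uint64_t need) {
  return (features & need) == need;
}

static bool vlbw(uint64_t f) {
  return has_all(f, CPU_EVEX_BASE | CPU_AVX512BW | CPU_AVX512VL);
}
static bool vldq(uint64_t f) {
  return has_all(f, CPU_EVEX_BASE | CPU_AVX512DQ | CPU_AVX512VL);
}
static bool vlbwdq(uint64_t f) {
  return vlbw(f) && vldq(f);
}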
--- a/src/hotspot/cpu/x86/x86.ad	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/x86.ad	Thu Sep 27 10:49:10 2018 -0700
@@ -729,6 +729,7 @@
                     );
 
 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 
 // Class for pre evex double registers
 reg_class double_reg_legacy(XMM0,  XMM0b,
@@ -789,6 +790,7 @@
                      );
 
 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 
 // Class for pre evex 32bit vector registers
 reg_class vectors_reg_legacy(XMM0,
@@ -849,6 +851,7 @@
                       );
 
 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 
 // Class for all 64bit vector registers
 reg_class vectord_reg_legacy(XMM0,  XMM0b,
@@ -909,6 +912,7 @@
                       );
 
 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 
 // Class for all 128bit vector registers
 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
@@ -969,6 +973,7 @@
                       );
 
 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 
 // Class for all 256bit vector registers
 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
@@ -1029,9 +1034,10 @@
                       );
 
 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 
 // Class for all 512bit vector registers
-reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
+reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
@@ -1067,6 +1073,30 @@
 #endif
                       );
 
+// Class for restricted 512bit vector registers
+reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
+                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
+                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
+                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
+                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
+                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
+                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
+#endif
+                      );
+
+reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
+reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
+
 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
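
Each reg_class_dynamic resolves to one of two static register files when its predicate is evaluated. A toy model of the two new 512-bit classes:

// The EVEX file exposes xmm0-xmm31, the legacy file only xmm0-xmm15;
// the _vl variant additionally demands AVX512VL before offering the
// wide file to the matcher.
static const char* vectorz_reg(bool evex) {
  return evex ? "vectorz_reg_evex" : "vectorz_reg_legacy";
}

static const char* vectorz_reg_vl(bool evex, bool avx512vl) {
  return (evex && avx512vl) ? "vectorz_reg_evex" : "vectorz_reg_legacy";
}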
@@ -1487,6 +1517,8 @@
   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
     size = (UseAVX > 2) ? 64 : 32;
+  if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
+    size = (VM_Version::supports_avx512bw()) ? 64 : 32;
   // Use flag to limit vector size.
   size = MIN2(size,(int)MaxVectorSize);
   // Minimum 2 values in vector (or 4 for bytes).
@@ -1528,7 +1560,7 @@
   return MIN2(size,max_size);
 }
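
The added branch caps byte/short/char vectors at 32 bytes unless AVX512BW is present, since 512-bit operations on sub-word lanes need the BW extension. The decision in isolation (a sketch assuming UseAVX >= 2, not the full sizing logic):

#include <algorithm>

static int subword_vector_size(bool use_avx3, bool avx512bw, int max_vector_size) {
  int size = (use_avx3 && avx512bw) ? 64 : 32;  // no 512-bit byte/word ops without BW
  return std::min(size, max_vector_size);       // the MaxVectorSize flag still caps it
}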
 
-// Vector ideal reg corresponding to specidied size in bytes
+// Vector ideal reg corresponding to specified size in bytes
 const uint Matcher::vector_ideal_reg(int size) {
   assert(MaxVectorSize >= size, "");
   switch(size) {
@@ -1648,10 +1680,28 @@
     case Op_VecS: // copy whole register
     case Op_VecD:
     case Op_VecX:
+#ifndef LP64
       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+#else
+      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+        __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      } else {
+        __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
+        __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
+      }
+#endif
       break;
     case Op_VecY:
+#ifndef LP64
       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+#else
+      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      } else {
+        __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
+        __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
+      }
+#endif
       break;
     case Op_VecZ:
       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
@@ -1703,10 +1753,28 @@
         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
         break;
       case Op_VecX:
+#ifndef LP64
         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+#else
+        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+          __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        } else {
+          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
+          __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
+        }
+#endif
         break;
       case Op_VecY:
+#ifndef LP64
         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+#else
+        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+          __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        } else {
+          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
+          __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
+        }
+#endif
         break;
       case Op_VecZ:
         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
@@ -1723,10 +1791,28 @@
         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
         break;
       case Op_VecX:
+#ifndef LP64
         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+#else
+        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+          __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        } else {
+          __ vextracti32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
+        }
+#endif
         break;
       case Op_VecY:
+#ifndef LP64
         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+#else
+        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
+          __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        } else {
+          __ vextracti64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
+        }
+#endif
         break;
       case Op_VecZ:
         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
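
All of these LP64 fallbacks share one trick: without AVX512VL, a 128-bit or 256-bit move cannot encode an upper-bank register, so the copy zeroes the full zmm and inserts the payload at lane 0 (or extracts it on the store side). A scalar model of the register-to-register 256-bit flavor:

#include <array>
#include <cstring>

using Zmm = std::array<unsigned char, 64>;

// Mirrors the vpxor(dst, dst, 512bit) followed by
// vinserti64x4(dst, dst, src, 0x0): clear all 64 bytes, then place the
// 32-byte payload in the low lane.
static void copy_ymm_without_vl(Zmm& dst, const Zmm& src) {
  dst.fill(0);
  std::memcpy(dst.data(), src.data(), 32);
}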
@@ -1908,7 +1994,6 @@
 // in the ADLC because operands constitute user defined types which are used in
 // instruction definitions.
 
-// This one generically applies only for evex, so only one version
 operand vecZ() %{
   constraint(ALLOC_IN_RC(vectorz_reg));
   match(VecZ);
@@ -1917,6 +2002,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecZ() %{
+  constraint(ALLOC_IN_RC(vectorz_reg_vl));
+  match(VecZ);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 // Comparison Code for FP conditional move
 operand cmpOp_vcmppd() %{
   match(Bool);
@@ -2547,8 +2640,8 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct absF_reg_reg(regF dst, regF src) %{
-  predicate(VM_Version::supports_avxonly());
+instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
+  predicate(UseAVX > 0);
   match(Set dst (AbsF src));
   ins_cost(150);
   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
@@ -2560,48 +2653,6 @@
   ins_pipe(pipe_slow);
 %}
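
absF is collapsed into this single AVX pattern over the new vlRegF operand (the specialized evex variants are deleted just below); sign-masking with 0x7fffffff is the whole operation. The AND-mask counterpart of the XOR sketch earlier:

#include <cassert>
#include <cstdint>
#include <cstring>

static float abs_via_and(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  bits &= 0x7fffffffu;        // clear only the IEEE-754 sign bit
  float r;
  std::memcpy(&r, &bits, sizeof r);
  return r;
}

int main() {
  assert(abs_via_and(-2.5f) == 2.5f);
  assert(abs_via_and(2.5f) == 2.5f);
}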
 
-#ifdef _LP64
-instruct absF_reg_reg_evex(regF dst, regF src) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
-  match(Set dst (AbsF src));
-  ins_cost(150);
-  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
-  predicate(VM_Version::supports_avx512novl());
-  match(Set dst (AbsF src1));
-  effect(TEMP src2);
-  ins_cost(150);
-  format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
-              ExternalAddress(float_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-#else // _LP64
-instruct absF_reg_reg_evex(regF dst, regF src) %{
-  predicate(UseAVX > 2);
-  match(Set dst (AbsF src));
-  ins_cost(150);
-  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-#endif
-
 instruct absD_reg(regD dst) %{
   predicate((UseSSE>=2) && (UseAVX == 0));
   match(Set dst (AbsD dst));
@@ -2614,8 +2665,8 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct absD_reg_reg(regD dst, regD src) %{
-  predicate(VM_Version::supports_avxonly());
+instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
+  predicate(UseAVX > 0);
   match(Set dst (AbsD src));
   ins_cost(150);
   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
@@ -2628,50 +2679,6 @@
   ins_pipe(pipe_slow);
 %}
 
-#ifdef _LP64
-instruct absD_reg_reg_evex(regD dst, regD src) %{
-  predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
-  match(Set dst (AbsD src));
-  ins_cost(150);
-  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
-            "# abs double by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
-  predicate(VM_Version::supports_avx512novl());
-  match(Set dst (AbsD src1));
-  effect(TEMP src2);
-  ins_cost(150);
-  format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs float by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
-              ExternalAddress(double_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-#else // _LP64
-instruct absD_reg_reg_evex(regD dst, regD src) %{
-  predicate(UseAVX > 2);
-  match(Set dst (AbsD src));
-  ins_cost(150);
-  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
-            "# abs double by sign masking" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signmask()), vector_len);
-  %}
-  ins_pipe(pipe_slow);
-%}
-#endif
-
 instruct negF_reg(regF dst) %{
   predicate((UseSSE>=1) && (UseAVX == 0));
   match(Set dst (NegF dst));
@@ -2683,7 +2690,7 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct negF_reg_reg(regF dst, regF src) %{
+instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
   predicate(UseAVX > 0);
   match(Set dst (NegF src));
   ins_cost(150);
@@ -2707,11 +2714,11 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct negD_reg_reg(regD dst, regD src) %{
+instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
   predicate(UseAVX > 0);
   match(Set dst (NegD src));
   ins_cost(150);
-  format %{ "vnegatess  $dst, $src, [0x8000000000000000]\t"
+  format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
             "# neg double by sign flipping" %}
   ins_encode %{
     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
@@ -2835,6 +2842,7 @@
 
 // ====================VECTOR INSTRUCTIONS=====================================
 
+
 // Load vectors (4 bytes long)
 instruct loadV4(vecS dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 4);
@@ -2847,6 +2855,26 @@
   ins_pipe( pipe_slow );
 %}
 
+// Move vectors (4 bytes long) into legacy registers
+instruct MoveVecS2Leg(legVecS dst, vecS src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Move vectors (4 bytes long) out of legacy registers
+instruct MoveLeg2VecS(vecS dst, legVecS src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load vectors (8 bytes long)
 instruct loadV8(vecD dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 8);
@@ -2859,6 +2887,26 @@
   ins_pipe( pipe_slow );
 %}
 
+// Move vectors (8 bytes long) into legacy registers
+instruct MoveVecD2Leg(legVecD dst, vecD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Move vectors (8 bytes long) out of legacy registers
+instruct MoveLeg2VecD(vecD dst, legVecD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load vectors (16 bytes long)
 instruct loadV16(vecX dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 16);
@@ -2871,6 +2919,36 @@
   ins_pipe( pipe_slow );
 %}
 
+// Move vectors (16 bytes long) into legacy registers
+instruct MoveVecX2Leg(legVecX dst, vecX src) %{
+  match(Set dst src);
+  format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
+  ins_encode %{
+    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
+      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
+    } else {
+      int vector_len = 2;
+      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    }
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Move vectors (16 bytes long) out of legacy registers
+instruct MoveLeg2VecX(vecX dst, legVecX src) %{
+  match(Set dst src);
+  format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
+  ins_encode %{
+    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
+      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
+    } else {
+      int vector_len = 2;
+      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    }
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load vectors (32 bytes long)
 instruct loadV32(vecY dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 32);
@@ -2883,6 +2961,36 @@
   ins_pipe( pipe_slow );
 %}
 
+// Move vectors (32 bytes long) into legacy registers
+instruct MoveVecY2Leg(legVecY dst, vecY src) %{
+  match(Set dst src);
+  format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
+  ins_encode %{
+    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
+      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
+    } else {
+      int vector_len = 2;
+      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    }
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Move vectors (32 bytes long) out of legacy registers
+instruct MoveLeg2VecY(vecY dst, legVecY src) %{
+  match(Set dst src);
+  format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
+  ins_encode %{
+    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
+      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
+    } else {
+      int vector_len = 2;
+      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    }
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load vectors (64 bytes long)
 instruct loadV64_dword(vecZ dst, memory mem) %{
   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
@@ -2909,6 +3017,26 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct MoveVecZ2Leg(legVecZ dst, vecZ src) %{
+  match(Set dst src);
+  format %{ "vmovdquq $dst k0,$src\t! move vector (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct MoveLeg2VecZ(vecZ dst, legVecZ src) %{
+  match(Set dst src);
+  format %{ "vmovdquq $dst k0,$src\t! move vector (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Store vectors
 instruct storeV4(memory mem, vecS src) %{
   predicate(n->as_StoreVector()->memory_size() == 4);
@@ -3068,6 +3196,44 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl64B(legVecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB (LoadB mem)));
+  format %{ "punpcklbw $dst,$mem\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
+  ins_encode %{
+    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl16B_imm(vecX dst, immI con) %{
   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB con));
@@ -3094,6 +3260,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl64B_imm(legVecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl4S(vecD dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS src));
@@ -3198,6 +3380,56 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl32S(legVecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "pshuflw $dst,$mem,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_imm(legVecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl4I(vecX dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI src));
@@ -3246,6 +3478,36 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16I(legVecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "pshufd  $dst,$mem,0x00\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl4I_imm(vecX dst, immI con) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI con));
@@ -3272,6 +3534,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16I_imm(legVecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Long could be loaded into xmm register directly from memory.
 instruct Repl2L_mem(vecX dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
@@ -3300,8 +3578,24 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+instruct Repl8L(legVecZ dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
 #else // _LP64
-instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL src));
   effect(TEMP dst, USE src, TEMP tmp);
@@ -3319,6 +3613,27 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
 #endif // _LP64
 
 instruct Repl4L_imm(vecY dst, immL con) %{
@@ -3335,6 +3650,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl8L_imm(legVecZ dst, immL con) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl4L_mem(vecY dst, memory mem) %{
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL (LoadL mem)));
@@ -3349,6 +3680,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl8L_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl2F_mem(vecD dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF (LoadF mem)));
@@ -3369,8 +3716,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl8F(vecY dst, regF src) %{
-  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+instruct Repl8F(vecY dst, vlRegF src) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF src));
   format %{ "pshufd  $dst,$src,0x00\n\t"
             "vinsertf128_high $dst,$dst\t! replicate8F" %}
@@ -3393,6 +3740,34 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16F(legVecZ dst, vlRegF src) %{
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$src,0x00\n\t"
+            "vinsertf128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd  $dst,$mem,0x00\n\t"
+            "vinsertf128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct Repl2F_zero(vecD dst, immF0 zero) %{
   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
   match(Set dst (ReplicateF zero));
@@ -3434,8 +3809,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl4D(vecY dst, regD src) %{
-  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+instruct Repl4D(vecY dst, vlRegD src) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD src));
   format %{ "pshufd  $dst,$src,0x44\n\t"
             "vinsertf128_high $dst,$dst\t! replicate4D" %}
@@ -3458,6 +3833,34 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl8D(legVecZ dst, vlRegD src) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd  $dst,$src,0x44\n\t"
+            "vinsertf128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_mem(legVecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "pshufd  $dst,$mem,0x44\n\t"
+            "vinsertf128_high $dst,$dst\t"
+            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate double (8 byte) scalar zero to be vector
 instruct Repl2D_zero(vecX dst, immD0 zero) %{
   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
@@ -3736,7 +4139,7 @@
   ins_pipe( pipe_slow );
 %}
 #else // _LP64
-instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
+instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
   predicate(n->as_Vector()->length() == 2);
   match(Set dst (ReplicateL src));
   effect(TEMP dst, USE src, TEMP tmp);
@@ -3791,7 +4194,7 @@
 %}
 
 // Replicate float (4 byte) scalar to be vector
-instruct Repl2F(vecD dst, regF src) %{
+instruct Repl2F(vecD dst, vlRegF src) %{
   predicate(n->as_Vector()->length() == 2);
   match(Set dst (ReplicateF src));
   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
@@ -3801,7 +4204,7 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl4F(vecX dst, regF src) %{
+instruct Repl4F(vecX dst, vlRegF src) %{
   predicate(n->as_Vector()->length() == 4);
   match(Set dst (ReplicateF src));
   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
@@ -3812,7 +4215,7 @@
 %}
 
 // Replicate double (8 bytes) scalar to be vector
-instruct Repl2D(vecX dst, regD src) %{
+instruct Repl2D(vecX dst, vlRegD src) %{
   predicate(n->as_Vector()->length() == 2);
   match(Set dst (ReplicateD src));
   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
@@ -3825,31 +4228,31 @@
 // ====================EVEX REPLICATE=============================================
 
 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl16B_evex(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB src));
-  format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
+  format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
   ins_encode %{
    int vector_len = 0;
     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3858,20 +4261,20 @@
 %}
 
 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl32B_evex(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB src));
-  format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
+  format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
   ins_encode %{
    int vector_len = 1;
     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3880,20 +4283,20 @@
 %}
 
 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl64B_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB src));
-  format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
+  format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
   ins_encode %{
    int vector_len = 2;
     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3902,51 +4305,51 @@
 %}
 
 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB (LoadB mem)));
   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl16B_imm_evex(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastb $dst,$dst\t! replicate16B" %}
   ins_encode %{
    int vector_len = 0;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl32B_imm_evex(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastb $dst,$dst\t! replicate32B" %}
   ins_encode %{
    int vector_len = 1;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
   ins_encode %{
    int vector_len = 2;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3964,9 +4367,9 @@
 %}
 
 instruct Repl4S_evex(vecD dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS src));
-  format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
+  format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
   ins_encode %{
    int vector_len = 0;
     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3975,20 +4378,20 @@
 %}
 
 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8S_evex(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS src));
-  format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
+  format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
   ins_encode %{
    int vector_len = 0;
     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
@@ -3997,20 +4400,20 @@
 %}
 
 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl16S_evex(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS src));
-  format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
+  format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
   ins_encode %{
    int vector_len = 1;
     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4019,20 +4422,20 @@
 %}
 
 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl32S_evex(vecZ dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS src));
-  format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
+  format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
   ins_encode %{
    int vector_len = 2;
     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4041,51 +4444,51 @@
 %}
 
 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8S_imm_evex(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastw $dst,$dst\t! replicate8S" %}
   ins_encode %{
    int vector_len = 0;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl16S_imm_evex(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastw $dst,$dst\t! replicate16S" %}
   ins_encode %{
    int vector_len = 1;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastw $dst,$dst\t! replicate32S" %}
   ins_encode %{
    int vector_len = 2;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4103,9 +4506,9 @@
 %}
 
 instruct Repl4I_evex(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI src));
-  format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
+  format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
   ins_encode %{
     int vector_len = 0;
     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4114,20 +4517,20 @@
 %}
 
 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI (LoadI mem)));
   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8I_evex(vecY dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI src));
-  format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
+  format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
   ins_encode %{
     int vector_len = 1;
     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4136,12 +4539,12 @@
 %}
 
 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI (LoadI mem)));
   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4149,7 +4552,7 @@
 instruct Repl16I_evex(vecZ dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
   match(Set dst (ReplicateI src));
-  format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
+  format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
   ins_encode %{
     int vector_len = 2;
     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4163,33 +4566,33 @@
   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl4I_imm_evex(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI con));
   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
   ins_encode %{
     int vector_len = 0;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8I_imm_evex(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI con));
   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
   ins_encode %{
     int vector_len = 1;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4202,7 +4605,7 @@
   ins_encode %{
     int vector_len = 2;
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
-    __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4222,9 +4625,9 @@
 // Replicate long (8 byte) scalar to be vector
 #ifdef _LP64
 instruct Repl4L_evex(vecY dst, rRegL src) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL src));
-  format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
+  format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
   ins_encode %{
     int vector_len = 1;
     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4235,7 +4638,7 @@
 instruct Repl8L_evex(vecZ dst, rRegL src) %{
   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateL src));
-  format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
+  format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
   ins_encode %{
     int vector_len = 2;
     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
@@ -4244,7 +4647,7 @@
 %}
 #else // _LP64
 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL src));
   effect(TEMP dst, USE src, TEMP tmp);
   format %{ "movdl   $dst,$src.lo\n\t"
@@ -4256,12 +4659,12 @@
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
-    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
+    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateL src));
   effect(TEMP dst, USE src, TEMP tmp);
@@ -4274,21 +4677,21 @@
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
-    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 #endif // _LP64
 
 instruct Repl4L_imm_evex(vecY dst, immL con) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
   ins_encode %{
     int vector_len = 1;
     __ movq($dst$$XMMRegister, $constantaddress($con));
-    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4301,29 +4704,29 @@
   ins_encode %{
     int vector_len = 2;
     __ movq($dst$$XMMRegister, $constantaddress($con));
-    __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL (LoadL mem)));
   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
   ins_encode %{
     int vector_len = 0;
-    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateL (LoadL mem)));
   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4334,7 +4737,7 @@
   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4352,23 +4755,23 @@
 %}
 
 instruct Repl8F_evex(vecY dst, regF src) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF src));
-  format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
+  format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF (LoadF mem)));
   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4376,10 +4779,10 @@
 instruct Repl16F_evex(vecZ dst, regF src) %{
   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
   match(Set dst (ReplicateF src));
-  format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
+  format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4390,7 +4793,7 @@
   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4444,23 +4847,23 @@
 %}
 
 instruct Repl4D_evex(vecY dst, regD src) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD src));
-  format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
+  format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD (LoadD mem)));
   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
   ins_encode %{
     int vector_len = 1;
-    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4468,10 +4871,10 @@
 instruct Repl8D_evex(vecZ dst, regD src) %{
   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
   match(Set dst (ReplicateD src));
-  format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
+  format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4482,7 +4885,7 @@
   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
   ins_encode %{
     int vector_len = 2;
-    __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4525,7 +4928,7 @@
 
 // ====================REDUCTION ARITHMETIC=======================================
 
-instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(UseSSE > 2 && UseAVX == 0);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp2, TEMP tmp);
@@ -4544,7 +4947,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(VM_Version::supports_avxonly());
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4562,7 +4965,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4582,7 +4985,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseSSE > 2 && UseAVX == 0);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4603,7 +5006,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(VM_Version::supports_avxonly());
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4623,7 +5026,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4647,7 +5050,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
+instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(VM_Version::supports_avxonly());
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4671,7 +5074,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
+instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4699,7 +5102,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
+instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
@@ -4731,7 +5134,7 @@
 %}
 
 #ifdef _LP64
-instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4750,7 +5153,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
+instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4773,7 +5176,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
+instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -4801,7 +5204,7 @@
 %}
 #endif
 
-instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
+instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -4816,7 +5219,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
+instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -4831,7 +5234,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
+instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -4854,7 +5257,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
+instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -4877,7 +5280,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
+instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -4916,7 +5319,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
+instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVF dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -4987,7 +5390,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
+instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (AddReductionVD dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5002,7 +5405,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
+instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVD dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5017,14 +5420,14 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
+instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (AddReductionVD dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
   format %{ "vaddsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2,0x1\n\t"
+            "vextractf128  $tmp2,$src2,0x1\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
@@ -5032,7 +5435,7 @@
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5040,7 +5443,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
+instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVD dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5079,7 +5482,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(UseSSE > 3 && UseAVX == 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5098,7 +5501,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5118,7 +5521,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseSSE > 3 && UseAVX == 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5141,7 +5544,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5165,8 +5568,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
-  predicate(UseAVX > 0);
+instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
+  predicate(UseAVX > 1);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
   format %{ "vextracti128_high  $tmp,$src2\n\t"
@@ -5193,7 +5596,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
+instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
   predicate(UseAVX > 2);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
@@ -5225,7 +5628,7 @@
 %}
 
 #ifdef _LP64
-instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
+instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
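
Aside (editorial): the long-multiply reductions additionally test supports_avx512dq() because a packed 64x64-to-64 multiply (vpmullq) only exists in AVX512DQ. Illustration:

    #include <immintrin.h>

    // Requires AVX512DQ (+AVX512VL for the 128-bit form): vpmullq.
    __m128i mul2l(__m128i a, __m128i b) {
        return _mm_mullo_epi64(a, b);
    }
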
@@ -5244,7 +5647,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
+instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5267,7 +5670,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
+instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -5295,7 +5698,7 @@
 %}
 #endif
 
-instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
+instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -5310,7 +5713,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
+instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5325,7 +5728,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
+instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -5348,7 +5751,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
+instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5371,7 +5774,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
+instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5410,7 +5813,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
+instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (MulReductionVF dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5481,7 +5884,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
+instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (MulReductionVD dst src2));
   effect(TEMP dst, TEMP tmp);
@@ -5496,7 +5899,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
+instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVD dst src2));
   effect(TEMP tmp, TEMP dst);
@@ -5511,7 +5914,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
+instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVD dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5534,7 +5937,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
+instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
   predicate(UseAVX > 2);
   match(Set dst (MulReductionVD dst src2));
   effect(TEMP tmp, TEMP dst, TEMP tmp2);
@@ -5588,8 +5991,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
   ins_encode %{
@@ -5599,31 +6002,9 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVB src1 src2));
-  format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+
+instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
   ins_encode %{
@@ -5633,29 +6014,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVB src (LoadVector mem)));
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd8B(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVB dst src));
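
Aside (editorial): the deletions above set the pattern for the rest of this file. The old _avx/_evex/_evex_special triples partitioned the AVX space with supports_avxonly(), supports_avx512bw() and supports_avx512nobw(); together those guards cover exactly the configurations where UseAVX > 0, so one rule per vector shape now suffices and the dst-tied *_special forms go away. A rough C++ restatement of that equivalence, with the helper semantics paraphrased (an assumption, not the VM_Version source):

    // Paraphrased guard semantics, illustration only:
    bool old_rules_matched(int UseAVX, bool avx512bw) {
        bool avxonly  = UseAVX > 0 && UseAVX < 3;   // supports_avxonly()
        bool evex_bw  = UseAVX > 2 && avx512bw;     // supports_avx512bw()
        bool evex_no  = UseAVX > 2 && !avx512bw;    // supports_avx512nobw()
        return avxonly || evex_bw || evex_no;       // same set as UseAVX > 0
    }
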
@@ -5666,8 +6024,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
   ins_encode %{
@@ -5677,31 +6035,9 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVB src1 src2));
-  format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+
+instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
   ins_encode %{
@@ -5711,29 +6047,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVB src (LoadVector mem)));
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd16B(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
   match(Set dst (AddVB dst src));
@@ -5744,8 +6057,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
+instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
   ins_encode %{
@@ -5755,31 +6068,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVB src1 src2));
-  format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
+instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
   ins_encode %{
@@ -5789,31 +6079,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVB src (LoadVector mem)));
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
+instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
   ins_encode %{
@@ -5823,31 +6090,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
-  match(Set dst (AddVB src1 src2));
-  format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
-  match(Set dst (AddVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
+instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
   ins_encode %{
@@ -5857,31 +6101,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
-  match(Set dst (AddVB src (LoadVector mem)));
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
-  match(Set dst (AddVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
   ins_encode %{
@@ -5892,7 +6113,7 @@
 %}
 
 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
   ins_encode %{
@@ -5913,8 +6134,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
   ins_encode %{
@@ -5924,31 +6145,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (AddVS src1 src2));
-  format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (AddVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
   ins_encode %{
@@ -5958,29 +6156,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (AddVS src (LoadVector mem)));
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (AddVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd4S(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVS dst src));
@@ -5991,8 +6166,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
   ins_encode %{
@@ -6002,31 +6177,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVS src1 src2));
-  format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
   ins_encode %{
@@ -6036,29 +6188,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVS src (LoadVector mem)));
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (AddVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd8S(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVS dst src));
@@ -6069,8 +6198,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
   ins_encode %{
@@ -6080,31 +6209,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVS src1 src2));
-  format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
   ins_encode %{
@@ -6114,31 +6220,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVS src (LoadVector mem)));
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (AddVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
   ins_encode %{
@@ -6148,31 +6231,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVS src1 src2));
-  format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
   ins_encode %{
@@ -6182,31 +6242,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVS src (LoadVector mem)));
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (AddVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
   ins_encode %{
@@ -6217,7 +6254,7 @@
 %}
 
 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
   ins_encode %{
@@ -6229,7 +6266,7 @@
 
 // Integers vector add
 instruct vadd2I(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVI dst src));
   format %{ "paddd   $dst,$src\t! add packed2I" %}
   ins_encode %{
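
Aside (editorial): adding UseAVX == 0 to the two-operand SSE rules makes them mutually exclusive with the three-operand AVX rules instead of relying on match ordering when both predicates hold. The two forms differ in destructiveness, which a short intrinsics sketch makes visible:

    #include <immintrin.h>

    // SSE form: dst is both input and output (paddd dst,src).
    // AVX form: non-destructive three-operand encoding (vpaddd dst,src1,src2).
    __m128i add2i(__m128i a, __m128i b) {
        return _mm_add_epi32(a, b);   // paddd or vpaddd, depending on -mavx
    }
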
@@ -6261,7 +6298,7 @@
 %}
 
 instruct vadd4I(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVI dst src));
   format %{ "paddd   $dst,$src\t! add packed4I" %}
   ins_encode %{
@@ -6338,7 +6375,7 @@
 
 // Longs vector add
 instruct vadd2L(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVL dst src));
   format %{ "paddq   $dst,$src\t! add packed2L" %}
   ins_encode %{
@@ -6415,7 +6452,7 @@
 
 // Floats vector add
 instruct vadd2F(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVF dst src));
   format %{ "addps   $dst,$src\t! add packed2F" %}
   ins_encode %{
@@ -6447,7 +6484,7 @@
 %}
 
 instruct vadd4F(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (AddVF dst src));
   format %{ "addps   $dst,$src\t! add packed4F" %}
   ins_encode %{
@@ -6524,7 +6561,7 @@
 
 // Doubles vector add
 instruct vadd2D(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (AddVD dst src));
   format %{ "addpd   $dst,$src\t! add packed2D" %}
   ins_encode %{
@@ -6612,8 +6649,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
   ins_encode %{
@@ -6623,31 +6660,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVB src1 src2));
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4B_reg_exex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
   ins_encode %{
@@ -6657,29 +6671,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVB src (LoadVector mem)));
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub8B(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVB dst src));
@@ -6690,8 +6681,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
   ins_encode %{
@@ -6701,31 +6692,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVB src1 src2));
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
   ins_encode %{
@@ -6735,29 +6703,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVB src (LoadVector mem)));
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub16B(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
   match(Set dst (SubVB dst src));
@@ -6768,8 +6713,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
+instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
   ins_encode %{
@@ -6779,31 +6724,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVB src1 src2));
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
+instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
   ins_encode %{
@@ -6813,31 +6735,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVB src (LoadVector mem)));
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
+instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
   ins_encode %{
@@ -6847,31 +6746,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
-  match(Set dst (SubVB src1 src2));
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
-  match(Set dst (SubVB dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
+instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
   ins_encode %{
@@ -6881,31 +6757,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
-  match(Set dst (SubVB src (LoadVector mem)));
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
-  match(Set dst (SubVB dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
   ins_encode %{
@@ -6916,7 +6769,7 @@
 %}
 
 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
   ins_encode %{
@@ -6937,8 +6790,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
   ins_encode %{
@@ -6948,31 +6801,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (SubVS src1 src2));
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (SubVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
   ins_encode %{
@@ -6982,29 +6812,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (SubVS src (LoadVector mem)));
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (SubVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub4S(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVS dst src));
@@ -7015,8 +6822,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
   ins_encode %{
@@ -7026,31 +6833,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVS src1 src2));
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
   ins_encode %{
@@ -7060,29 +6844,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVS src (LoadVector mem)));
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (SubVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub8S(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVS dst src));
@@ -7093,8 +6854,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
   ins_encode %{
@@ -7104,31 +6865,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVS src1 src2));
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
   ins_encode %{
@@ -7138,31 +6876,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVS src (LoadVector mem)));
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (SubVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
   ins_encode %{
@@ -7172,31 +6887,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVS src1 src2));
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
   ins_encode %{
@@ -7206,31 +6898,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVS src (LoadVector mem)));
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (SubVS dst (LoadVector mem)));
-   effect(TEMP src);
-  format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
   ins_encode %{
@@ -7241,7 +6910,7 @@
 %}
 
 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
   ins_encode %{
@@ -7253,7 +6922,7 @@
 
 // Integers vector sub
 instruct vsub2I(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVI dst src));
   format %{ "psubd   $dst,$src\t! sub packed2I" %}
   ins_encode %{
@@ -7285,7 +6954,7 @@
 %}
 
 instruct vsub4I(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVI dst src));
   format %{ "psubd   $dst,$src\t! sub packed4I" %}
   ins_encode %{
@@ -7362,7 +7031,7 @@
 
 // Longs vector sub
 instruct vsub2L(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVL dst src));
   format %{ "psubq   $dst,$src\t! sub packed2L" %}
   ins_encode %{
@@ -7439,7 +7108,7 @@
 
 // Floats vector sub
 instruct vsub2F(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVF dst src));
   format %{ "subps   $dst,$src\t! sub packed2F" %}
   ins_encode %{
@@ -7471,7 +7140,7 @@
 %}
 
 instruct vsub4F(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (SubVF dst src));
   format %{ "subps   $dst,$src\t! sub packed4F" %}
   ins_encode %{
@@ -7548,7 +7217,7 @@
 
 // Doubles vector sub
 instruct vsub2D(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (SubVD dst src));
   format %{ "subpd   $dst,$src\t! sub packed2D" %}
   ins_encode %{
@@ -7636,8 +7305,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
   ins_encode %{
@@ -7647,31 +7316,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (MulVS src1 src2));
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul2S_evex_special(vecS dst, vecS src1, vecS src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (MulVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
   ins_encode %{
@@ -7681,29 +7327,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (MulVS src (LoadVector mem)));
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (MulVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vmul4S(vecD dst, vecD src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (MulVS dst src));
@@ -7714,8 +7337,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
   ins_encode %{
@@ -7725,31 +7348,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (MulVS src1 src2));
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (MulVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
   ins_encode %{
@@ -7759,29 +7359,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (MulVS src (LoadVector mem)));
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (MulVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vmul8S(vecX dst, vecX src) %{
   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
   match(Set dst (MulVS dst src));
@@ -7792,8 +7369,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
   ins_encode %{
@@ -7803,31 +7380,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (MulVS src1 src2));
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (MulVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
   ins_encode %{
@@ -7837,31 +7391,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (MulVS src (LoadVector mem)));
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (MulVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
   ins_encode %{
@@ -7871,31 +7402,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (MulVS src1 src2));
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (MulVS dst src2));
-  effect(TEMP src1);
-  format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
   ins_encode %{
@@ -7905,31 +7413,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (MulVS src (LoadVector mem)));
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (MulVS dst (LoadVector mem)));
-  effect(TEMP src);
-  format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
   ins_encode %{
@@ -7940,7 +7425,7 @@
 %}
 
 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
   ins_encode %{
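The hunks above collapse each former trio of AVX rules (_avx for VEX-only CPUs, _evex for AVX512BW, _evex_special for AVX-512 without BW) into a single _reg/_mem rule gated only on UseAVX, while the two-operand SSE forms gain an explicit UseAVX == 0 guard so they can no longer shadow the three-operand AVX forms. A minimal C++ sketch of the resulting MulVS rule selection; select_mul_vs, Rule, and the lane-count switch are illustrative stand-ins rather than HotSpot code, with UseAVX and supports_avx512bw modeling the VM globals the predicates query:

// Illustrative sketch only (not HotSpot code): models how the rewritten
// predicates pick a MulVS match rule after this change.
#include <cassert>
#include <cstdio>

enum Rule { VMUL8S, VMUL8S_REG, VMUL16S_REG, VMUL32S_REG, NO_MATCH };

Rule select_mul_vs(int lanes, int UseAVX, bool supports_avx512bw) {
  switch (lanes) {
    case 8:  return UseAVX == 0 ? VMUL8S : VMUL8S_REG;    // SSE vs. AVX form
    case 16: return UseAVX > 1 ? VMUL16S_REG : NO_MATCH;  // 256-bit needs AVX2
    case 32: return (UseAVX > 2 && supports_avx512bw)     // 512-bit needs BW
                 ? VMUL32S_REG : NO_MATCH;
    default: return NO_MATCH;  // 2S/4S follow the same UseAVX split as 8S
  }
}

int main() {
  assert(select_mul_vs(8,  0, false) == VMUL8S);       // two-operand pmullw
  assert(select_mul_vs(8,  2, false) == VMUL8S_REG);   // three-operand vpmullw
  assert(select_mul_vs(32, 3, true)  == VMUL32S_REG);  // EVEX encoding
  assert(select_mul_vs(32, 3, false) == NO_MATCH);     // no AVX512BW, no 32S rule
  puts("ok");
  return 0;
}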
@@ -8127,7 +7612,7 @@
 
 // Floats vector mul
 instruct vmul2F(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (MulVF dst src));
   format %{ "mulps   $dst,$src\t! mul packed2F" %}
   ins_encode %{
@@ -8159,7 +7644,7 @@
 %}
 
 instruct vmul4F(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (MulVF dst src));
   format %{ "mulps   $dst,$src\t! mul packed4F" %}
   ins_encode %{
@@ -8236,7 +7721,7 @@
 
 // Doubles vector mul
 instruct vmul2D(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (MulVD dst src));
   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
   ins_encode %{
@@ -8311,8 +7796,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vcmov8F_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
-  predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 8);
+instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
   effect(TEMP dst, USE src1, USE src2);
   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
@@ -8327,8 +7812,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
-  predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
+instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
   effect(TEMP dst, USE src1, USE src2);
   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
@@ -8347,7 +7832,7 @@
 
 // Floats vector div
 instruct vdiv2F(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (DivVF dst src));
   format %{ "divps   $dst,$src\t! div packed2F" %}
   ins_encode %{
@@ -8379,7 +7864,7 @@
 %}
 
 instruct vdiv4F(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (DivVF dst src));
   format %{ "divps   $dst,$src\t! div packed4F" %}
   ins_encode %{
@@ -8456,7 +7941,7 @@
 
 // Doubles vector div
 instruct vdiv2D(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (DivVD dst src));
   format %{ "divpd   $dst,$src\t! div packed2D" %}
   ins_encode %{
@@ -8725,8 +8210,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
   ins_encode %{
@@ -8736,58 +8221,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
+instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -8813,8 +8252,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
   ins_encode %{
@@ -8824,58 +8263,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
+instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
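The same collapse applies to the shift families (vsll/vsrl/vsra): for each lane count the variable-shift and immediate-shift EVEX variants are folded away, leaving exactly two AVX rules per size, _reg taking the shift count in an XMM register and _reg_imm folding an immI8 constant straight into the vpsllw/vpsrlw/vpsraw encoding.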
@@ -8901,8 +8294,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
   ins_encode %{
@@ -8912,65 +8305,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
+instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
   ins_encode %{
@@ -8980,65 +8327,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
+instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
   ins_encode %{
     int vector_len = 1;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (LShiftVS src shift));
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (LShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
   ins_encode %{
@@ -9049,7 +8350,7 @@
 %}
 
 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
   ins_encode %{
@@ -9061,7 +8362,7 @@
 
 // Integers vector left shift
 instruct vsll2I(vecD dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVI dst shift));
   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
   ins_encode %{
@@ -9071,7 +8372,7 @@
 %}
 
 instruct vsll2I_imm(vecD dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVI dst shift));
   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
   ins_encode %{
@@ -9103,7 +8404,7 @@
 %}
 
 instruct vsll4I(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (LShiftVI dst shift));
   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
   ins_encode %{
@@ -9113,7 +8414,7 @@
 %}
 
 instruct vsll4I_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (LShiftVI dst shift));
   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
   ins_encode %{
@@ -9190,7 +8491,7 @@
 
 // Longs vector left shift
 instruct vsll2L(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVL dst shift));
   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
   ins_encode %{
@@ -9200,7 +8501,7 @@
 %}
 
 instruct vsll2L_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (LShiftVL dst shift));
   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
   ins_encode %{
@@ -9302,8 +8603,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
   ins_encode %{
@@ -9313,58 +8614,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
+instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -9390,8 +8645,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
   ins_encode %{
@@ -9401,58 +8656,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
+instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -9478,8 +8687,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
   ins_encode %{
@@ -9489,65 +8698,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
+instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
   ins_encode %{
@@ -9557,65 +8720,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
+instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
   ins_encode %{
     int vector_len = 1;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (URShiftVS src shift));
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (URShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
   ins_encode %{
@@ -9626,7 +8743,7 @@
 %}
 
 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
   ins_encode %{
@@ -9638,7 +8755,7 @@
 
 // Integers vector logical right shift
 instruct vsrl2I(vecD dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVI dst shift));
   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
   ins_encode %{
@@ -9648,7 +8765,7 @@
 %}
 
 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVI dst shift));
   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
   ins_encode %{
@@ -9680,7 +8797,7 @@
 %}
 
 instruct vsrl4I(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (URShiftVI dst shift));
   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
   ins_encode %{
@@ -9690,7 +8807,7 @@
 %}
 
 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (URShiftVI dst shift));
   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
   ins_encode %{
@@ -9767,7 +8884,7 @@
 
 // Longs vector logical right shift
 instruct vsrl2L(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVL dst shift));
   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
   ins_encode %{
@@ -9777,7 +8894,7 @@
 %}
 
 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (URShiftVL dst shift));
   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
   ins_encode %{
@@ -9866,7 +8983,7 @@
 %}
 
 instruct vsra2S_imm(vecS dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVS dst shift));
   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
   ins_encode %{
@@ -9875,8 +8992,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
+instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
   ins_encode %{
@@ -9886,58 +9003,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
+instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -9963,8 +9034,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
+instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
   ins_encode %{
@@ -9974,58 +9045,12 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
+instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
@@ -10051,8 +9076,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
+instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
   ins_encode %{
@@ -10062,65 +9087,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
+instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
   ins_encode %{
     int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
-  ins_encode %{
-    int vector_len = 0;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
+instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
   ins_encode %{
@@ -10130,65 +9109,19 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
+instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
   ins_encode %{
     int vector_len = 1;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
-  match(Set dst (RShiftVS src shift));
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
-  match(Set dst (RShiftVS dst shift));
-  effect(TEMP src);
-  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
-  ins_encode %{
-    int vector_len = 1;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
   ins_encode %{
@@ -10199,7 +9132,7 @@
 %}
 
 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
-  predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
+  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
   ins_encode %{
@@ -10211,7 +9144,7 @@
 
 // Integers vector arithmetic right shift
 instruct vsra2I(vecD dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVI dst shift));
   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
   ins_encode %{
@@ -10221,7 +9154,7 @@
 %}
 
 instruct vsra2I_imm(vecD dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 2);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
   match(Set dst (RShiftVI dst shift));
   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
   ins_encode %{
@@ -10253,7 +9186,7 @@
 %}
 
 instruct vsra4I(vecX dst, vecS shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (RShiftVI dst shift));
   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
   ins_encode %{
@@ -10263,7 +9196,7 @@
 %}
 
 instruct vsra4I_imm(vecX dst, immI8 shift) %{
-  predicate(n->as_Vector()->length() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
   match(Set dst (RShiftVI dst shift));
   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
   ins_encode %{
@@ -10344,7 +9277,7 @@
 // --------------------------------- AND --------------------------------------
 
 instruct vand4B(vecS dst, vecS src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
   match(Set dst (AndV dst src));
   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
   ins_encode %{
@@ -10376,7 +9309,7 @@
 %}
 
 instruct vand8B(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 8);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (AndV dst src));
   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
   ins_encode %{
@@ -10408,7 +9341,7 @@
 %}
 
 instruct vand16B(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 16);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (AndV dst src));
   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
   ins_encode %{
@@ -10486,7 +9419,7 @@
 // --------------------------------- OR ---------------------------------------
 
 instruct vor4B(vecS dst, vecS src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
   match(Set dst (OrV dst src));
   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
   ins_encode %{
@@ -10518,7 +9451,7 @@
 %}
 
 instruct vor8B(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 8);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (OrV dst src));
   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
   ins_encode %{
@@ -10550,7 +9483,7 @@
 %}
 
 instruct vor16B(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 16);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (OrV dst src));
   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
   ins_encode %{
@@ -10628,7 +9561,7 @@
 // --------------------------------- XOR --------------------------------------
 
 instruct vxor4B(vecS dst, vecS src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 4);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
   match(Set dst (XorV dst src));
   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
   ins_encode %{
@@ -10660,7 +9593,7 @@
 %}
 
 instruct vxor8B(vecD dst, vecD src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 8);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (XorV dst src));
   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
   ins_encode %{
@@ -10692,7 +9625,7 @@
 %}
 
 instruct vxor16B(vecX dst, vecX src) %{
-  predicate(n->as_Vector()->length_in_bytes() == 16);
+  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
   match(Set dst (XorV dst src));
   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
   ins_encode %{
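Also worth noting in the x86.ad hunks above: vcmov8F_reg and vcmov4D_reg switch their operands from vecY to the new legVecY class and drop the UseAVX < 3 upper bound. vblendvps/vblendvpd have VEX encodings only, so constraining the operands to the legacy (non-EVEX-extended) vector register class keeps these rules legal on AVX-512 hardware without a predicate-level carve-out; register classes, rather than predicates, now express the encoding limits, which appears to be what makes all the removed *_evex_special rules unnecessary.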
--- a/src/hotspot/cpu/x86/x86_32.ad	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/x86_32.ad	Thu Sep 27 10:49:10 2018 -0700
@@ -4101,6 +4101,15 @@
   interface(REG_INTER);
 %}
 
+// Float register operands
+operand vlRegF() %{
+   constraint(ALLOC_IN_RC(float_reg_vl));
+   match(RegF);
+
+   format %{ %}
+   interface(REG_INTER);
+%}
+
 // XMM Double register operands
 operand regD() %{
   predicate( UseSSE>=2 );
@@ -4110,6 +4119,15 @@
   interface(REG_INTER);
 %}
 
+// Double register operands
+operand vlRegD() %{
+   constraint(ALLOC_IN_RC(double_reg_vl));
+   match(RegD);
+
+   format %{ %}
+   interface(REG_INTER);
+%}
+
 // Vectors: note that we use legacy registers to avoid extra (unneeded in 32-bit VM)
 // runtime code generation via reg_class_dynamic.
 operand vecS() %{
@@ -4120,6 +4138,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecS() %{
+  constraint(ALLOC_IN_RC(vectors_reg_legacy));
+  match(VecS);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecD() %{
   constraint(ALLOC_IN_RC(vectord_reg_legacy));
   match(VecD);
@@ -4128,6 +4154,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg_legacy));
+  match(VecD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecX() %{
   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
   match(VecX);
@@ -4136,6 +4170,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg_legacy));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecY() %{
   constraint(ALLOC_IN_RC(vectory_reg_legacy));
   match(VecY);
@@ -4144,6 +4186,14 @@
   interface(REG_INTER);
 %}
 
+operand legVecY() %{
+  constraint(ALLOC_IN_RC(vectory_reg_legacy));
+  match(VecY);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 //----------Memory Operands----------------------------------------------------
 // Direct Memory Operand
 operand direct(immP addr) %{
@@ -6515,6 +6565,26 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load Double
+instruct MoveD2VL(vlRegD dst, regD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load double (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load Double
+instruct MoveVL2D(regD dst, vlRegD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load double (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Store XMM register to memory (single-precision floating point)
 // MOVSS instruction
 instruct storeF(memory mem, regF src) %{
@@ -6528,6 +6598,26 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load Float
+instruct MoveF2VL(vlRegF dst, regF src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load float (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load Float
+instruct MoveVL2F(regF dst, vlRegF src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load float (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Store Float
 instruct storeFPR( memory mem, regFPR1 src) %{
   predicate(UseSSE==0);
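
The new vlRegF/vlRegD operands and the MoveF2VL/MoveVL2F/MoveD2VL/MoveVL2D rules give the register allocator a legal bridge between the plain scalar classes and the AVX-512VL-aware ones (the `_vl` reg classes cover xmm16..xmm31 when AVX-512VL is available; the `_legacy` classes stay within xmm0..xmm15). The bridging moves are ordinary register-to-register movss/movsd and are typically coalesced away. Schematically — the helper names are illustrative, while movflt/movdbl are the real MacroAssembler calls named in the ins_encode blocks:

    // Illustrative: what the bridging rules emit when not coalesced away.
    static void move_float_between_classes(MacroAssembler* masm, XMMRegister dst, XMMRegister src) {
      masm->movflt(dst, src);  // movss dst, src (4-byte float)
    }
    static void move_double_between_classes(MacroAssembler* masm, XMMRegister dst, XMMRegister src) {
      masm->movdbl(dst, src);  // movsd dst, src (8-byte double)
    }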
--- a/src/hotspot/cpu/x86/x86_64.ad	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/cpu/x86/x86_64.ad	Thu Sep 27 10:49:10 2018 -0700
@@ -3656,6 +3656,15 @@
    interface(REG_INTER);
 %}
 
+// Float register operands
+operand vlRegF() %{
+   constraint(ALLOC_IN_RC(float_reg_vl));
+   match(RegF);
+
+   format %{ %}
+   interface(REG_INTER);
+%}
+
 // Double register operands
 operand regD() %{
    constraint(ALLOC_IN_RC(double_reg));
@@ -3665,33 +3674,75 @@
    interface(REG_INTER);
 %}
 
+// Double register operands
+operand vlRegD() %{
+   constraint(ALLOC_IN_RC(double_reg_vl));
+   match(RegD);
+
+   format %{ %}
+   interface(REG_INTER);
+%}
+
 // Vectors
 operand vecS() %{
-  constraint(ALLOC_IN_RC(vectors_reg));
+  constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
   match(VecS);
 
   format %{ %}
   interface(REG_INTER);
 %}
 
-operand vecD() %{
-  constraint(ALLOC_IN_RC(vectord_reg));
-  match(VecD);
+// Vectors
+operand legVecS() %{
+  constraint(ALLOC_IN_RC(vectors_reg_legacy));
+  match(VecS);
 
   format %{ %}
   interface(REG_INTER);
 %}
 
-operand vecX() %{
-  constraint(ALLOC_IN_RC(vectorx_reg));
-  match(VecX);
+operand vecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
+  match(VecD);
 
   format %{ %}
   interface(REG_INTER);
 %}
 
+operand legVecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg_legacy));
+  match(VecD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand legVecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg_legacy));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vecY() %{
-  constraint(ALLOC_IN_RC(vectory_reg));
+  constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
+  match(VecY);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand legVecY() %{
+  constraint(ALLOC_IN_RC(vectory_reg_legacy));
   match(VecY);
 
   format %{ %}
@@ -5287,6 +5338,26 @@
   ins_pipe(pipe_slow); // XXX
 %}
 
+// Load Float
+instruct MoveF2VL(vlRegF dst, regF src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load float (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load Float
+instruct MoveVL2F(regF dst, vlRegF src) %{
+  match(Set dst src);
+  format %{ "movss $dst,$src\t! load float (4 bytes)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load Double
 instruct loadD_partial(regD dst, memory mem)
 %{
@@ -5314,6 +5385,26 @@
   ins_pipe(pipe_slow); // XXX
 %}
 
+// Load Double
+instruct MoveD2VL(vlRegD dst, regD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load double (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Load Double
+instruct MoveVL2D(regD dst, vlRegD src) %{
+  match(Set dst src);
+  format %{ "movsd $dst,$src\t! load double (8 bytes)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
 // Load Effective Address
 instruct leaP8(rRegP dst, indOffset8 mem)
 %{
@@ -10858,7 +10949,7 @@
 %}
 
 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
-                         rax_RegI result, regD tmp1, rFlagsReg cr)
+                         rax_RegI result, legVecS tmp1, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -10874,7 +10965,7 @@
 %}
 
 instruct string_compareU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
-                         rax_RegI result, regD tmp1, rFlagsReg cr)
+                         rax_RegI result, legVecS tmp1, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -10890,7 +10981,7 @@
 %}
 
 instruct string_compareLU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
-                          rax_RegI result, regD tmp1, rFlagsReg cr)
+                          rax_RegI result, legVecS tmp1, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -10906,7 +10997,7 @@
 %}
 
 instruct string_compareUL(rsi_RegP str1, rdx_RegI cnt1, rdi_RegP str2, rcx_RegI cnt2,
-                          rax_RegI result, regD tmp1, rFlagsReg cr)
+                          rax_RegI result, legVecS tmp1, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -10923,7 +11014,7 @@
 
 // fast search of substring with known size.
 instruct string_indexof_conL(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, immI int_cnt2,
-                             rbx_RegI result, regD vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
+                             rbx_RegI result, legVecS vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
@@ -10952,7 +11043,7 @@
 
 // fast search of substring with known size.
 instruct string_indexof_conU(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, immI int_cnt2,
-                             rbx_RegI result, regD vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
+                             rbx_RegI result, legVecS vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
@@ -10981,7 +11072,7 @@
 
 // fast search of substring with known size.
 instruct string_indexof_conUL(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, immI int_cnt2,
-                             rbx_RegI result, regD vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
+                             rbx_RegI result, legVecS vec, rax_RegI cnt2, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
@@ -11009,7 +11100,7 @@
 %}
 
 instruct string_indexofL(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, rax_RegI cnt2,
-                         rbx_RegI result, regD vec, rcx_RegI tmp, rFlagsReg cr)
+                         rbx_RegI result, legVecS vec, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -11026,7 +11117,7 @@
 %}
 
 instruct string_indexofU(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, rax_RegI cnt2,
-                         rbx_RegI result, regD vec, rcx_RegI tmp, rFlagsReg cr)
+                         rbx_RegI result, legVecS vec, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -11043,7 +11134,7 @@
 %}
 
 instruct string_indexofUL(rdi_RegP str1, rdx_RegI cnt1, rsi_RegP str2, rax_RegI cnt2,
-                         rbx_RegI result, regD vec, rcx_RegI tmp, rFlagsReg cr)
+                         rbx_RegI result, legVecS vec, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics && (((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL));
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
@@ -11060,7 +11151,7 @@
 %}
 
 instruct string_indexofU_char(rdi_RegP str1, rdx_RegI cnt1, rax_RegI ch,
-                              rbx_RegI result, regD vec1, regD vec2, regD vec3, rcx_RegI tmp, rFlagsReg cr)
+                              rbx_RegI result, legVecS vec1, legVecS vec2, legVecS vec3, rcx_RegI tmp, rFlagsReg cr)
 %{
   predicate(UseSSE42Intrinsics);
   match(Set result (StrIndexOfChar (Binary str1 cnt1) ch));
@@ -11075,7 +11166,7 @@
 
 // fast string equals
 instruct string_equals(rdi_RegP str1, rsi_RegP str2, rcx_RegI cnt, rax_RegI result,
-                       regD tmp1, regD tmp2, rbx_RegI tmp3, rFlagsReg cr)
+                       legVecS tmp1, legVecS tmp2, rbx_RegI tmp3, rFlagsReg cr)
 %{
   match(Set result (StrEquals (Binary str1 str2) cnt));
   effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL tmp3, KILL cr);
@@ -11091,7 +11182,7 @@
 
 // fast array equals
 instruct array_equalsB(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result,
-                       regD tmp1, regD tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr)
+                       legVecS tmp1, legVecS tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr)
 %{
   predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (AryEq ary1 ary2));
@@ -11107,7 +11198,7 @@
 %}
 
 instruct array_equalsC(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result,
-                      regD tmp1, regD tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr)
+                      legVecS tmp1, legVecS tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr)
 %{
   predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (AryEq ary1 ary2));
@@ -11123,7 +11214,7 @@
 %}
 
 instruct has_negatives(rsi_RegP ary1, rcx_RegI len, rax_RegI result,
-                      regD tmp1, regD tmp2, rbx_RegI tmp3, rFlagsReg cr)
+                      legVecS tmp1, legVecS tmp2, rbx_RegI tmp3, rFlagsReg cr)
 %{
   match(Set result (HasNegatives ary1 len));
   effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL len, KILL tmp3, KILL cr);
@@ -11138,7 +11229,7 @@
 %}
 
 // fast char[] to byte[] compression
-instruct string_compress(rsi_RegP src, rdi_RegP dst, rdx_RegI len, regD tmp1, regD tmp2, regD tmp3, regD tmp4,
+instruct string_compress(rsi_RegP src, rdi_RegP dst, rdx_RegI len, legVecS tmp1, legVecS tmp2, legVecS tmp3, legVecS tmp4,
                          rcx_RegI tmp5, rax_RegI result, rFlagsReg cr) %{
   match(Set result (StrCompressedCopy src (Binary dst len)));
   effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL tmp5, KILL cr);
@@ -11154,7 +11245,7 @@
 
 // fast byte[] to char[] inflation
 instruct string_inflate(Universe dummy, rsi_RegP src, rdi_RegP dst, rdx_RegI len,
-                        regD tmp1, rcx_RegI tmp2, rFlagsReg cr) %{
+                        legVecS tmp1, rcx_RegI tmp2, rFlagsReg cr) %{
   match(Set dummy (StrInflatedCopy src (Binary dst len)));
   effect(TEMP tmp1, TEMP tmp2, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);
 
@@ -11168,7 +11259,7 @@
 
 // encode char[] to byte[] in ISO_8859_1
 instruct encode_iso_array(rsi_RegP src, rdi_RegP dst, rdx_RegI len,
-                          regD tmp1, regD tmp2, regD tmp3, regD tmp4,
+                          legVecS tmp1, legVecS tmp2, legVecS tmp3, legVecS tmp4,
                           rcx_RegI tmp5, rax_RegI result, rFlagsReg cr) %{
   match(Set result (EncodeISOArray src (Binary dst len)));
   effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL tmp5, KILL cr);
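
Every temp in the string and array intrinsics above moves from regD to legVecS because these stubs are built on SSE4.2 instructions such as pcmpestri, which have no EVEX encoding and therefore cannot address xmm16..xmm31. A hedged sketch of the constraint — the helper and assert are illustrative, though Assembler::pcmpestri is a real call:

    // Illustrative only: why the temps must come from a legacy register class.
    static void string_scan_step(MacroAssembler* masm, XMMRegister vec, Register str) {
      // xmm16..xmm31 exist only with AVX-512 and cannot be encoded by SSE4.2
      assert(vec->encoding() < 16, "legVecS guarantees a legacy-encodable register");
      masm->pcmpestri(vec, Address(str, 0), 0x0d);  // mode byte as used by the intrinsics
    }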
--- a/src/hotspot/share/c1/c1_LIR.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/c1/c1_LIR.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -472,7 +472,6 @@
     case lir_pop:            // input always valid, result and info always invalid
     case lir_return:         // input always valid, result and info always invalid
     case lir_leal:           // input and result always valid, info always invalid
-    case lir_neg:            // input and result always valid, info always invalid
     case lir_monaddr:        // input and result always valid, info always invalid
     case lir_null_check:     // input and info always valid, result always invalid
     case lir_move:           // input and result always valid, may have info
@@ -580,6 +579,7 @@
     case lir_rem:
     case lir_sqrt:
     case lir_abs:
+    case lir_neg:
     case lir_logic_and:
     case lir_logic_or:
     case lir_logic_xor:
@@ -1662,7 +1662,6 @@
      case lir_null_check:            s = "null_check";    break;
      case lir_return:                s = "return";        break;
      case lir_safepoint:             s = "safepoint";     break;
-     case lir_neg:                   s = "neg";           break;
      case lir_leal:                  s = "leal";          break;
      case lir_branch:                s = "branch";        break;
      case lir_cond_float_branch:     s = "flt_cond_br";   break;
@@ -1690,6 +1689,7 @@
      case lir_div_strictfp:          s = "div_strictfp";  break;
      case lir_rem:                   s = "rem";           break;
      case lir_abs:                   s = "abs";           break;
+     case lir_neg:                   s = "neg";           break;
      case lir_sqrt:                  s = "sqrt";          break;
      case lir_logic_and:             s = "logic_and";     break;
      case lir_logic_or:              s = "logic_or";      break;
--- a/src/hotspot/share/c1/c1_LIR.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/c1/c1_LIR.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -911,7 +911,6 @@
       , lir_null_check
       , lir_return
       , lir_leal
-      , lir_neg
       , lir_branch
       , lir_cond_float_branch
       , lir_move
@@ -939,6 +938,7 @@
       , lir_rem
       , lir_sqrt
       , lir_abs
+      , lir_neg
       , lir_tan
       , lir_log10
       , lir_logic_and
@@ -2075,7 +2075,6 @@
 
   void branch_destination(Label* lbl)            { append(new LIR_OpLabel(lbl)); }
 
-  void negate(LIR_Opr from, LIR_Opr to)          { append(new LIR_Op1(lir_neg, from, to)); }
   void leal(LIR_Opr from, LIR_Opr result_reg, LIR_PatchCode patch_code = lir_patch_none, CodeEmitInfo* info = NULL) { append(new LIR_Op1(lir_leal, from, result_reg, T_ILLEGAL, patch_code, info)); }
 
   // result is a stack location for old backend and vreg for UseLinearScan
@@ -2159,6 +2158,7 @@
                LIR_Opr t1, LIR_Opr t2, LIR_Opr result = LIR_OprFact::illegalOpr);
 
   void abs (LIR_Opr from, LIR_Opr to, LIR_Opr tmp)                { append(new LIR_Op2(lir_abs , from, tmp, to)); }
+  void negate(LIR_Opr from, LIR_Opr to, LIR_Opr tmp = LIR_OprFact::illegalOpr)              { append(new LIR_Op2(lir_neg, from, tmp, to)); }
   void sqrt(LIR_Opr from, LIR_Opr to, LIR_Opr tmp)                { append(new LIR_Op2(lir_sqrt, from, tmp, to)); }
   void fmad(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmad, from, from1, from2, to)); }
   void fmaf(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmaf, from, from1, from2, to)); }
--- a/src/hotspot/share/c1/c1_LIRAssembler.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/c1/c1_LIRAssembler.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -554,10 +554,6 @@
       pop(op->in_opr());
       break;
 
-    case lir_neg:
-      negate(op->in_opr(), op->result_opr());
-      break;
-
     case lir_leal:
       leal(op->in_opr(), op->result_opr(), op->patch_code(), op->info());
       break;
@@ -750,6 +746,10 @@
       intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op);
       break;
 
+    case lir_neg:
+      negate(op->in_opr1(), op->result_opr(), op->in_opr2());
+      break;
+
     case lir_logic_and:
     case lir_logic_or:
     case lir_logic_xor:
--- a/src/hotspot/share/c1/c1_LIRAssembler.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/c1/c1_LIRAssembler.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -239,7 +239,7 @@
   void align_backward_branch_target();
   void align_call(LIR_Code code);
 
-  void negate(LIR_Opr left, LIR_Opr dest);
+  void negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp = LIR_OprFact::illegalOpr);
   void leal(LIR_Opr src, LIR_Opr dest, LIR_PatchCode patch_code, CodeEmitInfo* info);
 
   void rt_call(LIR_Opr result, address dest, const LIR_OprList* args, LIR_Opr tmp, CodeEmitInfo* info);
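
Moving lir_neg from the LIR_Op1 group to the LIR_Op2 group lets negate() carry an optional scratch operand, matching abs and sqrt above. On x86 a floating-point negate is a sign-bit XOR against a constant mask, and newer encodings may need a register to hold that mask; a sketch under that assumption — the mask pool argument is hypothetical, while movdbl/xorpd are real MacroAssembler calls:

    // Sketch: double negation as a sign-bit flip, using the new tmp operand.
    static void emit_neg_double(MacroAssembler* masm, XMMRegister dst, XMMRegister tmp,
                                AddressLiteral signflip /* 0x8000000000000000 mask */) {
      masm->movdbl(tmp, signflip);  // materialize the mask in the scratch register
      masm->xorpd(dst, tmp);        // flip the sign bit of dst
    }

Existing call sites keep their shape, since tmp defaults to LIR_OprFact::illegalOpr when the backend needs no scratch register.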
--- a/src/hotspot/share/classfile/classLoaderData.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/classfile/classLoaderData.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -605,9 +605,10 @@
   // if they are not already on the _klasses list.
   free_deallocate_list_C_heap_structures();
 
-  // Tell serviceability tools these classes are unloading
+  // Clean up class dependencies and tell serviceability tools
+  // these classes are unloading.  Must be called
   // after erroneous classes are released.
-  classes_do(InstanceKlass::notify_unload_class);
+  classes_do(InstanceKlass::unload_class);
 
   // Clean up global class iterator for compiler
   static_klass_iterator.adjust_saved_class(this);
--- a/src/hotspot/share/code/dependencyContext.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/code/dependencyContext.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -218,18 +218,6 @@
   return marked;
 }
 
-void DependencyContext::wipe() {
-  assert_locked_or_safepoint(CodeCache_lock);
-  nmethodBucket* b = dependencies();
-  set_dependencies(NULL);
-  set_has_stale_entries(false);
-  while (b != NULL) {
-    nmethodBucket* next = b->next();
-    delete b;
-    b = next;
-  }
-}
-
 #ifndef PRODUCT
 void DependencyContext::print_dependent_nmethods(bool verbose) {
   int idx = 0;
--- a/src/hotspot/share/code/dependencyContext.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/code/dependencyContext.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -141,10 +141,6 @@
 
   void expunge_stale_entries();
 
-  // Unsafe deallocation of nmethodBuckets. Used in IK::release_C_heap_structures
-  // to clean up the context possibly containing live entries pointing to unloaded nmethods.
-  void wipe();
-
 #ifndef PRODUCT
   void print_dependent_nmethods(bool verbose);
   bool is_dependent_nmethod(nmethod* nm);
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -3169,18 +3169,24 @@
 }
 
 bool G1ParEvacuateFollowersClosure::offer_termination() {
+  EventGCPhaseParallel event;
   G1ParScanThreadState* const pss = par_scan_state();
   start_term_time();
   const bool res = terminator()->offer_termination();
   end_term_time();
+  event.commit(GCId::current(), pss->worker_id(), G1GCPhaseTimes::phase_name(G1GCPhaseTimes::Termination));
   return res;
 }
 
 void G1ParEvacuateFollowersClosure::do_void() {
+  EventGCPhaseParallel event;
   G1ParScanThreadState* const pss = par_scan_state();
   pss->trim_queue();
+  event.commit(GCId::current(), pss->worker_id(), G1GCPhaseTimes::phase_name(G1GCPhaseTimes::ObjCopy));
   do {
+    EventGCPhaseParallel event;
     pss->steal_and_trim_queue(queues());
+    event.commit(GCId::current(), pss->worker_id(), G1GCPhaseTimes::phase_name(G1GCPhaseTimes::ObjCopy));
   } while (!offer_termination());
 }
 
@@ -4050,6 +4056,7 @@
         break;
       }
 
+      EventGCPhaseParallel event;
       double start_time = os::elapsedTime();
 
       end = MIN2(end, _num_work_items);
@@ -4064,9 +4071,11 @@
         if (is_young) {
           young_time += time_taken;
           has_young_time = true;
+          event.commit(GCId::current(), worker_id, G1GCPhaseTimes::phase_name(G1GCPhaseTimes::YoungFreeCSet));
         } else {
           non_young_time += time_taken;
           has_non_young_time = true;
+          event.commit(GCId::current(), worker_id, G1GCPhaseTimes::phase_name(G1GCPhaseTimes::NonYoungFreeCSet));
         }
         start_time = end_time;
       }
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -465,6 +465,48 @@
   }
 }
 
+const char* G1GCPhaseTimes::phase_name(GCParPhases phase) {
+  static const char* names[] = {
+      "GCWorkerStart",
+      "ExtRootScan",
+      "ThreadRoots",
+      "StringTableRoots",
+      "UniverseRoots",
+      "JNIRoots",
+      "ObjectSynchronizerRoots",
+      "ManagementRoots",
+      "SystemDictionaryRoots",
+      "CLDGRoots",
+      "JVMTIRoots",
+      "CMRefRoots",
+      "WaitForStrongCLD",
+      "WeakCLDRoots",
+      "SATBFiltering",
+      "UpdateRS",
+      "ScanHCC",
+      "ScanRS",
+      "CodeRoots",
+#if INCLUDE_AOT
+      "AOTCodeRoots",
+#endif
+      "ObjCopy",
+      "Termination",
+      "Other",
+      "GCWorkerTotal",
+      "GCWorkerEnd",
+      "StringDedupQueueFixup",
+      "StringDedupTableFixup",
+      "RedirtyCards",
+      "YoungFreeCSet",
+      "NonYoungFreeCSet"
+      // GCParPhasesSentinel is only used to mark the end of the enum
+      };
+
+  STATIC_ASSERT(ARRAY_SIZE(names) == G1GCPhaseTimes::GCParPhasesSentinel); // the GCParPhases enum and the names array above must have the same length
+
+  return names[phase];
+}
+
 G1EvacPhaseWithTrimTimeTracker::G1EvacPhaseWithTrimTimeTracker(G1ParScanThreadState* pss, Tickspan& total_time, Tickspan& trim_time) :
   _pss(pss),
   _start(Ticks::now()),
@@ -490,7 +532,7 @@
 }
 
 G1GCParPhaseTimesTracker::G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id) :
-  _start_time(), _phase(phase), _phase_times(phase_times), _worker_id(worker_id) {
+  _start_time(), _phase(phase), _phase_times(phase_times), _worker_id(worker_id), _event() {
   if (_phase_times != NULL) {
     _start_time = Ticks::now();
   }
@@ -499,6 +541,7 @@
 G1GCParPhaseTimesTracker::~G1GCParPhaseTimesTracker() {
   if (_phase_times != NULL) {
     _phase_times->record_time_secs(_phase, _worker_id, (Ticks::now() - _start_time).seconds());
+    _event.commit(GCId::current(), _worker_id, G1GCPhaseTimes::phase_name(_phase));
   }
 }
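
All of the new G1 instrumentation, here and in g1CollectedHeap.cpp and g1RemSet.cpp, follows one scoped pattern: construct an EventGCPhaseParallel at the start of the work (capturing the start timestamp), then commit it with the GC id, worker id, and phase name once the work finishes. Schematically, with do_phase_work as a placeholder for the instrumented body:

    {
      EventGCPhaseParallel event;      // start time captured at construction
      do_phase_work();                 // placeholder for the instrumented phase body
      event.commit(GCId::current(), worker_id,
                   G1GCPhaseTimes::phase_name(G1GCPhaseTimes::ObjCopy));
    }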
 
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -27,6 +27,7 @@
 
 #include "gc/shared/referenceProcessorPhaseTimes.hpp"
 #include "gc/shared/weakProcessorPhaseTimes.hpp"
+#include "jfr/jfrEvents.hpp"
 #include "logging/logLevel.hpp"
 #include "memory/allocation.hpp"
 #include "utilities/macros.hpp"
@@ -190,6 +191,7 @@
   G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads);
   void note_gc_start();
   void print();
+  static const char* phase_name(GCParPhases phase);
 
   // record the time a phase took in seconds
   void record_time_secs(GCParPhases phase, uint worker_i, double secs);
@@ -385,6 +387,7 @@
   G1GCPhaseTimes::GCParPhases _phase;
   G1GCPhaseTimes* _phase_times;
   uint _worker_id;
+  EventGCPhaseParallel _event;
 public:
   G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id);
   virtual ~G1GCParPhaseTimesTracker();
--- a/src/hotspot/share/gc/g1/g1RemSet.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/gc/g1/g1RemSet.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -40,6 +40,7 @@
 #include "gc/g1/heapRegionRemSet.hpp"
 #include "gc/shared/gcTraceTime.inline.hpp"
 #include "gc/shared/suspendibleThreadSet.hpp"
+#include "jfr/jfrEvents.hpp"
 #include "memory/iterator.hpp"
 #include "memory/resourceArea.hpp"
 #include "oops/access.inline.hpp"
@@ -339,6 +340,7 @@
 }
 
 void G1ScanRSForRegionClosure::scan_rem_set_roots(HeapRegion* r) {
+  EventGCPhaseParallel event;
   uint const region_idx = r->hrm_index();
 
   if (_scan_state->claim_iter(region_idx)) {
@@ -392,10 +394,13 @@
 
     scan_card(mr, region_idx_for_card);
   }
+  event.commit(GCId::current(), _worker_i, G1GCPhaseTimes::phase_name(G1GCPhaseTimes::ScanRS));
 }
 
 void G1ScanRSForRegionClosure::scan_strong_code_roots(HeapRegion* r) {
+  EventGCPhaseParallel event;
   r->strong_code_roots_do(_pss->closures()->weak_codeblobs());
+  event.commit(GCId::current(), _worker_i, G1GCPhaseTimes::phase_name(G1GCPhaseTimes::CodeRoots));
 }
 
 bool G1ScanRSForRegionClosure::do_heap_region(HeapRegion* r) {
--- a/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -310,7 +310,7 @@
 
 //--------------------------- atomic operations---------------------------------
 
-static void pin_atomic_op(C2AtomicAccess& access) {
+void BarrierSetC2::pin_atomic_op(C2AtomicAccess& access) const {
   if (!access.needs_pinning()) {
     return;
   }
--- a/src/hotspot/share/gc/shared/c2/barrierSetC2.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/gc/shared/c2/barrierSetC2.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -76,14 +76,12 @@
 
 // This class wraps a node and a pointer type.
 class C2AccessValuePtr: public C2AccessValue {
-  int _alias_idx;
 
 public:
   C2AccessValuePtr(Node* node, const TypePtr* type) :
     C2AccessValue(node, reinterpret_cast<const Type*>(type)) {}
 
   const TypePtr* type() const { return reinterpret_cast<const TypePtr*>(_type); }
-  int alias_idx() const       { return _alias_idx; }
 };
 
 // This class wraps a bunch of context parameters that are passed around in the
@@ -175,6 +173,7 @@
                                                 Node* new_val, const Type* value_type) const;
   virtual Node* atomic_xchg_at_resolved(C2AtomicAccess& access, Node* new_val, const Type* val_type) const;
   virtual Node* atomic_add_at_resolved(C2AtomicAccess& access, Node* new_val, const Type* val_type) const;
+  void pin_atomic_op(C2AtomicAccess& access) const;
 
 public:
   // This is the entry-point for the backend to perform accesses through the Access API.
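
With pin_atomic_op promoted from a file-static helper in barrierSetC2.cpp to a BarrierSetC2 member, GC-specific barrier sets that override the atomic hooks can reuse it. A hedged sketch of such a subclass — MyBarrierSetC2 and its body are illustrative, not HotSpot code:

    class MyBarrierSetC2 : public BarrierSetC2 {
    protected:
      virtual Node* atomic_xchg_at_resolved(C2AtomicAccess& access,
                                            Node* new_val, const Type* val_type) const {
        pin_atomic_op(access);  // previously unreachable outside barrierSetC2.cpp
        return BarrierSetC2::atomic_xchg_at_resolved(access, new_val, val_type);
      }
    };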
--- a/src/hotspot/share/jfr/metadata/metadata.xml	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/jfr/metadata/metadata.xml	Thu Sep 27 10:49:10 2018 -0700
@@ -435,6 +435,13 @@
     <Field type="string" name="name" label="Name" />
   </Event>
 
+  <Event name="GCPhaseParallel" category="Java Virtual Machine, GC, Phases" label="GC Phase Parallel"
+         startTime="true" thread="true" description="GC phases for parallel workers">
+    <Field type="uint" name="gcId" label="GC Identifier" relation="GcId"/>
+    <Field type="uint" name="gcWorkerId" label="GC Worker Identifier" />
+    <Field type="string" name="name" label="Name" />
+  </Event>
+
   <Event name="AllocationRequiringGC" category="Java Virtual Machine, GC, Detailed" label="Allocation Requiring GC" thread="true" stackTrace="true"
     startTime="false">
     <Field type="uint" name="gcId" label="Pending GC Identifier" relation="GcId" />
--- a/src/hotspot/share/oops/instanceKlass.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/oops/instanceKlass.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -2417,7 +2417,10 @@
 }
 #endif
 
-void InstanceKlass::notify_unload_class(InstanceKlass* ik) {
+void InstanceKlass::unload_class(InstanceKlass* ik) {
+  // Release dependencies.
+  ik->dependencies().remove_all_dependents();
+
   // notify the debugger
   if (JvmtiExport::should_post_class_unload()) {
     JvmtiExport::post_class_unload(ik);
@@ -2462,16 +2465,8 @@
     FreeHeap(jmeths);
   }
 
-  // Release dependencies.
-  // It is desirable to use DC::remove_all_dependents() here, but, unfortunately,
-  // it is not safe (see JDK-8143408). The problem is that the klass dependency
-  // context can contain live dependencies, since there's a race between nmethod &
-  // klass unloading. If the klass is dead when nmethod unloading happens, relevant
-  // dependencies aren't removed from the context associated with the class (see
-  // nmethod::flush_dependencies). It ends up during klass unloading as seemingly
-  // live dependencies pointing to unloaded nmethods and causes a crash in
-  // DC::remove_all_dependents() when it touches unloaded nmethod.
-  dependencies().wipe();
+  assert(_dep_context == DependencyContext::EMPTY,
+         "dependencies should already be cleaned");
 
 #if INCLUDE_JVMTI
   // Deallocate breakpoint records
--- a/src/hotspot/share/oops/instanceKlass.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/oops/instanceKlass.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -1180,7 +1180,7 @@
   bool on_stack() const { return _constants->on_stack(); }
 
   // callbacks for actions during class unloading
-  static void notify_unload_class(InstanceKlass* ik);
+  static void unload_class(InstanceKlass* ik);
   static void release_C_heap_structures(InstanceKlass* ik);
 
   // Naming
--- a/src/hotspot/share/opto/graphKit.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/opto/graphKit.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -605,7 +605,7 @@
 
       Node *adr = basic_plus_adr(ex_node, ex_node, offset);
       const TypeOopPtr* val_type = TypeOopPtr::make_from_klass(env()->String_klass());
-      Node *store = access_store_at(control(), ex_node, adr, adr_typ, null(), val_type, T_OBJECT, IN_HEAP);
+      Node *store = access_store_at(ex_node, adr, adr_typ, null(), val_type, T_OBJECT, IN_HEAP);
 
       add_exception_state(make_exception_state(ex_node));
       return;
@@ -1544,8 +1544,7 @@
   return st;
 }
 
-Node* GraphKit::access_store_at(Node* ctl,
-                                Node* obj,
+Node* GraphKit::access_store_at(Node* obj,
                                 Node* adr,
                                 const TypePtr* adr_type,
                                 Node* val,
@@ -1559,7 +1558,6 @@
     val = _gvn.makecon(TypePtr::NULL_PTR);
   }
 
-  set_control(ctl);
   if (stopped()) {
     return top(); // Dead path ?
   }
@@ -1612,8 +1610,7 @@
   }
 }
 
-Node* GraphKit::access_atomic_cmpxchg_val_at(Node* ctl,
-                                             Node* obj,
+Node* GraphKit::access_atomic_cmpxchg_val_at(Node* obj,
                                              Node* adr,
                                              const TypePtr* adr_type,
                                              int alias_idx,
@@ -1622,7 +1619,6 @@
                                              const Type* value_type,
                                              BasicType bt,
                                              DecoratorSet decorators) {
-  set_control(ctl);
   C2AccessValuePtr addr(adr, adr_type);
   C2AtomicAccess access(this, decorators | C2_READ_ACCESS | C2_WRITE_ACCESS,
                         bt, obj, addr, alias_idx);
@@ -1633,8 +1629,7 @@
   }
 }
 
-Node* GraphKit::access_atomic_cmpxchg_bool_at(Node* ctl,
-                                              Node* obj,
+Node* GraphKit::access_atomic_cmpxchg_bool_at(Node* obj,
                                               Node* adr,
                                               const TypePtr* adr_type,
                                               int alias_idx,
@@ -1643,7 +1638,6 @@
                                               const Type* value_type,
                                               BasicType bt,
                                               DecoratorSet decorators) {
-  set_control(ctl);
   C2AccessValuePtr addr(adr, adr_type);
   C2AtomicAccess access(this, decorators | C2_READ_ACCESS | C2_WRITE_ACCESS,
                         bt, obj, addr, alias_idx);
@@ -1654,8 +1648,7 @@
   }
 }
 
-Node* GraphKit::access_atomic_xchg_at(Node* ctl,
-                                      Node* obj,
+Node* GraphKit::access_atomic_xchg_at(Node* obj,
                                       Node* adr,
                                       const TypePtr* adr_type,
                                       int alias_idx,
@@ -1663,7 +1656,6 @@
                                       const Type* value_type,
                                       BasicType bt,
                                       DecoratorSet decorators) {
-  set_control(ctl);
   C2AccessValuePtr addr(adr, adr_type);
   C2AtomicAccess access(this, decorators | C2_READ_ACCESS | C2_WRITE_ACCESS,
                         bt, obj, addr, alias_idx);
@@ -1674,8 +1666,7 @@
   }
 }
 
-Node* GraphKit::access_atomic_add_at(Node* ctl,
-                                     Node* obj,
+Node* GraphKit::access_atomic_add_at(Node* obj,
                                      Node* adr,
                                      const TypePtr* adr_type,
                                      int alias_idx,
@@ -1683,7 +1674,6 @@
                                      const Type* value_type,
                                      BasicType bt,
                                      DecoratorSet decorators) {
-  set_control(ctl);
   C2AccessValuePtr addr(adr, adr_type);
   C2AtomicAccess access(this, decorators | C2_READ_ACCESS | C2_WRITE_ACCESS, bt, obj, addr, alias_idx);
   if (access.is_raw()) {
@@ -1693,8 +1683,7 @@
   }
 }
 
-void GraphKit::access_clone(Node* ctl, Node* src, Node* dst, Node* size, bool is_array) {
-  set_control(ctl);
+void GraphKit::access_clone(Node* src, Node* dst, Node* size, bool is_array) {
   return _barrier_set->clone(this, src, dst, size, is_array);
 }
 
@@ -3849,14 +3838,14 @@
   sync_kit(ideal);
 }
 
-Node* GraphKit::load_String_length(Node* ctrl, Node* str) {
-  Node* len = load_array_length(load_String_value(ctrl, str));
-  Node* coder = load_String_coder(ctrl, str);
+Node* GraphKit::load_String_length(Node* str, bool set_ctrl) {
+  Node* len = load_array_length(load_String_value(str, set_ctrl));
+  Node* coder = load_String_coder(str, set_ctrl);
   // Divide length by 2 if coder is UTF16
   return _gvn.transform(new RShiftINode(len, coder));
 }
 
-Node* GraphKit::load_String_value(Node* ctrl, Node* str) {
+Node* GraphKit::load_String_value(Node* str, bool set_ctrl) {
   int value_offset = java_lang_String::value_offset_in_bytes();
   const TypeInstPtr* string_type = TypeInstPtr::make(TypePtr::NotNull, C->env()->String_klass(),
                                                      false, NULL, 0);
@@ -3866,7 +3855,7 @@
                                                   ciTypeArrayKlass::make(T_BYTE), true, 0);
   Node* p = basic_plus_adr(str, str, value_offset);
   Node* load = access_load_at(str, p, value_field_type, value_type, T_OBJECT,
-                              IN_HEAP | C2_CONTROL_DEPENDENT_LOAD);
+                              IN_HEAP | (set_ctrl ? C2_CONTROL_DEPENDENT_LOAD : 0) | MO_UNORDERED);
   // String.value field is known to be @Stable.
   if (UseImplicitStableValues) {
     load = cast_array_to_stable(load, value_type);
@@ -3874,7 +3863,7 @@
   return load;
 }
 
-Node* GraphKit::load_String_coder(Node* ctrl, Node* str) {
+Node* GraphKit::load_String_coder(Node* str, bool set_ctrl) {
   if (!CompactStrings) {
     return intcon(java_lang_String::CODER_UTF16);
   }
@@ -3883,27 +3872,31 @@
                                                      false, NULL, 0);
   const TypePtr* coder_field_type = string_type->add_offset(coder_offset);
   int coder_field_idx = C->get_alias_index(coder_field_type);
-  return make_load(ctrl, basic_plus_adr(str, str, coder_offset),
-                   TypeInt::BYTE, T_BYTE, coder_field_idx, MemNode::unordered);
+
+  Node* p = basic_plus_adr(str, str, coder_offset);
+  Node* load = access_load_at(str, p, coder_field_type, TypeInt::BYTE, T_BYTE,
+                              IN_HEAP | (set_ctrl ? C2_CONTROL_DEPENDENT_LOAD : 0) | MO_UNORDERED);
+  return load;
 }
 
-void GraphKit::store_String_value(Node* ctrl, Node* str, Node* value) {
+void GraphKit::store_String_value(Node* str, Node* value) {
   int value_offset = java_lang_String::value_offset_in_bytes();
   const TypeInstPtr* string_type = TypeInstPtr::make(TypePtr::NotNull, C->env()->String_klass(),
                                                      false, NULL, 0);
   const TypePtr* value_field_type = string_type->add_offset(value_offset);
-  access_store_at(ctrl, str,  basic_plus_adr(str, value_offset), value_field_type,
-                  value, TypeAryPtr::BYTES, T_OBJECT, IN_HEAP);
+
+  access_store_at(str,  basic_plus_adr(str, value_offset), value_field_type,
+                  value, TypeAryPtr::BYTES, T_OBJECT, IN_HEAP | MO_UNORDERED);
 }
 
-void GraphKit::store_String_coder(Node* ctrl, Node* str, Node* value) {
+void GraphKit::store_String_coder(Node* str, Node* value) {
   int coder_offset = java_lang_String::coder_offset_in_bytes();
   const TypeInstPtr* string_type = TypeInstPtr::make(TypePtr::NotNull, C->env()->String_klass(),
                                                      false, NULL, 0);
   const TypePtr* coder_field_type = string_type->add_offset(coder_offset);
-  int coder_field_idx = C->get_alias_index(coder_field_type);
-  store_to_memory(ctrl, basic_plus_adr(str, coder_offset),
-                  value, T_BYTE, coder_field_idx, MemNode::unordered);
+
+  access_store_at(str, basic_plus_adr(str, coder_offset), coder_field_type,
+                  value, TypeInt::BYTE, T_BYTE, IN_HEAP | MO_UNORDERED);
 }
 
 // Capture src and dst memory state with a MergeMemNode
--- a/src/hotspot/share/opto/graphKit.hpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/opto/graphKit.hpp	Thu Sep 27 10:49:10 2018 -0700
@@ -572,8 +572,7 @@
 
   // Perform decorated accesses
 
-  Node* access_store_at(Node* ctl,
-                        Node* obj,   // containing obj
+  Node* access_store_at(Node* obj,   // containing obj
                         Node* adr,   // actual address to store val at
                         const TypePtr* adr_type,
                         Node* val,
@@ -593,8 +592,7 @@
                     BasicType bt,
                     DecoratorSet decorators);
 
-  Node* access_atomic_cmpxchg_val_at(Node* ctl,
-                                     Node* obj,
+  Node* access_atomic_cmpxchg_val_at(Node* obj,
                                      Node* adr,
                                      const TypePtr* adr_type,
                                      int alias_idx,
@@ -604,8 +602,7 @@
                                      BasicType bt,
                                      DecoratorSet decorators);
 
-  Node* access_atomic_cmpxchg_bool_at(Node* ctl,
-                                      Node* obj,
+  Node* access_atomic_cmpxchg_bool_at(Node* obj,
                                       Node* adr,
                                       const TypePtr* adr_type,
                                       int alias_idx,
@@ -615,8 +612,7 @@
                                       BasicType bt,
                                       DecoratorSet decorators);
 
-  Node* access_atomic_xchg_at(Node* ctl,
-                              Node* obj,
+  Node* access_atomic_xchg_at(Node* obj,
                               Node* adr,
                               const TypePtr* adr_type,
                               int alias_idx,
@@ -625,8 +621,7 @@
                               BasicType bt,
                               DecoratorSet decorators);
 
-  Node* access_atomic_add_at(Node* ctl,
-                             Node* obj,
+  Node* access_atomic_add_at(Node* obj,
                              Node* adr,
                              const TypePtr* adr_type,
                              int alias_idx,
@@ -635,7 +630,7 @@
                              BasicType bt,
                              DecoratorSet decorators);
 
-  void access_clone(Node* ctl, Node* src, Node* dst, Node* size, bool is_array);
+  void access_clone(Node* src, Node* dst, Node* size, bool is_array);
 
   Node* access_resolve(Node* n, DecoratorSet decorators);
 
@@ -849,11 +844,11 @@
                   bool deoptimize_on_exception = false);
 
   // java.lang.String helpers
-  Node* load_String_length(Node* ctrl, Node* str);
-  Node* load_String_value(Node* ctrl, Node* str);
-  Node* load_String_coder(Node* ctrl, Node* str);
-  void store_String_value(Node* ctrl, Node* str, Node* value);
-  void store_String_coder(Node* ctrl, Node* str, Node* value);
+  Node* load_String_length(Node* str, bool set_ctrl);
+  Node* load_String_value(Node* str, bool set_ctrl);
+  Node* load_String_coder(Node* str, bool set_ctrl);
+  void store_String_value(Node* str, Node* value);
+  void store_String_coder(Node* str, Node* value);
   Node* capture_memory(const TypePtr* src_type, const TypePtr* dst_type);
   Node* compress_string(Node* src, const TypeAryPtr* src_type, Node* dst, Node* count);
   void inflate_string(Node* src, Node* dst, const TypeAryPtr* dst_type, Node* count);
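
The effect on GraphKit clients is mechanical: the explicit control argument disappears (the kit's current control is used implicitly), and the String loaders take a set_ctrl flag that decides whether the load gets C2_CONTROL_DEPENDENT_LOAD. Before and after, schematically:

    // before
    Node* st = kit.access_store_at(kit.control(), obj, adr, adr_type, val,
                                   val_type, T_OBJECT, IN_HEAP);
    Node* v  = kit.load_String_value(kit.control(), str);
    // after
    Node* st2 = kit.access_store_at(obj, adr, adr_type, val,
                                    val_type, T_OBJECT, IN_HEAP);
    Node* v2  = kit.load_String_value(str, /*set_ctrl=*/true);  // control-dependent load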
--- a/src/hotspot/share/opto/library_call.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/opto/library_call.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -543,10 +543,7 @@
 
   case vmIntrinsics::_notify:
   case vmIntrinsics::_notifyAll:
-    if (ObjectMonitor::Knob_InlineNotify) {
-      return inline_notify(intrinsic_id());
-    }
-    return false;
+    return inline_notify(intrinsic_id());
 
   case vmIntrinsics::_addExactI:                return inline_math_addExactI(false /* add */);
   case vmIntrinsics::_addExactL:                return inline_math_addExactL(false /* add */);
@@ -1761,11 +1758,9 @@
     return false;
   }
   if (is_store) {
-    (void) store_to_memory(control(), adr, ch, T_CHAR, TypeAryPtr::BYTES, MemNode::unordered,
-                           false, false, true /* mismatched */);
+    access_store_at(value, adr, TypeAryPtr::BYTES, ch, TypeInt::CHAR, T_CHAR, IN_HEAP | MO_UNORDERED | C2_MISMATCHED);
   } else {
-    ch = make_load(control(), adr, TypeInt::CHAR, T_CHAR, TypeAryPtr::BYTES, MemNode::unordered,
-                   LoadNode::DependsOnlyOnTest, false, false, true /* mismatched */);
+    ch = access_load_at(value, adr, TypeAryPtr::BYTES, TypeInt::CHAR, T_CHAR, IN_HEAP | MO_UNORDERED | C2_MISMATCHED | C2_CONTROL_DEPENDENT_LOAD);
     set_result(ch);
   }
   return true;
@@ -2515,7 +2510,7 @@
       val = ConvL2X(val);
       val = gvn().transform(new CastX2PNode(val));
     }
-    access_store_at(control(), heap_base_oop, adr, adr_type, val, value_type, type, decorators);
+    access_store_at(heap_base_oop, adr, adr_type, val, value_type, type, decorators);
   }
 
   return true;
@@ -2734,24 +2729,24 @@
   Node* result = NULL;
   switch (kind) {
     case LS_cmp_exchange: {
-      result = access_atomic_cmpxchg_val_at(control(), base, adr, adr_type, alias_idx,
+      result = access_atomic_cmpxchg_val_at(base, adr, adr_type, alias_idx,
                                             oldval, newval, value_type, type, decorators);
       break;
     }
     case LS_cmp_swap_weak:
       decorators |= C2_WEAK_CMPXCHG;
     case LS_cmp_swap: {
-      result = access_atomic_cmpxchg_bool_at(control(), base, adr, adr_type, alias_idx,
+      result = access_atomic_cmpxchg_bool_at(base, adr, adr_type, alias_idx,
                                              oldval, newval, value_type, type, decorators);
       break;
     }
     case LS_get_set: {
-      result = access_atomic_xchg_at(control(), base, adr, adr_type, alias_idx,
+      result = access_atomic_xchg_at(base, adr, adr_type, alias_idx,
                                      newval, value_type, type, decorators);
       break;
     }
     case LS_get_add: {
-      result = access_atomic_add_at(control(), base, adr, adr_type, alias_idx,
+      result = access_atomic_add_at(base, adr, adr_type, alias_idx,
                                     newval, value_type, type, decorators);
       break;
     }
@@ -4235,7 +4230,7 @@
   // TODO: generate fields copies for small objects instead.
   Node* size = _gvn.transform(obj_size);
 
-  access_clone(control(), obj, alloc_obj, size, is_array);
+  access_clone(obj, alloc_obj, size, is_array);
 
   // Do not let reads from the cloned object float above the arraycopy.
   if (alloc != NULL) {
--- a/src/hotspot/share/opto/parse2.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/opto/parse2.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -104,7 +104,7 @@
 
   const TypeAryPtr* adr_type = TypeAryPtr::get_array_body_type(bt);
 
-  access_store_at(control(), array, adr, adr_type, val, elemtype, bt, MO_UNORDERED | IN_HEAP | IS_ARRAY);
+  access_store_at(array, adr, adr_type, val, elemtype, bt, MO_UNORDERED | IN_HEAP | IS_ARRAY);
 }
 
 
--- a/src/hotspot/share/opto/parse3.cpp	Wed Sep 26 18:36:55 2018 +0100
+++ b/src/hotspot/share/opto/parse3.cpp	Thu Sep 27 10:49:10 2018 -0700
@@ -264,7 +264,7 @@
       field_type = Type::BOTTOM;
     }
   }
-  access_store_at(control(), obj, adr, adr_type, val, field_type, bt, decorators);
+  access_store_at(obj, adr, adr_type, val, field_type, bt, decorators);
 
   if (is_field) {
     // Remember we wrote a volatile field.
@@ -351,7 +351,7 @@
       Node*