changeset 10854:ead1912d85f0

Merge
author jwilhelm
date Mon, 11 Apr 2016 20:38:50 +0000
parents 9158cd01b17b ba6ca5d9a5d0
children d878c100730a
files test/compiler/dependencies/MonomorphicObjectCall/java/lang/Object.java
diffstat 70 files changed, 2151 insertions(+), 1346 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/aarch64/vm/aarch64.ad	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/aarch64/vm/aarch64.ad	Mon Apr 11 20:38:50 2016 +0000
@@ -14242,6 +14242,48 @@
   ins_pipe(pipe_cmp_branch);
 %}
 
+instruct cmpUI_imm0_branch(cmpOpU cmp, iRegIorL2I op1, immI0 op2, label labl, rFlagsRegU cr) %{
+  match(If cmp (CmpU op1 op2));
+  predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
+            || n->in(1)->as_Bool()->_test._test == BoolTest::eq
+            || n->in(1)->as_Bool()->_test._test == BoolTest::gt
+            ||  n->in(1)->as_Bool()->_test._test == BoolTest::le);
+  effect(USE labl);
+
+  ins_cost(BRANCH_COST);
+  format %{ "cbw$cmp   $op1, $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
+    if (cond == Assembler::EQ || cond == Assembler::LS)
+      __ cbzw($op1$$Register, *L);
+    else
+      __ cbnzw($op1$$Register, *L);
+  %}
+  ins_pipe(pipe_cmp_branch);
+%}
+
+instruct cmpUL_imm0_branch(cmpOpU cmp, iRegL op1, immL0 op2, label labl, rFlagsRegU cr) %{
+  match(If cmp (CmpU op1 op2));
+  predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
+            || n->in(1)->as_Bool()->_test._test == BoolTest::eq
+            || n->in(1)->as_Bool()->_test._test == BoolTest::gt
+            || n->in(1)->as_Bool()->_test._test == BoolTest::le);
+  effect(USE labl);
+
+  ins_cost(BRANCH_COST);
+  format %{ "cb$cmp   $op1, $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
+    if (cond == Assembler::EQ || cond == Assembler::LS)
+      __ cbz($op1$$Register, *L);
+    else
+      __ cbnz($op1$$Register, *L);
+  %}
+  ins_pipe(pipe_cmp_branch);
+%}
+
 // Test bit and Branch
 
 // Patterns for short (< 32KiB) variants
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1221,6 +1221,38 @@
   INSN(caspal,  true,  true)
 #undef INSN
 
+  // 8.1 Atomic operations
+  void lse_atomic(Register Rs, Register Rt, Register Rn,
+                  enum operand_size sz, int op1, int op2, bool a, bool r) {
+    starti;
+    f(sz, 31, 30), f(0b111000, 29, 24), f(a, 23), f(r, 22), f(1, 21);
+    rf(Rs, 16), f(op1, 15), f(op2, 14, 12), f(0, 11, 10), rf(Rn, 5), zrf(Rt, 0);
+  }
+
+#define INSN(NAME, NAME_A, NAME_L, NAME_AL, op1, op2)                   \
+  void NAME(operand_size sz, Register Rs, Register Rt, Register Rn) {   \
+    lse_atomic(Rs, Rt, Rn, sz, op1, op2, false, false);                 \
+  }                                                                     \
+  void NAME_A(operand_size sz, Register Rs, Register Rt, Register Rn) { \
+    lse_atomic(Rs, Rt, Rn, sz, op1, op2, true, false);                  \
+  }                                                                     \
+  void NAME_L(operand_size sz, Register Rs, Register Rt, Register Rn) { \
+    lse_atomic(Rs, Rt, Rn, sz, op1, op2, false, true);                  \
+  }                                                                     \
+  void NAME_AL(operand_size sz, Register Rs, Register Rt, Register Rn) {\
+    lse_atomic(Rs, Rt, Rn, sz, op1, op2, true, true);                   \
+  }
+  INSN(ldadd,  ldadda,  ldaddl,  ldaddal,  0, 0b000);
+  INSN(ldbic,  ldbica,  ldbicl,  ldbical,  0, 0b001);
+  INSN(ldeor,  ldeora,  ldeorl,  ldeoral,  0, 0b010);
+  INSN(ldorr,  ldorra,  ldorrl,  ldorral,  0, 0b011);
+  INSN(ldsmax, ldsmaxa, ldsmaxl, ldsmaxal, 0, 0b100);
+  INSN(ldsmin, ldsmina, ldsminl, ldsminal, 0, 0b101);
+  INSN(ldumax, ldumaxa, ldumaxl, ldumaxal, 0, 0b110);
+  INSN(ldumin, ldumina, lduminl, lduminal, 0, 0b111);
+  INSN(swp,    swpa,    swpl,    swpal,    1, 0b000);
+#undef INSN
+
   // Load register (literal)
 #define INSN(NAME, opc, V)                                              \
   void NAME(Register Rt, address dest) {                                \
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1556,54 +1556,14 @@
 }
 
 void LIR_Assembler::casw(Register addr, Register newval, Register cmpval) {
-  if (UseLSE) {
-    __ mov(rscratch1, cmpval);
-    __ casal(Assembler::word, rscratch1, newval, addr);
-    __ cmpw(rscratch1, cmpval);
-    __ cset(rscratch1, Assembler::NE);
-  } else {
-    Label retry_load, nope;
-    // flush and load exclusive from the memory location
-    // and fail if it is not what we expect
-    __ prfm(Address(addr), PSTL1STRM);
-    __ bind(retry_load);
-    __ ldaxrw(rscratch1, addr);
-    __ cmpw(rscratch1, cmpval);
-    __ cset(rscratch1, Assembler::NE);
-    __ br(Assembler::NE, nope);
-    // if we store+flush with no intervening write rscratch1 wil be zero
-    __ stlxrw(rscratch1, newval, addr);
-    // retry so we only ever return after a load fails to compare
-    // ensures we don't return a stale value after a failed write.
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(nope);
-  }
+  __ cmpxchg(addr, cmpval, newval, Assembler::word, /* acquire*/ true, /* release*/ true, rscratch1);
+  __ cset(rscratch1, Assembler::NE);
   __ membar(__ AnyAny);
 }
 
 void LIR_Assembler::casl(Register addr, Register newval, Register cmpval) {
-  if (UseLSE) {
-    __ mov(rscratch1, cmpval);
-    __ casal(Assembler::xword, rscratch1, newval, addr);
-    __ cmp(rscratch1, cmpval);
-    __ cset(rscratch1, Assembler::NE);
-  } else {
-    Label retry_load, nope;
-    // flush and load exclusive from the memory location
-    // and fail if it is not what we expect
-    __ prfm(Address(addr), PSTL1STRM);
-    __ bind(retry_load);
-    __ ldaxr(rscratch1, addr);
-    __ cmp(rscratch1, cmpval);
-    __ cset(rscratch1, Assembler::NE);
-    __ br(Assembler::NE, nope);
-    // if we store+flush with no intervening write rscratch1 wil be zero
-    __ stlxr(rscratch1, newval, addr);
-    // retry so we only ever return after a load fails to compare
-    // ensures we don't return a stale value after a failed write.
-    __ cbnz(rscratch1, retry_load);
-    __ bind(nope);
-  }
+  __ cmpxchg(addr, cmpval, newval, Assembler::xword, /* acquire*/ true, /* release*/ true, rscratch1);
+  __ cset(rscratch1, Assembler::NE);
   __ membar(__ AnyAny);
 }
 
@@ -3121,38 +3081,32 @@
   BasicType type = src->type();
   bool is_oop = type == T_OBJECT || type == T_ARRAY;
 
-  void (MacroAssembler::* lda)(Register Rd, Register Ra);
-  void (MacroAssembler::* add)(Register Rd, Register Rn, RegisterOrConstant increment);
-  void (MacroAssembler::* stl)(Register Rs, Register Rt, Register Rn);
+  void (MacroAssembler::* add)(Register prev, RegisterOrConstant incr, Register addr);
+  void (MacroAssembler::* xchg)(Register prev, Register newv, Register addr);
 
   switch(type) {
   case T_INT:
-    lda = &MacroAssembler::ldaxrw;
-    add = &MacroAssembler::addw;
-    stl = &MacroAssembler::stlxrw;
+    xchg = &MacroAssembler::atomic_xchgalw;
+    add = &MacroAssembler::atomic_addalw;
     break;
   case T_LONG:
-    lda = &MacroAssembler::ldaxr;
-    add = &MacroAssembler::add;
-    stl = &MacroAssembler::stlxr;
+    xchg = &MacroAssembler::atomic_xchgal;
+    add = &MacroAssembler::atomic_addal;
     break;
   case T_OBJECT:
   case T_ARRAY:
     if (UseCompressedOops) {
-      lda = &MacroAssembler::ldaxrw;
-      add = &MacroAssembler::addw;
-      stl = &MacroAssembler::stlxrw;
+      xchg = &MacroAssembler::atomic_xchgalw;
+      add = &MacroAssembler::atomic_addalw;
     } else {
-      lda = &MacroAssembler::ldaxr;
-      add = &MacroAssembler::add;
-      stl = &MacroAssembler::stlxr;
+      xchg = &MacroAssembler::atomic_xchgal;
+      add = &MacroAssembler::atomic_addal;
     }
     break;
   default:
     ShouldNotReachHere();
-    lda = &MacroAssembler::ldaxr;
-    add = &MacroAssembler::add;
-    stl = &MacroAssembler::stlxr;  // unreachable
+    xchg = &MacroAssembler::atomic_xchgal;
+    add = &MacroAssembler::atomic_addal; // unreachable
   }
 
   switch (code) {
@@ -3170,14 +3124,8 @@
         assert_different_registers(inc.as_register(), dst, addr.base(), tmp,
                                    rscratch1, rscratch2);
       }
-      Label again;
       __ lea(tmp, addr);
-      __ prfm(Address(tmp), PSTL1STRM);
-      __ bind(again);
-      (_masm->*lda)(dst, tmp);
-      (_masm->*add)(rscratch1, dst, inc);
-      (_masm->*stl)(rscratch2, rscratch1, tmp);
-      __ cbnzw(rscratch2, again);
+      (_masm->*add)(dst, inc, tmp);
       break;
     }
   case lir_xchg:
@@ -3186,17 +3134,12 @@
       Register obj = as_reg(data);
       Register dst = as_reg(dest);
       if (is_oop && UseCompressedOops) {
-        __ encode_heap_oop(rscratch1, obj);
-        obj = rscratch1;
+        __ encode_heap_oop(rscratch2, obj);
+        obj = rscratch2;
       }
-      assert_different_registers(obj, addr.base(), tmp, rscratch2, dst);
-      Label again;
+      assert_different_registers(obj, addr.base(), tmp, rscratch1, dst);
       __ lea(tmp, addr);
-      __ prfm(Address(tmp), PSTL1STRM);
-      __ bind(again);
-      (_masm->*lda)(dst, tmp);
-      (_masm->*stl)(rscratch2, obj, tmp);
-      __ cbnzw(rscratch2, again);
+      (_masm->*xchg)(dst, obj, tmp);
       if (is_oop && UseCompressedOops) {
         __ decode_heap_oop(dst);
       }
--- a/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -55,6 +55,7 @@
 define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit,              60);
 define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(intx, PostLoopMultiversioning,      false);
 // InitialCodeCacheSize derived from specjbb2000 run.
 define_pd_global(intx, InitialCodeCacheSize,         2496*K); // Integral multiple of CodeCacheExpansionSize
 define_pd_global(intx, CodeCacheExpansionSize,       64*K);
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1637,6 +1637,11 @@
 }
 
 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
+  if (UseLSE) {
+    mov(tmp, 1);
+    ldadd(Assembler::word, tmp, zr, counter_addr);
+    return;
+  }
   Label retry_load;
   prfm(Address(counter_addr), PSTL1STRM);
   bind(retry_load);
@@ -2172,8 +2177,18 @@
     return a != b.as_register() && a != c && b.as_register() != c;
 }
 
-#define ATOMIC_OP(LDXR, OP, IOP, STXR)                                       \
-void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
+#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
+void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
+  if (UseLSE) {                                                         \
+    prev = prev->is_valid() ? prev : zr;                                \
+    if (incr.is_register()) {                                           \
+      AOP(sz, incr.as_register(), prev, addr);                          \
+    } else {                                                            \
+      mov(rscratch2, incr.as_constant());                               \
+      AOP(sz, rscratch2, prev, addr);                                   \
+    }                                                                   \
+    return;                                                             \
+  }                                                                     \
   Register result = rscratch2;                                          \
   if (prev->is_valid())                                                 \
     result = different(prev, incr, addr) ? prev : rscratch2;            \
@@ -2190,13 +2205,20 @@
   }                                                                     \
 }
 
-ATOMIC_OP(ldxr, add, sub, stxr)
-ATOMIC_OP(ldxrw, addw, subw, stxrw)
+ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
+ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
+ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
+ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
 
 #undef ATOMIC_OP
 
-#define ATOMIC_XCHG(OP, LDXR, STXR)                                     \
+#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
+  if (UseLSE) {                                                         \
+    prev = prev->is_valid() ? prev : zr;                                \
+    AOP(sz, newv, prev, addr);                                          \
+    return;                                                             \
+  }                                                                     \
   Register result = rscratch2;                                          \
   if (prev->is_valid())                                                 \
     result = different(prev, newv, addr) ? prev : rscratch2;            \
@@ -2211,8 +2233,10 @@
     mov(prev, result);                                                  \
 }
 
-ATOMIC_XCHG(xchg, ldxr, stxr)
-ATOMIC_XCHG(xchgw, ldxrw, stxrw)
+ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
+ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
+ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
+ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
 
 #undef ATOMIC_XCHG
 
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -957,9 +957,13 @@
 
   void atomic_add(Register prev, RegisterOrConstant incr, Register addr);
   void atomic_addw(Register prev, RegisterOrConstant incr, Register addr);
+  void atomic_addal(Register prev, RegisterOrConstant incr, Register addr);
+  void atomic_addalw(Register prev, RegisterOrConstant incr, Register addr);
 
   void atomic_xchg(Register prev, Register newv, Register addr);
   void atomic_xchgw(Register prev, Register newv, Register addr);
+  void atomic_xchgal(Register prev, Register newv, Register addr);
+  void atomic_xchgalw(Register prev, Register newv, Register addr);
 
   void orptr(Address adr, RegisterOrConstant src) {
     ldr(rscratch2, adr);
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1711,20 +1711,42 @@
   // to a long, int, short, or byte copy loop.
   //
   address generate_unsafe_copy(const char *name,
-                               address byte_copy_entry) {
-#ifdef PRODUCT
-    return StubRoutines::_jbyte_arraycopy;
-#else
+                               address byte_copy_entry,
+                               address short_copy_entry,
+                               address int_copy_entry,
+                               address long_copy_entry) {
+    Label L_long_aligned, L_int_aligned, L_short_aligned;
+    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
+
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
     __ enter(); // required for proper stackwalking of RuntimeStub frame
+
     // bump this on entry, not on exit:
-    __ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr));
-    __ incrementw(Address(rscratch2));
+    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
+
+    __ orr(rscratch1, s, d);
+    __ orr(rscratch1, rscratch1, count);
+
+    __ andr(rscratch1, rscratch1, BytesPerLong-1);
+    __ cbz(rscratch1, L_long_aligned);
+    __ andr(rscratch1, rscratch1, BytesPerInt-1);
+    __ cbz(rscratch1, L_int_aligned);
+    __ tbz(rscratch1, 0, L_short_aligned);
     __ b(RuntimeAddress(byte_copy_entry));
+
+    __ BIND(L_short_aligned);
+    __ lsr(count, count, LogBytesPerShort);  // size => short_count
+    __ b(RuntimeAddress(short_copy_entry));
+    __ BIND(L_int_aligned);
+    __ lsr(count, count, LogBytesPerInt);    // size => int_count
+    __ b(RuntimeAddress(int_copy_entry));
+    __ BIND(L_long_aligned);
+    __ lsr(count, count, LogBytesPerLong);   // size => long_count
+    __ b(RuntimeAddress(long_copy_entry));
+
     return start;
-#endif
   }
 
   //
@@ -2090,7 +2112,10 @@
                                                                         /*dest_uninitialized*/true);
 
     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
-                                                              entry_jbyte_arraycopy);
+                                                              entry_jbyte_arraycopy,
+                                                              entry_jshort_arraycopy,
+                                                              entry_jint_arraycopy,
+                                                              entry_jlong_arraycopy);
 
     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                                entry_jbyte_arraycopy,
--- a/src/cpu/aarch64/vm/templateInterpreterGenerator_aarch64.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/aarch64/vm/templateInterpreterGenerator_aarch64.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1983,14 +1983,8 @@
   __ push(rscratch1);
   __ push(rscratch2);
   __ push(rscratch3);
-  Label L;
-  __ mov(rscratch2, (address) &BytecodeCounter::_counter_value);
-  __ prfm(Address(rscratch2), PSTL1STRM);
-  __ bind(L);
-  __ ldxr(rscratch1, rscratch2);
-  __ add(rscratch1, rscratch1, 1);
-  __ stxr(rscratch3, rscratch1, rscratch2);
-  __ cbnzw(rscratch3, L);
+  __ mov(rscratch3, (address) &BytecodeCounter::_counter_value);
+  __ atomic_add(noreg, 1, rscratch3);
   __ pop(rscratch3);
   __ pop(rscratch2);
   __ pop(rscratch1);
--- a/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -73,6 +73,7 @@
     CPU_SHA1         = (1<<5),
     CPU_SHA2         = (1<<6),
     CPU_CRC32        = (1<<7),
+    CPU_LSE          = (1<<8),
     CPU_A53MAC       = (1 << 30),
     CPU_DMB_ATOMICS  = (1 << 31),
   };
--- a/src/cpu/ppc/vm/assembler_ppc.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -624,6 +624,7 @@
     VNOR_OPCODE    = (4u  << OPCODE_SHIFT | 1284u     ),
     VOR_OPCODE     = (4u  << OPCODE_SHIFT | 1156u     ),
     VXOR_OPCODE    = (4u  << OPCODE_SHIFT | 1220u     ),
+    VRLD_OPCODE    = (4u  << OPCODE_SHIFT |  196u     ),
     VRLB_OPCODE    = (4u  << OPCODE_SHIFT |    4u     ),
     VRLW_OPCODE    = (4u  << OPCODE_SHIFT |  132u     ),
     VRLH_OPCODE    = (4u  << OPCODE_SHIFT |   68u     ),
@@ -2047,6 +2048,7 @@
   inline void vnor(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vor(      VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vxor(     VectorRegister d, VectorRegister a, VectorRegister b);
+  inline void vrld(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlb(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlw(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlh(     VectorRegister d, VectorRegister a, VectorRegister b);
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -839,6 +839,7 @@
 inline void Assembler::vnor(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VNOR_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vor(     VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VOR_OPCODE      | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vxor(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VXOR_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
+inline void Assembler::vrld(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLD_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlb(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLB_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlw(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLW_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlh(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLH_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
--- a/src/cpu/ppc/vm/c2_globals_ppc.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/ppc/vm/c2_globals_ppc.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -55,6 +55,7 @@
 define_pd_global(bool, ResizeTLAB,                   true);
 define_pd_global(intx, LoopUnrollLimit,              60);
 define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(intx, PostLoopMultiversioning,      false);
 
 // Peephole and CISC spilling both break the graph, and so make the
 // scheduler sick.
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -2417,6 +2417,433 @@
     return start;
   }
 
+  // Arguments for generated stub (little endian only):
+  //   R3_ARG1   - source byte array address
+  //   R4_ARG2   - destination byte array address
+  //   R5_ARG3   - round key array
+  address generate_aescrypt_encryptBlock() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+
+    address start = __ function_entry();
+
+    Label L_doLast;
+
+    Register from           = R3_ARG1;  // source array address
+    Register to             = R4_ARG2;  // destination array address
+    Register key            = R5_ARG3;  // round key array
+
+    Register keylen         = R8;
+    Register temp           = R9;
+    Register keypos         = R10;
+    Register hex            = R11;
+    Register fifteen        = R12;
+
+    VectorRegister vRet     = VR0;
+
+    VectorRegister vKey1    = VR1;
+    VectorRegister vKey2    = VR2;
+    VectorRegister vKey3    = VR3;
+    VectorRegister vKey4    = VR4;
+
+    VectorRegister fromPerm = VR5;
+    VectorRegister keyPerm  = VR6;
+    VectorRegister toPerm   = VR7;
+    VectorRegister fSplt    = VR8;
+
+    VectorRegister vTmp1    = VR9;
+    VectorRegister vTmp2    = VR10;
+    VectorRegister vTmp3    = VR11;
+    VectorRegister vTmp4    = VR12;
+
+    VectorRegister vLow     = VR13;
+    VectorRegister vHigh    = VR14;
+
+    __ li              (hex, 16);
+    __ li              (fifteen, 15);
+    __ vspltisb        (fSplt, 0x0f);
+
+    // load unaligned from[0-15] to vsRet
+    __ lvx             (vRet, from);
+    __ lvx             (vTmp1, fifteen, from);
+    __ lvsl            (fromPerm, from);
+    __ vxor            (fromPerm, fromPerm, fSplt);
+    __ vperm           (vRet, vRet, vTmp1, fromPerm);
+
+    // load keylen (44 or 52 or 60)
+    __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
+
+    // to load keys
+    __ lvsr            (keyPerm, key);
+    __ vxor            (vTmp2, vTmp2, vTmp2);
+    __ vspltisb        (vTmp2, -16);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vsldoi          (keyPerm, keyPerm, keyPerm, -8);
+
+    // load the 1st round key to vKey1
+    __ li              (keypos, 0);
+    __ lvx             (vKey1, keypos, key);
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey1, vTmp1, vKey1, keyPerm);
+
+    // 1st round
+    __ vxor (vRet, vRet, vKey1);
+
+    // load the 2nd round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 3rd round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 4th round key to vKey3
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 5th round key to vKey4
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // 2nd - 5th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 6th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 7th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 8th round key to vKey3
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey4
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // 6th - 9th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 10th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 4 rounds
+    __ cmpwi           (CCR0, keylen, 44);
+    __ beq             (CCR0, L_doLast);
+
+    // 10th - 11th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+
+    // load the 12th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 13th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 2 rounds
+    __ cmpwi           (CCR0, keylen, 52);
+    __ beq             (CCR0, L_doLast);
+
+    // 12th - 13th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+
+    // load the 14th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 15th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    __ bind(L_doLast);
+
+    // last two rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipherlast (vRet, vRet, vKey2);
+
+    __ neg             (temp, to);
+    __ lvsr            (toPerm, temp);
+    __ vspltisb        (vTmp2, -1);
+    __ vxor            (vTmp1, vTmp1, vTmp1);
+    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
+    __ vxor            (toPerm, toPerm, fSplt);
+    __ lvx             (vTmp1, to);
+    __ vperm           (vRet, vRet, vRet, toPerm);
+    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
+    __ lvx             (vTmp4, fifteen, to);
+    __ stvx            (vTmp1, to);
+    __ vsel            (vRet, vRet, vTmp4, vTmp2);
+    __ stvx            (vRet, fifteen, to);
+
+    __ blr();
+     return start;
+  }
+
+  // Arguments for generated stub (little endian only):
+  //   R3_ARG1   - source byte array address
+  //   R4_ARG2   - destination byte array address
+  //   R5_ARG3   - K (key) in little endian int array
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+
+    address start = __ function_entry();
+
+    Label L_doLast;
+    Label L_do44;
+    Label L_do52;
+    Label L_do60;
+
+    Register from           = R3_ARG1;  // source array address
+    Register to             = R4_ARG2;  // destination array address
+    Register key            = R5_ARG3;  // round key array
+
+    Register keylen         = R8;
+    Register temp           = R9;
+    Register keypos         = R10;
+    Register hex            = R11;
+    Register fifteen        = R12;
+
+    VectorRegister vRet     = VR0;
+
+    VectorRegister vKey1    = VR1;
+    VectorRegister vKey2    = VR2;
+    VectorRegister vKey3    = VR3;
+    VectorRegister vKey4    = VR4;
+    VectorRegister vKey5    = VR5;
+
+    VectorRegister fromPerm = VR6;
+    VectorRegister keyPerm  = VR7;
+    VectorRegister toPerm   = VR8;
+    VectorRegister fSplt    = VR9;
+
+    VectorRegister vTmp1    = VR10;
+    VectorRegister vTmp2    = VR11;
+    VectorRegister vTmp3    = VR12;
+    VectorRegister vTmp4    = VR13;
+
+    VectorRegister vLow     = VR14;
+    VectorRegister vHigh    = VR15;
+
+    __ li              (hex, 16);
+    __ li              (fifteen, 15);
+    __ vspltisb        (fSplt, 0x0f);
+
+    // load unaligned from[0-15] to vsRet
+    __ lvx             (vRet, from);
+    __ lvx             (vTmp1, fifteen, from);
+    __ lvsl            (fromPerm, from);
+    __ vxor            (fromPerm, fromPerm, fSplt);
+    __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
+
+    // load keylen (44 or 52 or 60)
+    __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
+
+    // to load keys
+    __ lvsr            (keyPerm, key);
+    __ vxor            (vTmp2, vTmp2, vTmp2);
+    __ vspltisb        (vTmp2, -16);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vsldoi          (keyPerm, keyPerm, keyPerm, -8);
+
+    __ cmpwi           (CCR0, keylen, 44);
+    __ beq             (CCR0, L_do44);
+
+    __ cmpwi           (CCR0, keylen, 52);
+    __ beq             (CCR0, L_do52);
+
+    // load the 15th round key to vKey11
+    __ li              (keypos, 240);
+    __ lvx             (vTmp1, keypos, key);
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // load the 14th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 13th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // load the 12th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
+
+    // 1st - 5th rounds
+    __ vxor            (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+    __ vncipher        (vRet, vRet, vKey4);
+    __ vncipher        (vRet, vRet, vKey5);
+
+    __ b               (L_doLast);
+
+    __ bind            (L_do52);
+
+    // load the 13th round key to vKey11
+    __ li              (keypos, 208);
+    __ lvx             (vTmp1, keypos, key);
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // load the 12th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // 1st - 3rd rounds
+    __ vxor            (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+
+    __ b               (L_doLast);
+
+    __ bind            (L_do44);
+
+    // load the 11th round key to vKey11
+    __ li              (keypos, 176);
+    __ lvx             (vTmp1, keypos, key);
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // 1st round
+    __ vxor            (vRet, vRet, vKey1);
+
+    __ bind            (L_doLast);
+
+    // load the 10th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 8th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 7th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // load the 6th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey5, vTmp2, vTmp1, keyPerm);
+
+    // last 10th - 6th rounds
+    __ vncipher        (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+    __ vncipher        (vRet, vRet, vKey4);
+    __ vncipher        (vRet, vRet, vKey5);
+
+    // load the 5th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // load the 4th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 3rd round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // load the 2nd round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
+
+    // load the 1st round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
+
+    // last 5th - 1th rounds
+    __ vncipher        (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+    __ vncipher        (vRet, vRet, vKey4);
+    __ vncipherlast    (vRet, vRet, vKey5);
+
+    __ neg             (temp, to);
+    __ lvsr            (toPerm, temp);
+    __ vspltisb        (vTmp2, -1);
+    __ vxor            (vTmp1, vTmp1, vTmp1);
+    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
+    __ vxor            (toPerm, toPerm, fSplt);
+    __ lvx             (vTmp1, to);
+    __ vperm           (vRet, vRet, vRet, toPerm);
+    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
+    __ lvx             (vTmp4, fifteen, to);
+    __ stvx            (vTmp1, to);
+    __ vsel            (vRet, vRet, vTmp4, vTmp2);
+    __ stvx            (vRet, fifteen, to);
+
+    __ blr();
+     return start;
+  }
 
   void generate_arraycopy_stubs() {
     // Note: the disjoint stubs must be generated first, some of
@@ -2693,10 +3120,6 @@
     // arraycopy stubs used by compilers
     generate_arraycopy_stubs();
 
-    if (UseAESIntrinsics) {
-      guarantee(!UseAESIntrinsics, "not yet implemented.");
-    }
-
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
@@ -2719,6 +3142,12 @@
       StubRoutines::_montgomerySquare
         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
     }
+
+    if (UseAESIntrinsics) {
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+    }
+
   }
 
  public:
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -122,7 +122,7 @@
                (has_fcfids()  ? " fcfids"  : ""),
                (has_vand()    ? " vand"    : ""),
                (has_lqarx()   ? " lqarx"   : ""),
-               (has_vcipher() ? " vcipher" : ""),
+               (has_vcipher() ? " aes"     : ""),
                (has_vpmsumb() ? " vpmsumb" : ""),
                (has_tcheck()  ? " tcheck"  : ""),
                (has_mfdscr()  ? " mfdscr"  : "")
@@ -186,6 +186,28 @@
   }
 
   // The AES intrinsic stubs require AES instruction support.
+#if defined(VM_LITTLE_ENDIAN)
+  if (has_vcipher()) {
+    if (FLAG_IS_DEFAULT(UseAES)) {
+      UseAES = true;
+    }
+  } else if (UseAES) {
+    if (!FLAG_IS_DEFAULT(UseAES))
+      warning("AES instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAES, false);
+  }
+
+  if (UseAES && has_vcipher()) {
+    if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+      UseAESIntrinsics = true;
+    }
+  } else if (UseAESIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
+      warning("AES intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+  }
+
+#else
   if (UseAES) {
     warning("AES instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseAES, false);
@@ -195,6 +217,7 @@
       warning("AES intrinsics are not available on this CPU");
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
+#endif
 
   if (UseAESCTRIntrinsics) {
     warning("AES/CTR intrinsics are not available on this CPU");
--- a/src/cpu/sparc/vm/c2_globals_sparc.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/sparc/vm/c2_globals_sparc.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -53,6 +53,7 @@
 define_pd_global(bool, ResizeTLAB,                   true);
 define_pd_global(intx, LoopUnrollLimit,              60); // Design center runs on 1.3.1
 define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(intx, PostLoopMultiversioning,      false);
 define_pd_global(intx, MinJumpTableSize,             5);
 
 // Peephole and CISC spilling both break the graph, and so makes the
--- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -3147,8 +3147,7 @@
 void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "some form of AVX must be enabled");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x67);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3156,7 +3155,7 @@
 void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len) {
   assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x00);
   emit_int8(0xC0 | encode);
   emit_int8(imm8);
@@ -3199,8 +3198,7 @@
 void Assembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3210,8 +3208,7 @@
   assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3222,9 +3219,8 @@
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int dst_enc = kdst->encoding();
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
   emit_operand(as_Register(dst_enc), src);
 }
@@ -3242,8 +3238,7 @@
 void Assembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3253,8 +3248,7 @@
   assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3265,9 +3259,8 @@
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int dst_enc = kdst->encoding();
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
   emit_operand(as_Register(dst_enc), src);
 }
@@ -3285,8 +3278,7 @@
 void Assembler::vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x76);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3296,8 +3288,7 @@
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x76);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3308,9 +3299,8 @@
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int dst_enc = kdst->encoding();
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x76);
   emit_operand(as_Register(dst_enc), src);
 }
@@ -3328,8 +3318,7 @@
 void Assembler::vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x29);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3339,8 +3328,7 @@
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x29);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3352,9 +3340,8 @@
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int dst_enc = kdst->encoding();
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x29);
   emit_operand(as_Register(dst_enc), src);
 }
@@ -3988,7 +3975,7 @@
 void Assembler::pblendw(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x0E);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -4395,8 +4382,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
@@ -4404,8 +4390,7 @@
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4415,8 +4400,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
@@ -4424,8 +4408,7 @@
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4435,8 +4418,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
 }
@@ -4444,8 +4426,7 @@
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4455,8 +4436,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
 }
@@ -4464,8 +4444,7 @@
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4475,8 +4454,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
@@ -4484,8 +4462,7 @@
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4495,8 +4472,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
@@ -4504,8 +4480,7 @@
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4515,8 +4490,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
 }
@@ -4524,8 +4498,7 @@
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4535,8 +4508,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
 }
@@ -4544,8 +4516,7 @@
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4584,8 +4555,7 @@
 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4593,8 +4563,7 @@
 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4604,8 +4573,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
@@ -4615,8 +4583,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
@@ -4640,8 +4607,7 @@
 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4649,8 +4615,7 @@
 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4660,8 +4625,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
 }
@@ -4671,8 +4635,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
 }
@@ -4706,8 +4669,7 @@
 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4715,8 +4677,7 @@
 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4726,8 +4687,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
@@ -4737,8 +4697,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
@@ -4762,8 +4721,7 @@
 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4771,8 +4729,7 @@
 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4782,8 +4739,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
 }
@@ -4793,8 +4749,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
 }
@@ -4802,8 +4757,7 @@
 void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4857,8 +4811,7 @@
 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4866,8 +4819,7 @@
 void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4877,8 +4829,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_operand(dst, src);
 }
@@ -4888,8 +4839,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_operand(dst, src);
 }
@@ -4949,8 +4899,7 @@
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4958,8 +4907,7 @@
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4969,8 +4917,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_operand(dst, src);
 }
@@ -4980,8 +4927,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_operand(dst, src);
 }
@@ -4991,8 +4937,7 @@
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5001,8 +4946,7 @@
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5035,7 +4979,7 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFE);
   emit_operand(dst, src);
 }
@@ -5067,8 +5011,7 @@
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFC);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5076,8 +5019,7 @@
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFD);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5085,8 +5027,7 @@
 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFE);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5094,8 +5035,7 @@
 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD4);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5105,8 +5045,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFC);
   emit_operand(dst, src);
 }
@@ -5116,8 +5055,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFD);
   emit_operand(dst, src);
 }
@@ -5127,8 +5065,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFE);
   emit_operand(dst, src);
 }
@@ -5138,8 +5075,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD4);
   emit_operand(dst, src);
 }
@@ -5178,8 +5114,7 @@
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF8);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5187,8 +5122,7 @@
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF9);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5196,8 +5130,7 @@
 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFA);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5205,8 +5138,7 @@
 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFB);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5216,8 +5148,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF8);
   emit_operand(dst, src);
 }
@@ -5227,8 +5158,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF9);
   emit_operand(dst, src);
 }
@@ -5238,8 +5168,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFA);
   emit_operand(dst, src);
 }
@@ -5249,8 +5178,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFB);
   emit_operand(dst, src);
 }
@@ -5274,8 +5202,7 @@
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5283,8 +5210,7 @@
 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5292,8 +5218,7 @@
 void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 2, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5303,8 +5228,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
   emit_operand(dst, src);
 }
@@ -5314,8 +5238,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -5325,8 +5248,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -5638,8 +5560,7 @@
 void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xDB);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5649,8 +5570,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xDB);
   emit_operand(dst, src);
 }
@@ -5674,8 +5594,7 @@
 void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xEB);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5685,8 +5604,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xEB);
   emit_operand(dst, src);
 }
@@ -5702,8 +5620,7 @@
 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xEF);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5713,20 +5630,96 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xEF);
   emit_operand(dst, src);
 }
 
 
+// vinserti forms
+
+void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_avx2(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x38);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into lower 128 bits
+  // 0x01 - insert into upper 128 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+  assert(VM_Version::supports_avx2(), "");
+  assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x38);
+  emit_operand(dst, src);
+  // 0x00 - insert into lower 128 bits
+  // 0x01 - insert into upper 128 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x38);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
+  assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x38);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into lower 256 bits
+  // 0x01 - insert into upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
+
+
+// vinsertf forms
+
 void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 128 bits
@@ -5734,33 +5727,19 @@
   emit_int8(imm8 & 0x01);
 }
 
-void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x1A);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - insert into lower 256 bits
-  // 0x01 - insert into upper 256 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
-  // swap src<->dst for encoding
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x1A);
-  emit_operand(dst, src);
-  // 0x00 - insert into lower 256 bits
-  // 0x01 - insert into upper 256 bits
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+  // 0x00 - insert into lower 128 bits
+  // 0x01 - insert into upper 128 bits
   emit_int8(imm8 & 0x01);
 }
 
@@ -5768,8 +5747,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into q0 128 bits (0..127)
@@ -5784,12 +5762,10 @@
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
-  // swap src<->dst for encoding
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x00 - insert into q0 128 bits (0..127)
@@ -5799,98 +5775,36 @@
   emit_int8(imm8 & 0x03);
 }
 
-void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
-  assert(dst != xnoreg, "sanity");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
-  // swap src<->dst for encoding
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x18);
-  emit_operand(dst, src);
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x19);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - extract from lower 128 bits
-  // 0x01 - extract from upper 128 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vextractf128(Address dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
-  assert(src != xnoreg, "sanity");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
-  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x19);
-  emit_operand(src, dst);
-  // 0x00 - extract from lower 128 bits
-  // 0x01 - extract from upper 128 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx2(), "");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x38);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x38);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x1A);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
   // 0x01 - insert into upper 256 bits
   emit_int8(imm8 & 0x01);
 }
 
-void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
-  assert(VM_Version::supports_avx2(), "");
+void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
-  // swap src<->dst for encoding
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x38);
-  emit_operand(dst, src);
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x1A);
+  emit_operand(dst, src);
+  // 0x00 - insert into lower 256 bits
+  // 0x01 - insert into upper 256 bits
   emit_int8(imm8 & 0x01);
 }
 
+
+// vextracti forms
+
 void Assembler::vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
@@ -5920,6 +5834,52 @@
   emit_int8(imm8 & 0x01);
 }
 
+void Assembler::vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x39);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextracti32x4(Address dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x39);
+  emit_operand(src, dst);
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x39);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(imm8 & 0x03);
+}
+
 void Assembler::vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
@@ -5932,44 +5892,35 @@
   emit_int8(imm8 & 0x01);
 }
 
-void Assembler::vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
-  assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+
+// vextractf forms
+
+void Assembler::vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x39);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - extract from bits 127:0
-  // 0x01 - extract from bits 255:128
-  // 0x02 - extract from bits 383:256
-  // 0x03 - extract from bits 511:384
-  emit_int8(imm8 & 0x03);
-}
-
-void Assembler::vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x1B);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - extract from lower 256 bits
-  // 0x01 - extract from upper 256 bits
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from lower 128 bits
+  // 0x01 - extract from upper 128 bits
   emit_int8(imm8 & 0x01);
 }
 
-void Assembler::vextractf64x4(Address dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vextractf128(Address dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4,/* input_size_in_bits */  EVEX_64bit);
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x1B);
+  emit_int8(0x19);
   emit_operand(src, dst);
-  // 0x00 - extract from lower 256 bits
-  // 0x01 - extract from upper 256 bits
+  // 0x00 - extract from lower 128 bits
+  // 0x01 - extract from upper 128 bits
   emit_int8(imm8 & 0x01);
 }
 
@@ -6019,7 +5970,43 @@
   emit_int8(imm8 & 0x03);
 }
 
-// duplicate 4-bytes integer data from src into 8 locations in dest
+void Assembler::vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x1B);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from lower 256 bits
+  // 0x01 - extract from upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf64x4(Address dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4,/* input_size_in_bits */  EVEX_64bit);
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x1B);
+  emit_operand(src, dst);
+  // 0x00 - extract from lower 256 bits
+  // 0x01 - extract from upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
+
+
+// legacy word/dword replicate
+void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_avx2(), "");
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x79);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6028,16 +6015,10 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 2-bytes integer data from src into 16 locations in dest
-void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  emit_int8(0x79);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
-// duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
+
+// xmm/mem sourced byte/word/dword/qword replicate
+
+// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6053,12 +6034,12 @@
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x78);
   emit_operand(dst, src);
 }
 
-// duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
+// duplicate 2-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6074,12 +6055,12 @@
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_operand(dst, src);
 }
 
-// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+// duplicate 4-byte integer data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6095,12 +6076,12 @@
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
 
-// duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+// duplicate 8-byte integer data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6116,12 +6097,15 @@
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
 
-// duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
+
+// scalar single/double precision replicate
+
+// duplicate single precision data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6142,7 +6126,7 @@
   emit_operand(dst, src);
 }
 
-// duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
+// duplicate double precision data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6163,7 +6147,10 @@
   emit_operand(dst, src);
 }
 
-// duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
+
+// gpr source broadcast forms
+
+// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6176,7 +6163,7 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
+// duplicate 2-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6189,7 +6176,7 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+// duplicate 4-byte integer data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6202,7 +6189,7 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+// duplicate 8-byte integer data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6215,6 +6202,7 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
@@ -6229,8 +6217,7 @@
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -6972,8 +6959,7 @@
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x4B);
   emit_int8((unsigned char)(0xC0 | encode));
   int src2_enc = src2->encoding();
--- a/src/cpu/x86/vm/assembler_x86.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1977,39 +1977,43 @@
   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
-  // 128bit copy from/to 256bit (YMM) vector registers
+  // vinserti forms
+  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+
+  // vinsertf forms
   void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
-  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+
+  // vextracti forms
+  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
+  void vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti32x4(Address dst, XMMRegister src, uint8_t imm8);
+  void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+
+  // vextractf forms
   void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
-  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
   void vextractf128(Address dst, XMMRegister src, uint8_t imm8);
-  void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
-
-  // 256bit copy from/to 512bit (ZMM) vector registers
-  void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
-  void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
-  void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
+  void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
   void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
   void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);
-  void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
-
-  // 128bit copy from/to 256bit (YMM) or 512bit (ZMM) vector registers
-  void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
-  void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
-  void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
-
-  // duplicate 4-bytes integer data from src into 8 locations in dest
+
+  // legacy xmm sourced word/dword replicate
+  void vpbroadcastw(XMMRegister dst, XMMRegister src);
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
 
-  // duplicate 2-bytes integer data from src into 16 locations in dest
-  void vpbroadcastw(XMMRegister dst, XMMRegister src);
-
-  // duplicate n-bytes integer data from src into vector_len locations in dest
+  // xmm/mem sourced byte/word/dword/qword replicate
   void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
   void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
@@ -2019,11 +2023,13 @@
   void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
 
+  // scalar single/double precision replicate
   void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
   void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);
 
+  // gpr sourced byte/word/dword/qword replicate
   void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
   void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
   void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
--- a/src/cpu/x86/vm/c2_globals_x86.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/x86/vm/c2_globals_x86.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -47,6 +47,7 @@
 define_pd_global(intx, FreqInlineSize,               325);
 define_pd_global(intx, MinJumpTableSize,             10);
 define_pd_global(intx, LoopPercentProfileLimit,      30);
+define_pd_global(intx, PostLoopMultiversioning,      true);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE,                  13);
 define_pd_global(intx, FLOATPRESSURE,                14);
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1216,7 +1216,10 @@
   void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); }
 
   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
-    if (UseAVX > 1) { // vinserti128 is available only in AVX2
+    if (UseAVX > 2) {
+      Assembler::vinserti32x4(dst, dst, src, imm8);
+    } else if (UseAVX > 1) {
+      // vinserti128 is available only in AVX2
       Assembler::vinserti128(dst, nds, src, imm8);
     } else {
       Assembler::vinsertf128(dst, nds, src, imm8);
@@ -1224,7 +1227,10 @@
   }
 
   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
-    if (UseAVX > 1) { // vinserti128 is available only in AVX2
+    if (UseAVX > 2) {
+      Assembler::vinserti32x4(dst, dst, src, imm8);
+    } else if (UseAVX > 1) {
+      // vinserti128 is available only in AVX2
       Assembler::vinserti128(dst, nds, src, imm8);
     } else {
       Assembler::vinsertf128(dst, nds, src, imm8);
@@ -1232,7 +1238,10 @@
   }
 
   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-    if (UseAVX > 1) { // vextracti128 is available only in AVX2
+    if (UseAVX > 2) {
+      Assembler::vextracti32x4(dst, src, imm8);
+    } else if (UseAVX > 1) {
+      // vextracti128 is available only in AVX2
       Assembler::vextracti128(dst, src, imm8);
     } else {
       Assembler::vextractf128(dst, src, imm8);
@@ -1240,7 +1249,10 @@
   }
 
   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
-    if (UseAVX > 1) { // vextracti128 is available only in AVX2
+    if (UseAVX > 2) {
+      Assembler::vextracti32x4(dst, src, imm8);
+    } else if (UseAVX > 1) {
+      // vextracti128 is available only in AVX2
       Assembler::vextracti128(dst, src, imm8);
     } else {
       Assembler::vextractf128(dst, src, imm8);
@@ -1260,37 +1272,57 @@
   void vextracti128_high(Address dst, XMMRegister src) {
     vextracti128(dst, src, 1);
   }
+
   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
-    vinsertf128(dst, dst, src, 1);
+    if (UseAVX > 2) {
+      Assembler::vinsertf32x4(dst, dst, src, 1);
+    } else {
+      Assembler::vinsertf128(dst, dst, src, 1);
+    }
   }
+
   void vinsertf128_high(XMMRegister dst, Address src) {
-    vinsertf128(dst, dst, src, 1);
+    if (UseAVX > 2) {
+      Assembler::vinsertf32x4(dst, dst, src, 1);
+    } else {
+      Assembler::vinsertf128(dst, dst, src, 1);
+    }
   }
+
   void vextractf128_high(XMMRegister dst, XMMRegister src) {
-    vextractf128(dst, src, 1);
+    if (UseAVX > 2) {
+      Assembler::vextractf32x4(dst, src, 1);
+    } else {
+      Assembler::vextractf128(dst, src, 1);
+    }
   }
+
   void vextractf128_high(Address dst, XMMRegister src) {
-    vextractf128(dst, src, 1);
+    if (UseAVX > 2) {
+      Assembler::vextractf32x4(dst, src, 1);
+    } else {
+      Assembler::vextractf128(dst, src, 1);
+    }
   }
 
   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
-    vinserti64x4(dst, dst, src, 1);
+    Assembler::vinserti64x4(dst, dst, src, 1);
   }
   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
-    vinsertf64x4(dst, dst, src, 1);
+    Assembler::vinsertf64x4(dst, dst, src, 1);
   }
   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
-    vextracti64x4(dst, src, 1);
+    Assembler::vextracti64x4(dst, src, 1);
   }
   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
-    vextractf64x4(dst, src, 1);
+    Assembler::vextractf64x4(dst, src, 1);
   }
   void vextractf64x4_high(Address dst, XMMRegister src) {
-    vextractf64x4(dst, src, 1);
+    Assembler::vextractf64x4(dst, src, 1);
   }
   void vinsertf64x4_high(XMMRegister dst, Address src) {
-    vinsertf64x4(dst, dst, src, 1);
+    Assembler::vinsertf64x4(dst, dst, src, 1);
   }
 
   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
@@ -1306,40 +1338,59 @@
   void vextracti128_low(Address dst, XMMRegister src) {
     vextracti128(dst, src, 0);
   }
+
   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
-    vinsertf128(dst, dst, src, 0);
+    if (UseAVX > 2) {
+      Assembler::vinsertf32x4(dst, dst, src, 0);
+    } else {
+      Assembler::vinsertf128(dst, dst, src, 0);
+    }
   }
+
   void vinsertf128_low(XMMRegister dst, Address src) {
-    vinsertf128(dst, dst, src, 0);
+    if (UseAVX > 2) {
+      Assembler::vinsertf32x4(dst, dst, src, 0);
+    } else {
+      Assembler::vinsertf128(dst, dst, src, 0);
+    }
   }
+
   void vextractf128_low(XMMRegister dst, XMMRegister src) {
-    vextractf128(dst, src, 0);
+    if (UseAVX > 2) {
+      Assembler::vextractf32x4(dst, src, 0);
+    } else {
+      Assembler::vextractf128(dst, src, 0);
+    }
   }
+
   void vextractf128_low(Address dst, XMMRegister src) {
-    vextractf128(dst, src, 0);
+    if (UseAVX > 2) {
+      Assembler::vextractf32x4(dst, src, 0);
+    } else {
+      Assembler::vextractf128(dst, src, 0);
+    }
   }
 
   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
-    vinserti64x4(dst, dst, src, 0);
+    Assembler::vinserti64x4(dst, dst, src, 0);
   }
   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
-    vinsertf64x4(dst, dst, src, 0);
+    Assembler::vinsertf64x4(dst, dst, src, 0);
   }
   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
-    vextracti64x4(dst, src, 0);
+    Assembler::vextracti64x4(dst, src, 0);
   }
   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
-    vextractf64x4(dst, src, 0);
+    Assembler::vextractf64x4(dst, src, 0);
   }
   void vextractf64x4_low(Address dst, XMMRegister src) {
-    vextractf64x4(dst, src, 0);
+    Assembler::vextractf64x4(dst, src, 0);
   }
   void vinsertf64x4_low(XMMRegister dst, Address src) {
-    vinsertf64x4(dst, dst, src, 0);
+    Assembler::vinsertf64x4(dst, dst, src, 0);
   }
 
-
   // Carry-Less Multiplication Quadword
   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
     // 0x00 - multiply lower 64 bits [0:63]
--- a/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/oops/MethodCounters.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/oops/MethodCounters.java	Mon Apr 11 20:38:50 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -47,8 +47,10 @@
   private static synchronized void initialize(TypeDataBase db) throws WrongTypeException {
     Type type      = db.lookupType("MethodCounters");
 
-    interpreterInvocationCountField = new CIntField(type.getCIntegerField("_interpreter_invocation_count"), 0);
-    interpreterThrowoutCountField = new CIntField(type.getCIntegerField("_interpreter_throwout_count"), 0);
+    if (VM.getVM().isServerCompiler()) {
+      interpreterInvocationCountField = new CIntField(type.getCIntegerField("_interpreter_invocation_count"), 0);
+      interpreterThrowoutCountField = new CIntField(type.getCIntegerField("_interpreter_throwout_count"), 0);
+    }
     if (!VM.getVM().isCore()) {
       invocationCounter        = new CIntField(type.getCIntegerField("_invocation_counter"), 0);
       backedgeCounter          = new CIntField(type.getCIntegerField("_backedge_counter"), 0);
@@ -61,11 +63,19 @@
   private static CIntField backedgeCounter;
 
   public int interpreterInvocationCount() {
-    return (int) interpreterInvocationCountField.getValue(this);
+      if (interpreterInvocationCountField != null) {
+        return (int) interpreterInvocationCountField.getValue(this);
+      } else {
+        return 0;
+      }
   }
 
   public int interpreterThrowoutCount() {
-    return (int) interpreterThrowoutCountField.getValue(this);
+      if (interpreterThrowoutCountField != null) {
+        return (int) interpreterThrowoutCountField.getValue(this);
+      } else {
+        return 0;
+      }
   }
   public long getInvocationCounter() {
     if (Assert.ASSERTS_ENABLED) {
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotMemoryAccessProviderImpl.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotMemoryAccessProviderImpl.java	Mon Apr 11 20:38:50 2016 +0000
@@ -23,7 +23,6 @@
 package jdk.vm.ci.hotspot;
 
 import static jdk.vm.ci.hotspot.UnsafeAccess.UNSAFE;
-import jdk.vm.ci.common.JVMCIError;
 import jdk.vm.ci.hotspot.HotSpotVMConfig.CompressEncoding;
 import jdk.vm.ci.meta.Constant;
 import jdk.vm.ci.meta.JavaConstant;
@@ -59,7 +58,7 @@
                     return true;
                 }
             } else {
-                throw new JVMCIError("%s", metaspaceObject);
+                throw new IllegalArgumentException(String.valueOf(metaspaceObject));
             }
         }
         return false;
@@ -75,7 +74,7 @@
                 return prim.asLong();
             }
         }
-        throw new JVMCIError("%s", base);
+        throw new IllegalArgumentException(String.valueOf(base));
     }
 
     private static long readRawValue(Constant baseConstant, long displacement, int bits) {
@@ -91,7 +90,7 @@
                 case Long.SIZE:
                     return UNSAFE.getLong(base, displacement);
                 default:
-                    throw new JVMCIError("%d", bits);
+                    throw new IllegalArgumentException(String.valueOf(bits));
             }
         } else {
             long pointer = asRawPointer(baseConstant);
@@ -105,7 +104,7 @@
                 case Long.SIZE:
                     return UNSAFE.getLong(pointer + displacement);
                 default:
-                    throw new JVMCIError("%d", bits);
+                    throw new IllegalArgumentException(String.valueOf(bits));
             }
         }
     }
@@ -178,7 +177,7 @@
                 case Double:
                     return JavaConstant.forDouble(Double.longBitsToDouble(rawValue));
                 default:
-                    throw new JVMCIError("Unsupported kind: %s", kind);
+                    throw new IllegalArgumentException("Unsupported kind: " + kind);
             }
         } catch (NullPointerException e) {
             return null;
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.meta/src/jdk/vm/ci/meta/MemoryAccessProvider.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.meta/src/jdk/vm/ci/meta/MemoryAccessProvider.java	Mon Apr 11 20:38:50 2016 +0000
@@ -35,8 +35,10 @@
      * @param displacement the displacement within the object in bytes
      * @return the read value encapsulated in a {@link JavaConstant} object, or {@code null} if the
      *         value cannot be read.
+     * @throws IllegalArgumentException if {@code kind} is {@link JavaKind#Void} or not
+     *             {@linkplain JavaKind#isPrimitive() primitive} kind
      */
-    JavaConstant readUnsafeConstant(JavaKind kind, JavaConstant base, long displacement);
+    JavaConstant readUnsafeConstant(JavaKind kind, JavaConstant base, long displacement) throws IllegalArgumentException;
 
     /**
      * Reads a primitive value using a base address and a displacement.
@@ -46,8 +48,11 @@
      * @param displacement the displacement within the object in bytes
      * @param bits the number of bits to read from memory
      * @return the read value encapsulated in a {@link JavaConstant} object of {@link JavaKind} kind
+     * @throws IllegalArgumentException if {@code kind} is {@link JavaKind#Void} or not
+     *             {@linkplain JavaKind#isPrimitive() primitive} kind or {@code bits} is not 8, 16,
+     *             32 or 64
      */
-    JavaConstant readPrimitiveConstant(JavaKind kind, Constant base, long displacement, int bits);
+    JavaConstant readPrimitiveConstant(JavaKind kind, Constant base, long displacement, int bits) throws IllegalArgumentException;
 
     /**
      * Reads a Java {@link Object} value using a base address and a displacement.
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.meta/src/jdk/vm/ci/meta/MethodHandleAccessProvider.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.meta/src/jdk/vm/ci/meta/MethodHandleAccessProvider.java	Mon Apr 11 20:38:50 2016 +0000
@@ -51,6 +51,8 @@
     /**
      * Returns the method handle method intrinsic identifier for the provided method, or
      * {@code null} if the method is not an intrinsic processed by this interface.
+     *
+     * @throws NullPointerException if {@code method} is null
      */
     IntrinsicMethod lookupMethodHandleIntrinsic(ResolvedJavaMethod method);
 
@@ -58,19 +60,27 @@
      * Resolves the invocation target for an invocation of {@link IntrinsicMethod#INVOKE_BASIC
      * MethodHandle.invokeBasic} with the given constant receiver {@link MethodHandle}. Returns
      * {@code null} if the invocation target is not available at this time.
-     * <p>
+     *
      * The first invocations of a method handle can use an interpreter to lookup the actual invoked
      * method; frequently executed method handles can use Java bytecode generation to avoid the
      * interpreter overhead. If the parameter forceBytecodeGeneration is set to true, the VM should
      * try to generate bytecodes before this method returns.
+     *
+     * @returns {@code null} if {@code methodHandle} is not a {@link MethodHandle} or the invocation
+     *          target is not available at this time
+     * @throws NullPointerException if {@code methodHandle} is null
      */
     ResolvedJavaMethod resolveInvokeBasicTarget(JavaConstant methodHandle, boolean forceBytecodeGeneration);
 
     /**
      * Resolves the invocation target for an invocation of a {@code MethodHandle.linkTo*} method
      * with the given constant member name. The member name is the last parameter of the
-     * {@code linkTo*} method. Returns {@code null} if the invocation target is not available at
-     * this time.
+     * {@code linkTo*} method.
+     *
+     * @returns {@code null} if the invocation target is not available at this time
+     * @throws NullPointerException if {@code memberName} is null
+     * @throws IllegalArgumentException if {@code memberName} is not a
+     *             {@code java.lang.invoke.MemberName}
      */
     ResolvedJavaMethod resolveLinkToTarget(JavaConstant memberName);
 }
--- a/src/share/vm/c1/c1_LIRAssembler.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/c1/c1_LIRAssembler.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -556,17 +556,16 @@
       leal(op->in_opr(), op->result_opr());
       break;
 
-    case lir_null_check:
-      if (GenerateCompilerNullChecks) {
-        ImplicitNullCheckStub* stub = add_debug_info_for_null_check_here(op->info());
+    case lir_null_check: {
+      ImplicitNullCheckStub* stub = add_debug_info_for_null_check_here(op->info());
 
-        if (op->in_opr()->is_single_cpu()) {
-          _masm->null_check(op->in_opr()->as_register(), stub->entry());
-        } else {
-          Unimplemented();
-        }
+      if (op->in_opr()->is_single_cpu()) {
+        _masm->null_check(op->in_opr()->as_register(), stub->entry());
+      } else {
+        Unimplemented();
       }
       break;
+    }
 
     case lir_monaddr:
       monitor_address(op->in_opr()->as_constant_ptr()->as_jint(), op->result_opr());
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -2041,8 +2041,7 @@
   // to avoid a fixed interval with an oop during the null check.
   // Use a copy of the CodeEmitInfo because debug information is
   // different for null_check and throw.
-  if (GenerateCompilerNullChecks &&
-      (x->exception()->as_NewInstance() == NULL && x->exception()->as_ExceptionObject() == NULL)) {
+  if (x->exception()->as_NewInstance() == NULL && x->exception()->as_ExceptionObject() == NULL) {
     // if the exception object wasn't created using new then it might be null.
     __ null_check(exception_opr, new CodeEmitInfo(info, x->state()->copy(ValueStack::ExceptionState, x->state()->bci())));
   }
--- a/src/share/vm/c1/c1_globals.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/c1/c1_globals.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -176,7 +176,7 @@
   product(bool, InlineSynchronizedMethods, true,                            \
           "Inline synchronized methods")                                    \
                                                                             \
-  diagnostic(bool, InlineNIOCheckIndex, true,                               \
+  develop(bool, InlineNIOCheckIndex, true,                                  \
           "Intrinsify java.nio.Buffer.checkIndex")                          \
                                                                             \
   develop(bool, CanonicalizeNodes, true,                                    \
--- a/src/share/vm/classfile/systemDictionary.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/classfile/systemDictionary.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -2067,7 +2067,18 @@
   int  sid  = (info >> CEIL_LG_OPTION_LIMIT);
   Symbol* symbol = vmSymbols::symbol_at((vmSymbols::SID)sid);
   InstanceKlass** klassp = &_well_known_klasses[id];
-  bool must_load = (init_opt < SystemDictionary::Opt);
+
+  bool must_load;
+#if INCLUDE_JVMCI
+  if (EnableJVMCI) {
+    // If JVMCI is enabled we require its classes to be found.
+    must_load = (init_opt < SystemDictionary::Opt) || (init_opt == SystemDictionary::Jvmci);
+  } else
+#endif
+  {
+    must_load = (init_opt < SystemDictionary::Opt);
+  }
+
   if ((*klassp) == NULL) {
     Klass* k;
     if (must_load) {
--- a/src/share/vm/classfile/systemDictionary.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/classfile/systemDictionary.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -241,7 +241,7 @@
 
     Opt,                        // preload tried; NULL if not present
 #if INCLUDE_JVMCI
-    Jvmci,                      // preload tried; error if not present, use only with JVMCI
+    Jvmci,                      // preload tried; error if not present if JVMCI enabled
 #endif
     OPTION_LIMIT,
     CEIL_LG_OPTION_LIMIT = 2    // OPTION_LIMIT <= (1<<CEIL_LG_OPTION_LIMIT)
--- a/src/share/vm/code/codeCache.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/code/codeCache.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1042,6 +1042,14 @@
   }
 }
 
+void CodeCache::cleanup_inline_caches() {
+  assert_locked_or_safepoint(CodeCache_lock);
+  NMethodIterator iter;
+  while(iter.next_alive()) {
+    iter.method()->cleanup_inline_caches(/*clean_all=*/true);
+  }
+}
+
 // Keeps track of time spent for checking dependencies
 NOT_PRODUCT(static elapsedTimer dependentCheckTime;)
 
--- a/src/share/vm/code/codeCache.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/code/codeCache.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -201,6 +201,7 @@
   static bool needs_cache_clean()                     { return _needs_cache_clean; }
   static void set_needs_cache_clean(bool v)           { _needs_cache_clean = v;    }
   static void clear_inline_caches();                  // clear all inline caches
+  static void cleanup_inline_caches();
 
   // Returns true if an own CodeHeap for the given CodeBlobType is available
   static bool heap_available(int code_blob_type);
--- a/src/share/vm/code/nmethod.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/code/nmethod.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1139,8 +1139,7 @@
   }
 }
 
-
-void nmethod::cleanup_inline_caches() {
+void nmethod::cleanup_inline_caches(bool clean_all/*=false*/) {
   assert_locked_or_safepoint(CompiledIC_lock);
 
   // If the method is not entrant or zombie then a JMP is plastered over the
@@ -1170,7 +1169,7 @@
         if( cb != NULL && cb->is_nmethod() ) {
           nmethod* nm = (nmethod*)cb;
           // Clean inline caches pointing to zombie, non-entrant and unloaded methods
-          if (!nm->is_in_use() || (nm->method()->code() != nm)) ic->set_to_clean(is_alive());
+          if (clean_all || !nm->is_in_use() || (nm->method()->code() != nm)) ic->set_to_clean(is_alive());
         }
         break;
       }
@@ -1180,7 +1179,7 @@
         if( cb != NULL && cb->is_nmethod() ) {
           nmethod* nm = (nmethod*)cb;
           // Clean inline caches pointing to zombie, non-entrant and unloaded methods
-          if (!nm->is_in_use() || (nm->method()->code() != nm)) csc->set_to_clean();
+          if (clean_all || !nm->is_in_use() || (nm->method()->code() != nm)) csc->set_to_clean();
         }
         break;
       }
--- a/src/share/vm/code/nmethod.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/code/nmethod.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -599,7 +599,7 @@
   // Inline cache support
   void clear_inline_caches();
   void clear_ic_stubs();
-  void cleanup_inline_caches();
+  void cleanup_inline_caches(bool clean_all = false);
   bool inlinecache_check_contains(address addr) const {
     return (addr >= code_begin() && addr < verified_entry_point());
   }
--- a/src/share/vm/compiler/compileBroker.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/compiler/compileBroker.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -389,13 +389,16 @@
     task = CompilationPolicy::policy()->select_task(this);
   }
 
-  // Save method pointers across unlock safepoint.  The task is removed from
-  // the compilation queue, which is walked during RedefineClasses.
-  save_method = methodHandle(task->method());
-  save_hot_method = methodHandle(task->hot_method());
+  if (task != NULL) {
+    // Save method pointers across unlock safepoint.  The task is removed from
+    // the compilation queue, which is walked during RedefineClasses.
+    save_method = methodHandle(task->method());
+    save_hot_method = methodHandle(task->hot_method());
 
-  remove(task);
-  purge_stale_tasks(); // may temporarily release MCQ lock
+    remove(task);
+    purge_stale_tasks(); // may temporarily release MCQ lock
+  }
+
   return task;
 }
 
@@ -1784,7 +1787,8 @@
   bool is_osr = (osr_bci != standard_entry_bci);
   bool should_log = (thread->log() != NULL);
   bool should_break = false;
-  int task_level = task->comp_level();
+  const int task_level = task->comp_level();
+  AbstractCompiler* comp = task->compiler();
 
   DirectiveSet* directive;
   {
@@ -1796,7 +1800,7 @@
     assert(!method->is_native(), "no longer compile natives");
 
     // Look up matching directives
-    directive = DirectivesStack::getMatchingDirective(method, compiler(task_level));
+    directive = DirectivesStack::getMatchingDirective(method, comp);
 
     // Save information about this method in case of failure.
     set_last_compile(thread, method, is_osr, task_level);
@@ -1815,13 +1819,13 @@
   int compilable = ciEnv::MethodCompilable;
   const char* failure_reason = NULL;
   const char* retry_message = NULL;
-  AbstractCompiler *comp = compiler(task_level);
 
   int system_dictionary_modification_counter;
   {
     MutexLocker locker(Compile_lock, thread);
     system_dictionary_modification_counter = SystemDictionary::number_of_modifications();
   }
+
 #if INCLUDE_JVMCI
   if (UseJVMCICompiler && comp != NULL && comp->is_jvmci()) {
     JVMCICompiler* jvmci = (JVMCICompiler*) comp;
--- a/src/share/vm/compiler/compileTask.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/compiler/compileTask.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -123,6 +123,13 @@
   _next = NULL;
 }
 
+/**
+ * Returns the compiler for this task.
+ */
+AbstractCompiler* CompileTask::compiler() {
+  return CompileBroker::compiler(_comp_level);
+}
+
 // ------------------------------------------------------------------
 // CompileTask::code/set_code
 //
--- a/src/share/vm/compiler/compileTask.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/compiler/compileTask.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -115,6 +115,8 @@
   int          comp_level()                      { return _comp_level;}
   void         set_comp_level(int comp_level)    { _comp_level = comp_level;}
 
+  AbstractCompiler* compiler();
+
   int          num_inlined_bytecodes() const     { return _num_inlined_bytecodes; }
   void         set_num_inlined_bytecodes(int n)  { _num_inlined_bytecodes = n; }
 
--- a/src/share/vm/interpreter/bytecodeInterpreter.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/interpreter/bytecodeInterpreter.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -632,9 +632,11 @@
       if (_compiling) {
         MethodCounters* mcs;
         GET_METHOD_COUNTERS(mcs);
+#if COMPILER2_OR_JVMCI
         if (ProfileInterpreter) {
           METHOD->increment_interpreter_invocation_count(THREAD);
         }
+#endif
         mcs->invocation_counter()->increment();
         if (mcs->invocation_counter()->reached_InvocationLimit(mcs->backedge_counter())) {
           CALL_VM((void)InterpreterRuntime::frequency_counter_overflow(THREAD, NULL), handle_exception);
--- a/src/share/vm/interpreter/interpreterRuntime.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/interpreter/interpreterRuntime.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -510,8 +510,10 @@
 #ifndef CC_INTERP
     continuation = Interpreter::remove_activation_entry();
 #endif
+#if COMPILER2_OR_JVMCI
     // Count this for compilation purposes
     h_method->interpreter_throwout_increment(THREAD);
+#endif
   } else {
     // handler in this method => change bci/bcp to handler bci/bcp and continue there
     handler_pc = h_method->code_base() + handler_bci;
--- a/src/share/vm/jvmci/jvmciCompilerToVM.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/jvmci/jvmciCompilerToVM.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1434,65 +1434,65 @@
 #define METASPACE_METHOD_DATA "J"
 
 JNINativeMethod CompilerToVM::methods[] = {
-  {CC"getBytecode",                                  CC"("HS_RESOLVED_METHOD")[B",                                                     FN_PTR(getBytecode)},
-  {CC"getExceptionTableStart",                       CC"("HS_RESOLVED_METHOD")J",                                                      FN_PTR(getExceptionTableStart)},
-  {CC"getExceptionTableLength",                      CC"("HS_RESOLVED_METHOD")I",                                                      FN_PTR(getExceptionTableLength)},
-  {CC"findUniqueConcreteMethod",                     CC"("HS_RESOLVED_KLASS HS_RESOLVED_METHOD")"HS_RESOLVED_METHOD,                   FN_PTR(findUniqueConcreteMethod)},
-  {CC"getImplementor",                               CC"("HS_RESOLVED_KLASS")"HS_RESOLVED_KLASS,                                       FN_PTR(getImplementor)},
-  {CC"getStackTraceElement",                         CC"("HS_RESOLVED_METHOD"I)"STACK_TRACE_ELEMENT,                                   FN_PTR(getStackTraceElement)},
-  {CC"methodIsIgnoredBySecurityStackWalk",           CC"("HS_RESOLVED_METHOD")Z",                                                      FN_PTR(methodIsIgnoredBySecurityStackWalk)},
-  {CC"doNotInlineOrCompile",                         CC"("HS_RESOLVED_METHOD")V",                                                      FN_PTR(doNotInlineOrCompile)},
-  {CC"canInlineMethod",                              CC"("HS_RESOLVED_METHOD")Z",                                                      FN_PTR(canInlineMethod)},
-  {CC"shouldInlineMethod",                           CC"("HS_RESOLVED_METHOD")Z",                                                      FN_PTR(shouldInlineMethod)},
-  {CC"lookupType",                                   CC"("STRING CLASS"Z)"HS_RESOLVED_KLASS,                                           FN_PTR(lookupType)},
-  {CC"lookupNameInPool",                             CC"("HS_CONSTANT_POOL"I)"STRING,                                                  FN_PTR(lookupNameInPool)},
-  {CC"lookupNameAndTypeRefIndexInPool",              CC"("HS_CONSTANT_POOL"I)I",                                                       FN_PTR(lookupNameAndTypeRefIndexInPool)},
-  {CC"lookupSignatureInPool",                        CC"("HS_CONSTANT_POOL"I)"STRING,                                                  FN_PTR(lookupSignatureInPool)},
-  {CC"lookupKlassRefIndexInPool",                    CC"("HS_CONSTANT_POOL"I)I",                                                       FN_PTR(lookupKlassRefIndexInPool)},
-  {CC"lookupKlassInPool",                            CC"("HS_CONSTANT_POOL"I)Ljava/lang/Object;",                                      FN_PTR(lookupKlassInPool)},
-  {CC"lookupAppendixInPool",                         CC"("HS_CONSTANT_POOL"I)"OBJECT,                                                  FN_PTR(lookupAppendixInPool)},
-  {CC"lookupMethodInPool",                           CC"("HS_CONSTANT_POOL"IB)"HS_RESOLVED_METHOD,                                     FN_PTR(lookupMethodInPool)},
-  {CC"constantPoolRemapInstructionOperandFromCache", CC"("HS_CONSTANT_POOL"I)I",                                                       FN_PTR(constantPoolRemapInstructionOperandFromCache)},
-  {CC"resolveConstantInPool",                        CC"("HS_CONSTANT_POOL"I)"OBJECT,                                                  FN_PTR(resolveConstantInPool)},
-  {CC"resolvePossiblyCachedConstantInPool",          CC"("HS_CONSTANT_POOL"I)"OBJECT,                                                  FN_PTR(resolvePossiblyCachedConstantInPool)},
-  {CC"resolveTypeInPool",                            CC"("HS_CONSTANT_POOL"I)"HS_RESOLVED_KLASS,                                       FN_PTR(resolveTypeInPool)},
-  {CC"resolveFieldInPool",                           CC"("HS_CONSTANT_POOL"IB[J)"HS_RESOLVED_KLASS,                                    FN_PTR(resolveFieldInPool)},
-  {CC"resolveInvokeDynamicInPool",                   CC"("HS_CONSTANT_POOL"I)V",                                                       FN_PTR(resolveInvokeDynamicInPool)},
-  {CC"resolveInvokeHandleInPool",                    CC"("HS_CONSTANT_POOL"I)V",                                                       FN_PTR(resolveInvokeHandleInPool)},
-  {CC"resolveMethod",                                CC"("HS_RESOLVED_KLASS HS_RESOLVED_METHOD HS_RESOLVED_KLASS")"HS_RESOLVED_METHOD, FN_PTR(resolveMethod)},
-  {CC"getVtableIndexForInterfaceMethod",             CC"("HS_RESOLVED_KLASS HS_RESOLVED_METHOD")I",                                    FN_PTR(getVtableIndexForInterfaceMethod)},
-  {CC"getClassInitializer",                          CC"("HS_RESOLVED_KLASS")"HS_RESOLVED_METHOD,                                      FN_PTR(getClassInitializer)},
-  {CC"hasFinalizableSubclass",                       CC"("HS_RESOLVED_KLASS")Z",                                                       FN_PTR(hasFinalizableSubclass)},
-  {CC"getMaxCallTargetOffset",                       CC"(J)J",                                                                         FN_PTR(getMaxCallTargetOffset)},
-  {CC"getResolvedJavaMethodAtSlot",                  CC"("CLASS"I)"HS_RESOLVED_METHOD,                                                 FN_PTR(getResolvedJavaMethodAtSlot)},
-  {CC"getResolvedJavaMethod",                        CC"(Ljava/lang/Object;J)"HS_RESOLVED_METHOD,                                      FN_PTR(getResolvedJavaMethod)},
-  {CC"getConstantPool",                              CC"(Ljava/lang/Object;J)"HS_CONSTANT_POOL,                                        FN_PTR(getConstantPool)},
-  {CC"getResolvedJavaType",                          CC"(Ljava/lang/Object;JZ)"HS_RESOLVED_KLASS,                                      FN_PTR(getResolvedJavaType)},
-  {CC"initializeConfiguration",                      CC"("HS_CONFIG")J",                                                               FN_PTR(initializeConfiguration)},
-  {CC"installCode",                                  CC"("TARGET_DESCRIPTION HS_COMPILED_CODE INSTALLED_CODE HS_SPECULATION_LOG")I",   FN_PTR(installCode)},
-  {CC"getMetadata",                                  CC"("TARGET_DESCRIPTION HS_COMPILED_CODE HS_METADATA")I",                         FN_PTR(getMetadata)},
-  {CC"resetCompilationStatistics",                   CC"()V",                                                                          FN_PTR(resetCompilationStatistics)},
-  {CC"disassembleCodeBlob",                          CC"("INSTALLED_CODE")"STRING,                                                     FN_PTR(disassembleCodeBlob)},
-  {CC"executeInstalledCode",                         CC"(["OBJECT INSTALLED_CODE")"OBJECT,                                             FN_PTR(executeInstalledCode)},
-  {CC"getLineNumberTable",                           CC"("HS_RESOLVED_METHOD")[J",                                                     FN_PTR(getLineNumberTable)},
-  {CC"getLocalVariableTableStart",                   CC"("HS_RESOLVED_METHOD")J",                                                      FN_PTR(getLocalVariableTableStart)},
-  {CC"getLocalVariableTableLength",                  CC"("HS_RESOLVED_METHOD")I",                                                      FN_PTR(getLocalVariableTableLength)},
-  {CC"reprofile",                                    CC"("HS_RESOLVED_METHOD")V",                                                      FN_PTR(reprofile)},
-  {CC"invalidateInstalledCode",                      CC"("INSTALLED_CODE")V",                                                          FN_PTR(invalidateInstalledCode)},
-  {CC"readUncompressedOop",                          CC"(J)"OBJECT,                                                                    FN_PTR(readUncompressedOop)},
-  {CC"collectCounters",                              CC"()[J",                                                                         FN_PTR(collectCounters)},
-  {CC"allocateCompileId",                            CC"("HS_RESOLVED_METHOD"I)I",                                                     FN_PTR(allocateCompileId)},
-  {CC"isMature",                                     CC"("METASPACE_METHOD_DATA")Z",                                                   FN_PTR(isMature)},
-  {CC"hasCompiledCodeForOSR",                        CC"("HS_RESOLVED_METHOD"II)Z",                                                    FN_PTR(hasCompiledCodeForOSR)},
-  {CC"getSymbol",                                    CC"(J)"STRING,                                                                    FN_PTR(getSymbol)},
-  {CC"lookupSymbol",                                 CC"("STRING")J",                                                                  FN_PTR(lookupSymbol)},
-  {CC"getNextStackFrame",                            CC"("HS_STACK_FRAME_REF "["RESOLVED_METHOD"I)"HS_STACK_FRAME_REF,                 FN_PTR(getNextStackFrame)},
-  {CC"materializeVirtualObjects",                    CC"("HS_STACK_FRAME_REF"Z)V",                                                     FN_PTR(materializeVirtualObjects)},
-  {CC"shouldDebugNonSafepoints",                     CC"()Z",                                                                          FN_PTR(shouldDebugNonSafepoints)},
-  {CC"writeDebugOutput",                             CC"([BII)V",                                                                      FN_PTR(writeDebugOutput)},
-  {CC"flushDebugOutput",                             CC"()V",                                                                          FN_PTR(flushDebugOutput)},
-  {CC"methodDataProfileDataSize",                    CC"(JI)I",                                                                        FN_PTR(methodDataProfileDataSize)},
-  {CC"interpreterFrameSize",                         CC"("BYTECODE_FRAME")I",                                                          FN_PTR(interpreterFrameSize)},
+  {CC "getBytecode",                                  CC "(" HS_RESOLVED_METHOD ")[B",                                                      FN_PTR(getBytecode)},
+  {CC "getExceptionTableStart",                       CC "(" HS_RESOLVED_METHOD ")J",                                                       FN_PTR(getExceptionTableStart)},
+  {CC "getExceptionTableLength",                      CC "(" HS_RESOLVED_METHOD ")I",                                                       FN_PTR(getExceptionTableLength)},
+  {CC "findUniqueConcreteMethod",                     CC "(" HS_RESOLVED_KLASS HS_RESOLVED_METHOD ")" HS_RESOLVED_METHOD,                   FN_PTR(findUniqueConcreteMethod)},
+  {CC "getImplementor",                               CC "(" HS_RESOLVED_KLASS ")" HS_RESOLVED_KLASS,                                       FN_PTR(getImplementor)},
+  {CC "getStackTraceElement",                         CC "(" HS_RESOLVED_METHOD "I)" STACK_TRACE_ELEMENT,                                   FN_PTR(getStackTraceElement)},
+  {CC "methodIsIgnoredBySecurityStackWalk",           CC "(" HS_RESOLVED_METHOD ")Z",                                                       FN_PTR(methodIsIgnoredBySecurityStackWalk)},
+  {CC "doNotInlineOrCompile",                         CC "(" HS_RESOLVED_METHOD ")V",                                                       FN_PTR(doNotInlineOrCompile)},
+  {CC "canInlineMethod",                              CC "(" HS_RESOLVED_METHOD ")Z",                                                       FN_PTR(canInlineMethod)},
+  {CC "shouldInlineMethod",                           CC "(" HS_RESOLVED_METHOD ")Z",                                                       FN_PTR(shouldInlineMethod)},
+  {CC "lookupType",                                   CC "(" STRING CLASS "Z)" HS_RESOLVED_KLASS,                                           FN_PTR(lookupType)},
+  {CC "lookupNameInPool",                             CC "(" HS_CONSTANT_POOL "I)" STRING,                                                  FN_PTR(lookupNameInPool)},
+  {CC "lookupNameAndTypeRefIndexInPool",              CC "(" HS_CONSTANT_POOL "I)I",                                                        FN_PTR(lookupNameAndTypeRefIndexInPool)},
+  {CC "lookupSignatureInPool",                        CC "(" HS_CONSTANT_POOL "I)" STRING,                                                  FN_PTR(lookupSignatureInPool)},
+  {CC "lookupKlassRefIndexInPool",                    CC "(" HS_CONSTANT_POOL "I)I",                                                        FN_PTR(lookupKlassRefIndexInPool)},
+  {CC "lookupKlassInPool",                            CC "(" HS_CONSTANT_POOL "I)Ljava/lang/Object;",                                       FN_PTR(lookupKlassInPool)},
+  {CC "lookupAppendixInPool",                         CC "(" HS_CONSTANT_POOL "I)" OBJECT,                                                  FN_PTR(lookupAppendixInPool)},
+  {CC "lookupMethodInPool",                           CC "(" HS_CONSTANT_POOL "IB)" HS_RESOLVED_METHOD,                                     FN_PTR(lookupMethodInPool)},
+  {CC "constantPoolRemapInstructionOperandFromCache", CC "(" HS_CONSTANT_POOL "I)I",                                                        FN_PTR(constantPoolRemapInstructionOperandFromCache)},
+  {CC "resolveConstantInPool",                        CC "(" HS_CONSTANT_POOL "I)" OBJECT,                                                  FN_PTR(resolveConstantInPool)},
+  {CC "resolvePossiblyCachedConstantInPool",          CC "(" HS_CONSTANT_POOL "I)" OBJECT,                                                  FN_PTR(resolvePossiblyCachedConstantInPool)},
+  {CC "resolveTypeInPool",                            CC "(" HS_CONSTANT_POOL "I)" HS_RESOLVED_KLASS,                                       FN_PTR(resolveTypeInPool)},
+  {CC "resolveFieldInPool",                           CC "(" HS_CONSTANT_POOL "IB[J)" HS_RESOLVED_KLASS,                                    FN_PTR(resolveFieldInPool)},
+  {CC "resolveInvokeDynamicInPool",                   CC "(" HS_CONSTANT_POOL "I)V",                                                        FN_PTR(resolveInvokeDynamicInPool)},
+  {CC "resolveInvokeHandleInPool",                    CC "(" HS_CONSTANT_POOL "I)V",                                                        FN_PTR(resolveInvokeHandleInPool)},
+  {CC "resolveMethod",                                CC "(" HS_RESOLVED_KLASS HS_RESOLVED_METHOD HS_RESOLVED_KLASS ")" HS_RESOLVED_METHOD, FN_PTR(resolveMethod)},
+  {CC "getVtableIndexForInterfaceMethod",             CC "(" HS_RESOLVED_KLASS HS_RESOLVED_METHOD ")I",                                     FN_PTR(getVtableIndexForInterfaceMethod)},
+  {CC "getClassInitializer",                          CC "(" HS_RESOLVED_KLASS ")" HS_RESOLVED_METHOD,                                      FN_PTR(getClassInitializer)},
+  {CC "hasFinalizableSubclass",                       CC "(" HS_RESOLVED_KLASS ")Z",                                                        FN_PTR(hasFinalizableSubclass)},
+  {CC "getMaxCallTargetOffset",                       CC "(J)J",                                                                            FN_PTR(getMaxCallTargetOffset)},
+  {CC "getResolvedJavaMethodAtSlot",                  CC "(" CLASS "I)" HS_RESOLVED_METHOD,                                                 FN_PTR(getResolvedJavaMethodAtSlot)},
+  {CC "getResolvedJavaMethod",                        CC "(Ljava/lang/Object;J)" HS_RESOLVED_METHOD,                                        FN_PTR(getResolvedJavaMethod)},
+  {CC "getConstantPool",                              CC "(Ljava/lang/Object;J)" HS_CONSTANT_POOL,                                          FN_PTR(getConstantPool)},
+  {CC "getResolvedJavaType",                          CC "(Ljava/lang/Object;JZ)" HS_RESOLVED_KLASS,                                        FN_PTR(getResolvedJavaType)},
+  {CC "initializeConfiguration",                      CC "(" HS_CONFIG ")J",                                                                FN_PTR(initializeConfiguration)},
+  {CC "installCode",                                  CC "(" TARGET_DESCRIPTION HS_COMPILED_CODE INSTALLED_CODE HS_SPECULATION_LOG ")I",    FN_PTR(installCode)},
+  {CC "getMetadata",                                  CC "(" TARGET_DESCRIPTION HS_COMPILED_CODE HS_METADATA ")I",                          FN_PTR(getMetadata)},
+  {CC "resetCompilationStatistics",                   CC "()V",                                                                             FN_PTR(resetCompilationStatistics)},
+  {CC "disassembleCodeBlob",                          CC "(" INSTALLED_CODE ")" STRING,                                                     FN_PTR(disassembleCodeBlob)},
+  {CC "executeInstalledCode",                         CC "([" OBJECT INSTALLED_CODE ")" OBJECT,                                             FN_PTR(executeInstalledCode)},
+  {CC "getLineNumberTable",                           CC "(" HS_RESOLVED_METHOD ")[J",                                                      FN_PTR(getLineNumberTable)},
+  {CC "getLocalVariableTableStart",                   CC "(" HS_RESOLVED_METHOD ")J",                                                       FN_PTR(getLocalVariableTableStart)},
+  {CC "getLocalVariableTableLength",                  CC "(" HS_RESOLVED_METHOD ")I",                                                       FN_PTR(getLocalVariableTableLength)},
+  {CC "reprofile",                                    CC "(" HS_RESOLVED_METHOD ")V",                                                       FN_PTR(reprofile)},
+  {CC "invalidateInstalledCode",                      CC "(" INSTALLED_CODE ")V",                                                           FN_PTR(invalidateInstalledCode)},
+  {CC "readUncompressedOop",                          CC "(J)" OBJECT,                                                                      FN_PTR(readUncompressedOop)},
+  {CC "collectCounters",                              CC "()[J",                                                                            FN_PTR(collectCounters)},
+  {CC "allocateCompileId",                            CC "(" HS_RESOLVED_METHOD "I)I",                                                      FN_PTR(allocateCompileId)},
+  {CC "isMature",                                     CC "(" METASPACE_METHOD_DATA ")Z",                                                    FN_PTR(isMature)},
+  {CC "hasCompiledCodeForOSR",                        CC "(" HS_RESOLVED_METHOD "II)Z",                                                     FN_PTR(hasCompiledCodeForOSR)},
+  {CC "getSymbol",                                    CC "(J)" STRING,                                                                      FN_PTR(getSymbol)},
+  {CC "lookupSymbol",                                 CC "(" STRING ")J",                                                                   FN_PTR(lookupSymbol)},
+  {CC "getNextStackFrame",                            CC "(" HS_STACK_FRAME_REF "[" RESOLVED_METHOD "I)" HS_STACK_FRAME_REF,                FN_PTR(getNextStackFrame)},
+  {CC "materializeVirtualObjects",                    CC "(" HS_STACK_FRAME_REF "Z)V",                                                      FN_PTR(materializeVirtualObjects)},
+  {CC "shouldDebugNonSafepoints",                     CC "()Z",                                                                             FN_PTR(shouldDebugNonSafepoints)},
+  {CC "writeDebugOutput",                             CC "([BII)V",                                                                         FN_PTR(writeDebugOutput)},
+  {CC "flushDebugOutput",                             CC "()V",                                                                             FN_PTR(flushDebugOutput)},
+  {CC "methodDataProfileDataSize",                    CC "(JI)I",                                                                           FN_PTR(methodDataProfileDataSize)},
+  {CC "interpreterFrameSize",                         CC "(" BYTECODE_FRAME ")I",                                                           FN_PTR(interpreterFrameSize)},
 };
 
 int CompilerToVM::methods_count() {
--- a/src/share/vm/jvmci/jvmciEnv.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/jvmci/jvmciEnv.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -437,7 +437,7 @@
       stringStream st(buffer, O_BUFLEN);
       deps.print_dependency(witness, true, &st);
       *failure_detail = st.as_string();
-      if (env == NULL || counter_changed) {
+      if (env == NULL || counter_changed || deps.type() == Dependencies::evol_method) {
         return JVMCIEnv::dependencies_failed;
       } else {
         // The dependencies were invalid at the time of installation
--- a/src/share/vm/oops/method.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/oops/method.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -263,6 +263,7 @@
   int highest_osr_comp_level() const;
   void set_highest_osr_comp_level(int level);
 
+#if defined(COMPILER2) || INCLUDE_JVMCI
   // Count of times method was exited via exception while interpreting
   void interpreter_throwout_increment(TRAPS) {
     MethodCounters* mcs = get_method_counters(CHECK);
@@ -270,6 +271,7 @@
       mcs->interpreter_throwout_increment();
     }
   }
+#endif
 
   int  interpreter_throwout_count() const        {
     MethodCounters* mcs = method_counters();
@@ -406,11 +408,13 @@
       return (mcs == NULL) ? 0 : mcs->interpreter_invocation_count();
     }
   }
+#if defined(COMPILER2) || INCLUDE_JVMCI
   int increment_interpreter_invocation_count(TRAPS) {
     if (TieredCompilation) ShouldNotReachHere();
     MethodCounters* mcs = get_method_counters(CHECK_0);
     return (mcs == NULL) ? 0 : mcs->increment_interpreter_invocation_count();
   }
+#endif
 
 #ifndef PRODUCT
   int  compiled_invocation_count() const         { return _compiled_invocation_count;  }
--- a/src/share/vm/oops/methodCounters.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/oops/methodCounters.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,8 +34,10 @@
  friend class VMStructs;
  friend class JVMCIVMStructs;
  private:
+#if defined(COMPILER2) || INCLUDE_JVMCI
   int               _interpreter_invocation_count; // Count of times invoked (reused as prev_event_count in tiered)
   u2                _interpreter_throwout_count; // Count of times method was exited via exception while interpreting
+#endif
   u2                _number_of_breakpoints;      // fullspeed debugging support
   InvocationCounter _invocation_counter;         // Incremented before each activation of the method - used to trigger frequency-based optimizations
   InvocationCounter _backedge_counter;           // Incremented before each backedge taken - used to trigger frequencey-based optimizations
@@ -60,9 +62,7 @@
   u1                _highest_osr_comp_level;      // Same for OSR level
 #endif
 
-  MethodCounters(methodHandle mh) : _interpreter_invocation_count(0),
-                                    _interpreter_throwout_count(0),
-                                    _number_of_breakpoints(0),
+  MethodCounters(methodHandle mh) : _number_of_breakpoints(0),
                                     _nmethod_age(INT_MAX)
 #ifdef TIERED
                                  , _rate(0),
@@ -71,6 +71,8 @@
                                    _highest_osr_comp_level(0)
 #endif
   {
+    set_interpreter_invocation_count(0);
+    set_interpreter_throwout_count(0);
     invocation_counter()->init();
     backedge_counter()->init();
 
@@ -109,6 +111,8 @@
 
   void clear_counters();
 
+#if defined(COMPILER2) || INCLUDE_JVMCI
+
   int interpreter_invocation_count() {
     return _interpreter_invocation_count;
   }
@@ -131,6 +135,24 @@
     _interpreter_throwout_count = count;
   }
 
+#else // defined(COMPILER2) || INCLUDE_JVMCI
+
+  int interpreter_invocation_count() {
+    return 0;
+  }
+  void set_interpreter_invocation_count(int count) {
+    assert(count == 0, "count must be 0");
+  }
+
+  int  interpreter_throwout_count() const {
+    return 0;
+  }
+  void set_interpreter_throwout_count(int count) {
+    assert(count == 0, "count must be 0");
+  }
+
+#endif // defined(COMPILER2) || INCLUDE_JVMCI
+
   u2   number_of_breakpoints() const   { return _number_of_breakpoints; }
   void incr_number_of_breakpoints()    { ++_number_of_breakpoints; }
   void decr_number_of_breakpoints()    { --_number_of_breakpoints; }
@@ -170,10 +192,25 @@
     return byte_offset_of(MethodCounters, _nmethod_age);
   }
 
+#if defined(COMPILER2) || INCLUDE_JVMCI
+
   static ByteSize interpreter_invocation_counter_offset() {
     return byte_offset_of(MethodCounters, _interpreter_invocation_count);
   }
 
+  static int interpreter_invocation_counter_offset_in_bytes() {
+    return offset_of(MethodCounters, _interpreter_invocation_count);
+  }
+
+#else // defined(COMPILER2) || INCLUDE_JVMCI
+
+  static ByteSize interpreter_invocation_counter_offset() {
+    ShouldNotReachHere();
+    return in_ByteSize(0);
+  }
+
+#endif // defined(COMPILER2) || INCLUDE_JVMCI
+
   static ByteSize invocation_counter_offset()    {
     return byte_offset_of(MethodCounters, _invocation_counter);
   }
@@ -182,10 +219,6 @@
     return byte_offset_of(MethodCounters, _backedge_counter);
   }
 
-  static int interpreter_invocation_counter_offset_in_bytes() {
-    return offset_of(MethodCounters, _interpreter_invocation_count);
-  }
-
   static ByteSize interpreter_invocation_limit_offset() {
     return byte_offset_of(MethodCounters, _interpreter_invocation_limit);
   }
--- a/src/share/vm/opto/c2_globals.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/c2_globals.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -194,6 +194,9 @@
            "Map number of unrolls for main loop via "                       \
            "Superword Level Parallelism analysis")                          \
                                                                             \
+  product_pd(bool, PostLoopMultiversioning,                                 \
+           "Multi versioned post loops to eliminate range checks")          \
+                                                                            \
   notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false,                 \
           "Trace what Superword Level Parallelism analysis applies")        \
                                                                             \
@@ -229,21 +232,12 @@
   develop(bool, TraceLoopOpts, false,                                       \
           "Trace executed loop optimizations")                              \
                                                                             \
-  diagnostic(bool, LoopLimitCheck, true,                                    \
-          "Generate a loop limits check for overflow")                      \
-                                                                            \
   develop(bool, TraceLoopLimitCheck, false,                                 \
           "Trace generation of loop limits checks")                         \
                                                                             \
-  diagnostic(bool, RangeLimitCheck, true,                                   \
-          "Additional overflow checks during range check elimination")      \
-                                                                            \
   develop(bool, TraceRangeLimitCheck, false,                                \
           "Trace additional overflow checks in RCE")                        \
                                                                             \
-  diagnostic(bool, UnrollLimitCheck, true,                                  \
-          "Additional overflow checks during loop unroll")                  \
-                                                                            \
   /* OptimizeFill not yet supported on PowerPC. */                          \
   product(bool, OptimizeFill, true PPC64_ONLY(&& false),                    \
           "convert fill/copy loops into intrinsic")                         \
@@ -595,26 +589,26 @@
   product(bool, BlockLayoutRotateLoops, true,                               \
           "Allow back branches to be fall throughs in the block layour")    \
                                                                             \
-  diagnostic(bool, InlineReflectionGetCallerClass, true,                    \
+  develop(bool, InlineReflectionGetCallerClass, true,                       \
           "inline sun.reflect.Reflection.getCallerClass(), known to be "    \
           "part of base library DLL")                                       \
                                                                             \
-  diagnostic(bool, InlineObjectCopy, true,                                  \
+  develop(bool, InlineObjectCopy, true,                                     \
           "inline Object.clone and Arrays.copyOf[Range] intrinsics")        \
                                                                             \
-  diagnostic(bool, SpecialStringCompareTo, true,                            \
+  develop(bool, SpecialStringCompareTo, true,                               \
           "special version of string compareTo")                            \
                                                                             \
-  diagnostic(bool, SpecialStringIndexOf, true,                              \
+  develop(bool, SpecialStringIndexOf, true,                                 \
           "special version of string indexOf")                              \
                                                                             \
-  diagnostic(bool, SpecialStringEquals, true,                               \
+  develop(bool, SpecialStringEquals, true,                                  \
           "special version of string equals")                               \
                                                                             \
-  diagnostic(bool, SpecialArraysEquals, true,                               \
+  develop(bool, SpecialArraysEquals, true,                                  \
           "special version of Arrays.equals(char[],char[])")                \
                                                                             \
-  diagnostic(bool, SpecialEncodeISOArray, true,                             \
+  product(bool, SpecialEncodeISOArray, true,                                \
           "special version of ISO_8859_1$Encoder.encodeISOArray")           \
                                                                             \
   develop(bool, BailoutToInterpreterForThrows, false,                       \
@@ -716,22 +710,22 @@
   diagnostic(bool, OptimizeExpensiveOps, true,                              \
           "Find best control for expensive operations")                     \
                                                                             \
-  diagnostic(bool, UseMathExactIntrinsics, true,                            \
+  product(bool, UseMathExactIntrinsics, true,                               \
           "Enables intrinsification of various java.lang.Math functions")   \
                                                                             \
-  diagnostic(bool, UseMultiplyToLenIntrinsic, false,                        \
+  product(bool, UseMultiplyToLenIntrinsic, false,                           \
           "Enables intrinsification of BigInteger.multiplyToLen()")         \
                                                                             \
-  diagnostic(bool, UseSquareToLenIntrinsic, false,                          \
+  product(bool, UseSquareToLenIntrinsic, false,                             \
           "Enables intrinsification of BigInteger.squareToLen()")           \
                                                                             \
-  diagnostic(bool, UseMulAddIntrinsic, false,                               \
+  product(bool, UseMulAddIntrinsic, false,                                  \
           "Enables intrinsification of BigInteger.mulAdd()")                \
                                                                             \
-  diagnostic(bool, UseMontgomeryMultiplyIntrinsic, false,                   \
+  product(bool, UseMontgomeryMultiplyIntrinsic, false,                      \
           "Enables intrinsification of BigInteger.montgomeryMultiply()")    \
                                                                             \
-  diagnostic(bool, UseMontgomerySquareIntrinsic, false,                     \
+  product(bool, UseMontgomerySquareIntrinsic, false,                        \
           "Enables intrinsification of BigInteger.montgomerySquare()")      \
                                                                             \
   product(bool, UseTypeSpeculation, true,                                   \
--- a/src/share/vm/opto/graphKit.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/graphKit.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1191,11 +1191,6 @@
                                   bool speculative) {
   assert(!assert_null || null_control == NULL, "not both at once");
   if (stopped())  return top();
-  if (!GenerateCompilerNullChecks && !assert_null && null_control == NULL) {
-    // For some performance testing, we may wish to suppress null checking.
-    value = cast_not_null(value);   // Make it appear to be non-null (4962416).
-    return value;
-  }
   NOT_PRODUCT(explicit_null_checks_inserted++);
 
   // Construct NULL check
@@ -1687,6 +1682,9 @@
   const Type* elemtype = arytype->elem();
   BasicType elembt = elemtype->array_element_basic_type();
   Node* adr = array_element_address(ary, idx, elembt, arytype->size());
+  if (elembt == T_NARROWOOP) {
+    elembt = T_OBJECT; // To satisfy switch in LoadNode::make()
+  }
   Node* ld = make_load(ctl, adr, elemtype, elembt, arytype, MemNode::unordered);
   return ld;
 }
@@ -3771,9 +3769,7 @@
     add_predicate_impl(Deoptimization::Reason_predicate, nargs);
   }
   // loop's limit check predicate should be near the loop.
-  if (LoopLimitCheck) {
-    add_predicate_impl(Deoptimization::Reason_loop_limit_check, nargs);
-  }
+  add_predicate_impl(Deoptimization::Reason_loop_limit_check, nargs);
 }
 
 //----------------------------- store barriers ----------------------------
--- a/src/share/vm/opto/library_call.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/library_call.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -6273,7 +6273,20 @@
 
 //------------------------------get_key_start_from_aescrypt_object-----------------------
 Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) {
+#ifdef PPC64
+  // MixColumns for decryption can be reduced by preprocessing MixColumns with round keys.
+  // Intel's extention is based on this optimization and AESCrypt generates round keys by preprocessing MixColumns.
+  // However, ppc64 vncipher processes MixColumns and requires the same round keys with encryption.
+  // The ppc64 stubs of encryption and decryption use the same round keys (sessionK[0]).
+  Node* objSessionK = load_field_from_object(aescrypt_object, "sessionK", "[[I", /*is_exact*/ false);
+  assert (objSessionK != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
+  if (objSessionK == NULL) {
+    return (Node *) NULL;
+  }
+  Node* objAESCryptKey = load_array_element(control(), objSessionK, intcon(0), TypeAryPtr::OOPS);
+#else
   Node* objAESCryptKey = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false);
+#endif // PPC64
   assert (objAESCryptKey != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
   if (objAESCryptKey == NULL) return (Node *) NULL;
 
--- a/src/share/vm/opto/loopPredicate.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/loopPredicate.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -313,11 +313,9 @@
   // Search original predicates
   Node* entry = old_entry;
   ProjNode* limit_check_proj = NULL;
-  if (LoopLimitCheck) {
-    limit_check_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
-    if (limit_check_proj != NULL) {
-      entry = entry->in(0)->in(0);
-    }
+  limit_check_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+  if (limit_check_proj != NULL) {
+    entry = entry->in(0)->in(0);
   }
   if (UseLoopPredicate) {
     ProjNode* predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
@@ -353,11 +351,9 @@
 // Skip related predicates.
 Node* PhaseIdealLoop::skip_loop_predicates(Node* entry) {
   Node* predicate = NULL;
-  if (LoopLimitCheck) {
-    predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
-    if (predicate != NULL) {
-      entry = entry->in(0)->in(0);
-    }
+  predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+  if (predicate != NULL) {
+    entry = entry->in(0)->in(0);
   }
   if (UseLoopPredicate) {
     predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
@@ -393,11 +389,9 @@
 // Find a predicate
 Node* PhaseIdealLoop::find_predicate(Node* entry) {
   Node* predicate = NULL;
-  if (LoopLimitCheck) {
-    predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
-    if (predicate != NULL) { // right pattern that can be used by loop predication
-      return entry;
-    }
+  predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+  if (predicate != NULL) { // right pattern that can be used by loop predication
+    return entry;
   }
   if (UseLoopPredicate) {
     predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
@@ -646,19 +640,13 @@
   Node* max_idx_expr  = init;
   int stride_con = stride->get_int();
   if ((stride_con > 0) == (scale > 0) == upper) {
-    if (LoopLimitCheck) {
-      // With LoopLimitCheck limit is not exact.
-      // Calculate exact limit here.
-      // Note, counted loop's test is '<' or '>'.
-      limit = exact_limit(loop);
-      max_idx_expr = new SubINode(limit, stride);
-      register_new_node(max_idx_expr, ctrl);
-      if (TraceLoopPredicate) predString->print("(limit - stride) ");
-    } else {
-      max_idx_expr = new SubINode(limit, stride);
-      register_new_node(max_idx_expr, ctrl);
-      if (TraceLoopPredicate) predString->print("(limit - stride) ");
-    }
+    // Limit is not exact.
+    // Calculate exact limit here.
+    // Note, counted loop's test is '<' or '>'.
+    limit = exact_limit(loop);
+    max_idx_expr = new SubINode(limit, stride);
+    register_new_node(max_idx_expr, ctrl);
+    if (TraceLoopPredicate) predString->print("(limit - stride) ");
   } else {
     if (TraceLoopPredicate) predString->print("init ");
   }
@@ -721,12 +709,9 @@
   Node* entry = head->in(LoopNode::EntryControl);
   ProjNode *predicate_proj = NULL;
   // Loop limit check predicate should be near the loop.
-  if (LoopLimitCheck) {
-    predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
-    if (predicate_proj != NULL)
-      entry = predicate_proj->in(0)->in(0);
-  }
-
+  predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+  if (predicate_proj != NULL)
+    entry = predicate_proj->in(0)->in(0);
   predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
   if (!predicate_proj) {
 #ifndef PRODUCT
--- a/src/share/vm/opto/loopTransform.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/loopTransform.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1027,82 +1027,9 @@
     _igvn.replace_input_of(bol, 1, cmp);
   }
 
-  //------------------------------
-  // Step A: Create Post-Loop.
-  Node* main_exit = main_end->proj_out(false);
-  assert( main_exit->Opcode() == Op_IfFalse, "" );
-  int dd_main_exit = dom_depth(main_exit);
-
-  // Step A1: Clone the loop body.  The clone becomes the post-loop.  The main
-  // loop pre-header illegally has 2 control users (old & new loops).
-  clone_loop( loop, old_new, dd_main_exit );
-  assert( old_new[main_end ->_idx]->Opcode() == Op_CountedLoopEnd, "" );
-  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
-  post_head->set_post_loop(main_head);
-
-  // Reduce the post-loop trip count.
-  CountedLoopEndNode* post_end = old_new[main_end ->_idx]->as_CountedLoopEnd();
-  post_end->_prob = PROB_FAIR;
-
-  // Build the main-loop normal exit.
-  IfFalseNode *new_main_exit = new IfFalseNode(main_end);
-  _igvn.register_new_node_with_optimizer( new_main_exit );
-  set_idom(new_main_exit, main_end, dd_main_exit );
-  set_loop(new_main_exit, loop->_parent);
-
-  // Step A2: Build a zero-trip guard for the post-loop.  After leaving the
-  // main-loop, the post-loop may not execute at all.  We 'opaque' the incr
-  // (the main-loop trip-counter exit value) because we will be changing
-  // the exit value (via unrolling) so we cannot constant-fold away the zero
-  // trip guard until all unrolling is done.
-  Node *zer_opaq = new Opaque1Node(C, incr);
-  Node *zer_cmp  = new CmpINode( zer_opaq, limit );
-  Node *zer_bol  = new BoolNode( zer_cmp, b_test );
-  register_new_node( zer_opaq, new_main_exit );
-  register_new_node( zer_cmp , new_main_exit );
-  register_new_node( zer_bol , new_main_exit );
-
-  // Build the IfNode
-  IfNode *zer_iff = new IfNode( new_main_exit, zer_bol, PROB_FAIR, COUNT_UNKNOWN );
-  _igvn.register_new_node_with_optimizer( zer_iff );
-  set_idom(zer_iff, new_main_exit, dd_main_exit);
-  set_loop(zer_iff, loop->_parent);
-
-  // Plug in the false-path, taken if we need to skip post-loop
-  _igvn.replace_input_of(main_exit, 0, zer_iff);
-  set_idom(main_exit, zer_iff, dd_main_exit);
-  set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
-  // Make the true-path, must enter the post loop
-  Node *zer_taken = new IfTrueNode( zer_iff );
-  _igvn.register_new_node_with_optimizer( zer_taken );
-  set_idom(zer_taken, zer_iff, dd_main_exit);
-  set_loop(zer_taken, loop->_parent);
-  // Plug in the true path
-  _igvn.hash_delete( post_head );
-  post_head->set_req(LoopNode::EntryControl, zer_taken);
-  set_idom(post_head, zer_taken, dd_main_exit);
-
-  Arena *a = Thread::current()->resource_area();
-  VectorSet visited(a);
-  Node_Stack clones(a, main_head->back_control()->outcnt());
-  // Step A3: Make the fall-in values to the post-loop come from the
-  // fall-out values of the main-loop.
-  for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
-    Node* main_phi = main_head->fast_out(i);
-    if( main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() >0 ) {
-      Node *post_phi = old_new[main_phi->_idx];
-      Node *fallmain  = clone_up_backedge_goo(main_head->back_control(),
-                                              post_head->init_control(),
-                                              main_phi->in(LoopNode::LoopBackControl),
-                                              visited, clones);
-      _igvn.hash_delete(post_phi);
-      post_phi->set_req( LoopNode::EntryControl, fallmain );
-    }
-  }
-
-  // Update local caches for next stanza
-  main_exit = new_main_exit;
-
+  // Add the post loop
+  CountedLoopNode *post_head = NULL;
+  Node *main_exit = insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
 
   //------------------------------
   // Step B: Create Pre-Loop.
@@ -1158,8 +1085,9 @@
   main_head->set_req(LoopNode::EntryControl, min_taken);
   set_idom(main_head, min_taken, dd_main_head);
 
-  visited.Clear();
-  clones.clear();
+  Arena *a = Thread::current()->resource_area();
+  VectorSet visited(a);
+  Node_Stack clones(a, main_head->back_control()->outcnt());
   // Step B3: Make the fall-in values to the main-loop come from the
   // fall-out values of the pre-loop.
   for (DUIterator_Fast i2max, i2 = main_head->fast_outs(i2max); i2 < i2max; i2++) {
@@ -1185,12 +1113,8 @@
   // variable value and the induction variable Phi to preserve correct
   // dependencies.
 
-  // CastII for the post loop:
-  bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
-  assert(inserted, "no castII inserted");
-
   // CastII for the main loop:
-  inserted = cast_incr_before_loop(pre_incr, min_taken, main_head);
+  bool inserted = cast_incr_before_loop( pre_incr, min_taken, main_head );
   assert(inserted, "no castII inserted");
 
   // Step B4: Shorten the pre-loop to run only 1 iteration (for now).
@@ -1298,19 +1222,82 @@
   guarantee(main_end != NULL, "no loop exit node");
   // diagnostic to show loop end is not properly formed
   assert(main_end->outcnt() == 2, "1 true, 1 false path only");
-  uint dd_main_head = dom_depth(main_head);
-  uint max = main_head->outcnt();
 
   // mark this loop as processed
   main_head->mark_has_atomic_post_loop();
 
-  Node *pre_header = main_head->in(LoopNode::EntryControl);
-  Node *init = main_head->init_trip();
   Node *incr = main_end->incr();
   Node *limit = main_end->limit();
-  Node *stride = main_end->stride();
-  Node *cmp = main_end->cmp_node();
-  BoolTest::mask b_test = main_end->test_trip();
+
+  // In this case we throw away the result as we are not using it to connect anything else.
+  CountedLoopNode *post_head = NULL;
+  insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops.  They are usually very short,
+  // so guess that unit vector trips is a reasonable value.
+  post_head->set_profile_trip_cnt(cur_unroll);
+
+  // Now force out all loop-invariant dominating tests.  The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
+
+//-------------------------insert_scalar_rced_post_loop------------------------
+// Insert a copy of the rce'd main loop as a post loop,
+// We have not unrolled the main loop, so this is the right time to inject this.
+// Later we will examine the partner of this post loop pair which still has range checks
+// to see inject code which tests at runtime if the range checks are applicable.
+void PhaseIdealLoop::insert_scalar_rced_post_loop(IdealLoopTree *loop, Node_List &old_new) {
+  if (!loop->_head->is_CountedLoop()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // only process RCE'd main loops
+  if (!cl->is_main_loop() || cl->range_checks_present()) return;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostScalarRce  ");
+    loop->dump_head();
+  }
+#endif
+  C->set_major_progress();
+
+  // Find common pieces of the loop being guarded with pre & post loops
+  CountedLoopNode *main_head = loop->_head->as_CountedLoop();
+  CountedLoopEndNode *main_end = main_head->loopexit();
+  guarantee(main_end != NULL, "no loop exit node");
+  // diagnostic to show loop end is not properly formed
+  assert(main_end->outcnt() == 2, "1 true, 1 false path only");
+
+  Node *incr = main_end->incr();
+  Node *limit = main_end->limit();
+
+  // In this case we throw away the result as we are not using it to connect anything else.
+  CountedLoopNode *post_head = NULL;
+  insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops.  They are usually very short,
+  // so guess that unit vector trips is a reasonable value.
+  post_head->set_profile_trip_cnt(4.0);
+  post_head->set_is_rce_post_loop();
+
+  // Now force out all loop-invariant dominating tests.  The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
+
+//------------------------------insert_post_loop-------------------------------
+// Insert post loops.  Add a post loop to the given loop passed.
+Node *PhaseIdealLoop::insert_post_loop(IdealLoopTree *loop, Node_List &old_new,
+                                       CountedLoopNode *main_head, CountedLoopEndNode *main_end,
+                                       Node *incr, Node *limit, CountedLoopNode *&post_head) {
 
   //------------------------------
   // Step A: Create a new post-Loop.
@@ -1322,7 +1309,7 @@
   // The main loop pre-header illegally has 2 control users (old & new loops).
   clone_loop(loop, old_new, dd_main_exit);
   assert(old_new[main_end->_idx]->Opcode() == Op_CountedLoopEnd, "");
-  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
+  post_head = old_new[main_head->_idx]->as_CountedLoop();
   post_head->set_normal_loop();
   post_head->set_post_loop(main_head);
 
@@ -1336,14 +1323,14 @@
   set_idom(new_main_exit, main_end, dd_main_exit);
   set_loop(new_main_exit, loop->_parent);
 
-  // Step A2: Build a zero-trip guard for the vector post-loop.  After leaving the
-  // main-loop, the vector post-loop may not execute at all.  We 'opaque' the incr
-  // (the vectorized main-loop trip-counter exit value) because we will be changing
+  // Step A2: Build a zero-trip guard for the post-loop.  After leaving the
+  // main-loop, the post-loop may not execute at all.  We 'opaque' the incr
+  // (the previous loop trip-counter exit value) because we will be changing
   // the exit value (via additional unrolling) so we cannot constant-fold away the zero
   // trip guard until all unrolling is done.
   Node *zer_opaq = new Opaque1Node(C, incr);
   Node *zer_cmp = new CmpINode(zer_opaq, limit);
-  Node *zer_bol = new BoolNode(zer_cmp, b_test);
+  Node *zer_bol = new BoolNode(zer_cmp, main_end->test_trip());
   register_new_node(zer_opaq, new_main_exit);
   register_new_node(zer_cmp, new_main_exit);
   register_new_node(zer_bol, new_main_exit);
@@ -1354,11 +1341,11 @@
   set_idom(zer_iff, new_main_exit, dd_main_exit);
   set_loop(zer_iff, loop->_parent);
 
-  // Plug in the false-path, taken if we need to skip vector post-loop
+  // Plug in the false-path, taken if we need to skip this post-loop
   _igvn.replace_input_of(main_exit, 0, zer_iff);
   set_idom(main_exit, zer_iff, dd_main_exit);
   set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
-  // Make the true-path, must enter the vector post loop
+  // Make the true-path, must enter this post loop
   Node *zer_taken = new IfTrueNode(zer_iff);
   _igvn.register_new_node_with_optimizer(zer_taken);
   set_idom(zer_taken, zer_iff, dd_main_exit);
@@ -1371,7 +1358,7 @@
   Arena *a = Thread::current()->resource_area();
   VectorSet visited(a);
   Node_Stack clones(a, main_head->back_control()->outcnt());
-  // Step A3: Make the fall-in values to the vector post-loop come from the
+  // Step A3: Make the fall-in values to the post-loop come from the
   // fall-out values of the main-loop.
   for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
     Node* main_phi = main_head->fast_out(i);
@@ -1390,15 +1377,7 @@
   bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
   assert(inserted, "no castII inserted");
 
-  // It's difficult to be precise about the trip-counts
-  // for post loops.  They are usually very short,
-  // so guess that unit vector trips is a reasonable value.
-  post_head->set_profile_trip_cnt((float)slp_max_unroll_factor);
-
-  // Now force out all loop-invariant dominating tests.  The optimizer
-  // finds some, but we _know_ they are all useless.
-  peeled_dom_test_elim(loop, old_new);
-  loop->record_for_igvn();
+  return new_main_exit;
 }
 
 //------------------------------is_invariant-----------------------------
@@ -1457,7 +1436,7 @@
     // Check the shape of the graph at the loop entry. If an inappropriate
     // graph shape is encountered, the compiler bails out loop unrolling;
     // compilation of the method will still succeed.
-    if (!is_canonical_main_loop_entry(loop_head)) {
+    if (!is_canonical_loop_entry(loop_head)) {
       return;
     }
     opaq = ctrl->in(0)->in(1)->in(1)->in(2);
@@ -1468,209 +1447,156 @@
   C->set_major_progress();
 
   Node* new_limit = NULL;
-  if (UnrollLimitCheck) {
-    int stride_con = stride->get_int();
-    int stride_p = (stride_con > 0) ? stride_con : -stride_con;
-    uint old_trip_count = loop_head->trip_count();
-    // Verify that unroll policy result is still valid.
-    assert(old_trip_count > 1 &&
-           (!adjust_min_trip || stride_p <= (1<<3)*loop_head->unrolled_count()), "sanity");
-
-    // Adjust loop limit to keep valid iterations number after unroll.
-    // Use (limit - stride) instead of (((limit - init)/stride) & (-2))*stride
-    // which may overflow.
-    if (!adjust_min_trip) {
-      assert(old_trip_count > 1 && (old_trip_count & 1) == 0,
-             "odd trip count for maximally unroll");
-      // Don't need to adjust limit for maximally unroll since trip count is even.
-    } else if (loop_head->has_exact_trip_count() && init->is_Con()) {
-      // Loop's limit is constant. Loop's init could be constant when pre-loop
-      // become peeled iteration.
-      jlong init_con = init->get_int();
-      // We can keep old loop limit if iterations count stays the same:
-      //   old_trip_count == new_trip_count * 2
-      // Note: since old_trip_count >= 2 then new_trip_count >= 1
-      // so we also don't need to adjust zero trip test.
-      jlong limit_con  = limit->get_int();
-      // (stride_con*2) not overflow since stride_con <= 8.
-      int new_stride_con = stride_con * 2;
-      int stride_m    = new_stride_con - (stride_con > 0 ? 1 : -1);
-      jlong trip_count = (limit_con - init_con + stride_m)/new_stride_con;
-      // New trip count should satisfy next conditions.
-      assert(trip_count > 0 && (julong)trip_count < (julong)max_juint/2, "sanity");
-      uint new_trip_count = (uint)trip_count;
-      adjust_min_trip = (old_trip_count != new_trip_count*2);
+  int stride_con = stride->get_int();
+  int stride_p = (stride_con > 0) ? stride_con : -stride_con;
+  uint old_trip_count = loop_head->trip_count();
+  // Verify that unroll policy result is still valid.
+  assert(old_trip_count > 1 &&
+      (!adjust_min_trip || stride_p <= (1<<3)*loop_head->unrolled_count()), "sanity");
+
+  // Adjust loop limit to keep valid iterations number after unroll.
+  // Use (limit - stride) instead of (((limit - init)/stride) & (-2))*stride
+  // which may overflow.
+  if (!adjust_min_trip) {
+    assert(old_trip_count > 1 && (old_trip_count & 1) == 0,
+        "odd trip count for maximally unroll");
+    // Don't need to adjust limit for maximally unroll since trip count is even.
+  } else if (loop_head->has_exact_trip_count() && init->is_Con()) {
+    // Loop's limit is constant. Loop's init could be constant when pre-loop
+    // become peeled iteration.
+    jlong init_con = init->get_int();
+    // We can keep old loop limit if iterations count stays the same:
+    //   old_trip_count == new_trip_count * 2
+    // Note: since old_trip_count >= 2 then new_trip_count >= 1
+    // so we also don't need to adjust zero trip test.
+    jlong limit_con  = limit->get_int();
+    // (stride_con*2) not overflow since stride_con <= 8.
+    int new_stride_con = stride_con * 2;
+    int stride_m    = new_stride_con - (stride_con > 0 ? 1 : -1);
+    jlong trip_count = (limit_con - init_con + stride_m)/new_stride_con;
+    // New trip count should satisfy next conditions.
+    assert(trip_count > 0 && (julong)trip_count < (julong)max_juint/2, "sanity");
+    uint new_trip_count = (uint)trip_count;
+    adjust_min_trip = (old_trip_count != new_trip_count*2);
+  }
+
+  if (adjust_min_trip) {
+    // Step 2: Adjust the trip limit if it is called for.
+    // The adjustment amount is -stride. Need to make sure if the
+    // adjustment underflows or overflows, then the main loop is skipped.
+    Node* cmp = loop_end->cmp_node();
+    assert(cmp->in(2) == limit, "sanity");
+    assert(opaq != NULL && opaq->in(1) == limit, "sanity");
+
+    // Verify that policy_unroll result is still valid.
+    const TypeInt* limit_type = _igvn.type(limit)->is_int();
+    assert(stride_con > 0 && ((limit_type->_hi - stride_con) < limit_type->_hi) ||
+        stride_con < 0 && ((limit_type->_lo - stride_con) > limit_type->_lo), "sanity");
+
+    if (limit->is_Con()) {
+      // The check in policy_unroll and the assert above guarantee
+      // no underflow if limit is constant.
+      new_limit = _igvn.intcon(limit->get_int() - stride_con);
+      set_ctrl(new_limit, C->root());
+    } else {
+      // Limit is not constant.
+      if (loop_head->unrolled_count() == 1) { // only for first unroll
+        // Separate limit by Opaque node in case it is an incremented
+        // variable from previous loop to avoid using pre-incremented
+        // value which could increase register pressure.
+        // Otherwise reorg_offsets() optimization will create a separate
+        // Opaque node for each use of trip-counter and as result
+        // zero trip guard limit will be different from loop limit.
+        assert(has_ctrl(opaq), "should have it");
+        Node* opaq_ctrl = get_ctrl(opaq);
+        limit = new Opaque2Node( C, limit );
+        register_new_node( limit, opaq_ctrl );
+      }
+      if (stride_con > 0 && (java_subtract(limit_type->_lo, stride_con) < limit_type->_lo) ||
+          stride_con < 0 && (java_subtract(limit_type->_hi, stride_con) > limit_type->_hi)) {
+        // No underflow.
+        new_limit = new SubINode(limit, stride);
+      } else {
+        // (limit - stride) may underflow.
+        // Clamp the adjustment value with MININT or MAXINT:
+        //
+        //   new_limit = limit-stride
+        //   if (stride > 0)
+        //     new_limit = (limit < new_limit) ? MININT : new_limit;
+        //   else
+        //     new_limit = (limit > new_limit) ? MAXINT : new_limit;
+        //
+        BoolTest::mask bt = loop_end->test_trip();
+        assert(bt == BoolTest::lt || bt == BoolTest::gt, "canonical test is expected");
+        Node* adj_max = _igvn.intcon((stride_con > 0) ? min_jint : max_jint);
+        set_ctrl(adj_max, C->root());
+        Node* old_limit = NULL;
+        Node* adj_limit = NULL;
+        Node* bol = limit->is_CMove() ? limit->in(CMoveNode::Condition) : NULL;
+        if (loop_head->unrolled_count() > 1 &&
+            limit->is_CMove() && limit->Opcode() == Op_CMoveI &&
+            limit->in(CMoveNode::IfTrue) == adj_max &&
+            bol->as_Bool()->_test._test == bt &&
+            bol->in(1)->Opcode() == Op_CmpI &&
+            bol->in(1)->in(2) == limit->in(CMoveNode::IfFalse)) {
+          // Loop was unrolled before.
+          // Optimize the limit to avoid nested CMove:
+          // use original limit as old limit.
+          old_limit = bol->in(1)->in(1);
+          // Adjust previous adjusted limit.
+          adj_limit = limit->in(CMoveNode::IfFalse);
+          adj_limit = new SubINode(adj_limit, stride);
+        } else {
+          old_limit = limit;
+          adj_limit = new SubINode(limit, stride);
+        }
+        assert(old_limit != NULL && adj_limit != NULL, "");
+        register_new_node( adj_limit, ctrl ); // adjust amount
+        Node* adj_cmp = new CmpINode(old_limit, adj_limit);
+        register_new_node( adj_cmp, ctrl );
+        Node* adj_bool = new BoolNode(adj_cmp, bt);
+        register_new_node( adj_bool, ctrl );
+        new_limit = new CMoveINode(adj_bool, adj_limit, adj_max, TypeInt::INT);
+      }
+      register_new_node(new_limit, ctrl);
     }
-
-    if (adjust_min_trip) {
-      // Step 2: Adjust the trip limit if it is called for.
-      // The adjustment amount is -stride. Need to make sure if the
-      // adjustment underflows or overflows, then the main loop is skipped.
-      Node* cmp = loop_end->cmp_node();
-      assert(cmp->in(2) == limit, "sanity");
-      assert(opaq != NULL && opaq->in(1) == limit, "sanity");
-
-      // Verify that policy_unroll result is still valid.
-      const TypeInt* limit_type = _igvn.type(limit)->is_int();
-      assert(stride_con > 0 && ((limit_type->_hi - stride_con) < limit_type->_hi) ||
-             stride_con < 0 && ((limit_type->_lo - stride_con) > limit_type->_lo), "sanity");
-
-      if (limit->is_Con()) {
-        // The check in policy_unroll and the assert above guarantee
-        // no underflow if limit is constant.
-        new_limit = _igvn.intcon(limit->get_int() - stride_con);
-        set_ctrl(new_limit, C->root());
-      } else {
-        // Limit is not constant.
-        if (loop_head->unrolled_count() == 1) { // only for first unroll
-          // Separate limit by Opaque node in case it is an incremented
-          // variable from previous loop to avoid using pre-incremented
-          // value which could increase register pressure.
-          // Otherwise reorg_offsets() optimization will create a separate
-          // Opaque node for each use of trip-counter and as result
-          // zero trip guard limit will be different from loop limit.
-          assert(has_ctrl(opaq), "should have it");
-          Node* opaq_ctrl = get_ctrl(opaq);
-          limit = new Opaque2Node( C, limit );
-          register_new_node( limit, opaq_ctrl );
-        }
-        if (stride_con > 0 && (java_subtract(limit_type->_lo, stride_con) < limit_type->_lo) ||
-            stride_con < 0 && (java_subtract(limit_type->_hi, stride_con) > limit_type->_hi)) {
-          // No underflow.
-          new_limit = new SubINode(limit, stride);
-        } else {
-          // (limit - stride) may underflow.
-          // Clamp the adjustment value with MININT or MAXINT:
-          //
-          //   new_limit = limit-stride
-          //   if (stride > 0)
-          //     new_limit = (limit < new_limit) ? MININT : new_limit;
-          //   else
-          //     new_limit = (limit > new_limit) ? MAXINT : new_limit;
-          //
-          BoolTest::mask bt = loop_end->test_trip();
-          assert(bt == BoolTest::lt || bt == BoolTest::gt, "canonical test is expected");
-          Node* adj_max = _igvn.intcon((stride_con > 0) ? min_jint : max_jint);
-          set_ctrl(adj_max, C->root());
-          Node* old_limit = NULL;
-          Node* adj_limit = NULL;
-          Node* bol = limit->is_CMove() ? limit->in(CMoveNode::Condition) : NULL;
-          if (loop_head->unrolled_count() > 1 &&
-              limit->is_CMove() && limit->Opcode() == Op_CMoveI &&
-              limit->in(CMoveNode::IfTrue) == adj_max &&
-              bol->as_Bool()->_test._test == bt &&
-              bol->in(1)->Opcode() == Op_CmpI &&
-              bol->in(1)->in(2) == limit->in(CMoveNode::IfFalse)) {
-            // Loop was unrolled before.
-            // Optimize the limit to avoid nested CMove:
-            // use original limit as old limit.
-            old_limit = bol->in(1)->in(1);
-            // Adjust previous adjusted limit.
-            adj_limit = limit->in(CMoveNode::IfFalse);
-            adj_limit = new SubINode(adj_limit, stride);
-          } else {
-            old_limit = limit;
-            adj_limit = new SubINode(limit, stride);
-          }
-          assert(old_limit != NULL && adj_limit != NULL, "");
-          register_new_node( adj_limit, ctrl ); // adjust amount
-          Node* adj_cmp = new CmpINode(old_limit, adj_limit);
-          register_new_node( adj_cmp, ctrl );
-          Node* adj_bool = new BoolNode(adj_cmp, bt);
-          register_new_node( adj_bool, ctrl );
-          new_limit = new CMoveINode(adj_bool, adj_limit, adj_max, TypeInt::INT);
-        }
-        register_new_node(new_limit, ctrl);
-      }
-      assert(new_limit != NULL, "");
-      // Replace in loop test.
-      assert(loop_end->in(1)->in(1) == cmp, "sanity");
-      if (cmp->outcnt() == 1 && loop_end->in(1)->outcnt() == 1) {
-        // Don't need to create new test since only one user.
-        _igvn.hash_delete(cmp);
-        cmp->set_req(2, new_limit);
-      } else {
-        // Create new test since it is shared.
-        Node* ctrl2 = loop_end->in(0);
-        Node* cmp2  = cmp->clone();
-        cmp2->set_req(2, new_limit);
-        register_new_node(cmp2, ctrl2);
-        Node* bol2 = loop_end->in(1)->clone();
-        bol2->set_req(1, cmp2);
-        register_new_node(bol2, ctrl2);
-        _igvn.replace_input_of(loop_end, 1, bol2);
-      }
-      // Step 3: Find the min-trip test guaranteed before a 'main' loop.
-      // Make it a 1-trip test (means at least 2 trips).
-
-      // Guard test uses an 'opaque' node which is not shared.  Hence I
-      // can edit it's inputs directly.  Hammer in the new limit for the
-      // minimum-trip guard.
-      assert(opaq->outcnt() == 1, "");
-      _igvn.replace_input_of(opaq, 1, new_limit);
+    assert(new_limit != NULL, "");
+    // Replace in loop test.
+    assert(loop_end->in(1)->in(1) == cmp, "sanity");
+    if (cmp->outcnt() == 1 && loop_end->in(1)->outcnt() == 1) {
+      // Don't need to create new test since only one user.
+      _igvn.hash_delete(cmp);
+      cmp->set_req(2, new_limit);
+    } else {
+      // Create new test since it is shared.
+      Node* ctrl2 = loop_end->in(0);
+      Node* cmp2  = cmp->clone();
+      cmp2->set_req(2, new_limit);
+      register_new_node(cmp2, ctrl2);
+      Node* bol2 = loop_end->in(1)->clone();
+      bol2->set_req(1, cmp2);
+      register_new_node(bol2, ctrl2);
+      _igvn.replace_input_of(loop_end, 1, bol2);
     }
-
-    // Adjust max trip count. The trip count is intentionally rounded
-    // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
-    // the main, unrolled, part of the loop will never execute as it is protected
-    // by the min-trip test.  See bug 4834191 for a case where we over-unrolled
-    // and later determined that part of the unrolled loop was dead.
-    loop_head->set_trip_count(old_trip_count / 2);
-
-    // Double the count of original iterations in the unrolled loop body.
-    loop_head->double_unrolled_count();
-
-  } else { // LoopLimitCheck
-
-    // Adjust max trip count. The trip count is intentionally rounded
-    // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
-    // the main, unrolled, part of the loop will never execute as it is protected
-    // by the min-trip test.  See bug 4834191 for a case where we over-unrolled
-    // and later determined that part of the unrolled loop was dead.
-    loop_head->set_trip_count(loop_head->trip_count() / 2);
-
-    // Double the count of original iterations in the unrolled loop body.
-    loop_head->double_unrolled_count();
-
-    // -----------
-    // Step 2: Cut back the trip counter for an unroll amount of 2.
-    // Loop will normally trip (limit - init)/stride_con.  Since it's a
-    // CountedLoop this is exact (stride divides limit-init exactly).
-    // We are going to double the loop body, so we want to knock off any
-    // odd iteration: (trip_cnt & ~1).  Then back compute a new limit.
-    Node *span = new SubINode( limit, init );
-    register_new_node( span, ctrl );
-    Node *trip = new DivINode( 0, span, stride );
-    register_new_node( trip, ctrl );
-    Node *mtwo = _igvn.intcon(-2);
-    set_ctrl(mtwo, C->root());
-    Node *rond = new AndINode( trip, mtwo );
-    register_new_node( rond, ctrl );
-    Node *spn2 = new MulINode( rond, stride );
-    register_new_node( spn2, ctrl );
-    new_limit = new AddINode( spn2, init );
-    register_new_node( new_limit, ctrl );
-
-    // Hammer in the new limit
-    Node *ctrl2 = loop_end->in(0);
-    Node *cmp2 = new CmpINode( loop_head->incr(), new_limit );
-    register_new_node( cmp2, ctrl2 );
-    Node *bol2 = new BoolNode( cmp2, loop_end->test_trip() );
-    register_new_node( bol2, ctrl2 );
-    _igvn.replace_input_of(loop_end, CountedLoopEndNode::TestValue, bol2);
-
     // Step 3: Find the min-trip test guaranteed before a 'main' loop.
     // Make it a 1-trip test (means at least 2 trips).
-    if( adjust_min_trip ) {
-      assert( new_limit != NULL, "" );
-      // Guard test uses an 'opaque' node which is not shared.  Hence I
-      // can edit it's inputs directly.  Hammer in the new limit for the
-      // minimum-trip guard.
-      assert( opaq->outcnt() == 1, "" );
-      _igvn.hash_delete(opaq);
-      opaq->set_req(1, new_limit);
-    }
-  } // LoopLimitCheck
+
+    // Guard test uses an 'opaque' node which is not shared.  Hence I
+    // can edit it's inputs directly.  Hammer in the new limit for the
+    // minimum-trip guard.
+    assert(opaq->outcnt() == 1, "");
+    _igvn.replace_input_of(opaq, 1, new_limit);
+  }
+
+  // Adjust max trip count. The trip count is intentionally rounded
+  // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
+  // the main, unrolled, part of the loop will never execute as it is protected
+  // by the min-trip test.  See bug 4834191 for a case where we over-unrolled
+  // and later determined that part of the unrolled loop was dead.
+  loop_head->set_trip_count(old_trip_count / 2);
+
+  // Double the count of original iterations in the unrolled loop body.
+  loop_head->double_unrolled_count();
 
   // ---------
   // Step 4: Clone the loop body.  Move it inside the loop.  This loop body
@@ -1904,7 +1830,6 @@
     //   )
 
     if (low_limit->get_int() == -max_jint) {
-      if (!RangeLimitCheck) return;
       // We need this guard when scale*pre_limit+offset >= limit
       // due to underflow. So we need execute pre-loop until
       // scale*I+offset >= min_int. But (min_int-offset) will
@@ -1956,7 +1881,6 @@
     *pre_limit = adjust_limit((-stride_con), scale, plus_one, upper_limit, *pre_limit, pre_ctrl);
 
     if (low_limit->get_int() == -max_jint) {
-      if (!RangeLimitCheck) return;
       // We need this guard when scale*main_limit+offset >= limit
       // due to underflow. So we need execute main-loop while
       // scale*I+offset+1 > min_int. But (min_int-offset-1) will
@@ -2091,7 +2015,7 @@
 
 //------------------------------do_range_check---------------------------------
 // Eliminate range-checks and other trip-counter vs loop-invariant tests.
-void PhaseIdealLoop::do_range_check( IdealLoopTree *loop, Node_List &old_new ) {
+int PhaseIdealLoop::do_range_check( IdealLoopTree *loop, Node_List &old_new ) {
 #ifndef PRODUCT
   if (PrintOpto && VerifyLoopOptimizations) {
     tty->print("Range Check Elimination ");
@@ -2103,10 +2027,12 @@
 #endif
   assert(RangeCheckElimination, "");
   CountedLoopNode *cl = loop->_head->as_CountedLoop();
+  // If we fail before trying to eliminate range checks, set multiversion state
+  int closed_range_checks = 1;
 
   // protect against stride not being a constant
   if (!cl->stride_is_con())
-    return;
+    return closed_range_checks;
 
   // Find the trip counter; we are iteration splitting based on it
   Node *trip_counter = cl->phi();
@@ -2117,8 +2043,8 @@
   // Check graph shape. Cannot optimize a loop if zero-trip
   // Opaque1 node is optimized away and then another round
   // of loop opts attempted.
-  if (!is_canonical_main_loop_entry(cl)) {
-    return;
+  if (!is_canonical_loop_entry(cl)) {
+    return closed_range_checks;
   }
 
   // Need to find the main-loop zero-trip guard
@@ -2132,7 +2058,7 @@
   Node *p_f = iffm->in(0);
   // pre loop may have been optimized out
   if (p_f->Opcode() != Op_IfFalse) {
-    return;
+    return closed_range_checks;
   }
   CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd();
   assert(pre_end->loopnode()->is_pre_loop(), "");
@@ -2141,7 +2067,7 @@
   // optimized away and then another round of loop opts attempted.
   // We can not optimize this particular loop in that case.
   if (pre_opaq1->Opcode() != Op_Opaque1)
-    return;
+    return closed_range_checks;
   Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
   Node *pre_limit = pre_opaq->in(1);
 
@@ -2152,7 +2078,7 @@
   // pre-loop Opaque1 node.
   Node *orig_limit = pre_opaq->original_loop_limit();
   if (orig_limit == NULL || _igvn.type(orig_limit) == Type::TOP)
-    return;
+    return closed_range_checks;
 
   // Must know if its a count-up or count-down loop
 
@@ -2173,6 +2099,10 @@
   // executed.
   bool conditional_rc = false;
 
+  // Count number of range checks and reduce by load range limits, if zero,
+  // the loop is in canonical form to multiversion.
+  closed_range_checks = 0;
+
   // Check loop body for tests of trip-counter plus loop-invariant vs
   // loop-invariant.
   for( uint i = 0; i < loop->_body.size(); i++ ) {
@@ -2181,6 +2111,7 @@
         iff->Opcode() == Op_RangeCheck) { // Test?
       // Test is an IfNode, has 2 projections.  If BOTH are in the loop
       // we need loop unswitching instead of iteration splitting.
+      closed_range_checks++;
       Node *exit = loop->is_loop_exit(iff);
       if( !exit ) continue;
       int flip = (exit->Opcode() == Op_IfTrue) ? 1 : 0;
@@ -2258,7 +2189,7 @@
           add_constraint( stride_con, scale_con, offset, zero, limit, pre_ctrl, &pre_limit, &main_limit );
           if (!conditional_rc) {
             // (0-offset)/scale could be outside of loop iterations range.
-            conditional_rc = !loop->dominates_backedge(iff) || RangeLimitCheck;
+            conditional_rc = !loop->dominates_backedge(iff);
           }
         } else {
           if (PrintOpto) {
@@ -2275,7 +2206,7 @@
           scale_con = -scale_con;
           offset = new SubINode( zero, offset );
           register_new_node( offset, pre_ctrl );
-          limit  = new SubINode( zero, limit  );
+          limit  = new SubINode( zero, limit );
           register_new_node( limit, pre_ctrl );
           // Fall into LE case
         case BoolTest::le:
@@ -2294,7 +2225,7 @@
             // ((MIN_INT+1)-offset)/scale could be outside of loop iterations range.
             // Note: negative offset is replaced with 0 but (MIN_INT+1)/scale could
             // still be outside of loop range.
-            conditional_rc = !loop->dominates_backedge(iff) || RangeLimitCheck;
+            conditional_rc = !loop->dominates_backedge(iff);
           }
           break;
         default:
@@ -2324,6 +2255,9 @@
           --imax;
         }
       }
+      if (limit->Opcode() == Op_LoadRange) {
+        closed_range_checks--;
+      }
 
     } // End of is IF
 
@@ -2340,26 +2274,6 @@
   // Note:: we are making the main loop limit no longer precise;
   // need to round up based on stride.
   cl->set_nonexact_trip_count();
-  if (!LoopLimitCheck && stride_con != 1 && stride_con != -1) { // Cutout for common case
-    // "Standard" round-up logic:  ([main_limit-init+(y-1)]/y)*y+init
-    // Hopefully, compiler will optimize for powers of 2.
-    Node *ctrl = get_ctrl(main_limit);
-    Node *stride = cl->stride();
-    Node *init = cl->init_trip()->uncast();
-    Node *span = new SubINode(main_limit,init);
-    register_new_node(span,ctrl);
-    Node *rndup = _igvn.intcon(stride_con + ((stride_con>0)?-1:1));
-    Node *add = new AddINode(span,rndup);
-    register_new_node(add,ctrl);
-    Node *div = new DivINode(0,add,stride);
-    register_new_node(div,ctrl);
-    Node *mul = new MulINode(div,stride);
-    register_new_node(mul,ctrl);
-    Node *newlim = new AddINode(mul,init);
-    register_new_node(newlim,ctrl);
-    main_limit = newlim;
-  }
-
   Node *main_cle = cl->loopexit();
   Node *main_bol = main_cle->in(1);
   // Hacking loop bounds; need private copies of exit test
@@ -2379,6 +2293,169 @@
   // The OpaqueNode is unshared by design
   assert( opqzm->outcnt() == 1, "cannot hack shared node" );
   _igvn.replace_input_of(opqzm, 1, main_limit);
+
+  return closed_range_checks;
+}
+
+//------------------------------has_range_checks-------------------------------
+// Check to see if RCE cleaned the current loop of range-checks.
+void PhaseIdealLoop::has_range_checks(IdealLoopTree *loop) {
+  assert(RangeCheckElimination, "");
+
+  // skip if not a counted loop
+  if (!loop->is_counted()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // skip this loop if it is already checked
+  if (cl->has_been_range_checked()) return;
+
+  // Now check for existance of range checks
+  for (uint i = 0; i < loop->_body.size(); i++) {
+    Node *iff = loop->_body[i];
+    int iff_opc = iff->Opcode();
+    if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
+      cl->mark_has_range_checks();
+      break;
+    }
+  }
+  cl->set_has_been_range_checked();
+}
+
+//-------------------------multi_version_post_loops----------------------------
+// Check the range checks that remain, if simple, use the bounds to guard
+// which version to a post loop we execute, one with range checks or one without
+bool PhaseIdealLoop::multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop) {
+  bool multi_version_succeeded = false;
+  assert(RangeCheckElimination, "");
+  CountedLoopNode *legacy_cl = legacy_loop->_head->as_CountedLoop();
+  assert(legacy_cl->is_post_loop(), "");
+
+  // Check for existance of range checks using the unique instance to make a guard with
+  Unique_Node_List worklist;
+  for (uint i = 0; i < legacy_loop->_body.size(); i++) {
+    Node *iff = legacy_loop->_body[i];
+    int iff_opc = iff->Opcode();
+    if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
+      worklist.push(iff);
+    }
+  }
+
+  // Find RCE'd post loop so that we can stage its guard.
+  if (!is_canonical_loop_entry(legacy_cl)) return multi_version_succeeded;
+  Node* ctrl = legacy_cl->in(LoopNode::EntryControl);
+  Node* iffm = ctrl->in(0);
+
+  // Now we test that both the post loops are connected
+  Node* post_loop_region = iffm->in(0);
+  if (post_loop_region == NULL) return multi_version_succeeded;
+  if (!post_loop_region->is_Region()) return multi_version_succeeded;
+  Node* covering_region = post_loop_region->in(RegionNode::Control+1);
+  if (covering_region == NULL) return multi_version_succeeded;
+  if (!covering_region->is_Region()) return multi_version_succeeded;
+  Node* p_f = covering_region->in(RegionNode::Control);
+  if (p_f == NULL) return multi_version_succeeded;
+  if (!p_f->is_IfFalse()) return multi_version_succeeded;
+  if (!p_f->in(0)->is_CountedLoopEnd()) return multi_version_succeeded;
+  CountedLoopEndNode* rce_loop_end = p_f->in(0)->as_CountedLoopEnd();
+  if (rce_loop_end == NULL) return multi_version_succeeded;
+  CountedLoopNode* rce_cl = rce_loop_end->loopnode();
+  if (rce_cl == NULL || !rce_cl->is_post_loop()) return multi_version_succeeded;
+  CountedLoopNode *known_rce_cl = rce_loop->_head->as_CountedLoop();
+  if (rce_cl != known_rce_cl) return multi_version_succeeded;
+
+  // Then we fetch the cover entry test
+  ctrl = rce_cl->in(LoopNode::EntryControl);
+  if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return multi_version_succeeded;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostMultiVersion\n");
+    rce_loop->dump_head();
+    legacy_loop->dump_head();
+  }
+#endif
+
+  // Now fetch the limit we want to compare against
+  Node *limit = rce_cl->limit();
+  bool first_time = true;
+
+  // If we got this far, we identified the post loop which has been RCE'd and
+  // we have a work list.  Now we will try to transform the if guard to cause
+  // the loop pair to be multi version executed with the determination left to runtime
+  // or the optimizer if full information is known about the given arrays at compile time.
+  Node *last_min = NULL;
+  multi_version_succeeded = true;
+  while (worklist.size()) {
+    Node* rc_iffm = worklist.pop();
+    if (rc_iffm->is_If()) {
+      Node *rc_bolzm = rc_iffm->in(1);
+      if (rc_bolzm->is_Bool()) {
+        Node *rc_cmpzm = rc_bolzm->in(1);
+        if (rc_cmpzm->is_Cmp()) {
+          Node *rc_left = rc_cmpzm->in(2);
+          if (rc_left->Opcode() != Op_LoadRange) {
+            multi_version_succeeded = false;
+            break;
+          }
+          if (first_time) {
+            last_min = rc_left;
+            first_time = false;
+          } else {
+            Node *cur_min = new MinINode(last_min, rc_left);
+            last_min = cur_min;
+            _igvn.register_new_node_with_optimizer(last_min);
+          }
+        }
+      }
+    }
+  }
+
+  // All we have to do is update the limit of the rce loop
+  // with the min of our expression and the current limit.
+  // We will use this expression to replace the current limit.
+  if (last_min && multi_version_succeeded) {
+    Node *cur_min = new MinINode(last_min, limit);
+    _igvn.register_new_node_with_optimizer(cur_min);
+    Node *cmp_node = rce_loop_end->cmp_node();
+    _igvn.replace_input_of(cmp_node, 2, cur_min);
+    set_idom(cmp_node, cur_min, dom_depth(ctrl));
+    set_ctrl(cur_min, ctrl);
+    set_loop(cur_min, rce_loop->_parent);
+
+    legacy_cl->mark_is_multiversioned();
+    rce_cl->mark_is_multiversioned();
+    multi_version_succeeded = true;
+
+    C->set_major_progress();
+  }
+
+  return multi_version_succeeded;
+}
+
+//-------------------------poison_rce_post_loop--------------------------------
+// Causes the rce'd post loop to be optimized away if multiverioning fails
+void PhaseIdealLoop::poison_rce_post_loop(IdealLoopTree *rce_loop) {
+  CountedLoopNode *rce_cl = rce_loop->_head->as_CountedLoop();
+  Node* ctrl = rce_cl->in(LoopNode::EntryControl);
+  if (ctrl->is_IfTrue() || ctrl->is_IfFalse()) {
+    Node* iffm = ctrl->in(0);
+    if (iffm->is_If()) {
+      Node* cur_bool = iffm->in(1);
+      if (cur_bool->is_Bool()) {
+        Node* cur_cmp = cur_bool->in(1);
+        if (cur_cmp->is_Cmp()) {
+          BoolTest::mask new_test = BoolTest::gt;
+          BoolNode *new_bool = new BoolNode(cur_cmp, new_test);
+          _igvn.replace_node(cur_bool, new_bool);
+          _igvn._worklist.push(new_bool);
+          Node* left_op = cur_cmp->in(1);
+          _igvn.replace_input_of(cur_cmp, 2, left_op);
+          C->set_major_progress();
+        }
+      }
+    }
+  }
 }
 
 //------------------------------DCE_loop_body----------------------------------
@@ -2738,8 +2815,20 @@
     // Adjust the pre- and main-loop limits to let the pre and post loops run
     // with full checks, but the main-loop with no checks.  Remove said
     // checks from the main body.
-    if (should_rce)
-      phase->do_range_check(this,old_new);
+    if (should_rce) {
+      if (phase->do_range_check(this, old_new) != 0) {
+        cl->mark_has_range_checks();
+      }
+    } else {
+      phase->has_range_checks(this);
+    }
+
+    if (should_unroll && !should_peel && PostLoopMultiversioning) {
+      // Try to setup multiversioning on main loops before they are unrolled
+      if (cl->is_main_loop() && (cl->unrolled_count() == 1)) {
+        phase->insert_scalar_rced_post_loop(this, old_new);
+      }
+    }
 
     // Double loop body for unrolling.  Adjust the minimum-trip test (will do
     // twice as many iterations as before) and the main body limit (only do
--- a/src/share/vm/opto/loopUnswitch.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/loopUnswitch.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -138,7 +138,7 @@
   Node* uniqc = proj_true->unique_ctrl_out();
   Node* entry = head->in(LoopNode::EntryControl);
   Node* predicate = find_predicate(entry);
-  if (predicate != NULL && LoopLimitCheck && UseLoopPredicate) {
+  if (predicate != NULL && UseLoopPredicate) {
     // We may have two predicates, find first.
     entry = find_predicate(entry->in(0)->in(0));
     if (entry != NULL) predicate = entry;
--- a/src/share/vm/opto/loopnode.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/loopnode.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -464,8 +464,6 @@
 
   Node *hook = new Node(6);
 
-  if (LoopLimitCheck) {
-
   // ===================================================
   // Generate loop limit check to avoid integer overflow
   // in cases like next (cyclic loops):
@@ -594,103 +592,6 @@
   }
   set_subtree_ctrl( limit );
 
-  } else { // LoopLimitCheck
-
-  // If compare points to incr, we are ok.  Otherwise the compare
-  // can directly point to the phi; in this case adjust the compare so that
-  // it points to the incr by adjusting the limit.
-  if (cmp->in(1) == phi || cmp->in(2) == phi)
-    limit = gvn->transform(new AddINode(limit,stride));
-
-  // trip-count for +-tive stride should be: (limit - init_trip + stride - 1)/stride.
-  // Final value for iterator should be: trip_count * stride + init_trip.
-  Node *one_p = gvn->intcon( 1);
-  Node *one_m = gvn->intcon(-1);
-
-  Node *trip_count = NULL;
-  switch( bt ) {
-  case BoolTest::eq:
-    ShouldNotReachHere();
-  case BoolTest::ne:            // Ahh, the case we desire
-    if (stride_con == 1)
-      trip_count = gvn->transform(new SubINode(limit,init_trip));
-    else if (stride_con == -1)
-      trip_count = gvn->transform(new SubINode(init_trip,limit));
-    else
-      ShouldNotReachHere();
-    set_subtree_ctrl(trip_count);
-    //_loop.map(trip_count->_idx,loop(limit));
-    break;
-  case BoolTest::le:            // Maybe convert to '<' case
-    limit = gvn->transform(new AddINode(limit,one_p));
-    set_subtree_ctrl( limit );
-    hook->init_req(4, limit);
-
-    bt = BoolTest::lt;
-    // Make the new limit be in the same loop nest as the old limit
-    //_loop.map(limit->_idx,limit_loop);
-    // Fall into next case
-  case BoolTest::lt: {          // Maybe convert to '!=' case
-    if (stride_con < 0) // Count down loop rolls through MAXINT
-      ShouldNotReachHere();
-    Node *range = gvn->transform(new SubINode(limit,init_trip));
-    set_subtree_ctrl( range );
-    hook->init_req(0, range);
-
-    Node *bias  = gvn->transform(new AddINode(range,stride));
-    set_subtree_ctrl( bias );
-    hook->init_req(1, bias);
-
-    Node *bias1 = gvn->transform(new AddINode(bias,one_m));
-    set_subtree_ctrl( bias1 );
-    hook->init_req(2, bias1);
-
-    trip_count  = gvn->transform(new DivINode(0,bias1,stride));
-    set_subtree_ctrl( trip_count );
-    hook->init_req(3, trip_count);
-    break;
-  }
-
-  case BoolTest::ge:            // Maybe convert to '>' case
-    limit = gvn->transform(new AddINode(limit,one_m));
-    set_subtree_ctrl( limit );
-    hook->init_req(4 ,limit);
-
-    bt = BoolTest::gt;
-    // Make the new limit be in the same loop nest as the old limit
-    //_loop.map(limit->_idx,limit_loop);
-    // Fall into next case
-  case BoolTest::gt: {          // Maybe convert to '!=' case
-    if (stride_con > 0) // count up loop rolls through MININT
-      ShouldNotReachHere();
-    Node *range = gvn->transform(new SubINode(limit,init_trip));
-    set_subtree_ctrl( range );
-    hook->init_req(0, range);
-
-    Node *bias  = gvn->transform(new AddINode(range,stride));
-    set_subtree_ctrl( bias );
-    hook->init_req(1, bias);
-
-    Node *bias1 = gvn->transform(new AddINode(bias,one_p));
-    set_subtree_ctrl( bias1 );
-    hook->init_req(2, bias1);
-
-    trip_count  = gvn->transform(new DivINode(0,bias1,stride));
-    set_subtree_ctrl( trip_count );
-    hook->init_req(3, trip_count);
-    break;
-  }
-  } // switch( bt )
-
-  Node *span = gvn->transform(new MulINode(trip_count,stride));
-  set_subtree_ctrl( span );
-  hook->init_req(5, span);
-
-  limit = gvn->transform(new AddINode(span,init_trip));
-  set_subtree_ctrl( limit );
-
-  } // LoopLimitCheck
-
   if (!UseCountedLoopSafepoints) {
     // Check for SafePoint on backedge and remove
     Node *sfpt = x->in(LoopNode::LoopBackControl);
@@ -830,7 +731,7 @@
   CountedLoopNode *cl = loop->_head->as_CountedLoop();
   assert(cl->is_valid_counted_loop(), "");
 
-  if (!LoopLimitCheck || ABS(cl->stride_con()) == 1 ||
+  if (ABS(cl->stride_con()) == 1 ||
       cl->limit()->Opcode() == Op_LoopLimit) {
     // Old code has exact limit (it could be incorrect in case of int overflow).
     // Loop limit is exact with stride == 1. And loop may already have exact limit.
@@ -1898,12 +1799,10 @@
   tty->print("Loop: N%d/N%d ",_head->_idx,_tail->_idx);
   if (_irreducible) tty->print(" IRREDUCIBLE");
   Node* entry = _head->in(LoopNode::EntryControl);
-  if (LoopLimitCheck) {
-    Node* predicate = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
-    if (predicate != NULL ) {
-      tty->print(" limit_check");
-      entry = entry->in(0)->in(0);
-    }
+  Node* predicate = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+  if (predicate != NULL ) {
+    tty->print(" limit_check");
+    entry = entry->in(0)->in(0);
   }
   if (UseLoopPredicate) {
     entry = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
@@ -1934,6 +1833,9 @@
     if (cl->is_pre_loop ()) tty->print(" pre" );
     if (cl->is_main_loop()) tty->print(" main");
     if (cl->is_post_loop()) tty->print(" post");
+    if (cl->is_vectorized_loop()) tty->print(" vector");
+    if (cl->range_checks_present()) tty->print(" rc ");
+    if (cl->is_multiversioned()) tty->print(" multi ");
   }
   if (_has_call) tty->print(" has_call");
   if (_has_sfpt) tty->print(" has_sfpt");
@@ -2323,7 +2225,7 @@
   // Some parser-inserted loop predicates could never be used by loop
   // predication or they were moved away from loop during some optimizations.
   // For example, peeling. Eliminate them before next loop optimizations.
-  if (UseLoopPredicate || LoopLimitCheck) {
+  if (UseLoopPredicate) {
     eliminate_useless_predicates();
   }
 
@@ -2452,7 +2354,30 @@
     for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
       IdealLoopTree* lpt = iter.current();
       if (lpt->is_counted()) {
-        sw.transform_loop(lpt, true);
+        CountedLoopNode *cl = lpt->_head->as_CountedLoop();
+
+        if (PostLoopMultiversioning && cl->is_rce_post_loop() && !cl->is_vectorized_loop()) {
+          // Check that the rce'd post loop is encountered first, multiversion after all
+          // major main loop optimization are concluded
+          if (!C->major_progress()) {
+            IdealLoopTree *lpt_next = lpt->_next;
+            if (lpt_next && lpt_next->is_counted()) {
+              CountedLoopNode *cl = lpt_next->_head->as_CountedLoop();
+              has_range_checks(lpt_next);
+              if (cl->is_post_loop() && cl->range_checks_present()) {
+                if (!cl->is_multiversioned()) {
+                  if (multi_version_post_loops(lpt, lpt_next) == false) {
+                    // Cause the rce loop to be optimized away if we fail
+                    cl->mark_is_multiversioned();
+                    poison_rce_post_loop(lpt);
+                  }
+                }
+              }
+            }
+          }
+        } else if (cl->is_main_loop()) {
+          sw.transform_loop(lpt, true);
+        }
       }
     }
   }
@@ -3286,8 +3211,10 @@
 // loop unswitching, and IGVN, or a combination of them) can freely change
 // the graph's shape. As a result, the graph shape outlined below cannot
 // be guaranteed anymore.
-bool PhaseIdealLoop::is_canonical_main_loop_entry(CountedLoopNode* cl) {
-  assert(cl->is_main_loop(), "check should be applied to main loops");
+bool PhaseIdealLoop::is_canonical_loop_entry(CountedLoopNode* cl) {
+  if (!cl->is_main_loop() && !cl->is_post_loop()) {
+    return false;
+  }
   Node* ctrl = cl->in(LoopNode::EntryControl);
   if (ctrl == NULL || (!ctrl->is_IfTrue() && !ctrl->is_IfFalse())) {
     return false;
@@ -3304,8 +3231,16 @@
   if (cmpzm == NULL || !cmpzm->is_Cmp()) {
     return false;
   }
-  Node* opqzm = cmpzm->in(2);
-  if (opqzm == NULL || opqzm->Opcode() != Op_Opaque1) {
+  // compares can get conditionally flipped
+  bool found_opaque = false;
+  for (uint i = 1; i < cmpzm->req(); i++) {
+    Node* opnd = cmpzm->in(i);
+    if (opnd && opnd->Opcode() == Op_Opaque1) {
+      found_opaque = true;
+      break;
+    }
+  }
+  if (!found_opaque) {
     return false;
   }
   return true;
--- a/src/share/vm/opto/loopnode.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/loopnode.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -69,9 +69,13 @@
          PassedSlpAnalysis=512,
          DoUnrollOnly=1024,
          VectorizedLoop=2048,
-         HasAtomicPostLoop=4096 };
+         HasAtomicPostLoop=4096,
+         HasRangeChecks=8192,
+         IsMultiversioned=16384};
   char _unswitch_count;
   enum { _unswitch_max=3 };
+  char _postloop_flags;
+  enum { LoopNotRCEChecked = 0, LoopRCEChecked = 1, RCEPostLoop = 2 };
 
 public:
   // Names for edge indices
@@ -80,9 +84,13 @@
   int is_inner_loop() const { return _loop_flags & InnerLoop; }
   void set_inner_loop() { _loop_flags |= InnerLoop; }
 
+  int range_checks_present() const { return _loop_flags & HasRangeChecks; }
+  int is_multiversioned() const { return _loop_flags & IsMultiversioned; }
+  int is_vectorized_loop() const { return _loop_flags & VectorizedLoop; }
   int is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; }
   void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
   int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
+
   void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
   void mark_has_reductions() { _loop_flags |= HasReductions; }
   void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; }
@@ -90,15 +98,23 @@
   void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
   void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
   void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
+  void mark_has_range_checks() { _loop_flags |=  HasRangeChecks; }
+  void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
 
   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
+
+  int has_been_range_checked() const { return _postloop_flags & LoopRCEChecked; }
+  void set_has_been_range_checked() { _postloop_flags |= LoopRCEChecked; }
+  int is_rce_post_loop() const { return _postloop_flags & RCEPostLoop; }
+  void set_is_rce_post_loop() { _postloop_flags |= RCEPostLoop; }
+
   void set_unswitch_count(int val) {
     assert (val <= unswitch_max(), "too many unswitches");
     _unswitch_count = val;
   }
 
-  LoopNode( Node *entry, Node *backedge ) : RegionNode(3), _loop_flags(0), _unswitch_count(0) {
+  LoopNode(Node *entry, Node *backedge) : RegionNode(3), _loop_flags(0), _unswitch_count(0), _postloop_flags(0) {
     init_class_id(Class_Loop);
     init_req(EntryControl, entry);
     init_req(LoopBackControl, backedge);
@@ -225,7 +241,6 @@
   int has_passed_slp   () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
   int do_unroll_only      () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
   int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
-  int is_vectorized_loop    () const { return (_loop_flags & VectorizedLoop) == VectorizedLoop; }
   int has_atomic_post_loop  () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
 
@@ -657,7 +672,7 @@
 
 public:
 
-  static bool is_canonical_main_loop_entry(CountedLoopNode* cl);
+  static bool is_canonical_loop_entry(CountedLoopNode* cl);
 
   bool has_node( Node* n ) const {
     guarantee(n != NULL, "No Node.");
@@ -911,6 +926,15 @@
   // Add pre and post loops around the given loop.  These loops are used
   // during RCE, unrolling and aligning loops.
   void insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_new, bool peel_only );
+
+  // Add post loop after the given loop.
+  Node *insert_post_loop(IdealLoopTree *loop, Node_List &old_new,
+                         CountedLoopNode *main_head, CountedLoopEndNode *main_end,
+                         Node *incr, Node *limit, CountedLoopNode *&post_head);
+
+  // Add an RCE'd post loop which we will multi-version adapt for run time test path usage
+  void insert_scalar_rced_post_loop( IdealLoopTree *loop, Node_List &old_new );
+
   // Add a vector post loop between a vector main loop and the current post loop
   void insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new);
   // If Node n lives in the back_ctrl block, we clone a private version of n
@@ -983,7 +1007,17 @@
   }
 
   // Eliminate range-checks and other trip-counter vs loop-invariant tests.
-  void do_range_check( IdealLoopTree *loop, Node_List &old_new );
+  int do_range_check( IdealLoopTree *loop, Node_List &old_new );
+
+  // Check to see if do_range_check(...) cleaned the main loop of range-checks
+  void has_range_checks(IdealLoopTree *loop);
+
+  // Process post loops which have range checks and try to build a multi-version
+  // guard to safely determine if we can execute the post loop which was RCE'd.
+  bool multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop);
+
+  // Cause the rce'd post loop to optimized away, this happens if we cannot complete multiverioning
+  void poison_rce_post_loop(IdealLoopTree *rce_loop);
 
   // Create a slow version of the loop by cloning the loop
   // and inserting an if to select fast-slow versions.
--- a/src/share/vm/opto/parse1.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/parse1.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -662,8 +662,7 @@
         // (Note that dead locals do not get phis built, ever.)
         ensure_phis_everywhere();
 
-        if (block->is_SEL_head() &&
-            (UseLoopPredicate || LoopLimitCheck)) {
+        if (block->is_SEL_head() && UseLoopPredicate) {
           // Add predicate to single entry (not irreducible) loop head.
           assert(!block->has_merged_backedge(), "only entry paths should be merged for now");
           // Need correct bci for predicate.
--- a/src/share/vm/opto/superword.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/opto/superword.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -3077,7 +3077,7 @@
 CountedLoopEndNode* SuperWord::get_pre_loop_end(CountedLoopNode* cl) {
   // The loop cannot be optimized if the graph shape at
   // the loop entry is inappropriate.
-  if (!PhaseIdealLoop::is_canonical_main_loop_entry(cl)) {
+  if (!PhaseIdealLoop::is_canonical_loop_entry(cl)) {
     return NULL;
   }
 
--- a/src/share/vm/prims/whitebox.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/prims/whitebox.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1373,8 +1373,8 @@
   return ConstantPool::encode_invokedynamic_index(index);
 WB_END
 
-WB_ENTRY(void, WB_ClearInlineCaches(JNIEnv* env, jobject wb))
-  VM_ClearICs clear_ics;
+WB_ENTRY(void, WB_ClearInlineCaches(JNIEnv* env, jobject wb, jboolean preserve_static_stubs))
+  VM_ClearICs clear_ics(preserve_static_stubs == JNI_TRUE);
   VMThread::execute(&clear_ics);
 WB_END
 
@@ -1754,7 +1754,7 @@
   {CC"isShared",           CC"(Ljava/lang/Object;)Z", (void*)&WB_IsShared },
   {CC"isSharedClass",      CC"(Ljava/lang/Class;)Z",  (void*)&WB_IsSharedClass },
   {CC"areSharedStringsIgnored",           CC"()Z",    (void*)&WB_AreSharedStringsIgnored },
-  {CC"clearInlineCaches",  CC"()V",                   (void*)&WB_ClearInlineCaches },
+  {CC"clearInlineCaches0",  CC"(Z)V",                 (void*)&WB_ClearInlineCaches },
   {CC"addCompilerDirective",    CC"(Ljava/lang/String;)I",
                                                       (void*)&WB_AddCompilerDirective },
   {CC"removeCompilerDirective", CC"(I)V",             (void*)&WB_RemoveCompilerDirective },
--- a/src/share/vm/runtime/arguments.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/runtime/arguments.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -3657,6 +3657,11 @@
   }
 #endif
 
+#if !defined(COMPILER2) && !INCLUDE_JVMCI
+  UNSUPPORTED_OPTION(ProfileInterpreter);
+  NOT_PRODUCT(UNSUPPORTED_OPTION(TraceProfileInterpreter));
+#endif
+
 #ifndef TIERED
   // Tiered compilation is undefined.
   UNSUPPORTED_OPTION(TieredCompilation);
--- a/src/share/vm/runtime/globals.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/runtime/globals.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -735,7 +735,7 @@
           "Control whether SHA instructions can be used "                   \
           "on SPARC, on ARM and on x86")                                    \
                                                                             \
-  diagnostic(bool, UseGHASHIntrinsics, false,                               \
+  product(bool, UseGHASHIntrinsics, false,                                  \
           "Use intrinsics for GHASH versions of crypto")                    \
                                                                             \
   product(size_t, LargePageSizeInBytes, 0,                                  \
@@ -805,27 +805,27 @@
   product(bool, UseInlineCaches, true,                                      \
           "Use Inline Caches for virtual calls ")                           \
                                                                             \
-  diagnostic(bool, InlineArrayCopy, true,                                   \
+  develop(bool, InlineArrayCopy, true,                                      \
           "Inline arraycopy native that is known to be part of "            \
           "base library DLL")                                               \
                                                                             \
-  diagnostic(bool, InlineObjectHash, true,                                  \
+  develop(bool, InlineObjectHash, true,                                     \
           "Inline Object::hashCode() native that is known to be part "      \
           "of base library DLL")                                            \
                                                                             \
-  diagnostic(bool, InlineNatives, true,                                     \
+  develop(bool, InlineNatives, true,                                        \
           "Inline natives that are known to be part of base library DLL")   \
                                                                             \
-  diagnostic(bool, InlineMathNatives, true,                                 \
+  develop(bool, InlineMathNatives, true,                                    \
           "Inline SinD, CosD, etc.")                                        \
                                                                             \
-  diagnostic(bool, InlineClassNatives, true,                                \
+  develop(bool, InlineClassNatives, true,                                   \
           "Inline Class.isInstance, etc")                                   \
                                                                             \
-  diagnostic(bool, InlineThreadNatives, true,                               \
+  develop(bool, InlineThreadNatives, true,                                  \
           "Inline Thread.currentThread, etc")                               \
                                                                             \
-  diagnostic(bool, InlineUnsafeOps, true,                                   \
+  develop(bool, InlineUnsafeOps, true,                                      \
           "Inline memory ops (native methods) from Unsafe")                 \
                                                                             \
   product(bool, CriticalJNINatives, true,                                   \
@@ -834,34 +834,34 @@
   notproduct(bool, StressCriticalJNINatives, false,                         \
           "Exercise register saving code in critical natives")              \
                                                                             \
-  diagnostic(bool, UseAESIntrinsics, false,                                 \
+  product(bool, UseAESIntrinsics, false,                                    \
           "Use intrinsics for AES versions of crypto")                      \
                                                                             \
-  diagnostic(bool, UseAESCTRIntrinsics, false,                              \
+  product(bool, UseAESCTRIntrinsics, false,                                 \
           "Use intrinsics for the paralleled version of AES/CTR crypto")    \
                                                                             \
-  diagnostic(bool, UseSHA1Intrinsics, false,                                \
+  product(bool, UseSHA1Intrinsics, false,                                   \
           "Use intrinsics for SHA-1 crypto hash function. "                 \
           "Requires that UseSHA is enabled.")                               \
                                                                             \
-  diagnostic(bool, UseSHA256Intrinsics, false,                              \
+  product(bool, UseSHA256Intrinsics, false,                                 \
           "Use intrinsics for SHA-224 and SHA-256 crypto hash functions. "  \
           "Requires that UseSHA is enabled.")                               \
                                                                             \
-  diagnostic(bool, UseSHA512Intrinsics, false,                              \
+  product(bool, UseSHA512Intrinsics, false,                                 \
           "Use intrinsics for SHA-384 and SHA-512 crypto hash functions. "  \
           "Requires that UseSHA is enabled.")                               \
                                                                             \
-  diagnostic(bool, UseCRC32Intrinsics, false,                               \
+  product(bool, UseCRC32Intrinsics, false,                                  \
           "use intrinsics for java.util.zip.CRC32")                         \
                                                                             \
-  diagnostic(bool, UseCRC32CIntrinsics, false,                              \
+  product(bool, UseCRC32CIntrinsics, false,                                 \
           "use intrinsics for java.util.zip.CRC32C")                        \
                                                                             \
-  diagnostic(bool, UseAdler32Intrinsics, false,                             \
+  product(bool, UseAdler32Intrinsics, false,                                \
           "use intrinsics for java.util.zip.Adler32")                       \
                                                                             \
-  diagnostic(bool, UseVectorizedMismatchIntrinsic, false,                   \
+  product(bool, UseVectorizedMismatchIntrinsic, false,                      \
           "Enables intrinsification of ArraysSupport.vectorizedMismatch()") \
                                                                             \
   diagnostic(ccstrlist, DisableIntrinsic, "",                               \
@@ -2501,9 +2501,6 @@
           "generate locking/unlocking code for synchronized methods and "   \
           "monitors")                                                       \
                                                                             \
-  develop(bool, GenerateCompilerNullChecks, true,                           \
-          "Generate explicit null checks for loads/stores/calls")           \
-                                                                            \
   develop(bool, GenerateRangeChecks, true,                                  \
           "Generate range checks for array accesses")                       \
                                                                             \
--- a/src/share/vm/runtime/simpleThresholdPolicy.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/runtime/simpleThresholdPolicy.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -233,6 +233,14 @@
   if (level == CompLevel_none) {
     return;
   }
+
+#if INCLUDE_JVMCI
+  // We can't compile with a JVMCI compiler until the module system is initialized.
+  if (level == CompLevel_full_optimization && UseJVMCICompiler && !Universe::is_module_initialized()) {
+    return;
+  }
+#endif
+
   // Check if the method can be compiled. If it cannot be compiled with C1, continue profiling
   // in the interpreter and then compile with C2 (the transition function will request that,
   // see common() ). If the method cannot be compiled with C2 but still can with C1, compile it with
--- a/src/share/vm/runtime/thread.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/runtime/thread.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -792,10 +792,6 @@
   handle_area()->oops_do(f);
 }
 
-void Thread::nmethods_do(CodeBlobClosure* cf) {
-  // no nmethods in a generic thread...
-}
-
 void Thread::metadata_handles_do(void f(Metadata*)) {
   // Only walk the Handles in Thread.
   if (metadata_handles() != NULL) {
@@ -2828,8 +2824,6 @@
 }
 
 void JavaThread::nmethods_do(CodeBlobClosure* cf) {
-  Thread::nmethods_do(cf);  // (super method is a no-op)
-
   assert((!has_last_Java_frame() && java_call_counter() == 0) ||
          (has_last_Java_frame() && java_call_counter() > 0), "wrong java_sp info!");
 
@@ -3304,6 +3298,7 @@
 : JavaThread(&sweeper_thread_entry) {
   _scanned_nmethod = NULL;
 }
+
 void CodeCacheSweeperThread::oops_do(OopClosure* f, CLDClosure* cld_f, CodeBlobClosure* cf) {
   JavaThread::oops_do(f, cld_f, cf);
   if (_scanned_nmethod != NULL && cf != NULL) {
@@ -3314,6 +3309,16 @@
   }
 }
 
+void CodeCacheSweeperThread::nmethods_do(CodeBlobClosure* cf) {
+  JavaThread::nmethods_do(cf);
+  if (_scanned_nmethod != NULL && cf != NULL) {
+    // Safepoints can occur when the sweeper is scanning an nmethod so
+    // process it here to make sure it isn't unloaded in the middle of
+    // a scan.
+    cf->do_code_blob(_scanned_nmethod);
+  }
+}
+
 
 // ======= Threads ========
 
@@ -4342,9 +4347,13 @@
 
 void Threads::nmethods_do(CodeBlobClosure* cf) {
   ALL_JAVA_THREADS(p) {
-    p->nmethods_do(cf);
+    // This is used by the code cache sweeper to mark nmethods that are active
+    // on the stack of a Java thread. Ignore the sweeper thread itself to avoid
+    // marking CodeCacheSweeperThread::_scanned_nmethod as active.
+    if(!p->is_Code_cache_sweeper_thread()) {
+      p->nmethods_do(cf);
+    }
   }
-  VMThread::vm_thread()->nmethods_do(cf);
 }
 
 void Threads::metadata_do(void f(Metadata*)) {
--- a/src/share/vm/runtime/thread.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/runtime/thread.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -509,9 +509,6 @@
     }
   }
 
-  // Sweeper support
-  void nmethods_do(CodeBlobClosure* cf);
-
   // jvmtiRedefineClasses support
   void metadata_handles_do(void f(Metadata*));
 
@@ -1649,7 +1646,7 @@
   void oops_do(OopClosure* f, CLDClosure* cld_f, CodeBlobClosure* cf);
 
   // Sweeper operations
-  void nmethods_do(CodeBlobClosure* cf);
+  virtual void nmethods_do(CodeBlobClosure* cf);
 
   // RedefineClasses Support
   void metadata_do(void f(Metadata*));
@@ -1997,10 +1994,10 @@
   bool is_hidden_from_external_view() const { return true; }
 
   bool is_Code_cache_sweeper_thread() const { return true; }
-  // GC support
-  // Apply "f->do_oop" to all root oops in "this".
-  // Apply "cf->do_code_blob" (if !NULL) to all code blobs active in frames
+
+  // Prevent GC from unloading _scanned_nmethod
   void oops_do(OopClosure* f, CLDClosure* cld_f, CodeBlobClosure* cf);
+  void nmethods_do(CodeBlobClosure* cf);
 };
 
 // A thread used for Compilation.
--- a/src/share/vm/runtime/vmStructs.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/runtime/vmStructs.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -385,8 +385,8 @@
   nonstatic_field(MethodCounters,              _interpreter_profile_limit,                    int)                                   \
   nonstatic_field(MethodCounters,              _invoke_mask,                                  int)                                   \
   nonstatic_field(MethodCounters,              _backedge_mask,                                int)                                   \
-  nonstatic_field(MethodCounters,              _interpreter_invocation_count,                 int)                                   \
-  nonstatic_field(MethodCounters,              _interpreter_throwout_count,                   u2)                                    \
+  COMPILER2_OR_JVMCI_PRESENT(nonstatic_field(MethodCounters, _interpreter_invocation_count,   int))                                  \
+  COMPILER2_OR_JVMCI_PRESENT(nonstatic_field(MethodCounters, _interpreter_throwout_count,     u2))                                   \
   nonstatic_field(MethodCounters,              _number_of_breakpoints,                        u2)                                    \
   nonstatic_field(MethodCounters,              _invocation_counter,                           InvocationCounter)                     \
   nonstatic_field(MethodCounters,              _backedge_counter,                             InvocationCounter)                     \
--- a/src/share/vm/runtime/vm_operations.cpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/runtime/vm_operations.cpp	Mon Apr 11 20:38:50 2016 +0000
@@ -105,6 +105,14 @@
   }
 }
 
+void VM_ClearICs::doit() {
+  if (_preserve_static_stubs) {
+    CodeCache::cleanup_inline_caches();
+  } else {
+    CodeCache::clear_inline_caches();
+  }
+}
+
 void VM_Deoptimize::doit() {
   // We do not want any GCs to happen while we are in the middle of this VM operation
   ResourceMark rm;
--- a/src/share/vm/runtime/vm_operations.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/runtime/vm_operations.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -231,9 +231,11 @@
 };
 
 class VM_ClearICs: public VM_Operation {
+ private:
+  bool _preserve_static_stubs;
  public:
-  VM_ClearICs() {}
-  void doit()         { CodeCache::clear_inline_caches(); }
+  VM_ClearICs(bool preserve_static_stubs) { _preserve_static_stubs = preserve_static_stubs; }
+  void doit();
   VMOp_Type type() const { return VMOp_ClearICs; }
 };
 
--- a/src/share/vm/utilities/macros.hpp	Mon Apr 11 12:50:08 2016 +0000
+++ b/src/share/vm/utilities/macros.hpp	Mon Apr 11 20:38:50 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -206,6 +206,17 @@
 #define NOT_COMPILER2(code) code
 #endif // COMPILER2
 
+// COMPILER2 or JVMCI
+#if defined(COMPILER2) || INCLUDE_JVMCI
+#define COMPILER2_OR_JVMCI 1
+#define COMPILER2_OR_JVMCI_PRESENT(code) code
+#define NOT_COMPILER2_OR_JVMCI(code)
+#else
+#define COMPILER2_OR_JVMCI 0
+#define COMPILER2_OR_JVMCI_PRESENT(code)
+#define NOT_COMPILER2_OR_JVMCI(code) code
+#endif
+
 #ifdef TIERED
 #define TIERED_ONLY(code) code
 #define NOT_TIERED(code)
--- a/test/compiler/dependencies/MonomorphicObjectCall/TestMonomorphicObjectCall.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/test/compiler/dependencies/MonomorphicObjectCall/TestMonomorphicObjectCall.java	Mon Apr 11 20:38:50 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -21,25 +21,14 @@
  * questions.
  */
 
-import java.io.File;
-import java.nio.file.Files;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.Collections;
-
-import jdk.test.lib.*;
-
 /*
  * @test
  * @bug 8050079
  * @summary Compiles a monomorphic call to finalizeObject() on a modified java.lang.Object to test C1 CHA.
- * @library /testlibrary
- * @modules java.base/sun.misc
- *          java.management
- *          java.base/jdk.internal
- * @ignore 8132924
- * @compile -XDignore.symbol.file java/lang/Object.java TestMonomorphicObjectCall.java
- * @run main TestMonomorphicObjectCall
+ * @build java.base/java.lang.Object
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -Xcomp -XX:-VerifyDependencies
+ *                   -XX:TieredStopAtLevel=1 -XX:CompileOnly=TestMonomorphicObjectCall::callFinalize
+ *                   -XX:CompileOnly=java.lang.Object::finalizeObject TestMonomorphicObjectCall
  */
 public class TestMonomorphicObjectCall {
 
@@ -51,32 +40,7 @@
     }
 
     public static void main(String[] args) throws Throwable {
-        if (args.length == 0) {
-            byte[] bytecode = Files.readAllBytes(Paths.get(System.getProperty("test.classes") + File.separator +
-                "java" + File.separator + "lang" + File.separator + "Object.class"));
-            ClassFileInstaller.writeClassToDisk("java.lang.Object", bytecode, "mods/java.base");
-            // Execute new instance with modified java.lang.Object
-            executeTestJvm();
-        } else {
-            // Trigger compilation of 'callFinalize'
-            callFinalize(new Object());
-        }
-    }
-
-    public static void executeTestJvm() throws Throwable {
-        // Execute test with modified version of java.lang.Object
-        // in -Xbootclasspath.
-        String[] vmOpts = new String[] {
-                "-Xpatch:mods",
-                "-Xcomp",
-                "-XX:+IgnoreUnrecognizedVMOptions",
-                "-XX:-VerifyDependencies",
-                "-XX:CompileOnly=TestMonomorphicObjectCall::callFinalize",
-                "-XX:CompileOnly=Object::finalizeObject",
-                "-XX:TieredStopAtLevel=1",
-                TestMonomorphicObjectCall.class.getName(),
-                "true"};
-        OutputAnalyzer output = ProcessTools.executeTestJvm(vmOpts);
-        output.shouldHaveExitValue(0);
+        // Trigger compilation of 'callFinalize'
+        callFinalize(new Object());
     }
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/dependencies/MonomorphicObjectCall/java.base/java/lang/Object.java	Mon Apr 11 20:38:50 2016 +0000
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 1994, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package java.lang;
+
+import jdk.internal.HotSpotIntrinsicCandidate;
+
+/**
+ * Slightly modified version of java.lang.Object that replaces
+ * finalize() by finalizeObject() to avoid overriding in subclasses.
+ */
+public class Object {
+
+    @HotSpotIntrinsicCandidate
+    public Object() {}
+
+    private static native void registerNatives();
+    static {
+        registerNatives();
+    }
+
+    @HotSpotIntrinsicCandidate
+    public final native Class<?> getClass();
+
+    @HotSpotIntrinsicCandidate
+    public native int hashCode();
+
+    public boolean equals(Object obj) {
+        return (this == obj);
+    }
+
+    @HotSpotIntrinsicCandidate
+    protected native Object clone() throws CloneNotSupportedException;
+
+    public String toString() {
+        return getClass().getName() + "@" + Integer.toHexString(hashCode());
+    }
+
+    @HotSpotIntrinsicCandidate
+    public final native void notify();
+
+    @HotSpotIntrinsicCandidate
+    public final native void notifyAll();
+
+    public final native void wait(long timeout) throws InterruptedException;
+
+    public final void wait(long timeout, int nanos) throws InterruptedException {
+        if (timeout < 0) {
+            throw new IllegalArgumentException("timeout value is negative");
+        }
+
+        if (nanos < 0 || nanos > 999999) {
+            throw new IllegalArgumentException(
+                                "nanosecond timeout value out of range");
+        }
+
+        if (nanos >= 500000 || (nanos != 0 && timeout == 0)) {
+            timeout++;
+        }
+
+        wait(timeout);
+    }
+
+    public final void wait() throws InterruptedException {
+        wait(0);
+    }
+
+    /**
+     * Replaces original finalize() method and is therefore not
+     * overridden by any subclasses of Object.
+     * @throws Throwable
+     */
+    // protected void finalize() throws Throwable { }
+    public void finalizeObject() throws Throwable { }
+}
--- a/test/compiler/dependencies/MonomorphicObjectCall/java/lang/Object.java	Mon Apr 11 12:50:08 2016 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 1994, 2014, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.  Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-package java.lang;
-
-import jdk.internal.HotSpotIntrinsicCandidate;
-
-/**
- * Slightly modified version of java.lang.Object that replaces
- * finalize() by finalizeObject() to avoid overriding in subclasses.
- */
-public class Object {
-
-    @HotSpotIntrinsicCandidate
-    public Object() {}
-
-    private static native void registerNatives();
-    static {
-        registerNatives();
-    }
-
-    @HotSpotIntrinsicCandidate
-    public final native Class<?> getClass();
-
-    @HotSpotIntrinsicCandidate
-    public native int hashCode();
-
-    public boolean equals(Object obj) {
-        return (this == obj);
-    }
-
-    @HotSpotIntrinsicCandidate
-    protected native Object clone() throws CloneNotSupportedException;
-
-    public String toString() {
-        return getClass().getName() + "@" + Integer.toHexString(hashCode());
-    }
-
-    @HotSpotIntrinsicCandidate
-    public final native void notify();
-
-    @HotSpotIntrinsicCandidate
-    public final native void notifyAll();
-
-    public final native void wait(long timeout) throws InterruptedException;
-
-    public final void wait(long timeout, int nanos) throws InterruptedException {
-        if (timeout < 0) {
-            throw new IllegalArgumentException("timeout value is negative");
-        }
-
-        if (nanos < 0 || nanos > 999999) {
-            throw new IllegalArgumentException(
-                                "nanosecond timeout value out of range");
-        }
-
-        if (nanos >= 500000 || (nanos != 0 && timeout == 0)) {
-            timeout++;
-        }
-
-        wait(timeout);
-    }
-
-    public final void wait() throws InterruptedException {
-        wait(0);
-    }
-
-    /**
-     * Replaces original finalize() method and is therefore not
-     * overridden by any subclasses of Object.
-     * @throws Throwable
-     */
-    // protected void finalize() throws Throwable { }
-    public void finalizeObject() throws Throwable { }
-}
--- a/test/compiler/intrinsics/muladd/TestMulAdd.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/test/compiler/intrinsics/muladd/TestMulAdd.java	Mon Apr 11 20:38:50 2016 +0000
@@ -28,7 +28,7 @@
  * @summary Add C2 x86 intrinsic for BigInteger::mulAdd() method
  *
  * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
- *      -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic
+ *      -XX:+IgnoreUnrecognizedVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic
  *      -XX:CompileCommand=dontinline,TestMulAdd::main
  *      -XX:CompileCommand=option,TestMulAdd::base_multiply,ccstr,DisableIntrinsic,_mulAdd
  *      -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_mulAdd
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/jsr292/InvokerGC.java	Mon Apr 11 20:38:50 2016 +0000
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8067247
+ * @library /test/lib /compiler/whitebox /
+ * @run main/bootclasspath -Xcomp -Xbatch
+ *      -XX:CompileCommand=compileonly,InvokerGC::test
+ *      -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI
+ *      InvokerGC
+ */
+
+import java.lang.invoke.*;
+import sun.hotspot.WhiteBox;
+
+public class InvokerGC {
+    static final WhiteBox WB = WhiteBox.getWhiteBox();
+
+    static MethodHandle mh;
+    static {
+        try {
+            mh = MethodHandles.lookup().findStatic(InvokerGC.class, "dummy", MethodType.methodType(void.class));
+        } catch (Exception e) {
+            throw new Error(e);
+        }
+    }
+
+    static void dummy() {}
+
+    static void test() {
+        try {
+            mh.invoke();
+        } catch (Throwable e) {
+            throw new Error(e);
+        }
+    }
+
+    public static void main(String[] args) throws Throwable {
+        mh.invoke(); // Pre-generate an invoker for ()V signature
+
+        test(); // trigger method compilation
+        test();
+
+        WB.fullGC(); // WB.fullGC has always clear softref policy.
+
+        test();
+
+        WB.clearInlineCaches(true); // Preserve static stubs.
+
+        test(); // Trigger call site re-resolution. Invoker LambdaForm should stay the same.
+
+        System.out.println("TEST PASSED");
+    }
+}
--- a/test/compiler/jvmci/events/JvmciNotifyInstallEventTest.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/test/compiler/jvmci/events/JvmciNotifyInstallEventTest.java	Mon Apr 11 20:38:50 2016 +0000
@@ -47,14 +47,20 @@
  *     compiler.jvmci.common.CTVMUtilities
  *     compiler.jvmci.common.testcases.SimpleClass
  *     jdk.test.lib.Asserts
+ *     jdk.test.lib.Utils
  * @run main/othervm -XX:+UnlockExperimentalVMOptions -XX:+EnableJVMCI
  *     -Djvmci.compiler=EmptyCompiler -Xbootclasspath/a:. -Xmixed
  *     -XX:+UseJVMCICompiler -XX:-BootstrapJVMCI
- *     -Dcompiler.jvmci.events.JvmciNotifyInstallEventTest.noevent=false
+ *     -Dcompiler.jvmci.events.JvmciNotifyInstallEventTest.failoninit=false
+ *     compiler.jvmci.events.JvmciNotifyInstallEventTest
+ * @run main/othervm -XX:+UnlockExperimentalVMOptions -XX:+EnableJVMCI
+ *     -Djvmci.compiler=EmptyCompiler -Xbootclasspath/a:. -Xmixed
+ *     -XX:+UseJVMCICompiler -XX:-BootstrapJVMCI -XX:JVMCINMethodSizeLimit=0
+ *     -Dcompiler.jvmci.events.JvmciNotifyInstallEventTest.failoninit=false
  *     compiler.jvmci.events.JvmciNotifyInstallEventTest
  * @run main/othervm -XX:+UnlockExperimentalVMOptions -XX:-EnableJVMCI
  *     -Djvmci.compiler=EmptyCompiler -Xbootclasspath/a:. -Xmixed
- *     -Dcompiler.jvmci.events.JvmciNotifyInstallEventTest.noevent=true
+ *     -Dcompiler.jvmci.events.JvmciNotifyInstallEventTest.failoninit=true
  *     compiler.jvmci.events.JvmciNotifyInstallEventTest
  */
 
@@ -64,6 +70,7 @@
 import compiler.jvmci.common.testcases.SimpleClass;
 import jdk.test.lib.Asserts;
 import java.lang.reflect.Method;
+import jdk.test.lib.Utils;
 import jdk.vm.ci.hotspot.HotSpotVMEventListener;
 import jdk.vm.ci.code.CompiledCode;
 import jdk.vm.ci.code.InstalledCode;
@@ -79,8 +86,8 @@
 
 public class JvmciNotifyInstallEventTest implements HotSpotVMEventListener {
     private static final String METHOD_NAME = "testMethod";
-    private static final boolean IS_POSITIVE = !Boolean.getBoolean(
-            "compiler.jvmci.events.JvmciNotifyInstallEventTest.noevent");
+    private static final boolean FAIL_ON_INIT = !Boolean.getBoolean(
+            "compiler.jvmci.events.JvmciNotifyInstallEventTest.failoninit");
     private static volatile int gotInstallNotification = 0;
 
     public static void main(String args[]) {
@@ -91,12 +98,12 @@
         if (gotInstallNotification != 0) {
             throw new Error("Got install notification before test actions");
         }
-        HotSpotCodeCacheProvider codeCache = null;
+        HotSpotCodeCacheProvider codeCache;
         try {
             codeCache = (HotSpotCodeCacheProvider) HotSpotJVMCIRuntime.runtime()
                     .getHostJVMCIBackend().getCodeCache();
         } catch (InternalError ie) {
-            if (IS_POSITIVE) {
+            if (FAIL_ON_INIT) {
                 throw new AssertionError(
                         "Got unexpected InternalError trying to get code cache",
                         ie);
@@ -104,7 +111,7 @@
             // passed
             return;
         }
-        Asserts.assertTrue(IS_POSITIVE,
+        Asserts.assertTrue(FAIL_ON_INIT,
                     "Haven't caught InternalError in negative case");
         Method testMethod;
         try {
@@ -114,18 +121,30 @@
         }
         HotSpotResolvedJavaMethod method = CTVMUtilities
                 .getResolvedMethod(SimpleClass.class, testMethod);
-        HotSpotCompiledCode compiledCode = new HotSpotCompiledCode(METHOD_NAME, new byte[0], 0, new Site[0],
-                new Assumption[0], new ResolvedJavaMethod[]{method}, new Comment[0], new byte[0], 16,
-                new DataPatch[0], false, 0, null);
-        codeCache.installCode(method, compiledCode, /* installedCode = */ null, /* speculationLog = */ null,
-                /* isDefault = */ false);
+        HotSpotCompiledCode compiledCode = new HotSpotCompiledCode(METHOD_NAME,
+                new byte[0], 0, new Site[0], new Assumption[0],
+                new ResolvedJavaMethod[]{method}, new Comment[0], new byte[0],
+                16, new DataPatch[0], false, 0, null);
+        codeCache.installCode(method, compiledCode, /* installedCode = */ null,
+                /* speculationLog = */ null, /* isDefault = */ false);
         Asserts.assertEQ(gotInstallNotification, 1,
                 "Got unexpected event count after 1st install attempt");
         // since "empty" compilation result is ok, a second attempt should be ok
-        codeCache.installCode(method, compiledCode, /* installedCode = */ null, /* speculationLog = */ null,
-                /* isDefault = */ false);
+        codeCache.installCode(method, compiledCode, /* installedCode = */ null,
+                /* speculationLog = */ null, /* isDefault = */ false);
         Asserts.assertEQ(gotInstallNotification, 2,
                 "Got unexpected event count after 2nd install attempt");
+        // and an incorrect cases
+        Utils.runAndCheckException(() -> {
+            codeCache.installCode(method, null, null, null, true);
+        }, NullPointerException.class);
+        Asserts.assertEQ(gotInstallNotification, 2,
+                "Got unexpected event count after 3rd install attempt");
+        Utils.runAndCheckException(() -> {
+            codeCache.installCode(null, null, null, null, true);
+        }, NullPointerException.class);
+        Asserts.assertEQ(gotInstallNotification, 2,
+                "Got unexpected event count after 4th install attempt");
     }
 
     @Override
--- a/test/compiler/loopopts/TestCastIINoLoopLimitCheck.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/test/compiler/loopopts/TestCastIINoLoopLimitCheck.java	Mon Apr 11 20:38:50 2016 +0000
@@ -26,10 +26,19 @@
  * @test
  * @bug 8073184
  * @summary CastII that guards counted loops confuses range check elimination with LoopLimitCheck off
- * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions -XX:-LoopLimitCheck -XX:CompileOnly=TestCastIINoLoopLimitCheck.m -Xcomp  TestCastIINoLoopLimitCheck
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:CompileOnly=TestCastIINoLoopLimitCheck.m -Xcomp  TestCastIINoLoopLimitCheck
  *
  */
 
+/*
+ * The test was originally run with
+ *
+ * -XX:+UnlockDiagnosticVMOptions -XX:-LoopLimitCheck
+ *
+ * to trigger a problem with code guarded with !LoopLimitCheck.
+ * JDK-8072422 has removed that code but kept the test because the
+ * test generates an interesting graph shape.
+ */
 public class TestCastIINoLoopLimitCheck {
 
     static void m(int i, int index, char[] buf) {
--- a/test/compiler/runtime/6859338/Test6859338.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/test/compiler/runtime/6859338/Test6859338.java	Mon Apr 11 20:38:50 2016 +0000
@@ -27,7 +27,7 @@
  * @bug 6859338
  * @summary Assertion failure in sharedRuntime.cpp
  *
- * @run main/othervm -Xcomp -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions -XX:-InlineObjectHash -Xbatch -XX:-ProfileInterpreter Test6859338
+ * @run main/othervm -Xcomp -XX:+IgnoreUnrecognizedVMOptions  -XX:-InlineObjectHash -Xbatch -XX:-ProfileInterpreter Test6859338
  */
 
 public class Test6859338 {
--- a/test/testlibrary/jittester/src/jdk/test/lib/jittester/visitors/JavaCodeVisitor.java	Mon Apr 11 12:50:08 2016 +0000
+++ b/test/testlibrary/jittester/src/jdk/test/lib/jittester/visitors/JavaCodeVisitor.java	Mon Apr 11 20:38:50 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -164,21 +164,15 @@
         code.append(node.getChildren().stream()
                 .map(p -> p.accept(this))
                 .collect(Collectors.joining("][", "[", "]")));
-        code.append(";\n")
-                .append(PrintingUtils.align(node.getParent().getLevel()))
+        code.append(";\n");
+        if (!TypeList.isBuiltIn(arrayType)) {
+            code.append(PrintingUtils.align(node.getParent().getLevel()))
                 .append("java.util.Arrays.fill(")
                 .append(name)
-                .append(", ");
-        if (TypeList.find("boolean") == arrayType) {
-            code.append("false");
-        } else if (TypeList.isBuiltIn(arrayType)) {
-            code.append("0");
-        } else {
-            code.append("new ")
+                .append(", new ")
                 .append(type)
-                .append("()");
+                .append("());\n");
         }
-        code.append(");\n");
         return code.toString();
     }