changeset 10909:057f09eab4e6

Merge
author rprotacio
date Thu, 14 Apr 2016 14:05:09 +0000
parents 8cca19173bcb 52aa0ee93b32
children d40b70cbc101
files src/share/vm/utilities/top.hpp test/compiler/dependencies/MonomorphicObjectCall/java/lang/Object.java
diffstat 242 files changed, 4107 insertions(+), 2451 deletions(-) [+]
line wrap: on
line diff
--- a/make/share/makefiles/mapfile-vers	Thu Apr 14 09:46:03 2016 -0400
+++ b/make/share/makefiles/mapfile-vers	Thu Apr 14 14:05:09 2016 +0000
@@ -41,7 +41,6 @@
                 JVM_DumpAllStacks;
                 JVM_DumpThreads;
                 JVM_FillInStackTrace;
-                JVM_FillStackFrames;
                 JVM_FindClassFromCaller;
                 JVM_FindClassFromClass;
                 JVM_FindClassFromBootLoader;
@@ -157,13 +156,13 @@
                 JVM_SetClassSigners;
                 JVM_SetNativeThreadName;
                 JVM_SetPrimitiveArrayElement;
-                JVM_SetMethodInfo;
                 JVM_SetThreadPriority;
                 JVM_Sleep;
                 JVM_StartThread;
                 JVM_StopThread;
                 JVM_SuspendThread;
                 JVM_SupportsCX8;
+                JVM_ToStackTraceElement;
                 JVM_TotalMemory;
                 JVM_UnloadLibrary;
                 JVM_Yield;
--- a/makefiles/gensrc/GensrcAdlc.gmk	Thu Apr 14 09:46:03 2016 -0400
+++ b/makefiles/gensrc/GensrcAdlc.gmk	Thu Apr 14 14:05:09 2016 +0000
@@ -41,9 +41,8 @@
     ADLC_CFLAGS := -m64
     ADLC_CFLAGS_WARNINGS := +w
   else ifeq ($(OPENJDK_BUILD_OS), aix)
-    # FIXME: Not implemented. These flags are likely, however
-    # ADLC_LDFLAGS := -q64
-    # ADLC_CFLAGS := -qnortti -qnoeh -q64
+    ADLC_LDFLAGS := -q64
+    ADLC_CFLAGS := -qnortti -qeh -q64 -DAIX
   else ifeq ($(OPENJDK_BUILD_OS), windows)
     ADLC_LDFLAGS := -nologo
     ADLC_CFLAGS := -nologo -EHsc
@@ -89,7 +88,7 @@
   else ifeq ($(OPENJDK_TARGET_OS), solaris)
     ADLCFLAGS += -DSOLARIS=1 -DSPARC_WORKS=1
   else ifeq ($(OPENJDK_TARGET_OS), aix)
-    # FIXME: Not implemented
+    ADLCFLAGS += -DAIX=1
   else ifeq ($(OPENJDK_TARGET_OS), macosx)
     ADLCFLAGS += -D_ALLBSD_SOURCE=1 -D_GNU_SOURCE=1
   endif
--- a/makefiles/symbols/symbols-unix	Thu Apr 14 09:46:03 2016 -0400
+++ b/makefiles/symbols/symbols-unix	Thu Apr 14 14:05:09 2016 +0000
@@ -58,7 +58,6 @@
 JVM_DumpAllStacks
 JVM_DumpThreads
 JVM_FillInStackTrace
-JVM_FillStackFrames
 JVM_FindClassFromCaller
 JVM_FindClassFromClass
 JVM_FindLibraryEntry
@@ -169,7 +168,6 @@
 JVM_ResumeThread
 JVM_SetArrayElement
 JVM_SetClassSigners
-JVM_SetMethodInfo
 JVM_SetNativeThreadName
 JVM_SetPrimitiveArrayElement
 JVM_SetThreadPriority
@@ -178,6 +176,7 @@
 JVM_StopThread
 JVM_SupportsCX8
 JVM_SuspendThread
+JVM_ToStackTraceElement
 JVM_TotalMemory
 JVM_UnloadLibrary
 JVM_Yield
--- a/src/cpu/aarch64/vm/aarch64.ad	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/aarch64.ad	Thu Apr 14 14:05:09 2016 +0000
@@ -14242,6 +14242,48 @@
   ins_pipe(pipe_cmp_branch);
 %}
 
+instruct cmpUI_imm0_branch(cmpOpU cmp, iRegIorL2I op1, immI0 op2, label labl, rFlagsRegU cr) %{
+  match(If cmp (CmpU op1 op2));
+  predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
+            || n->in(1)->as_Bool()->_test._test == BoolTest::eq
+            || n->in(1)->as_Bool()->_test._test == BoolTest::gt
+            ||  n->in(1)->as_Bool()->_test._test == BoolTest::le);
+  effect(USE labl);
+
+  ins_cost(BRANCH_COST);
+  format %{ "cbw$cmp   $op1, $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
+    if (cond == Assembler::EQ || cond == Assembler::LS)
+      __ cbzw($op1$$Register, *L);
+    else
+      __ cbnzw($op1$$Register, *L);
+  %}
+  ins_pipe(pipe_cmp_branch);
+%}
+
+instruct cmpUL_imm0_branch(cmpOpU cmp, iRegL op1, immL0 op2, label labl, rFlagsRegU cr) %{
+  match(If cmp (CmpU op1 op2));
+  predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
+            || n->in(1)->as_Bool()->_test._test == BoolTest::eq
+            || n->in(1)->as_Bool()->_test._test == BoolTest::gt
+            || n->in(1)->as_Bool()->_test._test == BoolTest::le);
+  effect(USE labl);
+
+  ins_cost(BRANCH_COST);
+  format %{ "cb$cmp   $op1, $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
+    if (cond == Assembler::EQ || cond == Assembler::LS)
+      __ cbz($op1$$Register, *L);
+    else
+      __ cbnz($op1$$Register, *L);
+  %}
+  ins_pipe(pipe_cmp_branch);
+%}
+
 // Test bit and Branch
 
 // Patterns for short (< 32KiB) variants
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1221,6 +1221,38 @@
   INSN(caspal,  true,  true)
 #undef INSN
 
+  // 8.1 Atomic operations
+  void lse_atomic(Register Rs, Register Rt, Register Rn,
+                  enum operand_size sz, int op1, int op2, bool a, bool r) {
+    starti;
+    f(sz, 31, 30), f(0b111000, 29, 24), f(a, 23), f(r, 22), f(1, 21);
+    rf(Rs, 16), f(op1, 15), f(op2, 14, 12), f(0, 11, 10), rf(Rn, 5), zrf(Rt, 0);
+  }
+
+#define INSN(NAME, NAME_A, NAME_L, NAME_AL, op1, op2)                   \
+  void NAME(operand_size sz, Register Rs, Register Rt, Register Rn) {   \
+    lse_atomic(Rs, Rt, Rn, sz, op1, op2, false, false);                 \
+  }                                                                     \
+  void NAME_A(operand_size sz, Register Rs, Register Rt, Register Rn) { \
+    lse_atomic(Rs, Rt, Rn, sz, op1, op2, true, false);                  \
+  }                                                                     \
+  void NAME_L(operand_size sz, Register Rs, Register Rt, Register Rn) { \
+    lse_atomic(Rs, Rt, Rn, sz, op1, op2, false, true);                  \
+  }                                                                     \
+  void NAME_AL(operand_size sz, Register Rs, Register Rt, Register Rn) {\
+    lse_atomic(Rs, Rt, Rn, sz, op1, op2, true, true);                   \
+  }
+  INSN(ldadd,  ldadda,  ldaddl,  ldaddal,  0, 0b000);
+  INSN(ldbic,  ldbica,  ldbicl,  ldbical,  0, 0b001);
+  INSN(ldeor,  ldeora,  ldeorl,  ldeoral,  0, 0b010);
+  INSN(ldorr,  ldorra,  ldorrl,  ldorral,  0, 0b011);
+  INSN(ldsmax, ldsmaxa, ldsmaxl, ldsmaxal, 0, 0b100);
+  INSN(ldsmin, ldsmina, ldsminl, ldsminal, 0, 0b101);
+  INSN(ldumax, ldumaxa, ldumaxl, ldumaxal, 0, 0b110);
+  INSN(ldumin, ldumina, lduminl, lduminal, 0, 0b111);
+  INSN(swp,    swpa,    swpl,    swpal,    1, 0b000);
+#undef INSN
+
   // Load register (literal)
 #define INSN(NAME, opc, V)                                              \
   void NAME(Register Rt, address dest) {                                \
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1556,54 +1556,14 @@
 }
 
 void LIR_Assembler::casw(Register addr, Register newval, Register cmpval) {
-  if (UseLSE) {
-    __ mov(rscratch1, cmpval);
-    __ casal(Assembler::word, rscratch1, newval, addr);
-    __ cmpw(rscratch1, cmpval);
-    __ cset(rscratch1, Assembler::NE);
-  } else {
-    Label retry_load, nope;
-    // flush and load exclusive from the memory location
-    // and fail if it is not what we expect
-    __ prfm(Address(addr), PSTL1STRM);
-    __ bind(retry_load);
-    __ ldaxrw(rscratch1, addr);
-    __ cmpw(rscratch1, cmpval);
-    __ cset(rscratch1, Assembler::NE);
-    __ br(Assembler::NE, nope);
-    // if we store+flush with no intervening write rscratch1 wil be zero
-    __ stlxrw(rscratch1, newval, addr);
-    // retry so we only ever return after a load fails to compare
-    // ensures we don't return a stale value after a failed write.
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(nope);
-  }
+  __ cmpxchg(addr, cmpval, newval, Assembler::word, /* acquire*/ true, /* release*/ true, rscratch1);
+  __ cset(rscratch1, Assembler::NE);
   __ membar(__ AnyAny);
 }
 
 void LIR_Assembler::casl(Register addr, Register newval, Register cmpval) {
-  if (UseLSE) {
-    __ mov(rscratch1, cmpval);
-    __ casal(Assembler::xword, rscratch1, newval, addr);
-    __ cmp(rscratch1, cmpval);
-    __ cset(rscratch1, Assembler::NE);
-  } else {
-    Label retry_load, nope;
-    // flush and load exclusive from the memory location
-    // and fail if it is not what we expect
-    __ prfm(Address(addr), PSTL1STRM);
-    __ bind(retry_load);
-    __ ldaxr(rscratch1, addr);
-    __ cmp(rscratch1, cmpval);
-    __ cset(rscratch1, Assembler::NE);
-    __ br(Assembler::NE, nope);
-    // if we store+flush with no intervening write rscratch1 wil be zero
-    __ stlxr(rscratch1, newval, addr);
-    // retry so we only ever return after a load fails to compare
-    // ensures we don't return a stale value after a failed write.
-    __ cbnz(rscratch1, retry_load);
-    __ bind(nope);
-  }
+  __ cmpxchg(addr, cmpval, newval, Assembler::xword, /* acquire*/ true, /* release*/ true, rscratch1);
+  __ cset(rscratch1, Assembler::NE);
   __ membar(__ AnyAny);
 }
 
@@ -3121,38 +3081,32 @@
   BasicType type = src->type();
   bool is_oop = type == T_OBJECT || type == T_ARRAY;
 
-  void (MacroAssembler::* lda)(Register Rd, Register Ra);
-  void (MacroAssembler::* add)(Register Rd, Register Rn, RegisterOrConstant increment);
-  void (MacroAssembler::* stl)(Register Rs, Register Rt, Register Rn);
+  void (MacroAssembler::* add)(Register prev, RegisterOrConstant incr, Register addr);
+  void (MacroAssembler::* xchg)(Register prev, Register newv, Register addr);
 
   switch(type) {
   case T_INT:
-    lda = &MacroAssembler::ldaxrw;
-    add = &MacroAssembler::addw;
-    stl = &MacroAssembler::stlxrw;
+    xchg = &MacroAssembler::atomic_xchgalw;
+    add = &MacroAssembler::atomic_addalw;
     break;
   case T_LONG:
-    lda = &MacroAssembler::ldaxr;
-    add = &MacroAssembler::add;
-    stl = &MacroAssembler::stlxr;
+    xchg = &MacroAssembler::atomic_xchgal;
+    add = &MacroAssembler::atomic_addal;
     break;
   case T_OBJECT:
   case T_ARRAY:
     if (UseCompressedOops) {
-      lda = &MacroAssembler::ldaxrw;
-      add = &MacroAssembler::addw;
-      stl = &MacroAssembler::stlxrw;
+      xchg = &MacroAssembler::atomic_xchgalw;
+      add = &MacroAssembler::atomic_addalw;
     } else {
-      lda = &MacroAssembler::ldaxr;
-      add = &MacroAssembler::add;
-      stl = &MacroAssembler::stlxr;
+      xchg = &MacroAssembler::atomic_xchgal;
+      add = &MacroAssembler::atomic_addal;
     }
     break;
   default:
     ShouldNotReachHere();
-    lda = &MacroAssembler::ldaxr;
-    add = &MacroAssembler::add;
-    stl = &MacroAssembler::stlxr;  // unreachable
+    xchg = &MacroAssembler::atomic_xchgal;
+    add = &MacroAssembler::atomic_addal; // unreachable
   }
 
   switch (code) {
@@ -3170,14 +3124,8 @@
         assert_different_registers(inc.as_register(), dst, addr.base(), tmp,
                                    rscratch1, rscratch2);
       }
-      Label again;
       __ lea(tmp, addr);
-      __ prfm(Address(tmp), PSTL1STRM);
-      __ bind(again);
-      (_masm->*lda)(dst, tmp);
-      (_masm->*add)(rscratch1, dst, inc);
-      (_masm->*stl)(rscratch2, rscratch1, tmp);
-      __ cbnzw(rscratch2, again);
+      (_masm->*add)(dst, inc, tmp);
       break;
     }
   case lir_xchg:
@@ -3186,17 +3134,12 @@
       Register obj = as_reg(data);
       Register dst = as_reg(dest);
       if (is_oop && UseCompressedOops) {
-        __ encode_heap_oop(rscratch1, obj);
-        obj = rscratch1;
+        __ encode_heap_oop(rscratch2, obj);
+        obj = rscratch2;
       }
-      assert_different_registers(obj, addr.base(), tmp, rscratch2, dst);
-      Label again;
+      assert_different_registers(obj, addr.base(), tmp, rscratch1, dst);
       __ lea(tmp, addr);
-      __ prfm(Address(tmp), PSTL1STRM);
-      __ bind(again);
-      (_masm->*lda)(dst, tmp);
-      (_masm->*stl)(rscratch2, obj, tmp);
-      __ cbnzw(rscratch2, again);
+      (_masm->*xchg)(dst, obj, tmp);
       if (is_oop && UseCompressedOops) {
         __ decode_heap_oop(dst);
       }
--- a/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -55,6 +55,7 @@
 define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit,              60);
 define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(intx, PostLoopMultiversioning,      false);
 // InitialCodeCacheSize derived from specjbb2000 run.
 define_pd_global(intx, InitialCodeCacheSize,         2496*K); // Integral multiple of CodeCacheExpansionSize
 define_pd_global(intx, CodeCacheExpansionSize,       64*K);
--- a/src/cpu/aarch64/vm/debug_aarch64.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/debug_aarch64.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -30,6 +30,5 @@
 #include "runtime/init.hpp"
 #include "runtime/os.hpp"
 #include "utilities/debug.hpp"
-#include "utilities/top.hpp"
 
 void pd_ps(frame f) {}
--- a/src/cpu/aarch64/vm/frame_aarch64.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/frame_aarch64.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -27,7 +27,6 @@
 #define CPU_AARCH64_VM_FRAME_AARCH64_HPP
 
 #include "runtime/synchronizer.hpp"
-#include "utilities/top.hpp"
 
 // A frame represents a physical stack frame (an activation).  Frames can be
 // C or Java frames, and the Java frames can be interpreted or compiled.
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1637,6 +1637,11 @@
 }
 
 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
+  if (UseLSE) {
+    mov(tmp, 1);
+    ldadd(Assembler::word, tmp, zr, counter_addr);
+    return;
+  }
   Label retry_load;
   prfm(Address(counter_addr), PSTL1STRM);
   bind(retry_load);
@@ -2172,8 +2177,18 @@
     return a != b.as_register() && a != c && b.as_register() != c;
 }
 
-#define ATOMIC_OP(LDXR, OP, IOP, STXR)                                       \
-void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
+#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
+void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
+  if (UseLSE) {                                                         \
+    prev = prev->is_valid() ? prev : zr;                                \
+    if (incr.is_register()) {                                           \
+      AOP(sz, incr.as_register(), prev, addr);                          \
+    } else {                                                            \
+      mov(rscratch2, incr.as_constant());                               \
+      AOP(sz, rscratch2, prev, addr);                                   \
+    }                                                                   \
+    return;                                                             \
+  }                                                                     \
   Register result = rscratch2;                                          \
   if (prev->is_valid())                                                 \
     result = different(prev, incr, addr) ? prev : rscratch2;            \
@@ -2190,13 +2205,20 @@
   }                                                                     \
 }
 
-ATOMIC_OP(ldxr, add, sub, stxr)
-ATOMIC_OP(ldxrw, addw, subw, stxrw)
+ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
+ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
+ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
+ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
 
 #undef ATOMIC_OP
 
-#define ATOMIC_XCHG(OP, LDXR, STXR)                                     \
+#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
+  if (UseLSE) {                                                         \
+    prev = prev->is_valid() ? prev : zr;                                \
+    AOP(sz, newv, prev, addr);                                          \
+    return;                                                             \
+  }                                                                     \
   Register result = rscratch2;                                          \
   if (prev->is_valid())                                                 \
     result = different(prev, newv, addr) ? prev : rscratch2;            \
@@ -2211,8 +2233,10 @@
     mov(prev, result);                                                  \
 }
 
-ATOMIC_XCHG(xchg, ldxr, stxr)
-ATOMIC_XCHG(xchgw, ldxrw, stxrw)
+ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
+ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
+ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
+ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
 
 #undef ATOMIC_XCHG
 
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -957,9 +957,13 @@
 
   void atomic_add(Register prev, RegisterOrConstant incr, Register addr);
   void atomic_addw(Register prev, RegisterOrConstant incr, Register addr);
+  void atomic_addal(Register prev, RegisterOrConstant incr, Register addr);
+  void atomic_addalw(Register prev, RegisterOrConstant incr, Register addr);
 
   void atomic_xchg(Register prev, Register newv, Register addr);
   void atomic_xchgw(Register prev, Register newv, Register addr);
+  void atomic_xchgal(Register prev, Register newv, Register addr);
+  void atomic_xchgalw(Register prev, Register newv, Register addr);
 
   void orptr(Address adr, RegisterOrConstant src) {
     ldr(rscratch2, adr);
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -30,7 +30,6 @@
 #include "memory/allocation.hpp"
 #include "runtime/icache.hpp"
 #include "runtime/os.hpp"
-#include "utilities/top.hpp"
 
 // We have interfaces for the following instructions:
 // - NativeInstruction
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -198,6 +198,16 @@
 bool SharedRuntime::is_wide_vector(int size) {
   return size > 8;
 }
+
+size_t SharedRuntime::trampoline_size() {
+  return 16;
+}
+
+void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
+  __ mov(rscratch1, destination);
+  __ br(rscratch1);
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -39,7 +39,6 @@
 #include "runtime/stubCodeGenerator.hpp"
 #include "runtime/stubRoutines.hpp"
 #include "runtime/thread.inline.hpp"
-#include "utilities/top.hpp"
 #ifdef COMPILER2
 #include "opto/runtime.hpp"
 #endif
@@ -1711,20 +1710,42 @@
   // to a long, int, short, or byte copy loop.
   //
   address generate_unsafe_copy(const char *name,
-                               address byte_copy_entry) {
-#ifdef PRODUCT
-    return StubRoutines::_jbyte_arraycopy;
-#else
+                               address byte_copy_entry,
+                               address short_copy_entry,
+                               address int_copy_entry,
+                               address long_copy_entry) {
+    Label L_long_aligned, L_int_aligned, L_short_aligned;
+    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
+
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
     __ enter(); // required for proper stackwalking of RuntimeStub frame
+
     // bump this on entry, not on exit:
-    __ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr));
-    __ incrementw(Address(rscratch2));
+    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
+
+    __ orr(rscratch1, s, d);
+    __ orr(rscratch1, rscratch1, count);
+
+    __ andr(rscratch1, rscratch1, BytesPerLong-1);
+    __ cbz(rscratch1, L_long_aligned);
+    __ andr(rscratch1, rscratch1, BytesPerInt-1);
+    __ cbz(rscratch1, L_int_aligned);
+    __ tbz(rscratch1, 0, L_short_aligned);
     __ b(RuntimeAddress(byte_copy_entry));
+
+    __ BIND(L_short_aligned);
+    __ lsr(count, count, LogBytesPerShort);  // size => short_count
+    __ b(RuntimeAddress(short_copy_entry));
+    __ BIND(L_int_aligned);
+    __ lsr(count, count, LogBytesPerInt);    // size => int_count
+    __ b(RuntimeAddress(int_copy_entry));
+    __ BIND(L_long_aligned);
+    __ lsr(count, count, LogBytesPerLong);   // size => long_count
+    __ b(RuntimeAddress(long_copy_entry));
+
     return start;
-#endif
   }
 
   //
@@ -2090,7 +2111,10 @@
                                                                         /*dest_uninitialized*/true);
 
     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
-                                                              entry_jbyte_arraycopy);
+                                                              entry_jbyte_arraycopy,
+                                                              entry_jshort_arraycopy,
+                                                              entry_jint_arraycopy,
+                                                              entry_jlong_arraycopy);
 
     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                                entry_jbyte_arraycopy,
--- a/src/cpu/aarch64/vm/templateInterpreterGenerator_aarch64.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/templateInterpreterGenerator_aarch64.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1983,14 +1983,8 @@
   __ push(rscratch1);
   __ push(rscratch2);
   __ push(rscratch3);
-  Label L;
-  __ mov(rscratch2, (address) &BytecodeCounter::_counter_value);
-  __ prfm(Address(rscratch2), PSTL1STRM);
-  __ bind(L);
-  __ ldxr(rscratch1, rscratch2);
-  __ add(rscratch1, rscratch1, 1);
-  __ stxr(rscratch3, rscratch1, rscratch2);
-  __ cbnzw(rscratch3, L);
+  __ mov(rscratch3, (address) &BytecodeCounter::_counter_value);
+  __ atomic_add(noreg, 1, rscratch3);
   __ pop(rscratch3);
   __ pop(rscratch2);
   __ pop(rscratch1);
--- a/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -73,6 +73,7 @@
     CPU_SHA1         = (1<<5),
     CPU_SHA2         = (1<<6),
     CPU_CRC32        = (1<<7),
+    CPU_LSE          = (1<<8),
     CPU_A53MAC       = (1 << 30),
     CPU_DMB_ATOMICS  = (1 << 31),
   };
--- a/src/cpu/ppc/vm/assembler_ppc.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -624,6 +624,7 @@
     VNOR_OPCODE    = (4u  << OPCODE_SHIFT | 1284u     ),
     VOR_OPCODE     = (4u  << OPCODE_SHIFT | 1156u     ),
     VXOR_OPCODE    = (4u  << OPCODE_SHIFT | 1220u     ),
+    VRLD_OPCODE    = (4u  << OPCODE_SHIFT |  196u     ),
     VRLB_OPCODE    = (4u  << OPCODE_SHIFT |    4u     ),
     VRLW_OPCODE    = (4u  << OPCODE_SHIFT |  132u     ),
     VRLH_OPCODE    = (4u  << OPCODE_SHIFT |   68u     ),
@@ -2047,6 +2048,7 @@
   inline void vnor(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vor(      VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vxor(     VectorRegister d, VectorRegister a, VectorRegister b);
+  inline void vrld(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlb(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlw(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlh(     VectorRegister d, VectorRegister a, VectorRegister b);
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -839,6 +839,7 @@
 inline void Assembler::vnor(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VNOR_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vor(     VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VOR_OPCODE      | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vxor(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VXOR_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
+inline void Assembler::vrld(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLD_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlb(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLB_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlw(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLW_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlh(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLH_OPCODE     | vrt(d) | vra(a) | vrb(b)); }
--- a/src/cpu/ppc/vm/c2_globals_ppc.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/ppc/vm/c2_globals_ppc.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -55,6 +55,7 @@
 define_pd_global(bool, ResizeTLAB,                   true);
 define_pd_global(intx, LoopUnrollLimit,              60);
 define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(intx, PostLoopMultiversioning,      false);
 
 // Peephole and CISC spilling both break the graph, and so make the
 // scheduler sick.
--- a/src/cpu/ppc/vm/debug_ppc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/ppc/vm/debug_ppc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -30,6 +30,5 @@
 #include "runtime/init.hpp"
 #include "runtime/os.hpp"
 #include "utilities/debug.hpp"
-#include "utilities/top.hpp"
 
 void pd_ps(frame f) {}
--- a/src/cpu/ppc/vm/frame_ppc.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/ppc/vm/frame_ppc.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -27,7 +27,6 @@
 #define CPU_PPC_VM_FRAME_PPC_HPP
 
 #include "runtime/synchronizer.hpp"
-#include "utilities/top.hpp"
 
   //  C frame layout on PPC-64.
   //
--- a/src/cpu/ppc/vm/nativeInst_ppc.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/ppc/vm/nativeInst_ppc.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -31,7 +31,6 @@
 #include "memory/allocation.hpp"
 #include "runtime/icache.hpp"
 #include "runtime/os.hpp"
-#include "utilities/top.hpp"
 
 // We have interfaces for the following instructions:
 //
--- a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -483,6 +483,18 @@
   assert(size <= 8, "%d bytes vectors are not supported", size);
   return size > 8;
 }
+
+size_t SharedRuntime::trampoline_size() {
+  return Assembler::load_const_size + 8;
+}
+
+void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
+  Register Rtemp = R12;
+  __ load_const(Rtemp, destination);
+  __ mtctr(Rtemp);
+  __ bctr();
+}
+
 #ifdef COMPILER2
 static int reg2slot(VMReg r) {
   return r->reg2stack() + SharedRuntime::out_preserve_stack_slots();
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -37,7 +37,6 @@
 #include "runtime/sharedRuntime.hpp"
 #include "runtime/stubCodeGenerator.hpp"
 #include "runtime/stubRoutines.hpp"
-#include "utilities/top.hpp"
 #include "runtime/thread.inline.hpp"
 
 #define __ _masm->
@@ -2417,6 +2416,433 @@
     return start;
   }
 
+  // Arguments for generated stub (little endian only):
+  //   R3_ARG1   - source byte array address
+  //   R4_ARG2   - destination byte array address
+  //   R5_ARG3   - round key array
+  address generate_aescrypt_encryptBlock() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+
+    address start = __ function_entry();
+
+    Label L_doLast;
+
+    Register from           = R3_ARG1;  // source array address
+    Register to             = R4_ARG2;  // destination array address
+    Register key            = R5_ARG3;  // round key array
+
+    Register keylen         = R8;
+    Register temp           = R9;
+    Register keypos         = R10;
+    Register hex            = R11;
+    Register fifteen        = R12;
+
+    VectorRegister vRet     = VR0;
+
+    VectorRegister vKey1    = VR1;
+    VectorRegister vKey2    = VR2;
+    VectorRegister vKey3    = VR3;
+    VectorRegister vKey4    = VR4;
+
+    VectorRegister fromPerm = VR5;
+    VectorRegister keyPerm  = VR6;
+    VectorRegister toPerm   = VR7;
+    VectorRegister fSplt    = VR8;
+
+    VectorRegister vTmp1    = VR9;
+    VectorRegister vTmp2    = VR10;
+    VectorRegister vTmp3    = VR11;
+    VectorRegister vTmp4    = VR12;
+
+    VectorRegister vLow     = VR13;
+    VectorRegister vHigh    = VR14;
+
+    __ li              (hex, 16);
+    __ li              (fifteen, 15);
+    __ vspltisb        (fSplt, 0x0f);
+
+    // load unaligned from[0-15] to vsRet
+    __ lvx             (vRet, from);
+    __ lvx             (vTmp1, fifteen, from);
+    __ lvsl            (fromPerm, from);
+    __ vxor            (fromPerm, fromPerm, fSplt);
+    __ vperm           (vRet, vRet, vTmp1, fromPerm);
+
+    // load keylen (44 or 52 or 60)
+    __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
+
+    // to load keys
+    __ lvsr            (keyPerm, key);
+    __ vxor            (vTmp2, vTmp2, vTmp2);
+    __ vspltisb        (vTmp2, -16);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vsldoi          (keyPerm, keyPerm, keyPerm, -8);
+
+    // load the 1st round key to vKey1
+    __ li              (keypos, 0);
+    __ lvx             (vKey1, keypos, key);
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey1, vTmp1, vKey1, keyPerm);
+
+    // 1st round
+    __ vxor (vRet, vRet, vKey1);
+
+    // load the 2nd round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 3rd round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 4th round key to vKey3
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 5th round key to vKey4
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // 2nd - 5th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 6th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 7th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 8th round key to vKey3
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey4
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // 6th - 9th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 10th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 4 rounds
+    __ cmpwi           (CCR0, keylen, 44);
+    __ beq             (CCR0, L_doLast);
+
+    // 10th - 11th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+
+    // load the 12th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 13th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 2 rounds
+    __ cmpwi           (CCR0, keylen, 52);
+    __ beq             (CCR0, L_doLast);
+
+    // 12th - 13th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+
+    // load the 14th round key to vKey1
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 15th round key to vKey2
+    __ addi            (keypos, keypos, 16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    __ bind(L_doLast);
+
+    // last two rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipherlast (vRet, vRet, vKey2);
+
+    __ neg             (temp, to);
+    __ lvsr            (toPerm, temp);
+    __ vspltisb        (vTmp2, -1);
+    __ vxor            (vTmp1, vTmp1, vTmp1);
+    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
+    __ vxor            (toPerm, toPerm, fSplt);
+    __ lvx             (vTmp1, to);
+    __ vperm           (vRet, vRet, vRet, toPerm);
+    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
+    __ lvx             (vTmp4, fifteen, to);
+    __ stvx            (vTmp1, to);
+    __ vsel            (vRet, vRet, vTmp4, vTmp2);
+    __ stvx            (vRet, fifteen, to);
+
+    __ blr();
+     return start;
+  }
+
+  // Arguments for generated stub (little endian only):
+  //   R3_ARG1   - source byte array address
+  //   R4_ARG2   - destination byte array address
+  //   R5_ARG3   - K (key) in little endian int array
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+
+    address start = __ function_entry();
+
+    Label L_doLast;
+    Label L_do44;
+    Label L_do52;
+    Label L_do60;
+
+    Register from           = R3_ARG1;  // source array address
+    Register to             = R4_ARG2;  // destination array address
+    Register key            = R5_ARG3;  // round key array
+
+    Register keylen         = R8;
+    Register temp           = R9;
+    Register keypos         = R10;
+    Register hex            = R11;
+    Register fifteen        = R12;
+
+    VectorRegister vRet     = VR0;
+
+    VectorRegister vKey1    = VR1;
+    VectorRegister vKey2    = VR2;
+    VectorRegister vKey3    = VR3;
+    VectorRegister vKey4    = VR4;
+    VectorRegister vKey5    = VR5;
+
+    VectorRegister fromPerm = VR6;
+    VectorRegister keyPerm  = VR7;
+    VectorRegister toPerm   = VR8;
+    VectorRegister fSplt    = VR9;
+
+    VectorRegister vTmp1    = VR10;
+    VectorRegister vTmp2    = VR11;
+    VectorRegister vTmp3    = VR12;
+    VectorRegister vTmp4    = VR13;
+
+    VectorRegister vLow     = VR14;
+    VectorRegister vHigh    = VR15;
+
+    __ li              (hex, 16);
+    __ li              (fifteen, 15);
+    __ vspltisb        (fSplt, 0x0f);
+
+    // load unaligned from[0-15] to vsRet
+    __ lvx             (vRet, from);
+    __ lvx             (vTmp1, fifteen, from);
+    __ lvsl            (fromPerm, from);
+    __ vxor            (fromPerm, fromPerm, fSplt);
+    __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
+
+    // load keylen (44 or 52 or 60)
+    __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
+
+    // to load keys
+    __ lvsr            (keyPerm, key);
+    __ vxor            (vTmp2, vTmp2, vTmp2);
+    __ vspltisb        (vTmp2, -16);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vrld            (keyPerm, keyPerm, vTmp2);
+    __ vsldoi          (keyPerm, keyPerm, keyPerm, -8);
+
+    __ cmpwi           (CCR0, keylen, 44);
+    __ beq             (CCR0, L_do44);
+
+    __ cmpwi           (CCR0, keylen, 52);
+    __ beq             (CCR0, L_do52);
+
+    // load the 15th round key to vKey11
+    __ li              (keypos, 240);
+    __ lvx             (vTmp1, keypos, key);
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // load the 14th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 13th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // load the 12th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
+
+    // 1st - 5th rounds
+    __ vxor            (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+    __ vncipher        (vRet, vRet, vKey4);
+    __ vncipher        (vRet, vRet, vKey5);
+
+    __ b               (L_doLast);
+
+    __ bind            (L_do52);
+
+    // load the 13th round key to vKey11
+    __ li              (keypos, 208);
+    __ lvx             (vTmp1, keypos, key);
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // load the 12th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // 1st - 3rd rounds
+    __ vxor            (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+
+    __ b               (L_doLast);
+
+    __ bind            (L_do44);
+
+    // load the 11th round key to vKey11
+    __ li              (keypos, 176);
+    __ lvx             (vTmp1, keypos, key);
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // 1st round
+    __ vxor            (vRet, vRet, vKey1);
+
+    __ bind            (L_doLast);
+
+    // load the 10th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 8th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 7th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // load the 6th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey5, vTmp2, vTmp1, keyPerm);
+
+    // last 10th - 6th rounds
+    __ vncipher        (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+    __ vncipher        (vRet, vRet, vKey4);
+    __ vncipher        (vRet, vRet, vKey5);
+
+    // load the 5th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
+
+    // load the 4th round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 3rd round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // load the 2nd round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp1, keypos, key);
+    __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
+
+    // load the 1st round key to vKey10
+    __ addi            (keypos, keypos, -16);
+    __ lvx             (vTmp2, keypos, key);
+    __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
+
+    // last 5th - 1th rounds
+    __ vncipher        (vRet, vRet, vKey1);
+    __ vncipher        (vRet, vRet, vKey2);
+    __ vncipher        (vRet, vRet, vKey3);
+    __ vncipher        (vRet, vRet, vKey4);
+    __ vncipherlast    (vRet, vRet, vKey5);
+
+    __ neg             (temp, to);
+    __ lvsr            (toPerm, temp);
+    __ vspltisb        (vTmp2, -1);
+    __ vxor            (vTmp1, vTmp1, vTmp1);
+    __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
+    __ vxor            (toPerm, toPerm, fSplt);
+    __ lvx             (vTmp1, to);
+    __ vperm           (vRet, vRet, vRet, toPerm);
+    __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
+    __ lvx             (vTmp4, fifteen, to);
+    __ stvx            (vTmp1, to);
+    __ vsel            (vRet, vRet, vTmp4, vTmp2);
+    __ stvx            (vRet, fifteen, to);
+
+    __ blr();
+     return start;
+  }
 
   void generate_arraycopy_stubs() {
     // Note: the disjoint stubs must be generated first, some of
@@ -2693,10 +3119,6 @@
     // arraycopy stubs used by compilers
     generate_arraycopy_stubs();
 
-    if (UseAESIntrinsics) {
-      guarantee(!UseAESIntrinsics, "not yet implemented.");
-    }
-
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
@@ -2719,6 +3141,12 @@
       StubRoutines::_montgomerySquare
         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
     }
+
+    if (UseAESIntrinsics) {
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+    }
+
   }
 
  public:
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -122,7 +122,7 @@
                (has_fcfids()  ? " fcfids"  : ""),
                (has_vand()    ? " vand"    : ""),
                (has_lqarx()   ? " lqarx"   : ""),
-               (has_vcipher() ? " vcipher" : ""),
+               (has_vcipher() ? " aes"     : ""),
                (has_vpmsumb() ? " vpmsumb" : ""),
                (has_tcheck()  ? " tcheck"  : ""),
                (has_mfdscr()  ? " mfdscr"  : "")
@@ -186,6 +186,28 @@
   }
 
   // The AES intrinsic stubs require AES instruction support.
+#if defined(VM_LITTLE_ENDIAN)
+  if (has_vcipher()) {
+    if (FLAG_IS_DEFAULT(UseAES)) {
+      UseAES = true;
+    }
+  } else if (UseAES) {
+    if (!FLAG_IS_DEFAULT(UseAES))
+      warning("AES instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAES, false);
+  }
+
+  if (UseAES && has_vcipher()) {
+    if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+      UseAESIntrinsics = true;
+    }
+  } else if (UseAESIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseAESIntrinsics))
+      warning("AES intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+  }
+
+#else
   if (UseAES) {
     warning("AES instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseAES, false);
@@ -195,6 +217,7 @@
       warning("AES intrinsics are not available on this CPU");
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
+#endif
 
   if (UseAESCTRIntrinsics) {
     warning("AES/CTR intrinsics are not available on this CPU");
--- a/src/cpu/sparc/vm/c2_globals_sparc.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/sparc/vm/c2_globals_sparc.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -53,6 +53,7 @@
 define_pd_global(bool, ResizeTLAB,                   true);
 define_pd_global(intx, LoopUnrollLimit,              60); // Design center runs on 1.3.1
 define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(intx, PostLoopMultiversioning,      false);
 define_pd_global(intx, MinJumpTableSize,             5);
 
 // Peephole and CISC spilling both break the graph, and so makes the
--- a/src/cpu/sparc/vm/debug_sparc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/sparc/vm/debug_sparc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -29,7 +29,6 @@
 #include "runtime/init.hpp"
 #include "runtime/os.hpp"
 #include "utilities/debug.hpp"
-#include "utilities/top.hpp"
 
 #ifndef PRODUCT
 
--- a/src/cpu/sparc/vm/frame_sparc.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/sparc/vm/frame_sparc.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -26,7 +26,6 @@
 #define CPU_SPARC_VM_FRAME_SPARC_HPP
 
 #include "runtime/synchronizer.hpp"
-#include "utilities/top.hpp"
 
 // A frame represents a physical stack frame (an activation).  Frames can be
 // C or Java frames, and the Java frames can be interpreted or compiled.
--- a/src/cpu/sparc/vm/metaspaceShared_sparc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/sparc/vm/metaspaceShared_sparc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -65,8 +65,6 @@
   *vtable = dummy_vtable;
   *md_top += vtable_bytes;
 
-  guarantee(*md_top <= md_end, "Insufficient space for vtables.");
-
   // Get ready to generate dummy methods.
 
   CodeBuffer cb((unsigned char*)*mc_top, mc_end - *mc_top);
--- a/src/cpu/sparc/vm/nativeInst_sparc.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/sparc/vm/nativeInst_sparc.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -29,7 +29,6 @@
 #include "memory/allocation.hpp"
 #include "runtime/icache.hpp"
 #include "runtime/os.hpp"
-#include "utilities/top.hpp"
 
 // We have interface for the following instructions:
 // - NativeInstruction
--- a/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -324,6 +324,16 @@
   return size > 8;
 }
 
+size_t SharedRuntime::trampoline_size() {
+  return 40;
+}
+
+void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
+  __ set((intptr_t)destination, G3_scratch);
+  __ JMP(G3_scratch, 0);
+  __ delayed()->nop();
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -37,7 +37,6 @@
 #include "runtime/stubCodeGenerator.hpp"
 #include "runtime/stubRoutines.hpp"
 #include "runtime/thread.inline.hpp"
-#include "utilities/top.hpp"
 #ifdef COMPILER2
 #include "opto/runtime.hpp"
 #endif
--- a/src/cpu/x86/vm/assembler_x86.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -3147,8 +3147,7 @@
 void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "some form of AVX must be enabled");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x67);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3156,7 +3155,7 @@
 void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len) {
   assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x00);
   emit_int8(0xC0 | encode);
   emit_int8(imm8);
@@ -3199,8 +3198,7 @@
 void Assembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3210,8 +3208,7 @@
   assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3222,9 +3219,8 @@
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int dst_enc = kdst->encoding();
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
   emit_operand(as_Register(dst_enc), src);
 }
@@ -3242,8 +3238,7 @@
 void Assembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3253,8 +3248,7 @@
   assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3265,9 +3259,8 @@
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int dst_enc = kdst->encoding();
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
   emit_operand(as_Register(dst_enc), src);
 }
@@ -3285,8 +3278,7 @@
 void Assembler::vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x76);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3296,8 +3288,7 @@
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x76);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3308,9 +3299,8 @@
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int dst_enc = kdst->encoding();
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x76);
   emit_operand(as_Register(dst_enc), src);
 }
@@ -3328,8 +3318,7 @@
 void Assembler::vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x29);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3339,8 +3328,7 @@
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x29);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3352,9 +3340,8 @@
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int dst_enc = kdst->encoding();
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x29);
   emit_operand(as_Register(dst_enc), src);
 }
@@ -3988,7 +3975,7 @@
 void Assembler::pblendw(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x0E);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -4395,8 +4382,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
@@ -4404,8 +4390,7 @@
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4415,8 +4400,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
@@ -4424,8 +4408,7 @@
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4435,8 +4418,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
 }
@@ -4444,8 +4426,7 @@
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4455,8 +4436,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
 }
@@ -4464,8 +4444,7 @@
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4475,8 +4454,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
@@ -4484,8 +4462,7 @@
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4495,8 +4472,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
@@ -4504,8 +4480,7 @@
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4515,8 +4490,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
 }
@@ -4524,8 +4498,7 @@
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4535,8 +4508,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
 }
@@ -4544,8 +4516,7 @@
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4584,8 +4555,7 @@
 void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4593,8 +4563,7 @@
 void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4604,8 +4573,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
@@ -4615,8 +4583,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
@@ -4640,8 +4607,7 @@
 void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4649,8 +4615,7 @@
 void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4660,8 +4625,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
 }
@@ -4671,8 +4635,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_operand(dst, src);
 }
@@ -4706,8 +4669,7 @@
 void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4715,8 +4677,7 @@
 void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4726,8 +4687,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
@@ -4737,8 +4697,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
@@ -4762,8 +4721,7 @@
 void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4771,8 +4729,7 @@
 void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4782,8 +4739,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
 }
@@ -4793,8 +4749,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
   emit_operand(dst, src);
 }
@@ -4802,8 +4757,7 @@
 void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4857,8 +4811,7 @@
 void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4866,8 +4819,7 @@
 void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4877,8 +4829,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_operand(dst, src);
 }
@@ -4888,8 +4839,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x54);
   emit_operand(dst, src);
 }
@@ -4949,8 +4899,7 @@
 void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4958,8 +4907,7 @@
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -4969,8 +4917,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_operand(dst, src);
 }
@@ -4980,8 +4927,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8(0x57);
   emit_operand(dst, src);
 }
@@ -4991,8 +4937,7 @@
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5001,8 +4946,7 @@
   assert(VM_Version::supports_avx() && (vector_len == 0) ||
          VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5035,7 +4979,7 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFE);
   emit_operand(dst, src);
 }
@@ -5067,8 +5011,7 @@
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFC);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5076,8 +5019,7 @@
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFD);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5085,8 +5027,7 @@
 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFE);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5094,8 +5035,7 @@
 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD4);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5105,8 +5045,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFC);
   emit_operand(dst, src);
 }
@@ -5116,8 +5055,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFD);
   emit_operand(dst, src);
 }
@@ -5127,8 +5065,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFE);
   emit_operand(dst, src);
 }
@@ -5138,8 +5075,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD4);
   emit_operand(dst, src);
 }
@@ -5178,8 +5114,7 @@
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF8);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5187,8 +5122,7 @@
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF9);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5196,8 +5130,7 @@
 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFA);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5205,8 +5138,7 @@
 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFB);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5216,8 +5148,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF8);
   emit_operand(dst, src);
 }
@@ -5227,8 +5158,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF9);
   emit_operand(dst, src);
 }
@@ -5238,8 +5168,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFA);
   emit_operand(dst, src);
 }
@@ -5249,8 +5178,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFB);
   emit_operand(dst, src);
 }
@@ -5274,8 +5202,7 @@
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5283,8 +5210,7 @@
 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5292,8 +5218,7 @@
 void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 2, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5303,8 +5228,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
   emit_operand(dst, src);
 }
@@ -5314,8 +5238,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -5325,8 +5248,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -5638,8 +5560,7 @@
 void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xDB);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5649,8 +5570,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xDB);
   emit_operand(dst, src);
 }
@@ -5674,8 +5594,7 @@
 void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xEB);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5685,8 +5604,7 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xEB);
   emit_operand(dst, src);
 }
@@ -5702,8 +5620,7 @@
 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xEF);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5713,20 +5630,96 @@
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xEF);
   emit_operand(dst, src);
 }
 
 
+// vinserti forms
+
+void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_avx2(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x38);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into lower 128 bits
+  // 0x01 - insert into upper 128 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+  assert(VM_Version::supports_avx2(), "");
+  assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x38);
+  emit_operand(dst, src);
+  // 0x00 - insert into lower 128 bits
+  // 0x01 - insert into upper 128 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x38);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
+  assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+  // 0x00 - insert into q0 128 bits (0..127)
+  // 0x01 - insert into q1 128 bits (128..255)
+  // 0x02 - insert into q2 128 bits (256..383)
+  // 0x03 - insert into q3 128 bits (384..511)
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x38);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into lower 256 bits
+  // 0x01 - insert into upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
+
+
+// vinsertf forms
+
 void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 128 bits
@@ -5734,33 +5727,19 @@
   emit_int8(imm8 & 0x01);
 }
 
-void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x1A);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - insert into lower 256 bits
-  // 0x01 - insert into upper 256 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
-  // swap src<->dst for encoding
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x1A);
-  emit_operand(dst, src);
-  // 0x00 - insert into lower 256 bits
-  // 0x01 - insert into upper 256 bits
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+  // 0x00 - insert into lower 128 bits
+  // 0x01 - insert into upper 128 bits
   emit_int8(imm8 & 0x01);
 }
 
@@ -5768,8 +5747,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into q0 128 bits (0..127)
@@ -5784,12 +5762,10 @@
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x03, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
-  // swap src<->dst for encoding
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x00 - insert into q0 128 bits (0..127)
@@ -5799,98 +5775,36 @@
   emit_int8(imm8 & 0x03);
 }
 
-void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
-  assert(dst != xnoreg, "sanity");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
-  // swap src<->dst for encoding
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x18);
-  emit_operand(dst, src);
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x19);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - extract from lower 128 bits
-  // 0x01 - extract from upper 128 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vextractf128(Address dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx(), "");
-  assert(src != xnoreg, "sanity");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
-  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x19);
-  emit_operand(src, dst);
-  // 0x00 - extract from lower 128 bits
-  // 0x01 - extract from upper 128 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_avx2(), "");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x38);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(imm8 & 0x01);
-}
-
-void Assembler::vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x38);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x1A);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
   // 0x01 - insert into upper 256 bits
   emit_int8(imm8 & 0x01);
 }
 
-void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
-  assert(VM_Version::supports_avx2(), "");
+void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
-  // swap src<->dst for encoding
-  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x38);
-  emit_operand(dst, src);
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x1A);
+  emit_operand(dst, src);
+  // 0x00 - insert into lower 256 bits
+  // 0x01 - insert into upper 256 bits
   emit_int8(imm8 & 0x01);
 }
 
+
+// vextracti forms
+
 void Assembler::vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
@@ -5920,6 +5834,52 @@
   emit_int8(imm8 & 0x01);
 }
 
+void Assembler::vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x39);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextracti32x4(Address dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x39);
+  emit_operand(src, dst);
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x39);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(imm8 & 0x03);
+}
+
 void Assembler::vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
@@ -5932,44 +5892,35 @@
   emit_int8(imm8 & 0x01);
 }
 
-void Assembler::vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
-  assert(imm8 <= 0x03, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+
+// vextractf forms
+
+void Assembler::vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x39);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - extract from bits 127:0
-  // 0x01 - extract from bits 255:128
-  // 0x02 - extract from bits 383:256
-  // 0x03 - extract from bits 511:384
-  emit_int8(imm8 & 0x03);
-}
-
-void Assembler::vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
-  assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x1B);
-  emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - extract from lower 256 bits
-  // 0x01 - extract from upper 256 bits
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from lower 128 bits
+  // 0x01 - extract from upper 128 bits
   emit_int8(imm8 & 0x01);
 }
 
-void Assembler::vextractf64x4(Address dst, XMMRegister src, uint8_t imm8) {
-  assert(VM_Version::supports_evex(), "");
+void Assembler::vextractf128(Address dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_avx(), "");
   assert(src != xnoreg, "sanity");
   assert(imm8 <= 0x01, "imm8: %u", imm8);
-  InstructionMark im(this);
-  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
-  attributes.set_address_attributes(/* tuple_type */ EVEX_T4,/* input_size_in_bits */  EVEX_64bit);
+  int vector_len = VM_Version::supports_avx512novl() ? AVX_512bit : AVX_256bit;
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
-  emit_int8(0x1B);
+  emit_int8(0x19);
   emit_operand(src, dst);
-  // 0x00 - extract from lower 256 bits
-  // 0x01 - extract from upper 256 bits
+  // 0x00 - extract from lower 128 bits
+  // 0x01 - extract from upper 128 bits
   emit_int8(imm8 & 0x01);
 }
 
@@ -6019,7 +5970,43 @@
   emit_int8(imm8 & 0x03);
 }
 
-// duplicate 4-bytes integer data from src into 8 locations in dest
+void Assembler::vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x1B);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from lower 256 bits
+  // 0x01 - extract from upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf64x4(Address dst, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T4,/* input_size_in_bits */  EVEX_64bit);
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x1B);
+  emit_operand(src, dst);
+  // 0x00 - extract from lower 256 bits
+  // 0x01 - extract from upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
+
+
+// legacy word/dword replicate
+void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_avx2(), "");
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x79);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6028,16 +6015,10 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 2-bytes integer data from src into 16 locations in dest
-void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
-  assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
-  emit_int8(0x79);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
-// duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
+
+// xmm/mem sourced byte/word/dword/qword replicate
+
+// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6053,12 +6034,12 @@
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x78);
   emit_operand(dst, src);
 }
 
-// duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
+// duplicate 2-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6074,12 +6055,12 @@
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_operand(dst, src);
 }
 
-// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+// duplicate 4-byte integer data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6095,12 +6076,12 @@
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x58);
   emit_operand(dst, src);
 }
 
-// duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+// duplicate 8-byte integer data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6116,12 +6097,15 @@
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x59);
   emit_operand(dst, src);
 }
 
-// duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
+
+// scalar single/double precision replicate
+
+// duplicate single precision data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6142,7 +6126,7 @@
   emit_operand(dst, src);
 }
 
-// duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
+// duplicate double precision data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6163,7 +6147,10 @@
   emit_operand(dst, src);
 }
 
-// duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
+
+// gpr source broadcast forms
+
+// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6176,7 +6163,7 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
+// duplicate 2-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -6189,7 +6176,7 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+// duplicate 4-byte integer data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6202,7 +6189,7 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+// duplicate 8-byte integer data from src into programmed locations in dest : requires AVX512VL
 void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -6215,6 +6202,7 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
@@ -6229,8 +6217,7 @@
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
   InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -6972,8 +6959,7 @@
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8((unsigned char)0x4B);
   emit_int8((unsigned char)(0xC0 | encode));
   int src2_enc = src2->encoding();
--- a/src/cpu/x86/vm/assembler_x86.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1977,39 +1977,43 @@
   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
-  // 128bit copy from/to 256bit (YMM) vector registers
+  // vinserti forms
+  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+
+  // vinsertf forms
   void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
-  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+
+  // vextracti forms
+  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
+  void vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti32x4(Address dst, XMMRegister src, uint8_t imm8);
+  void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+
+  // vextractf forms
   void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
-  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
   void vextractf128(Address dst, XMMRegister src, uint8_t imm8);
-  void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
-
-  // 256bit copy from/to 512bit (ZMM) vector registers
-  void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
-  void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
-  void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
+  void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
   void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
   void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);
-  void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
-
-  // 128bit copy from/to 256bit (YMM) or 512bit (ZMM) vector registers
-  void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
-  void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
-  void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
-  void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
-
-  // duplicate 4-bytes integer data from src into 8 locations in dest
+
+  // legacy xmm sourced word/dword replicate
+  void vpbroadcastw(XMMRegister dst, XMMRegister src);
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
 
-  // duplicate 2-bytes integer data from src into 16 locations in dest
-  void vpbroadcastw(XMMRegister dst, XMMRegister src);
-
-  // duplicate n-bytes integer data from src into vector_len locations in dest
+  // xmm/mem sourced byte/word/dword/qword replicate
   void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
   void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
@@ -2019,11 +2023,13 @@
   void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
 
+  // scalar single/double precision replicate
   void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
   void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);
 
+  // gpr sourced byte/word/dword/qword replicate
   void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
   void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
   void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
--- a/src/cpu/x86/vm/c2_globals_x86.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/c2_globals_x86.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -47,6 +47,7 @@
 define_pd_global(intx, FreqInlineSize,               325);
 define_pd_global(intx, MinJumpTableSize,             10);
 define_pd_global(intx, LoopPercentProfileLimit,      30);
+define_pd_global(intx, PostLoopMultiversioning,      true);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE,                  13);
 define_pd_global(intx, FLOATPRESSURE,                14);
--- a/src/cpu/x86/vm/debug_x86.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/debug_x86.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -29,6 +29,5 @@
 #include "runtime/init.hpp"
 #include "runtime/os.hpp"
 #include "utilities/debug.hpp"
-#include "utilities/top.hpp"
 
 void pd_ps(frame f) {}
--- a/src/cpu/x86/vm/frame_x86.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/frame_x86.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -26,7 +26,6 @@
 #define CPU_X86_VM_FRAME_X86_HPP
 
 #include "runtime/synchronizer.hpp"
-#include "utilities/top.hpp"
 
 // A frame represents a physical stack frame (an activation).  Frames can be
 // C or Java frames, and the Java frames can be interpreted or compiled.
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1216,7 +1216,10 @@
   void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); }
 
   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
-    if (UseAVX > 1) { // vinserti128 is available only in AVX2
+    if (UseAVX > 2) {
+      Assembler::vinserti32x4(dst, dst, src, imm8);
+    } else if (UseAVX > 1) {
+      // vinserti128 is available only in AVX2
       Assembler::vinserti128(dst, nds, src, imm8);
     } else {
       Assembler::vinsertf128(dst, nds, src, imm8);
@@ -1224,7 +1227,10 @@
   }
 
   void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
-    if (UseAVX > 1) { // vinserti128 is available only in AVX2
+    if (UseAVX > 2) {
+      Assembler::vinserti32x4(dst, dst, src, imm8);
+    } else if (UseAVX > 1) {
+      // vinserti128 is available only in AVX2
       Assembler::vinserti128(dst, nds, src, imm8);
     } else {
       Assembler::vinsertf128(dst, nds, src, imm8);
@@ -1232,7 +1238,10 @@
   }
 
   void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
-    if (UseAVX > 1) { // vextracti128 is available only in AVX2
+    if (UseAVX > 2) {
+      Assembler::vextracti32x4(dst, src, imm8);
+    } else if (UseAVX > 1) {
+      // vextracti128 is available only in AVX2
       Assembler::vextracti128(dst, src, imm8);
     } else {
       Assembler::vextractf128(dst, src, imm8);
@@ -1240,7 +1249,10 @@
   }
 
   void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
-    if (UseAVX > 1) { // vextracti128 is available only in AVX2
+    if (UseAVX > 2) {
+      Assembler::vextracti32x4(dst, src, imm8);
+    } else if (UseAVX > 1) {
+      // vextracti128 is available only in AVX2
       Assembler::vextracti128(dst, src, imm8);
     } else {
       Assembler::vextractf128(dst, src, imm8);
@@ -1260,37 +1272,57 @@
   void vextracti128_high(Address dst, XMMRegister src) {
     vextracti128(dst, src, 1);
   }
+
   void vinsertf128_high(XMMRegister dst, XMMRegister src) {
-    vinsertf128(dst, dst, src, 1);
+    if (UseAVX > 2) {
+      Assembler::vinsertf32x4(dst, dst, src, 1);
+    } else {
+      Assembler::vinsertf128(dst, dst, src, 1);
+    }
   }
+
   void vinsertf128_high(XMMRegister dst, Address src) {
-    vinsertf128(dst, dst, src, 1);
+    if (UseAVX > 2) {
+      Assembler::vinsertf32x4(dst, dst, src, 1);
+    } else {
+      Assembler::vinsertf128(dst, dst, src, 1);
+    }
   }
+
   void vextractf128_high(XMMRegister dst, XMMRegister src) {
-    vextractf128(dst, src, 1);
+    if (UseAVX > 2) {
+      Assembler::vextractf32x4(dst, src, 1);
+    } else {
+      Assembler::vextractf128(dst, src, 1);
+    }
   }
+
   void vextractf128_high(Address dst, XMMRegister src) {
-    vextractf128(dst, src, 1);
+    if (UseAVX > 2) {
+      Assembler::vextractf32x4(dst, src, 1);
+    } else {
+      Assembler::vextractf128(dst, src, 1);
+    }
   }
 
   // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
   void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
-    vinserti64x4(dst, dst, src, 1);
+    Assembler::vinserti64x4(dst, dst, src, 1);
   }
   void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
-    vinsertf64x4(dst, dst, src, 1);
+    Assembler::vinsertf64x4(dst, dst, src, 1);
   }
   void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
-    vextracti64x4(dst, src, 1);
+    Assembler::vextracti64x4(dst, src, 1);
   }
   void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
-    vextractf64x4(dst, src, 1);
+    Assembler::vextractf64x4(dst, src, 1);
   }
   void vextractf64x4_high(Address dst, XMMRegister src) {
-    vextractf64x4(dst, src, 1);
+    Assembler::vextractf64x4(dst, src, 1);
   }
   void vinsertf64x4_high(XMMRegister dst, Address src) {
-    vinsertf64x4(dst, dst, src, 1);
+    Assembler::vinsertf64x4(dst, dst, src, 1);
   }
 
   // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
@@ -1306,40 +1338,59 @@
   void vextracti128_low(Address dst, XMMRegister src) {
     vextracti128(dst, src, 0);
   }
+
   void vinsertf128_low(XMMRegister dst, XMMRegister src) {
-    vinsertf128(dst, dst, src, 0);
+    if (UseAVX > 2) {
+      Assembler::vinsertf32x4(dst, dst, src, 0);
+    } else {
+      Assembler::vinsertf128(dst, dst, src, 0);
+    }
   }
+
   void vinsertf128_low(XMMRegister dst, Address src) {
-    vinsertf128(dst, dst, src, 0);
+    if (UseAVX > 2) {
+      Assembler::vinsertf32x4(dst, dst, src, 0);
+    } else {
+      Assembler::vinsertf128(dst, dst, src, 0);
+    }
   }
+
   void vextractf128_low(XMMRegister dst, XMMRegister src) {
-    vextractf128(dst, src, 0);
+    if (UseAVX > 2) {
+      Assembler::vextractf32x4(dst, src, 0);
+    } else {
+      Assembler::vextractf128(dst, src, 0);
+    }
   }
+
   void vextractf128_low(Address dst, XMMRegister src) {
-    vextractf128(dst, src, 0);
+    if (UseAVX > 2) {
+      Assembler::vextractf32x4(dst, src, 0);
+    } else {
+      Assembler::vextractf128(dst, src, 0);
+    }
   }
 
   // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
   void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
-    vinserti64x4(dst, dst, src, 0);
+    Assembler::vinserti64x4(dst, dst, src, 0);
   }
   void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
-    vinsertf64x4(dst, dst, src, 0);
+    Assembler::vinsertf64x4(dst, dst, src, 0);
   }
   void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
-    vextracti64x4(dst, src, 0);
+    Assembler::vextracti64x4(dst, src, 0);
   }
   void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
-    vextractf64x4(dst, src, 0);
+    Assembler::vextractf64x4(dst, src, 0);
   }
   void vextractf64x4_low(Address dst, XMMRegister src) {
-    vextractf64x4(dst, src, 0);
+    Assembler::vextractf64x4(dst, src, 0);
   }
   void vinsertf64x4_low(XMMRegister dst, Address src) {
-    vinsertf64x4(dst, dst, src, 0);
+    Assembler::vinsertf64x4(dst, dst, src, 0);
   }
 
-
   // Carry-Less Multiplication Quadword
   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
     // 0x00 - multiply lower 64 bits [0:63]
--- a/src/cpu/x86/vm/nativeInst_x86.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/nativeInst_x86.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -29,7 +29,6 @@
 #include "memory/allocation.hpp"
 #include "runtime/icache.hpp"
 #include "runtime/os.hpp"
-#include "utilities/top.hpp"
 
 // We have interfaces for the following instructions:
 // - NativeInstruction
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -355,6 +355,14 @@
   return size > 16;
 }
 
+size_t SharedRuntime::trampoline_size() {
+  return 16;
+}
+
+void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
+  __ jump(RuntimeAddress(destination));
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -391,6 +391,14 @@
   return size > 16;
 }
 
+size_t SharedRuntime::trampoline_size() {
+  return 16;
+}
+
+void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
+  __ jump(RuntimeAddress(destination));
+}
+
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
 // (like the placement of the register window) the slots must be biased by
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -38,7 +38,6 @@
 #include "runtime/stubCodeGenerator.hpp"
 #include "runtime/stubRoutines.hpp"
 #include "runtime/thread.inline.hpp"
-#include "utilities/top.hpp"
 #ifdef COMPILER2
 #include "opto/runtime.hpp"
 #endif
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -38,7 +38,6 @@
 #include "runtime/stubCodeGenerator.hpp"
 #include "runtime/stubRoutines.hpp"
 #include "runtime/thread.inline.hpp"
-#include "utilities/top.hpp"
 #ifdef COMPILER2
 #include "opto/runtime.hpp"
 #endif
--- a/src/cpu/zero/vm/debug_zero.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/zero/vm/debug_zero.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -30,7 +30,6 @@
 #include "runtime/init.hpp"
 #include "runtime/os.hpp"
 #include "utilities/debug.hpp"
-#include "utilities/top.hpp"
 
 void pd_ps(frame f) {
   ShouldNotCallThis();
--- a/src/cpu/zero/vm/frame_zero.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/zero/vm/frame_zero.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -27,7 +27,6 @@
 #define CPU_ZERO_VM_FRAME_ZERO_HPP
 
 #include "runtime/synchronizer.hpp"
-#include "utilities/top.hpp"
 
 // A frame represents a physical stack frame on the Zero stack.
 
--- a/src/cpu/zero/vm/nativeInst_zero.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/zero/vm/nativeInst_zero.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -30,7 +30,6 @@
 #include "memory/allocation.hpp"
 #include "runtime/icache.hpp"
 #include "runtime/os.hpp"
-#include "utilities/top.hpp"
 
 // We have interfaces for the following instructions:
 // - NativeInstruction
--- a/src/cpu/zero/vm/sharedRuntime_zero.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/zero/vm/sharedRuntime_zero.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -132,6 +132,15 @@
   return generate_empty_runtime_stub("resolve_blob");
 }
 
+size_t SharedRuntime::trampoline_size() {
+  ShouldNotCallThis();
+  return 0;
+}
+
+void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {
+  ShouldNotCallThis();
+  return;
+}
 
 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
                                          VMRegPair *regs,
--- a/src/cpu/zero/vm/stubGenerator_zero.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/cpu/zero/vm/stubGenerator_zero.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -40,7 +40,6 @@
 #include "runtime/stubRoutines.hpp"
 #include "runtime/thread.inline.hpp"
 #include "stack_zero.inline.hpp"
-#include "utilities/top.hpp"
 #ifdef COMPILER2
 #include "opto/runtime.hpp"
 #endif
--- a/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/SAGetopt.java	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/SAGetopt.java	Thu Apr 14 14:05:09 2016 +0000
@@ -55,11 +55,11 @@
     private void extractOptarg(String opt) {
         // Argument expected
         if (_optind > _argv.length) {
-            throw new RuntimeException("Not enough arguments for '" + opt + "'");
+            throw new SAGetoptException("Not enough arguments for '" + opt + "'");
         }
 
         if (! _argv[_optind].isEmpty() && _argv[_optind].charAt(0) == '-') {
-            throw new RuntimeException("Argument is expected for '" + opt + "'");
+            throw new SAGetoptException("Argument is expected for '" + opt + "'");
         }
 
         _optarg = _argv[_optind];
@@ -72,7 +72,7 @@
 
         if (los.contains(ca[0])) {
             if (ca.length > 1) {
-                throw new RuntimeException("Argument is not expected for '" + ca[0] + "'");
+                throw new SAGetoptException("Argument is not expected for '" + ca[0] + "'");
             }
             return carg;
         }
@@ -87,14 +87,14 @@
                 try {
                     extractOptarg(ca[0]);
                 } catch (ArrayIndexOutOfBoundsException e) {
-                    throw new RuntimeException("Argument is expected for '" + ca[0] + "'");
+                    throw new SAGetoptException("Argument is expected for '" + ca[0] + "'");
                 }
             }
 
             return ca[0];
         }
 
-        throw new RuntimeException("Invalid option '" + ca[0] + "'");
+        throw new SAGetoptException("Invalid option '" + ca[0] + "'");
     }
 
     public String next(String optStr, String[] longOptStr) {
@@ -148,7 +148,7 @@
 
         int chIndex = optStr.indexOf(ch);
         if (chIndex == -1) {
-            throw new RuntimeException("Invalid option '" + ch + "'");
+            throw new SAGetoptException("Invalid option '" + ch + "'");
         }
 
         if (_optopt >= carg.length()) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/SAGetoptException.java	Thu Apr 14 14:05:09 2016 +0000
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot;
+
+public class SAGetoptException extends IllegalArgumentException {
+
+    public SAGetoptException(String message) {
+        super(message);
+    }
+
+}
--- a/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/SALauncher.java	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/SALauncher.java	Thu Apr 14 14:05:09 2016 +0000
@@ -111,34 +111,31 @@
         return launcherHelp();
     }
 
-    private static void buildAttachArgs(ArrayList<String> newArgs,
-                                        String pid, String exe, String core) {
-        if ((pid == null) && (exe == null)) {
-            throw new IllegalArgumentException(
-                                     "You have to set --pid or --exe.");
+    private static void buildAttachArgs(ArrayList<String> newArgs, String pid,
+                                  String exe, String core, boolean allowEmpty) {
+        if (!allowEmpty && (pid == null) && (exe == null)) {
+            throw new SAGetoptException("You have to set --pid or --exe.");
         }
 
         if (pid != null) { // Attach to live process
             if (exe != null) {
-                throw new IllegalArgumentException(
-                                             "Unnecessary argument: --exe");
+                throw new SAGetoptException("Unnecessary argument: --exe");
             } else if (core != null) {
-                throw new IllegalArgumentException(
-                                             "Unnecessary argument: --core");
+                throw new SAGetoptException("Unnecessary argument: --core");
             } else if (!pid.matches("^\\d+$")) {
-                throw new IllegalArgumentException("Invalid pid: " + pid);
+                throw new SAGetoptException("Invalid pid: " + pid);
             }
 
             newArgs.add(pid);
-        } else {
+        } else if (exe != null) {
             if (exe.length() == 0) {
-                throw new IllegalArgumentException("You have to set --exe.");
+                throw new SAGetoptException("You have to set --exe.");
             }
 
             newArgs.add(exe);
 
             if ((core == null) || (core.length() == 0)) {
-                throw new IllegalArgumentException("You have to set --core.");
+                throw new SAGetoptException("You have to set --core.");
             }
 
             newArgs.add(core);
@@ -170,7 +167,7 @@
             }
         }
 
-        buildAttachArgs(newArgs, pid, exe, core);
+        buildAttachArgs(newArgs, pid, exe, core, true);
         CLHSDB.main(newArgs.toArray(new String[newArgs.size()]));
     }
 
@@ -199,7 +196,7 @@
             }
         }
 
-        buildAttachArgs(newArgs, pid, exe, core);
+        buildAttachArgs(newArgs, pid, exe, core, true);
         HSDB.main(newArgs.toArray(new String[newArgs.size()]));
     }
 
@@ -237,7 +234,7 @@
             }
         }
 
-        buildAttachArgs(newArgs, pid, exe, core);
+        buildAttachArgs(newArgs, pid, exe, core, false);
         JStack.main(newArgs.toArray(new String[newArgs.size()]));
     }
 
@@ -287,7 +284,7 @@
             }
         }
 
-        buildAttachArgs(newArgs, pid, exe, core);
+        buildAttachArgs(newArgs, pid, exe, core, false);
         JMap.main(newArgs.toArray(new String[newArgs.size()]));
     }
 
@@ -325,7 +322,7 @@
             }
         }
 
-        buildAttachArgs(newArgs, pid, exe, core);
+        buildAttachArgs(newArgs, pid, exe, core, false);
         JInfo.main(newArgs.toArray(new String[newArgs.size()]));
     }
 
@@ -358,7 +355,7 @@
             }
         }
 
-        buildAttachArgs(newArgs, pid, exe, core);
+        buildAttachArgs(newArgs, pid, exe, core, false);
         JSnap.main(newArgs.toArray(new String[newArgs.size()]));
     }
 
@@ -416,8 +413,8 @@
                 return;
             }
 
-            throw new IllegalArgumentException("Unknown tool: " + args[0]);
-        } catch (Exception e) {
+            throw new SAGetoptException("Unknown tool: " + args[0]);
+        } catch (SAGetoptException e) {
             System.err.println(e.getMessage());
             toolHelp(args[0]);
         }
--- a/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/oops/MethodCounters.java	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/oops/MethodCounters.java	Thu Apr 14 14:05:09 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -47,8 +47,10 @@
   private static synchronized void initialize(TypeDataBase db) throws WrongTypeException {
     Type type      = db.lookupType("MethodCounters");
 
-    interpreterInvocationCountField = new CIntField(type.getCIntegerField("_interpreter_invocation_count"), 0);
-    interpreterThrowoutCountField = new CIntField(type.getCIntegerField("_interpreter_throwout_count"), 0);
+    if (VM.getVM().isServerCompiler()) {
+      interpreterInvocationCountField = new CIntField(type.getCIntegerField("_interpreter_invocation_count"), 0);
+      interpreterThrowoutCountField = new CIntField(type.getCIntegerField("_interpreter_throwout_count"), 0);
+    }
     if (!VM.getVM().isCore()) {
       invocationCounter        = new CIntField(type.getCIntegerField("_invocation_counter"), 0);
       backedgeCounter          = new CIntField(type.getCIntegerField("_backedge_counter"), 0);
@@ -61,11 +63,19 @@
   private static CIntField backedgeCounter;
 
   public int interpreterInvocationCount() {
-    return (int) interpreterInvocationCountField.getValue(this);
+      if (interpreterInvocationCountField != null) {
+        return (int) interpreterInvocationCountField.getValue(this);
+      } else {
+        return 0;
+      }
   }
 
   public int interpreterThrowoutCount() {
-    return (int) interpreterThrowoutCountField.getValue(this);
+      if (interpreterThrowoutCountField != null) {
+        return (int) interpreterThrowoutCountField.getValue(this);
+      } else {
+        return 0;
+      }
   }
   public long getInvocationCounter() {
     if (Assert.ASSERTS_ENABLED) {
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotMemoryAccessProviderImpl.java	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotMemoryAccessProviderImpl.java	Thu Apr 14 14:05:09 2016 +0000
@@ -23,7 +23,6 @@
 package jdk.vm.ci.hotspot;
 
 import static jdk.vm.ci.hotspot.UnsafeAccess.UNSAFE;
-import jdk.vm.ci.common.JVMCIError;
 import jdk.vm.ci.hotspot.HotSpotVMConfig.CompressEncoding;
 import jdk.vm.ci.meta.Constant;
 import jdk.vm.ci.meta.JavaConstant;
@@ -59,7 +58,7 @@
                     return true;
                 }
             } else {
-                throw new JVMCIError("%s", metaspaceObject);
+                throw new IllegalArgumentException(String.valueOf(metaspaceObject));
             }
         }
         return false;
@@ -75,7 +74,7 @@
                 return prim.asLong();
             }
         }
-        throw new JVMCIError("%s", base);
+        throw new IllegalArgumentException(String.valueOf(base));
     }
 
     private static long readRawValue(Constant baseConstant, long displacement, int bits) {
@@ -91,7 +90,7 @@
                 case Long.SIZE:
                     return UNSAFE.getLong(base, displacement);
                 default:
-                    throw new JVMCIError("%d", bits);
+                    throw new IllegalArgumentException(String.valueOf(bits));
             }
         } else {
             long pointer = asRawPointer(baseConstant);
@@ -105,7 +104,7 @@
                 case Long.SIZE:
                     return UNSAFE.getLong(pointer + displacement);
                 default:
-                    throw new JVMCIError("%d", bits);
+                    throw new IllegalArgumentException(String.valueOf(bits));
             }
         }
     }
@@ -178,7 +177,7 @@
                 case Double:
                     return JavaConstant.forDouble(Double.longBitsToDouble(rawValue));
                 default:
-                    throw new JVMCIError("Unsupported kind: %s", kind);
+                    throw new IllegalArgumentException("Unsupported kind: " + kind);
             }
         } catch (NullPointerException e) {
             return null;
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.meta/src/jdk/vm/ci/meta/MemoryAccessProvider.java	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.meta/src/jdk/vm/ci/meta/MemoryAccessProvider.java	Thu Apr 14 14:05:09 2016 +0000
@@ -35,8 +35,10 @@
      * @param displacement the displacement within the object in bytes
      * @return the read value encapsulated in a {@link JavaConstant} object, or {@code null} if the
      *         value cannot be read.
+     * @throws IllegalArgumentException if {@code kind} is {@link JavaKind#Void} or not
+     *             {@linkplain JavaKind#isPrimitive() primitive} kind
      */
-    JavaConstant readUnsafeConstant(JavaKind kind, JavaConstant base, long displacement);
+    JavaConstant readUnsafeConstant(JavaKind kind, JavaConstant base, long displacement) throws IllegalArgumentException;
 
     /**
      * Reads a primitive value using a base address and a displacement.
@@ -46,8 +48,11 @@
      * @param displacement the displacement within the object in bytes
      * @param bits the number of bits to read from memory
      * @return the read value encapsulated in a {@link JavaConstant} object of {@link JavaKind} kind
+     * @throws IllegalArgumentException if {@code kind} is {@link JavaKind#Void} or not
+     *             {@linkplain JavaKind#isPrimitive() primitive} kind or {@code bits} is not 8, 16,
+     *             32 or 64
      */
-    JavaConstant readPrimitiveConstant(JavaKind kind, Constant base, long displacement, int bits);
+    JavaConstant readPrimitiveConstant(JavaKind kind, Constant base, long displacement, int bits) throws IllegalArgumentException;
 
     /**
      * Reads a Java {@link Object} value using a base address and a displacement.
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.meta/src/jdk/vm/ci/meta/MethodHandleAccessProvider.java	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.meta/src/jdk/vm/ci/meta/MethodHandleAccessProvider.java	Thu Apr 14 14:05:09 2016 +0000
@@ -51,6 +51,8 @@
     /**
      * Returns the method handle method intrinsic identifier for the provided method, or
      * {@code null} if the method is not an intrinsic processed by this interface.
+     *
+     * @throws NullPointerException if {@code method} is null
      */
     IntrinsicMethod lookupMethodHandleIntrinsic(ResolvedJavaMethod method);
 
@@ -58,19 +60,27 @@
      * Resolves the invocation target for an invocation of {@link IntrinsicMethod#INVOKE_BASIC
      * MethodHandle.invokeBasic} with the given constant receiver {@link MethodHandle}. Returns
      * {@code null} if the invocation target is not available at this time.
-     * <p>
+     *
      * The first invocations of a method handle can use an interpreter to lookup the actual invoked
      * method; frequently executed method handles can use Java bytecode generation to avoid the
      * interpreter overhead. If the parameter forceBytecodeGeneration is set to true, the VM should
      * try to generate bytecodes before this method returns.
+     *
+     * @returns {@code null} if {@code methodHandle} is not a {@link MethodHandle} or the invocation
+     *          target is not available at this time
+     * @throws NullPointerException if {@code methodHandle} is null
      */
     ResolvedJavaMethod resolveInvokeBasicTarget(JavaConstant methodHandle, boolean forceBytecodeGeneration);
 
     /**
      * Resolves the invocation target for an invocation of a {@code MethodHandle.linkTo*} method
      * with the given constant member name. The member name is the last parameter of the
-     * {@code linkTo*} method. Returns {@code null} if the invocation target is not available at
-     * this time.
+     * {@code linkTo*} method.
+     *
+     * @returns {@code null} if the invocation target is not available at this time
+     * @throws NullPointerException if {@code memberName} is null
+     * @throws IllegalArgumentException if {@code memberName} is not a
+     *             {@code java.lang.invoke.MemberName}
      */
     ResolvedJavaMethod resolveLinkToTarget(JavaConstant memberName);
 }
--- a/src/os/aix/vm/perfMemory_aix.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os/aix/vm/perfMemory_aix.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -956,7 +956,7 @@
 #ifdef O_NOFOLLOW
   RESTARTABLE(::open(filename, oflags), result);
 #else
-  open_o_nofollow(filename, oflags);
+  result = open_o_nofollow(filename, oflags);
 #endif
 
   if (result == OS_ERR) {
--- a/src/os/bsd/vm/os_bsd.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os/bsd/vm/os_bsd.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -3732,6 +3732,28 @@
   return ::stat(pathbuf, sbuf);
 }
 
+static inline struct timespec get_mtime(const char* filename) {
+  struct stat st;
+  int ret = os::stat(filename, &st);
+  assert(ret == 0, "failed to stat() file '%s': %s", filename, strerror(errno));
+#ifdef __APPLE__
+  return st.st_mtimespec;
+#else
+  return st.st_mtim;
+#endif
+}
+
+int os::compare_file_modified_times(const char* file1, const char* file2) {
+  struct timespec filetime1 = get_mtime(file1);
+  struct timespec filetime2 = get_mtime(file2);
+  int diff = filetime1.tv_sec - filetime2.tv_sec;
+  if (diff == 0) {
+    return filetime1.tv_nsec - filetime2.tv_nsec;
+  }
+  return diff;
+}
+
+
 bool os::check_heap(bool force) {
   return true;
 }
--- a/src/os/linux/vm/os_linux.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os/linux/vm/os_linux.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -6008,7 +6008,22 @@
   return yes;
 }
 
-
+static inline struct timespec get_mtime(const char* filename) {
+  struct stat st;
+  int ret = os::stat(filename, &st);
+  assert(ret == 0, "failed to stat() file '%s': %s", filename, strerror(errno));
+  return st.st_mtim;
+}
+
+int os::compare_file_modified_times(const char* file1, const char* file2) {
+  struct timespec filetime1 = get_mtime(file1);
+  struct timespec filetime2 = get_mtime(file2);
+  int diff = filetime1.tv_sec - filetime2.tv_sec;
+  if (diff == 0) {
+    return filetime1.tv_nsec - filetime2.tv_nsec;
+  }
+  return diff;
+}
 
 /////////////// Unit tests ///////////////
 
--- a/src/os/posix/vm/os_posix.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os/posix/vm/os_posix.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -181,6 +181,10 @@
     return vsnprintf(buf, len, fmt, args);
 }
 
+int os::fileno(FILE* fp) {
+  return ::fileno(fp);
+}
+
 void os::Posix::print_load_average(outputStream* st) {
   st->print("load average:");
   double loadavg[3];
--- a/src/os/solaris/vm/os_solaris.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os/solaris/vm/os_solaris.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -161,6 +161,7 @@
 
 address os::Solaris::_main_stack_base = NULL;  // 4352906 workaround
 
+os::Solaris::pthread_setname_np_func_t os::Solaris::_pthread_setname_np = NULL;
 
 // "default" initializers for missing libc APIs
 extern "C" {
@@ -441,8 +442,15 @@
 }
 
 void os::set_native_thread_name(const char *name) {
-  // Not yet implemented.
-  return;
+  if (Solaris::_pthread_setname_np != NULL) {
+    // Only the first 31 bytes of 'name' are processed by pthread_setname_np
+    // but we explicitly copy into a size-limited buffer to avoid any
+    // possible overflow.
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%s", name);
+    buf[sizeof(buf) - 1] = '\0';
+    Solaris::_pthread_setname_np(pthread_self(), buf);
+  }
 }
 
 bool os::distribute_processes(uint length, uint* distribution) {
@@ -1819,6 +1827,19 @@
   return ::stat(pathbuf, sbuf);
 }
 
+static inline time_t get_mtime(const char* filename) {
+  struct stat st;
+  int ret = os::stat(filename, &st);
+  assert(ret == 0, "failed to stat() file '%s': %s", filename, strerror(errno));
+  return st.st_mtime;
+}
+
+int os::compare_file_modified_times(const char* file1, const char* file2) {
+  time_t t1 = get_mtime(file1);
+  time_t t2 = get_mtime(file2);
+  return t1 - t2;
+}
+
 static bool _print_ascii_file(const char* filename, outputStream* st) {
   int fd = ::open(filename, O_RDONLY);
   if (fd == -1) {
@@ -4410,6 +4431,13 @@
   // the minimum of what the OS supports (thr_min_stack()), and
   // enough to allow the thread to get to user bytecode execution.
   Solaris::min_stack_allowed = MAX2(thr_min_stack(), Solaris::min_stack_allowed);
+
+  // retrieve entry point for pthread_setname_np
+  void * handle = dlopen("libc.so.1", RTLD_LAZY);
+  if (handle != NULL) {
+    Solaris::_pthread_setname_np =
+        (Solaris::pthread_setname_np_func_t)dlsym(handle, "pthread_setname_np");
+  }
 }
 
 // To install functions for atexit system call
--- a/src/os/solaris/vm/os_solaris.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os/solaris/vm/os_solaris.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -122,6 +122,9 @@
   static int _SIGasync;                      // user-overridable ASYNC_SIGNAL
   static void set_SIGasync(int newsig) { _SIGasync = newsig; }
 
+  typedef int (*pthread_setname_np_func_t)(pthread_t, const char*);
+  static pthread_setname_np_func_t _pthread_setname_np;
+
  public:
   // Large Page Support--ISM.
   static bool largepage_range(char* addr, size_t size);
--- a/src/os/windows/vm/os_windows.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os/windows/vm/os_windows.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1594,6 +1594,19 @@
   return ret;
 }
 
+static inline time_t get_mtime(const char* filename) {
+  struct stat st;
+  int ret = os::stat(filename, &st);
+  assert(ret == 0, "failed to stat() file '%s': %s", filename, strerror(errno));
+  return st.st_mtime;
+}
+
+int os::compare_file_modified_times(const char* file1, const char* file2) {
+  time_t t1 = get_mtime(file1);
+  time_t t2 = get_mtime(file2);
+  return t1 - t2;
+}
+
 void os::print_os_info_brief(outputStream* st) {
   os::print_os_info(st);
 }
@@ -3006,9 +3019,7 @@
       }
 #ifdef ASSERT
       if (should_inject_error) {
-        if (TracePageSizes && Verbose) {
-          tty->print_cr("Reserving pages individually failed.");
-        }
+        log_develop_debug(pagesize)("Reserving pages individually failed.");
       }
 #endif
       return NULL;
@@ -3192,9 +3203,8 @@
   // 1) the UseLargePagesIndividualAllocation flag is set (set by default on WS2003)
   // 2) NUMA Interleaving is enabled, in which case we use a different node for each page
   if (UseLargePagesIndividualAllocation || UseNUMAInterleaving) {
-    if (TracePageSizes && Verbose) {
-      tty->print_cr("Reserving large pages individually.");
-    }
+    log_debug(pagesize)("Reserving large pages individually.");
+
     char * p_buf = allocate_pages_individually(bytes, addr, flags, prot, LargePagesIndividualAllocationInjectError);
     if (p_buf == NULL) {
       // give an appropriate warning message
@@ -3211,9 +3221,8 @@
     return p_buf;
 
   } else {
-    if (TracePageSizes && Verbose) {
-      tty->print_cr("Reserving large pages in a single large chunk.");
-    }
+    log_debug(pagesize)("Reserving large pages in a single large chunk.");
+
     // normal policy just allocate it all at once
     DWORD flag = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
     char * res = (char *)VirtualAlloc(addr, bytes, flag, prot);
@@ -4593,6 +4602,9 @@
   return 0;
 }
 
+int os::fileno(FILE* fp) {
+  return _fileno(fp);
+}
 
 // This code is a copy of JDK's sysSync
 // from src/windows/hpi/src/sys_api_md.c
--- a/src/os_cpu/bsd_x86/vm/thread_bsd_x86.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os_cpu/bsd_x86/vm/thread_bsd_x86.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -23,6 +23,7 @@
  */
 
 #include "precompiled.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/thread.inline.hpp"
 
@@ -64,6 +65,14 @@
       return false;
     }
 
+#if INCLUDE_CDS
+    if (UseSharedSpaces && MetaspaceShared::is_in_shared_region(addr.pc(), MetaspaceShared::md)) {
+      // In the middle of a trampoline call. Bail out for safety.
+      // This happens rarely so shouldn't affect profiling.
+      return false;
+    }
+#endif
+
     frame ret_frame(ret_sp, ret_fp, addr.pc());
     if (!ret_frame.safe_for_sender(jt)) {
 #if defined(COMPILER2) || INCLUDE_JVMCI
--- a/src/os_cpu/linux_aarch64/vm/thread_linux_aarch64.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os_cpu/linux_aarch64/vm/thread_linux_aarch64.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -24,6 +24,7 @@
  */
 
 #include "precompiled.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/thread.inline.hpp"
 
@@ -66,6 +67,14 @@
       return false;
     }
 
+#if INCLUDE_CDS
+    if (UseSharedSpaces && MetaspaceShared::is_in_shared_region(addr.pc(), MetaspaceShared::md)) {
+      // In the middle of a trampoline call. Bail out for safety.
+      // This happens rarely so shouldn't affect profiling.
+      return false;
+    }
+#endif
+
     frame ret_frame(ret_sp, ret_fp, addr.pc());
     if (!ret_frame.safe_for_sender(jt)) {
 #ifdef COMPILER2
--- a/src/os_cpu/linux_sparc/vm/thread_linux_sparc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os_cpu/linux_sparc/vm/thread_linux_sparc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -23,6 +23,7 @@
  */
 
 #include "precompiled.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/thread.inline.hpp"
 
@@ -64,6 +65,14 @@
     return false;
   }
 
+#if INCLUDE_CDS
+  if (UseSharedSpaces && MetaspaceShared::is_in_shared_region(addr.pc(), MetaspaceShared::md)) {
+    // In the middle of a trampoline call. Bail out for safety.
+    // This happens rarely so shouldn't affect profiling.
+    return false;
+  }
+#endif
+
   // we were running Java code when SIGPROF came in
   if (isInJava) {
     // If we have a last_Java_sp, then the SIGPROF signal caught us
--- a/src/os_cpu/linux_x86/vm/thread_linux_x86.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os_cpu/linux_x86/vm/thread_linux_x86.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -23,6 +23,7 @@
  */
 
 #include "precompiled.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/thread.inline.hpp"
 
@@ -65,6 +66,14 @@
       return false;
     }
 
+#if INCLUDE_CDS
+    if (UseSharedSpaces && MetaspaceShared::is_in_shared_region(addr.pc(), MetaspaceShared::md)) {
+      // In the middle of a trampoline call. Bail out for safety.
+      // This happens rarely so shouldn't affect profiling.
+      return false;
+    }
+#endif
+
     frame ret_frame(ret_sp, ret_fp, addr.pc());
     if (!ret_frame.safe_for_sender(jt)) {
 #if defined(COMPILER2) || INCLUDE_JVMCI
--- a/src/os_cpu/solaris_sparc/vm/thread_solaris_sparc.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os_cpu/solaris_sparc/vm/thread_solaris_sparc.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -23,6 +23,7 @@
  */
 
 #include "precompiled.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/thread.inline.hpp"
 
@@ -77,6 +78,14 @@
     return false;
   }
 
+#if INCLUDE_CDS
+  if (UseSharedSpaces && MetaspaceShared::is_in_shared_region(addr.pc(), MetaspaceShared::md)) {
+    // In the middle of a trampoline call. Bail out for safety.
+    // This happens rarely so shouldn't affect profiling.
+    return false;
+  }
+#endif
+
   frame ret_frame(ret_sp, frame::unpatchable, addr.pc());
 
   // we were running Java code when SIGPROF came in
--- a/src/os_cpu/solaris_x86/vm/thread_solaris_x86.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os_cpu/solaris_x86/vm/thread_solaris_x86.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -23,6 +23,7 @@
  */
 
 #include "precompiled.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/thread.inline.hpp"
 
@@ -70,6 +71,14 @@
     return false;
   }
 
+#if INCLUDE_CDS
+  if (UseSharedSpaces && MetaspaceShared::is_in_shared_region(addr.pc(), MetaspaceShared::md)) {
+    // In the middle of a trampoline call. Bail out for safety.
+    // This happens rarely so shouldn't affect profiling.
+    return false;
+  }
+#endif
+
   // If sp and fp are nonsense just leave them out
 
   if (!jt->on_local_stack((address)ret_sp)) {
--- a/src/os_cpu/windows_x86/vm/thread_windows_x86.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/os_cpu/windows_x86/vm/thread_windows_x86.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -23,6 +23,7 @@
  */
 
 #include "precompiled.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/thread.inline.hpp"
 
@@ -72,6 +73,14 @@
       return false;
     }
 
+#if INCLUDE_CDS
+    if (UseSharedSpaces && MetaspaceShared::is_in_shared_region(addr.pc(), MetaspaceShared::md)) {
+      // In the middle of a trampoline call. Bail out for safety.
+      // This happens rarely so shouldn't affect profiling.
+      return false;
+    }
+#endif
+
     frame ret_frame(ret_sp, ret_fp, addr.pc());
     if (!ret_frame.safe_for_sender(jt)) {
 #if defined(COMPILER2) || INCLUDE_JVMCI
--- a/src/share/vm/asm/assembler.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/asm/assembler.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -33,7 +33,6 @@
 #include "runtime/vm_version.hpp"
 #include "utilities/debug.hpp"
 #include "utilities/growableArray.hpp"
-#include "utilities/top.hpp"
 
 // This file contains platform-independent assembler declarations.
 
--- a/src/share/vm/asm/register.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/asm/register.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -25,7 +25,8 @@
 #ifndef SHARE_VM_ASM_REGISTER_HPP
 #define SHARE_VM_ASM_REGISTER_HPP
 
-#include "utilities/top.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/globalDefinitions.hpp"
 
 // Use AbstractRegister as shortcut
 class AbstractRegisterImpl;
--- a/src/share/vm/c1/c1_LIRAssembler.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/c1/c1_LIRAssembler.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -556,17 +556,16 @@
       leal(op->in_opr(), op->result_opr());
       break;
 
-    case lir_null_check:
-      if (GenerateCompilerNullChecks) {
-        ImplicitNullCheckStub* stub = add_debug_info_for_null_check_here(op->info());
+    case lir_null_check: {
+      ImplicitNullCheckStub* stub = add_debug_info_for_null_check_here(op->info());
 
-        if (op->in_opr()->is_single_cpu()) {
-          _masm->null_check(op->in_opr()->as_register(), stub->entry());
-        } else {
-          Unimplemented();
-        }
+      if (op->in_opr()->is_single_cpu()) {
+        _masm->null_check(op->in_opr()->as_register(), stub->entry());
+      } else {
+        Unimplemented();
       }
       break;
+    }
 
     case lir_monaddr:
       monitor_address(op->in_opr()->as_constant_ptr()->as_jint(), op->result_opr());
--- a/src/share/vm/c1/c1_LIRAssembler.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/c1/c1_LIRAssembler.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -28,7 +28,6 @@
 #include "c1/c1_CodeStubs.hpp"
 #include "ci/ciMethodData.hpp"
 #include "oops/methodData.hpp"
-#include "utilities/top.hpp"
 
 class Compilation;
 class ScopeValue;
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -2041,8 +2041,7 @@
   // to avoid a fixed interval with an oop during the null check.
   // Use a copy of the CodeEmitInfo because debug information is
   // different for null_check and throw.
-  if (GenerateCompilerNullChecks &&
-      (x->exception()->as_NewInstance() == NULL && x->exception()->as_ExceptionObject() == NULL)) {
+  if (x->exception()->as_NewInstance() == NULL && x->exception()->as_ExceptionObject() == NULL) {
     // if the exception object wasn't created using new then it might be null.
     __ null_check(exception_opr, new CodeEmitInfo(info, x->state()->copy(ValueStack::ExceptionState, x->state()->bci())));
   }
--- a/src/share/vm/c1/c1_globals.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/c1/c1_globals.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -176,7 +176,7 @@
   product(bool, InlineSynchronizedMethods, true,                            \
           "Inline synchronized methods")                                    \
                                                                             \
-  diagnostic(bool, InlineNIOCheckIndex, true,                               \
+  develop(bool, InlineNIOCheckIndex, true,                                  \
           "Intrinsify java.nio.Buffer.checkIndex")                          \
                                                                             \
   develop(bool, CanonicalizeNodes, true,                                    \
--- a/src/share/vm/ci/ciFlags.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/ci/ciFlags.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -29,6 +29,7 @@
 #include "memory/allocation.hpp"
 #include "prims/jvm.h"
 #include "utilities/accessFlags.hpp"
+#include "utilities/ostream.hpp"
 
 // ciFlags
 //
--- a/src/share/vm/classfile/classFileStream.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/classfile/classFileStream.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -25,8 +25,9 @@
 #ifndef SHARE_VM_CLASSFILE_CLASSFILESTREAM_HPP
 #define SHARE_VM_CLASSFILE_CLASSFILESTREAM_HPP
 
+#include "memory/allocation.hpp"
 #include "utilities/bytes.hpp"
-#include "utilities/top.hpp"
+#include "utilities/exceptions.hpp"
 
 // Input stream for reading .class file
 //
--- a/src/share/vm/classfile/javaClasses.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/classfile/javaClasses.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -2187,43 +2187,19 @@
 }
 
 Method* java_lang_StackFrameInfo::get_method(Handle stackFrame, InstanceKlass* holder, TRAPS) {
-  if (MemberNameInStackFrame) {
-    Handle mname(THREAD, stackFrame->obj_field(_memberName_offset));
-    Method* method = (Method*)java_lang_invoke_MemberName::vmtarget(mname());
-    // we should expand MemberName::name when Throwable uses StackTrace
-    // MethodHandles::expand_MemberName(mname, MethodHandles::_suppress_defc|MethodHandles::_suppress_type, CHECK_NULL);
-    return method;
-  } else {
-    short mid       = stackFrame->short_field(_mid_offset);
-    short version   = stackFrame->short_field(_version_offset);
-    return holder->method_with_orig_idnum(mid, version);
-  }
-}
-
-Symbol* java_lang_StackFrameInfo::get_file_name(Handle stackFrame, InstanceKlass* holder) {
-  if (MemberNameInStackFrame) {
-    return holder->source_file_name();
-  } else {
-    short version = stackFrame->short_field(_version_offset);
-    return Backtrace::get_source_file_name(holder, version);
-  }
+  Handle mname(THREAD, stackFrame->obj_field(_memberName_offset));
+  Method* method = (Method*)java_lang_invoke_MemberName::vmtarget(mname());
+  // we should expand MemberName::name when Throwable uses StackTrace
+  // MethodHandles::expand_MemberName(mname, MethodHandles::_suppress_defc|MethodHandles::_suppress_type, CHECK_NULL);
+  return method;
 }
 
 void java_lang_StackFrameInfo::set_method_and_bci(Handle stackFrame, const methodHandle& method, int bci) {
   // set Method* or mid/cpref
-  if (MemberNameInStackFrame) {
-    oop mname = stackFrame->obj_field(_memberName_offset);
-    InstanceKlass* ik = method->method_holder();
-    CallInfo info(method(), ik);
-    MethodHandles::init_method_MemberName(mname, info);
-  } else {
-    int mid = method->orig_method_idnum();
-    int cpref = method->name_index();
-    assert((jushort)mid == mid,        "mid should be short");
-    assert((jushort)cpref == cpref,    "cpref should be short");
-    java_lang_StackFrameInfo::set_mid(stackFrame(),     (short)mid);
-    java_lang_StackFrameInfo::set_cpref(stackFrame(),   (short)cpref);
-  }
+  oop mname = stackFrame->obj_field(_memberName_offset);
+  InstanceKlass* ik = method->method_holder();
+  CallInfo info(method(), ik);
+  MethodHandles::init_method_MemberName(mname, info);
   // set bci
   java_lang_StackFrameInfo::set_bci(stackFrame(), bci);
   // method may be redefined; store the version
@@ -2232,52 +2208,23 @@
   java_lang_StackFrameInfo::set_version(stackFrame(), (short)version);
 }
 
-void java_lang_StackFrameInfo::fill_methodInfo(Handle stackFrame, TRAPS) {
+void java_lang_StackFrameInfo::to_stack_trace_element(Handle stackFrame, Handle stack_trace_element, TRAPS) {
   ResourceMark rm(THREAD);
-  oop k = stackFrame->obj_field(_declaringClass_offset);
-  InstanceKlass* holder = InstanceKlass::cast(java_lang_Class::as_Klass(k));
+  Handle k (THREAD, stackFrame->obj_field(_declaringClass_offset));
+  InstanceKlass* holder = InstanceKlass::cast(java_lang_Class::as_Klass(k()));
   Method* method = java_lang_StackFrameInfo::get_method(stackFrame, holder, CHECK);
-  int bci = stackFrame->int_field(_bci_offset);
-
-  // The method can be NULL if the requested class version is gone
-  Symbol* sym = (method != NULL) ? method->name() : NULL;
-  if (MemberNameInStackFrame) {
-    assert(sym != NULL, "MemberName must have method name");
-  } else {
-      // The method can be NULL if the requested class version is gone
-    if (sym == NULL) {
-      short cpref   = stackFrame->short_field(_cpref_offset);
-      sym = holder->constants()->symbol_at(cpref);
-    }
-  }
-
-  // set method name
-  oop methodname = StringTable::intern(sym, CHECK);
-  java_lang_StackFrameInfo::set_methodName(stackFrame(), methodname);
-
-  // set file name and line number
-  Symbol* source = get_file_name(stackFrame, holder);
-  if (source != NULL) {
-    oop filename = StringTable::intern(source, CHECK);
-    java_lang_StackFrameInfo::set_fileName(stackFrame(), filename);
-  }
-
-  // if the method has been redefined, the bci is no longer applicable
+
   short version = stackFrame->short_field(_version_offset);
-  if (version_matches(method, version)) {
-    int line_number = Backtrace::get_line_number(method, bci);
-    java_lang_StackFrameInfo::set_lineNumber(stackFrame(), line_number);
-  }
+  short bci = stackFrame->short_field(_bci_offset);
+  int cpref = method->name_index();
+  java_lang_StackTraceElement::fill_in(stack_trace_element, holder, method, version, bci, cpref, CHECK);
 }
 
 void java_lang_StackFrameInfo::compute_offsets() {
   Klass* k = SystemDictionary::StackFrameInfo_klass();
   compute_offset(_declaringClass_offset, k, vmSymbols::declaringClass_name(),  vmSymbols::class_signature());
   compute_offset(_memberName_offset,     k, vmSymbols::memberName_name(),  vmSymbols::object_signature());
-  compute_offset(_bci_offset,            k, vmSymbols::bci_name(),         vmSymbols::int_signature());
-  compute_offset(_methodName_offset,     k, vmSymbols::methodName_name(),  vmSymbols::string_signature());
-  compute_offset(_fileName_offset,       k, vmSymbols::fileName_name(),    vmSymbols::string_signature());
-  compute_offset(_lineNumber_offset,     k, vmSymbols::lineNumber_name(),  vmSymbols::int_signature());
+  compute_offset(_bci_offset,            k, vmSymbols::bci_name(),         vmSymbols::short_signature());
   STACKFRAMEINFO_INJECTED_FIELDS(INJECTED_FIELD_COMPUTE_OFFSET);
 }
 
@@ -3690,12 +3637,7 @@
 int java_lang_StackFrameInfo::_declaringClass_offset;
 int java_lang_StackFrameInfo::_memberName_offset;
 int java_lang_StackFrameInfo::_bci_offset;
-int java_lang_StackFrameInfo::_methodName_offset;
-int java_lang_StackFrameInfo::_fileName_offset;
-int java_lang_StackFrameInfo::_lineNumber_offset;
-int java_lang_StackFrameInfo::_mid_offset;
 int java_lang_StackFrameInfo::_version_offset;
-int java_lang_StackFrameInfo::_cpref_offset;
 int java_lang_LiveStackFrameInfo::_monitors_offset;
 int java_lang_LiveStackFrameInfo::_locals_offset;
 int java_lang_LiveStackFrameInfo::_operands_offset;
@@ -3741,34 +3683,14 @@
   element->obj_field_put(_declaringClass_offset, value);
 }
 
-void java_lang_StackFrameInfo::set_mid(oop element, short value) {
-  element->short_field_put(_mid_offset, value);
-}
-
 void java_lang_StackFrameInfo::set_version(oop element, short value) {
   element->short_field_put(_version_offset, value);
 }
 
-void java_lang_StackFrameInfo::set_cpref(oop element, short value) {
-  element->short_field_put(_cpref_offset, value);
-}
-
 void java_lang_StackFrameInfo::set_bci(oop element, int value) {
   element->int_field_put(_bci_offset, value);
 }
 
-void java_lang_StackFrameInfo::set_fileName(oop element, oop value) {
-  element->obj_field_put(_fileName_offset, value);
-}
-
-void java_lang_StackFrameInfo::set_methodName(oop element, oop value) {
-  element->obj_field_put(_methodName_offset, value);
-}
-
-void java_lang_StackFrameInfo::set_lineNumber(oop element, int value) {
-  element->int_field_put(_lineNumber_offset, value);
-}
-
 void java_lang_LiveStackFrameInfo::set_monitors(oop element, oop value) {
   element->obj_field_put(_monitors_offset, value);
 }
--- a/src/share/vm/classfile/javaClasses.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/classfile/javaClasses.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1364,25 +1364,16 @@
 // Interface to java.lang.StackFrameInfo objects
 
 #define STACKFRAMEINFO_INJECTED_FIELDS(macro)                      \
-  macro(java_lang_StackFrameInfo, mid,     short_signature, false) \
-  macro(java_lang_StackFrameInfo, version, short_signature, false) \
-  macro(java_lang_StackFrameInfo, cpref,   short_signature, false)
+  macro(java_lang_StackFrameInfo, version, short_signature, false)
 
 class java_lang_StackFrameInfo: AllStatic {
 private:
   static int _declaringClass_offset;
   static int _memberName_offset;
   static int _bci_offset;
-  static int _methodName_offset;
-  static int _fileName_offset;
-  static int _lineNumber_offset;
-
-  static int _mid_offset;
   static int _version_offset;
-  static int _cpref_offset;
 
   static Method* get_method(Handle stackFrame, InstanceKlass* holder, TRAPS);
-  static Symbol* get_file_name(Handle stackFrame, InstanceKlass* holder);
 
 public:
   // Setters
@@ -1390,19 +1381,12 @@
   static void set_method_and_bci(Handle stackFrame, const methodHandle& method, int bci);
   static void set_bci(oop info, int value);
 
-  // set method info in an instance of StackFrameInfo
-  static void fill_methodInfo(Handle info, TRAPS);
-  static void set_methodName(oop info, oop value);
-  static void set_fileName(oop info, oop value);
-  static void set_lineNumber(oop info, int value);
-
-  // these injected fields are only used if -XX:-MemberNameInStackFrame set
-  static void set_mid(oop info, short value);
   static void set_version(oop info, short value);
-  static void set_cpref(oop info, short value);
 
   static void compute_offsets();
 
+  static void to_stack_trace_element(Handle stackFrame, Handle stack_trace_element, TRAPS);
+
   // Debugging
   friend class JavaClasses;
 };
--- a/src/share/vm/classfile/javaClasses.inline.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/classfile/javaClasses.inline.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -222,20 +222,17 @@
   return line_number;
 }
 
-/*
- * Returns the source file name of a given InstanceKlass and version
- */
 inline Symbol* Backtrace::get_source_file_name(InstanceKlass* holder, int version) {
-  // Find the specific ik version that contains this source_file_name_index
-  // via the previous versions list, but use the current version's
-  // constant pool to look it up.  The previous version's index has been
-  // merged for the current constant pool.
-  InstanceKlass* ik = holder->get_klass_version(version);
-  // This version has been cleaned up.
-  if (ik == NULL) return NULL;
-  int source_file_name_index = ik->source_file_name_index();
-  return (source_file_name_index == 0) ?
-      (Symbol*)NULL : holder->constants()->symbol_at(source_file_name_index);
+  // RedefineClasses() currently permits redefine operations to
+  // happen in parallel using a "last one wins" philosophy. That
+  // spec laxness allows the constant pool entry associated with
+  // the source_file_name_index for any older constant pool version
+  // to be unstable so we shouldn't try to use it.
+  if (holder->constants()->version() != version) {
+    return NULL;
+  } else {
+    return holder->source_file_name();
+  }
 }
 
 #endif // SHARE_VM_CLASSFILE_JAVACLASSES_INLINE_HPP
--- a/src/share/vm/classfile/systemDictionary.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/classfile/systemDictionary.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -2067,7 +2067,18 @@
   int  sid  = (info >> CEIL_LG_OPTION_LIMIT);
   Symbol* symbol = vmSymbols::symbol_at((vmSymbols::SID)sid);
   InstanceKlass** klassp = &_well_known_klasses[id];
-  bool must_load = (init_opt < SystemDictionary::Opt);
+
+  bool must_load;
+#if INCLUDE_JVMCI
+  if (EnableJVMCI) {
+    // If JVMCI is enabled we require its classes to be found.
+    must_load = (init_opt < SystemDictionary::Opt) || (init_opt == SystemDictionary::Jvmci);
+  } else
+#endif
+  {
+    must_load = (init_opt < SystemDictionary::Opt);
+  }
+
   if ((*klassp) == NULL) {
     Klass* k;
     if (must_load) {
--- a/src/share/vm/classfile/systemDictionary.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/classfile/systemDictionary.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -241,7 +241,7 @@
 
     Opt,                        // preload tried; NULL if not present
 #if INCLUDE_JVMCI
-    Jvmci,                      // preload tried; error if not present, use only with JVMCI
+    Jvmci,                      // preload tried; error if not present if JVMCI enabled
 #endif
     OPTION_LIMIT,
     CEIL_LG_OPTION_LIMIT = 2    // OPTION_LIMIT <= (1<<CEIL_LG_OPTION_LIMIT)
--- a/src/share/vm/code/codeCache.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/code/codeCache.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1042,6 +1042,14 @@
   }
 }
 
+void CodeCache::cleanup_inline_caches() {
+  assert_locked_or_safepoint(CodeCache_lock);
+  NMethodIterator iter;
+  while(iter.next_alive()) {
+    iter.method()->cleanup_inline_caches(/*clean_all=*/true);
+  }
+}
+
 // Keeps track of time spent for checking dependencies
 NOT_PRODUCT(static elapsedTimer dependentCheckTime;)
 
--- a/src/share/vm/code/codeCache.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/code/codeCache.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -201,6 +201,7 @@
   static bool needs_cache_clean()                     { return _needs_cache_clean; }
   static void set_needs_cache_clean(bool v)           { _needs_cache_clean = v;    }
   static void clear_inline_caches();                  // clear all inline caches
+  static void cleanup_inline_caches();
 
   // Returns true if an own CodeHeap for the given CodeBlobType is available
   static bool heap_available(int code_blob_type);
--- a/src/share/vm/code/nmethod.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/code/nmethod.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1139,8 +1139,7 @@
   }
 }
 
-
-void nmethod::cleanup_inline_caches() {
+void nmethod::cleanup_inline_caches(bool clean_all/*=false*/) {
   assert_locked_or_safepoint(CompiledIC_lock);
 
   // If the method is not entrant or zombie then a JMP is plastered over the
@@ -1170,7 +1169,7 @@
         if( cb != NULL && cb->is_nmethod() ) {
           nmethod* nm = (nmethod*)cb;
           // Clean inline caches pointing to zombie, non-entrant and unloaded methods
-          if (!nm->is_in_use() || (nm->method()->code() != nm)) ic->set_to_clean(is_alive());
+          if (clean_all || !nm->is_in_use() || (nm->method()->code() != nm)) ic->set_to_clean(is_alive());
         }
         break;
       }
@@ -1180,7 +1179,7 @@
         if( cb != NULL && cb->is_nmethod() ) {
           nmethod* nm = (nmethod*)cb;
           // Clean inline caches pointing to zombie, non-entrant and unloaded methods
-          if (!nm->is_in_use() || (nm->method()->code() != nm)) csc->set_to_clean();
+          if (clean_all || !nm->is_in_use() || (nm->method()->code() != nm)) csc->set_to_clean();
         }
         break;
       }
--- a/src/share/vm/code/nmethod.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/code/nmethod.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -599,7 +599,7 @@
   // Inline cache support
   void clear_inline_caches();
   void clear_ic_stubs();
-  void cleanup_inline_caches();
+  void cleanup_inline_caches(bool clean_all = false);
   bool inlinecache_check_contains(address addr) const {
     return (addr >= code_begin() && addr < verified_entry_point());
   }
--- a/src/share/vm/code/relocInfo.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/code/relocInfo.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -26,8 +26,9 @@
 #define SHARE_VM_CODE_RELOCINFO_HPP
 
 #include "memory/allocation.hpp"
-#include "utilities/top.hpp"
+#include "runtime/os.hpp"
 
+class Metadata;
 class NativeMovConstReg;
 
 // Types in this file:
--- a/src/share/vm/code/vmreg.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/code/vmreg.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -28,10 +28,9 @@
 #include "asm/register.hpp"
 #include "memory/allocation.hpp"
 #include "utilities/globalDefinitions.hpp"
-
+#include "utilities/ostream.hpp"
 #ifdef COMPILER2
 #include "opto/adlcVMDeps.hpp"
-#include "utilities/ostream.hpp"
 #endif
 
 //------------------------------VMReg------------------------------------------
--- a/src/share/vm/compiler/compileBroker.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/compiler/compileBroker.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -389,13 +389,16 @@
     task = CompilationPolicy::policy()->select_task(this);
   }
 
-  // Save method pointers across unlock safepoint.  The task is removed from
-  // the compilation queue, which is walked during RedefineClasses.
-  save_method = methodHandle(task->method());
-  save_hot_method = methodHandle(task->hot_method());
+  if (task != NULL) {
+    // Save method pointers across unlock safepoint.  The task is removed from
+    // the compilation queue, which is walked during RedefineClasses.
+    save_method = methodHandle(task->method());
+    save_hot_method = methodHandle(task->hot_method());
 
-  remove(task);
-  purge_stale_tasks(); // may temporarily release MCQ lock
+    remove(task);
+    purge_stale_tasks(); // may temporarily release MCQ lock
+  }
+
   return task;
 }
 
@@ -1784,7 +1787,8 @@
   bool is_osr = (osr_bci != standard_entry_bci);
   bool should_log = (thread->log() != NULL);
   bool should_break = false;
-  int task_level = task->comp_level();
+  const int task_level = task->comp_level();
+  AbstractCompiler* comp = task->compiler();
 
   DirectiveSet* directive;
   {
@@ -1796,7 +1800,7 @@
     assert(!method->is_native(), "no longer compile natives");
 
     // Look up matching directives
-    directive = DirectivesStack::getMatchingDirective(method, compiler(task_level));
+    directive = DirectivesStack::getMatchingDirective(method, comp);
 
     // Save information about this method in case of failure.
     set_last_compile(thread, method, is_osr, task_level);
@@ -1815,13 +1819,13 @@
   int compilable = ciEnv::MethodCompilable;
   const char* failure_reason = NULL;
   const char* retry_message = NULL;
-  AbstractCompiler *comp = compiler(task_level);
 
   int system_dictionary_modification_counter;
   {
     MutexLocker locker(Compile_lock, thread);
     system_dictionary_modification_counter = SystemDictionary::number_of_modifications();
   }
+
 #if INCLUDE_JVMCI
   if (UseJVMCICompiler && comp != NULL && comp->is_jvmci()) {
     JVMCICompiler* jvmci = (JVMCICompiler*) comp;
--- a/src/share/vm/compiler/compileTask.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/compiler/compileTask.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -123,6 +123,13 @@
   _next = NULL;
 }
 
+/**
+ * Returns the compiler for this task.
+ */
+AbstractCompiler* CompileTask::compiler() {
+  return CompileBroker::compiler(_comp_level);
+}
+
 // ------------------------------------------------------------------
 // CompileTask::code/set_code
 //
--- a/src/share/vm/compiler/compileTask.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/compiler/compileTask.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -115,6 +115,8 @@
   int          comp_level()                      { return _comp_level;}
   void         set_comp_level(int comp_level)    { _comp_level = comp_level;}
 
+  AbstractCompiler* compiler();
+
   int          num_inlined_bytecodes() const     { return _num_inlined_bytecodes; }
   void         set_num_inlined_bytecodes(int n)  { _num_inlined_bytecodes = n; }
 
--- a/src/share/vm/gc/cms/compactibleFreeListSpace.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/cms/compactibleFreeListSpace.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -32,6 +32,7 @@
 #include "gc/shared/genCollectedHeap.hpp"
 #include "gc/shared/space.inline.hpp"
 #include "gc/shared/spaceDecorator.hpp"
+#include "logging/logStream.inline.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/resourceArea.hpp"
 #include "memory/universe.inline.hpp"
@@ -505,10 +506,13 @@
     return;
   }
   log.debug("%s", title);
-  _dictionary->report_statistics(log.debug_stream());
+
+  LogStream out(log.debug());
+  _dictionary->report_statistics(&out);
+
   if (log.is_trace()) {
-    ResourceMark rm;
-    reportIndexedFreeListStatistics(log.trace_stream());
+    LogStream trace_out(log.trace());
+    reportIndexedFreeListStatistics(&trace_out);
     size_t total_size = totalSizeInIndexedFreeLists() +
                        _dictionary->total_chunk_size(DEBUG_ONLY(freelistLock()));
     log.trace(" free=" SIZE_FORMAT " frag=%1.4f", total_size, flsFrag());
--- a/src/share/vm/gc/cms/compactibleFreeListSpace.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/cms/compactibleFreeListSpace.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -82,6 +82,8 @@
   template <typename SpaceType>
   friend void CompactibleSpace::scan_and_compact(SpaceType* space);
   template <typename SpaceType>
+  friend void CompactibleSpace::verify_up_to_first_dead(SpaceType* space);
+  template <typename SpaceType>
   friend void CompactibleSpace::scan_and_forward(SpaceType* space, CompactPoint* cp);
 
   // "Size" of chunks of work (executed during parallel remark phases
--- a/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -425,7 +425,7 @@
     st->print(",cms_consumption_rate=%g,time_until_full=%g",
               cms_consumption_rate(), time_until_cms_gen_full());
   }
-  st->print(" ");
+  st->cr();
 }
 #endif // #ifndef PRODUCT
 
@@ -1108,8 +1108,10 @@
 }
 
 bool CMSCollector::shouldConcurrentCollect() {
+  LogTarget(Trace, gc) log;
+
   if (_full_gc_requested) {
-    log_trace(gc)("CMSCollector: collect because of explicit  gc request (or GCLocker)");
+    log.print("CMSCollector: collect because of explicit  gc request (or GCLocker)");
     return true;
   }
 
@@ -1117,21 +1119,22 @@
   // ------------------------------------------------------------------
   // Print out lots of information which affects the initiation of
   // a collection.
-  Log(gc) log;
-  if (log.is_trace() && stats().valid()) {
-    log.trace("CMSCollector shouldConcurrentCollect: ");
-    ResourceMark rm;
-    stats().print_on(log.debug_stream());
-    log.trace("time_until_cms_gen_full %3.7f", stats().time_until_cms_gen_full());
-    log.trace("free=" SIZE_FORMAT, _cmsGen->free());
-    log.trace("contiguous_available=" SIZE_FORMAT, _cmsGen->contiguous_available());
-    log.trace("promotion_rate=%g", stats().promotion_rate());
-    log.trace("cms_allocation_rate=%g", stats().cms_allocation_rate());
-    log.trace("occupancy=%3.7f", _cmsGen->occupancy());
-    log.trace("initiatingOccupancy=%3.7f", _cmsGen->initiating_occupancy());
-    log.trace("cms_time_since_begin=%3.7f", stats().cms_time_since_begin());
-    log.trace("cms_time_since_end=%3.7f", stats().cms_time_since_end());
-    log.trace("metadata initialized %d", MetaspaceGC::should_concurrent_collect());
+  if (log.is_enabled() && stats().valid()) {
+    log.print("CMSCollector shouldConcurrentCollect: ");
+
+    LogStream out(log);
+    stats().print_on(&out);
+
+    log.print("time_until_cms_gen_full %3.7f", stats().time_until_cms_gen_full());
+    log.print("free=" SIZE_FORMAT, _cmsGen->free());
+    log.print("contiguous_available=" SIZE_FORMAT, _cmsGen->contiguous_available());
+    log.print("promotion_rate=%g", stats().promotion_rate());
+    log.print("cms_allocation_rate=%g", stats().cms_allocation_rate());
+    log.print("occupancy=%3.7f", _cmsGen->occupancy());
+    log.print("initiatingOccupancy=%3.7f", _cmsGen->initiating_occupancy());
+    log.print("cms_time_since_begin=%3.7f", stats().cms_time_since_begin());
+    log.print("cms_time_since_end=%3.7f", stats().cms_time_since_end());
+    log.print("metadata initialized %d", MetaspaceGC::should_concurrent_collect());
   }
   // ------------------------------------------------------------------
 
@@ -1149,8 +1152,8 @@
       // this branch will not fire after the first successful CMS
       // collection because the stats should then be valid.
       if (_cmsGen->occupancy() >= _bootstrap_occupancy) {
-        log_trace(gc)(" CMSCollector: collect for bootstrapping statistics: occupancy = %f, boot occupancy = %f",
-                      _cmsGen->occupancy(), _bootstrap_occupancy);
+        log.print(" CMSCollector: collect for bootstrapping statistics: occupancy = %f, boot occupancy = %f",
+                  _cmsGen->occupancy(), _bootstrap_occupancy);
         return true;
       }
     }
@@ -1162,7 +1165,7 @@
   // XXX We need to make sure that the gen expansion
   // criterion dovetails well with this. XXX NEED TO FIX THIS
   if (_cmsGen->should_concurrent_collect()) {
-    log_trace(gc)("CMS old gen initiated");
+    log.print("CMS old gen initiated");
     return true;
   }
 
@@ -1173,12 +1176,12 @@
   assert(gch->collector_policy()->is_generation_policy(),
          "You may want to check the correctness of the following");
   if (gch->incremental_collection_will_fail(true /* consult_young */)) {
-    log_trace(gc)("CMSCollector: collect because incremental collection will fail ");
+    log.print("CMSCollector: collect because incremental collection will fail ");
     return true;
   }
 
   if (MetaspaceGC::should_concurrent_collect()) {
-    log_trace(gc)("CMSCollector: collect for metadata allocation ");
+    log.print("CMSCollector: collect for metadata allocation ");
     return true;
   }
 
@@ -1193,10 +1196,10 @@
     // as we want to be able to trigger the first CMS cycle as well)
     if (stats().cms_time_since_begin() >= (CMSTriggerInterval / ((double) MILLIUNITS))) {
       if (stats().valid()) {
-        log_trace(gc)("CMSCollector: collect because of trigger interval (time since last begin %3.7f secs)",
-                      stats().cms_time_since_begin());
+        log.print("CMSCollector: collect because of trigger interval (time since last begin %3.7f secs)",
+                  stats().cms_time_since_begin());
       } else {
-        log_trace(gc)("CMSCollector: collect because of trigger interval (first collection)");
+        log.print("CMSCollector: collect because of trigger interval (first collection)");
       }
       return true;
     }
--- a/src/share/vm/gc/g1/g1CardLiveData.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/g1/g1CardLiveData.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -206,8 +206,14 @@
       return 0;
     }
     if (hr->is_humongous()) {
-      mark_card_bitmap_range(start, hr->top());
-      return pointer_delta(hr->top(), start, 1);
+      HeapRegion* start_region = hr->humongous_start_region();
+      if (mark_bitmap->isMarked(start_region->bottom())) {
+        mark_card_bitmap_range(start, hr->top());
+        return pointer_delta(hr->top(), start, 1);
+      } else {
+        // Humongous start object was actually dead.
+        return 0;
+      }
     }
 
     assert(start <= hr->end() && start <= ntams && ntams <= hr->end(),
--- a/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1829,10 +1829,14 @@
                                          HeapRegion::GrainBytes,
                                          translation_factor,
                                          mtGC);
-  if (TracePageSizes) {
-    tty->print_cr("G1 '%s': pg_sz=" SIZE_FORMAT " base=" PTR_FORMAT " size=" SIZE_FORMAT " alignment=" SIZE_FORMAT " reqsize=" SIZE_FORMAT,
-                  description, preferred_page_size, p2i(rs.base()), rs.size(), rs.alignment(), size);
-  }
+
+  os::trace_page_sizes_for_requested_size(description,
+                                          size,
+                                          preferred_page_size,
+                                          rs.alignment(),
+                                          rs.base(),
+                                          rs.size());
+
   return result;
 }
 
@@ -1906,26 +1910,28 @@
                                          HeapRegion::GrainBytes,
                                          1,
                                          mtJavaHeap);
-  os::trace_page_sizes("G1 Heap", collector_policy()->min_heap_byte_size(),
-                       max_byte_size, page_size,
+  os::trace_page_sizes("Heap",
+                       collector_policy()->min_heap_byte_size(),
+                       max_byte_size,
+                       page_size,
                        heap_rs.base(),
                        heap_rs.size());
   heap_storage->set_mapping_changed_listener(&_listener);
 
   // Create storage for the BOT, card table, card counts table (hot card cache) and the bitmaps.
   G1RegionToSpaceMapper* bot_storage =
-    create_aux_memory_mapper("Block offset table",
+    create_aux_memory_mapper("Block Offset Table",
                              G1BlockOffsetTable::compute_size(g1_rs.size() / HeapWordSize),
                              G1BlockOffsetTable::heap_map_factor());
 
   ReservedSpace cardtable_rs(G1SATBCardTableLoggingModRefBS::compute_size(g1_rs.size() / HeapWordSize));
   G1RegionToSpaceMapper* cardtable_storage =
-    create_aux_memory_mapper("Card table",
+    create_aux_memory_mapper("Card Table",
                              G1SATBCardTableLoggingModRefBS::compute_size(g1_rs.size() / HeapWordSize),
                              G1SATBCardTableLoggingModRefBS::heap_map_factor());
 
   G1RegionToSpaceMapper* card_counts_storage =
-    create_aux_memory_mapper("Card counts table",
+    create_aux_memory_mapper("Card Counts Table",
                              G1CardCounts::compute_size(g1_rs.size() / HeapWordSize),
                              G1CardCounts::heap_map_factor());
 
@@ -2736,7 +2742,7 @@
   _cmThread->print_on(st);
   st->cr();
   _cm->print_worker_threads_on(st);
-  _cg1r->print_worker_threads_on(st);
+  _cg1r->print_worker_threads_on(st); // also prints the sample thread
   if (G1StringDedup::is_enabled()) {
     G1StringDedup::print_worker_threads_on(st);
   }
@@ -2745,7 +2751,8 @@
 void G1CollectedHeap::gc_threads_do(ThreadClosure* tc) const {
   workers()->threads_do(tc);
   tc->do_thread(_cmThread);
-  _cg1r->threads_do(tc);
+  _cm->threads_do(tc);
+  _cg1r->threads_do(tc); // also iterates over the sample thread
   if (G1StringDedup::is_enabled()) {
     G1StringDedup::threads_do(tc);
   }
@@ -2940,13 +2947,17 @@
       : rset->is_empty();
   }
 
-  bool is_typeArray_region(HeapRegion* region) const {
-    return oop(region->bottom())->is_typeArray();
-  }
-
   bool humongous_region_is_candidate(G1CollectedHeap* heap, HeapRegion* region) const {
     assert(region->is_starts_humongous(), "Must start a humongous object");
 
+    oop obj = oop(region->bottom());
+
+    // Dead objects cannot be eager reclaim candidates. Due to class
+    // unloading it is unsafe to query their classes so we return early.
+    if (heap->is_obj_dead(obj, region)) {
+      return false;
+    }
+
     // Candidate selection must satisfy the following constraints
     // while concurrent marking is in progress:
     //
@@ -2983,7 +2994,7 @@
     // important use case for eager reclaim, and this special handling
     // may reduce needed headroom.
 
-    return is_typeArray_region(region) && is_remset_small(region);
+    return obj->is_typeArray() && is_remset_small(region);
   }
 
  public:
@@ -4441,7 +4452,6 @@
 }
 
 void G1CollectedHeap::preserve_cm_referents(G1ParScanThreadStateSet* per_thread_states) {
-  double preserve_cm_referents_start = os::elapsedTime();
   // Any reference objects, in the collection set, that were 'discovered'
   // by the CM ref processor should have already been copied (either by
   // applying the external root copy closure to the discovered lists, or
@@ -4462,16 +4472,24 @@
   // objects discovered by the STW ref processor in case one of these
   // referents points to another object which is also referenced by an
   // object discovered by the STW ref processor.
-
-  uint no_of_gc_workers = workers()->active_workers();
-
-  G1ParPreserveCMReferentsTask keep_cm_referents(this,
-                                                 per_thread_states,
-                                                 no_of_gc_workers,
-                                                 _task_queues);
-  workers()->run_task(&keep_cm_referents);
-
-  g1_policy()->phase_times()->record_preserve_cm_referents_time_ms((os::elapsedTime() - preserve_cm_referents_start) * 1000.0);
+  double preserve_cm_referents_time = 0.0;
+
+  // To avoid spawning task when there is no work to do, check that
+  // a concurrent cycle is active and that some references have been
+  // discovered.
+  if (concurrent_mark()->cmThread()->during_cycle() &&
+      ref_processor_cm()->has_discovered_references()) {
+    double preserve_cm_referents_start = os::elapsedTime();
+    uint no_of_gc_workers = workers()->active_workers();
+    G1ParPreserveCMReferentsTask keep_cm_referents(this,
+                                                   per_thread_states,
+                                                   no_of_gc_workers,
+                                                   _task_queues);
+    workers()->run_task(&keep_cm_referents);
+    preserve_cm_referents_time = os::elapsedTime() - preserve_cm_referents_start;
+  }
+
+  g1_policy()->phase_times()->record_preserve_cm_referents_time_ms(preserve_cm_referents_time * 1000.0);
 }
 
 // Weak Reference processing during an evacuation pause (part 1).
@@ -4818,6 +4836,9 @@
 
     workers()->run_task(&cleanup_task);
 #ifndef PRODUCT
+    // Need to synchronize with concurrent cleanup since it needs to
+    // finish its card table clearing before we can verify.
+    wait_while_free_regions_coming();
     _verifier->verify_card_table_cleanup();
 #endif
   }
--- a/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -2112,6 +2112,10 @@
   _parallel_workers->print_worker_threads_on(st);
 }
 
+void G1ConcurrentMark::threads_do(ThreadClosure* tc) const {
+  _parallel_workers->threads_do(tc);
+}
+
 void G1ConcurrentMark::print_on_error(outputStream* st) const {
   st->print_cr("Marking Bits (Prev, Next): (CMBitMap*) " PTR_FORMAT ", (CMBitMap*) " PTR_FORMAT,
       p2i(_prevMarkBitMap), p2i(_nextMarkBitMap));
--- a/src/share/vm/gc/g1/g1ConcurrentMark.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/g1/g1ConcurrentMark.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -621,6 +621,7 @@
   void print_summary_info();
 
   void print_worker_threads_on(outputStream* st) const;
+  void threads_do(ThreadClosure* tc) const;
 
   void print_on_error(outputStream* st) const;
 
--- a/src/share/vm/gc/g1/g1YoungGenSizer.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/g1/g1YoungGenSizer.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -25,12 +25,13 @@
 #include "precompiled.hpp"
 #include "gc/g1/g1YoungGenSizer.hpp"
 #include "gc/g1/heapRegion.hpp"
+#include "logging/log.hpp"
 
 G1YoungGenSizer::G1YoungGenSizer() : _sizer_kind(SizerDefaults), _adaptive_size(true),
         _min_desired_young_length(0), _max_desired_young_length(0) {
   if (FLAG_IS_CMDLINE(NewRatio)) {
     if (FLAG_IS_CMDLINE(NewSize) || FLAG_IS_CMDLINE(MaxNewSize)) {
-      warning("-XX:NewSize and -XX:MaxNewSize override -XX:NewRatio");
+      log_warning(gc, ergo)("-XX:NewSize and -XX:MaxNewSize override -XX:NewRatio");
     } else {
       _sizer_kind = SizerNewRatio;
       _adaptive_size = false;
@@ -40,9 +41,9 @@
 
   if (NewSize > MaxNewSize) {
     if (FLAG_IS_CMDLINE(MaxNewSize)) {
-      warning("NewSize (" SIZE_FORMAT "k) is greater than the MaxNewSize (" SIZE_FORMAT "k). "
-              "A new max generation size of " SIZE_FORMAT "k will be used.",
-              NewSize/K, MaxNewSize/K, NewSize/K);
+      log_warning(gc, ergo)("NewSize (" SIZE_FORMAT "k) is greater than the MaxNewSize (" SIZE_FORMAT "k). "
+                            "A new max generation size of " SIZE_FORMAT "k will be used.",
+                            NewSize/K, MaxNewSize/K, NewSize/K);
     }
     MaxNewSize = NewSize;
   }
--- a/src/share/vm/gc/parallel/generationSizer.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/parallel/generationSizer.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -26,18 +26,6 @@
 #include "gc/parallel/generationSizer.hpp"
 #include "gc/shared/collectorPolicy.hpp"
 
-void GenerationSizer::trace_gen_sizes(const char* const str) {
-  if (TracePageSizes) {
-    tty->print_cr("%s:  " SIZE_FORMAT "," SIZE_FORMAT " "
-                  SIZE_FORMAT "," SIZE_FORMAT " "
-                  SIZE_FORMAT,
-                  str,
-                  _min_old_size / K, _max_old_size / K,
-                  _min_young_size / K, _max_young_size / K,
-                  _max_heap_byte_size / K);
-  }
-}
-
 void GenerationSizer::initialize_alignments() {
   _space_alignment = _gen_alignment = default_gen_alignment();
   _heap_alignment = compute_heap_alignment();
@@ -60,7 +48,6 @@
 }
 
 void GenerationSizer::initialize_size_info() {
-  trace_gen_sizes("ps heap raw");
   const size_t max_page_sz = os::page_size_for_region_aligned(_max_heap_byte_size, 8);
   const size_t min_pages = 4; // 1 for eden + 1 for each survivor + 1 for old
   const size_t min_page_sz = os::page_size_for_region_aligned(_min_heap_byte_size, min_pages);
@@ -76,6 +63,4 @@
     initialize_flags();
   }
   GenCollectorPolicy::initialize_size_info();
-
-  trace_gen_sizes("ps heap rnd");
 }
--- a/src/share/vm/gc/parallel/generationSizer.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/parallel/generationSizer.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -33,8 +33,6 @@
 class GenerationSizer : public GenCollectorPolicy {
  private:
 
-  void trace_gen_sizes(const char* const str);
-
   // The alignment used for boundary between young gen and old gen
   static size_t default_gen_alignment() { return 64 * K * HeapWordSize; }
 
--- a/src/share/vm/gc/parallel/parMarkBitMap.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/parallel/parMarkBitMap.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -49,7 +49,7 @@
   const size_t rs_align = page_sz == (size_t) os::vm_page_size() ? 0 :
     MAX2(page_sz, granularity);
   ReservedSpace rs(_reserved_byte_size, rs_align, rs_align > 0);
-  os::trace_page_sizes("par bitmap", raw_bytes, raw_bytes, page_sz,
+  os::trace_page_sizes("Mark Bitmap", raw_bytes, raw_bytes, page_sz,
                        rs.base(), rs.size());
 
   MemTracker::record_virtual_memory_type((address)rs.base(), mtGC);
--- a/src/share/vm/gc/parallel/parallelScavengeHeap.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/parallel/parallelScavengeHeap.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -60,8 +60,10 @@
 
   ReservedSpace heap_rs = Universe::reserve_heap(heap_size, _collector_policy->heap_alignment());
 
-  os::trace_page_sizes("ps main", _collector_policy->min_heap_byte_size(),
-                       heap_size, generation_alignment(),
+  os::trace_page_sizes("Heap",
+                       _collector_policy->min_heap_byte_size(),
+                       heap_size,
+                       generation_alignment(),
                        heap_rs.base(),
                        heap_rs.size());
 
--- a/src/share/vm/gc/parallel/psAdaptiveSizePolicy.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/parallel/psAdaptiveSizePolicy.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -32,7 +32,6 @@
 #include "gc/shared/gcPolicyCounters.hpp"
 #include "logging/log.hpp"
 #include "runtime/timer.hpp"
-#include "utilities/top.hpp"
 
 #include <math.h>
 
--- a/src/share/vm/gc/parallel/psParallelCompact.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/parallel/psParallelCompact.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -426,7 +426,7 @@
   const size_t rs_align = page_sz == (size_t) os::vm_page_size() ? 0 :
     MAX2(page_sz, granularity);
   ReservedSpace rs(_reserved_byte_size, rs_align, rs_align > 0);
-  os::trace_page_sizes("par compact", raw_bytes, raw_bytes, page_sz, rs.base(),
+  os::trace_page_sizes("Parallel Compact Data", raw_bytes, raw_bytes, page_sz, rs.base(),
                        rs.size());
 
   MemTracker::record_virtual_memory_type((address)rs.base(), mtGC);
--- a/src/share/vm/gc/shared/cardTableModRefBS.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/shared/cardTableModRefBS.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -93,7 +93,7 @@
 
   MemTracker::record_virtual_memory_type((address)heap_rs.base(), mtGC);
 
-  os::trace_page_sizes("card table", _guard_index + 1, _guard_index + 1,
+  os::trace_page_sizes("Card Table", _guard_index + 1, _guard_index + 1,
                        _page_size, heap_rs.base(), heap_rs.size());
   if (!heap_rs.is_reserved()) {
     vm_exit_during_initialization("Could not reserve enough space for the "
--- a/src/share/vm/gc/shared/genCollectedHeap.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/shared/genCollectedHeap.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -167,6 +167,14 @@
          SIZE_FORMAT, total_reserved, alignment);
 
   *heap_rs = Universe::reserve_heap(total_reserved, alignment);
+
+  os::trace_page_sizes("Heap",
+                       collector_policy()->min_heap_byte_size(),
+                       total_reserved,
+                       alignment,
+                       heap_rs->base(),
+                       heap_rs->size());
+
   return heap_rs->base();
 }
 
--- a/src/share/vm/gc/shared/referenceProcessor.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/shared/referenceProcessor.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1090,6 +1090,15 @@
   return true;
 }
 
+bool ReferenceProcessor::has_discovered_references() {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+    if (!_discovered_refs[i].is_empty()) {
+      return true;
+    }
+  }
+  return false;
+}
+
 // Preclean the discovered references by removing those
 // whose referents are alive, and by marking from those that
 // are not active. These lists can be handled here
--- a/src/share/vm/gc/shared/referenceProcessor.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/shared/referenceProcessor.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -412,6 +412,9 @@
   // Discover a Reference object, using appropriate discovery criteria
   bool discover_reference(oop obj, ReferenceType rt);
 
+  // Has discovered references that need handling
+  bool has_discovered_references();
+
   // Process references found during GC (called by the garbage collector)
   ReferenceProcessorStats
   process_discovered_references(BoolObjectClosure*           is_alive,
--- a/src/share/vm/gc/shared/space.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/shared/space.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -411,22 +411,6 @@
   return compact_top;
 }
 
-
-bool CompactibleSpace::insert_deadspace(size_t& allowed_deadspace_words,
-                                        HeapWord* q, size_t deadlength) {
-  if (allowed_deadspace_words >= deadlength) {
-    allowed_deadspace_words -= deadlength;
-    CollectedHeap::fill_with_object(q, deadlength);
-    oop(q)->set_mark(oop(q)->mark()->set_marked());
-    assert((int) deadlength == oop(q)->size(), "bad filler object size");
-    // Recall that we required "q == compaction_top".
-    return true;
-  } else {
-    allowed_deadspace_words = 0;
-    return false;
-  }
-}
-
 void ContiguousSpace::prepare_for_compaction(CompactPoint* cp) {
   scan_and_forward(this, cp);
 }
--- a/src/share/vm/gc/shared/space.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/shared/space.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -362,6 +362,12 @@
 
   inline size_t obj_size(const HeapWord* addr) const;
 
+  template <class SpaceType>
+  static inline void verify_up_to_first_dead(SpaceType* space) NOT_DEBUG_RETURN;
+
+  template <class SpaceType>
+  static inline void clear_empty_region(SpaceType* space);
+
 public:
   CompactibleSpace() :
    _compaction_top(NULL), _next_compaction_space(NULL) {}
@@ -455,16 +461,6 @@
     return end();
   }
 
-  // Requires "allowed_deadspace_words > 0", that "q" is the start of a
-  // free block of the given "word_len", and that "q", were it an object,
-  // would not move if forwarded.  If the size allows, fill the free
-  // block with an object, to prevent excessive compaction.  Returns "true"
-  // iff the free region was made deadspace, and modifies
-  // "allowed_deadspace_words" to reflect the number of available deadspace
-  // words remaining after this operation.
-  bool insert_deadspace(size_t& allowed_deadspace_words, HeapWord* q,
-                        size_t word_len);
-
   // Below are template functions for scan_and_* algorithms (avoiding virtual calls).
   // The space argument should be a subclass of CompactibleSpace, implementing
   // scan_limit(), scanned_block_is_obj(), and scanned_block_size(),
--- a/src/share/vm/gc/shared/space.inline.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/gc/shared/space.inline.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -31,6 +31,7 @@
 #include "gc/shared/space.hpp"
 #include "gc/shared/spaceDecorator.hpp"
 #include "memory/universe.hpp"
+#include "oops/oopsHierarchy.hpp"
 #include "runtime/prefetch.inline.hpp"
 #include "runtime/safepoint.hpp"
 
@@ -75,11 +76,61 @@
   return oop(addr)->size();
 }
 
+class DeadSpacer : StackObj {
+  size_t _allowed_deadspace_words;
+  bool _active;
+  CompactibleSpace* _space;
+
+public:
+  DeadSpacer(CompactibleSpace* space) : _space(space), _allowed_deadspace_words(0) {
+    size_t ratio = _space->allowed_dead_ratio();
+    _active = ratio > 0;
+
+    if (_active) {
+      assert(!UseG1GC, "G1 should not be using dead space");
+
+      // We allow some amount of garbage towards the bottom of the space, so
+      // we don't start compacting before there is a significant gain to be made.
+      // Occasionally, we want to ensure a full compaction, which is determined
+      // by the MarkSweepAlwaysCompactCount parameter.
+      if ((MarkSweep::total_invocations() % MarkSweepAlwaysCompactCount) != 0) {
+        _allowed_deadspace_words = (space->capacity() * ratio / 100) / HeapWordSize;
+      } else {
+        _active = false;
+      }
+    }
+  }
+
+
+  bool insert_deadspace(HeapWord* dead_start, HeapWord* dead_end) {
+    if (!_active) {
+      return false;
+    }
+
+    size_t dead_length = pointer_delta(dead_end, dead_start);
+    if (_allowed_deadspace_words >= dead_length) {
+      _allowed_deadspace_words -= dead_length;
+      CollectedHeap::fill_with_object(dead_start, dead_length);
+      oop obj = oop(dead_start);
+      obj->set_mark(obj->mark()->set_marked());
+
+      assert(dead_length == (size_t)obj->size(), "bad filler object size");
+      log_develop_trace(gc, compaction)("Inserting object to dead space: " PTR_FORMAT ", " PTR_FORMAT ", " SIZE_FORMAT "b",
+          p2i(dead_start), p2i(dead_end), dead_length * HeapWordSize);
+
+      return true;
+    } else {
+      _active = false;
+      return false;
+    }
+  }
+
+};
+
 template <class SpaceType>
 inline void CompactibleSpace::scan_and_forward(SpaceType* space, CompactPoint* cp) {
   // Compute the new addresses for the live objects and store it in the mark
   // Used by universe::mark_sweep_phase2()
-  HeapWord* compact_top; // This is where we are currently compacting to.
 
   // We're sure to be here before any objects are compacted into this
   // space, so this is a good time to initialize this:
@@ -90,89 +141,73 @@
     assert(cp->threshold == NULL, "just checking");
     assert(cp->gen->first_compaction_space() == space, "just checking");
     cp->space = cp->gen->first_compaction_space();
-    compact_top = cp->space->bottom();
-    cp->space->set_compaction_top(compact_top);
     cp->threshold = cp->space->initialize_threshold();
-  } else {
-    compact_top = cp->space->compaction_top();
+    cp->space->set_compaction_top(cp->space->bottom());
   }
 
-  // We allow some amount of garbage towards the bottom of the space, so
-  // we don't start compacting before there is a significant gain to be made.
-  // Occasionally, we want to ensure a full compaction, which is determined
-  // by the MarkSweepAlwaysCompactCount parameter.
-  uint invocations = MarkSweep::total_invocations();
-  bool skip_dead = ((invocations % MarkSweepAlwaysCompactCount) != 0);
+  HeapWord* compact_top = cp->space->compaction_top(); // This is where we are currently compacting to.
 
-  size_t allowed_deadspace = 0;
-  if (skip_dead) {
-    const size_t ratio = space->allowed_dead_ratio();
-    allowed_deadspace = (space->capacity() * ratio / 100) / HeapWordSize;
-  }
+  DeadSpacer dead_spacer(space);
 
-  HeapWord* q = space->bottom();
-  HeapWord* t = space->scan_limit();
-
-  HeapWord*  end_of_live= q;            // One byte beyond the last byte of the last
-                                        // live object.
-  HeapWord*  first_dead = space->end(); // The first dead object.
+  HeapWord*  end_of_live = space->bottom();  // One byte beyond the last byte of the last live object.
+  HeapWord*  first_dead = NULL; // The first dead object.
 
   const intx interval = PrefetchScanIntervalInBytes;
 
-  while (q < t) {
-    assert(!space->scanned_block_is_obj(q) ||
-           oop(q)->mark()->is_marked() || oop(q)->mark()->is_unlocked() ||
-           oop(q)->mark()->has_bias_pattern(),
+  HeapWord* cur_obj = space->bottom();
+  HeapWord* scan_limit = space->scan_limit();
+
+  while (cur_obj < scan_limit) {
+    assert(!space->scanned_block_is_obj(cur_obj) ||
+           oop(cur_obj)->mark()->is_marked() || oop(cur_obj)->mark()->is_unlocked() ||
+           oop(cur_obj)->mark()->has_bias_pattern(),
            "these are the only valid states during a mark sweep");
-    if (space->scanned_block_is_obj(q) && oop(q)->is_gc_marked()) {
-      // prefetch beyond q
-      Prefetch::write(q, interval);
-      size_t size = space->scanned_block_size(q);
-      compact_top = cp->space->forward(oop(q), size, cp, compact_top);
-      q += size;
-      end_of_live = q;
+    if (space->scanned_block_is_obj(cur_obj) && oop(cur_obj)->is_gc_marked()) {
+      // prefetch beyond cur_obj
+      Prefetch::write(cur_obj, interval);
+      size_t size = space->scanned_block_size(cur_obj);
+      compact_top = cp->space->forward(oop(cur_obj), size, cp, compact_top);
+      cur_obj += size;
+      end_of_live = cur_obj;
     } else {
       // run over all the contiguous dead objects
-      HeapWord* end = q;
+      HeapWord* end = cur_obj;
       do {
         // prefetch beyond end
         Prefetch::write(end, interval);
         end += space->scanned_block_size(end);
-      } while (end < t && (!space->scanned_block_is_obj(end) || !oop(end)->is_gc_marked()));
+      } while (end < scan_limit && (!space->scanned_block_is_obj(end) || !oop(end)->is_gc_marked()));
 
       // see if we might want to pretend this object is alive so that
       // we don't have to compact quite as often.
-      if (allowed_deadspace > 0 && q == compact_top) {
-        size_t sz = pointer_delta(end, q);
-        if (space->insert_deadspace(allowed_deadspace, q, sz)) {
-          compact_top = cp->space->forward(oop(q), sz, cp, compact_top);
-          q = end;
-          end_of_live = end;
-          continue;
+      if (cur_obj == compact_top && dead_spacer.insert_deadspace(cur_obj, end)) {
+        oop obj = oop(cur_obj);
+        compact_top = cp->space->forward(obj, obj->size(), cp, compact_top);
+        end_of_live = end;
+      } else {
+        // otherwise, it really is a free region.
+
+        // cur_obj is a pointer to a dead object. Use this dead memory to store a pointer to the next live object.
+        *(HeapWord**)cur_obj = end;
+
+        // see if this is the first dead region.
+        if (first_dead == NULL) {
+          first_dead = cur_obj;
         }
       }
 
-      // otherwise, it really is a free region.
-
-      // q is a pointer to a dead object. Use this dead memory to store a pointer to the next live object.
-      (*(HeapWord**)q) = end;
-
-      // see if this is the first dead region.
-      if (q < first_dead) {
-        first_dead = q;
-      }
-
       // move on to the next object
-      q = end;
+      cur_obj = end;
     }
   }
 
-  assert(q == t, "just checking");
+  assert(cur_obj == scan_limit, "just checking");
   space->_end_of_live = end_of_live;
-  if (end_of_live < first_dead) {
-    first_dead = end_of_live;
+  if (first_dead != NULL) {
+    space->_first_dead = first_dead;
+  } else {
+    space->_first_dead = end_of_live;
   }
-  space->_first_dead = first_dead;
 
   // save the compaction_top of the compaction space.
   cp->space->set_compaction_top(compact_top);
@@ -183,127 +218,58 @@
   // adjust all the interior pointers to point at the new locations of objects
   // Used by MarkSweep::mark_sweep_phase3()
 
-  HeapWord* q = space->bottom();
-  HeapWord* t = space->_end_of_live;  // Established by "prepare_for_compaction".
+  HeapWord* cur_obj = space->bottom();
+  HeapWord* const end_of_live = space->_end_of_live;  // Established by "scan_and_forward".
+  HeapWord* const first_dead = space->_first_dead;    // Established by "scan_and_forward".
 
-  assert(space->_first_dead <= space->_end_of_live, "Stands to reason, no?");
+  assert(first_dead <= end_of_live, "Stands to reason, no?");
 
-  if (q < t && space->_first_dead > q && !oop(q)->is_gc_marked()) {
-    // we have a chunk of the space which hasn't moved and we've
-    // reinitialized the mark word during the previous pass, so we can't
-    // use is_gc_marked for the traversal.
-    HeapWord* end = space->_first_dead;
+  const intx interval = PrefetchScanIntervalInBytes;
 
-    while (q < end) {
-      // I originally tried to conjoin "block_start(q) == q" to the
-      // assertion below, but that doesn't work, because you can't
-      // accurately traverse previous objects to get to the current one
-      // after their pointers have been
-      // updated, until the actual compaction is done.  dld, 4/00
-      assert(space->block_is_obj(q), "should be at block boundaries, and should be looking at objs");
-
+  debug_only(HeapWord* prev_obj = NULL);
+  while (cur_obj < end_of_live) {
+    Prefetch::write(cur_obj, interval);
+    if (cur_obj < first_dead || oop(cur_obj)->is_gc_marked()) {
+      // cur_obj is alive
       // point all the oops to the new location
-      size_t size = MarkSweep::adjust_pointers(oop(q));
+      size_t size = MarkSweep::adjust_pointers(oop(cur_obj));
       size = space->adjust_obj_size(size);
-
-      q += size;
-    }
-
-    if (space->_first_dead == t) {
-      q = t;
+      debug_only(prev_obj = cur_obj);
+      cur_obj += size;
     } else {
-      // The first dead object is no longer an object. At that memory address,
-      // there is a pointer to the first live object that the previous phase found.
-      q = *((HeapWord**)(space->_first_dead));
+      debug_only(prev_obj = cur_obj);
+      // cur_obj is not a live object, instead it points at the next live object
+      cur_obj = *(HeapWord**)cur_obj;
+      assert(cur_obj > prev_obj, "we should be moving forward through memory, cur_obj: " PTR_FORMAT ", prev_obj: " PTR_FORMAT, p2i(cur_obj), p2i(prev_obj));
     }
   }
 
-  const intx interval = PrefetchScanIntervalInBytes;
-
-  debug_only(HeapWord* prev_q = NULL);
-  while (q < t) {
-    // prefetch beyond q
-    Prefetch::write(q, interval);
-    if (oop(q)->is_gc_marked()) {
-      // q is alive
-      // point all the oops to the new location
-      size_t size = MarkSweep::adjust_pointers(oop(q));
-      size = space->adjust_obj_size(size);
-      debug_only(prev_q = q);
-      q += size;
-    } else {
-      debug_only(prev_q = q);
-      // q is not a live object, instead it points at the next live object
-      q = *(HeapWord**)q;
-      assert(q > prev_q, "we should be moving forward through memory, q: " PTR_FORMAT ", prev_q: " PTR_FORMAT, p2i(q), p2i(prev_q));
-    }
-  }
-
-  assert(q == t, "just checking");
+  assert(cur_obj == end_of_live, "just checking");
 }
 
+#ifdef ASSERT
 template <class SpaceType>
-inline void CompactibleSpace::scan_and_compact(SpaceType* space) {
-  // Copy all live objects to their new location
-  // Used by MarkSweep::mark_sweep_phase4()
+inline void CompactibleSpace::verify_up_to_first_dead(SpaceType* space) {
+  HeapWord* cur_obj = space->bottom();
 
-  HeapWord*       q = space->bottom();
-  HeapWord* const t = space->_end_of_live;
-  debug_only(HeapWord* prev_q = NULL);
+  if (cur_obj < space->_end_of_live && space->_first_dead > cur_obj && !oop(cur_obj)->is_gc_marked()) {
+     // we have a chunk of the space which hasn't moved and we've reinitialized
+     // the mark word during the previous pass, so we can't use is_gc_marked for
+     // the traversal.
+     HeapWord* prev_obj = NULL;
 
-  if (q < t && space->_first_dead > q && !oop(q)->is_gc_marked()) {
-    #ifdef ASSERT // Debug only
-      // we have a chunk of the space which hasn't moved and we've reinitialized
-      // the mark word during the previous pass, so we can't use is_gc_marked for
-      // the traversal.
-      HeapWord* const end = space->_first_dead;
+     while (cur_obj < space->_first_dead) {
+       size_t size = space->obj_size(cur_obj);
+       assert(!oop(cur_obj)->is_gc_marked(), "should be unmarked (special dense prefix handling)");
+       prev_obj = cur_obj;
+       cur_obj += size;
+     }
+  }
+}
+#endif
 
-      while (q < end) {
-        size_t size = space->obj_size(q);
-        assert(!oop(q)->is_gc_marked(), "should be unmarked (special dense prefix handling)");
-        prev_q = q;
-        q += size;
-      }
-    #endif
-
-    if (space->_first_dead == t) {
-      q = t;
-    } else {
-      // $$$ Funky
-      q = (HeapWord*) oop(space->_first_dead)->mark()->decode_pointer();
-    }
-  }
-
-  const intx scan_interval = PrefetchScanIntervalInBytes;
-  const intx copy_interval = PrefetchCopyIntervalInBytes;
-  while (q < t) {
-    if (!oop(q)->is_gc_marked()) {
-      // mark is pointer to next marked oop
-      debug_only(prev_q = q);
-      q = (HeapWord*) oop(q)->mark()->decode_pointer();
-      assert(q > prev_q, "we should be moving forward through memory");
-    } else {
-      // prefetch beyond q
-      Prefetch::read(q, scan_interval);
-
-      // size and destination
-      size_t size = space->obj_size(q);
-      HeapWord* compaction_top = (HeapWord*)oop(q)->forwardee();
-
-      // prefetch beyond compaction_top
-      Prefetch::write(compaction_top, copy_interval);
-
-      // copy object and reinit its mark
-      assert(q != compaction_top, "everything in this pass should be moving");
-      Copy::aligned_conjoint_words(q, compaction_top, size);
-      oop(compaction_top)->init_mark();
-      assert(oop(compaction_top)->klass() != NULL, "should have a class");
-
-      debug_only(prev_q = q);
-      q += size;
-    }
-  }
-
+template <class SpaceType>
+inline void CompactibleSpace::clear_empty_region(SpaceType* space) {
   // Let's remember if we were empty before we did the compaction.
   bool was_empty = space->used_region().is_empty();
   // Reset space after compaction is complete
@@ -320,6 +286,65 @@
   }
 }
 
+template <class SpaceType>
+inline void CompactibleSpace::scan_and_compact(SpaceType* space) {
+  // Copy all live objects to their new location
+  // Used by MarkSweep::mark_sweep_phase4()
+
+  verify_up_to_first_dead(space);
+
+  HeapWord* const end_of_live = space->_end_of_live;
+
+  assert(space->_first_dead <= end_of_live, "Invariant. _first_dead: " PTR_FORMAT " <= end_of_live: " PTR_FORMAT, p2i(space->_first_dead), p2i(end_of_live));
+  if (space->_first_dead == end_of_live && !oop(space->bottom())->is_gc_marked()) {
+    // Nothing to compact. The space is either empty or all live object should be left in place.
+    clear_empty_region(space);
+    return;
+  }
+
+  const intx scan_interval = PrefetchScanIntervalInBytes;
+  const intx copy_interval = PrefetchCopyIntervalInBytes;
+
+  assert(space->bottom() < end_of_live, "bottom: " PTR_FORMAT " should be < end_of_live: " PTR_FORMAT, p2i(space->bottom()), p2i(end_of_live));
+  HeapWord* cur_obj = space->bottom();
+  if (space->_first_dead > cur_obj && !oop(cur_obj)->is_gc_marked()) {
+    // All object before _first_dead can be skipped. They should not be moved.
+    // A pointer to the first live object is stored at the memory location for _first_dead.
+    cur_obj = *(HeapWord**)(space->_first_dead);
+  }
+
+  debug_only(HeapWord* prev_obj = NULL);
+  while (cur_obj < end_of_live) {
+    if (!oop(cur_obj)->is_gc_marked()) {
+      debug_only(prev_obj = cur_obj);
+      // The first word of the dead object contains a pointer to the next live object or end of space.
+      cur_obj = *(HeapWord**)cur_obj;
+      assert(cur_obj > prev_obj, "we should be moving forward through memory");
+    } else {
+      // prefetch beyond q
+      Prefetch::read(cur_obj, scan_interval);
+
+      // size and destination
+      size_t size = space->obj_size(cur_obj);
+      HeapWord* compaction_top = (HeapWord*)oop(cur_obj)->forwardee();
+
+      // prefetch beyond compaction_top
+      Prefetch::write(compaction_top, copy_interval);
+
+      // copy object and reinit its mark
+      assert(cur_obj != compaction_top, "everything in this pass should be moving");
+      Copy::aligned_conjoint_words(cur_obj, compaction_top, size);
+      oop(compaction_top)->init_mark();
+      assert(oop(compaction_top)->klass() != NULL, "should have a class");
+
+      debug_only(prev_obj = cur_obj);
+      cur_obj += size;
+    }
+  }
+
+  clear_empty_region(space);
+}
+
 size_t ContiguousSpace::scanned_block_size(const HeapWord* addr) const {
   return oop(addr)->size();
 }
--- a/src/share/vm/interpreter/abstractInterpreter.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/interpreter/abstractInterpreter.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "asm/macroAssembler.hpp"
 #include "asm/macroAssembler.inline.hpp"
+#include "compiler/disassembler.hpp"
 #include "interpreter/bytecodeHistogram.hpp"
 #include "interpreter/bytecodeInterpreter.hpp"
 #include "interpreter/interpreter.hpp"
@@ -32,6 +33,7 @@
 #include "interpreter/interp_masm.hpp"
 #include "interpreter/templateTable.hpp"
 #include "memory/allocation.inline.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "memory/resourceArea.hpp"
 #include "oops/arrayOop.hpp"
 #include "oops/methodData.hpp"
@@ -93,6 +95,7 @@
 address    AbstractInterpreter::_native_entry_end                           = NULL;
 address    AbstractInterpreter::_slow_signature_handler;
 address    AbstractInterpreter::_entry_table            [AbstractInterpreter::number_of_method_entries];
+address    AbstractInterpreter::_cds_entry_table        [AbstractInterpreter::number_of_method_entries];
 address    AbstractInterpreter::_native_abi_to_tosca    [AbstractInterpreter::number_of_result_handlers];
 
 //------------------------------------------------------------------------------------------------------------------------
@@ -204,15 +207,42 @@
   return zerolocals;
 }
 
+#if INCLUDE_CDS
+
+address AbstractInterpreter::get_trampoline_code_buffer(AbstractInterpreter::MethodKind kind) {
+  const size_t trampoline_size = SharedRuntime::trampoline_size();
+  address addr = MetaspaceShared::cds_i2i_entry_code_buffers((size_t)(AbstractInterpreter::number_of_method_entries) * trampoline_size);
+  addr += (size_t)(kind) * trampoline_size;
+
+  return addr;
+}
+
+void AbstractInterpreter::update_cds_entry_table(AbstractInterpreter::MethodKind kind) {
+  if (DumpSharedSpaces || UseSharedSpaces) {
+    address trampoline = get_trampoline_code_buffer(kind);
+    _cds_entry_table[kind] = trampoline;
+
+    CodeBuffer buffer(trampoline, (int)(SharedRuntime::trampoline_size()));
+    MacroAssembler _masm(&buffer);
+    SharedRuntime::generate_trampoline(&_masm, _entry_table[kind]);
+
+    if (PrintInterpreter) {
+      Disassembler::decode(buffer.insts_begin(), buffer.insts_end());
+    }
+  }
+}
+
+#endif
 
 void AbstractInterpreter::set_entry_for_kind(AbstractInterpreter::MethodKind kind, address entry) {
   assert(kind >= method_handle_invoke_FIRST &&
          kind <= method_handle_invoke_LAST, "late initialization only for MH entry points");
   assert(_entry_table[kind] == _entry_table[abstract], "previous value must be AME entry");
   _entry_table[kind] = entry;
+
+  update_cds_entry_table(kind);
 }
 
-
 // Return true if the interpreter can prove that the given bytecode has
 // not yet been executed (in Java semantics, not in actual operation).
 bool AbstractInterpreter::is_not_reached(const methodHandle& method, int bci) {
@@ -416,5 +446,6 @@
   for (int i = method_handle_invoke_FIRST; i <= method_handle_invoke_LAST; i++) {
     MethodKind kind = (MethodKind) i;
     _entry_table[kind] = _entry_table[Interpreter::abstract];
+    Interpreter::update_cds_entry_table(kind);
   }
 }
--- a/src/share/vm/interpreter/abstractInterpreter.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/interpreter/abstractInterpreter.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -28,9 +28,8 @@
 #include "asm/macroAssembler.hpp"
 #include "code/stubs.hpp"
 #include "interpreter/bytecodes.hpp"
-#include "runtime/thread.inline.hpp"
+#include "runtime/thread.hpp"
 #include "runtime/vmThread.hpp"
-#include "utilities/top.hpp"
 
 // This file contains the platform-independent parts
 // of the abstract interpreter and the abstract interpreter generator.
@@ -113,6 +112,7 @@
 
   // method entry points
   static address    _entry_table[number_of_method_entries];     // entry points for a given method
+  static address    _cds_entry_table[number_of_method_entries]; // entry points for methods in the CDS archive
   static address    _native_abi_to_tosca[number_of_result_handlers];  // for native method result handlers
   static address    _slow_signature_handler;                              // the native method generic (slow) signature handler
 
@@ -132,6 +132,17 @@
   static address    entry_for_kind(MethodKind k)                { assert(0 <= k && k < number_of_method_entries, "illegal kind"); return _entry_table[k]; }
   static address    entry_for_method(methodHandle m)            { return entry_for_kind(method_kind(m)); }
 
+  static address entry_for_cds_method(methodHandle m) {
+    MethodKind k = method_kind(m);
+    assert(0 <= k && k < number_of_method_entries, "illegal kind");
+    return _cds_entry_table[k];
+  }
+
+  // used by class data sharing
+  static void       update_cds_entry_table(MethodKind kind) NOT_CDS_RETURN;
+
+  static address    get_trampoline_code_buffer(AbstractInterpreter::MethodKind kind) NOT_CDS_RETURN_(0);
+
   // used for bootstrapping method handles:
   static void       set_entry_for_kind(MethodKind k, address e);
 
--- a/src/share/vm/interpreter/bytecodeInterpreter.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/interpreter/bytecodeInterpreter.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -632,9 +632,11 @@
       if (_compiling) {
         MethodCounters* mcs;
         GET_METHOD_COUNTERS(mcs);
+#if COMPILER2_OR_JVMCI
         if (ProfileInterpreter) {
           METHOD->increment_interpreter_invocation_count(THREAD);
         }
+#endif
         mcs->invocation_counter()->increment();
         if (mcs->invocation_counter()->reached_InvocationLimit(mcs->backedge_counter())) {
           CALL_VM((void)InterpreterRuntime::frequency_counter_overflow(THREAD, NULL), handle_exception);
--- a/src/share/vm/interpreter/bytecodeTracer.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/interpreter/bytecodeTracer.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -26,6 +26,7 @@
 #define SHARE_VM_INTERPRETER_BYTECODETRACER_HPP
 
 #include "memory/allocation.hpp"
+#include "utilities/ostream.hpp"
 
 // The BytecodeTracer is a helper class used by the interpreter for run-time
 // bytecode tracing. If bytecode tracing is turned on, trace() will be called
--- a/src/share/vm/interpreter/bytecodes.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/interpreter/bytecodes.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -26,11 +26,12 @@
 #define SHARE_VM_INTERPRETER_BYTECODES_HPP
 
 #include "memory/allocation.hpp"
-#include "utilities/top.hpp"
 
 // Bytecodes specifies all bytecodes used in the VM and
 // provides utility functions to get bytecode attributes.
 
+class Method;
+
 // NOTE: replicated in SA in vm/agent/sun/jvm/hotspot/interpreter/Bytecodes.java
 class Bytecodes: AllStatic {
  public:
--- a/src/share/vm/interpreter/interpreterRuntime.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/interpreter/interpreterRuntime.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -510,8 +510,10 @@
 #ifndef CC_INTERP
     continuation = Interpreter::remove_activation_entry();
 #endif
+#if COMPILER2_OR_JVMCI
     // Count this for compilation purposes
     h_method->interpreter_throwout_increment(THREAD);
+#endif
   } else {
     // handler in this method => change bci/bcp to handler bci/bcp and continue there
     handler_pc = h_method->code_base() + handler_bci;
--- a/src/share/vm/interpreter/interpreterRuntime.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/interpreter/interpreterRuntime.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -31,8 +31,7 @@
 #include "oops/method.hpp"
 #include "runtime/frame.inline.hpp"
 #include "runtime/signature.hpp"
-#include "runtime/thread.inline.hpp"
-#include "utilities/top.hpp"
+#include "runtime/thread.hpp"
 
 // The InterpreterRuntime is called by the interpreter for everything
 // that cannot/should not be dealt with in assembly and needs C support.
--- a/src/share/vm/interpreter/linkResolver.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/interpreter/linkResolver.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -26,7 +26,6 @@
 #define SHARE_VM_INTERPRETER_LINKRESOLVER_HPP
 
 #include "oops/method.hpp"
-#include "utilities/top.hpp"
 
 // All the necessary definitions for run-time link resolution.
 
--- a/src/share/vm/interpreter/templateInterpreterGenerator.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/interpreter/templateInterpreterGenerator.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -212,6 +212,7 @@
 #define method_entry(kind)                                              \
       { CodeletMark cm(_masm, "method entry point (kind = " #kind ")"); \
         Interpreter::_entry_table[Interpreter::kind] = generate_method_entry(Interpreter::kind); \
+        Interpreter::update_cds_entry_table(Interpreter::kind); \
       }
 
       // all non-native method kinds
--- a/src/share/vm/jvmci/jvmciCompilerToVM.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/jvmci/jvmciCompilerToVM.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1434,65 +1434,65 @@
 #define METASPACE_METHOD_DATA "J"
 
 JNINativeMethod CompilerToVM::methods[] = {
-  {CC"getBytecode",                                  CC"("HS_RESOLVED_METHOD")[B",                                                     FN_PTR(getBytecode)},
-  {CC"getExceptionTableStart",                       CC"("HS_RESOLVED_METHOD")J",                                                      FN_PTR(getExceptionTableStart)},
-  {CC"getExceptionTableLength",                      CC"("HS_RESOLVED_METHOD")I",                                                      FN_PTR(getExceptionTableLength)},
-  {CC"findUniqueConcreteMethod",                     CC"("HS_RESOLVED_KLASS HS_RESOLVED_METHOD")"HS_RESOLVED_METHOD,                   FN_PTR(findUniqueConcreteMethod)},
-  {CC"getImplementor",                               CC"("HS_RESOLVED_KLASS")"HS_RESOLVED_KLASS,                                       FN_PTR(getImplementor)},
-  {CC"getStackTraceElement",                         CC"("HS_RESOLVED_METHOD"I)"STACK_TRACE_ELEMENT,                                   FN_PTR(getStackTraceElement)},
-  {CC"methodIsIgnoredBySecurityStackWalk",           CC"("HS_RESOLVED_METHOD")Z",                                                      FN_PTR(methodIsIgnoredBySecurityStackWalk)},
-  {CC"doNotInlineOrCompile",                         CC"("HS_RESOLVED_METHOD")V",                                                      FN_PTR(doNotInlineOrCompile)},
-  {CC"canInlineMethod",                              CC"("HS_RESOLVED_METHOD")Z",                                                      FN_PTR(canInlineMethod)},
-  {CC"shouldInlineMethod",                           CC"("HS_RESOLVED_METHOD")Z",                                                      FN_PTR(shouldInlineMethod)},
-  {CC"lookupType",                                   CC"("STRING CLASS"Z)"HS_RESOLVED_KLASS,                                           FN_PTR(lookupType)},
-  {CC"lookupNameInPool",                             CC"("HS_CONSTANT_POOL"I)"STRING,                                                  FN_PTR(lookupNameInPool)},
-  {CC"lookupNameAndTypeRefIndexInPool",              CC"("HS_CONSTANT_POOL"I)I",                                                       FN_PTR(lookupNameAndTypeRefIndexInPool)},
-  {CC"lookupSignatureInPool",                        CC"("HS_CONSTANT_POOL"I)"STRING,                                                  FN_PTR(lookupSignatureInPool)},
-  {CC"lookupKlassRefIndexInPool",                    CC"("HS_CONSTANT_POOL"I)I",                                                       FN_PTR(lookupKlassRefIndexInPool)},
-  {CC"lookupKlassInPool",                            CC"("HS_CONSTANT_POOL"I)Ljava/lang/Object;",                                      FN_PTR(lookupKlassInPool)},
-  {CC"lookupAppendixInPool",                         CC"("HS_CONSTANT_POOL"I)"OBJECT,                                                  FN_PTR(lookupAppendixInPool)},
-  {CC"lookupMethodInPool",                           CC"("HS_CONSTANT_POOL"IB)"HS_RESOLVED_METHOD,                                     FN_PTR(lookupMethodInPool)},
-  {CC"constantPoolRemapInstructionOperandFromCache", CC"("HS_CONSTANT_POOL"I)I",                                                       FN_PTR(constantPoolRemapInstructionOperandFromCache)},
-  {CC"resolveConstantInPool",                        CC"("HS_CONSTANT_POOL"I)"OBJECT,                                                  FN_PTR(resolveConstantInPool)},
-  {CC"resolvePossiblyCachedConstantInPool",          CC"("HS_CONSTANT_POOL"I)"OBJECT,                                                  FN_PTR(resolvePossiblyCachedConstantInPool)},
-  {CC"resolveTypeInPool",                            CC"("HS_CONSTANT_POOL"I)"HS_RESOLVED_KLASS,                                       FN_PTR(resolveTypeInPool)},
-  {CC"resolveFieldInPool",                           CC"("HS_CONSTANT_POOL"IB[J)"HS_RESOLVED_KLASS,                                    FN_PTR(resolveFieldInPool)},
-  {CC"resolveInvokeDynamicInPool",                   CC"("HS_CONSTANT_POOL"I)V",                                                       FN_PTR(resolveInvokeDynamicInPool)},
-  {CC"resolveInvokeHandleInPool",                    CC"("HS_CONSTANT_POOL"I)V",                                                       FN_PTR(resolveInvokeHandleInPool)},
-  {CC"resolveMethod",                                CC"("HS_RESOLVED_KLASS HS_RESOLVED_METHOD HS_RESOLVED_KLASS")"HS_RESOLVED_METHOD, FN_PTR(resolveMethod)},
-  {CC"getVtableIndexForInterfaceMethod",             CC"("HS_RESOLVED_KLASS HS_RESOLVED_METHOD")I",                                    FN_PTR(getVtableIndexForInterfaceMethod)},
-  {CC"getClassInitializer",                          CC"("HS_RESOLVED_KLASS")"HS_RESOLVED_METHOD,                                      FN_PTR(getClassInitializer)},
-  {CC"hasFinalizableSubclass",                       CC"("HS_RESOLVED_KLASS")Z",                                                       FN_PTR(hasFinalizableSubclass)},
-  {CC"getMaxCallTargetOffset",                       CC"(J)J",                                                                         FN_PTR(getMaxCallTargetOffset)},
-  {CC"getResolvedJavaMethodAtSlot",                  CC"("CLASS"I)"HS_RESOLVED_METHOD,                                                 FN_PTR(getResolvedJavaMethodAtSlot)},
-  {CC"getResolvedJavaMethod",                        CC"(Ljava/lang/Object;J)"HS_RESOLVED_METHOD,                                      FN_PTR(getResolvedJavaMethod)},
-  {CC"getConstantPool",                              CC"(Ljava/lang/Object;J)"HS_CONSTANT_POOL,                                        FN_PTR(getConstantPool)},
-  {CC"getResolvedJavaType",                          CC"(Ljava/lang/Object;JZ)"HS_RESOLVED_KLASS,                                      FN_PTR(getResolvedJavaType)},
-  {CC"initializeConfiguration",                      CC"("HS_CONFIG")J",                                                               FN_PTR(initializeConfiguration)},
-  {CC"installCode",                                  CC"("TARGET_DESCRIPTION HS_COMPILED_CODE INSTALLED_CODE HS_SPECULATION_LOG")I",   FN_PTR(installCode)},
-  {CC"getMetadata",                                  CC"("TARGET_DESCRIPTION HS_COMPILED_CODE HS_METADATA")I",                         FN_PTR(getMetadata)},
-  {CC"resetCompilationStatistics",                   CC"()V",                                                                          FN_PTR(resetCompilationStatistics)},
-  {CC"disassembleCodeBlob",                          CC"("INSTALLED_CODE")"STRING,                                                     FN_PTR(disassembleCodeBlob)},
-  {CC"executeInstalledCode",                         CC"(["OBJECT INSTALLED_CODE")"OBJECT,                                             FN_PTR(executeInstalledCode)},
-  {CC"getLineNumberTable",                           CC"("HS_RESOLVED_METHOD")[J",                                                     FN_PTR(getLineNumberTable)},
-  {CC"getLocalVariableTableStart",                   CC"("HS_RESOLVED_METHOD")J",                                                      FN_PTR(getLocalVariableTableStart)},
-  {CC"getLocalVariableTableLength",                  CC"("HS_RESOLVED_METHOD")I",                                                      FN_PTR(getLocalVariableTableLength)},
-  {CC"reprofile",                                    CC"("HS_RESOLVED_METHOD")V",                                                      FN_PTR(reprofile)},
-  {CC"invalidateInstalledCode",                      CC"("INSTALLED_CODE")V",                                                          FN_PTR(invalidateInstalledCode)},
-  {CC"readUncompressedOop",                          CC"(J)"OBJECT,                                                                    FN_PTR(readUncompressedOop)},
-  {CC"collectCounters",                              CC"()[J",                                                                         FN_PTR(collectCounters)},
-  {CC"allocateCompileId",                            CC"("HS_RESOLVED_METHOD"I)I",                                                     FN_PTR(allocateCompileId)},
-  {CC"isMature",                                     CC"("METASPACE_METHOD_DATA")Z",                                                   FN_PTR(isMature)},
-  {CC"hasCompiledCodeForOSR",                        CC"("HS_RESOLVED_METHOD"II)Z",                                                    FN_PTR(hasCompiledCodeForOSR)},
-  {CC"getSymbol",                                    CC"(J)"STRING,                                                                    FN_PTR(getSymbol)},
-  {CC"lookupSymbol",                                 CC"("STRING")J",                                                                  FN_PTR(lookupSymbol)},
-  {CC"getNextStackFrame",                            CC"("HS_STACK_FRAME_REF "["RESOLVED_METHOD"I)"HS_STACK_FRAME_REF,                 FN_PTR(getNextStackFrame)},
-  {CC"materializeVirtualObjects",                    CC"("HS_STACK_FRAME_REF"Z)V",                                                     FN_PTR(materializeVirtualObjects)},
-  {CC"shouldDebugNonSafepoints",                     CC"()Z",                                                                          FN_PTR(shouldDebugNonSafepoints)},
-  {CC"writeDebugOutput",                             CC"([BII)V",                                                                      FN_PTR(writeDebugOutput)},
-  {CC"flushDebugOutput",                             CC"()V",                                                                          FN_PTR(flushDebugOutput)},
-  {CC"methodDataProfileDataSize",                    CC"(JI)I",                                                                        FN_PTR(methodDataProfileDataSize)},
-  {CC"interpreterFrameSize",                         CC"("BYTECODE_FRAME")I",                                                          FN_PTR(interpreterFrameSize)},
+  {CC "getBytecode",                                  CC "(" HS_RESOLVED_METHOD ")[B",                                                      FN_PTR(getBytecode)},
+  {CC "getExceptionTableStart",                       CC "(" HS_RESOLVED_METHOD ")J",                                                       FN_PTR(getExceptionTableStart)},
+  {CC "getExceptionTableLength",                      CC "(" HS_RESOLVED_METHOD ")I",                                                       FN_PTR(getExceptionTableLength)},
+  {CC "findUniqueConcreteMethod",                     CC "(" HS_RESOLVED_KLASS HS_RESOLVED_METHOD ")" HS_RESOLVED_METHOD,                   FN_PTR(findUniqueConcreteMethod)},
+  {CC "getImplementor",                               CC "(" HS_RESOLVED_KLASS ")" HS_RESOLVED_KLASS,                                       FN_PTR(getImplementor)},
+  {CC "getStackTraceElement",                         CC "(" HS_RESOLVED_METHOD "I)" STACK_TRACE_ELEMENT,                                   FN_PTR(getStackTraceElement)},
+  {CC "methodIsIgnoredBySecurityStackWalk",           CC "(" HS_RESOLVED_METHOD ")Z",                                                       FN_PTR(methodIsIgnoredBySecurityStackWalk)},
+  {CC "doNotInlineOrCompile",                         CC "(" HS_RESOLVED_METHOD ")V",                                                       FN_PTR(doNotInlineOrCompile)},
+  {CC "canInlineMethod",                              CC "(" HS_RESOLVED_METHOD ")Z",                                                       FN_PTR(canInlineMethod)},
+  {CC "shouldInlineMethod",                           CC "(" HS_RESOLVED_METHOD ")Z",                                                       FN_PTR(shouldInlineMethod)},
+  {CC "lookupType",                                   CC "(" STRING CLASS "Z)" HS_RESOLVED_KLASS,                                           FN_PTR(lookupType)},
+  {CC "lookupNameInPool",                             CC "(" HS_CONSTANT_POOL "I)" STRING,                                                  FN_PTR(lookupNameInPool)},
+  {CC "lookupNameAndTypeRefIndexInPool",              CC "(" HS_CONSTANT_POOL "I)I",                                                        FN_PTR(lookupNameAndTypeRefIndexInPool)},
+  {CC "lookupSignatureInPool",                        CC "(" HS_CONSTANT_POOL "I)" STRING,                                                  FN_PTR(lookupSignatureInPool)},
+  {CC "lookupKlassRefIndexInPool",                    CC "(" HS_CONSTANT_POOL "I)I",                                                        FN_PTR(lookupKlassRefIndexInPool)},
+  {CC "lookupKlassInPool",                            CC "(" HS_CONSTANT_POOL "I)Ljava/lang/Object;",                                       FN_PTR(lookupKlassInPool)},
+  {CC "lookupAppendixInPool",                         CC "(" HS_CONSTANT_POOL "I)" OBJECT,                                                  FN_PTR(lookupAppendixInPool)},
+  {CC "lookupMethodInPool",                           CC "(" HS_CONSTANT_POOL "IB)" HS_RESOLVED_METHOD,                                     FN_PTR(lookupMethodInPool)},
+  {CC "constantPoolRemapInstructionOperandFromCache", CC "(" HS_CONSTANT_POOL "I)I",                                                        FN_PTR(constantPoolRemapInstructionOperandFromCache)},
+  {CC "resolveConstantInPool",                        CC "(" HS_CONSTANT_POOL "I)" OBJECT,                                                  FN_PTR(resolveConstantInPool)},
+  {CC "resolvePossiblyCachedConstantInPool",          CC "(" HS_CONSTANT_POOL "I)" OBJECT,                                                  FN_PTR(resolvePossiblyCachedConstantInPool)},
+  {CC "resolveTypeInPool",                            CC "(" HS_CONSTANT_POOL "I)" HS_RESOLVED_KLASS,                                       FN_PTR(resolveTypeInPool)},
+  {CC "resolveFieldInPool",                           CC "(" HS_CONSTANT_POOL "IB[J)" HS_RESOLVED_KLASS,                                    FN_PTR(resolveFieldInPool)},
+  {CC "resolveInvokeDynamicInPool",                   CC "(" HS_CONSTANT_POOL "I)V",                                                        FN_PTR(resolveInvokeDynamicInPool)},
+  {CC "resolveInvokeHandleInPool",                    CC "(" HS_CONSTANT_POOL "I)V",                                                        FN_PTR(resolveInvokeHandleInPool)},
+  {CC "resolveMethod",                                CC "(" HS_RESOLVED_KLASS HS_RESOLVED_METHOD HS_RESOLVED_KLASS ")" HS_RESOLVED_METHOD, FN_PTR(resolveMethod)},
+  {CC "getVtableIndexForInterfaceMethod",             CC "(" HS_RESOLVED_KLASS HS_RESOLVED_METHOD ")I",                                     FN_PTR(getVtableIndexForInterfaceMethod)},
+  {CC "getClassInitializer",                          CC "(" HS_RESOLVED_KLASS ")" HS_RESOLVED_METHOD,                                      FN_PTR(getClassInitializer)},
+  {CC "hasFinalizableSubclass",                       CC "(" HS_RESOLVED_KLASS ")Z",                                                        FN_PTR(hasFinalizableSubclass)},
+  {CC "getMaxCallTargetOffset",                       CC "(J)J",                                                                            FN_PTR(getMaxCallTargetOffset)},
+  {CC "getResolvedJavaMethodAtSlot",                  CC "(" CLASS "I)" HS_RESOLVED_METHOD,                                                 FN_PTR(getResolvedJavaMethodAtSlot)},
+  {CC "getResolvedJavaMethod",                        CC "(Ljava/lang/Object;J)" HS_RESOLVED_METHOD,                                        FN_PTR(getResolvedJavaMethod)},
+  {CC "getConstantPool",                              CC "(Ljava/lang/Object;J)" HS_CONSTANT_POOL,                                          FN_PTR(getConstantPool)},
+  {CC "getResolvedJavaType",                          CC "(Ljava/lang/Object;JZ)" HS_RESOLVED_KLASS,                                        FN_PTR(getResolvedJavaType)},
+  {CC "initializeConfiguration",                      CC "(" HS_CONFIG ")J",                                                                FN_PTR(initializeConfiguration)},
+  {CC "installCode",                                  CC "(" TARGET_DESCRIPTION HS_COMPILED_CODE INSTALLED_CODE HS_SPECULATION_LOG ")I",    FN_PTR(installCode)},
+  {CC "getMetadata",                                  CC "(" TARGET_DESCRIPTION HS_COMPILED_CODE HS_METADATA ")I",                          FN_PTR(getMetadata)},
+  {CC "resetCompilationStatistics",                   CC "()V",                                                                             FN_PTR(resetCompilationStatistics)},
+  {CC "disassembleCodeBlob",                          CC "(" INSTALLED_CODE ")" STRING,                                                     FN_PTR(disassembleCodeBlob)},
+  {CC "executeInstalledCode",                         CC "([" OBJECT INSTALLED_CODE ")" OBJECT,                                             FN_PTR(executeInstalledCode)},
+  {CC "getLineNumberTable",                           CC "(" HS_RESOLVED_METHOD ")[J",                                                      FN_PTR(getLineNumberTable)},
+  {CC "getLocalVariableTableStart",                   CC "(" HS_RESOLVED_METHOD ")J",                                                       FN_PTR(getLocalVariableTableStart)},
+  {CC "getLocalVariableTableLength",                  CC "(" HS_RESOLVED_METHOD ")I",                                                       FN_PTR(getLocalVariableTableLength)},
+  {CC "reprofile",                                    CC "(" HS_RESOLVED_METHOD ")V",                                                       FN_PTR(reprofile)},
+  {CC "invalidateInstalledCode",                      CC "(" INSTALLED_CODE ")V",                                                           FN_PTR(invalidateInstalledCode)},
+  {CC "readUncompressedOop",                          CC "(J)" OBJECT,                                                                      FN_PTR(readUncompressedOop)},
+  {CC "collectCounters",                              CC "()[J",                                                                            FN_PTR(collectCounters)},
+  {CC "allocateCompileId",                            CC "(" HS_RESOLVED_METHOD "I)I",                                                      FN_PTR(allocateCompileId)},
+  {CC "isMature",                                     CC "(" METASPACE_METHOD_DATA ")Z",                                                    FN_PTR(isMature)},
+  {CC "hasCompiledCodeForOSR",                        CC "(" HS_RESOLVED_METHOD "II)Z",                                                     FN_PTR(hasCompiledCodeForOSR)},
+  {CC "getSymbol",                                    CC "(J)" STRING,                                                                      FN_PTR(getSymbol)},
+  {CC "lookupSymbol",                                 CC "(" STRING ")J",                                                                   FN_PTR(lookupSymbol)},
+  {CC "getNextStackFrame",                            CC "(" HS_STACK_FRAME_REF "[" RESOLVED_METHOD "I)" HS_STACK_FRAME_REF,                FN_PTR(getNextStackFrame)},
+  {CC "materializeVirtualObjects",                    CC "(" HS_STACK_FRAME_REF "Z)V",                                                      FN_PTR(materializeVirtualObjects)},
+  {CC "shouldDebugNonSafepoints",                     CC "()Z",                                                                             FN_PTR(shouldDebugNonSafepoints)},
+  {CC "writeDebugOutput",                             CC "([BII)V",                                                                         FN_PTR(writeDebugOutput)},
+  {CC "flushDebugOutput",                             CC "()V",                                                                             FN_PTR(flushDebugOutput)},
+  {CC "methodDataProfileDataSize",                    CC "(JI)I",                                                                           FN_PTR(methodDataProfileDataSize)},
+  {CC "interpreterFrameSize",                         CC "(" BYTECODE_FRAME ")I",                                                           FN_PTR(interpreterFrameSize)},
 };
 
 int CompilerToVM::methods_count() {
--- a/src/share/vm/jvmci/jvmciEnv.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/jvmci/jvmciEnv.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -437,7 +437,7 @@
       stringStream st(buffer, O_BUFLEN);
       deps.print_dependency(witness, true, &st);
       *failure_detail = st.as_string();
-      if (env == NULL || counter_changed) {
+      if (env == NULL || counter_changed || deps.type() == Dependencies::evol_method) {
         return JVMCIEnv::dependencies_failed;
       } else {
         // The dependencies were invalid at the time of installation
--- a/src/share/vm/logging/log.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/log.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -31,7 +31,10 @@
 #include "gc/shared/gcTraceTime.inline.hpp"
 #include "logging/log.hpp"
 #include "logging/logConfiguration.hpp"
+#include "logging/logFileOutput.hpp"
 #include "logging/logOutput.hpp"
+#include "logging/logTagLevelExpression.hpp"
+#include "logging/logTagSet.hpp"
 #include "logging/logStream.inline.hpp"
 #include "memory/resourceArea.hpp"
 
@@ -44,6 +47,13 @@
 #define assert_char_not_in(c, s) \
   assert(strchr(s, c) == NULL, "Expected '%s' to *not* contain character '%c'", s, c)
 
+void Test_log_tag_combinations_limit() {
+  assert(LogTagLevelExpression::MaxCombinations > LogTagSet::ntagsets(),
+      "Combination limit (" SIZE_FORMAT ") not sufficient "
+      "for configuring all available tag sets (" SIZE_FORMAT ")",
+      LogTagLevelExpression::MaxCombinations, LogTagSet::ntagsets());
+}
+
 class TestLogFile {
  private:
   char file_name[256];
@@ -129,6 +139,131 @@
   assert_str_eq("all=off", stdoutput->config_string());
 }
 
+static size_t number_of_lines_with_substring_in_file(const char* filename,
+                                                     const char* substr) {
+  ResourceMark rm;
+  size_t ret = 0;
+  FILE* fp = fopen(filename, "r");
+  assert(fp != NULL, "error opening file %s: %s", filename, strerror(errno));
+
+  int buflen = 512;
+  char* buf = NEW_RESOURCE_ARRAY(char, buflen);
+  long pos = 0;
+
+  while (fgets(buf, buflen, fp) != NULL) {
+    if (buf[strlen(buf) - 1] != '\n' && !feof(fp)) {
+      // retry with a larger buffer
+      buf = REALLOC_RESOURCE_ARRAY(char, buf, buflen, buflen * 2);
+      buflen *= 2;
+      // rewind to beginning of line
+      fseek(fp, pos, SEEK_SET);
+      continue;
+    }
+    pos = ftell(fp);
+    if (strstr(buf, substr) != NULL) {
+      ret++;
+    }
+  }
+
+  fclose(fp);
+  return ret;
+}
+
+static bool file_exists(const char* filename) {
+  struct stat st;
+  return os::stat(filename, &st) == 0;
+}
+
+static void delete_file(const char* filename) {
+  if (!file_exists(filename)) {
+    return;
+  }
+  int ret = remove(filename);
+  assert(ret == 0, "failed to remove file '%s': %s", filename, strerror(errno));
+}
+
+static void create_directory(const char* name) {
+  assert(!file_exists(name), "can't create directory: %s already exists", name);
+  bool failed;
+#ifdef _WINDOWS
+  failed = !CreateDirectory(name, NULL);
+#else
+  failed = mkdir(name, 0777);
+#endif
+  assert(!failed, "failed to create directory %s", name);
+}
+
+static const char* ExpectedLine = "a (hopefully) unique log line for testing";
+
+static void init_file(const char* filename, const char* options = "") {
+  LogConfiguration::parse_log_arguments(filename, "logging=trace", "", options,
+                                        Log(logging)::error_stream());
+  log_debug(logging)("%s", ExpectedLine);
+  LogConfiguration::parse_log_arguments(filename, "all=off", "", "",
+                                        Log(logging)::error_stream());
+}
+
+void Test_log_file_startup_rotation() {
+  ResourceMark rm;
+  const size_t rotations = 5;
+  const char* filename = "start-rotate-test";
+  char* rotated_file[rotations];
+  for (size_t i = 0; i < rotations; i++) {
+    size_t len = strlen(filename) + 3;
+    rotated_file[i] = NEW_RESOURCE_ARRAY(char, len);
+    jio_snprintf(rotated_file[i], len, "%s." SIZE_FORMAT, filename, i);
+    delete_file(rotated_file[i]);
+  };
+
+  delete_file(filename);
+  init_file(filename);
+  assert(file_exists(filename),
+         "configured logging to file '%s' but file was not found", filename);
+
+  // Initialize the same file a bunch more times to trigger rotations
+  for (size_t i = 0; i < rotations; i++) {
+    init_file(filename);
+    assert(file_exists(rotated_file[i]), "existing file was not rotated");
+  }
+
+  // Remove a file and expect its slot to be re-used
+  delete_file(rotated_file[1]);
+  init_file(filename);
+  assert(file_exists(rotated_file[1]), "log file not properly rotated");
+
+  // Clean up after test
+  delete_file(filename);
+  for (size_t i = 0; i < rotations; i++) {
+    delete_file(rotated_file[i]);
+  }
+}
+
+void Test_log_file_startup_truncation() {
+  ResourceMark rm;
+  const char* filename = "start-truncate-test";
+  const char* archived_filename = "start-truncate-test.0";
+
+  delete_file(filename);
+  delete_file(archived_filename);
+
+  // Use the same log file twice and expect it to be overwritten/truncated
+  init_file(filename, "filecount=0");
+  assert(file_exists(filename), "couldn't find log file: %s", filename);
+
+  init_file(filename, "filecount=0");
+  assert(file_exists(filename), "couldn't find log file: %s", filename);
+  assert(!file_exists(archived_filename),
+         "existing log file %s was not properly truncated when filecount was 0",
+         filename);
+
+  // Verify that the file was really truncated and not just appended
+  assert(number_of_lines_with_substring_in_file(filename, ExpectedLine) == 1,
+         "log file %s appended rather than truncated", filename);
+
+  delete_file(filename);
+  delete_file(archived_filename);
+}
+
 static int Test_logconfiguration_subscribe_triggered = 0;
 
 static void Test_logconfiguration_subscribe_helper() {
@@ -361,11 +496,32 @@
   Test_logstream_helper(stream);
 }
 
+static void Test_logstreamcheap_log() {
+  Log(gc) log;
+  LogStreamCHeap stream(log.debug());
+
+  Test_logstream_helper(&stream);
+}
+
+static void Test_logstreamcheap_logtarget() {
+  LogTarget(Debug, gc) log;
+  LogStreamCHeap stream(log);
+
+  Test_logstream_helper(&stream);
+}
+
 void Test_logstream() {
+  // Test LogStreams with embedded ResourceMark.
   Test_logstream_log();
   Test_logstream_logtarget();
   Test_logstream_logstreamhandle();
+
+  // Test LogStreams without embedded ResourceMark.
   Test_logstream_no_rm();
+
+  // Test LogStreams backed by CHeap memory.
+  Test_logstreamcheap_log();
+  Test_logstreamcheap_logtarget();
 }
 
 void Test_loghandle_on() {
@@ -711,4 +867,20 @@
   Test_log_gctracetime_no_heap_no_cause();
 }
 
+void Test_invalid_log_file() {
+  ResourceMark rm;
+  stringStream ss;
+  const char* target_name = "tmplogdir";
+
+  // Attempt to log to a directory (existing log not a regular file)
+  create_directory(target_name);
+  LogFileOutput bad_file("tmplogdir");
+  assert(bad_file.initialize("", &ss) == false, "file was initialized "
+         "when there was an existing directory with the same name");
+  assert(strstr(ss.as_string(), "tmplogdir is not a regular file") != NULL,
+         "missing expected error message, received msg: %s", ss.as_string());
+  ss.reset();
+  remove(target_name);
+}
+
 #endif // PRODUCT
--- a/src/share/vm/logging/logConfiguration.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logConfiguration.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -145,7 +145,7 @@
     return NULL;
   }
 
-  bool success = output->initialize(options);
+  bool success = output->initialize(options, errstream);
   if (!success) {
     errstream->print_cr("Initialization of output '%s' using options '%s' failed.", name, options);
     delete output;
--- a/src/share/vm/logging/logFileOutput.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logFileOutput.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -41,8 +41,9 @@
 
 LogFileOutput::LogFileOutput(const char* name)
     : LogFileStreamOutput(NULL), _name(os::strdup_check_oom(name, mtLogging)),
-      _file_name(NULL), _archive_name(NULL), _archive_name_len(0), _current_size(0),
-      _rotate_size(0), _current_file(1), _file_count(0), _rotation_semaphore(1) {
+      _file_name(NULL), _archive_name(NULL), _archive_name_len(0),
+      _rotate_size(DefaultFileSize), _file_count(DefaultFileCount),
+      _current_size(0), _current_file(0), _rotation_semaphore(1) {
   _file_name = make_file_name(name, _pid_str, _vm_start_time_str);
 }
 
@@ -59,9 +60,6 @@
 
 LogFileOutput::~LogFileOutput() {
   if (_stream != NULL) {
-    if (_archive_name != NULL) {
-      archive();
-    }
     if (fclose(_stream) != 0) {
       jio_fprintf(defaultStream::error_stream(), "Could not close log file '%s' (%s).\n",
                   _file_name, os::strerror(errno));
@@ -72,7 +70,7 @@
   os::free(const_cast<char*>(_name));
 }
 
-size_t LogFileOutput::parse_value(const char* value_str) {
+static size_t parse_value(const char* value_str) {
   char* end;
   unsigned long long value = strtoull(value_str, &end, 10);
   if (!isdigit(*value_str) || end != value_str + strlen(value_str) || value >= SIZE_MAX) {
@@ -81,7 +79,80 @@
   return value;
 }
 
-bool LogFileOutput::configure_rotation(const char* options) {
+static bool file_exists(const char* filename) {
+  struct stat dummy_stat;
+  return os::stat(filename, &dummy_stat) == 0;
+}
+
+static uint number_of_digits(uint number) {
+  return number < 10 ? 1 : (number < 100 ? 2 : 3);
+}
+
+static bool is_regular_file(const char* filename) {
+  struct stat st;
+  int ret = os::stat(filename, &st);
+  if (ret != 0) {
+    return false;
+  }
+#ifdef _WINDOWS
+  return (st.st_mode & S_IFMT) == _S_IFREG;
+#else
+  return S_ISREG(st.st_mode);
+#endif
+}
+
+// Try to find the next number that should be used for file rotation.
+// Return UINT_MAX on error.
+static uint next_file_number(const char* filename,
+                             uint number_of_digits,
+                             uint filecount,
+                             outputStream* errstream) {
+  bool found = false;
+  uint next_num = 0;
+
+  // len is filename + dot + digits + null char
+  size_t len = strlen(filename) + number_of_digits + 2;
+  char* archive_name = NEW_C_HEAP_ARRAY(char, len, mtLogging);
+  char* oldest_name = NEW_C_HEAP_ARRAY(char, len, mtLogging);
+
+  for (uint i = 0; i < filecount; i++) {
+    int ret = jio_snprintf(archive_name, len, "%s.%0*u",
+                           filename, number_of_digits, i);
+    assert(ret > 0 && static_cast<size_t>(ret) == len - 1,
+           "incorrect buffer length calculation");
+
+    if (file_exists(archive_name) && !is_regular_file(archive_name)) {
+      // We've encountered something that's not a regular file among the
+      // possible file rotation targets. Fail immediately to prevent
+      // problems later.
+      errstream->print_cr("Possible rotation target file '%s' already exists "
+                          "but is not a regular file.", archive_name);
+      next_num = UINT_MAX;
+      break;
+    }
+
+    // Stop looking if we find an unused file name
+    if (!file_exists(archive_name)) {
+      next_num = i;
+      found = true;
+      break;
+    }
+
+    // Keep track of oldest existing log file
+    if (!found
+        || os::compare_file_modified_times(oldest_name, archive_name) > 0) {
+      strcpy(oldest_name, archive_name);
+      next_num = i;
+      found = true;
+    }
+  }
+
+  FREE_C_HEAP_ARRAY(char, oldest_name);
+  FREE_C_HEAP_ARRAY(char, archive_name);
+  return next_num;
+}
+
+bool LogFileOutput::parse_options(const char* options, outputStream* errstream) {
   if (options == NULL || strlen(options) == 0) {
     return true;
   }
@@ -107,22 +178,25 @@
 
     if (strcmp(FileCountOptionKey, key) == 0) {
       size_t value = parse_value(value_str);
-      if (value == SIZE_MAX || value >= UINT_MAX) {
+      if (value > MaxRotationFileCount) {
+        errstream->print_cr("Invalid option: %s must be in range [0, %u]",
+                            FileCountOptionKey,
+                            MaxRotationFileCount);
         success = false;
         break;
       }
       _file_count = static_cast<uint>(value);
-      _file_count_max_digits = static_cast<uint>(log10(static_cast<double>(_file_count)) + 1);
-      _archive_name_len = 2 + strlen(_file_name) + _file_count_max_digits;
-      _archive_name = NEW_C_HEAP_ARRAY(char, _archive_name_len, mtLogging);
     } else if (strcmp(FileSizeOptionKey, key) == 0) {
       size_t value = parse_value(value_str);
       if (value == SIZE_MAX || value > SIZE_MAX / K) {
+        errstream->print_cr("Invalid option: %s must be in range [0, "
+                            SIZE_FORMAT "]", FileSizeOptionKey, SIZE_MAX / K);
         success = false;
         break;
       }
       _rotate_size = value * K;
     } else {
+      errstream->print_cr("Invalid option '%s' for log file output.", key);
       success = false;
       break;
     }
@@ -133,15 +207,54 @@
   return success;
 }
 
-bool LogFileOutput::initialize(const char* options) {
-  if (!configure_rotation(options)) {
+bool LogFileOutput::initialize(const char* options, outputStream* errstream) {
+  if (!parse_options(options, errstream)) {
     return false;
   }
+
+  if (_file_count > 0) {
+    // compute digits with filecount - 1 since numbers will start from 0
+    _file_count_max_digits = number_of_digits(_file_count - 1);
+    _archive_name_len = 2 + strlen(_file_name) + _file_count_max_digits;
+    _archive_name = NEW_C_HEAP_ARRAY(char, _archive_name_len, mtLogging);
+  }
+
+  log_trace(logging)("Initializing logging to file '%s' (filecount: %u"
+                     ", filesize: " SIZE_FORMAT " KiB).",
+                     _file_name, _file_count, _rotate_size / K);
+
+  if (_file_count > 0 && file_exists(_file_name)) {
+    if (!is_regular_file(_file_name)) {
+      errstream->print_cr("Unable to log to file %s with log file rotation: "
+                          "%s is not a regular file",
+                          _file_name, _file_name);
+      return false;
+    }
+    _current_file = next_file_number(_file_name,
+                                     _file_count_max_digits,
+                                     _file_count,
+                                     errstream);
+    if (_current_file == UINT_MAX) {
+      return false;
+    }
+    log_trace(logging)("Existing log file found, saving it as '%s.%0*u'",
+                       _file_name, _file_count_max_digits, _current_file);
+    archive();
+    increment_file_count();
+  }
+
   _stream = fopen(_file_name, FileOpenMode);
   if (_stream == NULL) {
-    log_error(logging)("Could not open log file '%s' (%s).\n", _file_name, os::strerror(errno));
+    errstream->print_cr("Error opening log file '%s': %s",
+                        _file_name, strerror(errno));
     return false;
   }
+
+  if (_file_count == 0 && is_regular_file(_file_name)) {
+    log_trace(logging)("Truncating log file");
+    os::ftruncate(os::fileno(_stream), 0);
+  }
+
   return true;
 }
 
@@ -210,7 +323,7 @@
 
   // Reset accumulated size, increase current file counter, and check for file count wrap-around.
   _current_size = 0;
-  _current_file = (_current_file >= _file_count ? 1 : _current_file + 1);
+  increment_file_count();
 }
 
 char* LogFileOutput::make_file_name(const char* file_name,
--- a/src/share/vm/logging/logFileOutput.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logFileOutput.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -39,8 +39,11 @@
   static const char*  PidFilenamePlaceholder;
   static const char*  TimestampFilenamePlaceholder;
   static const char*  TimestampFormat;
+  static const size_t DefaultFileCount = 5;
+  static const size_t DefaultFileSize = 20 * M;
   static const size_t StartTimeBufferSize = 20;
-  static const size_t PidBufferSize       = 21;
+  static const size_t PidBufferSize = 21;
+  static const uint   MaxRotationFileCount = 1000;
   static char         _pid_str[PidBufferSize];
   static char         _vm_start_time_str[StartTimeBufferSize];
 
@@ -61,18 +64,24 @@
 
   void archive();
   void rotate();
-  bool configure_rotation(const char* options);
+  bool parse_options(const char* options, outputStream* errstream);
   char *make_file_name(const char* file_name, const char* pid_string, const char* timestamp_string);
-  static size_t parse_value(const char* value_str);
 
   bool should_rotate() {
     return _file_count > 0 && _rotate_size > 0 && _current_size >= _rotate_size;
   }
 
+  void increment_file_count() {
+    _current_file++;
+    if (_current_file == _file_count) {
+      _current_file = 0;
+    }
+  }
+
  public:
   LogFileOutput(const char *name);
   virtual ~LogFileOutput();
-  virtual bool initialize(const char* options);
+  virtual bool initialize(const char* options, outputStream* errstream);
   virtual int write(const LogDecorations& decorations, const char* msg);
   virtual void force_rotate();
 
--- a/src/share/vm/logging/logFileStreamOutput.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logFileStreamOutput.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -53,7 +53,7 @@
   LogStdoutOutput() : LogFileStreamOutput(stdout) {
     set_config_string("all=off");
   }
-  virtual bool initialize(const char* options) {
+  virtual bool initialize(const char* options, outputStream* errstream) {
     return false;
   }
  public:
@@ -69,7 +69,7 @@
   LogStderrOutput() : LogFileStreamOutput(stderr) {
     set_config_string("all=warning");
   }
-  virtual bool initialize(const char* options) {
+  virtual bool initialize(const char* options, outputStream* errstream) {
     return false;
   }
  public:
--- a/src/share/vm/logging/logHandle.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logHandle.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -67,13 +67,13 @@
 // This can be used to pass a Log instance as a parameter without
 // polluting the surrounding API with template functions.
 class LogTargetHandle {
-  friend class LogStream;
-
 private:
   const LogLevelType _level;
   LogTagSet*         _tagset;
 
 public:
+  LogTargetHandle(LogLevelType level, LogTagSet* tagset) : _level(level), _tagset(tagset) {}
+
   template <LogLevelType level, LogTagType T0, LogTagType T1, LogTagType T2, LogTagType T3, LogTagType T4, LogTagType GuardTag>
   LogTargetHandle(const LogTargetImpl<level, T0, T1, T2, T3, T4, GuardTag>& type_carrier) :
       _level(level),
--- a/src/share/vm/logging/logOutput.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logOutput.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -82,7 +82,7 @@
   }
 
   virtual const char* name() const = 0;
-  virtual bool initialize(const char* options) = 0;
+  virtual bool initialize(const char* options, outputStream* errstream) = 0;
   virtual int write(const LogDecorations &decorations, const char* msg) = 0;
 };
 
--- a/src/share/vm/logging/logStream.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logStream.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -24,7 +24,7 @@
 
 #include "precompiled.hpp"
 #include "logging/log.hpp"
-#include "logging/logStream.hpp"
+#include "logging/logStream.inline.hpp"
 
 // Create a log stream without an embedded ResourceMark.
 // The function is placed here to be called out-of-line in log.hpp.
--- a/src/share/vm/logging/logStream.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logStream.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -26,23 +26,100 @@
 #define SHARE_VM_LOGGING_LOGSTREAM_HPP
 
 #include "logging/log.hpp"
+#include "logging/logHandle.hpp"
+#include "memory/resourceArea.hpp"
 #include "utilities/ostream.hpp"
 
-// An output stream that logs to the logging framework.
-// Requires a ResourceMark on the stack.
-class LogStreamNoResourceMark : public outputStream {
-private:
-  stringStream _current_line;
-  LogLevelType _level;
-  LogTagSet*   _tagset;
+// The base class of an output stream that logs to the logging framework.
+template <class streamClass>
+class LogStreamBase : public outputStream {
+  streamClass     _current_line;
+  LogTargetHandle _log_handle;
 
 public:
-  LogStreamNoResourceMark(LogLevelType level, LogTagSet* tagset) : _level(level), _tagset(tagset) {}
-  ~LogStreamNoResourceMark() {
+  // Constructor to support creation from a LogTarget instance.
+  //
+  // LogTarget(Debug, gc) log;
+  // LogStreamBase(log) stream;
+  template <LogLevelType level, LogTagType T0, LogTagType T1, LogTagType T2, LogTagType T3, LogTagType T4, LogTagType GuardTag>
+  LogStreamBase(const LogTargetImpl<level, T0, T1, T2, T3, T4, GuardTag>& type_carrier) :
+      _log_handle(level, &LogTagSetMapping<T0, T1, T2, T3, T4>::tagset()) {}
+
+  // Constructor to support creation from typed (likely NULL) pointer. Mostly used by the logging framework.
+  //
+  // LogStreamBase stream(log.debug());
+  //  or
+  // LogStreamBase stream((LogTargetImpl<level, T0, T1, T2, T3, T4, GuardTag>*)NULL);
+  template <LogLevelType level, LogTagType T0, LogTagType T1, LogTagType T2, LogTagType T3, LogTagType T4, LogTagType GuardTag>
+  LogStreamBase(const LogTargetImpl<level, T0, T1, T2, T3, T4, GuardTag>* type_carrier) :
+      _log_handle(level, &LogTagSetMapping<T0, T1, T2, T3, T4>::tagset()) {}
+
+  // Constructor to support creation from a LogTargetHandle.
+  //
+  // LogTarget(Debug, gc) log;
+  // LogTargetHandle(log) handle;
+  // LogStreamBase stream(handle);
+  LogStreamBase(LogTargetHandle handle) : _log_handle(handle) {}
+
+  // Constructor to support creation from a log level and tagset.
+  //
+  // LogStreamBase(level, tageset);
+  LogStreamBase(LogLevelType level, LogTagSet* tagset) : _log_handle(level, tagset) {}
+
+  ~LogStreamBase() {
     guarantee(_current_line.size() == 0, "Buffer not flushed. Missing call to print_cr()?");
   }
 
+public:
   void write(const char* s, size_t len);
 };
 
+// A stringStream with an embedded ResourceMark.
+class stringStreamWithResourceMark : outputStream {
+ private:
+  // The stringStream Resource allocate in the constructor,
+  // so the order of the fields is important.
+  ResourceMark _embedded_resource_mark;
+  stringStream _stream;
+
+ public:
+  stringStreamWithResourceMark(size_t initial_bufsize = 256) :
+      _embedded_resource_mark(),
+      _stream(initial_bufsize) {}
+
+  virtual void write(const char* c, size_t len) { _stream.write(c, len); }
+  size_t      size()                            { return _stream.size(); }
+  const char* base()                            { return _stream.base(); }
+  void  reset()                                 { _stream.reset(); }
+  char* as_string()                             { return _stream.as_string(); }
+};
+
+// An output stream that logs to the logging framework.
+//
+// The backing buffer is allocated in Resource memory.
+// The caller is required to have a ResourceMark on the stack.
+typedef LogStreamBase<stringStream> LogStreamNoResourceMark;
+
+// An output stream that logs to the logging framework.
+//
+// The backing buffer is allocated in CHeap memory.
+typedef LogStreamBase<bufferedStream> LogStreamCHeap;
+
+// An output stream that logs to the logging framework, and embeds a ResourceMark.
+//
+// The backing buffer is allocated in Resource memory.
+// The class is intended to be stack allocated.
+// The class provides its own ResourceMark,
+//  so care needs to be taken when nested ResourceMarks are used.
+typedef LogStreamBase<stringStreamWithResourceMark> LogStream;
+
+// Support creation of a LogStream without having to provide a LogTarget pointer.
+#define LogStreamHandle(level, ...) LogStreamTemplate<LogLevel::level, LOG_TAGS(__VA_ARGS__)>
+
+template <LogLevelType level, LogTagType T0, LogTagType T1, LogTagType T2, LogTagType T3, LogTagType T4, LogTagType GuardTag>
+class LogStreamTemplate : public LogStream {
+public:
+  LogStreamTemplate() : LogStream((LogTargetImpl<level, T0, T1, T2, T3, T4, GuardTag>*)NULL) {}
+};
+
 #endif // SHARE_VM_LOGGING_LOGSTREAM_HPP
--- a/src/share/vm/logging/logStream.inline.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logStream.inline.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -30,10 +30,12 @@
 #include "memory/resourceArea.hpp"
 #include "utilities/ostream.hpp"
 
-inline void LogStreamNoResourceMark::write(const char* s, size_t len) {
+template <class streamClass>
+inline void LogStreamBase<streamClass>::write(const char* s, size_t len) {
   if (len > 0 && s[len - 1] == '\n') {
     _current_line.write(s, len - 1);
-    _tagset->write(_level, "%s", _current_line.as_string());
+    _current_line.write("\0", 1);
+    _log_handle.print("%s", _current_line.base());
     _current_line.reset();
   } else {
     _current_line.write(s, len);
@@ -41,54 +43,4 @@
   update_position(s, len);
 }
 
-// An output stream that logs to the logging framework, and embeds a ResourceMark.
-//
-//  The class is intended to be stack allocated.
-//  Care needs to be taken when nested ResourceMarks are used.
-class LogStream : public outputStream {
-private:
-  ResourceMark            _embedded_resource_mark;
-  LogStreamNoResourceMark _stream;
-
-public:
-  // Constructor to support creation from a LogTarget instance.
-  //
-  // LogTarget(Debug, gc) log;
-  // LogStream(log) stream;
-  template <LogLevelType level, LogTagType T0, LogTagType T1, LogTagType T2, LogTagType T3, LogTagType T4, LogTagType GuardTag>
-  LogStream(const LogTargetImpl<level, T0, T1, T2, T3, T4, GuardTag>& type_carrier) :
-      _embedded_resource_mark(),
-      _stream(level, &LogTagSetMapping<T0, T1, T2, T3, T4>::tagset()) {}
-
-  // Constructor to support creation from typed (likely NULL) pointer. Mostly used by the logging framework.
-  //
-  // LogStream stream(log.debug());
-  // LogStream stream((LogTargetImpl<level, T0, T1, T2, T3, T4, GuardTag>*)NULL);
-  template <LogLevelType level, LogTagType T0, LogTagType T1, LogTagType T2, LogTagType T3, LogTagType T4, LogTagType GuardTag>
-  LogStream(const LogTargetImpl<level, T0, T1, T2, T3, T4, GuardTag>* type_carrier) :
-      _embedded_resource_mark(),
-      _stream(level, &LogTagSetMapping<T0, T1, T2, T3, T4>::tagset()) {}
-
-  // Constructor to support creation from a LogTargetHandle.
-  //
-  // LogTarget(Debug, gc) log;
-  // LogTargetHandle(log) handle;
-  // LogStream stream(handle);
-  LogStream(LogTargetHandle handle) :
-      _embedded_resource_mark(),
-      _stream(handle._level, handle._tagset) {}
-
-  // Override of outputStream::write.
-  void write(const char* s, size_t len) { _stream.write(s, len); }
-};
-
-// Support creation of a LogStream without having to provide a LogTarget pointer.
-#define LogStreamHandle(level, ...) LogStreamTemplate<LogLevel::level, LOG_TAGS(__VA_ARGS__)>
-
-template <LogLevelType level, LogTagType T0, LogTagType T1, LogTagType T2, LogTagType T3, LogTagType T4, LogTagType GuardTag>
-class LogStreamTemplate : public LogStream {
-public:
-  LogStreamTemplate() : LogStream((LogTargetImpl<level, T0, T1, T2, T3, T4, GuardTag>*)NULL) {}
-};
-
 #endif // SHARE_VM_LOGGING_LOGSTREAM_INLINE_HPP
--- a/src/share/vm/logging/logTag.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logTag.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -70,6 +70,7 @@
   LOG_TAG(monitorinflation) \
   LOG_TAG(monitormismatch) \
   LOG_TAG(os) \
+  LOG_TAG(pagesize) \
   LOG_TAG(phases) \
   LOG_TAG(plab) \
   LOG_TAG(promotion) \
--- a/src/share/vm/logging/logTagLevelExpression.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logTagLevelExpression.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -36,9 +36,12 @@
 // Class used to temporary encode a 'what'-expression during log configuration.
 // Consists of a combination of tags and levels, e.g. "tag1+tag2=level1,tag3*=level2".
 class LogTagLevelExpression : public StackObj {
+ public:
+  static const size_t MaxCombinations = 256;
+
+ private:
   friend void LogConfiguration::configure_stdout(LogLevelType, bool, ...);
- private:
-  static const size_t MaxCombinations = 32;
+
   static const char* DefaultExpressionString;
 
   size_t        _ntags, _ncombinations;
--- a/src/share/vm/logging/logTagSet.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logTagSet.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -98,6 +98,7 @@
 const size_t vwrite_buffer_size = 512;
 
 void LogTagSet::vwrite(LogLevelType level, const char* fmt, va_list args) {
+  assert(level >= LogLevel::First && level <= LogLevel::Last, "Log level:%d is incorrect", level);
   char buf[vwrite_buffer_size];
   va_list saved_args;           // For re-format on buf overflow.
   va_copy(saved_args, args);
--- a/src/share/vm/logging/logTagSet.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/logging/logTagSet.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -64,6 +64,10 @@
     return _list;
   }
 
+  static size_t ntagsets() {
+    return _ntagsets;
+  }
+
   LogTagSet* next() {
     return _next;
   }
--- a/src/share/vm/memory/allocation.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/memory/allocation.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -143,8 +143,9 @@
   mtTest              = 0x0D,  // Test type for verifying NMT
   mtTracing           = 0x0E,  // memory used for Tracing
   mtLogging           = 0x0F,  // memory for logging
-  mtNone              = 0x10,  // undefined
-  mt_number_of_types  = 0x11   // number of memory types (mtDontTrack
+  mtArguments         = 0x10,  // memory for argument processing
+  mtNone              = 0x11,  // undefined
+  mt_number_of_types  = 0x12   // number of memory types (mtDontTrack
                                  // is not included as validate type)
 };
 
--- a/src/share/vm/memory/filemap.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/memory/filemap.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -959,6 +959,16 @@
   return false;
 }
 
+// Check if a given address is within one of the shared regions (ro, rw, md, mc)
+bool FileMapInfo::is_in_shared_region(const void* p, int idx) {
+  assert((idx >= MetaspaceShared::ro) && (idx <= MetaspaceShared::mc), "invalid region index");
+  char* base = _header->region_addr(idx);
+  if (p >= base && p < base + _header->_space[idx]._used) {
+    return true;
+  }
+  return false;
+}
+
 void FileMapInfo::print_shared_spaces() {
   tty->print_cr("Shared Spaces:");
   for (int i = 0; i < MetaspaceShared::n_regions; i++) {
--- a/src/share/vm/memory/filemap.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/memory/filemap.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -107,6 +107,8 @@
     int     _narrow_klass_shift;      // save narrow klass base and shift
     address _narrow_klass_base;
     char*   _misc_data_patching_start;
+    address _cds_i2i_entry_code_buffers;
+    size_t  _cds_i2i_entry_code_buffers_size;
 
     struct space_info {
       int    _crc;           // crc checksum of the current space
@@ -195,6 +197,19 @@
   char* misc_data_patching_start()            { return _header->_misc_data_patching_start; }
   void set_misc_data_patching_start(char* p)  { _header->_misc_data_patching_start = p; }
 
+  address cds_i2i_entry_code_buffers() {
+    return _header->_cds_i2i_entry_code_buffers;
+  }
+  void set_cds_i2i_entry_code_buffers(address addr) {
+    _header->_cds_i2i_entry_code_buffers = addr;
+  }
+  size_t cds_i2i_entry_code_buffers_size() {
+    return _header->_cds_i2i_entry_code_buffers_size;
+  }
+  void set_cds_i2i_entry_code_buffers_size(size_t s) {
+    _header->_cds_i2i_entry_code_buffers_size = s;
+  }
+
   static FileMapInfo* current_info() {
     CDS_ONLY(return _current_info;)
     NOT_CDS(return NULL;)
@@ -234,6 +249,7 @@
 
   // Return true if given address is in the mapped shared space.
   bool is_in_shared_space(const void* p) NOT_CDS_RETURN_(false);
+  bool is_in_shared_region(const void* p, int idx) NOT_CDS_RETURN_(false);
   void print_shared_spaces() NOT_CDS_RETURN;
 
   static size_t shared_spaces_size() {
--- a/src/share/vm/memory/iterator.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/memory/iterator.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -27,7 +27,7 @@
 
 #include "memory/allocation.hpp"
 #include "memory/memRegion.hpp"
-#include "utilities/top.hpp"
+#include "oops/oopsHierarchy.hpp"
 
 class CodeBlob;
 class nmethod;
@@ -35,6 +35,7 @@
 class DataLayout;
 class KlassClosure;
 class ClassLoaderData;
+class Symbol;
 
 // The following classes are C++ `closures` for iterating over objects, roots and spaces
 
--- a/src/share/vm/memory/metaspaceShared.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/memory/metaspaceShared.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -59,6 +59,8 @@
 bool MetaspaceShared::_check_classes_made_progress;
 bool MetaspaceShared::_has_error_classes;
 bool MetaspaceShared::_archive_loading_failed = false;
+address MetaspaceShared::_cds_i2i_entry_code_buffers = NULL;
+size_t MetaspaceShared::_cds_i2i_entry_code_buffers_size = 0;
 SharedMiscRegion MetaspaceShared::_mc;
 SharedMiscRegion MetaspaceShared::_md;
 
@@ -129,6 +131,21 @@
   soc->do_tag(666);
 }
 
+address MetaspaceShared::cds_i2i_entry_code_buffers(size_t total_size) {
+  if (DumpSharedSpaces) {
+    if (_cds_i2i_entry_code_buffers == NULL) {
+      _cds_i2i_entry_code_buffers = (address)misc_data_space_alloc(total_size);
+      _cds_i2i_entry_code_buffers_size = total_size;
+    }
+  } else if (UseSharedSpaces) {
+    assert(_cds_i2i_entry_code_buffers != NULL, "must already been initialized");
+  } else {
+    return NULL;
+  }
+
+  assert(_cds_i2i_entry_code_buffers_size == total_size, "must not change");
+  return _cds_i2i_entry_code_buffers;
+}
 
 // CDS code for dumping shared archive.
 
@@ -576,6 +593,8 @@
                                      &md_top, md_end,
                                      &mc_top, mc_end);
 
+  guarantee(md_top <= md_end, "Insufficient space for vtables.");
+
   // Reorder the system dictionary.  (Moving the symbols affects
   // how the hash table indices are calculated.)
   // Not doing this either.
@@ -668,6 +687,8 @@
   FileMapInfo* mapinfo = new FileMapInfo();
   mapinfo->populate_header(MetaspaceShared::max_alignment());
   mapinfo->set_misc_data_patching_start((char*)vtbl_list);
+  mapinfo->set_cds_i2i_entry_code_buffers(MetaspaceShared::cds_i2i_entry_code_buffers());
+  mapinfo->set_cds_i2i_entry_code_buffers_size(MetaspaceShared::cds_i2i_entry_code_buffers_size());
 
   for (int pass=1; pass<=2; pass++) {
     if (pass == 1) {
@@ -686,7 +707,7 @@
     mapinfo->write_region(MetaspaceShared::md, _md_vs.low(),
                           pointer_delta(md_top, _md_vs.low(), sizeof(char)),
                           SharedMiscDataSize,
-                          false, false);
+                          false, true);
     mapinfo->write_region(MetaspaceShared::mc, _mc_vs.low(),
                           pointer_delta(mc_top, _mc_vs.low(), sizeof(char)),
                           SharedMiscCodeSize,
@@ -980,6 +1001,11 @@
   return UseSharedSpaces && FileMapInfo::current_info()->is_in_shared_space(p);
 }
 
+// Return true if given address is in the misc data region
+bool MetaspaceShared::is_in_shared_region(const void* p, int idx) {
+  return UseSharedSpaces && FileMapInfo::current_info()->is_in_shared_region(p, idx);
+}
+
 bool MetaspaceShared::is_string_region(int idx) {
   return (idx >= MetaspaceShared::first_string &&
           idx < MetaspaceShared::first_string + MetaspaceShared::max_strings);
@@ -1053,6 +1079,8 @@
 
 void MetaspaceShared::initialize_shared_spaces() {
   FileMapInfo *mapinfo = FileMapInfo::current_info();
+  _cds_i2i_entry_code_buffers = mapinfo->cds_i2i_entry_code_buffers();
+  _cds_i2i_entry_code_buffers_size = mapinfo->cds_i2i_entry_code_buffers_size();
   char* buffer = mapinfo->misc_data_patching_start();
 
   // Skip over (reserve space for) a list of addresses of C++ vtables
--- a/src/share/vm/memory/metaspaceShared.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/memory/metaspaceShared.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -50,17 +50,14 @@
 #define MIN_SHARED_READ_ONLY_SIZE       (NOT_LP64(8*M) LP64_ONLY(9*M))
 
 // the MIN_SHARED_MISC_DATA_SIZE and MIN_SHARED_MISC_CODE_SIZE estimates are based on
-// MetaspaceShared::generate_vtable_methods().
-// The minimum size only accounts for the vtable methods. Any size less than the
-// minimum required size would cause vm crash when allocating the vtable methods.
-#define SHARED_MISC_SIZE_FOR(size)      (DEFAULT_VTBL_VIRTUALS_COUNT*DEFAULT_VTBL_LIST_SIZE*size)
+// the sizes required for dumping the archive using the default classlist. The sizes
+// are multiplied by 1.5 for a safety margin.
 
 #define DEFAULT_SHARED_MISC_DATA_SIZE   (NOT_LP64(2*M) LP64_ONLY(4*M))
-#define MIN_SHARED_MISC_DATA_SIZE       (SHARED_MISC_SIZE_FOR(sizeof(void*)))
+#define MIN_SHARED_MISC_DATA_SIZE       (NOT_LP64(1*M) LP64_ONLY(1200*K))
 
 #define DEFAULT_SHARED_MISC_CODE_SIZE   (120*K)
-#define MIN_SHARED_MISC_CODE_SIZE       (SHARED_MISC_SIZE_FOR(sizeof(void*))+SHARED_MISC_SIZE_FOR(DEFAULT_VTBL_METHOD_SIZE)+DEFAULT_VTBL_COMMON_CODE_SIZE)
-
+#define MIN_SHARED_MISC_CODE_SIZE       (NOT_LP64(63*K) LP64_ONLY(69*K))
 #define DEFAULT_COMBINED_SIZE           (DEFAULT_SHARED_READ_WRITE_SIZE+DEFAULT_SHARED_READ_ONLY_SIZE+DEFAULT_SHARED_MISC_DATA_SIZE+DEFAULT_SHARED_MISC_CODE_SIZE)
 
 // the max size is the MAX size (ie. 0x7FFFFFFF) - the total size of
@@ -128,6 +125,8 @@
   static bool _check_classes_made_progress;
   static bool _has_error_classes;
   static bool _archive_loading_failed;
+  static address _cds_i2i_entry_code_buffers;
+  static size_t  _cds_i2i_entry_code_buffers_size;
 
   // Used only during dumping.
   static SharedMiscRegion _md;
@@ -185,6 +184,9 @@
   // Return true if given address is in the mapped shared space.
   static bool is_in_shared_space(const void* p) NOT_CDS_RETURN_(false);
 
+  // Return true if given address is in the shared region corresponding to the idx
+  static bool is_in_shared_region(const void* p, int idx) NOT_CDS_RETURN_(false);
+
   static bool is_string_region(int idx) NOT_CDS_RETURN_(false);
 
   static void generate_vtable_methods(void** vtbl_list,
@@ -218,6 +220,15 @@
   static char* misc_code_space_alloc(size_t num_bytes) {  return _mc.alloc(num_bytes); }
   static char* misc_data_space_alloc(size_t num_bytes) {  return _md.alloc(num_bytes); }
 
+  static address cds_i2i_entry_code_buffers(size_t total_size);
+
+  static address cds_i2i_entry_code_buffers() {
+    return _cds_i2i_entry_code_buffers;
+  }
+  static size_t cds_i2i_entry_code_buffers_size() {
+    return _cds_i2i_entry_code_buffers_size;
+  }
+
   static SharedMiscRegion* misc_code_region() {
     assert(DumpSharedSpaces, "used during dumping only");
     return &_mc;
--- a/src/share/vm/memory/resourceArea.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/memory/resourceArea.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -26,7 +26,7 @@
 #define SHARE_VM_MEMORY_RESOURCEAREA_HPP
 
 #include "memory/allocation.hpp"
-#include "runtime/thread.inline.hpp"
+#include "runtime/thread.hpp"
 
 // The resource area holds temporary data structures in the VM.
 // The actual allocation areas are thread local. Typical usage:
--- a/src/share/vm/oops/constMethod.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/oops/constMethod.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -121,6 +121,7 @@
 };
 
 class KlassSizeStats;
+class AdapterHandlerEntry;
 
 // Class to collect the sizes of ConstMethod inline tables
 #define INLINE_TABLES_DO(do_element)            \
@@ -201,6 +202,12 @@
   // Raw stackmap data for the method
   Array<u1>*        _stackmap_data;
 
+  // Adapter blob (i2c/c2i) for this Method*. Set once when method is linked.
+  union {
+    AdapterHandlerEntry* _adapter;
+    AdapterHandlerEntry** _adapter_trampoline;
+  };
+
   int               _constMethod_size;
   u2                _flags;
 
@@ -276,6 +283,29 @@
   void copy_stackmap_data(ClassLoaderData* loader_data, u1* sd, int length, TRAPS);
   bool has_stackmap_table() const { return _stackmap_data != NULL; }
 
+  // adapter
+  void set_adapter_entry(AdapterHandlerEntry* adapter) {
+    assert(!is_shared(), "shared methods have fixed adapter_trampoline");
+    _adapter = adapter;
+  }
+  void set_adapter_trampoline(AdapterHandlerEntry** trampoline) {
+    assert(DumpSharedSpaces, "must be");
+    assert(*trampoline == NULL, "must be NULL during dump time, to be initialized at run time");
+    _adapter_trampoline = trampoline;
+  }
+  void update_adapter_trampoline(AdapterHandlerEntry* adapter) {
+    assert(is_shared(), "must be");
+    *_adapter_trampoline = adapter;
+    assert(this->adapter() == adapter, "must be");
+  }
+  AdapterHandlerEntry* adapter() {
+    if (is_shared()) {
+      return *_adapter_trampoline;
+    } else {
+      return _adapter;
+    }
+  }
+
   void init_fingerprint() {
     const uint64_t initval = UCONST64(0x8000000000000000);
     _fingerprint = initval;
--- a/src/share/vm/oops/constantPool.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/oops/constantPool.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -283,8 +283,9 @@
   this_key->record_dependency(k(), CHECK_NULL); // Can throw OOM
 
   // logging for classresolve tag.
-  trace_class_resolution(this_cp, k);
-
+  if (log_is_enabled(Debug, classresolve)){
+    trace_class_resolution(this_cp, k);
+  }
   this_cp->klass_at_put(which, k());
   entry = this_cp->resolved_klass_at(which);
   assert(entry.is_resolved() && entry.get_klass()->is_klass(), "must be resolved at this point");
@@ -340,9 +341,7 @@
   int cache_index = decode_cpcache_index(which, true);
   if (!(cache_index >= 0 && cache_index < cpool->cache()->length())) {
     // FIXME: should be an assert
-    if (PrintMiscellaneous && (Verbose||WizardMode)) {
-      tty->print_cr("bad operand %d in:", which); cpool->print();
-    }
+    log_debug(classresolve)("bad operand %d in:", which); cpool->print();
     return NULL;
   }
   ConstantPoolCacheEntry* e = cpool->cache()->entry_at(cache_index);
@@ -396,7 +395,7 @@
   int i = which;
   if (!uncached && cache() != NULL) {
     if (ConstantPool::is_invokedynamic_index(which)) {
-      // Invokedynamic index is index into resolved_references
+      // Invokedynamic index is index into the constant pool cache
       int pool_index = invokedynamic_cp_cache_entry_at(which)->constant_pool_index();
       pool_index = invoke_dynamic_name_and_type_ref_index_at(pool_index);
       assert(tag_at(pool_index).is_name_and_type(), "");
@@ -672,10 +671,11 @@
       int callee_index             = this_cp->method_handle_klass_index_at(index);
       Symbol*  name =      this_cp->method_handle_name_ref_at(index);
       Symbol*  signature = this_cp->method_handle_signature_ref_at(index);
-      if (PrintMiscellaneous)
-        tty->print_cr("resolve JVM_CONSTANT_MethodHandle:%d [%d/%d/%d] %s.%s",
-                      ref_kind, index, this_cp->method_handle_index_at(index),
-                      callee_index, name->as_C_string(), signature->as_C_string());
+      { ResourceMark rm(THREAD);
+        log_debug(classresolve)("resolve JVM_CONSTANT_MethodHandle:%d [%d/%d/%d] %s.%s",
+                              ref_kind, index, this_cp->method_handle_index_at(index),
+                              callee_index, name->as_C_string(), signature->as_C_string());
+      }
       KlassHandle callee;
       { Klass* k = klass_at_impl(this_cp, callee_index, true, CHECK_NULL);
         callee = KlassHandle(THREAD, k);
@@ -694,10 +694,11 @@
   case JVM_CONSTANT_MethodType:
     {
       Symbol*  signature = this_cp->method_type_signature_at(index);
-      if (PrintMiscellaneous)
-        tty->print_cr("resolve JVM_CONSTANT_MethodType [%d/%d] %s",
-                      index, this_cp->method_type_index_at(index),
-                      signature->as_C_string());
+      { ResourceMark rm(THREAD);
+        log_debug(classresolve)("resolve JVM_CONSTANT_MethodType [%d/%d] %s",
+                              index, this_cp->method_type_index_at(index),
+                              signature->as_C_string());
+      }
       KlassHandle klass(THREAD, this_cp->pool_holder());
       Handle value = SystemDictionary::find_method_handle_type(signature, klass, THREAD);
       result_oop = value();
@@ -964,8 +965,8 @@
 
   case JVM_CONSTANT_MethodType:
   {
-    int k1 = method_type_index_at_error_ok(index1);
-    int k2 = cp2->method_type_index_at_error_ok(index2);
+    int k1 = method_type_index_at(index1);
+    int k2 = cp2->method_type_index_at(index2);
     bool match = compare_entry_to(k1, cp2, k2, CHECK_false);
     if (match) {
       return true;
@@ -974,11 +975,11 @@
 
   case JVM_CONSTANT_MethodHandle:
   {
-    int k1 = method_handle_ref_kind_at_error_ok(index1);
-    int k2 = cp2->method_handle_ref_kind_at_error_ok(index2);
+    int k1 = method_handle_ref_kind_at(index1);
+    int k2 = cp2->method_handle_ref_kind_at(index2);
     if (k1 == k2) {
-      int i1 = method_handle_index_at_error_ok(index1);
-      int i2 = cp2->method_handle_index_at_error_ok(index2);
+      int i1 = method_handle_index_at(index1);
+      int i2 = cp2->method_handle_index_at(index2);
       bool match = compare_entry_to(i1, cp2, i2, CHECK_false);
       if (match) {
         return true;
@@ -1310,15 +1311,15 @@
   case JVM_CONSTANT_MethodType:
   case JVM_CONSTANT_MethodTypeInError:
   {
-    jint k = from_cp->method_type_index_at_error_ok(from_i);
+    jint k = from_cp->method_type_index_at(from_i);
     to_cp->method_type_index_at_put(to_i, k);
   } break;
 
   case JVM_CONSTANT_MethodHandle:
   case JVM_CONSTANT_MethodHandleInError:
   {
-    int k1 = from_cp->method_handle_ref_kind_at_error_ok(from_i);
-    int k2 = from_cp->method_handle_index_at_error_ok(from_i);
+    int k1 = from_cp->method_handle_ref_kind_at(from_i);
+    int k2 = from_cp->method_handle_index_at(from_i);
     to_cp->method_handle_index_at_put(to_i, k1, k2);
   } break;
 
@@ -1754,8 +1755,8 @@
       case JVM_CONSTANT_MethodHandle:
       case JVM_CONSTANT_MethodHandleInError: {
         *bytes = JVM_CONSTANT_MethodHandle;
-        int kind = method_handle_ref_kind_at_error_ok(idx);
-        idx1 = method_handle_index_at_error_ok(idx);
+        int kind = method_handle_ref_kind_at(idx);
+        idx1 = method_handle_index_at(idx);
         *(bytes+1) = (unsigned char) kind;
         Bytes::put_Java_u2((address) (bytes+2), idx1);
         DBG(printf("JVM_CONSTANT_MethodHandle: %d %hd", kind, idx1));
@@ -1764,7 +1765,7 @@
       case JVM_CONSTANT_MethodType:
       case JVM_CONSTANT_MethodTypeInError: {
         *bytes = JVM_CONSTANT_MethodType;
-        idx1 = method_type_index_at_error_ok(idx);
+        idx1 = method_type_index_at(idx);
         Bytes::put_Java_u2((address) (bytes+1), idx1);
         DBG(printf("JVM_CONSTANT_MethodType: %hd", idx1));
         break;
@@ -1952,12 +1953,12 @@
       break;
     case JVM_CONSTANT_MethodHandle :
     case JVM_CONSTANT_MethodHandleInError :
-      st->print("ref_kind=%d", method_handle_ref_kind_at_error_ok(index));
-      st->print(" ref_index=%d", method_handle_index_at_error_ok(index));
+      st->print("ref_kind=%d", method_handle_ref_kind_at(index));
+      st->print(" ref_index=%d", method_handle_index_at(index));
       break;
     case JVM_CONSTANT_MethodType :
     case JVM_CONSTANT_MethodTypeInError :
-      st->print("signature_index=%d", method_type_index_at_error_ok(index));
+      st->print("signature_index=%d", method_type_index_at(index));
       break;
     case JVM_CONSTANT_InvokeDynamic :
       {
--- a/src/share/vm/oops/constantPool.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/oops/constantPool.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -460,41 +460,21 @@
     return *int_at_addr(which);
   }
 
- private:
-  int method_handle_ref_kind_at(int which, bool error_ok) {
+  int method_handle_ref_kind_at(int which) {
     assert(tag_at(which).is_method_handle() ||
-           (error_ok && tag_at(which).is_method_handle_in_error()), "Corrupted constant pool");
+           tag_at(which).is_method_handle_in_error(), "Corrupted constant pool");
     return extract_low_short_from_int(*int_at_addr(which));  // mask out unwanted ref_index bits
   }
-  int method_handle_index_at(int which, bool error_ok) {
+  int method_handle_index_at(int which) {
     assert(tag_at(which).is_method_handle() ||
-           (error_ok && tag_at(which).is_method_handle_in_error()), "Corrupted constant pool");
+           tag_at(which).is_method_handle_in_error(), "Corrupted constant pool");
     return extract_high_short_from_int(*int_at_addr(which));  // shift out unwanted ref_kind bits
   }
-  int method_type_index_at(int which, bool error_ok) {
+  int method_type_index_at(int which) {
     assert(tag_at(which).is_method_type() ||
-           (error_ok && tag_at(which).is_method_type_in_error()), "Corrupted constant pool");
+           tag_at(which).is_method_type_in_error(), "Corrupted constant pool");
     return *int_at_addr(which);
   }
- public:
-  int method_handle_ref_kind_at(int which) {
-    return method_handle_ref_kind_at(which, false);
-  }
-  int method_handle_ref_kind_at_error_ok(int which) {
-    return method_handle_ref_kind_at(which, true);
-  }
-  int method_handle_index_at(int which) {
-    return method_handle_index_at(which, false);
-  }
-  int method_handle_index_at_error_ok(int which) {
-    return method_handle_index_at(which, true);
-  }
-  int method_type_index_at(int which) {
-    return method_type_index_at(which, false);
-  }
-  int method_type_index_at_error_ok(int which) {
-    return method_type_index_at(which, true);
-  }
 
   // Derived queries:
   Symbol* method_handle_name_ref_at(int which) {
--- a/src/share/vm/oops/method.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/oops/method.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -38,6 +38,7 @@
 #include "interpreter/oopMapCache.hpp"
 #include "memory/heapInspection.hpp"
 #include "memory/metadataFactory.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "memory/oopFactory.hpp"
 #include "memory/resourceArea.hpp"
 #include "oops/constMethod.hpp"
@@ -123,18 +124,18 @@
 }
 
 address Method::get_i2c_entry() {
-  assert(_adapter != NULL, "must have");
-  return _adapter->get_i2c_entry();
+  assert(adapter() != NULL, "must have");
+  return adapter()->get_i2c_entry();
 }
 
 address Method::get_c2i_entry() {
-  assert(_adapter != NULL, "must have");
-  return _adapter->get_c2i_entry();
+  assert(adapter() != NULL, "must have");
+  return adapter()->get_c2i_entry();
 }
 
 address Method::get_c2i_unverified_entry() {
-  assert(_adapter != NULL, "must have");
-  return _adapter->get_c2i_unverified_entry();
+  assert(adapter() != NULL, "must have");
+  return adapter()->get_c2i_unverified_entry();
 }
 
 char* Method::name_and_sig_as_C_string() const {
@@ -892,10 +893,10 @@
 
   // this may be NULL if c2i adapters have not been made yet
   // Only should happen at allocate time.
-  if (_adapter == NULL) {
+  if (adapter() == NULL) {
     _from_compiled_entry    = NULL;
   } else {
-    _from_compiled_entry    = _adapter->get_c2i_entry();
+    _from_compiled_entry    = adapter()->get_c2i_entry();
   }
   OrderAccess::storestore();
   _from_interpreted_entry = _i2i_entry;
@@ -903,47 +904,68 @@
   _code = NULL;
 }
 
+#if INCLUDE_CDS
 // Called by class data sharing to remove any entry points (which are not shared)
 void Method::unlink_method() {
   _code = NULL;
-  _i2i_entry = NULL;
-  _from_interpreted_entry = NULL;
+
+  assert(DumpSharedSpaces, "dump time only");
+  // Set the values to what they should be at run time. Note that
+  // this Method can no longer be executed during dump time.
+  _i2i_entry = Interpreter::entry_for_cds_method(this);
+  _from_interpreted_entry = _i2i_entry;
+
   if (is_native()) {
     *native_function_addr() = NULL;
     set_signature_handler(NULL);
   }
   NOT_PRODUCT(set_compiled_invocation_count(0);)
-  _adapter = NULL;
-  _from_compiled_entry = NULL;
+
+  CDSAdapterHandlerEntry* cds_adapter = (CDSAdapterHandlerEntry*)adapter();
+  constMethod()->set_adapter_trampoline(cds_adapter->get_adapter_trampoline());
+  _from_compiled_entry = cds_adapter->get_c2i_entry_trampoline();
+  assert(*((int*)_from_compiled_entry) == 0, "must be NULL during dump time, to be initialized at run time");
+
 
   // In case of DumpSharedSpaces, _method_data should always be NULL.
-  //
-  // During runtime (!DumpSharedSpaces), when we are cleaning a
-  // shared class that failed to load, this->link_method() may
-  // have already been called (before an exception happened), so
-  // this->_method_data may not be NULL.
-  assert(!DumpSharedSpaces || _method_data == NULL, "unexpected method data?");
+  assert(_method_data == NULL, "unexpected method data?");
 
   set_method_data(NULL);
   clear_method_counters();
 }
+#endif
 
 // Called when the method_holder is getting linked. Setup entrypoints so the method
 // is ready to be called from interpreter, compiler, and vtables.
 void Method::link_method(const methodHandle& h_method, TRAPS) {
   // If the code cache is full, we may reenter this function for the
   // leftover methods that weren't linked.
-  if (_i2i_entry != NULL) return;
+  if (is_shared()) {
+    if (adapter() != NULL) return;
+  } else {
+    if (_i2i_entry != NULL) return;
 
-  assert(_adapter == NULL, "init'd to NULL" );
+    assert(adapter() == NULL, "init'd to NULL" );
+  }
   assert( _code == NULL, "nothing compiled yet" );
 
   // Setup interpreter entrypoint
   assert(this == h_method(), "wrong h_method()" );
-  address entry = Interpreter::entry_for_method(h_method);
+  address entry;
+
+  if (this->is_shared()) {
+    entry = Interpreter::entry_for_cds_method(h_method);
+  } else {
+    entry = Interpreter::entry_for_method(h_method);
+  }
   assert(entry != NULL, "interpreter entry must be non-null");
-  // Sets both _i2i_entry and _from_interpreted_entry
-  set_interpreter_entry(entry);
+  if (is_shared()) {
+    assert(entry == _i2i_entry && entry == _from_interpreted_entry,
+           "should be correctly set during dump time");
+  } else {
+    // Sets both _i2i_entry and _from_interpreted_entry
+    set_interpreter_entry(entry);
+  }
 
   // Don't overwrite already registered native entries.
   if (is_native() && !has_native_function()) {
@@ -975,8 +997,13 @@
     THROW_MSG_NULL(vmSymbols::java_lang_VirtualMachineError(), "Out of space in CodeCache for adapters");
   }
 
-  mh->set_adapter_entry(adapter);
-  mh->_from_compiled_entry = adapter->get_c2i_entry();
+  if (mh->is_shared()) {
+    assert(mh->adapter() == adapter, "must be");
+    assert(mh->_from_compiled_entry != NULL, "must be"); // FIXME, the instructions also not NULL
+  } else {
+    mh->set_adapter_entry(adapter);
+    mh->_from_compiled_entry = adapter->get_c2i_entry();
+  }
   return adapter->get_c2i_entry();
 }
 
@@ -992,6 +1019,14 @@
   }
 }
 
+volatile address Method::from_compiled_entry_no_trampoline() const {
+  nmethod *code = (nmethod *)OrderAccess::load_ptr_acquire(&_code);
+  if (code) {
+    return code->verified_entry_point();
+  } else {
+    return adapter()->get_c2i_entry();
+  }
+}
 
 // The verified_code_entry() must be called when a invoke is resolved
 // on this method.
--- a/src/share/vm/oops/method.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/oops/method.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -93,8 +93,6 @@
 #endif
   // Entry point for calling both from and to the interpreter.
   address _i2i_entry;           // All-args-on-stack calling convention
-  // Adapter blob (i2c/c2i) for this Method*. Set once when method is linked.
-  AdapterHandlerEntry* _adapter;
   // Entry point for calling from compiled code, to compiled code if it exists
   // or else the interpreter.
   volatile address _from_compiled_entry;        // Cache of: _code ? _code->entry_point() : _adapter->c2i_entry()
@@ -137,6 +135,7 @@
 
   static address make_adapters(methodHandle mh, TRAPS);
   volatile address from_compiled_entry() const   { return (address)OrderAccess::load_ptr_acquire(&_from_compiled_entry); }
+  volatile address from_compiled_entry_no_trampoline() const;
   volatile address from_interpreted_entry() const{ return (address)OrderAccess::load_ptr_acquire(&_from_interpreted_entry); }
 
   // access flag
@@ -264,6 +263,7 @@
   int highest_osr_comp_level() const;
   void set_highest_osr_comp_level(int level);
 
+#if defined(COMPILER2) || INCLUDE_JVMCI
   // Count of times method was exited via exception while interpreting
   void interpreter_throwout_increment(TRAPS) {
     MethodCounters* mcs = get_method_counters(CHECK);
@@ -271,6 +271,7 @@
       mcs->interpreter_throwout_increment();
     }
   }
+#endif
 
   int  interpreter_throwout_count() const        {
     MethodCounters* mcs = method_counters();
@@ -407,11 +408,13 @@
       return (mcs == NULL) ? 0 : mcs->interpreter_invocation_count();
     }
   }
+#if defined(COMPILER2) || INCLUDE_JVMCI
   int increment_interpreter_invocation_count(TRAPS) {
     if (TieredCompilation) ShouldNotReachHere();
     MethodCounters* mcs = get_method_counters(CHECK_0);
     return (mcs == NULL) ? 0 : mcs->increment_interpreter_invocation_count();
   }
+#endif
 
 #ifndef PRODUCT
   int  compiled_invocation_count() const         { return _compiled_invocation_count;  }
@@ -431,15 +434,23 @@
   nmethod* volatile code() const                 { assert( check_code(), "" ); return (nmethod *)OrderAccess::load_ptr_acquire(&_code); }
   void clear_code();            // Clear out any compiled code
   static void set_code(methodHandle mh, nmethod* code);
-  void set_adapter_entry(AdapterHandlerEntry* adapter) {  _adapter = adapter; }
+  void set_adapter_entry(AdapterHandlerEntry* adapter) {
+    constMethod()->set_adapter_entry(adapter);
+  }
+  void update_adapter_trampoline(AdapterHandlerEntry* adapter) {
+    constMethod()->update_adapter_trampoline(adapter);
+  }
+
   address get_i2c_entry();
   address get_c2i_entry();
   address get_c2i_unverified_entry();
-  AdapterHandlerEntry* adapter() {  return _adapter; }
+  AdapterHandlerEntry* adapter() const {
+    return constMethod()->adapter();
+  }
   // setup entry points
   void link_method(const methodHandle& method, TRAPS);
-  // clear entry points. Used by sharing code
-  void unlink_method();
+  // clear entry points. Used by sharing code during dump time
+  void unlink_method() NOT_CDS_RETURN;
 
   // vtable index
   enum VtableIndexFlag {
@@ -465,7 +476,15 @@
   // interpreter entry
   address interpreter_entry() const              { return _i2i_entry; }
   // Only used when first initialize so we can set _i2i_entry and _from_interpreted_entry
-  void set_interpreter_entry(address entry)      { _i2i_entry = entry;  _from_interpreted_entry = entry; }
+  void set_interpreter_entry(address entry) {
+    assert(!is_shared(), "shared method's interpreter entry should not be changed at run time");
+    if (_i2i_entry != entry) {
+      _i2i_entry = entry;
+    }
+    if (_from_interpreted_entry != entry) {
+      _from_interpreted_entry = entry;
+    }
+  }
 
   // native function (used for native methods only)
   enum {
--- a/src/share/vm/oops/methodCounters.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/oops/methodCounters.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,8 +34,10 @@
  friend class VMStructs;
  friend class JVMCIVMStructs;
  private:
+#if defined(COMPILER2) || INCLUDE_JVMCI
   int               _interpreter_invocation_count; // Count of times invoked (reused as prev_event_count in tiered)
   u2                _interpreter_throwout_count; // Count of times method was exited via exception while interpreting
+#endif
   u2                _number_of_breakpoints;      // fullspeed debugging support
   InvocationCounter _invocation_counter;         // Incremented before each activation of the method - used to trigger frequency-based optimizations
   InvocationCounter _backedge_counter;           // Incremented before each backedge taken - used to trigger frequencey-based optimizations
@@ -60,9 +62,7 @@
   u1                _highest_osr_comp_level;      // Same for OSR level
 #endif
 
-  MethodCounters(methodHandle mh) : _interpreter_invocation_count(0),
-                                    _interpreter_throwout_count(0),
-                                    _number_of_breakpoints(0),
+  MethodCounters(methodHandle mh) : _number_of_breakpoints(0),
                                     _nmethod_age(INT_MAX)
 #ifdef TIERED
                                  , _rate(0),
@@ -71,6 +71,8 @@
                                    _highest_osr_comp_level(0)
 #endif
   {
+    set_interpreter_invocation_count(0);
+    set_interpreter_throwout_count(0);
     invocation_counter()->init();
     backedge_counter()->init();
 
@@ -109,6 +111,8 @@
 
   void clear_counters();
 
+#if defined(COMPILER2) || INCLUDE_JVMCI
+
   int interpreter_invocation_count() {
     return _interpreter_invocation_count;
   }
@@ -131,6 +135,24 @@
     _interpreter_throwout_count = count;
   }
 
+#else // defined(COMPILER2) || INCLUDE_JVMCI
+
+  int interpreter_invocation_count() {
+    return 0;
+  }
+  void set_interpreter_invocation_count(int count) {
+    assert(count == 0, "count must be 0");
+  }
+
+  int  interpreter_throwout_count() const {
+    return 0;
+  }
+  void set_interpreter_throwout_count(int count) {
+    assert(count == 0, "count must be 0");
+  }
+
+#endif // defined(COMPILER2) || INCLUDE_JVMCI
+
   u2   number_of_breakpoints() const   { return _number_of_breakpoints; }
   void incr_number_of_breakpoints()    { ++_number_of_breakpoints; }
   void decr_number_of_breakpoints()    { --_number_of_breakpoints; }
@@ -170,10 +192,25 @@
     return byte_offset_of(MethodCounters, _nmethod_age);
   }
 
+#if defined(COMPILER2) || INCLUDE_JVMCI
+
   static ByteSize interpreter_invocation_counter_offset() {
     return byte_offset_of(MethodCounters, _interpreter_invocation_count);
   }
 
+  static int interpreter_invocation_counter_offset_in_bytes() {
+    return offset_of(MethodCounters, _interpreter_invocation_count);
+  }
+
+#else // defined(COMPILER2) || INCLUDE_JVMCI
+
+  static ByteSize interpreter_invocation_counter_offset() {
+    ShouldNotReachHere();
+    return in_ByteSize(0);
+  }
+
+#endif // defined(COMPILER2) || INCLUDE_JVMCI
+
   static ByteSize invocation_counter_offset()    {
     return byte_offset_of(MethodCounters, _invocation_counter);
   }
@@ -182,10 +219,6 @@
     return byte_offset_of(MethodCounters, _backedge_counter);
   }
 
-  static int interpreter_invocation_counter_offset_in_bytes() {
-    return offset_of(MethodCounters, _interpreter_invocation_count);
-  }
-
   static ByteSize interpreter_invocation_limit_offset() {
     return byte_offset_of(MethodCounters, _interpreter_invocation_limit);
   }
--- a/src/share/vm/oops/methodData.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/oops/methodData.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1729,6 +1729,7 @@
 }
 
 void MethodData::clean_method_data(BoolObjectClosure* is_alive) {
+  ResourceMark rm;
   for (ProfileData* data = first_data();
        is_valid(data);
        data = next_data(data)) {
@@ -1745,6 +1746,7 @@
 }
 
 void MethodData::clean_weak_method_links() {
+  ResourceMark rm;
   for (ProfileData* data = first_data();
        is_valid(data);
        data = next_data(data)) {
@@ -1758,6 +1760,7 @@
 
 #ifdef ASSERT
 void MethodData::verify_clean_weak_method_links() {
+  ResourceMark rm;
   for (ProfileData* data = first_data();
        is_valid(data);
        data = next_data(data)) {
--- a/src/share/vm/oops/oop.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/oops/oop.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -30,7 +30,6 @@
 #include "memory/memRegion.hpp"
 #include "oops/metadata.hpp"
 #include "utilities/macros.hpp"
-#include "utilities/top.hpp"
 
 // oopDesc is the top baseclass for objects classes. The {name}Desc classes describe
 // the format of Java objects so the fields can be accessed from C++.
--- a/src/share/vm/oops/symbol.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/oops/symbol.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -25,9 +25,10 @@
 #ifndef SHARE_VM_OOPS_SYMBOL_HPP
 #define SHARE_VM_OOPS_SYMBOL_HPP
 
-#include "utilities/utf8.hpp"
 #include "memory/allocation.hpp"
 #include "runtime/atomic.hpp"
+#include "utilities/exceptions.hpp"
+#include "utilities/utf8.hpp"
 
 // A Symbol is a canonicalized string.
 // All Symbols reside in global SymbolTable and are reference counted.
--- a/src/share/vm/opto/c2_globals.hpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/opto/c2_globals.hpp	Thu Apr 14 14:05:09 2016 +0000
@@ -194,6 +194,9 @@
            "Map number of unrolls for main loop via "                       \
            "Superword Level Parallelism analysis")                          \
                                                                             \
+  product_pd(bool, PostLoopMultiversioning,                                 \
+           "Multi versioned post loops to eliminate range checks")          \
+                                                                            \
   notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false,                 \
           "Trace what Superword Level Parallelism analysis applies")        \
                                                                             \
@@ -229,21 +232,12 @@
   develop(bool, TraceLoopOpts, false,                                       \
           "Trace executed loop optimizations")                              \
                                                                             \
-  diagnostic(bool, LoopLimitCheck, true,                                    \
-          "Generate a loop limits check for overflow")                      \
-                                                                            \
   develop(bool, TraceLoopLimitCheck, false,                                 \
           "Trace generation of loop limits checks")                         \
                                                                             \
-  diagnostic(bool, RangeLimitCheck, true,                                   \
-          "Additional overflow checks during range check elimination")      \
-                                                                            \
   develop(bool, TraceRangeLimitCheck, false,                                \
           "Trace additional overflow checks in RCE")                        \
                                                                             \
-  diagnostic(bool, UnrollLimitCheck, true,                                  \
-          "Additional overflow checks during loop unroll")                  \
-                                                                            \
   /* OptimizeFill not yet supported on PowerPC. */                          \
   product(bool, OptimizeFill, true PPC64_ONLY(&& false),                    \
           "convert fill/copy loops into intrinsic")                         \
@@ -595,26 +589,26 @@
   product(bool, BlockLayoutRotateLoops, true,                               \
           "Allow back branches to be fall throughs in the block layour")    \
                                                                             \
-  diagnostic(bool, InlineReflectionGetCallerClass, true,                    \
+  develop(bool, InlineReflectionGetCallerClass, true,                       \
           "inline sun.reflect.Reflection.getCallerClass(), known to be "    \
           "part of base library DLL")                                       \
                                                                             \
-  diagnostic(bool, InlineObjectCopy, true,                                  \
+  develop(bool, InlineObjectCopy, true,                                     \
           "inline Object.clone and Arrays.copyOf[Range] intrinsics")        \
                                                                             \
-  diagnostic(bool, SpecialStringCompareTo, true,                            \
+  develop(bool, SpecialStringCompareTo, true,                               \
           "special version of string compareTo")                            \
                                                                             \
-  diagnostic(bool, SpecialStringIndexOf, true,                              \
+  develop(bool, SpecialStringIndexOf, true,                                 \
           "special version of string indexOf")                              \
                                                                             \
-  diagnostic(bool, SpecialStringEquals, true,                               \
+  develop(bool, SpecialStringEquals, true,                                  \
           "special version of string equals")                               \
                                                                             \
-  diagnostic(bool, SpecialArraysEquals, true,                               \
+  develop(bool, SpecialArraysEquals, true,                                  \
           "special version of Arrays.equals(char[],char[])")                \
                                                                             \
-  diagnostic(bool, SpecialEncodeISOArray, true,                             \
+  product(bool, SpecialEncodeISOArray, true,                                \
           "special version of ISO_8859_1$Encoder.encodeISOArray")           \
                                                                             \
   develop(bool, BailoutToInterpreterForThrows, false,                       \
@@ -716,22 +710,22 @@
   diagnostic(bool, OptimizeExpensiveOps, true,                              \
           "Find best control for expensive operations")                     \
                                                                             \
-  diagnostic(bool, UseMathExactIntrinsics, true,                            \
+  product(bool, UseMathExactIntrinsics, true,                               \
           "Enables intrinsification of various java.lang.Math functions")   \
                                                                             \
-  diagnostic(bool, UseMultiplyToLenIntrinsic, false,                        \
+  product(bool, UseMultiplyToLenIntrinsic, false,                           \
           "Enables intrinsification of BigInteger.multiplyToLen()")         \
                                                                             \
-  diagnostic(bool, UseSquareToLenIntrinsic, false,                          \
+  product(bool, UseSquareToLenIntrinsic, false,                             \
           "Enables intrinsification of BigInteger.squareToLen()")           \
                                                                             \
-  diagnostic(bool, UseMulAddIntrinsic, false,                               \
+  product(bool, UseMulAddIntrinsic, false,                                  \
           "Enables intrinsification of BigInteger.mulAdd()")                \
                                                                             \
-  diagnostic(bool, UseMontgomeryMultiplyIntrinsic, false,                   \
+  product(bool, UseMontgomeryMultiplyIntrinsic, false,                      \
           "Enables intrinsification of BigInteger.montgomeryMultiply()")    \
                                                                             \
-  diagnostic(bool, UseMontgomerySquareIntrinsic, false,                     \
+  product(bool, UseMontgomerySquareIntrinsic, false,                        \
           "Enables intrinsification of BigInteger.montgomerySquare()")      \
                                                                             \
   product(bool, UseTypeSpeculation, true,                                   \
--- a/src/share/vm/opto/graphKit.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/opto/graphKit.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1191,11 +1191,6 @@
                                   bool speculative) {
   assert(!assert_null || null_control == NULL, "not both at once");
   if (stopped())  return top();
-  if (!GenerateCompilerNullChecks && !assert_null && null_control == NULL) {
-    // For some performance testing, we may wish to suppress null checking.
-    value = cast_not_null(value);   // Make it appear to be non-null (4962416).
-    return value;
-  }
   NOT_PRODUCT(explicit_null_checks_inserted++);
 
   // Construct NULL check
@@ -1687,6 +1682,9 @@
   const Type* elemtype = arytype->elem();
   BasicType elembt = elemtype->array_element_basic_type();
   Node* adr = array_element_address(ary, idx, elembt, arytype->size());
+  if (elembt == T_NARROWOOP) {
+    elembt = T_OBJECT; // To satisfy switch in LoadNode::make()
+  }
   Node* ld = make_load(ctl, adr, elemtype, elembt, arytype, MemNode::unordered);
   return ld;
 }
@@ -3771,9 +3769,7 @@
     add_predicate_impl(Deoptimization::Reason_predicate, nargs);
   }
   // loop's limit check predicate should be near the loop.
-  if (LoopLimitCheck) {
-    add_predicate_impl(Deoptimization::Reason_loop_limit_check, nargs);
-  }
+  add_predicate_impl(Deoptimization::Reason_loop_limit_check, nargs);
 }
 
 //----------------------------- store barriers ----------------------------
--- a/src/share/vm/opto/library_call.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/opto/library_call.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -6273,7 +6273,20 @@
 
 //------------------------------get_key_start_from_aescrypt_object-----------------------
 Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) {
+#ifdef PPC64
+  // MixColumns for decryption can be reduced by preprocessing MixColumns with round keys.
+  // Intel's extention is based on this optimization and AESCrypt generates round keys by preprocessing MixColumns.
+  // However, ppc64 vncipher processes MixColumns and requires the same round keys with encryption.
+  // The ppc64 stubs of encryption and decryption use the same round keys (sessionK[0]).
+  Node* objSessionK = load_field_from_object(aescrypt_object, "sessionK", "[[I", /*is_exact*/ false);
+  assert (objSessionK != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
+  if (objSessionK == NULL) {
+    return (Node *) NULL;
+  }
+  Node* objAESCryptKey = load_array_element(control(), objSessionK, intcon(0), TypeAryPtr::OOPS);
+#else
   Node* objAESCryptKey = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false);
+#endif // PPC64
   assert (objAESCryptKey != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
   if (objAESCryptKey == NULL) return (Node *) NULL;
 
--- a/src/share/vm/opto/loopPredicate.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/opto/loopPredicate.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -313,11 +313,9 @@
   // Search original predicates
   Node* entry = old_entry;
   ProjNode* limit_check_proj = NULL;
-  if (LoopLimitCheck) {
-    limit_check_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
-    if (limit_check_proj != NULL) {
-      entry = entry->in(0)->in(0);
-    }
+  limit_check_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+  if (limit_check_proj != NULL) {
+    entry = entry->in(0)->in(0);
   }
   if (UseLoopPredicate) {
     ProjNode* predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
@@ -353,11 +351,9 @@
 // Skip related predicates.
 Node* PhaseIdealLoop::skip_loop_predicates(Node* entry) {
   Node* predicate = NULL;
-  if (LoopLimitCheck) {
-    predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
-    if (predicate != NULL) {
-      entry = entry->in(0)->in(0);
-    }
+  predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+  if (predicate != NULL) {
+    entry = entry->in(0)->in(0);
   }
   if (UseLoopPredicate) {
     predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
@@ -393,11 +389,9 @@
 // Find a predicate
 Node* PhaseIdealLoop::find_predicate(Node* entry) {
   Node* predicate = NULL;
-  if (LoopLimitCheck) {
-    predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
-    if (predicate != NULL) { // right pattern that can be used by loop predication
-      return entry;
-    }
+  predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+  if (predicate != NULL) { // right pattern that can be used by loop predication
+    return entry;
   }
   if (UseLoopPredicate) {
     predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
@@ -646,19 +640,13 @@
   Node* max_idx_expr  = init;
   int stride_con = stride->get_int();
   if ((stride_con > 0) == (scale > 0) == upper) {
-    if (LoopLimitCheck) {
-      // With LoopLimitCheck limit is not exact.
-      // Calculate exact limit here.
-      // Note, counted loop's test is '<' or '>'.
-      limit = exact_limit(loop);
-      max_idx_expr = new SubINode(limit, stride);
-      register_new_node(max_idx_expr, ctrl);
-      if (TraceLoopPredicate) predString->print("(limit - stride) ");
-    } else {
-      max_idx_expr = new SubINode(limit, stride);
-      register_new_node(max_idx_expr, ctrl);
-      if (TraceLoopPredicate) predString->print("(limit - stride) ");
-    }
+    // Limit is not exact.
+    // Calculate exact limit here.
+    // Note, counted loop's test is '<' or '>'.
+    limit = exact_limit(loop);
+    max_idx_expr = new SubINode(limit, stride);
+    register_new_node(max_idx_expr, ctrl);
+    if (TraceLoopPredicate) predString->print("(limit - stride) ");
   } else {
     if (TraceLoopPredicate) predString->print("init ");
   }
@@ -721,12 +709,9 @@
   Node* entry = head->in(LoopNode::EntryControl);
   ProjNode *predicate_proj = NULL;
   // Loop limit check predicate should be near the loop.
-  if (LoopLimitCheck) {
-    predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
-    if (predicate_proj != NULL)
-      entry = predicate_proj->in(0)->in(0);
-  }
-
+  predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check);
+  if (predicate_proj != NULL)
+    entry = predicate_proj->in(0)->in(0);
   predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate);
   if (!predicate_proj) {
 #ifndef PRODUCT
--- a/src/share/vm/opto/loopTransform.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/opto/loopTransform.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -1027,82 +1027,9 @@
     _igvn.replace_input_of(bol, 1, cmp);
   }
 
-  //------------------------------
-  // Step A: Create Post-Loop.
-  Node* main_exit = main_end->proj_out(false);
-  assert( main_exit->Opcode() == Op_IfFalse, "" );
-  int dd_main_exit = dom_depth(main_exit);
-
-  // Step A1: Clone the loop body.  The clone becomes the post-loop.  The main
-  // loop pre-header illegally has 2 control users (old & new loops).
-  clone_loop( loop, old_new, dd_main_exit );
-  assert( old_new[main_end ->_idx]->Opcode() == Op_CountedLoopEnd, "" );
-  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
-  post_head->set_post_loop(main_head);
-
-  // Reduce the post-loop trip count.
-  CountedLoopEndNode* post_end = old_new[main_end ->_idx]->as_CountedLoopEnd();
-  post_end->_prob = PROB_FAIR;
-
-  // Build the main-loop normal exit.
-  IfFalseNode *new_main_exit = new IfFalseNode(main_end);
-  _igvn.register_new_node_with_optimizer( new_main_exit );
-  set_idom(new_main_exit, main_end, dd_main_exit );
-  set_loop(new_main_exit, loop->_parent);
-
-  // Step A2: Build a zero-trip guard for the post-loop.  After leaving the
-  // main-loop, the post-loop may not execute at all.  We 'opaque' the incr
-  // (the main-loop trip-counter exit value) because we will be changing
-  // the exit value (via unrolling) so we cannot constant-fold away the zero
-  // trip guard until all unrolling is done.
-  Node *zer_opaq = new Opaque1Node(C, incr);
-  Node *zer_cmp  = new CmpINode( zer_opaq, limit );
-  Node *zer_bol  = new BoolNode( zer_cmp, b_test );
-  register_new_node( zer_opaq, new_main_exit );
-  register_new_node( zer_cmp , new_main_exit );
-  register_new_node( zer_bol , new_main_exit );
-
-  // Build the IfNode
-  IfNode *zer_iff = new IfNode( new_main_exit, zer_bol, PROB_FAIR, COUNT_UNKNOWN );
-  _igvn.register_new_node_with_optimizer( zer_iff );
-  set_idom(zer_iff, new_main_exit, dd_main_exit);
-  set_loop(zer_iff, loop->_parent);
-
-  // Plug in the false-path, taken if we need to skip post-loop
-  _igvn.replace_input_of(main_exit, 0, zer_iff);
-  set_idom(main_exit, zer_iff, dd_main_exit);
-  set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
-  // Make the true-path, must enter the post loop
-  Node *zer_taken = new IfTrueNode( zer_iff );
-  _igvn.register_new_node_with_optimizer( zer_taken );
-  set_idom(zer_taken, zer_iff, dd_main_exit);
-  set_loop(zer_taken, loop->_parent);
-  // Plug in the true path
-  _igvn.hash_delete( post_head );
-  post_head->set_req(LoopNode::EntryControl, zer_taken);
-  set_idom(post_head, zer_taken, dd_main_exit);
-
-  Arena *a = Thread::current()->resource_area();
-  VectorSet visited(a);
-  Node_Stack clones(a, main_head->back_control()->outcnt());
-  // Step A3: Make the fall-in values to the post-loop come from the
-  // fall-out values of the main-loop.
-  for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
-    Node* main_phi = main_head->fast_out(i);
-    if( main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() >0 ) {
-      Node *post_phi = old_new[main_phi->_idx];
-      Node *fallmain  = clone_up_backedge_goo(main_head->back_control(),
-                                              post_head->init_control(),
-                                              main_phi->in(LoopNode::LoopBackControl),
-                                              visited, clones);
-      _igvn.hash_delete(post_phi);
-      post_phi->set_req( LoopNode::EntryControl, fallmain );
-    }
-  }
-
-  // Update local caches for next stanza
-  main_exit = new_main_exit;
-
+  // Add the post loop
+  CountedLoopNode *post_head = NULL;
+  Node *main_exit = insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
 
   //------------------------------
   // Step B: Create Pre-Loop.
@@ -1158,8 +1085,9 @@
   main_head->set_req(LoopNode::EntryControl, min_taken);
   set_idom(main_head, min_taken, dd_main_head);
 
-  visited.Clear();
-  clones.clear();
+  Arena *a = Thread::current()->resource_area();
+  VectorSet visited(a);
+  Node_Stack clones(a, main_head->back_control()->outcnt());
   // Step B3: Make the fall-in values to the main-loop come from the
   // fall-out values of the pre-loop.
   for (DUIterator_Fast i2max, i2 = main_head->fast_outs(i2max); i2 < i2max; i2++) {
@@ -1185,12 +1113,8 @@
   // variable value and the induction variable Phi to preserve correct
   // dependencies.
 
-  // CastII for the post loop:
-  bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
-  assert(inserted, "no castII inserted");
-
   // CastII for the main loop:
-  inserted = cast_incr_before_loop(pre_incr, min_taken, main_head);
+  bool inserted = cast_incr_before_loop( pre_incr, min_taken, main_head );
   assert(inserted, "no castII inserted");
 
   // Step B4: Shorten the pre-loop to run only 1 iteration (for now).
@@ -1298,19 +1222,82 @@
   guarantee(main_end != NULL, "no loop exit node");
   // diagnostic to show loop end is not properly formed
   assert(main_end->outcnt() == 2, "1 true, 1 false path only");
-  uint dd_main_head = dom_depth(main_head);
-  uint max = main_head->outcnt();
 
   // mark this loop as processed
   main_head->mark_has_atomic_post_loop();
 
-  Node *pre_header = main_head->in(LoopNode::EntryControl);
-  Node *init = main_head->init_trip();
   Node *incr = main_end->incr();
   Node *limit = main_end->limit();
-  Node *stride = main_end->stride();
-  Node *cmp = main_end->cmp_node();
-  BoolTest::mask b_test = main_end->test_trip();
+
+  // In this case we throw away the result as we are not using it to connect anything else.
+  CountedLoopNode *post_head = NULL;
+  insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops.  They are usually very short,
+  // so guess that unit vector trips is a reasonable value.
+  post_head->set_profile_trip_cnt(cur_unroll);
+
+  // Now force out all loop-invariant dominating tests.  The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
+
+//-------------------------insert_scalar_rced_post_loop------------------------
+// Insert a copy of the rce'd main loop as a post loop,
+// We have not unrolled the main loop, so this is the right time to inject this.
+// Later we will examine the partner of this post loop pair which still has range checks
+// to see inject code which tests at runtime if the range checks are applicable.
+void PhaseIdealLoop::insert_scalar_rced_post_loop(IdealLoopTree *loop, Node_List &old_new) {
+  if (!loop->_head->is_CountedLoop()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // only process RCE'd main loops
+  if (!cl->is_main_loop() || cl->range_checks_present()) return;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostScalarRce  ");
+    loop->dump_head();
+  }
+#endif
+  C->set_major_progress();
+
+  // Find common pieces of the loop being guarded with pre & post loops
+  CountedLoopNode *main_head = loop->_head->as_CountedLoop();
+  CountedLoopEndNode *main_end = main_head->loopexit();
+  guarantee(main_end != NULL, "no loop exit node");
+  // diagnostic to show loop end is not properly formed
+  assert(main_end->outcnt() == 2, "1 true, 1 false path only");
+
+  Node *incr = main_end->incr();
+  Node *limit = main_end->limit();
+
+  // In this case we throw away the result as we are not using it to connect anything else.
+  CountedLoopNode *post_head = NULL;
+  insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops.  They are usually very short,
+  // so guess that unit vector trips is a reasonable value.
+  post_head->set_profile_trip_cnt(4.0);
+  post_head->set_is_rce_post_loop();
+
+  // Now force out all loop-invariant dominating tests.  The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
+
+//------------------------------insert_post_loop-------------------------------
+// Insert post loops.  Add a post loop to the given loop passed.
+Node *PhaseIdealLoop::insert_post_loop(IdealLoopTree *loop, Node_List &old_new,
+                                       CountedLoopNode *main_head, CountedLoopEndNode *main_end,
+                                       Node *incr, Node *limit, CountedLoopNode *&post_head) {
 
   //------------------------------
   // Step A: Create a new post-Loop.
@@ -1322,7 +1309,7 @@
   // The main loop pre-header illegally has 2 control users (old & new loops).
   clone_loop(loop, old_new, dd_main_exit);
   assert(old_new[main_end->_idx]->Opcode() == Op_CountedLoopEnd, "");
-  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
+  post_head = old_new[main_head->_idx]->as_CountedLoop();
   post_head->set_normal_loop();
   post_head->set_post_loop(main_head);
 
@@ -1336,14 +1323,14 @@
   set_idom(new_main_exit, main_end, dd_main_exit);
   set_loop(new_main_exit, loop->_parent);
 
-  // Step A2: Build a zero-trip guard for the vector post-loop.  After leaving the
-  // main-loop, the vector post-loop may not execute at all.  We 'opaque' the incr
-  // (the vectorized main-loop trip-counter exit value) because we will be changing
+  // Step A2: Build a zero-trip guard for the post-loop.  After leaving the
+  // main-loop, the post-loop may not execute at all.  We 'opaque' the incr
+  // (the previous loop trip-counter exit value) because we will be changing
   // the exit value (via additional unrolling) so we cannot constant-fold away the zero
   // trip guard until all unrolling is done.
   Node *zer_opaq = new Opaque1Node(C, incr);
   Node *zer_cmp = new CmpINode(zer_opaq, limit);
-  Node *zer_bol = new BoolNode(zer_cmp, b_test);
+  Node *zer_bol = new BoolNode(zer_cmp, main_end->test_trip());
   register_new_node(zer_opaq, new_main_exit);
   register_new_node(zer_cmp, new_main_exit);
   register_new_node(zer_bol, new_main_exit);
@@ -1354,11 +1341,11 @@
   set_idom(zer_iff, new_main_exit, dd_main_exit);
   set_loop(zer_iff, loop->_parent);
 
-  // Plug in the false-path, taken if we need to skip vector post-loop
+  // Plug in the false-path, taken if we need to skip this post-loop
   _igvn.replace_input_of(main_exit, 0, zer_iff);
   set_idom(main_exit, zer_iff, dd_main_exit);
   set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
-  // Make the true-path, must enter the vector post loop
+  // Make the true-path, must enter this post loop
   Node *zer_taken = new IfTrueNode(zer_iff);
   _igvn.register_new_node_with_optimizer(zer_taken);
   set_idom(zer_taken, zer_iff, dd_main_exit);
@@ -1371,7 +1358,7 @@
   Arena *a = Thread::current()->resource_area();
   VectorSet visited(a);
   Node_Stack clones(a, main_head->back_control()->outcnt());
-  // Step A3: Make the fall-in values to the vector post-loop come from the
+  // Step A3: Make the fall-in values to the post-loop come from the
   // fall-out values of the main-loop.
   for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
     Node* main_phi = main_head->fast_out(i);
@@ -1390,15 +1377,7 @@
   bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
   assert(inserted, "no castII inserted");
 
-  // It's difficult to be precise about the trip-counts
-  // for post loops.  They are usually very short,
-  // so guess that unit vector trips is a reasonable value.
-  post_head->set_profile_trip_cnt((float)slp_max_unroll_factor);
-
-  // Now force out all loop-invariant dominating tests.  The optimizer
-  // finds some, but we _know_ they are all useless.
-  peeled_dom_test_elim(loop, old_new);
-  loop->record_for_igvn();
+  return new_main_exit;
 }
 
 //------------------------------is_invariant-----------------------------
@@ -1457,7 +1436,7 @@
     // Check the shape of the graph at the loop entry. If an inappropriate
     // graph shape is encountered, the compiler bails out loop unrolling;
     // compilation of the method will still succeed.
-    if (!is_canonical_main_loop_entry(loop_head)) {
+    if (!is_canonical_loop_entry(loop_head)) {
       return;
     }
     opaq = ctrl->in(0)->in(1)->in(1)->in(2);
@@ -1468,209 +1447,156 @@
   C->set_major_progress();
 
   Node* new_limit = NULL;
-  if (UnrollLimitCheck) {
-    int stride_con = stride->get_int();
-    int stride_p = (stride_con > 0) ? stride_con : -stride_con;
-    uint old_trip_count = loop_head->trip_count();
-    // Verify that unroll policy result is still valid.
-    assert(old_trip_count > 1 &&
-           (!adjust_min_trip || stride_p <= (1<<3)*loop_head->unrolled_count()), "sanity");
-
-    // Adjust loop limit to keep valid iterations number after unroll.
-    // Use (limit - stride) instead of (((limit - init)/stride) & (-2))*stride
-    // which may overflow.
-    if (!adjust_min_trip) {
-      assert(old_trip_count > 1 && (old_trip_count & 1) == 0,
-             "odd trip count for maximally unroll");
-      // Don't need to adjust limit for maximally unroll since trip count is even.
-    } else if (loop_head->has_exact_trip_count() && init->is_Con()) {
-      // Loop's limit is constant. Loop's init could be constant when pre-loop
-      // become peeled iteration.
-      jlong init_con = init->get_int();
-      // We can keep old loop limit if iterations count stays the same:
-      //   old_trip_count == new_trip_count * 2
-      // Note: since old_trip_count >= 2 then new_trip_count >= 1
-      // so we also don't need to adjust zero trip test.
-      jlong limit_con  = limit->get_int();
-      // (stride_con*2) not overflow since stride_con <= 8.
-      int new_stride_con = stride_con * 2;
-      int stride_m    = new_stride_con - (stride_con > 0 ? 1 : -1);
-      jlong trip_count = (limit_con - init_con + stride_m)/new_stride_con;
-      // New trip count should satisfy next conditions.
-      assert(trip_count > 0 && (julong)trip_count < (julong)max_juint/2, "sanity");
-      uint new_trip_count = (uint)trip_count;
-      adjust_min_trip = (old_trip_count != new_trip_count*2);
+  int stride_con = stride->get_int();
+  int stride_p = (stride_con > 0) ? stride_con : -stride_con;
+  uint old_trip_count = loop_head->trip_count();
+  // Verify that unroll policy result is still valid.
+  assert(old_trip_count > 1 &&
+      (!adjust_min_trip || stride_p <= (1<<3)*loop_head->unrolled_count()), "sanity");
+
+  // Adjust loop limit to keep valid iterations number after unroll.
+  // Use (limit - stride) instead of (((limit - init)/stride) & (-2))*stride
+  // which may overflow.
+  if (!adjust_min_trip) {
+    assert(old_trip_count > 1 && (old_trip_count & 1) == 0,
+        "odd trip count for maximally unroll");
+    // Don't need to adjust limit for maximally unroll since trip count is even.
+  } else if (loop_head->has_exact_trip_count() && init->is_Con()) {
+    // Loop's limit is constant. Loop's init could be constant when pre-loop
+    // become peeled iteration.
+    jlong init_con = init->get_int();
+    // We can keep old loop limit if iterations count stays the same:
+    //   old_trip_count == new_trip_count * 2
+    // Note: since old_trip_count >= 2 then new_trip_count >= 1
+    // so we also don't need to adjust zero trip test.
+    jlong limit_con  = limit->get_int();
+    // (stride_con*2) not overflow since stride_con <= 8.
+    int new_stride_con = stride_con * 2;
+    int stride_m    = new_stride_con - (stride_con > 0 ? 1 : -1);
+    jlong trip_count = (limit_con - init_con + stride_m)/new_stride_con;
+    // New trip count should satisfy next conditions.
+    assert(trip_count > 0 && (julong)trip_count < (julong)max_juint/2, "sanity");
+    uint new_trip_count = (uint)trip_count;
+    adjust_min_trip = (old_trip_count != new_trip_count*2);
+  }
+
+  if (adjust_min_trip) {
+    // Step 2: Adjust the trip limit if it is called for.
+    // The adjustment amount is -stride. Need to make sure if the
+    // adjustment underflows or overflows, then the main loop is skipped.
+    Node* cmp = loop_end->cmp_node();
+    assert(cmp->in(2) == limit, "sanity");
+    assert(opaq != NULL && opaq->in(1) == limit, "sanity");
+
+    // Verify that policy_unroll result is still valid.
+    const TypeInt* limit_type = _igvn.type(limit)->is_int();
+    assert(stride_con > 0 && ((limit_type->_hi - stride_con) < limit_type->_hi) ||
+        stride_con < 0 && ((limit_type->_lo - stride_con) > limit_type->_lo), "sanity");
+
+    if (limit->is_Con()) {
+      // The check in policy_unroll and the assert above guarantee
+      // no underflow if limit is constant.
+      new_limit = _igvn.intcon(limit->get_int() - stride_con);
+      set_ctrl(new_limit, C->root());
+    } else {
+      // Limit is not constant.
+      if (loop_head->unrolled_count() == 1) { // only for first unroll
+        // Separate limit by Opaque node in case it is an incremented
+        // variable from previous loop to avoid using pre-incremented
+        // value which could increase register pressure.
+        // Otherwise reorg_offsets() optimization will create a separate
+        // Opaque node for each use of trip-counter and as result
+        // zero trip guard limit will be different from loop limit.
+        assert(has_ctrl(opaq), "should have it");
+        Node* opaq_ctrl = get_ctrl(opaq);
+        limit = new Opaque2Node( C, limit );
+        register_new_node( limit, opaq_ctrl );
+      }
+      if (stride_con > 0 && (java_subtract(limit_type->_lo, stride_con) < limit_type->_lo) ||
+          stride_con < 0 && (java_subtract(limit_type->_hi, stride_con) > limit_type->_hi)) {
+        // No underflow.
+        new_limit = new SubINode(limit, stride);
+      } else {
+        // (limit - stride) may underflow.
+        // Clamp the adjustment value with MININT or MAXINT:
+        //
+        //   new_limit = limit-stride
+        //   if (stride > 0)
+        //     new_limit = (limit < new_limit) ? MININT : new_limit;
+        //   else
+        //     new_limit = (limit > new_limit) ? MAXINT : new_limit;
+        //
+        BoolTest::mask bt = loop_end->test_trip();
+        assert(bt == BoolTest::lt || bt == BoolTest::gt, "canonical test is expected");
+        Node* adj_max = _igvn.intcon((stride_con > 0) ? min_jint : max_jint);
+        set_ctrl(adj_max, C->root());
+        Node* old_limit = NULL;
+        Node* adj_limit = NULL;
+        Node* bol = limit->is_CMove() ? limit->in(CMoveNode::Condition) : NULL;
+        if (loop_head->unrolled_count() > 1 &&
+            limit->is_CMove() && limit->Opcode() == Op_CMoveI &&
+            limit->in(CMoveNode::IfTrue) == adj_max &&
+            bol->as_Bool()->_test._test == bt &&
+            bol->in(1)->Opcode() == Op_CmpI &&
+            bol->in(1)->in(2) == limit->in(CMoveNode::IfFalse)) {
+          // Loop was unrolled before.
+          // Optimize the limit to avoid nested CMove:
+          // use original limit as old limit.
+          old_limit = bol->in(1)->in(1);
+          // Adjust previous adjusted limit.
+          adj_limit = limit->in(CMoveNode::IfFalse);
+          adj_limit = new SubINode(adj_limit, stride);
+        } else {
+          old_limit = limit;
+          adj_limit = new SubINode(limit, stride);
+        }
+        assert(old_limit != NULL && adj_limit != NULL, "");
+        register_new_node( adj_limit, ctrl ); // adjust amount
+        Node* adj_cmp = new CmpINode(old_limit, adj_limit);
+        register_new_node( adj_cmp, ctrl );
+        Node* adj_bool = new BoolNode(adj_cmp, bt);
+        register_new_node( adj_bool, ctrl );
+        new_limit = new CMoveINode(adj_bool, adj_limit, adj_max, TypeInt::INT);
+      }
+      register_new_node(new_limit, ctrl);
     }
-
-    if (adjust_min_trip) {
-      // Step 2: Adjust the trip limit if it is called for.
-      // The adjustment amount is -stride. Need to make sure if the
-      // adjustment underflows or overflows, then the main loop is skipped.
-      Node* cmp = loop_end->cmp_node();
-      assert(cmp->in(2) == limit, "sanity");
-      assert(opaq != NULL && opaq->in(1) == limit, "sanity");
-
-      // Verify that policy_unroll result is still valid.
-      const TypeInt* limit_type = _igvn.type(limit)->is_int();
-      assert(stride_con > 0 && ((limit_type->_hi - stride_con) < limit_type->_hi) ||
-             stride_con < 0 && ((limit_type->_lo - stride_con) > limit_type->_lo), "sanity");
-
-      if (limit->is_Con()) {
-        // The check in policy_unroll and the assert above guarantee
-        // no underflow if limit is constant.
-        new_limit = _igvn.intcon(limit->get_int() - stride_con);
-        set_ctrl(new_limit, C->root());
-      } else {
-        // Limit is not constant.
-        if (loop_head->unrolled_count() == 1) { // only for first unroll
-          // Separate limit by Opaque node in case it is an incremented
-          // variable from previous loop to avoid using pre-incremented
-          // value which could increase register pressure.
-          // Otherwise reorg_offsets() optimization will create a separate
-          // Opaque node for each use of trip-counter and as result
-          // zero trip guard limit will be different from loop limit.
-          assert(has_ctrl(opaq), "should have it");
-          Node* opaq_ctrl = get_ctrl(opaq);
-          limit = new Opaque2Node( C, limit );
-          register_new_node( limit, opaq_ctrl );
-        }
-        if (stride_con > 0 && (java_subtract(limit_type->_lo, stride_con) < limit_type->_lo) ||
-            stride_con < 0 && (java_subtract(limit_type->_hi, stride_con) > limit_type->_hi)) {
-          // No underflow.
-          new_limit = new SubINode(limit, stride);
-        } else {
-          // (limit - stride) may underflow.
-          // Clamp the adjustment value with MININT or MAXINT:
-          //
-          //   new_limit = limit-stride
-          //   if (stride > 0)
-          //     new_limit = (limit < new_limit) ? MININT : new_limit;
-          //   else
-          //     new_limit = (limit > new_limit) ? MAXINT : new_limit;
-          //
-          BoolTest::mask bt = loop_end->test_trip();
-          assert(bt == BoolTest::lt || bt == BoolTest::gt, "canonical test is expected");
-          Node* adj_max = _igvn.intcon((stride_con > 0) ? min_jint : max_jint);
-          set_ctrl(adj_max, C->root());
-          Node* old_limit = NULL;
-          Node* adj_limit = NULL;
-          Node* bol = limit->is_CMove() ? limit->in(CMoveNode::Condition) : NULL;
-          if (loop_head->unrolled_count() > 1 &&
-              limit->is_CMove() && limit->Opcode() == Op_CMoveI &&
-              limit->in(CMoveNode::IfTrue) == adj_max &&
-              bol->as_Bool()->_test._test == bt &&
-              bol->in(1)->Opcode() == Op_CmpI &&
-              bol->in(1)->in(2) == limit->in(CMoveNode::IfFalse)) {
-            // Loop was unrolled before.
-            // Optimize the limit to avoid nested CMove:
-            // use original limit as old limit.
-            old_limit = bol->in(1)->in(1);
-            // Adjust previous adjusted limit.
-            adj_limit = limit->in(CMoveNode::IfFalse);
-            adj_limit = new SubINode(adj_limit, stride);
-          } else {
-            old_limit = limit;
-            adj_limit = new SubINode(limit, stride);
-          }
-          assert(old_limit != NULL && adj_limit != NULL, "");
-          register_new_node( adj_limit, ctrl ); // adjust amount
-          Node* adj_cmp = new CmpINode(old_limit, adj_limit);
-          register_new_node( adj_cmp, ctrl );
-          Node* adj_bool = new BoolNode(adj_cmp, bt);
-          register_new_node( adj_bool, ctrl );
-          new_limit = new CMoveINode(adj_bool, adj_limit, adj_max, TypeInt::INT);
-        }
-        register_new_node(new_limit, ctrl);
-      }
-      assert(new_limit != NULL, "");
-      // Replace in loop test.
-      assert(loop_end->in(1)->in(1) == cmp, "sanity");
-      if (cmp->outcnt() == 1 && loop_end->in(1)->outcnt() == 1) {
-        // Don't need to create new test since only one user.
-        _igvn.hash_delete(cmp);
-        cmp->set_req(2, new_limit);
-      } else {
-        // Create new test since it is shared.
-        Node* ctrl2 = loop_end->in(0);
-        Node* cmp2  = cmp->clone();
-        cmp2->set_req(2, new_limit);
-        register_new_node(cmp2, ctrl2);
-        Node* bol2 = loop_end->in(1)->clone();
-        bol2->set_req(1, cmp2);
-        register_new_node(bol2, ctrl2);
-        _igvn.replace_input_of(loop_end, 1, bol2);
-      }
-      // Step 3: Find the min-trip test guaranteed before a 'main' loop.
-      // Make it a 1-trip test (means at least 2 trips).
-
-      // Guard test uses an 'opaque' node which is not shared.  Hence I
-      // can edit it's inputs directly.  Hammer in the new limit for the
-      // minimum-trip guard.
-      assert(opaq->outcnt() == 1, "");
-      _igvn.replace_input_of(opaq, 1, new_limit);
+    assert(new_limit != NULL, "");
+    // Replace in loop test.
+    assert(loop_end->in(1)->in(1) == cmp, "sanity");
+    if (cmp->outcnt() == 1 && loop_end->in(1)->outcnt() == 1) {
+      // Don't need to create new test since only one user.
+      _igvn.hash_delete(cmp);
+      cmp->set_req(2, new_limit);
+    } else {
+      // Create new test since it is shared.
+      Node* ctrl2 = loop_end->in(0);
+      Node* cmp2  = cmp->clone();
+      cmp2->set_req(2, new_limit);
+      register_new_node(cmp2, ctrl2);
+      Node* bol2 = loop_end->in(1)->clone();
+      bol2->set_req(1, cmp2);
+      register_new_node(bol2, ctrl2);
+      _igvn.replace_input_of(loop_end, 1, bol2);
     }
-
-    // Adjust max trip count. The trip count is intentionally rounded
-    // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
-    // the main, unrolled, part of the loop will never execute as it is protected
-    // by the min-trip test.  See bug 4834191 for a case where we over-unrolled
-    // and later determined that part of the unrolled loop was dead.
-    loop_head->set_trip_count(old_trip_count / 2);
-
-    // Double the count of original iterations in the unrolled loop body.
-    loop_head->double_unrolled_count();
-
-  } else { // LoopLimitCheck
-
-    // Adjust max trip count. The trip count is intentionally rounded
-    // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
-    // the main, unrolled, part of the loop will never execute as it is protected
-    // by the min-trip test.  See bug 4834191 for a case where we over-unrolled
-    // and later determined that part of the unrolled loop was dead.
-    loop_head->set_trip_count(loop_head->trip_count() / 2);
-
-    // Double the count of original iterations in the unrolled loop body.
-    loop_head->double_unrolled_count();
-
-    // -----------
-    // Step 2: Cut back the trip counter for an unroll amount of 2.
-    // Loop will normally trip (limit - init)/stride_con.  Since it's a
-    // CountedLoop this is exact (stride divides limit-init exactly).
-    // We are going to double the loop body, so we want to knock off any
-    // odd iteration: (trip_cnt & ~1).  Then back compute a new limit.
-    Node *span = new SubINode( limit, init );
-    register_new_node( span, ctrl );
-    Node *trip = new DivINode( 0, span, stride );
-    register_new_node( trip, ctrl );
-    Node *mtwo = _igvn.intcon(-2);
-    set_ctrl(mtwo, C->root());
-    Node *rond = new AndINode( trip, mtwo );
-    register_new_node( rond, ctrl );
-    Node *spn2 = new MulINode( rond, stride );
-    register_new_node( spn2, ctrl );
-    new_limit = new AddINode( spn2, init );
-    register_new_node( new_limit, ctrl );
-
-    // Hammer in the new limit
-    Node *ctrl2 = loop_end->in(0);
-    Node *cmp2 = new CmpINode( loop_head->incr(), new_limit );
-    register_new_node( cmp2, ctrl2 );
-    Node *bol2 = new BoolNode( cmp2, loop_end->test_trip() );
-    register_new_node( bol2, ctrl2 );
-    _igvn.replace_input_of(loop_end, CountedLoopEndNode::TestValue, bol2);
-
     // Step 3: Find the min-trip test guaranteed before a 'main' loop.
     // Make it a 1-trip test (means at least 2 trips).
-    if( adjust_min_trip ) {
-      assert( new_limit != NULL, "" );
-      // Guard test uses an 'opaque' node which is not shared.  Hence I
-      // can edit it's inputs directly.  Hammer in the new limit for the
-      // minimum-trip guard.
-      assert( opaq->outcnt() == 1, "" );
-      _igvn.hash_delete(opaq);
-      opaq->set_req(1, new_limit);
-    }
-  } // LoopLimitCheck
+
+    // Guard test uses an 'opaque' node which is not shared.  Hence I
+    // can edit it's inputs directly.  Hammer in the new limit for the
+    // minimum-trip guard.
+    assert(opaq->outcnt() == 1, "");
+    _igvn.replace_input_of(opaq, 1, new_limit);
+  }
+
+  // Adjust max trip count. The trip count is intentionally rounded
+  // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
+  // the main, unrolled, part of the loop will never execute as it is protected
+  // by the min-trip test.  See bug 4834191 for a case where we over-unrolled
+  // and later determined that part of the unrolled loop was dead.
+  loop_head->set_trip_count(old_trip_count / 2);
+
+  // Double the count of original iterations in the unrolled loop body.
+  loop_head->double_unrolled_count();
 
   // ---------
   // Step 4: Clone the loop body.  Move it inside the loop.  This loop body
@@ -1904,7 +1830,6 @@
     //   )
 
     if (low_limit->get_int() == -max_jint) {
-      if (!RangeLimitCheck) return;
       // We need this guard when scale*pre_limit+offset >= limit
       // due to underflow. So we need execute pre-loop until
       // scale*I+offset >= min_int. But (min_int-offset) will
@@ -1956,7 +1881,6 @@
     *pre_limit = adjust_limit((-stride_con), scale, plus_one, upper_limit, *pre_limit, pre_ctrl);
 
     if (low_limit->get_int() == -max_jint) {
-      if (!RangeLimitCheck) return;
       // We need this guard when scale*main_limit+offset >= limit
       // due to underflow. So we need execute main-loop while
       // scale*I+offset+1 > min_int. But (min_int-offset-1) will
@@ -2091,7 +2015,7 @@
 
 //------------------------------do_range_check---------------------------------
 // Eliminate range-checks and other trip-counter vs loop-invariant tests.
-void PhaseIdealLoop::do_range_check( IdealLoopTree *loop, Node_List &old_new ) {
+int PhaseIdealLoop::do_range_check( IdealLoopTree *loop, Node_List &old_new ) {
 #ifndef PRODUCT
   if (PrintOpto && VerifyLoopOptimizations) {
     tty->print("Range Check Elimination ");
@@ -2103,10 +2027,12 @@
 #endif
   assert(RangeCheckElimination, "");
   CountedLoopNode *cl = loop->_head->as_CountedLoop();
+  // If we fail before trying to eliminate range checks, set multiversion state
+  int closed_range_checks = 1;
 
   // protect against stride not being a constant
   if (!cl->stride_is_con())
-    return;
+    return closed_range_checks;
 
   // Find the trip counter; we are iteration splitting based on it
   Node *trip_counter = cl->phi();
@@ -2117,8 +2043,8 @@
   // Check graph shape. Cannot optimize a loop if zero-trip
   // Opaque1 node is optimized away and then another round
   // of loop opts attempted.
-  if (!is_canonical_main_loop_entry(cl)) {
-    return;
+  if (!is_canonical_loop_entry(cl)) {
+    return closed_range_checks;
   }
 
   // Need to find the main-loop zero-trip guard
@@ -2132,7 +2058,7 @@
   Node *p_f = iffm->in(0);
   // pre loop may have been optimized out
   if (p_f->Opcode() != Op_IfFalse) {
-    return;
+    return closed_range_checks;
   }
   CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd();
   assert(pre_end->loopnode()->is_pre_loop(), "");
@@ -2141,7 +2067,7 @@
   // optimized away and then another round of loop opts attempted.
   // We can not optimize this particular loop in that case.
   if (pre_opaq1->Opcode() != Op_Opaque1)
-    return;
+    return closed_range_checks;
   Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
   Node *pre_limit = pre_opaq->in(1);
 
@@ -2152,7 +2078,7 @@
   // pre-loop Opaque1 node.
   Node *orig_limit = pre_opaq->original_loop_limit();
   if (orig_limit == NULL || _igvn.type(orig_limit) == Type::TOP)
-    return;
+    return closed_range_checks;
 
   // Must know if its a count-up or count-down loop
 
@@ -2173,6 +2099,10 @@
   // executed.
   bool conditional_rc = false;
 
+  // Count number of range checks and reduce by load range limits, if zero,
+  // the loop is in canonical form to multiversion.
+  closed_range_checks = 0;
+
   // Check loop body for tests of trip-counter plus loop-invariant vs
   // loop-invariant.
   for( uint i = 0; i < loop->_body.size(); i++ ) {
@@ -2181,6 +2111,7 @@
         iff->Opcode() == Op_RangeCheck) { // Test?
       // Test is an IfNode, has 2 projections.  If BOTH are in the loop
       // we need loop unswitching instead of iteration splitting.
+      closed_range_checks++;
       Node *exit = loop->is_loop_exit(iff);
       if( !exit ) continue;
       int flip = (exit->Opcode() == Op_IfTrue) ? 1 : 0;
@@ -2258,7 +2189,7 @@
           add_constraint( stride_con, scale_con, offset, zero, limit, pre_ctrl, &pre_limit, &main_limit );
           if (!conditional_rc) {
             // (0-offset)/scale could be outside of loop iterations range.
-            conditional_rc = !loop->dominates_backedge(iff) || RangeLimitCheck;
+            conditional_rc = !loop->dominates_backedge(iff);
           }
         } else {
           if (PrintOpto) {
@@ -2275,7 +2206,7 @@
           scale_con = -scale_con;
           offset = new SubINode( zero, offset );
           register_new_node( offset, pre_ctrl );
-          limit  = new SubINode( zero, limit  );
+          limit  = new SubINode( zero, limit );
           register_new_node( limit, pre_ctrl );
           // Fall into LE case
         case BoolTest::le:
@@ -2294,7 +2225,7 @@
             // ((MIN_INT+1)-offset)/scale could be outside of loop iterations range.
             // Note: negative offset is replaced with 0 but (MIN_INT+1)/scale could
             // still be outside of loop range.
-            conditional_rc = !loop->dominates_backedge(iff) || RangeLimitCheck;
+            conditional_rc = !loop->dominates_backedge(iff);
           }
           break;
         default:
@@ -2324,6 +2255,9 @@
           --imax;
         }
       }
+      if (limit->Opcode() == Op_LoadRange) {
+        closed_range_checks--;
+      }
 
     } // End of is IF
 
@@ -2340,26 +2274,6 @@
   // Note:: we are making the main loop limit no longer precise;
   // need to round up based on stride.
   cl->set_nonexact_trip_count();
-  if (!LoopLimitCheck && stride_con != 1 && stride_con != -1) { // Cutout for common case
-    // "Standard" round-up logic:  ([main_limit-init+(y-1)]/y)*y+init
-    // Hopefully, compiler will optimize for powers of 2.
-    Node *ctrl = get_ctrl(main_limit);
-    Node *stride = cl->stride();
-    Node *init = cl->init_trip()->uncast();
-    Node *span = new SubINode(main_limit,init);
-    register_new_node(span,ctrl);
-    Node *rndup = _igvn.intcon(stride_con + ((stride_con>0)?-1:1));
-    Node *add = new AddINode(span,rndup);
-    register_new_node(add,ctrl);
-    Node *div = new DivINode(0,add,stride);
-    register_new_node(div,ctrl);
-    Node *mul = new MulINode(div,stride);
-    register_new_node(mul,ctrl);
-    Node *newlim = new AddINode(mul,init);
-    register_new_node(newlim,ctrl);
-    main_limit = newlim;
-  }
-
   Node *main_cle = cl->loopexit();
   Node *main_bol = main_cle->in(1);
   // Hacking loop bounds; need private copies of exit test
@@ -2379,6 +2293,169 @@
   // The OpaqueNode is unshared by design
   assert( opqzm->outcnt() == 1, "cannot hack shared node" );
   _igvn.replace_input_of(opqzm, 1, main_limit);
+
+  return closed_range_checks;
+}
+
+//------------------------------has_range_checks-------------------------------
+// Check to see if RCE cleaned the current loop of range-checks.
+void PhaseIdealLoop::has_range_checks(IdealLoopTree *loop) {
+  assert(RangeCheckElimination, "");
+
+  // skip if not a counted loop
+  if (!loop->is_counted()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // skip this loop if it is already checked
+  if (cl->has_been_range_checked()) return;
+
+  // Now check for existance of range checks
+  for (uint i = 0; i < loop->_body.size(); i++) {
+    Node *iff = loop->_body[i];
+    int iff_opc = iff->Opcode();
+    if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
+      cl->mark_has_range_checks();
+      break;
+    }
+  }
+  cl->set_has_been_range_checked();
+}
+
+//-------------------------multi_version_post_loops----------------------------
+// Check the range checks that remain, if simple, use the bounds to guard
+// which version to a post loop we execute, one with range checks or one without
+bool PhaseIdealLoop::multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop) {
+  bool multi_version_succeeded = false;
+  assert(RangeCheckElimination, "");
+  CountedLoopNode *legacy_cl = legacy_loop->_head->as_CountedLoop();
+  assert(legacy_cl->is_post_loop(), "");
+
+  // Check for existance of range checks using the unique instance to make a guard with
+  Unique_Node_List worklist;
+  for (uint i = 0; i < legacy_loop->_body.size(); i++) {
+    Node *iff = legacy_loop->_body[i];
+    int iff_opc = iff->Opcode();
+    if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
+      worklist.push(iff);
+    }
+  }
+
+  // Find RCE'd post loop so that we can stage its guard.
+  if (!is_canonical_loop_entry(legacy_cl)) return multi_version_succeeded;
+  Node* ctrl = legacy_cl->in(LoopNode::EntryControl);
+  Node* iffm = ctrl->in(0);
+
+  // Now we test that both the post loops are connected
+  Node* post_loop_region = iffm->in(0);
+  if (post_loop_region == NULL) return multi_version_succeeded;
+  if (!post_loop_region->is_Region()) return multi_version_succeeded;
+  Node* covering_region = post_loop_region->in(RegionNode::Control+1);
+  if (covering_region == NULL) return multi_version_succeeded;
+  if (!covering_region->is_Region()) return multi_version_succeeded;
+  Node* p_f = covering_region->in(RegionNode::Control);
+  if (p_f == NULL) return multi_version_succeeded;
+  if (!p_f->is_IfFalse()) return multi_version_succeeded;
+  if (!p_f->in(0)->is_CountedLoopEnd()) return multi_version_succeeded;
+  CountedLoopEndNode* rce_loop_end = p_f->in(0)->as_CountedLoopEnd();
+  if (rce_loop_end == NULL) return multi_version_succeeded;
+  CountedLoopNode* rce_cl = rce_loop_end->loopnode();
+  if (rce_cl == NULL || !rce_cl->is_post_loop()) return multi_version_succeeded;
+  CountedLoopNode *known_rce_cl = rce_loop->_head->as_CountedLoop();
+  if (rce_cl != known_rce_cl) return multi_version_succeeded;
+
+  // Then we fetch the cover entry test
+  ctrl = rce_cl->in(LoopNode::EntryControl);
+  if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return multi_version_succeeded;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostMultiVersion\n");
+    rce_loop->dump_head();
+    legacy_loop->dump_head();
+  }
+#endif
+
+  // Now fetch the limit we want to compare against
+  Node *limit = rce_cl->limit();
+  bool first_time = true;
+
+  // If we got this far, we identified the post loop which has been RCE'd and
+  // we have a work list.  Now we will try to transform the if guard to cause
+  // the loop pair to be multi version executed with the determination left to runtime
+  // or the optimizer if full information is known about the given arrays at compile time.
+  Node *last_min = NULL;
+  multi_version_succeeded = true;
+  while (worklist.size()) {
+    Node* rc_iffm = worklist.pop();
+    if (rc_iffm->is_If()) {
+      Node *rc_bolzm = rc_iffm->in(1);
+      if (rc_bolzm->is_Bool()) {
+        Node *rc_cmpzm = rc_bolzm->in(1);
+        if (rc_cmpzm->is_Cmp()) {
+          Node *rc_left = rc_cmpzm->in(2);
+          if (rc_left->Opcode() != Op_LoadRange) {
+            multi_version_succeeded = false;
+            break;
+          }
+          if (first_time) {
+            last_min = rc_left;
+            first_time = false;
+          } else {
+            Node *cur_min = new MinINode(last_min, rc_left);
+            last_min = cur_min;
+            _igvn.register_new_node_with_optimizer(last_min);
+          }
+        }
+      }
+    }
+  }
+
+  // All we have to do is update the limit of the rce loop
+  // with the min of our expression and the current limit.
+  // We will use this expression to replace the current limit.
+  if (last_min && multi_version_succeeded) {
+    Node *cur_min = new MinINode(last_min, limit);
+    _igvn.register_new_node_with_optimizer(cur_min);
+    Node *cmp_node = rce_loop_end->cmp_node();
+    _igvn.replace_input_of(cmp_node, 2, cur_min);
+    set_idom(cmp_node, cur_min, dom_depth(ctrl));
+    set_ctrl(cur_min, ctrl);
+    set_loop(cur_min, rce_loop->_parent);
+
+    legacy_cl->mark_is_multiversioned();
+    rce_cl->mark_is_multiversioned();
+    multi_version_succeeded = true;
+
+    C->set_major_progress();
+  }
+
+  return multi_version_succeeded;
+}
+
+//-------------------------poison_rce_post_loop--------------------------------
+// Causes the rce'd post loop to be optimized away if multiverioning fails
+void PhaseIdealLoop::poison_rce_post_loop(IdealLoopTree *rce_loop) {
+  CountedLoopNode *rce_cl = rce_loop->_head->as_CountedLoop();
+  Node* ctrl = rce_cl->in(LoopNode::EntryControl);
+  if (ctrl->is_IfTrue() || ctrl->is_IfFalse()) {
+    Node* iffm = ctrl->in(0);
+    if (iffm->is_If()) {
+      Node* cur_bool = iffm->in(1);
+      if (cur_bool->is_Bool()) {
+        Node* cur_cmp = cur_bool->in(1);
+        if (cur_cmp->is_Cmp()) {
+          BoolTest::mask new_test = BoolTest::gt;
+          BoolNode *new_bool = new BoolNode(cur_cmp, new_test);
+          _igvn.replace_node(cur_bool, new_bool);
+          _igvn._worklist.push(new_bool);
+          Node* left_op = cur_cmp->in(1);
+          _igvn.replace_input_of(cur_cmp, 2, left_op);
+          C->set_major_progress();
+        }
+      }
+    }
+  }
 }
 
 //------------------------------DCE_loop_body----------------------------------
@@ -2738,8 +2815,20 @@
     // Adjust the pre- and main-loop limits to let the pre and post loops run
     // with full checks, but the main-loop with no checks.  Remove said
     // checks from the main body.
-    if (should_rce)
-      phase->do_range_check(this,old_new);
+    if (should_rce) {
+      if (phase->do_range_check(this, old_new) != 0) {
+        cl->mark_has_range_checks();
+      }
+    } else {
+      phase->has_range_checks(this);
+    }
+
+    if (should_unroll && !should_peel && PostLoopMultiversioning) {
+      // Try to setup multiversioning on main loops before they are unrolled
+      if (cl->is_main_loop() && (cl->unrolled_count() == 1)) {
+        phase->insert_scalar_rced_post_loop(this, old_new);
+      }
+    }
 
     // Double loop body for unrolling.  Adjust the minimum-trip test (will do
     // twice as many iterations as before) and the main body limit (only do
--- a/src/share/vm/opto/loopUnswitch.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/opto/loopUnswitch.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -138,7 +138,7 @@
   Node* uniqc = proj_true->unique_ctrl_out();
   Node* entry = head->in(LoopNode::EntryControl);
   Node* predicate = find_predicate(entry);
-  if (predicate != NULL && LoopLimitCheck && UseLoopPredicate) {
+  if (predicate != NULL && UseLoopPredicate) {
     // We may have two predicates, find first.
     entry = find_predicate(entry->in(0)->in(0));
     if (entry != NULL) predicate = entry;
--- a/src/share/vm/opto/loopnode.cpp	Thu Apr 14 09:46:03 2016 -0400
+++ b/src/share/vm/opto/loopnode.cpp	Thu Apr 14 14:05:09 2016 +0000
@@ -464,8 +464,6 @@
 
   Node *hook = new Node(6);
 
-  if (LoopLimitCheck) {
-
   // ===================================================
   // Generate loop limit check to avoid integer overflow
   // in cases like next (cyclic loops):
@@ -594,103 +592,6 @@
   }
   set_subtree_ctrl( limit );
 
-  } else { // LoopLimitCheck
-
-  // If compare points to incr, we are ok.  Otherwise the compare
-  // can directly point to the phi; in this case adjust the compare so that
-  // it points to the incr by adjusting the limit.
-  if (cmp->in(1) == phi || cmp->in(2) == phi)
-    limit = gvn->transform(new AddINode(limit,stride));
-
-  // trip-count for +-tive stride should be: (limit - init_trip + stride - 1)/stride.
-  // Final value for iterator should be: trip_count * stride + init_trip.
-  Node *one_p = gvn->intcon( 1);
-  Node *one_m = gvn->intcon(-1);
-
-  Node *trip_count = NULL;
-  switch( bt ) {
-  case BoolTest::eq:
-    ShouldNotReachHere();
-  case BoolTest::ne:            // Ahh, the case we desire
-    if (stride_con == 1)
-      trip_count = gvn->transform(new SubINode(limit,init_trip));
-    else if (stride_con == -1)
-      trip_count = gvn->transform(new SubINode(init_trip,limit));
-    else
-      ShouldNotReachHere();
-    set_subtree_ctrl(trip_count);
-    //_loop.map(trip_count->_idx,loop(limit));
-    break;
-  case BoolTest::le:            // Maybe convert to '<' case
-    limit = gvn->transform(new AddINode(limit,one_p));
-    set_subtree_ctrl( limit );
-    hook->init_req(4, limit);
-
-    bt = BoolTest::lt;
-    // Make the new limit be in the same loop nest as the old limit
-    //_loop.map(limit->_idx,limit_loop);
-    // Fall into next case
-  case BoolTest::lt: {          // Maybe convert to '!=' case
-    if (stride_con < 0) // Count down loop rolls through MAXINT
-      ShouldNotReachHere();
-    Node *range = gvn->transform(new SubINode(limit,init_trip));
-    set_subtree_ctrl( range );
-    hook->init_req(0, range);
-
-    Node *bias  = gvn->transform(new AddINode(range,stride));
-    set_subtree_ctrl( bias