changeset 9381:f01629221703

Merge
author amurillo
date Thu, 08 Oct 2015 14:28:55 -0700
parents 03845376ea9d 017224c13b0e
children 53c5cb9d3fed 13c4fa17712e
files test/compiler/TestMoveStoresOutOfLoopsStoreNoCtrl.java
diffstat 156 files changed, 7517 insertions(+), 2243 deletions(-) [+]
line wrap: on
line diff
--- a/agent/src/share/classes/sun/jvm/hotspot/compiler/ImmutableOopMapSet.java	Tue Oct 06 08:41:31 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/compiler/ImmutableOopMapSet.java	Thu Oct 08 14:28:55 2015 -0700
@@ -67,9 +67,6 @@
       }
     }
 
-    public void visitValueLocation(Address valueAddr) {
-    }
-
     public void visitNarrowOopLocation(Address narrowOopAddr) {
       addressVisitor.visitCompOopAddress(narrowOopAddr);
     }
@@ -216,9 +213,9 @@
       }
     }
 
-    // We want narow oop, value and oop oop_types
-    OopMapValue.OopTypes[] values = new OopMapValue.OopTypes[]{
-        OopMapValue.OopTypes.OOP_VALUE, OopMapValue.OopTypes.VALUE_VALUE, OopMapValue.OopTypes.NARROWOOP_VALUE
+    // We want narrow oop and oop oop_types
+    OopMapValue.OopTypes[] values = new OopMapValue.OopTypes[] {
+        OopMapValue.OopTypes.OOP_VALUE, OopMapValue.OopTypes.NARROWOOP_VALUE
     };
 
     {
@@ -231,8 +228,6 @@
             // to detect in the debugging system
             // assert(Universe::is_heap_or_null(*loc), "found non oop pointer");
             visitor.visitOopLocation(loc);
-          } else if (omv.getType() == OopMapValue.OopTypes.VALUE_VALUE) {
-            visitor.visitValueLocation(loc);
           } else if (omv.getType() == OopMapValue.OopTypes.NARROWOOP_VALUE) {
             visitor.visitNarrowOopLocation(loc);
           }
--- a/agent/src/share/classes/sun/jvm/hotspot/compiler/OopMapValue.java	Tue Oct 06 08:41:31 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/compiler/OopMapValue.java	Thu Oct 08 14:28:55 2015 -0700
@@ -49,7 +49,6 @@
   // Types of OopValues
   static int UNUSED_VALUE;
   static int OOP_VALUE;
-  static int VALUE_VALUE;
   static int NARROWOOP_VALUE;
   static int CALLEE_SAVED_VALUE;
   static int DERIVED_OOP_VALUE;
@@ -73,7 +72,6 @@
     REGISTER_MASK_IN_PLACE = db.lookupIntConstant("OopMapValue::register_mask_in_place").intValue();
     UNUSED_VALUE           = db.lookupIntConstant("OopMapValue::unused_value").intValue();
     OOP_VALUE              = db.lookupIntConstant("OopMapValue::oop_value").intValue();
-    VALUE_VALUE            = db.lookupIntConstant("OopMapValue::value_value").intValue();
     NARROWOOP_VALUE        = db.lookupIntConstant("OopMapValue::narrowoop_value").intValue();
     CALLEE_SAVED_VALUE     = db.lookupIntConstant("OopMapValue::callee_saved_value").intValue();
     DERIVED_OOP_VALUE      = db.lookupIntConstant("OopMapValue::derived_oop_value").intValue();
@@ -82,7 +80,6 @@
   public static abstract class OopTypes {
     public static final OopTypes UNUSED_VALUE       = new OopTypes() { int getValue() { return OopMapValue.UNUSED_VALUE;       }};
     public static final OopTypes OOP_VALUE          = new OopTypes() { int getValue() { return OopMapValue.OOP_VALUE;          }};
-    public static final OopTypes VALUE_VALUE        = new OopTypes() { int getValue() { return OopMapValue.VALUE_VALUE;        }};
     public static final OopTypes NARROWOOP_VALUE    = new OopTypes() { int getValue() { return OopMapValue.NARROWOOP_VALUE;         }};
     public static final OopTypes CALLEE_SAVED_VALUE = new OopTypes() { int getValue() { return OopMapValue.CALLEE_SAVED_VALUE; }};
     public static final OopTypes DERIVED_OOP_VALUE  = new OopTypes() { int getValue() { return OopMapValue.DERIVED_OOP_VALUE;  }};
@@ -105,7 +102,6 @@
 
   // Querying
   public boolean isOop()         { return (getValue() & TYPE_MASK_IN_PLACE) == OOP_VALUE;          }
-  public boolean isValue()       { return (getValue() & TYPE_MASK_IN_PLACE) == VALUE_VALUE;        }
   public boolean isNarrowOop()   { return (getValue() & TYPE_MASK_IN_PLACE) == NARROWOOP_VALUE;    }
   public boolean isCalleeSaved() { return (getValue() & TYPE_MASK_IN_PLACE) == CALLEE_SAVED_VALUE; }
   public boolean isDerivedOop()  { return (getValue() & TYPE_MASK_IN_PLACE) == DERIVED_OOP_VALUE;  }
@@ -117,7 +113,6 @@
     int which = (getValue() & TYPE_MASK_IN_PLACE);
          if (which == UNUSED_VALUE) return OopTypes.UNUSED_VALUE;
     else if (which == OOP_VALUE)    return OopTypes.OOP_VALUE;
-    else if (which == VALUE_VALUE)  return OopTypes.VALUE_VALUE;
     else if (which == NARROWOOP_VALUE)   return OopTypes.NARROWOOP_VALUE;
     else if (which == CALLEE_SAVED_VALUE) return OopTypes.CALLEE_SAVED_VALUE;
     else if (which == DERIVED_OOP_VALUE)  return OopTypes.DERIVED_OOP_VALUE;
--- a/agent/src/share/classes/sun/jvm/hotspot/compiler/OopMapVisitor.java	Tue Oct 06 08:41:31 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/compiler/OopMapVisitor.java	Thu Oct 08 14:28:55 2015 -0700
@@ -31,6 +31,5 @@
 public interface OopMapVisitor {
   public void visitOopLocation(Address oopAddr);
   public void visitDerivedOopLocation(Address baseOopAddr, Address derivedOopAddr);
-  public void visitValueLocation(Address valueAddr);
   public void visitNarrowOopLocation(Address narrowOopAddr);
 }
--- a/agent/src/share/classes/sun/jvm/hotspot/runtime/Frame.java	Tue Oct 06 08:41:31 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/Frame.java	Thu Oct 08 14:28:55 2015 -0700
@@ -536,9 +536,6 @@
       }
     }
 
-    public void visitValueLocation(Address valueAddr) {
-    }
-
     public void visitNarrowOopLocation(Address compOopAddr) {
       addressVisitor.visitCompOopAddress(compOopAddr);
     }
--- a/agent/src/share/classes/sun/jvm/hotspot/ui/classbrowser/HTMLGenerator.java	Tue Oct 06 08:41:31 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/ui/classbrowser/HTMLGenerator.java	Thu Oct 08 14:28:55 2015 -0700
@@ -1220,9 +1220,6 @@
       oms = new OopMapStream(map, OopMapValue.OopTypes.NARROWOOP_VALUE);
       buf.append(omvIterator.iterate(oms, "NarrowOops:", false));
 
-      oms = new OopMapStream(map, OopMapValue.OopTypes.VALUE_VALUE);
-      buf.append(omvIterator.iterate(oms, "Values:", false));
-
       oms = new OopMapStream(map, OopMapValue.OopTypes.CALLEE_SAVED_VALUE);
       buf.append(omvIterator.iterate(oms, "Callee saved:",  true));
 
--- a/src/cpu/aarch64/vm/aarch64.ad	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/aarch64.ad	Thu Oct 08 14:28:55 2015 -0700
@@ -1039,6 +1039,7 @@
   bool leading_membar(const MemBarNode *barrier);
 
   bool is_card_mark_membar(const MemBarNode *barrier);
+  bool is_CAS(int opcode);
 
   MemBarNode *leading_to_normal(MemBarNode *leading);
   MemBarNode *normal_to_leading(const MemBarNode *barrier);
@@ -1057,6 +1058,9 @@
   bool unnecessary_volatile(const Node *barrier);
   bool needs_releasing_store(const Node *store);
 
+  // predicate controlling translation of CompareAndSwapX
+  bool needs_acquiring_load_exclusive(const Node *load);
+
   // predicate controlling translation of StoreCM
   bool unnecessary_storestore(const Node *storecm);
 %}
@@ -1088,15 +1092,58 @@
   //   str<x>
   //   dmb ish
   //
+  // We can also use ldaxr and stlxr to implement compare and swap CAS
+  // sequences. These are normally translated to an instruction
+  // sequence like the following
+  //
+  //   dmb      ish
+  // retry:
+  //   ldxr<x>   rval raddr
+  //   cmp       rval rold
+  //   b.ne done
+  //   stlxr<x>  rval, rnew, rold
+  //   cbnz      rval retry
+  // done:
+  //   cset      r0, eq
+  //   dmb ishld
+  //
+  // Note that the exclusive store is already using an stlxr
+  // instruction. That is required to ensure visibility to other
+  // threads of the exclusive write (assuming it succeeds) before that
+  // of any subsequent writes.
+  //
+  // The following instruction sequence is an improvement on the above
+  //
+  // retry:
+  //   ldaxr<x>  rval raddr
+  //   cmp       rval rold
+  //   b.ne done
+  //   stlxr<x>  rval, rnew, rold
+  //   cbnz      rval retry
+  // done:
+  //   cset      r0, eq
+  //
+  // We don't need the leading dmb ish since the stlxr guarantees
+  // visibility of prior writes in the case that the swap is
+  // successful. Crucially we don't have to worry about the case where
+  // the swap is not successful since no valid program should be
+  // relying on visibility of prior changes by the attempting thread
+  // in the case where the CAS fails.
+  //
+  // Similarly, we don't need the trailing dmb ishld if we substitute
+  // an ldaxr instruction since that will provide all the guarantees we
+  // require regarding observation of changes made by other threads
+  // before any change to the CAS address observed by the load.
+  //
   // In order to generate the desired instruction sequence we need to
   // be able to identify specific 'signature' ideal graph node
   // sequences which i) occur as a translation of a volatile reads or
-  // writes and ii) do not occur through any other translation or
-  // graph transformation. We can then provide alternative aldc
-  // matching rules which translate these node sequences to the
-  // desired machine code sequences. Selection of the alternative
-  // rules can be implemented by predicates which identify the
-  // relevant node sequences.
+  // writes or CAS operations and ii) do not occur through any other
+  // translation or graph transformation. We can then provide
+  // alternative adlc matching rules which translate these node
+  // sequences to the desired machine code sequences. Selection of the
+  // alternative rules can be implemented by predicates which identify
+  // the relevant node sequences.
   //
   // The ideal graph generator translates a volatile read to the node
   // sequence
@@ -1163,6 +1210,15 @@
   // get if it is fed and feeds a cpuorder membar and if its feed
   // membar also feeds an acquiring load.
   //
+  // Finally an inlined (Unsafe) CAS operation is translated to the
+  // following ideal graph
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //   CompareAndSwapX {CardMark}-optional
+  //   MemBarCPUOrder
+  //   MemBarAcquire
+  //
   // So, where we can identify these volatile read and write
   // signatures we can choose to plant either of the above two code
   // sequences. For a volatile read we can simply plant a normal
@@ -1177,6 +1233,14 @@
   // and MemBarVolatile and instead plant a simple stlr<x>
   // instruction.
   //
+  // When we recognise a CAS signature we can choose to plant a dmb
+  // ish as a translation for the MemBarRelease, the conventional
+  // macro-instruction sequence for the CompareAndSwap node (which
+  // uses ldxr<x>) and then a dmb ishld for the MemBarAcquire.
+  // Alternatively, we can elide generation of the dmb instructions
+  // and plant the alternative CompareAndSwap macro-instruction
+  // sequence (which uses ldaxr<x>).
+  // 
   // Of course, the above only applies when we see these signature
   // configurations. We still want to plant dmb instructions in any
   // other cases where we may see a MemBarAcquire, MemBarRelease or
@@ -1194,7 +1258,8 @@
   // relevant dmb instructions.
   //
 
-  // graph traversal helpers used for volatile put/get optimization
+  // graph traversal helpers used for volatile put/get and CAS
+  // optimization
 
   // 1) general purpose helpers
 
@@ -1220,16 +1285,19 @@
 	return NULL;
     }
 
-    if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj())
+    if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj()) {
       return NULL;
+    }
 
     membar = ctl->lookup(0);
 
-    if (!membar || !membar->is_MemBar())
+    if (!membar || !membar->is_MemBar()) {
       return NULL;
-
-    if (mem->lookup(0) != membar)
+    }
+
+    if (mem->lookup(0) != membar) {
       return NULL;
+    }
 
     return membar->as_MemBar();
   }
@@ -1259,8 +1327,9 @@
       }
     }
 
-    if (child == NULL)
+    if (child == NULL) {
       return NULL;
+    }
 
     for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
       x = mem->fast_out(i);
@@ -1283,15 +1352,18 @@
   {
     int opcode = barrier->Opcode();
     // if this is a release membar we are ok
-    if (opcode == Op_MemBarRelease)
+    if (opcode == Op_MemBarRelease) {
       return true;
+    }
     // if its a cpuorder membar . . .
-    if (opcode != Op_MemBarCPUOrder)
+    if (opcode != Op_MemBarCPUOrder) {
       return false;
+    }
     // then the parent has to be a release membar
     MemBarNode *parent = parent_membar(barrier);
-    if (!parent)
+    if (!parent) {
       return false;
+    }
     opcode = parent->Opcode();
     return opcode == Op_MemBarRelease;
   }
@@ -1314,11 +1386,13 @@
   
   bool is_card_mark_membar(const MemBarNode *barrier)
   {
-    if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark))
+    if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark)) {
       return false;
-
-    if (barrier->Opcode() != Op_MemBarVolatile)
+    }
+
+    if (barrier->Opcode() != Op_MemBarVolatile) {
       return false;
+    }
 
     ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
 
@@ -1333,8 +1407,8 @@
   }
 
 
-  // 3) helper predicates to traverse volatile put graphs which may
-  // contain GC barrier subgraphs
+  // 3) helper predicates to traverse volatile put or CAS graphs which
+  // may contain GC barrier subgraphs
 
   // Preamble
   // --------
@@ -1404,8 +1478,7 @@
   // currently being unmarked in which case the volatile put graph
   // will look slightly different
   //
-  //   MemBarRelease
-  //   MemBarCPUOrder___________________________________________
+  //   MemBarRelease____________________________________________
   //         ||    \\               Ctl \     Ctl \     \\  Mem \
   //         ||    StoreN/P[mo_release] CastP2X   If   LoadB     |
   //         | \     /                              \            |
@@ -1419,7 +1492,7 @@
   // memory flow includes the following subgraph:
   //
   //   MemBarRelease
-  //   MemBarCPUOrder
+  //  {MemBarCPUOrder}
   //          |  \      . . .
   //          |  StoreX[mo_release]  . . .
   //          |   /
@@ -1431,8 +1504,48 @@
   // detected starting from any candidate MemBarRelease,
   // StoreX[mo_release] or MemBarVolatile.
   //
+  // A simple variation on this normal case occurs for an unsafe CAS
+  // operation. The basic graph for a non-object CAS is
+  //
+  //   MemBarRelease
+  //         ||
+  //   MemBarCPUOrder
+  //         ||     \\   . . .
+  //         ||     CompareAndSwapX
+  //         ||       |
+  //         ||     SCMemProj
+  //         | \     /
+  //         | MergeMem
+  //         | /
+  //   MemBarCPUOrder
+  //         ||
+  //   MemBarAcquire
+  //
+  // The same basic variations on this arrangement (mutatis mutandis)
+  // occur when a card mark is introduced. i.e. we see the same basic
+  // shape but the StoreP/N is replaced with CompareAndSwapP/N and the
+  // tail of the graph is a pair comprising a MemBarCPUOrder +
+  // MemBarAcquire.
+  //
+  // So, in the case of a CAS the normal graph has the variant form
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //          |   \      . . .
+  //          |  CompareAndSwapX  . . .
+  //          |    |
+  //          |   SCMemProj
+  //          |   /  . . .
+  //         MergeMem
+  //          |
+  //   MemBarCPUOrder
+  //   MemBarAcquire
+  //
+  // This graph can also easily be detected starting from any
+  // candidate MemBarRelease, CompareAndSwapX or MemBarAcquire.
+  //
   // the code below uses two helper predicates, leading_to_normal and
-  // normal_to_leading to identify this configuration, one validating
+  // normal_to_leading to identify these normal graphs, one validating
   // the layout starting from the top membar and searching down and
   // the other validating the layout starting from the lower membar
   // and searching up.
@@ -1450,7 +1563,9 @@
   // they are only inserted for object puts. This significantly
   // complicates the task of identifying whether a MemBarRelease,
   // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
-  // when using these GC configurations (see below).
+  // when using these GC configurations (see below). It adds similar
+  // complexity to the task of identifying whether a MemBarRelease,
+  // CompareAndSwapX or MemBarAcquire forms part of a CAS.
   //
   // In both cases the post-write subtree includes an auxiliary
   // MemBarVolatile (StoreLoad barrier) separating the object put and
@@ -1489,7 +1604,8 @@
   // (LoadB) from the card. Ctl and Mem are fed to the If via an
   // intervening StoreLoad barrier (MemBarVolatile).
   //
-  // So, with CMS we may see a node graph which looks like this
+  // So, with CMS we may see a node graph for a volatile object store
+  // which looks like this
   //
   //   MemBarRelease
   //   MemBarCPUOrder_(leading)__________________
@@ -1524,6 +1640,55 @@
   // from the StoreCM into the trailing membar (n.b. the latter
   // proceeds via a Phi associated with the If region).
   //
+  // The graph for a CAS varies slightly, the obvious difference being
+  // that the StoreN/P node is replaced by a CompareAndSwapP/N node
+  // and the trailing MemBarVolatile by a MemBarCPUOrder +
+  // MemBarAcquire pair. The other important difference is that the
+  // CompareAndSwap node's SCMemProj is not merged into the card mark
+  // membar - it still feeds the trailing MergeMem. This also means
+  // that the card mark membar receives its Mem feed directly from the
+  // leading membar rather than via a MergeMem.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder__(leading)_________________________
+  //       ||                       \\                 C \
+  //   MemBarVolatile (card mark)  CompareAndSwapN/P  CastP2X
+  //     C |  ||    M |              |
+  //       | LoadB    |       ______/|
+  //       |   |      |      /       |
+  //       | Cmp      |     /      SCMemProj
+  //       | /        |    /         |
+  //       If         |   /         /
+  //       | \        |  /         /
+  // IfFalse  IfTrue  | /         /
+  //       \     / \  |/ prec    /
+  //        \   / StoreCM       /
+  //         \ /      |        /
+  //        Region   . . .    /
+  //          | \            /
+  //          |  . . .  \   / Bot
+  //          |       MergeMem
+  //          |          |
+  //        MemBarCPUOrder
+  //        MemBarAcquire (trailing)
+  //
+  // This has a slightly different memory subgraph to the one seen
+  // previously but the core of it is the same as for the CAS normal
+  // subgraph
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder____
+  //      ||             \      . . .
+  //   MemBarVolatile  CompareAndSwapX  . . .
+  //      |  \            |
+  //        . . .   SCMemProj
+  //          |     /  . . .
+  //         MergeMem
+  //          |
+  //   MemBarCPUOrder
+  //   MemBarAcquire
+  //
+  //
   // G1 is quite a lot more complicated. The nodes inserted on behalf
   // of G1 may comprise: a pre-write graph which adds the old value to
   // the SATB queue; the releasing store itself; and, finally, a
@@ -1575,12 +1740,16 @@
   // n.b. the LoadB in this subgraph is not the card read -- it's a
   // read of the SATB queue active flag.
   //
+  // Once again the CAS graph is a minor variant on the above with the
+  // expected substitutions of CompareAndSwapX for StoreN/P and
+  // MemBarCPUOrder + MemBarAcquire for trailing MemBarVolatile.
+  //
   // The G1 post-write subtree is also optional, this time when the
   // new value being written is either null or can be identified as a
   // newly allocated (young gen) object with no intervening control
   // flow. The latter cannot happen but the former may, in which case
-  // the card mark membar is omitted and the memory feeds from the
-  // leading membar and the StoreN/P are merged direct into the
+  // the card mark membar is omitted and the memory feeds from the
+  // leading membar and the StoreN/P are merged direct into the
   // trailing membar as per the normal subgraph. So, the only special
   // case which arises is when the post-write subgraph is generated.
   //
@@ -1668,113 +1837,53 @@
   // value check has been elided the total number of Phis is 2
   // otherwise it is 3.
   //
+  // The CAS graph when using G1GC also includes a pre-write subgraph
+  // and an optional post-write subgraph. The same variations are
+  // introduced as for CMS with conditional card marking i.e. the
+  // StoreP/N is swapped for a CompareAndSwapP/N, the trailing
+  // MemBarVolatile for a MemBarCPUOrder + MemBarAcquire pair and the
+  // Mem feed from the CompareAndSwapP/N includes a precedence
+  // dependency feed to the StoreCM and a feed via an SCMemProj to the
+  // trailing membar. So, as before the configuration includes the
+  // normal CAS graph as a subgraph of the memory flow.
+  //
   // So, the upshot is that in all cases the volatile put graph will
   // include a *normal* memory subgraph betwen the leading membar and
-  // its child membar. When that child is not a card mark membar then
-  // it marks the end of a volatile put subgraph. If the child is a
-  // card mark membar then the normal subgraph will form part of a
-  // volatile put subgraph if and only if the child feeds an
-  // AliasIdxBot Mem feed to a trailing barrier via a MergeMem. That
-  // feed is either direct (for CMS) or via 2 or 3 Phi nodes merging
-  // the leading barrier memory flow (for G1).
+  // its child membar, either a volatile put graph (including a
+  // releasing StoreX) or a CAS graph (including a CompareAndSwapX).
+  // When that child is not a card mark membar then it marks the end
+  // of the volatile put or CAS subgraph. If the child is a card mark
+  // membar then the normal subgraph will form part of a volatile put
+  // subgraph if and only if the child feeds an AliasIdxBot Mem feed
+  // to a trailing barrier via a MergeMem. That feed is either direct
+  // (for CMS) or via 2 or 3 Phi nodes merging the leading barrier
+  // memory flow (for G1).
   // 
   // The predicates controlling generation of instructions for store
   // and barrier nodes employ a few simple helper functions (described
-  // below) which identify the presence or absence of these subgraph
-  // configurations and provide a means of traversing from one node in
-  // the subgraph to another.
+  // below) which identify the presence or absence of all these
+  // subgraph configurations and provide a means of traversing from
+  // one node in the subgraph to another.
+
+  // is_CAS(int opcode)
+  //
+  // return true if opcode is one of the four CompareAndSwapX opcodes
+  // tested below (the I, L, N or P variant), otherwise false.
+
+  bool is_CAS(int opcode)
+  {
+    return (opcode == Op_CompareAndSwapI ||
+	    opcode == Op_CompareAndSwapL ||
+	    opcode == Op_CompareAndSwapN ||
+	    opcode == Op_CompareAndSwapP);
+  }
 
   // leading_to_normal
   //
-  //graph traversal helper which detects the normal case Mem feed
-  // from a release membar (or, optionally, its cpuorder child) to a
-  // dependent volatile membar i.e. it ensures that the following Mem
-  // flow subgraph is present.
-  //
-  //   MemBarRelease
-  //   MemBarCPUOrder
-  //          |  \      . . .
-  //          |  StoreN/P[mo_release]  . . .
-  //          |   /
-  //         MergeMem
-  //          |
-  //   MemBarVolatile
-  //
-  // if the correct configuration is present returns the volatile
-  // membar otherwise NULL.
-  //
-  // the input membar is expected to be either a cpuorder membar or a
-  // release membar. in the latter case it should not have a cpu membar
-  // child.
-  //
-  // the returned membar may be a card mark membar rather than a
-  // trailing membar.
-
-  MemBarNode *leading_to_normal(MemBarNode *leading)
-  {
-    assert((leading->Opcode() == Op_MemBarRelease ||
-	    leading->Opcode() == Op_MemBarCPUOrder),
-	   "expecting a volatile or cpuroder membar!");
-
-    // check the mem flow
-    ProjNode *mem = leading->proj_out(TypeFunc::Memory);
-
-    if (!mem)
-      return NULL;
-
-    Node *x = NULL;
-    StoreNode * st = NULL;
-    MergeMemNode *mm = NULL;
-
-    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
-      x = mem->fast_out(i);
-      if (x->is_MergeMem()) {
-	if (mm != NULL)
-	  return NULL;
-	// two merge mems is one too many
-	mm = x->as_MergeMem();
-      } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
-	// two releasing stores is one too many
-	if (st != NULL)
-	  return NULL;
-	st = x->as_Store();
-      }
-    }
-
-    if (!mm || !st)
-      return NULL;
-
-    bool found = false;
-    // ensure the store feeds the merge
-    for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
-      if (st->fast_out(i) == mm) {
-	found = true;
-	break;
-      }
-    }
-
-    if (!found)
-      return NULL;
-
-    MemBarNode *mbvol = NULL;
-    // ensure the merge feeds a volatile membar
-    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
-      x = mm->fast_out(i);
-      if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-	mbvol = x->as_MemBar();
-	break;
-      }
-    }
-
-    return mbvol;
-  }
-
-  // normal_to_leading
-  //
-  // graph traversal helper which detects the normal case Mem feed
-  // from either a card mark or a trailing membar to a preceding
-  // release membar (optionally its cpuorder child) i.e. it ensures
-  // that the following Mem flow subgraph is present.
+  // graph traversal helper which detects the normal case Mem feed from
+  // a release membar (or, optionally, its cpuorder child) to a
+  // dependent volatile or acquire membar i.e. it ensures that one or
+  // other of the following Mem flow subgraphs is present.
   //
   //   MemBarRelease
   //   MemBarCPUOrder {leading}
@@ -1783,7 +1892,165 @@
   //          |   /
   //         MergeMem
   //          |
-  //   MemBarVolatile
+  //   MemBarVolatile {trailing or card mark}
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //      |       \      . . .
+  //      |     CompareAndSwapX  . . .
+  //               |
+  //     . . .    SCMemProj
+  //           \   |
+  //      |    MergeMem
+  //      |       /
+  //    MemBarCPUOrder
+  //    MemBarAcquire {trailing}
+  //
+  // if the correct configuration is present returns the trailing
+  // membar otherwise NULL.
+  //
+  // the input membar is expected to be either a cpuorder membar or a
+  // release membar. in the latter case it should not have a cpu membar
+  // child.
+  //
+  // the returned value may be a card mark or trailing membar
+  //
+
+  MemBarNode *leading_to_normal(MemBarNode *leading)
+  {
+    assert((leading->Opcode() == Op_MemBarRelease ||
+	    leading->Opcode() == Op_MemBarCPUOrder),
+	   "expecting a volatile or cpuroder membar!"); // NOTE(review): text says "volatile"/"cpuroder"; the condition tests release or cpuorder
+
+    // check the mem flow: start from the leading membar's Memory projection
+    ProjNode *mem = leading->proj_out(TypeFunc::Memory);
+
+    if (!mem) {
+      return NULL;
+    }
+
+    Node *x = NULL;               // scratch cursor reused by every loop below
+    StoreNode * st = NULL;        // the single releasing store user of mem, if any
+    LoadStoreNode *cas = NULL;    // the single CompareAndSwapX user of mem, if any
+    MergeMemNode *mm = NULL;      // the MergeMem the store/CAS is expected to feed
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      if (x->is_MergeMem()) {
+	if (mm != NULL) {
+	  return NULL;
+	}
+	// two merge mems is one too many (rejected just above); remember the only one
+	mm = x->as_MergeMem();
+      } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+	// two releasing stores/CAS nodes is one too many (StoreCM is never counted)
+	if (st != NULL || cas != NULL) {
+	  return NULL;
+	}
+	st = x->as_Store();
+      } else if (is_CAS(x->Opcode())) {
+	if (st != NULL || cas != NULL) {
+	  return NULL;
+	}
+	cas = x->as_LoadStore();
+      }
+    }
+
+    // must have a store or a cas (the loop above guarantees not both)
+    if (!st && !cas) {
+      return NULL;
+    }
+
+    // must have a merge if we also have st; for a cas the merge is looked up below
+    if (st && !mm) {
+      return NULL;
+    }
+
+    Node *y = NULL;               // the Proj under the CAS, or st once it is seen feeding mm
+    if (cas) {
+      // look for an SCMemProj: the first Proj user of the CAS is taken
+      for (DUIterator_Fast imax, i = cas->fast_outs(imax); i < imax; i++) {
+	x = cas->fast_out(i);
+	if (x->is_Proj()) {       // NOTE(review): any Proj matches; Op_SCMemProj is not checked here
+	  y = x;
+	  break;
+	}
+      }
+      if (y == NULL) {
+	return NULL;
+      }
+      // the proj must feed a MergeMem; a match overwrites any MergeMem recorded above
+      for (DUIterator_Fast imax, i = y->fast_outs(imax); i < imax; i++) {
+	x = y->fast_out(i);
+	if (x->is_MergeMem()) {
+	  mm = x->as_MergeMem();
+	  break;
+	}
+      }
+      if (mm == NULL)             // NOTE(review): mm may still hold the MergeMem from the first loop,
+	return NULL;              // so this only fails when no MergeMem was seen in either place
+    } else {
+      // ensure the store feeds the existing mergemem (the one found under mem)
+      for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+	if (st->fast_out(i) == mm) {
+	  y = st;
+	  break;
+	}
+      }
+      if (y == NULL) {
+	return NULL;
+      }
+    }
+
+    MemBarNode *mbar = NULL;      // result; stays NULL unless a matching membar is found
+    // ensure the merge feeds to the expected type of membar for st or cas
+    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+      x = mm->fast_out(i);
+      if (x->is_MemBar()) {
+	int opcode = x->Opcode();
+	if (opcode == Op_MemBarVolatile && st) {
+	  mbar = x->as_MemBar();  // store case: the MemBarVolatile itself is returned
+	} else if (cas && opcode == Op_MemBarCPUOrder) {
+	  MemBarNode *y =  x->as_MemBar();  // NOTE(review): shadows the outer Node *y
+	  y = child_membar(y);    // child_membar is defined elsewhere in this file
+	  if (y != NULL && y->Opcode() == Op_MemBarAcquire) {
+	    mbar = y;             // CAS case: the MemBarAcquire below the cpuorder membar is returned
+	  }
+	}
+	break;                    // unconditional: only the first MemBar user of mm is examined
+      }
+    }
+
+    return mbar;
+  }
+
+  // normal_to_leading
+  //
+  // graph traversal helper which detects the normal case Mem feed
+  // from either a card mark or a trailing membar to a preceding
+  // release membar (optionally its cpuorder child) i.e. it ensures
+  // that one or other of the following Mem flow subgraphs is present.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //          |  \      . . .
+  //          |  StoreN/P[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile {card mark or trailing}
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //      |       \      . . .
+  //      |     CompareAndSwapX  . . .
+  //               |
+  //     . . .    SCMemProj
+  //           \   |
+  //      |    MergeMem
+  //      |        /
+  //    MemBarCPUOrder
+  //    MemBarAcquire {trailing}
   //
   // this predicate checks for the same flow as the previous predicate
   // but starting from the bottom rather than the top.
@@ -1797,51 +2064,116 @@
   MemBarNode *normal_to_leading(const MemBarNode *barrier)
   {
     // input must be a volatile membar
-    assert(barrier->Opcode() == Op_MemBarVolatile, "expecting a volatile membar");
+    assert((barrier->Opcode() == Op_MemBarVolatile ||
+	    barrier->Opcode() == Op_MemBarAcquire),
+	   "expecting a volatile or an acquire membar");
     Node *x;
+    bool is_cas = barrier->Opcode() == Op_MemBarAcquire;
+
+    // if we have an acquire membar then it must be fed via a CPUOrder
+    // membar
+
+    if (is_cas) {
+      // skip to parent barrier which must exist and be a cpuorder
+      x = parent_membar(barrier);
+      if (x == NULL || x->Opcode() != Op_MemBarCPUOrder)
+	return NULL;
+    } else {
+      // start from the supplied barrier
+      x = (Node *)barrier;
+    }
 
     // the Mem feed to the membar should be a merge
-    x = barrier->in(TypeFunc::Memory);
+    x = x ->in(TypeFunc::Memory);
     if (!x->is_MergeMem())
       return NULL;
 
     MergeMemNode *mm = x->as_MergeMem();
 
-    // the AliasIdxBot slice should be another MemBar projection
-    x = mm->in(Compile::AliasIdxBot);
+    if (is_cas) {
+      // the merge should be fed from the CAS via an SCMemProj node
+      x = NULL;
+      for (uint idx = 1; idx < mm->req(); idx++) {
+	if (mm->in(idx)->Opcode() == Op_SCMemProj) {
+	  x = mm->in(idx);
+	  break;
+	}
+      }
+      if (x == NULL) {
+	return NULL;
+      }
+      // check for a CAS feeding this proj
+      x = x->in(0);
+      int opcode = x->Opcode();
+      if (!is_CAS(opcode)) {
+	return NULL;
+      }
+      // the CAS should get its mem feed from the leading membar
+      x = x->in(MemNode::Memory);
+    } else {
+      // the merge should get its Bottom mem feed from the leading membar
+      x = mm->in(Compile::AliasIdxBot);      
+    } 
+
     // ensure this is a non control projection
-    if (!x->is_Proj() || x->is_CFG())
+    if (!x->is_Proj() || x->is_CFG()) {
       return NULL;
+    }
     // if it is fed by a membar that's the one we want
     x = x->in(0);
 
-    if (!x->is_MemBar())
+    if (!x->is_MemBar()) {
       return NULL;
+    }
 
     MemBarNode *leading = x->as_MemBar();
     // reject invalid candidates
-    if (!leading_membar(leading))
+    if (!leading_membar(leading)) {
       return NULL;
-
-    // ok, we have a leading ReleaseMembar, now for the sanity clauses
-
-    // the leading membar must feed Mem to a releasing store
+    }
+
+    // ok, we have a leading membar, now for the sanity clauses
+
+    // the leading membar must feed Mem to a releasing store or CAS
     ProjNode *mem = leading->proj_out(TypeFunc::Memory);
     StoreNode *st = NULL;
+    LoadStoreNode *cas = NULL;
     for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
       x = mem->fast_out(i);
       if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+	// two stores or CASes is one too many
+	if (st != NULL || cas != NULL) {
+	  return NULL;
+	}
 	st = x->as_Store();
-	break;
+      } else if (is_CAS(x->Opcode())) {
+	if (st != NULL || cas != NULL) {
+	  return NULL;
+	}
+	cas = x->as_LoadStore();
       }
     }
-    if (st == NULL)
+
+    // we must have found a store or a cas (the loop rejects finding both)
+    if (st == NULL && cas == NULL) {
       return NULL;
-
-    // the releasing store has to feed the same merge
-    for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
-      if (st->fast_out(i) == mm)
-	return leading;
+    }
+
+    if (st == NULL) {
+      // nothing more to check
+      return leading;
+    } else {
+      // we should not have a store if we started from an acquire
+      if (is_cas) {
+	return NULL;
+      }
+
+      // the store should feed the merge we used to get here
+      for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+	if (st->fast_out(i) == mm) {
+	  return leading;
+	}
+      }
     }
 
     return NULL;
@@ -1865,8 +2197,8 @@
   //  Bot |  / 
   //   MergeMem 
   //      |
-  //   MemBarVolatile (trailing)
-  //
+  //      |
+  //    MemBarVolatile {trailing}
   //
   // 2)
   //   MemBarRelease/CPUOrder (leading)
@@ -1884,7 +2216,8 @@
   //     Bot |   /
   //       MergeMem
   //         |
-  //   MemBarVolatile (trailing)
+  //    MemBarVolatile {trailing}
+  //
   //
   // 3)
   //   MemBarRelease/CPUOrder (leading)
@@ -1905,7 +2238,8 @@
   //     Bot |   /
   //       MergeMem
   //         |
-  //   MemBarVolatile (trailing)
+  //         |
+  //    MemBarVolatile {trailing}
   //
   // configuration 1 is only valid if UseConcMarkSweepGC &&
   // UseCondCardMark
@@ -1955,8 +2289,9 @@
 	    break;
 	  }
 	}
-	if (!phi)
+	if (!phi) {
 	  return NULL;
+	}
 	// look for another merge below this phi
 	feed = phi;
       } else {
@@ -1969,7 +2304,7 @@
     assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
 
     MemBarNode *trailing = NULL;
-    // be sure we have a volatile membar below the merge
+    // be sure we have a trailing membar below the merge
     for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
       x = mm->fast_out(i);
       if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
@@ -1984,24 +2319,32 @@
   // trailing_to_card_mark
   //
   // graph traversal helper which detects extra, non-normal Mem feed
-  // from a trailing membar to a preceding card mark volatile membar
-  // i.e. it identifies whether one of the three possible extra GC
-  // post-write Mem flow subgraphs is present
+  // from a trailing volatile membar to a preceding card mark volatile
+  // membar i.e. it identifies whether one of the three possible extra
+  // GC post-write Mem flow subgraphs is present
   //
   // this predicate checks for the same flow as the previous predicate
   // but starting from the bottom rather than the top.
   //
-  // if the configurationis present returns the card mark membar
+  // if the configuration is present returns the card mark membar
   // otherwise NULL
+  //
+  // n.b. the supplied membar is expected to be a trailing
+  // MemBarVolatile i.e. the caller must ensure the input node has the
+  // correct opcode
 
   MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
   {
-    assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
-
+    assert(trailing->Opcode() == Op_MemBarVolatile,
+	   "expecting a volatile membar");
+    assert(!is_card_mark_membar(trailing),
+	   "not expecting a card mark membar");
+
+    // the Mem feed to the membar should be a merge
     Node *x = trailing->in(TypeFunc::Memory);
-    // the Mem feed to the membar should be a merge
-    if (!x->is_MergeMem())
+    if (!x->is_MergeMem()) {
       return NULL;
+    }
 
     MergeMemNode *mm = x->as_MergeMem();
 
@@ -2054,13 +2397,15 @@
     }
     // the proj has to come from the card mark membar
     x = x->in(0);
-    if (!x->is_MemBar())
+    if (!x->is_MemBar()) {
       return NULL;
+    }
 
     MemBarNode *card_mark_membar = x->as_MemBar();
 
-    if (!is_card_mark_membar(card_mark_membar))
+    if (!is_card_mark_membar(card_mark_membar)) {
       return NULL;
+    }
 
     return card_mark_membar;
   }
@@ -2068,7 +2413,7 @@
   // trailing_to_leading
   //
   // graph traversal helper which checks the Mem flow up the graph
-  // from a (non-card mark) volatile membar attempting to locate and
+  // from a (non-card mark) trailing membar attempting to locate and
   // return an associated leading membar. it first looks for a
   // subgraph in the normal configuration (relying on helper
   // normal_to_leading). failing that it then looks for one of the
@@ -2081,22 +2426,35 @@
   // if the configuration is valid returns the cpuorder member for
   // preference or when absent the release membar otherwise NULL.
   //
-  // n.b. the input membar is expected to be a volatile membar but
-  // must *not* be a card mark membar.
+  // n.b. the input membar is expected to be either a volatile or
+  // acquire membar but in the former case must *not* be a card mark
+  // membar.
 
   MemBarNode *trailing_to_leading(const MemBarNode *trailing)
   {
-    assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
+    assert((trailing->Opcode() == Op_MemBarAcquire ||
+	    trailing->Opcode() == Op_MemBarVolatile),
+	   "expecting an acquire or volatile membar");
+    assert((trailing->Opcode() != Op_MemBarVolatile ||
+	    !is_card_mark_membar(trailing)),
+	   "not expecting a card mark membar");
 
     MemBarNode *leading = normal_to_leading(trailing);
 
-    if (leading)
+    if (leading) {
       return leading;
+    }
+
+    // nothing more to do if this is an acquire
+    if (trailing->Opcode() == Op_MemBarAcquire) {
+      return NULL;
+    }
 
     MemBarNode *card_mark_membar = trailing_to_card_mark(trailing);
 
-    if (!card_mark_membar)
+    if (!card_mark_membar) {
       return NULL;
+    }
 
     return normal_to_leading(card_mark_membar);
   }
@@ -2105,10 +2463,12 @@
 
 bool unnecessary_acquire(const Node *barrier)
 {
-  // assert barrier->is_MemBar();
-  if (UseBarriersForVolatile)
+  assert(barrier->is_MemBar(), "expecting a membar");
+
+  if (UseBarriersForVolatile) {
     // we need to plant a dmb
     return false;
+  }
 
   // a volatile read derived from bytecode (or also from an inlined
   // SHA field read via LibraryCallKit::load_field_from_object)
@@ -2140,8 +2500,9 @@
     //
     // where * tags node we were passed
     // and |k means input k
-    if (x->is_DecodeNarrowPtr())
+    if (x->is_DecodeNarrowPtr()) {
       x = x->in(1);
+    }
 
     return (x->is_Load() && x->as_Load()->is_acquire());
   }
@@ -2167,8 +2528,9 @@
     return false;
   ctl = parent->proj_out(TypeFunc::Control);
   mem = parent->proj_out(TypeFunc::Memory);
-  if (!ctl || !mem)
+  if (!ctl || !mem) {
     return false;
+  }
   // ensure the proj nodes both feed a LoadX[mo_acquire]
   LoadNode *ld = NULL;
   for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
@@ -2180,38 +2542,46 @@
     }
   }
   // it must be an acquiring load
-  if (! ld || ! ld->is_acquire())
-    return false;
-  for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
-    x = mem->fast_out(i);
-    // if we see the same load we drop it and stop searching
-    if (x == ld) {
-      ld = NULL;
-      break;
-    }
-  }
-  // we must have dropped the load
-  if (ld)
-    return false;
-  // check for a child cpuorder membar
-  MemBarNode *child  = child_membar(barrier->as_MemBar());
-  if (!child || child->Opcode() != Op_MemBarCPUOrder)
-    return false;
-
-  return true;
+  if (ld && ld->is_acquire()) {
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      // if we see the same load we drop it and stop searching
+      if (x == ld) {
+	ld = NULL;
+	break;
+      }
+    }
+    // we must have dropped the load
+    if (ld == NULL) {
+      // check for a child cpuorder membar
+      MemBarNode *child  = child_membar(barrier->as_MemBar());
+      if (child && child->Opcode() == Op_MemBarCPUOrder)
+	return true;
+    }
+  }
+
+  // final option for unnecessary membar is that it is a trailing node
+  // belonging to a CAS
+
+  MemBarNode *leading = trailing_to_leading(barrier->as_MemBar());
+
+  return leading != NULL;
 }
 
 bool needs_acquiring_load(const Node *n)
 {
-  // assert n->is_Load();
-  if (UseBarriersForVolatile)
+  assert(n->is_Load(), "expecting a load");
+  if (UseBarriersForVolatile) {
     // we use a normal load and a dmb
     return false;
+  }
 
   LoadNode *ld = n->as_Load();
 
-  if (!ld->is_acquire())
+  if (!ld->is_acquire()) {
     return false;
+  }
 
   // check if this load is feeding an acquire membar
   //
@@ -2261,20 +2631,23 @@
 
   membar = parent_membar(ld);
 
-  if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
+  if (!membar || membar->Opcode() != Op_MemBarCPUOrder) {
     return false;
+  }
 
   // ensure that there is a CPUOrder->Acquire->CPUOrder membar chain
 
   membar = child_membar(membar);
 
-  if (!membar || !membar->Opcode() == Op_MemBarAcquire)
+  if (!membar || membar->Opcode() != Op_MemBarAcquire) {
     return false;
+  }
 
   membar = child_membar(membar);
   
-  if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
+  if (!membar || membar->Opcode() != Op_MemBarCPUOrder) {
     return false;
+  }
 
   return true;
 }
@@ -2285,9 +2658,10 @@
 	  n->Opcode() == Op_MemBarRelease),
 	 "expecting a release membar");
 
-  if (UseBarriersForVolatile)
+  if (UseBarriersForVolatile) {
     // we need to plant a dmb
     return false;
+  }
 
   // if there is a dependent CPUOrder barrier then use that as the
   // leading
@@ -2303,12 +2677,14 @@
   // must start with a normal feed
   MemBarNode *child_barrier = leading_to_normal(barrier);
 
-  if (!child_barrier)
+  if (!child_barrier) {
     return false;
-
-  if (!is_card_mark_membar(child_barrier))
+  }
+
+  if (!is_card_mark_membar(child_barrier)) {
     // this is the trailing membar and we are done
     return true;
+  }
 
   // must be sure this card mark feeds a trailing membar
   MemBarNode *trailing = card_mark_to_trailing(child_barrier);
@@ -2318,17 +2694,19 @@
 bool unnecessary_volatile(const Node *n)
 {
   // assert n->is_MemBar();
-  if (UseBarriersForVolatile)
+  if (UseBarriersForVolatile) {
     // we need to plant a dmb
     return false;
+  }
 
   MemBarNode *mbvol = n->as_MemBar();
 
   // first we check if this is part of a card mark. if so then we have
   // to generate a StoreLoad barrier
   
-  if (is_card_mark_membar(mbvol))
+  if (is_card_mark_membar(mbvol)) {
       return false;
+  }
 
   // ok, if it's not a card mark then we still need to check if it is
   // a trailing membar of a volatile put hgraph.
@@ -2341,29 +2719,33 @@
 bool needs_releasing_store(const Node *n)
 {
   // assert n->is_Store();
-  if (UseBarriersForVolatile)
+  if (UseBarriersForVolatile) {
     // we use a normal store and dmb combination
     return false;
+  }
 
   StoreNode *st = n->as_Store();
 
   // the store must be marked as releasing
-  if (!st->is_release())
+  if (!st->is_release()) {
     return false;
+  }
 
   // the store must be fed by a membar
 
   Node *x = st->lookup(StoreNode::Memory);
 
-  if (! x || !x->is_Proj())
+  if (! x || !x->is_Proj()) {
     return false;
+  }
 
   ProjNode *proj = x->as_Proj();
 
   x = proj->lookup(0);
 
-  if (!x || !x->is_MemBar())
+  if (!x || !x->is_MemBar()) {
     return false;
+  }
 
   MemBarNode *barrier = x->as_MemBar();
 
@@ -2372,24 +2754,76 @@
   // volatile put graph.
 
   // reject invalid candidates
-  if (!leading_membar(barrier))
+  if (!leading_membar(barrier)) {
     return false;
+  }
 
   // does this lead a normal subgraph?
   MemBarNode *mbvol = leading_to_normal(barrier);
 
-  if (!mbvol)
+  if (!mbvol) {
     return false;
+  }
 
   // all done unless this is a card mark
-  if (!is_card_mark_membar(mbvol))
+  if (!is_card_mark_membar(mbvol)) {
     return true;
+  }
   
   // we found a card mark -- just make sure we have a trailing barrier
 
   return (card_mark_to_trailing(mbvol) != NULL);
 }
 
+// predicate controlling translation of CAS
+//
+// returns true if CAS needs to use an acquiring load otherwise false
+
+bool needs_acquiring_load_exclusive(const Node *n)
+{
+  assert(is_CAS(n->Opcode()), "expecting a compare and swap");
+  if (UseBarriersForVolatile) {
+    return false;
+  }
+
+  // CAS nodes only ought to turn up in inlined unsafe CAS operations
+#ifdef ASSERT
+  LoadStoreNode *st = n->as_LoadStore();
+
+  // the CAS must be fed by a membar
+
+  Node *x = st->lookup(StoreNode::Memory);
+
+  assert (x && x->is_Proj(), "CAS not fed by memory proj!");
+
+  ProjNode *proj = x->as_Proj();
+
+  x = proj->lookup(0);
+
+  assert (x && x->is_MemBar(), "CAS not fed by membar!");
+
+  MemBarNode *barrier = x->as_MemBar();
+
+  // the barrier must be a cpuorder membar fed by a release membar
+
+  assert(barrier->Opcode() == Op_MemBarCPUOrder,
+	 "CAS not fed by cpuorder membar!");
+      
+  MemBarNode *b = parent_membar(barrier);
+  assert ((b != NULL && b->Opcode() == Op_MemBarRelease),
+	  "CAS not fed by cpuorder+release membar pair!");
+
+  // does this lead a normal subgraph?
+  MemBarNode *mbar = leading_to_normal(barrier);
+
+  assert(mbar != NULL, "CAS not embedded in normal graph!");
+
+  assert(mbar->Opcode() == Op_MemBarAcquire, "trailing membar should be an acquire");
+#endif // ASSERT
+  // so we can just return true here
+  return true;
+}
+
 // predicate controlling translation of StoreCM
 //
 // returns true if a StoreStore must precede the card write otherwise
@@ -2403,14 +2837,16 @@
   // and the associated card mark when we are using CMS without
   // conditional card marking
 
-  if (!UseConcMarkSweepGC || UseCondCardMark)
+  if (!UseConcMarkSweepGC || UseCondCardMark) {
     return true;
+  }
 
   // if we are implementing volatile puts using barriers then the
   // object put as an str so we must insert the dmb ishst
 
-  if (UseBarriersForVolatile)
+  if (UseBarriersForVolatile) {
     return false;
+  }
 
   // we can omit the dmb ishst if this StoreCM is part of a volatile
   // put because in thta case the put will be implemented by stlr
@@ -2422,19 +2858,22 @@
 
   Node *x = storecm->in(StoreNode::Memory);
 
-  if (!x->is_Proj())
+  if (!x->is_Proj()) {
     return false;
+  }
 
   x = x->in(0);
 
-  if (!x->is_MemBar())
+  if (!x->is_MemBar()) {
     return false;
+  }
 
   MemBarNode *leading = x->as_MemBar();
 
   // reject invalid candidates
-  if (!leading_membar(leading))
+  if (!leading_membar(leading)) {
     return false;
+  }
 
   // we can omit the StoreStore if it is the head of a normal subgraph
   return (leading_to_normal(leading) != NULL);
@@ -3024,6 +3463,10 @@
   return true;  // Per default match rules are supported.
 }
 
+const int Matcher::float_pressure(int default_pressure_threshold) {
+  return default_pressure_threshold;
+}
+
 int Matcher::regnum_to_fpu_offset(int regnum)
 {
   Unimplemented();
@@ -8365,9 +8808,13 @@
 // XXX No flag versions for CompareAndSwap{I,L,P,N} because matcher
 // can't match them
 
+// standard CompareAndSwapX when we are using barriers
+// these have higher priority than the rules selected by a predicate
+
 instruct compareAndSwapI(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapI mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
@@ -8385,6 +8832,7 @@
 instruct compareAndSwapL(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapL mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
@@ -8402,6 +8850,7 @@
 instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapP mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
@@ -8419,6 +8868,7 @@
 instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapN mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
@@ -8433,6 +8883,84 @@
   ins_pipe(pipe_slow);
 %}
 
+// alternative CompareAndSwapX when we are eliding barriers
+
+instruct compareAndSwapIAcq(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapI mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchgw_acq $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+instruct compareAndSwapLAcq(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapL mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchg_acq $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+instruct compareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapP mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchg_acq $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapN mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchgw_acq $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
 
 instruct get_and_setI(indirect mem, iRegINoSp newv, iRegI prev) %{
   match(Set prev (GetAndSetI mem newv));
@@ -13286,6 +13814,25 @@
   ins_pipe(pipe_cmp_branch);
 %}
 
+instruct cmpP_narrowOop_imm0_branch(cmpOp cmp, iRegN oop, immP0 zero, label labl, rFlagsReg cr) %{
+  match(If cmp (CmpP (DecodeN oop) zero));
+  predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
+            || n->in(1)->as_Bool()->_test._test == BoolTest::eq);
+  effect(USE labl);
+
+  ins_cost(BRANCH_COST);
+  format %{ "cb$cmp   $oop, $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
+    if (cond == Assembler::EQ)
+      __ cbzw($oop$$Register, *L);
+    else
+      __ cbnzw($oop$$Register, *L);
+  %}
+  ins_pipe(pipe_cmp_branch);
+%}
+
 // Conditional Far Branch
 // Conditional Far Branch Unsigned
 // TODO: fixme
@@ -14662,6 +15209,102 @@
   ins_pipe(pipe_class_default);
 %}
 
+// --------------------------------- SQRT -------------------------------------
+
+instruct vsqrt2D(vecX dst, vecX src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SqrtVD src));
+  format %{ "fsqrt  $dst, $src\t# vector (2D)" %}
+  ins_encode %{
+    __ fsqrt(as_FloatRegister($dst$$reg), __ T2D,
+             as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- ABS --------------------------------------
+
+instruct vabs2F(vecD dst, vecD src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AbsVF src));
+  ins_cost(INSN_COST * 3);
+  format %{ "fabs  $dst,$src\t# vector (2S)" %}
+  ins_encode %{
+    __ fabs(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vabs4F(vecX dst, vecX src)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AbsVF src));
+  ins_cost(INSN_COST * 3);
+  format %{ "fabs  $dst,$src\t# vector (4S)" %}
+  ins_encode %{
+    __ fabs(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vabs2D(vecX dst, vecX src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AbsVD src));
+  ins_cost(INSN_COST * 3);
+  format %{ "fabs  $dst,$src\t# vector (2D)" %}
+  ins_encode %{
+    __ fabs(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- NEG --------------------------------------
+
+instruct vneg2F(vecD dst, vecD src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (NegVF src));
+  ins_cost(INSN_COST * 3);
+  format %{ "fneg  $dst,$src\t# vector (2S)" %}
+  ins_encode %{
+    __ fneg(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vneg4F(vecX dst, vecX src)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (NegVF src));
+  ins_cost(INSN_COST * 3);
+  format %{ "fneg  $dst,$src\t# vector (4S)" %}
+  ins_encode %{
+    __ fneg(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vneg2D(vecX dst, vecX src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (NegVD src));
+  ins_cost(INSN_COST * 3);
+  format %{ "fneg  $dst,$src\t# vector (2D)" %}
+  ins_encode %{
+    __ fneg(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 // --------------------------------- AND --------------------------------------
 
 instruct vand8B(vecD dst, vecD src1, vecD src2)
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -2311,6 +2311,12 @@
 
 #define MSG "invalid arrangement"
 
+#define ASSERTION (T == T2S || T == T4S || T == T2D)
+  INSN(fsqrt, 1, 0b11111);
+  INSN(fabs,  0, 0b01111);
+  INSN(fneg,  1, 0b01111);
+#undef ASSERTION
+
 #define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H || T == T2S || T == T4S)
   INSN(rev64, 0, 0b00000);
 #undef ASSERTION
--- a/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -72,6 +72,7 @@
 define_pd_global(bool, UseCISCSpill,                 true);
 define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoRegScheduling,            false);
 
 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
 define_pd_global(intx, NonProfiledCodeHeapSize,      21*M);
--- a/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -42,6 +42,11 @@
 
 // Implementation of InterpreterMacroAssembler
 
+void InterpreterMacroAssembler::jump_to_entry(address entry) {
+  assert(entry, "Entry must have been generated by now");
+  b(entry);
+}
+
 #ifndef CC_INTERP
 
 void InterpreterMacroAssembler::check_and_handle_popframe(Register java_thread) {
--- a/src/cpu/aarch64/vm/interp_masm_aarch64.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/interp_masm_aarch64.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -66,6 +66,8 @@
 
   void load_earlyret_value(TosState state);
 
+  void jump_to_entry(address entry);
+
 #ifdef CC_INTERP
   void save_bcp()                                          { /*  not needed in c++ interpreter and harmless */ }
   void restore_bcp()                                       { /*  not needed in c++ interpreter and harmless */ }
--- a/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -41,13 +41,13 @@
   address generate_native_entry(bool synchronized);
   address generate_abstract_entry(void);
   address generate_math_entry(AbstractInterpreter::MethodKind kind);
-  address generate_jump_to_normal_entry(void);
-  address generate_accessor_entry(void) { return generate_jump_to_normal_entry(); }
-  address generate_empty_entry(void) { return generate_jump_to_normal_entry(); }
+  address generate_accessor_entry(void) { return NULL; }
+  address generate_empty_entry(void) { return NULL; }
   void generate_transcendental_entry(AbstractInterpreter::MethodKind kind, int fpargs);
   address generate_Reference_get_entry();
   address generate_CRC32_update_entry();
   address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+  address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
   void lock_method(void);
   void generate_stack_overflow_check(void);
 
--- a/src/cpu/aarch64/vm/interpreter_aarch64.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/interpreter_aarch64.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -236,17 +236,6 @@
   __ blrt(rscratch1, gpargs, fpargs, rtype);
 }
 
-// Jump into normal path for accessor and empty entry to jump to normal entry
-// The "fast" optimization don't update compilation count therefore can disable inlining
-// for these functions that should be inlined.
-address InterpreterGenerator::generate_jump_to_normal_entry(void) {
-  address entry_point = __ pc();
-
-  assert(Interpreter::entry_for_kind(Interpreter::zerolocals) != NULL, "should already be generated");
-  __ b(Interpreter::entry_for_kind(Interpreter::zerolocals));
-  return entry_point;
-}
-
 // Abstract method entry
 // Attempt to execute abstract method. Throw exception
 address InterpreterGenerator::generate_abstract_entry(void) {
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -2286,18 +2286,30 @@
 }
 #endif
 
-void MacroAssembler::push_CPU_state() {
-    push(0x3fffffff, sp);         // integer registers except lr & sp
-
+void MacroAssembler::push_CPU_state(bool save_vectors) {
+  push(0x3fffffff, sp);         // integer registers except lr & sp
+
+  if (!save_vectors) {
     for (int i = 30; i >= 0; i -= 2)
       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
            Address(pre(sp, -2 * wordSize)));
+  } else {
+    for (int i = 30; i >= 0; i -= 2)
+      stpq(as_FloatRegister(i), as_FloatRegister(i+1),
+           Address(pre(sp, -4 * wordSize)));
+  }
 }
 
-void MacroAssembler::pop_CPU_state() {
-  for (int i = 0; i < 32; i += 2)
-    ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
-         Address(post(sp, 2 * wordSize)));
+void MacroAssembler::pop_CPU_state(bool restore_vectors) {
+  if (!restore_vectors) {
+    for (int i = 0; i < 32; i += 2)
+      ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
+           Address(post(sp, 2 * wordSize)));
+  } else {
+    for (int i = 0; i < 32; i += 2)
+      ldpq(as_FloatRegister(i), as_FloatRegister(i+1),
+           Address(post(sp, 4 * wordSize)));
+  }
 
   pop(0x3fffffff, sp);         // integer registers except lr & sp
 }
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -777,8 +777,8 @@
 
   DEBUG_ONLY(void verify_heapbase(const char* msg);)
 
-  void push_CPU_state();
-  void pop_CPU_state() ;
+  void push_CPU_state(bool save_vectors = false);
+  void pop_CPU_state(bool restore_vectors = false) ;
 
   // Round up to a power of two
   void round_to(Register reg, int modulus);
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -75,8 +75,8 @@
 // FIXME -- this is used by C1
 class RegisterSaver {
  public:
-  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words);
-  static void restore_live_registers(MacroAssembler* masm);
+  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
+  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 
   // Offsets into the register save area
   // Used by deoptimization when it is managing result register
@@ -108,7 +108,17 @@
 
 };
 
-OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words) {
+OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
+#ifdef COMPILER2
+  if (save_vectors) {
+    // Save upper half of vector registers
+    int vect_words = 32 * 8 / wordSize;
+    additional_frame_words += vect_words;
+  }
+#else
+  assert(!save_vectors, "vectors are generated only by C2");
+#endif
+
   int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
                                      reg_save_size*BytesPerInt, 16);
   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
@@ -122,7 +132,7 @@
   // Save registers, fpu state, and flags.
 
   __ enter();
-  __ push_CPU_state();
+  __ push_CPU_state(save_vectors);
 
   // Set an oopmap for the call site.  This oopmap will map all
   // oop-registers and debug-info registers as callee-saved.  This
@@ -139,14 +149,14 @@
                                     // register slots are 8 bytes
                                     // wide, 32 floating-point
                                     // registers
-      oop_map->set_callee_saved(VMRegImpl::stack2reg(sp_offset),
+      oop_map->set_callee_saved(VMRegImpl::stack2reg(sp_offset + additional_frame_slots),
                                 r->as_VMReg());
     }
   }
 
   for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) {
     FloatRegister r = as_FloatRegister(i);
-    int sp_offset = 2 * i;
+    int sp_offset = save_vectors ? (4 * i) : (2 * i);
     oop_map->set_callee_saved(VMRegImpl::stack2reg(sp_offset),
                               r->as_VMReg());
   }
@@ -154,8 +164,11 @@
   return oop_map;
 }
 
-void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
-  __ pop_CPU_state();
+void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
+#ifndef COMPILER2
+  assert(!restore_vectors, "vectors are generated only by C2");
+#endif
+  __ pop_CPU_state(restore_vectors);
   __ leave();
 }
 
@@ -177,9 +190,9 @@
 }
 
 // Is vector's size (in bytes) bigger than a size saved by default?
-// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
+// Only the low 8 bytes of each vector register are saved by default on AArch64.
 bool SharedRuntime::is_wide_vector(int size) {
-  return size > 16;
+  return size > 8;
 }
 // The java_calling_convention describes stack locations as ideal slots on
 // a frame with no abi restrictions. Since we must observe abi restrictions
@@ -1146,7 +1159,7 @@
     assert((unsigned)gpargs < 256, "eek!");
     assert((unsigned)fpargs < 32, "eek!");
     __ lea(rscratch1, RuntimeAddress(dest));
-    __ mov(rscratch2, (gpargs << 6) | (fpargs << 2) | type);
+    if (UseBuiltinSim)   __ mov(rscratch2, (gpargs << 6) | (fpargs << 2) | type);
     __ blrt(rscratch1, rscratch2);
     __ maybe_isb();
   }
@@ -1521,14 +1534,13 @@
 
   int vep_offset = ((intptr_t)__ pc()) - start;
 
-  // Generate stack overflow check
-
   // If we have to make this method not-entrant we'll overwrite its
   // first instruction with a jump.  For this action to be legal we
   // must ensure that this first instruction is a B, BL, NOP, BKPT,
   // SVC, HVC, or SMC.  Make it a NOP.
   __ nop();
 
+  // Generate stack overflow check
   if (UseStackBanging) {
     __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
   } else {
@@ -1709,23 +1721,20 @@
   // need to spill before we call out
   int c_arg = total_c_args - total_in_args;
 
-  // Pre-load a static method's oop into r20.  Used both by locking code and
-  // the normal JNI call code.
+  // Pre-load a static method's oop into c_rarg1.
   if (method->is_static() && !is_critical_native) {
 
     //  load oop into a register
-    __ movoop(oop_handle_reg,
+    __ movoop(c_rarg1,
               JNIHandles::make_local(method->method_holder()->java_mirror()),
               /*immediate*/true);
 
     // Now handlize the static class mirror it's known not-null.
-    __ str(oop_handle_reg, Address(sp, klass_offset));
+    __ str(c_rarg1, Address(sp, klass_offset));
     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
 
     // Now get the handle
-    __ lea(oop_handle_reg, Address(sp, klass_offset));
-    // store the klass handle as second argument
-    __ mov(c_rarg1, oop_handle_reg);
+    __ lea(c_rarg1, Address(sp, klass_offset));
     // and protect the arg if we must spill
     c_arg--;
   }
@@ -1740,19 +1749,13 @@
 
   __ set_last_Java_frame(sp, noreg, (address)the_pc, rscratch1);
 
-
-  // We have all of the arguments setup at this point. We must not touch any register
-  // argument registers at this point (what if we save/restore them there are no oop?
-
+  Label dtrace_method_entry, dtrace_method_entry_done;
   {
-    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
-    // protect the args we've loaded
-    save_args(masm, total_c_args, c_arg, out_regs);
-    __ mov_metadata(c_rarg1, method());
-    __ call_VM_leaf(
-      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
-      rthread, c_rarg1);
-    restore_args(masm, total_c_args, c_arg, out_regs);
+    unsigned long offset;
+    __ adrp(rscratch1, ExternalAddress((address)&DTraceMethodProbes), offset);
+    __ ldrb(rscratch1, Address(rscratch1, offset));
+    __ cbnzw(rscratch1, dtrace_method_entry);
+    __ bind(dtrace_method_entry_done);
   }
 
   // RedefineClasses() tracing support for obsolete method entry
@@ -1782,7 +1785,6 @@
   if (method->is_synchronized()) {
     assert(!is_critical_native, "unhandled");
 
-
     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
 
     // Get the handle (the 2nd argument)
@@ -1838,7 +1840,6 @@
 
   // Finally just about ready to make the JNI call
 
-
   // get JNIEnv* which is first argument to native
   if (!is_critical_native) {
     __ lea(c_rarg0, Address(rthread, in_bytes(JavaThread::jni_environment_offset())));
@@ -1879,9 +1880,9 @@
 
   // Unpack native results.
   switch (ret_type) {
-  case T_BOOLEAN: __ ubfx(r0, r0, 0, 8);            break;
+  case T_BOOLEAN: __ ubfx(r0, r0, 0, 8);             break;
   case T_CHAR   : __ ubfx(r0, r0, 0, 16);            break;
-  case T_BYTE   : __ sbfx(r0, r0, 0, 8);            break;
+  case T_BYTE   : __ sbfx(r0, r0, 0, 8);             break;
   case T_SHORT  : __ sbfx(r0, r0, 0, 16);            break;
   case T_INT    : __ sbfx(r0, r0, 0, 32);            break;
   case T_DOUBLE :
@@ -1904,14 +1905,17 @@
   //     Thread A is resumed to finish this native method, but doesn't block here since it
   //     didn't see any synchronization is progress, and escapes.
   __ mov(rscratch1, _thread_in_native_trans);
-  __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
-  __ stlrw(rscratch1, rscratch2);
 
   if(os::is_MP()) {
     if (UseMembar) {
+      __ strw(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+
       // Force this write out before the read below
       __ dmb(Assembler::SY);
     } else {
+      __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
+      __ stlrw(rscratch1, rscratch2);
+
       // Write serialization page so VM thread can do a pseudo remote membar.
       // We use the current thread pointer to calculate a thread specific
       // offset to write to within the page. This minimizes bus traffic
@@ -1920,25 +1924,220 @@
     }
   }
 
+  // check for safepoint operation in progress and/or pending suspend requests
+  Label safepoint_in_progress, safepoint_in_progress_done;
+  {
+    assert(SafepointSynchronize::_not_synchronized == 0, "fix this code");
+    unsigned long offset;
+    __ adrp(rscratch1,
+            ExternalAddress((address)SafepointSynchronize::address_of_state()),
+            offset);
+    __ ldrw(rscratch1, Address(rscratch1, offset));
+    __ cbnzw(rscratch1, safepoint_in_progress);
+    __ ldrw(rscratch1, Address(rthread, JavaThread::suspend_flags_offset()));
+    __ cbnzw(rscratch1, safepoint_in_progress);
+    __ bind(safepoint_in_progress_done);
+  }
+
+  // change thread state
   Label after_transition;
-
-  // check for safepoint operation in progress and/or pending suspend requests
+  __ mov(rscratch1, _thread_in_Java);
+  __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
+  __ stlrw(rscratch1, rscratch2);
+  __ bind(after_transition);
+
+  Label reguard;
+  Label reguard_done;
+  __ ldrb(rscratch1, Address(rthread, JavaThread::stack_guard_state_offset()));
+  __ cmpw(rscratch1, JavaThread::stack_guard_yellow_disabled);
+  __ br(Assembler::EQ, reguard);
+  __ bind(reguard_done);
+
+  // native result if any is live
+
+  // Unlock
+  Label unlock_done;
+  Label slow_path_unlock;
+  if (method->is_synchronized()) {
+
+    // Get locked oop from the handle we passed to jni
+    __ ldr(obj_reg, Address(oop_handle_reg, 0));
+
+    Label done;
+
+    if (UseBiasedLocking) {
+      __ biased_locking_exit(obj_reg, old_hdr, done);
+    }
+
+    // Simple recursive lock?
+
+    __ ldr(rscratch1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
+    __ cbz(rscratch1, done);
+
+    // Must save r0 if it is live now because cmpxchg must use it
+    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
+      save_native_result(masm, ret_type, stack_slots);
+    }
+
+
+    // get address of the stack lock
+    __ lea(r0, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
+    //  get old displaced header
+    __ ldr(old_hdr, Address(r0, 0));
+
+    // Atomic swap old header if oop still contains the stack lock
+    Label succeed;
+    __ cmpxchgptr(r0, old_hdr, obj_reg, rscratch1, succeed, &slow_path_unlock);
+    __ bind(succeed);
+
+    // slow path re-enters here
+    __ bind(unlock_done);
+    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
+      restore_native_result(masm, ret_type, stack_slots);
+    }
+
+    __ bind(done);
+  }
+
+  Label dtrace_method_exit, dtrace_method_exit_done;
   {
-    Label Continue;
-
-    { unsigned long offset;
-      __ adrp(rscratch1,
-              ExternalAddress((address)SafepointSynchronize::address_of_state()),
-              offset);
-      __ ldrw(rscratch1, Address(rscratch1, offset));
+    unsigned long offset;
+    __ adrp(rscratch1, ExternalAddress((address)&DTraceMethodProbes), offset);
+    __ ldrb(rscratch1, Address(rscratch1, offset));
+    __ cbnzw(rscratch1, dtrace_method_exit);
+    __ bind(dtrace_method_exit_done);
+  }
+
+  __ reset_last_Java_frame(false, true);
+
+  // Unpack oop result
+  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
+      Label L;
+      __ cbz(r0, L);
+      __ ldr(r0, Address(r0, 0));
+      __ bind(L);
+      __ verify_oop(r0);
+  }
+
+  if (!is_critical_native) {
+    // reset handle block
+    __ ldr(r2, Address(rthread, JavaThread::active_handles_offset()));
+    __ str(zr, Address(r2, JNIHandleBlock::top_offset_in_bytes()));
+  }
+
+  __ leave();
+
+  if (!is_critical_native) {
+    // Any exception pending?
+    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+    __ cbnz(rscratch1, exception_pending);
+  }
+
+  // record exit from native wrapper code
+  if (NotifySimulator) {
+    __ notify(Assembler::method_reentry);
+  }
+
+  // We're done
+  __ ret(lr);
+
+  // Unexpected paths are out of line and go here
+
+  if (!is_critical_native) {
+    // forward the exception
+    __ bind(exception_pending);
+
+    // and forward the exception
+    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
+  }
+
+  // Slow path locking & unlocking
+  if (method->is_synchronized()) {
+
+    __ block_comment("Slow path lock {");
+    __ bind(slow_path_lock);
+
+    // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
+    // args are (oop obj, BasicLock* lock, JavaThread* thread)
+
+    // protect the args we've loaded
+    save_args(masm, total_c_args, c_arg, out_regs);
+
+    __ mov(c_rarg0, obj_reg);
+    __ mov(c_rarg1, lock_reg);
+    __ mov(c_rarg2, rthread);
+
+    // Not a leaf but we have last_Java_frame setup as we want
+    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
+    restore_args(masm, total_c_args, c_arg, out_regs);
+
+#ifdef ASSERT
+    { Label L;
+      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+      __ cbz(rscratch1, L);
+      __ stop("no pending exception allowed on exit from monitorenter");
+      __ bind(L);
     }
-    __ cmpw(rscratch1, SafepointSynchronize::_not_synchronized);
-
-    Label L;
-    __ br(Assembler::NE, L);
-    __ ldrw(rscratch1, Address(rthread, JavaThread::suspend_flags_offset()));
-    __ cbz(rscratch1, Continue);
-    __ bind(L);
+#endif
+    __ b(lock_done);
+
+    __ block_comment("} Slow path lock");
+
+    __ block_comment("Slow path unlock {");
+    __ bind(slow_path_unlock);
+
+    // If we haven't already saved the native result we must save it now as the
+    // floating-point registers are still exposed.
+
+    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
+      save_native_result(masm, ret_type, stack_slots);
+    }
+
+    __ mov(c_rarg2, rthread);
+    __ lea(c_rarg1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
+    __ mov(c_rarg0, obj_reg);
+
+    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
+    // NOTE that obj_reg == r19 currently
+    __ ldr(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+    __ str(zr, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+
+    rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 3, 0, 1);
+
+#ifdef ASSERT
+    {
+      Label L;
+      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+      __ cbz(rscratch1, L);
+      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
+      __ bind(L);
+    }
+#endif /* ASSERT */
+
+    __ str(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+
+    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
+      restore_native_result(masm, ret_type, stack_slots);
+    }
+    __ b(unlock_done);
+
+    __ block_comment("} Slow path unlock");
+
+  } // synchronized
+
+  // SLOW PATH Reguard the stack if needed
+
+  __ bind(reguard);
+  save_native_result(masm, ret_type, stack_slots);
+  rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages), 0, 0, 0);
+  restore_native_result(masm, ret_type, stack_slots);
+  // and continue
+  __ b(reguard_done);
+
+  // SLOW PATH safepoint
+  {
+    __ block_comment("safepoint {");
+    __ bind(safepoint_in_progress);
 
     // Don't use call_VM as it will see a possible pending exception and forward it
     // and never return here preventing us from clearing _last_native_pc down below.
@@ -1960,209 +2159,45 @@
 
     if (is_critical_native) {
       // The call above performed the transition to thread_in_Java so
-      // skip the transition logic below.
+      // skip the transition logic above.
       __ b(after_transition);
     }
 
-    __ bind(Continue);
+    __ b(safepoint_in_progress_done);
+    __ block_comment("} safepoint");
   }
 
-  // change thread state
-  __ mov(rscratch1, _thread_in_Java);
-  __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
-  __ stlrw(rscratch1, rscratch2);
-  __ bind(after_transition);
-
-  Label reguard;
-  Label reguard_done;
-  __ ldrb(rscratch1, Address(rthread, JavaThread::stack_guard_state_offset()));
-  __ cmpw(rscratch1, JavaThread::stack_guard_yellow_disabled);
-  __ br(Assembler::EQ, reguard);
-  __ bind(reguard_done);
-
-  // native result if any is live
-
-  // Unlock
-  Label unlock_done;
-  Label slow_path_unlock;
-  if (method->is_synchronized()) {
-
-    // Get locked oop from the handle we passed to jni
-    __ ldr(obj_reg, Address(oop_handle_reg, 0));
-
-    Label done;
-
-    if (UseBiasedLocking) {
-      __ biased_locking_exit(obj_reg, old_hdr, done);
-    }
-
-    // Simple recursive lock?
-
-    __ ldr(rscratch1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
-    __ cbz(rscratch1, done);
-
-    // Must save r0 if if it is live now because cmpxchg must use it
-    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
-      save_native_result(masm, ret_type, stack_slots);
-    }
-
-
-    // get address of the stack lock
-    __ lea(r0, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
-    //  get old displaced header
-    __ ldr(old_hdr, Address(r0, 0));
-
-    // Atomic swap old header if oop still contains the stack lock
-    Label succeed;
-    __ cmpxchgptr(r0, old_hdr, obj_reg, rscratch1, succeed, &slow_path_unlock);
-    __ bind(succeed);
-
-    // slow path re-enters here
-    __ bind(unlock_done);
-    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
-      restore_native_result(masm, ret_type, stack_slots);
-    }
-
-    __ bind(done);
-
+  // SLOW PATH dtrace support
+  {
+    __ block_comment("dtrace entry {");
+    __ bind(dtrace_method_entry);
+
+    // We have all of the arguments set up at this point. We must not touch any
+    // argument registers at this point (what if we save/restore them and there are no oops?).
+
+    save_args(masm, total_c_args, c_arg, out_regs);
+    __ mov_metadata(c_rarg1, method());
+    __ call_VM_leaf(
+      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
+      rthread, c_rarg1);
+    restore_args(masm, total_c_args, c_arg, out_regs);
+    __ b(dtrace_method_entry_done);
+    __ block_comment("} dtrace entry");
   }
+
   {
-    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
+    __ block_comment("dtrace exit {");
+    __ bind(dtrace_method_exit);
     save_native_result(masm, ret_type, stack_slots);
     __ mov_metadata(c_rarg1, method());
     __ call_VM_leaf(
          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
          rthread, c_rarg1);
     restore_native_result(masm, ret_type, stack_slots);
+    __ b(dtrace_method_exit_done);
+    __ block_comment("} dtrace exit");
   }
 
-  __ reset_last_Java_frame(false, true);
-
-  // Unpack oop result
-  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
-      Label L;
-      __ cbz(r0, L);
-      __ ldr(r0, Address(r0, 0));
-      __ bind(L);
-      __ verify_oop(r0);
-  }
-
-  if (!is_critical_native) {
-    // reset handle block
-    __ ldr(r2, Address(rthread, JavaThread::active_handles_offset()));
-    __ str(zr, Address(r2, JNIHandleBlock::top_offset_in_bytes()));
-  }
-
-  __ leave();
-
-  if (!is_critical_native) {
-    // Any exception pending?
-    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
-    __ cbnz(rscratch1, exception_pending);
-  }
-
-  // record exit from native wrapper code
-  if (NotifySimulator) {
-    __ notify(Assembler::method_reentry);
-  }
-
-  // We're done
-  __ ret(lr);
-
-  // Unexpected paths are out of line and go here
-
-  if (!is_critical_native) {
-    // forward the exception
-    __ bind(exception_pending);
-
-    // and forward the exception
-    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
-  }
-
-  // Slow path locking & unlocking
-  if (method->is_synchronized()) {
-
-    // BEGIN Slow path lock
-    __ bind(slow_path_lock);
-
-    // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
-    // args are (oop obj, BasicLock* lock, JavaThread* thread)
-
-    // protect the args we've loaded
-    save_args(masm, total_c_args, c_arg, out_regs);
-
-    __ mov(c_rarg0, obj_reg);
-    __ mov(c_rarg1, lock_reg);
-    __ mov(c_rarg2, rthread);
-
-    // Not a leaf but we have last_Java_frame setup as we want
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
-    restore_args(masm, total_c_args, c_arg, out_regs);
-
-#ifdef ASSERT
-    { Label L;
-      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
-      __ cbz(rscratch1, L);
-      __ stop("no pending exception allowed on exit from monitorenter");
-      __ bind(L);
-    }
-#endif
-    __ b(lock_done);
-
-    // END Slow path lock
-
-    // BEGIN Slow path unlock
-    __ bind(slow_path_unlock);
-
-    // If we haven't already saved the native result we must save it now as xmm registers
-    // are still exposed.
-
-    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
-      save_native_result(masm, ret_type, stack_slots);
-    }
-
-    __ mov(c_rarg2, rthread);
-    __ lea(c_rarg1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
-    __ mov(c_rarg0, obj_reg);
-
-    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
-    // NOTE that obj_reg == r19 currently
-    __ ldr(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
-    __ str(zr, Address(rthread, in_bytes(Thread::pending_exception_offset())));
-
-    rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 3, 0, 1);
-
-#ifdef ASSERT
-    {
-      Label L;
-      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
-      __ cbz(rscratch1, L);
-      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
-      __ bind(L);
-    }
-#endif /* ASSERT */
-
-    __ str(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
-
-    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
-      restore_native_result(masm, ret_type, stack_slots);
-    }
-    __ b(unlock_done);
-
-    // END Slow path unlock
-
-  } // synchronized
-
-  // SLOW PATH Reguard the stack if needed
-
-  __ bind(reguard);
-  save_native_result(masm, ret_type, stack_slots);
-  rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages), 0, 0, 0);
-  restore_native_result(masm, ret_type, stack_slots);
-  // and continue
-  __ b(reguard_done);
-
-
 
   __ flush();
 
@@ -2742,7 +2777,7 @@
   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
 
   // Save registers, fpu state, and flags
-  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
+  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
 
   // The following is basically a call_VM.  However, we need the precise
   // address of the call in order to generate an oopmap. Hence, we do all the
@@ -2793,7 +2828,7 @@
   __ bind(noException);
 
   // Normal exit, restore registers and exit.
-  RegisterSaver::restore_live_registers(masm);
+  RegisterSaver::restore_live_registers(masm, save_vectors);
 
   __ ret(lr);
 
--- a/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -721,8 +721,7 @@
 
     // generate a vanilla interpreter entry as the slow path
     __ bind(slow_path);
-    (void) generate_normal_entry(false);
-
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals));
     return entry;
   }
 #endif // INCLUDE_ALL_GCS
@@ -779,12 +778,10 @@
 
     // generate a vanilla native entry as the slow path
     __ bind(slow_path);
-
-    (void) generate_native_entry(false);
-
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
     return entry;
   }
-  return generate_native_entry(false);
+  return NULL;
 }
 
 /**
@@ -841,12 +838,10 @@
 
     // generate a vanilla native entry as the slow path
     __ bind(slow_path);
-
-    (void) generate_native_entry(false);
-
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
     return entry;
   }
-  return generate_native_entry(false);
+  return NULL;
 }
 
 void InterpreterGenerator::bang_stack_shadow_pages(bool native_call) {
--- a/src/cpu/ppc/vm/c2_globals_ppc.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/ppc/vm/c2_globals_ppc.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -60,6 +60,7 @@
 define_pd_global(bool, OptoPeephole,                 false);
 define_pd_global(bool, UseCISCSpill,                 false);
 define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoRegScheduling,            false);
 // GL:
 // Detected a problem with unscaled compressed oops and
 // narrow_oop_use_complex_address() == false.
--- a/src/cpu/ppc/vm/interp_masm_ppc_64.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/ppc/vm/interp_masm_ppc_64.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -46,7 +46,7 @@
   MacroAssembler::null_check_throw(a, offset, temp_reg, exception_entry);
 }
 
-void InterpreterMacroAssembler::branch_to_entry(address entry, Register Rscratch) {
+void InterpreterMacroAssembler::jump_to_entry(address entry, Register Rscratch) {
   assert(entry, "Entry must have been generated by now");
   if (is_within_range_of_b(entry, pc())) {
     b(entry);
--- a/src/cpu/ppc/vm/interp_masm_ppc_64.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/ppc/vm/interp_masm_ppc_64.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -39,7 +39,7 @@
 
   void null_check_throw(Register a, int offset, Register temp_reg);
 
-  void branch_to_entry(address entry, Register Rscratch);
+  void jump_to_entry(address entry, Register Rscratch);
 
   // Handy address generation macros.
 #define thread_(field_name) in_bytes(JavaThread::field_name ## _offset()), R16_thread
--- a/src/cpu/ppc/vm/interpreterGenerator_ppc.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/ppc/vm/interpreterGenerator_ppc.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -31,12 +31,12 @@
  private:
 
   address generate_abstract_entry(void);
-  address generate_jump_to_normal_entry(void);
-  address generate_accessor_entry(void) { return generate_jump_to_normal_entry(); }
-  address generate_empty_entry(void) { return generate_jump_to_normal_entry(); }
+  address generate_accessor_entry(void) { return NULL; }
+  address generate_empty_entry(void) { return NULL; }
   address generate_Reference_get_entry(void);
 
   address generate_CRC32_update_entry();
   address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+  address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
 
 #endif // CPU_PPC_VM_INTERPRETERGENERATOR_PPC_HPP
--- a/src/cpu/ppc/vm/interpreter_ppc.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/ppc/vm/interpreter_ppc.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -427,18 +427,6 @@
   return entry;
 }
 
-// Call an accessor method (assuming it is resolved, otherwise drop into
-// vanilla (slow path) entry.
-address InterpreterGenerator::generate_jump_to_normal_entry(void) {
-  address entry = __ pc();
-  address normal_entry = Interpreter::entry_for_kind(Interpreter::zerolocals);
-  assert(normal_entry != NULL, "should already be generated.");
-  __ branch_to_entry(normal_entry, R11_scratch1);
-  __ flush();
-
-  return entry;
-}
-
 // Abstract method entry.
 //
 address InterpreterGenerator::generate_abstract_entry(void) {
@@ -529,13 +517,13 @@
   //   regular method entry code to generate the NPE.
   //
 
-  address entry = __ pc();
+  if (UseG1GC) {
+    address entry = __ pc();
 
-  const int referent_offset = java_lang_ref_Reference::referent_offset;
-  guarantee(referent_offset > 0, "referent offset not initialized");
+    const int referent_offset = java_lang_ref_Reference::referent_offset;
+    guarantee(referent_offset > 0, "referent offset not initialized");
 
-  if (UseG1GC) {
-     Label slow_path;
+    Label slow_path;
 
     // Debugging not possible, so can't use __ skip_if_jvmti_mode(slow_path, GR31_SCRATCH);
 
@@ -577,13 +565,11 @@
 
     // Generate regular method entry.
     __ bind(slow_path);
-    __ branch_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals), R11_scratch1);
-    __ flush();
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals), R11_scratch1);
+    return entry;
+  }
 
-    return entry;
-  } else {
-    return generate_jump_to_normal_entry();
-  }
+  return NULL;
 }
 
 void Deoptimization::unwind_callee_save_values(frame* f, vframeArray* vframe_array) {
--- a/src/cpu/ppc/vm/ppc.ad	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/ppc/vm/ppc.ad	Thu Oct 08 14:28:55 2015 -0700
@@ -2064,6 +2064,10 @@
   return true;  // Per default match rules are supported.
 }
 
+const int Matcher::float_pressure(int default_pressure_threshold) {
+  return default_pressure_threshold;
+}
+
 int Matcher::regnum_to_fpu_offset(int regnum) {
   // No user for this method?
   Unimplemented();
--- a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -620,7 +620,7 @@
 address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKind kind) {
   if (!math_entry_available(kind)) {
     NOT_PRODUCT(__ should_not_reach_here();)
-    return Interpreter::entry_for_kind(Interpreter::zerolocals);
+    return NULL;
   }
 
   address entry = __ pc();
@@ -1126,14 +1126,6 @@
 
   generate_fixed_frame(false, Rsize_of_parameters, Rsize_of_locals);
 
-#ifdef FAST_DISPATCH
-  __ unimplemented("Fast dispatch in generate_normal_entry");
-#if 0
-  __ set((intptr_t)Interpreter::dispatch_table(), IdispatchTables);
-  // Set bytecode dispatch table base.
-#endif
-#endif
-
   // --------------------------------------------------------------------------
   // Zero out non-parameter locals.
   // Note: *Always* zero out non-parameter locals as Sparc does. It's not
@@ -1266,9 +1258,8 @@
  *   int java.util.zip.CRC32.update(int crc, int b)
  */
 address InterpreterGenerator::generate_CRC32_update_entry() {
-  address start = __ pc();  // Remember stub start address (is rtn value).
-
   if (UseCRC32Intrinsics) {
+    address start = __ pc();  // Remember stub start address (is rtn value).
     Label slow_path;
 
     // Safepoint check
@@ -1313,11 +1304,11 @@
     // Generate a vanilla native entry as the slow path.
     BLOCK_COMMENT("} CRC32_update");
     BIND(slow_path);
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native), R11_scratch1);
+    return start;
   }
 
-  (void) generate_native_entry(false);
-
-  return start;
+  return NULL;
 }
 
 // CRC32 Intrinsics.
@@ -1327,9 +1318,8 @@
  *   int java.util.zip.CRC32.updateByteBuffer(int crc, long* buf, int off, int len)
  */
 address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
-  address start = __ pc();  // Remember stub start address (is rtn value).
-
   if (UseCRC32Intrinsics) {
+    address start = __ pc();  // Remember stub start address (is rtn value).
     Label slow_path;
 
     // Safepoint check
@@ -1406,11 +1396,11 @@
     // Generate a vanilla native entry as the slow path.
     BLOCK_COMMENT("} CRC32_updateBytes(Buffer)");
     BIND(slow_path);
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native), R11_scratch1);
+    return start;
   }
 
-  (void) generate_native_entry(false);
-
-  return start;
+  return NULL;
 }
 
 // These should never be compiled since the interpreter will prefer
--- a/src/cpu/sparc/vm/c2_globals_sparc.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/sparc/vm/c2_globals_sparc.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -64,6 +64,7 @@
 define_pd_global(bool, UseCISCSpill,                 false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoScheduling,               true);
+define_pd_global(bool, OptoRegScheduling,            false);
 
 #ifdef _LP64
 // We need to make sure that all generated code is within
--- a/src/cpu/sparc/vm/cppInterpreter_sparc.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/sparc/vm/cppInterpreter_sparc.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -468,7 +468,7 @@
 
   // If G1 is not enabled then attempt to go through the accessor entry point
   // Reference.get is an accessor
-  return generate_jump_to_normal_entry();
+  return NULL;
 }
 
 //
--- a/src/cpu/sparc/vm/interp_masm_sparc.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/sparc/vm/interp_masm_sparc.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -59,6 +59,13 @@
 
 #endif // CC_INTERP
 
+void InterpreterMacroAssembler::jump_to_entry(address entry) {  // emits an unconditional jump to the given, already generated, entry address
+  assert(entry, "Entry must have been generated by now");
+  AddressLiteral al(entry);
+  jump_to(al, G3_scratch);  // G3_scratch is handed to jump_to as its temporary register
+  delayed()->nop();  // nop placed in the delay slot of the jump
+}
+
 void InterpreterMacroAssembler::compute_extra_locals_size_in_bytes(Register args_size, Register locals_size, Register delta) {
   // Note: this algorithm is also used by C1's OSR entry sequence.
   // Any changes should also be applied to CodeEmitter::emit_osr_entry().
--- a/src/cpu/sparc/vm/interp_masm_sparc.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/sparc/vm/interp_masm_sparc.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -80,6 +80,8 @@
   InterpreterMacroAssembler(CodeBuffer* c)
     : MacroAssembler(c) {}
 
+  void jump_to_entry(address entry);
+
 #ifndef CC_INTERP
   virtual void load_earlyret_value(TosState state);
 
--- a/src/cpu/sparc/vm/interpreterGenerator_sparc.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/sparc/vm/interpreterGenerator_sparc.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -34,9 +34,8 @@
   address generate_abstract_entry(void);
   // there are no math intrinsics on sparc
   address generate_math_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
-  address generate_jump_to_normal_entry(void);
-  address generate_accessor_entry(void) { return generate_jump_to_normal_entry(); }
-  address generate_empty_entry(void) { return generate_jump_to_normal_entry(); }
+  address generate_accessor_entry(void) { return NULL; }
+  address generate_empty_entry(void) { return NULL; }
   address generate_Reference_get_entry(void);
   void lock_method(void);
   void save_native_result(void);
@@ -48,4 +47,5 @@
   // Not supported
   address generate_CRC32_update_entry() { return NULL; }
   address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
+  address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
 #endif // CPU_SPARC_VM_INTERPRETERGENERATOR_SPARC_HPP
--- a/src/cpu/sparc/vm/interpreter_sparc.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/sparc/vm/interpreter_sparc.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -241,15 +241,6 @@
 
 // Various method entries
 
-address InterpreterGenerator::generate_jump_to_normal_entry(void) {
-  address entry = __ pc();
-  assert(Interpreter::entry_for_kind(Interpreter::zerolocals) != NULL, "should already be generated");
-  AddressLiteral al(Interpreter::entry_for_kind(Interpreter::zerolocals));
-  __ jump_to(al, G3_scratch);
-  __ delayed()->nop();
-  return entry;
-}
-
 // Abstract method entry
 // Attempt to execute abstract method. Throw exception
 //
--- a/src/cpu/sparc/vm/sparc.ad	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/sparc/vm/sparc.ad	Thu Oct 08 14:28:55 2015 -0700
@@ -1860,6 +1860,10 @@
   return true;  // Per default match rules are supported.
 }
 
+const int Matcher::float_pressure(int default_pressure_threshold) {  // no platform-specific adjustment here: the supplied threshold is returned unchanged
+  return default_pressure_threshold;
+}
+
 int Matcher::regnum_to_fpu_offset(int regnum) {
   return regnum - 32; // The FP registers are in the second chunk
 }
--- a/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -779,14 +779,14 @@
 
     // Generate regular method entry
     __ bind(slow_path);
-    (void) generate_normal_entry(false);
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals));
     return entry;
   }
 #endif // INCLUDE_ALL_GCS
 
   // If G1 is not enabled then attempt to go through the accessor entry point
   // Reference.get is an accessor
-  return generate_jump_to_normal_entry();
+  return NULL;
 }
 
 //
--- a/src/cpu/x86/vm/assembler_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -770,6 +770,7 @@
     case 0x55: // andnps
     case 0x56: // orps
     case 0x57: // xorps
+    case 0x59: //mulpd
     case 0x6E: // movd
     case 0x7E: // movd
     case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
@@ -1604,6 +1605,85 @@
   emit_int8((unsigned char)0xA2);
 }
 
+// Opcode / Instruction                      Op /  En  64 - Bit Mode     Compat / Leg Mode Description                  Implemented
+// F2 0F 38 F0 / r       CRC32 r32, r / m8   RM        Valid             Valid             Accumulate CRC32 on r / m8.  v
+// F2 REX 0F 38 F0 / r   CRC32 r32, r / m8*  RM        Valid             N.E.              Accumulate CRC32 on r / m8.  -
+// F2 REX.W 0F 38 F0 / r CRC32 r64, r / m8   RM        Valid             N.E.              Accumulate CRC32 on r / m8.  -
+//
+// F2 0F 38 F1 / r       CRC32 r32, r / m16  RM        Valid             Valid             Accumulate CRC32 on r / m16. v
+//
+// F2 0F 38 F1 / r       CRC32 r32, r / m32  RM        Valid             Valid             Accumulate CRC32 on r / m32. v
+//
+// F2 REX.W 0F 38 F1 / r CRC32 r64, r / m64  RM        Valid             N.E.              Accumulate CRC32 on r / m64. v
+void Assembler::crc32(Register crc, Register v, int8_t sizeInBytes) {  // emits CRC32 with a register source; sizeInBytes (1, 2, 4, or 8 on LP64) selects the encoding
+  assert(VM_Version::supports_sse4_2(), "");
+  int8_t w = 0x01;  // OR-ed into the final opcode byte below: 1 -> 0xF1, 0 -> 0xF0 (byte source)
+  Prefix p = Prefix_EMPTY;
+
+  emit_int8((int8_t)0xF2);  // F2 goes out first, ahead of any REX byte produced by prefix() below
+  switch (sizeInBytes) {
+  case 1:
+    w = 0;
+    break;
+  case 2:  // NOTE(review): encoded exactly like the 4-byte case; the r/m16 form normally needs a 0x66 operand-size prefix - confirm before using sizeInBytes == 2
+  case 4:
+    break;
+  LP64_ONLY(case 8:)
+    // This instruction is not valid in 32 bits
+    // Note:
+    // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+    //
+    // Page B - 72   Vol. 2C says
+    // qwreg2 to qwreg            1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : 11 qwreg1 qwreg2
+    // mem64 to qwreg             1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : mod qwreg r / m
+    //                                                                            F0!!!
+    // while 3 - 208 Vol. 2A
+    // F2 REX.W 0F 38 F1 / r       CRC32 r64, r / m64             RM         Valid      N.E.Accumulate CRC32 on r / m64.
+    //
+    // the 0 on a last bit is reserved for a different flavor of this instruction :
+    // F2 REX.W 0F 38 F0 / r       CRC32 r64, r / m8              RM         Valid      N.E.Accumulate CRC32 on r / m8.
+    p = REX_W;
+    break;
+  default:
+    assert(0, "Unsupported value for a sizeInBytes argument");
+    break;
+  }
+  LP64_ONLY(prefix(crc, v, p);)  // LP64 only: merges the REX bits needed by crc and v into p and emits the result if non-empty
+  emit_int8((int8_t)0x0F);
+  emit_int8(0x38);
+  emit_int8((int8_t)(0xF0 | w));
+  emit_int8(0xC0 | ((crc->encoding() & 0x7) << 3) | (v->encoding() & 7));  // register-direct ModRM: reg field = crc, r/m field = v
+}
+
+void Assembler::crc32(Register crc, Address adr, int8_t sizeInBytes) {  // emits CRC32 with a memory source; sizeInBytes (1, 2, 4, or 8 on LP64) selects the encoding
+  assert(VM_Version::supports_sse4_2(), "");
+  InstructionMark im(this);
+  int8_t w = 0x01;  // OR-ed into the final opcode byte below: 1 -> 0xF1, 0 -> 0xF0 (byte source)
+  Prefix p = Prefix_EMPTY;
+
+  emit_int8((int8_t)0xF2);  // F2 goes out first, ahead of any REX byte produced by prefix() below
+  switch (sizeInBytes) {
+  case 1:
+    w = 0;
+    break;
+  case 2:  // NOTE(review): encoded exactly like the 4-byte case; the m16 form normally needs a 0x66 operand-size prefix - confirm before using sizeInBytes == 2
+  case 4:
+    break;
+  LP64_ONLY(case 8:)
+    // This instruction is not valid in 32 bits
+    p = REX_W;
+    break;
+  default:
+    assert(0, "Unsupported value for a sizeInBytes argument");
+    break;
+  }
+  LP64_ONLY(prefix(crc, adr, p);)  // LP64 only: REX handling for crc and the base register of adr
+  emit_int8((int8_t)0x0F);
+  emit_int8(0x38);
+  emit_int8((int8_t)(0xF0 | w));
+  emit_operand(crc, adr);  // operand bytes for adr, with crc as the register operand
+}
+
 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3, /* no_mask_reg */ false, /* legacy_mode */ true);
@@ -2951,6 +3031,15 @@
   emit_int8(imm8);
 }
 
+void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {  // SSE2 PEXTRW reg, xmm, imm8: extracts the 16-bit word of src selected by imm8 into dst
+  assert(VM_Version::supports_sse2(), "");
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
+  emit_int8((unsigned char)0xC5);  // 66 0F C5 /r ib: ModRM.reg = dst, ModRM.r/m = src, as encoded above (the 0F 3A 15 form is SSE4.1 and reverses those roles)
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(imm8);
+}
+
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
@@ -2969,6 +3058,15 @@
   emit_int8(imm8);
 }
 
+void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) {  // SSE2 PINSRW xmm, r32, imm8: register-source form only
+  assert(VM_Version::supports_sse2(), "");
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true,
+                                      VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw);
+  emit_int8((unsigned char)0xC4);  // opcode byte; together with the 66 / 0F prefix selection above this is 66 0F C4
+  emit_int8((unsigned char)(0xC0 | encode));  // register-direct ModRM
+  emit_int8(imm8);  // immediate operand byte
+}
+
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   if (VM_Version::supports_evex()) {
@@ -3984,6 +4082,16 @@
   }
 }
 
+void Assembler::mulpd(XMMRegister dst, Address src) {  // MULPD with a memory source: opcode 0x59 under the 66 SIMD prefix
+  _instruction_uses_vl = true;
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // the SSE2 check is compiled only for non-LP64 builds
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x59, dst, src, VEX_SIMD_66);  // EVEX-capable CPUs go through the _q emitter variant
+  } else {
+    emit_simd_arith(0x59, dst, src, VEX_SIMD_66);
+  }
+}
+
 void Assembler::mulps(XMMRegister dst, XMMRegister src) {
   _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -4172,6 +4280,26 @@
   emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq);
 }
 
+void Assembler::unpckhpd(XMMRegister dst, XMMRegister src) {  // UNPCKHPD, register form: opcode 0x15 under the 66 SIMD prefix
+  _instruction_uses_vl = true;
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // the SSE2 check is compiled only for non-LP64 builds
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x15, dst, src, VEX_SIMD_66);  // EVEX-capable CPUs go through the _q emitter variant
+  } else {
+    emit_simd_arith(0x15, dst, src, VEX_SIMD_66);
+  }
+}
+
+void Assembler::unpcklpd(XMMRegister dst, XMMRegister src) {  // UNPCKLPD, register form: opcode 0x14 under the 66 SIMD prefix
+  _instruction_uses_vl = true;
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // the SSE2 check is compiled only for non-LP64 builds
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x14, dst, src, VEX_SIMD_66);  // EVEX-capable CPUs go through the _q emitter variant
+  } else {
+    emit_simd_arith(0x14, dst, src, VEX_SIMD_66);
+  }
+}
+
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   if (VM_Version::supports_avx512dq()) {
@@ -4792,8 +4920,9 @@
 }
 
 
-// AND packed integers
+// Logical operations on packed integers
 void Assembler::pand(XMMRegister dst, XMMRegister src) {
+  _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0xDB, dst, src, VEX_SIMD_66);
 }
@@ -4814,6 +4943,17 @@
   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
+void Assembler::pandn(XMMRegister dst, XMMRegister src) {  // PANDN, register form: opcode 0xDF under the 66 SIMD prefix
+  _instruction_uses_vl = true;
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // the SSE2 check is compiled only for non-LP64 builds
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0xDF, dst, src, VEX_SIMD_66);  // EVEX-capable CPUs go through the _q emitter variant
+  }
+  else {
+    emit_simd_arith(0xDF, dst, src, VEX_SIMD_66);
+  }
+}
+
 void Assembler::por(XMMRegister dst, XMMRegister src) {
   _instruction_uses_vl = true;
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -6223,6 +6363,14 @@
   emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
 }
 
+// 0F A4 / r ib  (SHLD with an immediate shift count)
+void Assembler::shldl(Register dst, Register src, int8_t imm8) {  // NOTE(review): no REX prefix is emitted, so encodings >= 8 are not handled - confirm callers pass low registers only
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xA4);
+  emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));  // register-direct ModRM: reg field = src, r/m field = dst
+  emit_int8(imm8);  // shift count
+}
+
 void Assembler::shrdl(Register dst, Register src) {
   emit_int8(0x0F);
   emit_int8((unsigned char)0xAD);
@@ -6408,6 +6556,40 @@
   }
 }
 
+void Assembler::prefix(Register dst, Register src, Prefix p) {  // emits one prefix byte: p combined with the REX bits that src and dst require
+  if (src->encoding() >= 8) {
+    p = (Prefix)(p | REX_B);  // src with encoding >= 8 needs REX_B
+  }
+  if (dst->encoding() >= 8) {
+    p = (Prefix)( p | REX_R);  // dst with encoding >= 8 needs REX_R
+  }
+  if (p != Prefix_EMPTY) {
+    // do not generate an empty prefix
+    prefix(p);
+  }
+}
+
+void Assembler::prefix(Register dst, Address adr, Prefix p) {  // emits at most one prefix byte: p plus REX_B for adr's base and REX_R for dst
+  if (adr.base_needs_rex()) {
+    if (adr.index_needs_rex()) {
+      assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
+    } else {
+      p = (Prefix)(p | REX_B);  // accumulate rather than emit: a separate REX_B byte followed by prefix(p) would put two REX bytes in the stream
+    }
+  } else {
+    if (adr.index_needs_rex()) {
+      assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
+    }
+  }
+  if (dst->encoding() >= 8) {
+    p = (Prefix)(p | REX_R);  // dst with encoding >= 8 needs REX_R
+  }
+  if (p != Prefix_EMPTY) {
+    // do not generate an empty prefix
+    prefix(p);
+  }
+}
+
 void Assembler::prefix(Address adr) {
   if (adr.base_needs_rex()) {
     if (adr.index_needs_rex()) {
--- a/src/cpu/x86/vm/assembler_x86.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -506,7 +506,8 @@
 
     VEX_3bytes = 0xC4,
     VEX_2bytes = 0xC5,
-    EVEX_4bytes = 0x62
+    EVEX_4bytes = 0x62,
+    Prefix_EMPTY = 0x0
   };
 
   enum VexPrefix {
@@ -615,6 +616,8 @@
   int prefixq_and_encode(int dst_enc, int src_enc);
 
   void prefix(Register reg);
+  void prefix(Register dst, Register src, Prefix p);
+  void prefix(Register dst, Address adr, Prefix p);
   void prefix(Address adr);
   void prefixq(Address adr);
 
@@ -1177,6 +1180,10 @@
   // Identify processor type and features
   void cpuid();
 
+  // CRC32C
+  void crc32(Register crc, Register v, int8_t sizeInBytes);
+  void crc32(Register crc, Address adr, int8_t sizeInBytes);
+
   // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
   void cvtsd2ss(XMMRegister dst, XMMRegister src);
   void cvtsd2ss(XMMRegister dst, Address src);
@@ -1672,10 +1679,14 @@
   // SSE 4.1 extract
   void pextrd(Register dst, XMMRegister src, int imm8);
   void pextrq(Register dst, XMMRegister src, int imm8);
+  // SSE 2 extract
+  void pextrw(Register dst, XMMRegister src, int imm8);
 
   // SSE 4.1 insert
   void pinsrd(XMMRegister dst, Register src, int imm8);
   void pinsrq(XMMRegister dst, Register src, int imm8);
+  // SSE 2 insert
+  void pinsrw(XMMRegister dst, Register src, int imm8);
 
   // SSE4.1 packed move
   void pmovzxbw(XMMRegister dst, XMMRegister src);
@@ -1783,6 +1794,7 @@
   void setb(Condition cc, Register dst);
 
   void shldl(Register dst, Register src);
+  void shldl(Register dst, Register src, int8_t imm8);
 
   void shll(Register dst, int imm8);
   void shll(Register dst);
@@ -1925,6 +1937,7 @@
 
   // Multiply Packed Floating-Point Values
   void mulpd(XMMRegister dst, XMMRegister src);
+  void mulpd(XMMRegister dst, Address src);
   void mulps(XMMRegister dst, XMMRegister src);
   void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -1951,6 +1964,9 @@
   void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  void unpckhpd(XMMRegister dst, XMMRegister src);
+  void unpcklpd(XMMRegister dst, XMMRegister src);
+
   // Bitwise Logical XOR of Packed Floating-Point Values
   void xorpd(XMMRegister dst, XMMRegister src);
   void xorps(XMMRegister dst, XMMRegister src);
@@ -2046,6 +2062,9 @@
   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  // Andn packed integers
+  void pandn(XMMRegister dst, XMMRegister src);
+
   // Or packed integers
   void por(XMMRegister dst, XMMRegister src);
   void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
--- a/src/cpu/x86/vm/assembler_x86.inline.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/assembler_x86.inline.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -37,6 +37,8 @@
 inline int Assembler::prefixq_and_encode(int dst_enc, int src_enc) { return dst_enc << 3 | src_enc; }
 
 inline void Assembler::prefix(Register reg) {}
+inline void Assembler::prefix(Register dst, Register src, Prefix p) {}
+inline void Assembler::prefix(Register dst, Address adr, Prefix p) {}
 inline void Assembler::prefix(Address adr) {}
 inline void Assembler::prefixq(Address adr) {}
 
--- a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -2457,9 +2457,6 @@
         // Should consider not saving rbx, if not necessary
         __ trigfunc('t', op->as_Op2()->fpu_stack_size());
         break;
-      case lir_exp :
-        __ exp_with_fallback(op->as_Op2()->fpu_stack_size());
-        break;
       case lir_pow :
         __ pow_with_fallback(op->as_Op2()->fpu_stack_size());
         break;
--- a/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -808,6 +808,12 @@
 
 void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
   assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
+
+  if (x->id() == vmIntrinsics::_dexp) {
+    do_ExpIntrinsic(x);
+    return;
+  }
+
   LIRItem value(x->argument_at(0), this);
 
   bool use_fpu = false;
@@ -818,7 +824,6 @@
       case vmIntrinsics::_dtan:
       case vmIntrinsics::_dlog:
       case vmIntrinsics::_dlog10:
-      case vmIntrinsics::_dexp:
       case vmIntrinsics::_dpow:
         use_fpu = true;
     }
@@ -870,7 +875,6 @@
     case vmIntrinsics::_dtan:   __ tan  (calc_input, calc_result, tmp1, tmp2);              break;
     case vmIntrinsics::_dlog:   __ log  (calc_input, calc_result, tmp1);                    break;
     case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1);                    break;
-    case vmIntrinsics::_dexp:   __ exp  (calc_input, calc_result,              tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
     case vmIntrinsics::_dpow:   __ pow  (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
     default:                    ShouldNotReachHere();
   }
@@ -880,6 +884,32 @@
   }
 }
 
+void LIRGenerator::do_ExpIntrinsic(Intrinsic* x) {  // lowers the _dexp intrinsic to a leaf runtime call taking one double argument
+  LIRItem value(x->argument_at(0), this);
+  value.set_destroys_register();
+
+  LIR_Opr calc_result = rlock_result(x);
+  LIR_Opr result_reg = result_register_for(x->type());
+
+  BasicTypeList signature(1);  // C signature of the callee: a single T_DOUBLE parameter
+  signature.append(T_DOUBLE);
+  CallingConvention* cc = frame_map()->c_calling_convention(&signature);
+
+  value.load_item_force(cc->at(0));  // force the argument into the location the C calling convention assigns to parameter 0
+
+#ifndef _LP64  // non-LP64: the result is read from fpu0_double_opr, and the stub is used only when SSE2 is available
+  LIR_Opr tmp = FrameMap::fpu0_double_opr;
+  result_reg = tmp;
+  if (VM_Version::supports_sse2()) {
+    __ call_runtime_leaf(StubRoutines::dexp(), getThreadTemp(), result_reg, cc->args());
+  } else {
+    __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dexp), getThreadTemp(), result_reg, cc->args());
+  }
+#else
+  __ call_runtime_leaf(StubRoutines::dexp(), getThreadTemp(), result_reg, cc->args());  // LP64 always calls the dexp stub
+#endif
+  __ move(result_reg, calc_result);  // copy the call result into the operand locked for x
+}
 
 void LIRGenerator::do_ArrayCopy(Intrinsic* x) {
   assert(x->number_of_arguments() == 5, "wrong type");
--- a/src/cpu/x86/vm/c1_LinearScan_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/c1_LinearScan_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -814,8 +814,7 @@
 
     case lir_tan:
     case lir_sin:
-    case lir_cos:
-    case lir_exp: {
+    case lir_cos: {
       // sin, cos and exp need two temporary fpu stack slots, so there are two temporary
       // registers (stored in right and temp of the operation).
       // the stack allocator must guarantee that the stack slots are really free,
--- a/src/cpu/x86/vm/c2_globals_x86.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/c2_globals_x86.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -48,11 +48,11 @@
 
 define_pd_global(intx, OnStackReplacePercentage,     140);
 define_pd_global(intx, ConditionalMoveLimit,         3);
-define_pd_global(intx, FLOATPRESSURE,                6);
 define_pd_global(intx, FreqInlineSize,               325);
 define_pd_global(intx, MinJumpTableSize,             10);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE,                  13);
+define_pd_global(intx, FLOATPRESSURE,                14);
 define_pd_global(intx, InteriorEntryAlignment,       16);
 define_pd_global(size_t, NewSizeThreadIncrease,      ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit,              60);
@@ -64,6 +64,7 @@
 define_pd_global(uint64_t, MaxRAM,                   128ULL*G);
 #else
 define_pd_global(intx, INTPRESSURE,                  6);
+define_pd_global(intx, FLOATPRESSURE,                6);
 define_pd_global(intx, InteriorEntryAlignment,       4);
 define_pd_global(size_t, NewSizeThreadIncrease,      4*K);
 define_pd_global(intx, LoopUnrollLimit,              50);     // Design center runs on 1.3.1
@@ -82,6 +83,7 @@
 define_pd_global(bool, UseCISCSpill,                 true);
 define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoRegScheduling,            true);
 
 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
 define_pd_global(intx, NonProfiledCodeHeapSize,      21*M);
--- a/src/cpu/x86/vm/cppInterpreter_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/cppInterpreter_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -807,7 +807,7 @@
 
   // If G1 is not enabled then attempt to go through the accessor entry point
   // Reference.get is an accessor
-  return generate_jump_to_normal_entry();
+  return NULL;
 }
 
 //
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cpu/x86/vm/crc32c.h	Thu Oct 08 14:28:55 2015 -0700
@@ -0,0 +1,66 @@
+/*
+* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+*
+* This code is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License version 2 only, as
+* published by the Free Software Foundation.
+*
+* This code is distributed in the hope that it will be useful, but WITHOUT
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+* version 2 for more details (a copy is included in the LICENSE file that
+* accompanied this code).
+*
+* You should have received a copy of the GNU General Public License version
+* 2 along with this work; if not, write to the Free Software Foundation,
+* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+*
+* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+* or visit www.oracle.com if you need additional information or have any
+* questions.
+*
+*/
+
+enum {
+  // S. Gueron / Information Processing Letters 112 (2012) 184
+  // shows that anything above 6K and below 32K is a good choice
+  // 32K does not deliver any further performance gains
+  // 6K=8*256 (*3 as we compute 3 blocks together)
+  //
+  // Thus selecting the smallest value so it could apply to the largest number
+  // of buffer sizes.
+  CRC32C_HIGH = 8 * 256,  // = 2048
+
+  // empirical
+  // based on ubench study using methodology described in
+  // V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 8
+  //
+  // arbitrary value between 27 and 256
+  CRC32C_MIDDLE = 8 * 86,  // = 688
+
+  // V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 9
+  // shows that 240 and 1024 are equally good choices as the 216==8*27
+  //
+  // Selecting the smallest value which resulted in a significant performance improvement over
+  // sequential version
+  CRC32C_LOW = 8 * 27,  // = 216
+
+  CRC32C_NUM_ChunkSizeInBytes = 3,  // number of chunk sizes above (HIGH, MIDDLE, LOW)
+
+  // We need to compute powers of 64N and 128N for each "chunk" size
+  CRC32C_NUM_PRECOMPUTED_CONSTANTS = ( 2 * CRC32C_NUM_ChunkSizeInBytes )  // = 6: two constants per chunk size
+};
+// Notes:
+// 1. Why we need to choose a "chunk" approach?
+// The overhead of computing the required powers for an arbitrary buffer of size N is significant
+// (implementation approaches a library perf.)
+// 2. Why only 3 "chunks"?
+// Performance experiments results showed that a HIGH+LOW was not delivering a stable speedup
+// curve.
+//
+// Disclaimer:
+// If you ever decide to increase/decrease number of "chunks" be sure to modify
+// a) constants table generation (hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp)
+// b) constant fetch from that table (macroAssembler_x86.cpp)
+// c) unrolled for loop (macroAssembler_x86.cpp)
--- a/src/cpu/x86/vm/interp_masm_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/interp_masm_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -40,6 +40,11 @@
 
 // Implementation of InterpreterMacroAssembler
 
+void InterpreterMacroAssembler::jump_to_entry(address entry) {  // emits an unconditional jump to the given, already generated, entry address
+  assert(entry, "Entry must have been generated by now");
+  jump(RuntimeAddress(entry));  // entry is wrapped as a RuntimeAddress for jump()
+}
+
 #ifndef CC_INTERP
 void InterpreterMacroAssembler::profile_obj_type(Register obj, const Address& mdo_addr) {
   Label update, next, none;
--- a/src/cpu/x86/vm/interp_masm_x86.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/interp_masm_x86.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -60,6 +60,8 @@
     _locals_register(LP64_ONLY(r14) NOT_LP64(rdi)),
     _bcp_register(LP64_ONLY(r13) NOT_LP64(rsi)) {}
 
+  void jump_to_entry(address entry);
+
   void load_earlyret_value(TosState state);
 
 #ifdef CC_INTERP
--- a/src/cpu/x86/vm/interpreterGenerator_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/interpreterGenerator_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -31,17 +31,6 @@
 
 #define __ _masm->
 
-// Jump into normal path for accessor and empty entry to jump to normal entry
-// The "fast" optimization don't update compilation count therefore can disable inlining
-// for these functions that should be inlined.
-address InterpreterGenerator::generate_jump_to_normal_entry(void) {
-  address entry_point = __ pc();
-
-  assert(Interpreter::entry_for_kind(Interpreter::zerolocals) != NULL, "should already be generated");
-  __ jump(RuntimeAddress(Interpreter::entry_for_kind(Interpreter::zerolocals)));
-  return entry_point;
-}
-
 // Abstract method entry
 // Attempt to execute abstract method. Throw exception
 address InterpreterGenerator::generate_abstract_entry(void) {
--- a/src/cpu/x86/vm/interpreterGenerator_x86.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/interpreterGenerator_x86.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -36,12 +36,12 @@
   address generate_native_entry(bool synchronized);
   address generate_abstract_entry(void);
   address generate_math_entry(AbstractInterpreter::MethodKind kind);
-  address generate_jump_to_normal_entry(void);
-  address generate_accessor_entry(void) { return generate_jump_to_normal_entry(); }
-  address generate_empty_entry(void) { return generate_jump_to_normal_entry(); }
+  address generate_accessor_entry(void) { return NULL; }
+  address generate_empty_entry(void) { return NULL; }
   address generate_Reference_get_entry();
   address generate_CRC32_update_entry();
   address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+  address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind);
 #ifndef _LP64
   address generate_Float_intBitsToFloat_entry();
   address generate_Float_floatToRawIntBits_entry();
--- a/src/cpu/x86/vm/interpreter_x86_32.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/interpreter_x86_32.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -151,11 +151,15 @@
       __ pop_fTOS();
       break;
     case Interpreter::java_lang_math_exp:
-      __ exp_with_fallback(0);
-      // Store to stack to convert 80bit precision back to 64bits
-      __ push_fTOS();
-      __ pop_fTOS();
-      break;
+      __ subptr(rsp, 2*wordSize);
+      __ fstp_d(Address(rsp, 0));
+      if (VM_Version::supports_sse2()) {
+        __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dexp())));
+      } else {
+        __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::dexp)));
+      }
+      __ addptr(rsp, 2*wordSize);
+    break;
     default                              :
         ShouldNotReachHere();
   }
--- a/src/cpu/x86/vm/interpreter_x86_64.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/interpreter_x86_64.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -252,6 +252,9 @@
 
   if (kind == Interpreter::java_lang_math_sqrt) {
     __ sqrtsd(xmm0, Address(rsp, wordSize));
+  } else if (kind == Interpreter::java_lang_math_exp) {
+    __ movdbl(xmm0, Address(rsp, wordSize));
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dexp())));
   } else {
     __ fld_d(Address(rsp, wordSize));
     switch (kind) {
@@ -278,9 +281,6 @@
                                               // empty stack slot)
           __ pow_with_fallback(0);
           break;
-      case Interpreter::java_lang_math_exp:
-          __ exp_with_fallback(0);
-           break;
       default                              :
           ShouldNotReachHere();
     }
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -45,6 +45,7 @@
 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
 #include "gc/g1/heapRegion.hpp"
 #endif // INCLUDE_ALL_GCS
+#include "crc32c.h"
 
 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) /* nothing */
@@ -3032,6 +3033,15 @@
   Assembler::fldcw(as_Address(src));
 }
 
+void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {  // mulpd with a literal-address source operand
+  if (reachable(src)) {
+    Assembler::mulpd(dst, as_Address(src));  // literal is reachable: use it directly as the memory operand
+  } else {
+    lea(rscratch1, src);  // otherwise load the literal's address into rscratch1 (clobbered) ...
+    Assembler::mulpd(dst, Address(rscratch1, 0));  // ... and address the operand through it
+  }
+}
+
 void MacroAssembler::pow_exp_core_encoding() {
   // kills rax, rcx, rdx
   subptr(rsp,sizeof(jdouble));
@@ -3104,19 +3114,7 @@
   BLOCK_COMMENT("} fast_pow");
 }
 
-void MacroAssembler::fast_exp() {
-  // computes exp(X) = 2^(X * log2(e))
-  // if fast computation is not possible, result is NaN. Requires
-  // fallback from user of this macro.
-  // increase precision for intermediate steps of the computation
-  increase_precision();
-  fldl2e();                // Stack: log2(e) X ...
-  fmulp(1);                // Stack: (X*log2(e)) ...
-  pow_exp_core_encoding(); // Stack: exp(X) ...
-  restore_precision();
-}
-
-void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
+void MacroAssembler::pow_or_exp(int num_fpu_regs_in_use) {
   // kills rax, rcx, rdx
   // pow and exp needs 2 extra registers on the fpu stack.
   Label slow_case, done;
@@ -3128,182 +3126,164 @@
   Register tmp2 = rax;
   Register tmp3 = rcx;
 
-  if (is_exp) {
-    // Stack: X
-    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
-    fast_exp();                 // Stack: exp(X) X
-    fcmp(tmp, 0, false, false); // Stack: exp(X) X
-    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
-    jcc(Assembler::parity, slow_case);
-    // get rid of duplicate argument. Stack: exp(X)
-    if (num_fpu_regs_in_use > 0) {
-      fxch();
-      fpop();
-    } else {
-      ffree(1);
-    }
-    jmp(done);
-  } else {
-    // Stack: X Y
-    Label x_negative, y_not_2;
-
-    static double two = 2.0;
-    ExternalAddress two_addr((address)&two);
-
-    // constant maybe too far on 64 bit
-    lea(tmp2, two_addr);
-    fld_d(Address(tmp2, 0));    // Stack: 2 X Y
-    fcmp(tmp, 2, true, false);  // Stack: X Y
-    jcc(Assembler::parity, y_not_2);
-    jcc(Assembler::notEqual, y_not_2);
-
-    fxch(); fpop();             // Stack: X
-    fmul(0);                    // Stack: X*X
-
-    jmp(done);
-
-    bind(y_not_2);
-
-    fldz();                     // Stack: 0 X Y
-    fcmp(tmp, 1, true, false);  // Stack: X Y
-    jcc(Assembler::above, x_negative);
-
-    // X >= 0
-
-    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
-    fld_s(1);                   // Stack: X Y X Y
-    fast_pow();                 // Stack: X^Y X Y
-    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
-    // X^Y not equal to itself: X^Y is NaN go to slow case.
-    jcc(Assembler::parity, slow_case);
-    // get rid of duplicate arguments. Stack: X^Y
-    if (num_fpu_regs_in_use > 0) {
-      fxch(); fpop();
-      fxch(); fpop();
-    } else {
-      ffree(2);
-      ffree(1);
-    }
-    jmp(done);
-
-    // X <= 0
-    bind(x_negative);
-
-    fld_s(1);                   // Stack: Y X Y
-    frndint();                  // Stack: int(Y) X Y
-    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
-    jcc(Assembler::notEqual, slow_case);
-
-    subptr(rsp, 8);
-
-    // For X^Y, when X < 0, Y has to be an integer and the final
-    // result depends on whether it's odd or even. We just checked
-    // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
-    // integer to test its parity. If int(Y) is huge and doesn't fit
-    // in the 64 bit integer range, the integer indefinite value will
-    // end up in the gp registers. Huge numbers are all even, the
-    // integer indefinite number is even so it's fine.
+  // Stack: X Y
+  Label x_negative, y_not_2;
+
+  static double two = 2.0;
+  ExternalAddress two_addr((address)&two);
+
+  // constant maybe too far on 64 bit
+  lea(tmp2, two_addr);
+  fld_d(Address(tmp2, 0));    // Stack: 2 X Y
+  fcmp(tmp, 2, true, false);  // Stack: X Y
+  jcc(Assembler::parity, y_not_2);
+  jcc(Assembler::notEqual, y_not_2);
+
+  fxch(); fpop();             // Stack: X
+  fmul(0);                    // Stack: X*X
+
+  jmp(done);
+
+  bind(y_not_2);
+
+  fldz();                     // Stack: 0 X Y
+  fcmp(tmp, 1, true, false);  // Stack: X Y
+  jcc(Assembler::above, x_negative);
+
+  // X >= 0
+
+  fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
+  fld_s(1);                   // Stack: X Y X Y
+  fast_pow();                 // Stack: X^Y X Y
+  fcmp(tmp, 0, false, false); // Stack: X^Y X Y
+  // X^Y not equal to itself: X^Y is NaN go to slow case.
+  jcc(Assembler::parity, slow_case);
+  // get rid of duplicate arguments. Stack: X^Y
+  if (num_fpu_regs_in_use > 0) {
+    fxch(); fpop();
+    fxch(); fpop();
+  } else {
+    ffree(2);
+    ffree(1);
+  }
+  jmp(done);
+
+  // X < 0
+  bind(x_negative);
+
+  fld_s(1);                   // Stack: Y X Y
+  frndint();                  // Stack: int(Y) X Y
+  fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
+  jcc(Assembler::notEqual, slow_case);
+
+  subptr(rsp, 8);
+
+  // For X^Y, when X < 0, Y has to be an integer and the final
+  // result depends on whether it's odd or even. We just checked
+  // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
+  // integer to test its parity. If int(Y) is huge and doesn't fit
+  // in the 64 bit integer range, the integer indefinite value will
+  // end up in the gp registers. Huge numbers are all even, the
+  // integer indefinite number is even so it's fine.
 
 #ifdef ASSERT
-    // Let's check we don't end up with an integer indefinite number
-    // when not expected. First test for huge numbers: check whether
-    // int(Y)+1 == int(Y) which is true for very large numbers and
-    // those are all even. A 64 bit integer is guaranteed to not
-    // overflow for numbers where y+1 != y (when precision is set to
-    // double precision).
-    Label y_not_huge;
-
-    fld1();                     // Stack: 1 int(Y) X Y
-    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
+  // Let's check we don't end up with an integer indefinite number
+  // when not expected. First test for huge numbers: check whether
+  // int(Y)+1 == int(Y) which is true for very large numbers and
+  // those are all even. A 64 bit integer is guaranteed to not
+  // overflow for numbers where y+1 != y (when precision is set to
+  // double precision).
+  Label y_not_huge;
+
+  fld1();                     // Stack: 1 int(Y) X Y
+  fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
 
 #ifdef _LP64
-    // trip to memory to force the precision down from double extended
-    // precision
-    fstp_d(Address(rsp, 0));
-    fld_d(Address(rsp, 0));
+  // trip to memory to force the precision down from double extended
+  // precision
+  fstp_d(Address(rsp, 0));
+  fld_d(Address(rsp, 0));
 #endif
 
-    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
+  fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
 #endif
 
-    // move int(Y) as 64 bit integer to thread's stack
-    fistp_d(Address(rsp,0));    // Stack: X Y
+  // move int(Y) as 64 bit integer to thread's stack
+  fistp_d(Address(rsp,0));    // Stack: X Y
 
 #ifdef ASSERT
-    jcc(Assembler::notEqual, y_not_huge);
-
-    // Y is huge so we know it's even. It may not fit in a 64 bit
-    // integer and we don't want the debug code below to see the
-    // integer indefinite value so overwrite int(Y) on the thread's
-    // stack with 0.
-    movl(Address(rsp, 0), 0);
-    movl(Address(rsp, 4), 0);
-
-    bind(y_not_huge);
+  jcc(Assembler::notEqual, y_not_huge);
+
+  // Y is huge so we know it's even. It may not fit in a 64 bit
+  // integer and we don't want the debug code below to see the
+  // integer indefinite value so overwrite int(Y) on the thread's
+  // stack with 0.
+  movl(Address(rsp, 0), 0);
+  movl(Address(rsp, 4), 0);
+
+  bind(y_not_huge);
 #endif
 
-    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
-    fld_s(1);                   // Stack: X Y X Y
-    fabs();                     // Stack: abs(X) Y X Y
-    fast_pow();                 // Stack: abs(X)^Y X Y
-    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
-    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
-
-    pop(tmp2);
-    NOT_LP64(pop(tmp3));
-    jcc(Assembler::parity, slow_case);
+  fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
+  fld_s(1);                   // Stack: X Y X Y
+  fabs();                     // Stack: abs(X) Y X Y
+  fast_pow();                 // Stack: abs(X)^Y X Y
+  fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
+  // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
+
+  pop(tmp2);
+  NOT_LP64(pop(tmp3));
+  jcc(Assembler::parity, slow_case);
 
 #ifdef ASSERT
-    // Check that int(Y) is not integer indefinite value (int
-    // overflow). Shouldn't happen because for values that would
-    // overflow, 1+int(Y)==Y which was tested earlier.
+  // Check that int(Y) is not integer indefinite value (int
+  // overflow). Shouldn't happen because for values that would
+  // overflow, 1+int(Y)==Y which was tested earlier.
 #ifndef _LP64
-    {
-      Label integer;
-      testl(tmp2, tmp2);
-      jcc(Assembler::notZero, integer);
-      cmpl(tmp3, 0x80000000);
-      jcc(Assembler::notZero, integer);
-      STOP("integer indefinite value shouldn't be seen here");
-      bind(integer);
-    }
+  {
+    Label integer;
+    testl(tmp2, tmp2);
+    jcc(Assembler::notZero, integer);
+    cmpl(tmp3, 0x80000000);
+    jcc(Assembler::notZero, integer);
+    STOP("integer indefinite value shouldn't be seen here");
+    bind(integer);
+  }
 #else
-    {
-      Label integer;
-      mov(tmp3, tmp2); // preserve tmp2 for parity check below
-      shlq(tmp3, 1);
-      jcc(Assembler::carryClear, integer);
-      jcc(Assembler::notZero, integer);
-      STOP("integer indefinite value shouldn't be seen here");
-      bind(integer);
-    }
+  {
+    Label integer;
+    mov(tmp3, tmp2); // preserve tmp2 for parity check below
+    shlq(tmp3, 1);
+    jcc(Assembler::carryClear, integer);
+    jcc(Assembler::notZero, integer);
+    STOP("integer indefinite value shouldn't be seen here");
+    bind(integer);
+  }
 #endif
 #endif
 
-    // get rid of duplicate arguments. Stack: X^Y
-    if (num_fpu_regs_in_use > 0) {
-      fxch(); fpop();
-      fxch(); fpop();
-    } else {
-      ffree(2);
-      ffree(1);
-    }
-
-    testl(tmp2, 1);
-    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
-    // X <= 0, Y even: X^Y = -abs(X)^Y
-
-    fchs();                     // Stack: -abs(X)^Y Y
-    jmp(done);
-  }
+  // get rid of duplicate arguments. Stack: X^Y
+  if (num_fpu_regs_in_use > 0) {
+    fxch(); fpop();
+    fxch(); fpop();
+  } else {
+    ffree(2);
+    ffree(1);
+  }
+
+  testl(tmp2, 1);
+  jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
+  // X <= 0, Y odd: X^Y = -abs(X)^Y
+
+  fchs();                     // Stack: -abs(X)^Y Y
+  jmp(done);
 
   // slow case: runtime call
   bind(slow_case);
 
   fpop();                       // pop incorrect result or int(Y)
 
-  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
-                      is_exp ? 1 : 2, num_fpu_regs_in_use);
+  fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), 2, num_fpu_regs_in_use);
 
   // Come here with result in F-TOS
   bind(done);
@@ -8636,6 +8616,471 @@
   notl(crc); // ~c
 }
 
+#ifdef _LP64
+// S. Gueron / Information Processing Letters 112 (2012) 184
+// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
+// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
+// Output: the 64-bit carry-less product of B * CONST
+void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
+                                     Register tmp1, Register tmp2, Register tmp3) {
+  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
+  if (n > 0) {
+    addq(tmp3, n * 256 * 8);
+  }
+  //    Q1 = TABLEExt[n][B & 0xFF];
+  movl(tmp1, in);
+  andl(tmp1, 0x000000FF);
+  shll(tmp1, 3);
+  addq(tmp1, tmp3);
+  movq(tmp1, Address(tmp1, 0));
+
+  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
+  movl(tmp2, in);
+  shrl(tmp2, 8);
+  andl(tmp2, 0x000000FF);
+  shll(tmp2, 3);
+  addq(tmp2, tmp3);
+  movq(tmp2, Address(tmp2, 0));
+
+  shlq(tmp2, 8);
+  xorq(tmp1, tmp2);
+
+  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
+  movl(tmp2, in);
+  shrl(tmp2, 16);
+  andl(tmp2, 0x000000FF);
+  shll(tmp2, 3);
+  addq(tmp2, tmp3);
+  movq(tmp2, Address(tmp2, 0));
+
+  shlq(tmp2, 16);
+  xorq(tmp1, tmp2);
+
+  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
+  shrl(in, 24);
+  andl(in, 0x000000FF);
+  shll(in, 3);
+  addq(in, tmp3);
+  movq(in, Address(in, 0));
+
+  shlq(in, 24);
+  xorq(in, tmp1);
+  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
+}
+
+void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
+                                      Register in_out,
+                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
+                                      XMMRegister w_xtmp2,
+                                      Register tmp1,
+                                      Register n_tmp2, Register n_tmp3) {
+  if (is_pclmulqdq_supported) {
+    movdl(w_xtmp1, in_out); // modified blindly
+
+    movl(tmp1, const_or_pre_comp_const_index);
+    movdl(w_xtmp2, tmp1);
+    pclmulqdq(w_xtmp1, w_xtmp2, 0);
+
+    movdq(in_out, w_xtmp1);
+  } else {
+    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
+  }
+}
+
+// Recombination Alternative 2: No bit-reflections
+// T1 = (CRC_A * U1) << 1
+// T2 = (CRC_B * U2) << 1
+// C1 = T1 >> 32
+// C2 = T2 >> 32
+// T1 = T1 & 0xFFFFFFFF
+// T2 = T2 & 0xFFFFFFFF
+// T1 = CRC32(0, T1)
+// T2 = CRC32(0, T2)
+// C1 = C1 ^ T1
+// C2 = C2 ^ T2
+// CRC = C1 ^ C2 ^ CRC_C
+void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
+                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                     Register tmp1, Register tmp2,
+                                     Register n_tmp3) {
+  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
+  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
+  shlq(in_out, 1);
+  movl(tmp1, in_out);
+  shrq(in_out, 32);
+  xorl(tmp2, tmp2);
+  crc32(tmp2, tmp1, 4);
+  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
+  shlq(in1, 1);
+  movl(tmp1, in1);
+  shrq(in1, 32);
+  xorl(tmp2, tmp2);
+  crc32(tmp2, tmp1, 4);
+  xorl(in1, tmp2);
+  xorl(in_out, in1);
+  xorl(in_out, in2);
+}
+
+// Set N to predefined value
+// Subtract it from the length of the buffer
+// execute in a loop:
+// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
+// for i = 1 to N do
+//  CRC_A = CRC32(CRC_A, A[i])
+//  CRC_B = CRC32(CRC_B, B[i])
+//  CRC_C = CRC32(CRC_C, C[i])
+// end for
+// Recombine
+void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+                                       Register in_out1, Register in_out2, Register in_out3,
+                                       Register tmp1, Register tmp2, Register tmp3,
+                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                       Register tmp4, Register tmp5,
+                                       Register n_tmp6) {
+  Label L_processPartitions;
+  Label L_processPartition;
+  Label L_exit;
+
+  bind(L_processPartitions);
+  cmpl(in_out1, 3 * size);
+  jcc(Assembler::less, L_exit);
+    xorl(tmp1, tmp1);
+    xorl(tmp2, tmp2);
+    movq(tmp3, in_out2);
+    addq(tmp3, size);
+
+    bind(L_processPartition);
+      crc32(in_out3, Address(in_out2, 0), 8);
+      crc32(tmp1, Address(in_out2, size), 8);
+      crc32(tmp2, Address(in_out2, size * 2), 8);
+      addq(in_out2, 8);
+      cmpq(in_out2, tmp3);
+      jcc(Assembler::less, L_processPartition);
+    crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
+            w_xtmp1, w_xtmp2, w_xtmp3,
+            tmp4, tmp5,
+            n_tmp6);
+    addq(in_out2, 2 * size);
+    subl(in_out1, 3 * size);
+    jmp(L_processPartitions);
+
+  bind(L_exit);
+}
+#else
+void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
+                                     Register tmp1, Register tmp2, Register tmp3,
+                                     XMMRegister xtmp1, XMMRegister xtmp2) {
+  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
+  if (n > 0) {
+    addl(tmp3, n * 256 * 8);
+  }
+  //    Q1 = TABLEExt[n][B & 0xFF];
+  movl(tmp1, in_out);
+  andl(tmp1, 0x000000FF);
+  shll(tmp1, 3);
+  addl(tmp1, tmp3);
+  movq(xtmp1, Address(tmp1, 0));
+
+  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
+  movl(tmp2, in_out);
+  shrl(tmp2, 8);
+  andl(tmp2, 0x000000FF);
+  shll(tmp2, 3);
+  addl(tmp2, tmp3);
+  movq(xtmp2, Address(tmp2, 0));
+
+  psllq(xtmp2, 8);
+  pxor(xtmp1, xtmp2);
+
+  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
+  movl(tmp2, in_out);
+  shrl(tmp2, 16);
+  andl(tmp2, 0x000000FF);
+  shll(tmp2, 3);
+  addl(tmp2, tmp3);
+  movq(xtmp2, Address(tmp2, 0));
+
+  psllq(xtmp2, 16);
+  pxor(xtmp1, xtmp2);
+
+  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
+  shrl(in_out, 24);
+  andl(in_out, 0x000000FF);
+  shll(in_out, 3);
+  addl(in_out, tmp3);
+  movq(xtmp2, Address(in_out, 0));
+
+  psllq(xtmp2, 24);
+  pxor(xtmp1, xtmp2); // Result in CXMM
+  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
+}
+
+void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
+                                      Register in_out,
+                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
+                                      XMMRegister w_xtmp2,
+                                      Register tmp1,
+                                      Register n_tmp2, Register n_tmp3) {
+  if (is_pclmulqdq_supported) {
+    movdl(w_xtmp1, in_out);
+
+    movl(tmp1, const_or_pre_comp_const_index);
+    movdl(w_xtmp2, tmp1);
+    pclmulqdq(w_xtmp1, w_xtmp2, 0);
+    // Keep result in XMM since GPR is 32 bit in length
+  } else {
+    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
+  }
+}
+
+void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
+                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                     Register tmp1, Register tmp2,
+                                     Register n_tmp3) {
+  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
+  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
+
+  psllq(w_xtmp1, 1);
+  movdl(tmp1, w_xtmp1);
+  psrlq(w_xtmp1, 32);
+  movdl(in_out, w_xtmp1);
+
+  xorl(tmp2, tmp2);
+  crc32(tmp2, tmp1, 4);
+  xorl(in_out, tmp2);
+
+  psllq(w_xtmp2, 1);
+  movdl(tmp1, w_xtmp2);
+  psrlq(w_xtmp2, 32);
+  movdl(in1, w_xtmp2);
+
+  xorl(tmp2, tmp2);
+  crc32(tmp2, tmp1, 4);
+  xorl(in1, tmp2);
+  xorl(in_out, in1);
+  xorl(in_out, in2);
+}
+
+void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+                                       Register in_out1, Register in_out2, Register in_out3,
+                                       Register tmp1, Register tmp2, Register tmp3,
+                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                       Register tmp4, Register tmp5,
+                                       Register n_tmp6) {
+  Label L_processPartitions;
+  Label L_processPartition;
+  Label L_exit;
+
+  bind(L_processPartitions);
+  cmpl(in_out1, 3 * size);
+  jcc(Assembler::less, L_exit);
+    xorl(tmp1, tmp1);
+    xorl(tmp2, tmp2);
+    movl(tmp3, in_out2);
+    addl(tmp3, size);
+
+    bind(L_processPartition);
+      crc32(in_out3, Address(in_out2, 0), 4);
+      crc32(tmp1, Address(in_out2, size), 4);
+      crc32(tmp2, Address(in_out2, size*2), 4);
+      crc32(in_out3, Address(in_out2, 0+4), 4);
+      crc32(tmp1, Address(in_out2, size+4), 4);
+      crc32(tmp2, Address(in_out2, size*2+4), 4);
+      addl(in_out2, 8);
+      cmpl(in_out2, tmp3);
+      jcc(Assembler::less, L_processPartition);
+
+        push(tmp3);
+        push(in_out1);
+        push(in_out2);
+        tmp4 = tmp3;
+        tmp5 = in_out1;
+        n_tmp6 = in_out2;
+
+      crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
+            w_xtmp1, w_xtmp2, w_xtmp3,
+            tmp4, tmp5,
+            n_tmp6);
+
+        pop(in_out2);
+        pop(in_out1);
+        pop(tmp3);
+
+    addl(in_out2, 2 * size);
+    subl(in_out1, 3 * size);
+    jmp(L_processPartitions);
+
+  bind(L_exit);
+}
+#endif //LP64
+
+#ifdef _LP64
+// Algorithm 2: Pipelined usage of the CRC32 instruction.
+// Input: A buffer I of L bytes.
+// Output: the CRC32C value of the buffer.
+// Notations:
+// Write L = 24N + r, with N = floor (L/24).
+// r = L mod 24 (0 <= r < 24).
+// Consider I as the concatenation of A|B|C|R, where A, B, C, each,
+// N quadwords, and R consists of r bytes.
+// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
+// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
+// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
+// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
+void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+                                          Register tmp1, Register tmp2, Register tmp3,
+                                          Register tmp4, Register tmp5, Register tmp6,
+                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                          bool is_pclmulqdq_supported) {
+  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
+  Label L_wordByWord;
+  Label L_byteByByteProlog;
+  Label L_byteByByte;
+  Label L_exit;
+
+  if (is_pclmulqdq_supported ) {
+    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
+    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
+
+    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
+    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
+
+    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
+    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
+    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
+  } else {
+    const_or_pre_comp_const_index[0] = 1;
+    const_or_pre_comp_const_index[1] = 0;
+
+    const_or_pre_comp_const_index[2] = 3;
+    const_or_pre_comp_const_index[3] = 2;
+
+    const_or_pre_comp_const_index[4] = 5;
+    const_or_pre_comp_const_index[5] = 4;
+   }
+  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  movl(tmp1, in2);
+  andl(tmp1, 0x00000007);
+  negl(tmp1);
+  addl(tmp1, in2);
+  addq(tmp1, in1);
+
+  BIND(L_wordByWord);
+  cmpq(in1, tmp1);
+  jcc(Assembler::greaterEqual, L_byteByByteProlog);
+    crc32(in_out, Address(in1, 0), 4);
+    addq(in1, 4);
+    jmp(L_wordByWord);
+
+  BIND(L_byteByByteProlog);
+  andl(in2, 0x00000007);
+  movl(tmp2, 1);
+
+  BIND(L_byteByByte);
+  cmpl(tmp2, in2);
+  jccb(Assembler::greater, L_exit);
+    crc32(in_out, Address(in1, 0), 1);
+    incq(in1);
+    incl(tmp2);
+    jmp(L_byteByByte);
+
+  BIND(L_exit);
+}
+#else
+void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+                                          Register tmp1, Register  tmp2, Register tmp3,
+                                          Register tmp4, Register  tmp5, Register tmp6,
+                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                                          bool is_pclmulqdq_supported) {
+  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
+  Label L_wordByWord;
+  Label L_byteByByteProlog;
+  Label L_byteByByte;
+  Label L_exit;
+
+  if (is_pclmulqdq_supported) {
+    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
+    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
+
+    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
+    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
+
+    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
+    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
+  } else {
+    const_or_pre_comp_const_index[0] = 1;
+    const_or_pre_comp_const_index[1] = 0;
+
+    const_or_pre_comp_const_index[2] = 3;
+    const_or_pre_comp_const_index[3] = 2;
+
+    const_or_pre_comp_const_index[4] = 5;
+    const_or_pre_comp_const_index[5] = 4;
+  }
+  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
+                    in2, in1, in_out,
+                    tmp1, tmp2, tmp3,
+                    w_xtmp1, w_xtmp2, w_xtmp3,
+                    tmp4, tmp5,
+                    tmp6);
+  movl(tmp1, in2);
+  andl(tmp1, 0x00000007);
+  negl(tmp1);
+  addl(tmp1, in2);
+  addl(tmp1, in1);
+
+  BIND(L_wordByWord);
+  cmpl(in1, tmp1);
+  jcc(Assembler::greaterEqual, L_byteByByteProlog);
+    crc32(in_out, Address(in1,0), 4);
+    addl(in1, 4);
+    jmp(L_wordByWord);
+
+  BIND(L_byteByByteProlog);
+  andl(in2, 0x00000007);
+  movl(tmp2, 1);
+
+  BIND(L_byteByByte);
+  cmpl(tmp2, in2);
+  jccb(Assembler::greater, L_exit);
+    movb(tmp1, Address(in1, 0));
+    crc32(in_out, tmp1, 1);
+    incl(in1);
+    incl(tmp2);
+    jmp(L_byteByByte);
+
+  BIND(L_exit);
+}
+#endif // LP64
 #undef BIND
 #undef BLOCK_COMMENT
 
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -907,14 +907,14 @@
   // all corner cases and may result in NaN and require fallback to a
   // runtime call.
   void fast_pow();
-  void fast_exp();
+  void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                Register rax, Register rcx, Register rdx, Register tmp);
   void increase_precision();
   void restore_precision();
 
-  // computes exp(x). Fallback to runtime call included.
-  void exp_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(true, num_fpu_regs_in_use); }
   // computes pow(x,y). Fallback to runtime call included.
-  void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(false, num_fpu_regs_in_use); }
+  void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(num_fpu_regs_in_use); }
 
 private:
 
@@ -925,7 +925,7 @@
   void pow_exp_core_encoding();
 
   // computes pow(x,y) or exp(x). Fallback to runtime call included.
-  void pow_or_exp(bool is_exp, int num_fpu_regs_in_use);
+  void pow_or_exp(int num_fpu_regs_in_use);
 
   // these are private because users should be doing movflt/movdbl
 
@@ -971,6 +971,10 @@
   void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
   void movsd(XMMRegister dst, AddressLiteral src);
 
+  void mulpd(XMMRegister dst, XMMRegister src)    { Assembler::mulpd(dst, src); }
+  void mulpd(XMMRegister dst, Address src)        { Assembler::mulpd(dst, src); }
+  void mulpd(XMMRegister dst, AddressLiteral src);
+
   void mulsd(XMMRegister dst, XMMRegister src)    { Assembler::mulsd(dst, src); }
   void mulsd(XMMRegister dst, Address src)        { Assembler::mulsd(dst, src); }
   void mulsd(XMMRegister dst, AddressLiteral src);
@@ -1278,9 +1282,42 @@
                Register raxReg);
 #endif
 
-  // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
+  // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
   void update_byte_crc32(Register crc, Register val, Register table);
   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
+  // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
+  // Note on a naming convention:
+  // Prefix w = register only used on a Westmere+ architecture
+  // Prefix n = register only used on a Nehalem architecture
+#ifdef _LP64
+  void crc32c_ipl_alg4(Register in_out, uint32_t n,
+                       Register tmp1, Register tmp2, Register tmp3);
+#else
+  void crc32c_ipl_alg4(Register in_out, uint32_t n,
+                       Register tmp1, Register tmp2, Register tmp3,
+                       XMMRegister xtmp1, XMMRegister xtmp2);
+#endif
+  void crc32c_pclmulqdq(XMMRegister w_xtmp1,
+                        Register in_out,
+                        uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
+                        XMMRegister w_xtmp2,
+                        Register tmp1,
+                        Register n_tmp2, Register n_tmp3);
+  void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
+                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                       Register tmp1, Register tmp2,
+                       Register n_tmp3);
+  void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+                         Register in_out1, Register in_out2, Register in_out3,
+                         Register tmp1, Register tmp2, Register tmp3,
+                         XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                         Register tmp4, Register tmp5,
+                         Register n_tmp6);
+  void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+                            Register tmp1, Register tmp2, Register tmp3,
+                            Register tmp4, Register tmp5, Register tmp6,
+                            XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+                            bool is_pclmulqdq_supported);
   // Fold 128-bit data chunk
   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
   void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cpu/x86/vm/macroAssembler_x86_libm.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -0,0 +1,677 @@
+/*
+ * Copyright (c) 2015, Intel Corporation.
+ * Intel Math Library (LIBM) Source Code
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/******************************************************************************/
+//                     ALGORITHM DESCRIPTION
+//                     ---------------------
+//
+// Description:
+//  Let K = 64 (table size).
+//        x    x/log(2)     n
+//       e  = 2          = 2 * T[j] * (1 + P(y))
+//  where
+//       x = m*log(2)/K + y,    y in [-log(2)/K..log(2)/K]
+//       m = n*K + j,           m,n,j - signed integer, j in [-K/2..K/2]
+//                  j/K
+//       values of 2   are tabulated as T[j] = T_hi[j] ( 1 + T_lo[j]).
+//
+//       P(y) is a minimax polynomial approximation of exp(y)-1
+//       on the small interval [-log(2)/K..log(2)/K] (coefficients were calculated with Maple V).
+//
+//  To avoid problems with arithmetic overflow and underflow,
+//            n                        n1  n2
+//  value of 2  is safely computed as 2 * 2 where n1 in [-BIAS/2..BIAS/2]
+//  where BIAS is a value of exponent bias.
+//
+// Special cases:
+//  exp(NaN) = NaN
+//  exp(+INF) = +INF
+//  exp(-INF) = 0
+//  exp(x) = 1 for subnormals
+//  for finite argument, only exp(0)=1 is exact
+//  For IEEE double
+//    if x >  709.782712893383973096 then exp(x) overflow
+//    if x < -745.133219101941108420 then exp(x) underflow
+//
+/******************************************************************************/
+
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "macroAssembler_x86.hpp"
+
+#ifdef _MSC_VER
+#define ALIGNED_(x) __declspec(align(x))
+#else
+#define ALIGNED_(x) __attribute__ ((aligned(x)))
+#endif
+
+#ifdef _LP64
+
+ALIGNED_(16) juint _cv[] =  // double constants stored as dword pairs (low dword first); fast_exp reads 16-byte rows at byte offsets 0, 16, 32, 48, 64 and 80
+{
+    0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL, 0xfefa0000UL,
+    0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL, 0xbc9e3b3aUL, 0x3d1cf79aUL,
+    0xbc9e3b3aUL, 0x3d1cf79aUL, 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL,
+    0x3fdfffffUL, 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL,
+    0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
+};
+
+ALIGNED_(16) juint _shifter[] =  // 0x4338000000000000 (1.5 * 2^52) in both lanes; fast_exp adds it to x * cv[0] and subtracts it again
+{
+    0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
+};
+
+ALIGNED_(16) juint _mmask[] =  // AND mask that keeps bits 6..31 of each 64-bit lane and clears every other bit
+{
+    0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
+};
+
+ALIGNED_(16) juint _bias[] =  // 0xffc0 == 1023 << 6 in each 64-bit lane; added before fast_exp shifts the value left by 46
+{
+    0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
+};
+
+ALIGNED_(16) juint _Tbl_addr[] =
+{
+    0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x0e03754dUL,
+    0x3cad7bbfUL, 0x3e778060UL, 0x00002c9aUL, 0x3567f613UL, 0x3c8cd252UL,
+    0xd3158574UL, 0x000059b0UL, 0x61e6c861UL, 0x3c60f74eUL, 0x18759bc8UL,
+    0x00008745UL, 0x5d837b6cUL, 0x3c979aa6UL, 0x6cf9890fUL, 0x0000b558UL,
+    0x702f9cd1UL, 0x3c3ebe3dUL, 0x32d3d1a2UL, 0x0000e3ecUL, 0x1e63bcd8UL,
+    0x3ca3516eUL, 0xd0125b50UL, 0x00011301UL, 0x26f0387bUL, 0x3ca4c554UL,
+    0xaea92ddfUL, 0x0001429aUL, 0x62523fb6UL, 0x3ca95153UL, 0x3c7d517aUL,
+    0x000172b8UL, 0x3f1353bfUL, 0x3c8b898cUL, 0xeb6fcb75UL, 0x0001a35bUL,
+    0x3e3a2f5fUL, 0x3c9aecf7UL, 0x3168b9aaUL, 0x0001d487UL, 0x44a6c38dUL,
+    0x3c8a6f41UL, 0x88628cd6UL, 0x0002063bUL, 0xe3a8a894UL, 0x3c968efdUL,
+    0x6e756238UL, 0x0002387aUL, 0x981fe7f2UL, 0x3c80472bUL, 0x65e27cddUL,
+    0x00026b45UL, 0x6d09ab31UL, 0x3c82f7e1UL, 0xf51fdee1UL, 0x00029e9dUL,
+    0x720c0ab3UL, 0x3c8b3782UL, 0xa6e4030bUL, 0x0002d285UL, 0x4db0abb6UL,
+    0x3c834d75UL, 0x0a31b715UL, 0x000306feUL, 0x5dd3f84aUL, 0x3c8fdd39UL,
+    0xb26416ffUL, 0x00033c08UL, 0xcc187d29UL, 0x3ca12f8cUL, 0x373aa9caUL,
+    0x000371a7UL, 0x738b5e8bUL, 0x3ca7d229UL, 0x34e59ff6UL, 0x0003a7dbUL,
+    0xa72a4c6dUL, 0x3c859f48UL, 0x4c123422UL, 0x0003dea6UL, 0x259d9205UL,
+    0x3ca8b846UL, 0x21f72e29UL, 0x0004160aUL, 0x60c2ac12UL, 0x3c4363edUL,
+    0x6061892dUL, 0x00044e08UL, 0xdaa10379UL, 0x3c6ecce1UL, 0xb5c13cd0UL,
+    0x000486a2UL, 0xbb7aafb0UL, 0x3c7690ceUL, 0xd5362a27UL, 0x0004bfdaUL,
+    0x9b282a09UL, 0x3ca083ccUL, 0x769d2ca6UL, 0x0004f9b2UL, 0xc1aae707UL,
+    0x3ca509b0UL, 0x569d4f81UL, 0x0005342bUL, 0x18fdd78eUL, 0x3c933505UL,
+    0x36b527daUL, 0x00056f47UL, 0xe21c5409UL, 0x3c9063e1UL, 0xdd485429UL,
+    0x0005ab07UL, 0x2b64c035UL, 0x3c9432e6UL, 0x15ad2148UL, 0x0005e76fUL,
+    0x99f08c0aUL, 0x3ca01284UL, 0xb03a5584UL, 0x0006247eUL, 0x0073dc06UL,
+    0x3c99f087UL, 0x82552224UL, 0x00066238UL, 0x0da05571UL, 0x3c998d4dUL,
+    0x667f3bccUL, 0x0006a09eUL, 0x86ce4786UL, 0x3ca52bb9UL, 0x3c651a2eUL,
+    0x0006dfb2UL, 0x206f0dabUL, 0x3ca32092UL, 0xe8ec5f73UL, 0x00071f75UL,
+    0x8e17a7a6UL, 0x3ca06122UL, 0x564267c8UL, 0x00075febUL, 0x461e9f86UL,
+    0x3ca244acUL, 0x73eb0186UL, 0x0007a114UL, 0xabd66c55UL, 0x3c65ebe1UL,
+    0x36cf4e62UL, 0x0007e2f3UL, 0xbbff67d0UL, 0x3c96fe9fUL, 0x994cce12UL,
+    0x00082589UL, 0x14c801dfUL, 0x3c951f14UL, 0x9b4492ecUL, 0x000868d9UL,
+    0xc1f0eab4UL, 0x3c8db72fUL, 0x422aa0dbUL, 0x0008ace5UL, 0x59f35f44UL,
+    0x3c7bf683UL, 0x99157736UL, 0x0008f1aeUL, 0x9c06283cUL, 0x3ca360baUL,
+    0xb0cdc5e4UL, 0x00093737UL, 0x20f962aaUL, 0x3c95e8d1UL, 0x9fde4e4fUL,
+    0x00097d82UL, 0x2b91ce27UL, 0x3c71affcUL, 0x82a3f090UL, 0x0009c491UL,
+    0x589a2ebdUL, 0x3c9b6d34UL, 0x7b5de564UL, 0x000a0c66UL, 0x9ab89880UL,
+    0x3c95277cUL, 0xb23e255cUL, 0x000a5503UL, 0x6e735ab3UL, 0x3c846984UL,
+    0x5579fdbfUL, 0x000a9e6bUL, 0x92cb3387UL, 0x3c8c1a77UL, 0x995ad3adUL,
+    0x000ae89fUL, 0xdc2d1d96UL, 0x3ca22466UL, 0xb84f15faUL, 0x000b33a2UL,
+    0xb19505aeUL, 0x3ca1112eUL, 0xf2fb5e46UL, 0x000b7f76UL, 0x0a5fddcdUL,
+    0x3c74ffd7UL, 0x904bc1d2UL, 0x000bcc1eUL, 0x30af0cb3UL, 0x3c736eaeUL,
+    0xdd85529cUL, 0x000c199bUL, 0xd10959acUL, 0x3c84e08fUL, 0x2e57d14bUL,
+    0x000c67f1UL, 0x6c921968UL, 0x3c676b2cUL, 0xdcef9069UL, 0x000cb720UL,
+    0x36df99b3UL, 0x3c937009UL, 0x4a07897bUL, 0x000d072dUL, 0xa63d07a7UL,
+    0x3c74a385UL, 0xdcfba487UL, 0x000d5818UL, 0xd5c192acUL, 0x3c8e5a50UL,
+    0x03db3285UL, 0x000da9e6UL, 0x1c4a9792UL, 0x3c98bb73UL, 0x337b9b5eUL,
+    0x000dfc97UL, 0x603a88d3UL, 0x3c74b604UL, 0xe78b3ff6UL, 0x000e502eUL,
+    0x92094926UL, 0x3c916f27UL, 0xa2a490d9UL, 0x000ea4afUL, 0x41aa2008UL,
+    0x3c8ec3bcUL, 0xee615a27UL, 0x000efa1bUL, 0x31d185eeUL, 0x3c8a64a9UL,
+    0x5b6e4540UL, 0x000f5076UL, 0x4d91cd9dUL, 0x3c77893bUL, 0x819e90d8UL,
+    0x000fa7c1UL
+};
+
+ALIGNED_(16) juint _ALLONES[] =  // 128 bits, all set; fast_exp shifts it left to form a mask
+{
+    0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
+};
+
+ALIGNED_(16) juint _ebias[] =  // 0x3ff0000000000000 (the bit pattern of 1.0) in both lanes
+{
+    0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x3ff00000UL
+};
+
+ALIGNED_(4) juint _XMAX[] =  // 0x7fefffffffffffff: largest finite double; squared by fast_exp on the overflow path
+{
+    0xffffffffUL, 0x7fefffffUL
+};
+
+ALIGNED_(4) juint _XMIN[] =  // 0x0010000000000000: smallest normal double; squared by fast_exp on the underflow path
+{
+    0x00000000UL, 0x00100000UL
+};
+
+ALIGNED_(4) juint _INF[] =  // 0x7ff0000000000000: +infinity
+{
+    0x00000000UL, 0x7ff00000UL
+};
+
+ALIGNED_(4) juint _ZERO[] =  // +0.0
+{
+    0x00000000UL, 0x00000000UL
+};
+
+ALIGNED_(4) juint _ONE_val[] =  // 0x3ff0000000000000: 1.0
+{
+    0x00000000UL, 0x3ff00000UL
+};
+
+
+// fast_exp, 64-bit (_LP64) variant: emits the instruction sequence that computes exp(x).
+// Registers:
+// input: xmm0 (the result is also left in xmm0)
+// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+//          rax, rdx, rcx, tmp - r11
+// Code generated by Intel C compiler for LIBM library
+
+void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
+  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
+  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
+  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
+  Label L_2TAG_PACKET_12_0_2, B1_3, B1_5, start;
+
+  assert_different_registers(tmp, eax, ecx, edx);  // the integer scratch registers must not alias each other
+  jmp(start);  // the locals below emit no instructions, so this jump targets the next emitted instruction
+  address cv = (address)_cv;
+  address Shifter = (address)_shifter;
+  address mmask = (address)_mmask;
+  address bias = (address)_bias;
+  address Tbl_addr = (address)_Tbl_addr;
+  address ALLONES = (address)_ALLONES;
+  address ebias = (address)_ebias;
+  address XMAX = (address)_XMAX;
+  address XMIN = (address)_XMIN;
+  address INF = (address)_INF;
+  address ZERO = (address)_ZERO;
+  address ONE_val = (address)_ONE_val;
+
+  bind(start);
+  subq(rsp, 24);  // reserve 24 bytes of stack scratch (released at B1_5)
+  movsd(Address(rsp, 8), xmm0);  // keep the original argument x at [rsp+8]
+  unpcklpd(xmm0, xmm0);  // duplicate x into both 64-bit lanes
+  movdqu(xmm1, ExternalAddress(cv));       // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
+  movdqu(xmm6, ExternalAddress(Shifter));  // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
+  movdqu(xmm2, ExternalAddress(16+cv));    // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
+  movdqu(xmm3, ExternalAddress(32+cv));    // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
+  pextrw(eax, xmm0, 3);  // eax = bits 48..63 of x (sign, exponent, top mantissa bits)
+  andl(eax, 32767);  // drop the sign bit
+  movl(edx, 16527);
+  subl(edx, eax);  // negative if eax > 16527 (0x408f)
+  subl(eax, 15504);  // negative if eax < 15504 (0x3c90)
+  orl(edx, eax);
+  cmpl(edx, INT_MIN);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);  // sign bit set: |x| is below or above the range handled by the main path
+  mulpd(xmm1, xmm0);  // x * cv[0]
+  addpd(xmm1, xmm6);  // + shifter
+  movapd(xmm7, xmm1);  // keep the shifted value; its low dword is read as the integer m below
+  subpd(xmm1, xmm6);  // - shifter
+  mulpd(xmm2, xmm1);
+  movdqu(xmm4, ExternalAddress(64+cv));    // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
+  mulpd(xmm3, xmm1);
+  movdqu(xmm5, ExternalAddress(80+cv));    // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
+  subpd(xmm0, xmm2);
+  movdl(eax, xmm7);  // eax = m
+  movl(ecx, eax);
+  andl(ecx, 63);
+  shll(ecx, 4);  // ecx = (m & 63) * 16: byte offset of the table entry
+  sarl(eax, 6);  // eax = m >> 6 (arithmetic shift)
+  movl(edx, eax);
+  movdqu(xmm6, ExternalAddress(mmask));    // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
+  pand(xmm7, xmm6);  // clear the low 6 bits of m
+  movdqu(xmm6, ExternalAddress(bias));     // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
+  paddq(xmm7, xmm6);  // add 0xffc0 (1023 << 6)
+  psllq(xmm7, 46);  // bit 6 lands on bit 52, i.e. in the exponent field of a double
+  subpd(xmm0, xmm3);
+  lea(tmp, ExternalAddress(Tbl_addr));
+  movdqu(xmm2, Address(ecx,tmp));  // 16-byte table entry selected by ecx
+  mulpd(xmm4, xmm0);
+  movapd(xmm6, xmm0);
+  movapd(xmm1, xmm0);
+  mulpd(xmm6, xmm6);
+  mulpd(xmm0, xmm6);
+  addpd(xmm5, xmm4);
+  mulsd(xmm0, xmm6);
+  mulpd(xmm6, ExternalAddress(48+cv));     // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
+  addsd(xmm1, xmm2);
+  unpckhpd(xmm2, xmm2);
+  mulpd(xmm0, xmm5);
+  addsd(xmm1, xmm0);
+  por(xmm2, xmm7);  // merge the exponent bits built in xmm7 into the table value
+  unpckhpd(xmm0, xmm0);
+  addsd(xmm0, xmm1);
+  addsd(xmm0, xmm6);
+  addl(edx, 894);
+  cmpl(edx, 1916);
+  jcc (Assembler::above, L_2TAG_PACKET_1_0_2);  // unsigned test: taken unless -894 <= (m >> 6) <= 1022
+  mulsd(xmm0, xmm2);
+  addsd(xmm0, xmm2);  // result = xmm0 * xmm2 + xmm2
+  jmp (B1_5);
+
+  bind(L_2TAG_PACKET_1_0_2);  // (m >> 6) is outside [-894, 1022]
+  xorpd(xmm3, xmm3);
+  movdqu(xmm4, ExternalAddress(ALLONES));  // 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
+  movl(edx, -1022);
+  subl(edx, eax);  // edx = -1022 - (m >> 6)
+  movdl(xmm5, edx);
+  psllq(xmm4, xmm5);  // all-ones mask shifted left by the count in xmm5
+  movl(ecx, eax);  // ecx = m >> 6
+  sarl(eax, 1);  // eax = (m >> 6) >> 1
+  pinsrw(xmm3, eax, 3);
+  movdqu(xmm6, ExternalAddress(ebias));    // 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x3ff00000UL
+  psllq(xmm3, 4);
+  psubd(xmm2, xmm3);
+  mulsd(xmm0, xmm2);
+  cmpl(edx, 52);
+  jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
+  pand(xmm4, xmm2);
+  paddd(xmm3, xmm6);
+  subsd(xmm2, xmm4);
+  addsd(xmm0, xmm2);
+  cmpl(ecx, 1023);
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32768);  // isolate the sign bit of the intermediate value
+  orl(edx, ecx);
+  cmpl(edx, 0);
+  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
+  movapd(xmm6, xmm0);
+  addsd(xmm0, xmm4);
+  mulsd(xmm0, xmm3);
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32752);  // 0x7ff0: exponent field of the result
+  cmpl(ecx, 0);
+  jcc(Assembler::equal, L_2TAG_PACKET_5_0_2);  // exponent field is zero: result is zero or denormal
+  jmp(B1_5);
+
+  bind(L_2TAG_PACKET_5_0_2);
+  mulsd(xmm6, xmm3);
+  mulsd(xmm4, xmm3);
+  movdqu(xmm0, xmm6);
+  pxor(xmm6, xmm4);
+  psrad(xmm6, 31);
+  pshufd(xmm6, xmm6, 85);
+  psllq(xmm0, 1);
+  psrlq(xmm0, 1);
+  pxor(xmm0, xmm6);
+  psrlq(xmm6, 63);
+  paddq(xmm0, xmm6);
+  paddq(xmm0, xmm4);
+  movl(Address(rsp,0), 15);  // NOTE(review): [rsp+0] is never read back in this function; presumably a leftover libm error code
+  jmp(L_2TAG_PACKET_6_0_2);
+
+  bind(L_2TAG_PACKET_4_0_2);
+  addsd(xmm0, xmm4);
+  mulsd(xmm0, xmm3);
+  jmp(B1_5);
+
+  bind(L_2TAG_PACKET_3_0_2);
+  addsd(xmm0, xmm4);
+  mulsd(xmm0, xmm3);
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32752);
+  cmpl(ecx, 32752);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);  // exponent field all ones: the result overflowed
+  jmp(B1_5);
+
+  bind(L_2TAG_PACKET_2_0_2);
+  paddd(xmm3, xmm6);
+  addpd(xmm0, xmm2);
+  mulsd(xmm0, xmm3);
+  movl(Address(rsp,0), 15);
+  jmp(L_2TAG_PACKET_6_0_2);
+
+  bind(L_2TAG_PACKET_8_0_2);  // reached from L_2TAG_PACKET_0_0_2 with eax = high dword of |x|
+  cmpl(eax, 2146435072);  // 0x7ff00000
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);  // exponent field all ones: x is infinite or NaN
+  movl(eax, Address(rsp,12));  // high dword of x, sign included
+  cmpl(eax, INT_MIN);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_10_0_2);  // unsigned >= 0x80000000: x is negative
+  movsd(xmm0, ExternalAddress(XMAX));      // 0xffffffffUL, 0x7fefffffUL
+  mulsd(xmm0, xmm0);  // XMAX * XMAX
+
+  bind(L_2TAG_PACKET_7_0_2);
+  movl(Address(rsp,0), 14);
+  jmp(L_2TAG_PACKET_6_0_2);
+
+  bind(L_2TAG_PACKET_10_0_2);
+  movsd(xmm0, ExternalAddress(XMIN));      // 0x00000000UL, 0x00100000UL
+  mulsd(xmm0, xmm0);  // XMIN * XMIN
+  movl(Address(rsp,0), 15);
+  jmp(L_2TAG_PACKET_6_0_2);
+
+  bind(L_2TAG_PACKET_9_0_2);
+  movl(edx, Address(rsp,8));  // low dword of x
+  cmpl(eax, 2146435072);
+  jcc(Assembler::above, L_2TAG_PACKET_11_0_2);  // mantissa bits set in the high dword: NaN
+  cmpl(edx, 0);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);  // mantissa bits set in the low dword: NaN
+  movl(eax, Address(rsp,12));
+  cmpl(eax, 2146435072);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_2);  // sign bit set: x is -INF
+  movsd(xmm0, ExternalAddress(INF));       // 0x00000000UL, 0x7ff00000UL
+  jmp(B1_5);
+
+  bind(L_2TAG_PACKET_12_0_2);
+  movsd(xmm0, ExternalAddress(ZERO));      // 0x00000000UL, 0x00000000UL
+  jmp(B1_5);
+
+  bind(L_2TAG_PACKET_11_0_2);
+  movsd(xmm0, Address(rsp, 8));  // reload the original x
+  addsd(xmm0, xmm0);  // NaN input: return x + x
+  jmp(B1_5);
+
+  bind(L_2TAG_PACKET_0_0_2);  // |x| outside the main-path range
+  movl(eax, Address(rsp, 12));
+  andl(eax, 2147483647);  // eax = high dword of |x|
+  cmpl(eax, 1083179008);  // 0x40900000, the high dword of 1024.0
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_8_0_2);
+  movsd(Address(rsp, 8), xmm0);
+  addsd(xmm0, ExternalAddress(ONE_val));   // 0x00000000UL, 0x3ff00000UL (1.0): tiny |x|, result = x + 1
+  jmp(B1_5);
+
+  bind(L_2TAG_PACKET_6_0_2);
+  movq(Address(rsp, 16), xmm0);  // spill the result to [rsp+16] ...
+
+  bind(B1_3);
+  movq(xmm0, Address(rsp, 16));  // ... and reload it
+
+  bind(B1_5);
+  addq(rsp, 24);  // release the stack scratch; the result is in xmm0
+}
+#endif
+
+#ifndef _LP64
+
+ALIGNED_(16) juint _static_const_table[] =
+{
+    0x00000000UL, 0xfff00000UL, 0x00000000UL, 0xfff00000UL, 0xffffffc0UL,
+    0x00000000UL, 0xffffffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL,
+    0x0000ffc0UL, 0x00000000UL, 0x00000000UL, 0x43380000UL, 0x00000000UL,
+    0x43380000UL, 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL,
+    0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL, 0xbc9e3b3aUL,
+    0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xfffffffeUL, 0x3fdfffffUL,
+    0xfffffffeUL, 0x3fdfffffUL, 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL,
+    0x3fa55555UL, 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL,
+    0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x0e03754dUL,
+    0x3cad7bbfUL, 0x3e778060UL, 0x00002c9aUL, 0x3567f613UL, 0x3c8cd252UL,
+    0xd3158574UL, 0x000059b0UL, 0x61e6c861UL, 0x3c60f74eUL, 0x18759bc8UL,
+    0x00008745UL, 0x5d837b6cUL, 0x3c979aa6UL, 0x6cf9890fUL, 0x0000b558UL,
+    0x702f9cd1UL, 0x3c3ebe3dUL, 0x32d3d1a2UL, 0x0000e3ecUL, 0x1e63bcd8UL,
+    0x3ca3516eUL, 0xd0125b50UL, 0x00011301UL, 0x26f0387bUL, 0x3ca4c554UL,
+    0xaea92ddfUL, 0x0001429aUL, 0x62523fb6UL, 0x3ca95153UL, 0x3c7d517aUL,
+    0x000172b8UL, 0x3f1353bfUL, 0x3c8b898cUL, 0xeb6fcb75UL, 0x0001a35bUL,
+    0x3e3a2f5fUL, 0x3c9aecf7UL, 0x3168b9aaUL, 0x0001d487UL, 0x44a6c38dUL,
+    0x3c8a6f41UL, 0x88628cd6UL, 0x0002063bUL, 0xe3a8a894UL, 0x3c968efdUL,
+    0x6e756238UL, 0x0002387aUL, 0x981fe7f2UL, 0x3c80472bUL, 0x65e27cddUL,
+    0x00026b45UL, 0x6d09ab31UL, 0x3c82f7e1UL, 0xf51fdee1UL, 0x00029e9dUL,
+    0x720c0ab3UL, 0x3c8b3782UL, 0xa6e4030bUL, 0x0002d285UL, 0x4db0abb6UL,
+    0x3c834d75UL, 0x0a31b715UL, 0x000306feUL, 0x5dd3f84aUL, 0x3c8fdd39UL,
+    0xb26416ffUL, 0x00033c08UL, 0xcc187d29UL, 0x3ca12f8cUL, 0x373aa9caUL,
+    0x000371a7UL, 0x738b5e8bUL, 0x3ca7d229UL, 0x34e59ff6UL, 0x0003a7dbUL,
+    0xa72a4c6dUL, 0x3c859f48UL, 0x4c123422UL, 0x0003dea6UL, 0x259d9205UL,
+    0x3ca8b846UL, 0x21f72e29UL, 0x0004160aUL, 0x60c2ac12UL, 0x3c4363edUL,
+    0x6061892dUL, 0x00044e08UL, 0xdaa10379UL, 0x3c6ecce1UL, 0xb5c13cd0UL,
+    0x000486a2UL, 0xbb7aafb0UL, 0x3c7690ceUL, 0xd5362a27UL, 0x0004bfdaUL,
+    0x9b282a09UL, 0x3ca083ccUL, 0x769d2ca6UL, 0x0004f9b2UL, 0xc1aae707UL,
+    0x3ca509b0UL, 0x569d4f81UL, 0x0005342bUL, 0x18fdd78eUL, 0x3c933505UL,
+    0x36b527daUL, 0x00056f47UL, 0xe21c5409UL, 0x3c9063e1UL, 0xdd485429UL,
+    0x0005ab07UL, 0x2b64c035UL, 0x3c9432e6UL, 0x15ad2148UL, 0x0005e76fUL,
+    0x99f08c0aUL, 0x3ca01284UL, 0xb03a5584UL, 0x0006247eUL, 0x0073dc06UL,
+    0x3c99f087UL, 0x82552224UL, 0x00066238UL, 0x0da05571UL, 0x3c998d4dUL,
+    0x667f3bccUL, 0x0006a09eUL, 0x86ce4786UL, 0x3ca52bb9UL, 0x3c651a2eUL,
+    0x0006dfb2UL, 0x206f0dabUL, 0x3ca32092UL, 0xe8ec5f73UL, 0x00071f75UL,
+    0x8e17a7a6UL, 0x3ca06122UL, 0x564267c8UL, 0x00075febUL, 0x461e9f86UL,
+    0x3ca244acUL, 0x73eb0186UL, 0x0007a114UL, 0xabd66c55UL, 0x3c65ebe1UL,
+    0x36cf4e62UL, 0x0007e2f3UL, 0xbbff67d0UL, 0x3c96fe9fUL, 0x994cce12UL,
+    0x00082589UL, 0x14c801dfUL, 0x3c951f14UL, 0x9b4492ecUL, 0x000868d9UL,
+    0xc1f0eab4UL, 0x3c8db72fUL, 0x422aa0dbUL, 0x0008ace5UL, 0x59f35f44UL,
+    0x3c7bf683UL, 0x99157736UL, 0x0008f1aeUL, 0x9c06283cUL, 0x3ca360baUL,
+    0xb0cdc5e4UL, 0x00093737UL, 0x20f962aaUL, 0x3c95e8d1UL, 0x9fde4e4fUL,
+    0x00097d82UL, 0x2b91ce27UL, 0x3c71affcUL, 0x82a3f090UL, 0x0009c491UL,
+    0x589a2ebdUL, 0x3c9b6d34UL, 0x7b5de564UL, 0x000a0c66UL, 0x9ab89880UL,
+    0x3c95277cUL, 0xb23e255cUL, 0x000a5503UL, 0x6e735ab3UL, 0x3c846984UL,
+    0x5579fdbfUL, 0x000a9e6bUL, 0x92cb3387UL, 0x3c8c1a77UL, 0x995ad3adUL,
+    0x000ae89fUL, 0xdc2d1d96UL, 0x3ca22466UL, 0xb84f15faUL, 0x000b33a2UL,
+    0xb19505aeUL, 0x3ca1112eUL, 0xf2fb5e46UL, 0x000b7f76UL, 0x0a5fddcdUL,
+    0x3c74ffd7UL, 0x904bc1d2UL, 0x000bcc1eUL, 0x30af0cb3UL, 0x3c736eaeUL,
+    0xdd85529cUL, 0x000c199bUL, 0xd10959acUL, 0x3c84e08fUL, 0x2e57d14bUL,
+    0x000c67f1UL, 0x6c921968UL, 0x3c676b2cUL, 0xdcef9069UL, 0x000cb720UL,
+    0x36df99b3UL, 0x3c937009UL, 0x4a07897bUL, 0x000d072dUL, 0xa63d07a7UL,
+    0x3c74a385UL, 0xdcfba487UL, 0x000d5818UL, 0xd5c192acUL, 0x3c8e5a50UL,
+    0x03db3285UL, 0x000da9e6UL, 0x1c4a9792UL, 0x3c98bb73UL, 0x337b9b5eUL,
+    0x000dfc97UL, 0x603a88d3UL, 0x3c74b604UL, 0xe78b3ff6UL, 0x000e502eUL,
+    0x92094926UL, 0x3c916f27UL, 0xa2a490d9UL, 0x000ea4afUL, 0x41aa2008UL,
+    0x3c8ec3bcUL, 0xee615a27UL, 0x000efa1bUL, 0x31d185eeUL, 0x3c8a64a9UL,
+    0x5b6e4540UL, 0x000f5076UL, 0x4d91cd9dUL, 0x3c77893bUL, 0x819e90d8UL,
+    0x000fa7c1UL, 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x7ff00000UL,
+    0x00000000UL, 0x00000000UL, 0xffffffffUL, 0x7fefffffUL, 0x00000000UL,
+    0x00100000UL
+};
+
+// fast_exp, 32-bit (!_LP64) variant: emits the instruction sequence that computes exp(x).
+// Registers:
+// input: (rbp + 8); the result is pushed on the x87 stack (fld_d)
+// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+//          rax, rdx, rcx, rbx (tmp)
+// Code generated by Intel C compiler for LIBM library
+
+void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
+  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
+  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
+  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
+  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
+
+  assert_different_registers(tmp, eax, ecx, edx);  // the integer scratch registers must not alias each other
+  jmp(start);  // the local below emits no instructions, so this jump targets the next emitted instruction
+  address static_const_table = (address)_static_const_table;
+
+  bind(start);
+  subl(rsp, 120);  // reserve 120 bytes of stack scratch; this sequence does not re-adjust rsp afterwards
+  movl(Address(rsp, 64), tmp);  // save tmp; it is restored at L_2TAG_PACKET_6_0_2
+  lea(tmp, ExternalAddress(static_const_table));  // tmp = base of the constant table for the rest of the sequence
+  movdqu(xmm0, Address(rsp, 128));  // load the argument x from the caller's stack area
+  unpcklpd(xmm0, xmm0);  // duplicate x into both 64-bit lanes
+  movdqu(xmm1, Address(tmp, 64));          // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
+  movdqu(xmm6, Address(tmp, 48));          // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
+  movdqu(xmm2, Address(tmp, 80));          // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
+  movdqu(xmm3, Address(tmp, 96));          // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
+  pextrw(eax, xmm0, 3);  // eax = bits 48..63 of x (sign, exponent, top mantissa bits)
+  andl(eax, 32767);  // drop the sign bit
+  movl(edx, 16527);
+  subl(edx, eax);  // negative if eax > 16527 (0x408f)
+  subl(eax, 15504);  // negative if eax < 15504 (0x3c90)
+  orl(edx, eax);
+  cmpl(edx, INT_MIN);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);  // sign bit set: |x| is below or above the range handled by the main path
+  mulpd(xmm1, xmm0);  // x * first constant
+  addpd(xmm1, xmm6);  // + shifter
+  movapd(xmm7, xmm1);  // keep the shifted value; its low dword is read as the integer m below
+  subpd(xmm1, xmm6);  // - shifter
+  mulpd(xmm2, xmm1);
+  movdqu(xmm4, Address(tmp, 128));         // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
+  mulpd(xmm3, xmm1);
+  movdqu(xmm5, Address(tmp, 144));         // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
+  subpd(xmm0, xmm2);
+  movdl(eax, xmm7);  // eax = m
+  movl(ecx, eax);
+  andl(ecx, 63);
+  shll(ecx, 4);  // ecx = (m & 63) * 16: byte offset of the table entry
+  sarl(eax, 6);  // eax = m >> 6 (arithmetic shift)
+  movl(edx, eax);
+  movdqu(xmm6, Address(tmp, 16));          // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
+  pand(xmm7, xmm6);  // clear the low 6 bits of m
+  movdqu(xmm6, Address(tmp, 32));          // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
+  paddq(xmm7, xmm6);  // add 0xffc0 (1023 << 6)
+  psllq(xmm7, 46);  // bit 6 lands on bit 52, i.e. in the exponent field of a double
+  subpd(xmm0, xmm3);
+  movdqu(xmm2, Address(tmp, ecx, Address::times_1, 160));  // 16-byte table entry; the entries start at byte offset 160
+  mulpd(xmm4, xmm0);
+  movapd(xmm6, xmm0);
+  movapd(xmm1, xmm0);
+  mulpd(xmm6, xmm6);
+  mulpd(xmm0, xmm6);
+  addpd(xmm5, xmm4);
+  mulsd(xmm0, xmm6);
+  mulpd(xmm6, Address(tmp, 112));          // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
+  addsd(xmm1, xmm2);
+  unpckhpd(xmm2, xmm2);
+  mulpd(xmm0, xmm5);
+  addsd(xmm1, xmm0);
+  por(xmm2, xmm7);  // merge the exponent bits built in xmm7 into the table value
+  unpckhpd(xmm0, xmm0);
+  addsd(xmm0, xmm1);
+  addsd(xmm0, xmm6);
+  addl(edx, 894);
+  cmpl(edx, 1916);
+  jcc (Assembler::above, L_2TAG_PACKET_1_0_2);  // unsigned test: taken unless -894 <= (m >> 6) <= 1022
+  mulsd(xmm0, xmm2);
+  addsd(xmm0, xmm2);  // result = xmm0 * xmm2 + xmm2
+  jmp(L_2TAG_PACKET_2_0_2);
+
+  bind(L_2TAG_PACKET_1_0_2);  // (m >> 6) is outside [-894, 1022]: finish the computation with x87 arithmetic
+  fnstcw(Address(rsp, 24));  // save the x87 control word
+  movzwl(edx, Address(rsp, 24));
+  orl(edx, 768);  // set bits 8-9 (0x300), the precision-control field
+  movw(Address(rsp, 28), edx);
+  fldcw(Address(rsp, 28));  // install the modified control word
+  movl(edx, eax);
+  sarl(eax, 1);  // eax = (m >> 6) >> 1
+  subl(edx, eax);  // edx = (m >> 6) - eax
+  movdqu(xmm6, Address(tmp, 0));           // 0x00000000UL, 0xfff00000UL, 0x00000000UL, 0xfff00000UL
+  pandn(xmm6, xmm2);  // xmm6 = ~mask & xmm2: clear the sign and exponent bits
+  addl(eax, 1023);
+  movdl(xmm3, eax);
+  psllq(xmm3, 52);  // (eax + 1023) << 52: a biased exponent field
+  por(xmm6, xmm3);
+  addl(edx, 1023);
+  movdl(xmm4, edx);
+  psllq(xmm4, 52);  // (edx + 1023) << 52: a biased exponent field
+  movsd(Address(rsp, 8), xmm0);
+  fld_d(Address(rsp, 8));
+  movsd(Address(rsp, 16), xmm6);
+  fld_d(Address(rsp, 16));
+  fmula(1);
+  faddp(1);
+  movsd(Address(rsp, 8), xmm4);
+  fld_d(Address(rsp, 8));
+  fmulp(1);
+  fstp_d(Address(rsp, 8));
+  movsd(xmm0,Address(rsp, 8));  // bring the x87 result back into xmm0
+  fldcw(Address(rsp, 24));  // restore the saved control word
+  pextrw(ecx, xmm0, 3);
+  andl(ecx, 32752);  // 0x7ff0: exponent field of the result
+  cmpl(ecx, 32752);
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);  // exponent field all ones: the result overflowed
+  cmpl(ecx, 0);
+  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);  // exponent field is zero: result is zero or denormal
+  jmp(L_2TAG_PACKET_2_0_2);
+  cmpl(ecx, INT_MIN);  // NOTE(review): unreachable from here to the next bind(); it follows an unconditional jmp and no label is bound
+  jcc(Assembler::less, L_2TAG_PACKET_3_0_2);
+  cmpl(ecx, -1064950997);
+  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
+  jcc(Assembler::greater, L_2TAG_PACKET_4_0_2);
+  movl(edx, Address(rsp, 128));
+  cmpl(edx ,-17155601);
+  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
+  jmp(L_2TAG_PACKET_4_0_2);
+
+  bind(L_2TAG_PACKET_3_0_2);
+  movl(edx, 14);  // NOTE(review): edx (14 or 15) is not read again in this function; presumably a leftover libm error code
+  jmp(L_2TAG_PACKET_5_0_2);
+
+  bind(L_2TAG_PACKET_4_0_2);
+  movl(edx, 15);
+
+  bind(L_2TAG_PACKET_5_0_2);
+  movsd(Address(rsp, 0), xmm0);  // spill the result
+  movsd(xmm0, Address(rsp, 128));  // xmm0 = original argument again
+  fld_d(Address(rsp, 0));  // push the result on the x87 stack
+  jmp(L_2TAG_PACKET_6_0_2);
+
+  bind(L_2TAG_PACKET_7_0_2);  // reached from L_2TAG_PACKET_0_0_2 with eax = high dword of |x|
+  cmpl(eax, 2146435072);  // 0x7ff00000
+  jcc(Assembler::greaterEqual, L_2TAG_PACKET_8_0_2);  // exponent field all ones: x is infinite or NaN
+  movl(eax, Address(rsp, 132));  // high dword of x, sign included
+  cmpl(eax, INT_MIN);
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);  // unsigned sign-bit test: x is negative (a signed >= INT_MIN compare is always taken)
+  movsd(xmm0, Address(tmp, 1208));         // 0xffffffffUL, 0x7fefffffUL
+  mulsd(xmm0, xmm0);  // XMAX * XMAX
+  movl(edx, 14);
+  jmp(L_2TAG_PACKET_5_0_2);
+
+  bind(L_2TAG_PACKET_9_0_2);
+  movsd(xmm0, Address(tmp, 1216));         // 0x00000000UL, 0x00100000UL
+  mulsd(xmm0, xmm0);  // XMIN * XMIN
+  movl(edx, 15);
+  jmp(L_2TAG_PACKET_5_0_2);
+
+  bind(L_2TAG_PACKET_8_0_2);
+  movl(edx, Address(rsp, 128));  // low dword of x
+  cmpl(eax, 2146435072);
+  jcc(Assembler::above, L_2TAG_PACKET_10_0_2);  // mantissa bits set in the high dword: NaN
+  cmpl(edx, 0);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_10_0_2);  // mantissa bits set in the low dword: NaN
+  movl(eax, Address(rsp, 132));
+  cmpl(eax, 2146435072);
+  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);  // sign bit set: x is -INF
+  movsd(xmm0, Address(tmp, 1192));         // 0x00000000UL, 0x7ff00000UL
+  jmp(L_2TAG_PACKET_2_0_2);
+
+  bind(L_2TAG_PACKET_11_0_2);
+  movsd(xmm0, Address(tmp, 1200));         // 0x00000000UL, 0x00000000UL
+  jmp(L_2TAG_PACKET_2_0_2);
+
+  bind(L_2TAG_PACKET_10_0_2);
+  movsd(xmm0, Address(rsp, 128));  // reload the original x
+  addsd(xmm0, xmm0);  // NaN input: return x + x
+  jmp(L_2TAG_PACKET_2_0_2);
+
+  bind(L_2TAG_PACKET_0_0_2);  // |x| outside the main-path range
+  movl(eax, Address(rsp, 132));
+  andl(eax, 2147483647);  // eax = high dword of |x|
+  cmpl(eax, 1083179008);  // 0x40900000, the high dword of 1024.0
+  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
+  movsd(xmm0, Address(rsp, 128));
+  addsd(xmm0, Address(tmp, 1184));         // 0x00000000UL, 0x3ff00000UL (1.0): tiny |x|, result = x + 1
+  jmp(L_2TAG_PACKET_2_0_2);
+
+  bind(L_2TAG_PACKET_2_0_2);
+  movsd(Address(rsp, 48), xmm0);
+  fld_d(Address(rsp, 48));  // push the result on the x87 stack
+
+  bind(L_2TAG_PACKET_6_0_2);
+  movl(tmp, Address(rsp, 64));  // restore the caller's tmp register
+}
+
+#endif
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -2135,14 +2135,6 @@
       __ ret(0);
     }
     {
-      StubCodeMark mark(this, "StubRoutines", "exp");
-      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
-
-      __ fld_d(Address(rsp, 4));
-      __ exp_with_fallback(0);
-      __ ret(0);
-    }
-    {
       StubCodeMark mark(this, "StubRoutines", "pow");
       StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
 
@@ -2991,6 +2983,89 @@
     return start;
   }
 
+  /**
+  *  Generates the updateBytesCRC32C stub (32-bit) and returns its entry address.
+  *
+  * Inputs:
+  *   rsp(4)   - int crc
+  *   rsp(8)   - byte* buf
+  *   rsp(12)  - int length
+  *   rsp(16)  - table_start - optional (present only when doing a library call,
+  *              not used by the x86 algorithm)
+  *
+  * Output:
+  *       rax  - int crc result
+  */
+  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
+    assert(UseCRC32CIntrinsics, "need SSE4_2");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+    address start = __ pc();
+    const Register crc = rax;  // crc
+    const Register buf = rcx;  // source java byte array address
+    const Register len = rdx;  // length
+    const Register d = rbx;
+    const Register g = rsi;
+    const Register h = rdi;
+    const Register empty = 0; // placeholder that is never used; it only keeps
+                              // the crc32c_ipl_alg2_alt2 signature identical
+                              // between the 64-bit and 32-bit builds
+    assert_different_registers(crc, buf, len, d, g, h);
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 +
+                                     // we need to add an additional 4 because __ enter
+                                     // has just pushed ebp on the stack
+    Address buf_arg(rsp, 4 + 4 + 4);
+    Address len_arg(rsp, 4 + 4 + 8);
+      // Load up:
+      __ movl(crc, crc_arg);
+      __ movl(buf, buf_arg);
+      __ movl(len, len_arg);
+      __ push(d);  // d, g and h are used as temporaries below, so save them first
+      __ push(g);
+      __ push(h);
+      __ crc32c_ipl_alg2_alt2(crc, buf, len,
+                              d, g, h,
+                              empty, empty, empty,
+                              xmm0, xmm1, xmm2,
+                              is_pclmulqdq_supported);
+      __ pop(h);  // restore in reverse push order
+      __ pop(g);
+      __ pop(d);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+ address generate_libmExp() {  // emits the 32-bit exp stub around MacroAssembler::fast_exp and returns its entry address
+    address start = __ pc();
+
+    const XMMRegister x0  = xmm0;
+    const XMMRegister x1  = xmm1;
+    const XMMRegister x2  = xmm2;
+    const XMMRegister x3  = xmm3;
+
+    const XMMRegister x4  = xmm4;
+    const XMMRegister x5  = xmm5;
+    const XMMRegister x6  = xmm6;
+    const XMMRegister x7  = xmm7;
+
+    const Register tmp   = rbx;  // passed to fast_exp as its tmp register
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+
+  }
+
+
   // Safefetch stubs.
   void generate_safefetch(const char* name, int size, address* entry,
                           address* fault_pc, address* continuation_pc) {
@@ -3204,6 +3279,16 @@
       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
     }
+
+    if (UseCRC32CIntrinsics) {
+      bool supports_clmul = VM_Version::supports_clmul();
+      StubRoutines::x86::generate_CRC32C_table(supports_clmul);
+      StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
+      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
+    }
+    if (VM_Version::supports_sse2()) {
+      StubRoutines::_dexp = generate_libmExp();
+    }
   }
 
 
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -3039,19 +3039,6 @@
       __ ret(0);
     }
     {
-      StubCodeMark mark(this, "StubRoutines", "exp");
-      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
-
-      __ subq(rsp, 8);
-      __ movdbl(Address(rsp, 0), xmm0);
-      __ fld_d(Address(rsp, 0));
-      __ exp_with_fallback(0);
-      __ fstp_d(Address(rsp, 0));
-      __ movdbl(xmm0, Address(rsp, 0));
-      __ addq(rsp, 8);
-      __ ret(0);
-    }
-    {
       StubCodeMark mark(this, "StubRoutines", "pow");
       StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
 
@@ -3958,6 +3945,64 @@
     return start;
   }
 
+  /**
+  *  Generates the updateBytesCRC32C stub (64-bit) and returns its entry address.
+  *
+  * Inputs:
+  *   c_rarg0   - int crc
+  *   c_rarg1   - byte* buf
+  *   c_rarg2   - long length
+  *   c_rarg3   - table_start - optional (present only when doing a library call,
+  *              not used by the x86 algorithm)
+  *
+  * Output:
+  *       rax   - int crc result
+  */
+  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
+      assert(UseCRC32CIntrinsics, "need SSE4_2");
+      __ align(CodeEntryAlignment);
+      StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+      address start = __ pc();
+      //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
+      //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
+      //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
+      const Register crc = c_rarg0;  // crc
+      const Register buf = c_rarg1;  // source java byte array address
+      const Register len = c_rarg2;  // length
+      const Register a = rax;
+      const Register j = r9;
+      const Register k = r10;
+      const Register l = r11;
+#ifdef _WIN64
+      const Register y = rdi;  // pushed before and popped after the CRC code on Windows (see below)
+      const Register z = rsi;
+#else
+      const Register y = rcx;
+      const Register z = r8;
+#endif
+      assert_different_registers(crc, buf, len, a, j, k, l, y, z);
+
+      BLOCK_COMMENT("Entry:");
+      __ enter(); // required for proper stackwalking of RuntimeStub frame
+#ifdef _WIN64
+      __ push(y);
+      __ push(z);
+#endif
+      __ crc32c_ipl_alg2_alt2(crc, buf, len,
+                              a, j, k,
+                              l, y, z,
+                              c_farg0, c_farg1, c_farg2,
+                              is_pclmulqdq_supported);
+      __ movl(rax, crc);  // copy the crc register into rax, the documented result register
+#ifdef _WIN64
+      __ pop(z);  // restore in reverse push order
+      __ pop(y);
+#endif
+      __ leave(); // required for proper stackwalking of RuntimeStub frame
+      __ ret(0);
+
+      return start;
+  }
 
   /**
    *  Arguments:
@@ -4122,6 +4167,44 @@
     return start;
   }
 
+  address generate_libmExp() {
+    address start = __ pc();
+    // Stub body: frame setup, the fast_exp code sequence, frame teardown; 'start' is the entry point returned.
+    const XMMRegister x0  = xmm0;
+    const XMMRegister x1  = xmm1;
+    const XMMRegister x2  = xmm2;
+    const XMMRegister x3  = xmm3;
+
+    const XMMRegister x4  = xmm4;
+    const XMMRegister x5  = xmm5;
+    const XMMRegister x6  = xmm6;
+    const XMMRegister x7  = xmm7;
+
+    const Register tmp   = r11;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+    // save the xmm registers which must be preserved 6-7
+    __ movdqu(xmm_save(6), as_XMMRegister(6));
+    __ movdqu(xmm_save(7), as_XMMRegister(7));
+#endif
+      __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
+
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    __ movdqu(as_XMMRegister(6), xmm_save(6));
+    __ movdqu(as_XMMRegister(7), xmm_save(7));
+#endif
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+
+  }
+
 
 #undef __
 #define __ masm->
@@ -4302,6 +4385,14 @@
       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
     }
+
+    if (UseCRC32CIntrinsics) {
+      bool supports_clmul = VM_Version::supports_clmul();
+      StubRoutines::x86::generate_CRC32C_table(supports_clmul);
+      StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
+      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
+    }
+    StubRoutines::_dexp = generate_libmExp();
   }
 
   void generate_all() {
--- a/src/cpu/x86/vm/stubRoutines_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/stubRoutines_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -27,6 +27,7 @@
 #include "runtime/frame.inline.hpp"
 #include "runtime/stubRoutines.hpp"
 #include "runtime/thread.inline.hpp"
+#include "crc32c.h"
 
 // Implementation of the platform-specific part of StubRoutines - for
 // a description of how to extend it, see the stubRoutines.hpp file.
@@ -130,3 +131,107 @@
     0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
     0x2d02ef8dUL
 };
+
+#define D 32
+#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)
+
+#define TILL_CYCLE 31
+uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0]
+
+// Returns (a * b) mod P for polynomials in reflected form; "a" and "b" occupy D least significant bits.
+// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8
+// Listing 1: Multiplication of normalized polynomials
+uint32_t crc32c_multiply(uint32_t a, uint32_t b) {
+  uint32_t product = 0;
+  uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P
+  b_pow_x_table[0] = b;
+  for (int k = 0; k < D; ++k) {
+    // If "a" has non-zero coefficient at x**k, add ((b * x**k) mod P) to the result.
+    // The mask starts from an unsigned 1: shifting a signed 1 into bit 31 is undefined.
+    if ((a & ((uint32_t)1 << (D - 1 - k))) != 0) product ^= b_pow_x_table[k];
+
+    // Compute b_pow_x_table[k+1] = (b * x**(k+1)) mod P.
+    if (b_pow_x_table[k] & 1) {
+      // If degree of (b_pow_x_table[k] * x) is D, then
+      // degree of (b_pow_x_table[k] * x - P) is less than D.
+      b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P;
+    } else {
+      b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1;
+    }
+  }
+  return product;
+}
+#undef D
+#undef P
+
+// Fills _crc32c_pow_2k_table; see A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9
+void crc32c_init_pow_2k(void) {
+  // _crc32c_pow_2k_table(0) =
+  // x^(2^0) mod P(x) = x mod P(x) = x
+  // Since we are operating on reflected values
+  // x = 10b, reflect(x) = 0x40000000
+  _crc32c_pow_2k_table[0] = 0x40000000;
+
+  for (int k = 1; k < TILL_CYCLE; k++) {
+    // _crc32c_pow_2k_table(k) = _crc32c_pow_2k_table(k-1)^2 mod P(x)
+    uint32_t tmp = _crc32c_pow_2k_table[k - 1];
+    _crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp);
+  }
+}
+
+// x^N mod P(x)
+uint32_t crc32c_f_pow_n(uint32_t n) {
+  // Walk the bits of n from least significant upwards; a set bit at position
+  // "bit" multiplies the accumulator by _crc32c_pow_2k_table[bit].
+  uint32_t acc = 0x80000000;  // the polynomial 1 (same start value as before)
+
+  for (uint32_t bit = 0; n != 0; n >>= 1, ++bit) {
+    if ((n & 1) != 0) {
+      acc = crc32c_multiply(acc, _crc32c_pow_2k_table[bit]);
+    }
+  }
+
+  // Every set bit of the original n has been folded into acc.
+  return acc;
+}
+
+juint *StubRoutines::x86::_crc32c_table;
+
+void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) {
+  // Builds the data behind _crc32c_table: the raw pow_n constants with PCLMULQDQ, byte-product tables without.
+  static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
+
+  crc32c_init_pow_2k();
+
+  pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8);      // 8N * 8 = 64N
+  pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2);  // 128N
+
+  pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8);
+  pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2);
+
+  pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8);
+  pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] =
+            crc32c_f_pow_n(CRC32C_LOW * 8 * 2);
+
+  if (is_pclmulqdq_table_supported) {
+    _crc32c_table = pow_n;
+  } else {
+    static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256];
+
+    for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) {
+      // Must not be static: a local static is initialized once and would stay pow_n[0] for every j.
+      const juint X_CONST = pow_n[j];
+      for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations
+        // S. Gueron / Information Processing Letters 112 (2012) 184
+        // Algorithm 3: Generating a carry-less multiplication lookup table.
+        // Input: A 32-bit constant, X_CONST. Output: A table of 256 entries, each one a 64-bit
+        // quadword, that can be used for computing "byte" * X_CONST, for a given byte.
+        pclmulqdq_table[j * 256 + i] =
+          ((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
+          ((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
+          ((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
+      }
+    }
+    _crc32c_table = (juint*)pclmulqdq_table;
+  }
+}
--- a/src/cpu/x86/vm/stubRoutines_x86.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/stubRoutines_x86.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -36,6 +36,8 @@
   // masks and table for CRC32
   static uint64_t _crc_by128_masks[];
   static juint    _crc_table[];
+  // table for CRC32C
+  static juint* _crc32c_table;
   // swap mask for ghash
   static address _ghash_long_swap_mask_addr;
   static address _ghash_byte_swap_mask_addr;
@@ -46,5 +48,6 @@
   static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
   static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
   static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
+  static void generate_CRC32C_table(bool is_pclmulqdq_supported);
 
 #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -697,15 +697,14 @@
     __ jmp(rdi);
 
     __ bind(slow_path);
-    (void) generate_normal_entry(false);
-
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals));
     return entry;
   }
 #endif // INCLUDE_ALL_GCS
 
   // If G1 is not enabled then attempt to go through the accessor entry point
   // Reference.get is an accessor
-  return generate_jump_to_normal_entry();
+  return NULL;
 }
 
 /**
@@ -753,12 +752,10 @@
 
     // generate a vanilla native entry as the slow path
     __ bind(slow_path);
-
-    (void) generate_native_entry(false);
-
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
     return entry;
   }
-  return generate_native_entry(false);
+  return NULL;
 }
 
 /**
@@ -790,18 +787,25 @@
     const Register buf = rdx;  // source java byte array address
     const Register len = rdi;  // length
 
+    // value              x86_32
+    // interp. arg ptr    ESP + 4
+    // int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
+    //                                         3           2      1        0
+    // int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
+    //                                              4         2,3      1        0
+
     // Arguments are reversed on java expression stack
-    __ movl(len,   Address(rsp,   wordSize)); // Length
+    __ movl(len,   Address(rsp,   4 + 0)); // Length
     // Calculate address of start element
     if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
-      __ movptr(buf, Address(rsp, 3*wordSize)); // long buf
-      __ addptr(buf, Address(rsp, 2*wordSize)); // + offset
-      __ movl(crc,   Address(rsp, 5*wordSize)); // Initial CRC
+      __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long buf
+      __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+      __ movl(crc,   Address(rsp, 4 + 4 * wordSize)); // Initial CRC
     } else {
-      __ movptr(buf, Address(rsp, 3*wordSize)); // byte[] array
+      __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array
       __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
-      __ addptr(buf, Address(rsp, 2*wordSize)); // + offset
-      __ movl(crc,   Address(rsp, 4*wordSize)); // Initial CRC
+      __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+      __ movl(crc,   Address(rsp, 4 + 3 * wordSize)); // Initial CRC
     }
 
     __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32()), crc, buf, len);
@@ -814,12 +818,57 @@
 
     // generate a vanilla native entry as the slow path
     __ bind(slow_path);
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
+    return entry;
+  }
+  return NULL;
+}
 
-    (void) generate_native_entry(false);
+/**
+* Method entry for static native methods:
+*   int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
+*   int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+*/
+address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
+  if (UseCRC32CIntrinsics) {
+    address entry = __ pc();
+    // Load parameters
+    const Register crc = rax;  // crc
+    const Register buf = rcx;  // source java byte array address
+    const Register len = rdx;  // length
+    const Register end = len;
+
+    // value              x86_32
+    // interp. arg ptr    ESP + 4
+    // int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
+    //                                          3           2      1        0
+    // int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+    //                                                     4         2,3          1        0
+
+    // Arguments are reversed on java expression stack
+    __ movl(end, Address(rsp, 4 + 0)); // end
+    __ subl(len, Address(rsp, 4 + 1 * wordSize));  // end - offset == length
+    // Calculate address of start element
+    if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) {
+      __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long address
+      __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+      __ movl(crc, Address(rsp, 4 + 4 * wordSize)); // Initial CRC
+    } else {
+      __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array
+      __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+      __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+      __ movl(crc, Address(rsp, 4 + 3 * wordSize)); // Initial CRC
+    }
+    __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
+    // result in rax
+    // _areturn
+    __ pop(rdi);                // get return address
+    __ mov(rsp, rsi);           // set sp to sender sp
+    __ jmp(rdi);
 
     return entry;
   }
-  return generate_native_entry(false);
+  return NULL;
 }
 
 /**
@@ -827,10 +876,8 @@
  *    java.lang.Float.intBitsToFloat(int bits)
  */
 address InterpreterGenerator::generate_Float_intBitsToFloat_entry() {
-  address entry;
-
   if (UseSSE >= 1) {
-    entry = __ pc();
+    address entry = __ pc();
 
     // rsi: the sender's SP
 
@@ -844,11 +891,10 @@
     __ pop(rdi); // get return address
     __ mov(rsp, rsi); // set rsp to the sender's SP
     __ jmp(rdi);
-  } else {
-    entry = generate_native_entry(false);
+    return entry;
   }
 
-  return entry;
+  return NULL;
 }
 
 /**
@@ -856,10 +902,8 @@
  *    java.lang.Float.floatToRawIntBits(float value)
  */
 address InterpreterGenerator::generate_Float_floatToRawIntBits_entry() {
-  address entry;
-
   if (UseSSE >= 1) {
-    entry = __ pc();
+    address entry = __ pc();
 
     // rsi: the sender's SP
 
@@ -873,11 +917,10 @@
     __ pop(rdi); // get return address
     __ mov(rsp, rsi); // set rsp to the sender's SP
     __ jmp(rdi);
-  } else {
-    entry = generate_native_entry(false);
+    return entry;
   }
 
-  return entry;
+  return NULL;
 }
 
 
@@ -886,10 +929,8 @@
  *    java.lang.Double.longBitsToDouble(long bits)
  */
 address InterpreterGenerator::generate_Double_longBitsToDouble_entry() {
-  address entry;
-
    if (UseSSE >= 2) {
-     entry = __ pc();
+     address entry = __ pc();
 
      // rsi: the sender's SP
 
@@ -903,11 +944,10 @@
      __ pop(rdi); // get return address
      __ mov(rsp, rsi); // set rsp to the sender's SP
      __ jmp(rdi);
-   } else {
-     entry = generate_native_entry(false);
+     return entry;
    }
 
-   return entry;
+   return NULL;
 }
 
 /**
@@ -915,10 +955,8 @@
  *    java.lang.Double.doubleToRawLongBits(double value)
  */
 address InterpreterGenerator::generate_Double_doubleToRawLongBits_entry() {
-  address entry;
-
   if (UseSSE >= 2) {
-    entry = __ pc();
+    address entry = __ pc();
 
     // rsi: the sender's SP
 
@@ -933,11 +971,10 @@
     __ pop(rdi); // get return address
     __ mov(rsp, rsi); // set rsp to the sender's SP
     __ jmp(rdi);
-  } else {
-    entry = generate_native_entry(false);
+    return entry;
   }
 
-  return entry;
+  return NULL;
 }
 
 //
--- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -677,15 +677,14 @@
 
     // generate a vanilla interpreter entry as the slow path
     __ bind(slow_path);
-    (void) generate_normal_entry(false);
-
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals));
     return entry;
   }
 #endif // INCLUDE_ALL_GCS
 
   // If G1 is not enabled then attempt to go through the accessor entry point
   // Reference.get is an accessor
-  return generate_jump_to_normal_entry();
+  return NULL;
 }
 
 /**
@@ -733,12 +732,10 @@
 
     // generate a vanilla native entry as the slow path
     __ bind(slow_path);
-
-    (void) generate_native_entry(false);
-
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
     return entry;
   }
-  return generate_native_entry(false);
+  return NULL;
 }
 
 /**
@@ -796,12 +793,61 @@
 
     // generate a vanilla native entry as the slow path
     __ bind(slow_path);
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
+    return entry;
+  }
+  return NULL;
+}
 
-    (void) generate_native_entry(false);
+/**
+* Method entry for static native methods:
+*   int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
+*   int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+*/
+address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
+  if (UseCRC32CIntrinsics) {
+    address entry = __ pc();
+    // Load parameters
+    const Register crc = c_rarg0;  // crc
+    const Register buf = c_rarg1;  // source java byte array address
+    const Register len = c_rarg2;
+    const Register off = c_rarg3;  // offset
+    const Register end = len;
+
+    // Arguments are reversed on java expression stack
+    // Calculate address of start element
+    if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) {
+      __ movptr(buf, Address(rsp, 3 * wordSize)); // long buf
+      __ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset
+      __ addq(buf, off); // + offset
+      __ movl(crc, Address(rsp, 5 * wordSize)); // Initial CRC
+      // Note on 5 * wordSize vs. 4 * wordSize:
+      // *   int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+      //                                                         4         2,3          1        0
+      // end starts at SP + 8
+      // The Java(R) Virtual Machine Specification Java SE 7 Edition
+      // 4.10.2.3. Values of Types long and double
+      //    "When calculating operand stack length, values of type long and double have length two."
+    } else {
+      __ movptr(buf, Address(rsp, 3 * wordSize)); // byte[] array
+      __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+      __ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset
+      __ addq(buf, off); // + offset
+      __ movl(crc, Address(rsp, 4 * wordSize)); // Initial CRC
+    }
+    __ movl(end, Address(rsp, wordSize)); // end
+    __ subl(end, off); // end - off
+    __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
+    // result in rax
+    // _areturn
+    __ pop(rdi);                // get return address
+    __ mov(rsp, r13);           // set sp to sender sp
+    __ jmp(rdi);
 
     return entry;
   }
-  return generate_native_entry(false);
+
+  return NULL;
 }
 
 // Interpreter stub for calling a native method. (asm interpreter)
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -661,6 +661,18 @@
     FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
   }
 
+  if (supports_sse4_2()) {
+    if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
+      UseCRC32CIntrinsics = true;
+    }
+  }
+  else if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
+      warning("CRC32C intrinsics are not available on this CPU");
+    }
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
   // The AES intrinsic stubs require AES instruction support (of course)
   // but also require sse3 mode for instructions it use.
   if (UseAES && (UseSSE > 2)) {
@@ -704,12 +716,6 @@
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
-  if (UseCRC32CIntrinsics) {
-    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
-      warning("CRC32C intrinsics are not available on this CPU");
-    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
-  }
-
   if (UseAdler32Intrinsics) {
     warning("Adler32Intrinsics not available on this CPU.");
     FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
--- a/src/cpu/x86/vm/x86.ad	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/x86.ad	Thu Oct 08 14:28:55 2015 -0700
@@ -1712,6 +1712,18 @@
   return ret_value;  // Per default match rules are supported.
 }
 
+const int Matcher::float_pressure(int default_pressure_threshold) {
+  int float_pressure_threshold = default_pressure_threshold;
+#ifdef _LP64
+  if (UseAVX > 2) {
+    // Increase pressure threshold on machines with AVX3 which have
+    // 2x more XMM registers.
+    float_pressure_threshold = default_pressure_threshold * 2;
+  }
+#endif
+  return float_pressure_threshold;
+}
+
 // Max vector size in bytes. 0 if not supported.
 const int Matcher::vector_width_in_bytes(BasicType bt) {
   assert(is_java_primitive(bt), "only primitive type vectors");
--- a/src/cpu/x86/vm/x86_32.ad	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/x86_32.ad	Thu Oct 08 14:28:55 2015 -0700
@@ -9911,35 +9911,6 @@
   ins_pipe( pipe_slow );
 %}
 
-
-instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
-  predicate (UseSSE<=1);
-  match(Set dpr1 (ExpD dpr1));
-  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
-  format %{ "fast_exp $dpr1 -> $dpr1  // KILL $rax, $rcx, $rdx" %}
-  ins_encode %{
-    __ fast_exp();
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
-  predicate (UseSSE>=2);
-  match(Set dst (ExpD src));
-  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
-  format %{ "fast_exp $dst -> $src  // KILL $rax, $rcx, $rdx" %}
-  ins_encode %{
-    __ subptr(rsp, 8);
-    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
-    __ fld_d(Address(rsp, 0));
-    __ fast_exp();
-    __ fstp_d(Address(rsp, 0));
-    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
-    __ addptr(rsp, 8);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   // The source Double operand on FPU stack
--- a/src/cpu/x86/vm/x86_64.ad	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/x86/vm/x86_64.ad	Thu Oct 08 14:28:55 2015 -0700
@@ -3767,6 +3767,22 @@
   %}
 %}
 
+operand indPosIndexScale(any_RegP reg, rRegI idx, immI2 scale)
+%{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  predicate(n->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
+  match(AddP reg (LShiftL (ConvI2L idx) scale));
+
+  op_cost(10);
+  format %{"[$reg + pos $idx << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+    index($idx);
+    scale($scale);
+    disp(0x0);
+  %}
+%}
+
 // Indirect Memory Times Scale Plus Index Register Plus Offset Operand
 operand indIndexScaleOffset(any_RegP reg, immL32 off, rRegL lreg, immI2 scale)
 %{
@@ -4159,7 +4175,7 @@
 // case of this is memory operands.
 
 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
-               indIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
+               indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
                indCompressedOopOffset,
                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
@@ -5186,6 +5202,17 @@
   ins_pipe(ialu_reg_reg_fat);
 %}
 
+instruct leaPPosIdxScale(rRegP dst, indPosIndexScale mem)
+%{
+  match(Set dst mem);
+
+  ins_cost(110);
+  format %{ "leaq    $dst, $mem\t# ptr idxscale" %}
+  opcode(0x8D);
+  ins_encode(REX_reg_mem_wide(dst, mem), OpcP, reg_mem(dst, mem));
+  ins_pipe(ialu_reg_reg_fat);
+%}
+
 instruct leaPIdxScaleOff(rRegP dst, indIndexScaleOffset mem)
 %{
   match(Set dst mem);
@@ -9871,22 +9898,6 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct expD_reg(regD dst, regD src, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{
-  match(Set dst (ExpD src));
-  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
-  format %{ "fast_exp $dst -> $src  // KILL $rax, $rcx, $rdx" %}
-  ins_encode %{
-    __ subptr(rsp, 8);
-    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
-    __ fld_d(Address(rsp, 0));
-    __ fast_exp();
-    __ fstp_d(Address(rsp, 0));
-    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
-    __ addptr(rsp, 8);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 //----------Arithmetic Conversion Instructions---------------------------------
 
 instruct roundFloat_nop(regF dst)
--- a/src/cpu/zero/vm/cppInterpreter_zero.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/zero/vm/cppInterpreter_zero.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -816,7 +816,7 @@
 
   // If G1 is not enabled then attempt to go through the normal entry point
   // Reference.get could be instrumented by jvmti
-  return generate_normal_entry(false);
+  return NULL;
 }
 
 address InterpreterGenerator::generate_native_entry(bool synchronized) {
--- a/src/cpu/zero/vm/interpreterGenerator_zero.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/cpu/zero/vm/interpreterGenerator_zero.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -42,4 +42,5 @@
   // Not supported
   address generate_CRC32_update_entry() { return NULL; }
   address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
+  address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
 #endif // CPU_ZERO_VM_INTERPRETERGENERATOR_ZERO_HPP
--- a/src/share/vm/adlc/formssel.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/adlc/formssel.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -4006,7 +4006,6 @@
         strcmp(opType,"DivD")==0 ||
         strcmp(opType,"DivF")==0 ||
         strcmp(opType,"DivI")==0 ||
-        strcmp(opType,"ExpD")==0 ||
         strcmp(opType,"LogD")==0 ||
         strcmp(opType,"Log10D")==0 ||
         strcmp(opType,"ModD")==0 ||
@@ -4143,6 +4142,8 @@
     "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
     "MulVS","MulVI","MulVL","MulVF","MulVD",
     "DivVF","DivVD",
+    "AbsVF","AbsVD",
+    "NegVF","NegVD",
     "SqrtVD",
     "AndV" ,"XorV" ,"OrV",
     "AddReductionVI", "AddReductionVL",
--- a/src/share/vm/c1/c1_GraphBuilder.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/c1/c1_GraphBuilder.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -3363,11 +3363,9 @@
   return NULL;
 }
 
-
 // negative filter: should callee NOT be inlined?  returns NULL, ok to inline, or rejection msg
 const char* GraphBuilder::should_not_inline(ciMethod* callee) const {
-  if ( callee->should_exclude())       return "excluded by CompilerOracle";
-  if ( callee->should_not_inline())    return "disallowed by CompilerOracle";
+  if ( callee->should_not_inline())    return "disallowed by CompileCommand";
   if ( callee->dont_inline())          return "don't inline by annotation";
   return NULL;
 }
@@ -3698,7 +3696,7 @@
 
     const char* msg = "";
     if (callee->force_inline())  msg = "force inline by annotation";
-    if (callee->should_inline()) msg = "force inline by CompileOracle";
+    if (callee->should_inline()) msg = "force inline by CompileCommand";
     print_inlining(callee, msg);
   } else {
     // use heuristic controls on inlining
--- a/src/share/vm/c1/c1_LIR.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/c1/c1_LIR.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -732,8 +732,7 @@
     case lir_sin:
     case lir_cos:
     case lir_log:
-    case lir_log10:
-    case lir_exp: {
+    case lir_log10: {
       assert(op->as_Op2() != NULL, "must be");
       LIR_Op2* op2 = (LIR_Op2*)op;
 
@@ -743,9 +742,6 @@
       // overlap with the input.
       assert(op2->_info == NULL, "not used");
       assert(op2->_tmp5->is_illegal(), "not used");
-      assert(op2->_tmp2->is_valid() == (op->code() == lir_exp), "not used");
-      assert(op2->_tmp3->is_valid() == (op->code() == lir_exp), "not used");
-      assert(op2->_tmp4->is_valid() == (op->code() == lir_exp), "not used");
       assert(op2->_opr1->is_valid(), "used");
       do_input(op2->_opr1); do_temp(op2->_opr1);
 
@@ -1775,7 +1771,6 @@
      case lir_tan:                   s = "tan";           break;
      case lir_log:                   s = "log";           break;
      case lir_log10:                 s = "log10";         break;
-     case lir_exp:                   s = "exp";           break;
      case lir_pow:                   s = "pow";           break;
      case lir_logic_and:             s = "logic_and";     break;
      case lir_logic_or:              s = "logic_or";      break;
--- a/src/share/vm/c1/c1_LIR.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/c1/c1_LIR.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -961,7 +961,6 @@
       , lir_tan
       , lir_log
       , lir_log10
-      , lir_exp
       , lir_pow
       , lir_logic_and
       , lir_logic_or
@@ -2199,7 +2198,6 @@
   void sin (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_sin , from, tmp1, to, tmp2)); }
   void cos (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_cos , from, tmp1, to, tmp2)); }
   void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }
-  void exp (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5)                { append(new LIR_Op2(lir_exp , from, tmp1, to, tmp2, tmp3, tmp4, tmp5)); }
   void pow (LIR_Opr arg1, LIR_Opr arg2, LIR_Opr res, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_pow, arg1, arg2, res, tmp1, tmp2, tmp3, tmp4, tmp5)); }
 
   void add (LIR_Opr left, LIR_Opr right, LIR_Opr res)      { append(new LIR_Op2(lir_add, left, right, res)); }
--- a/src/share/vm/c1/c1_LIRAssembler.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/c1/c1_LIRAssembler.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -739,7 +739,6 @@
     case lir_cos:
     case lir_log:
     case lir_log10:
-    case lir_exp:
     case lir_pow:
       intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op);
       break;
--- a/src/share/vm/c1/c1_LIRGenerator.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/c1/c1_LIRGenerator.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -244,6 +244,7 @@
   void do_getClass(Intrinsic* x);
   void do_currentThread(Intrinsic* x);
   void do_MathIntrinsic(Intrinsic* x);
+  void do_ExpIntrinsic(Intrinsic* x);
   void do_ArrayCopy(Intrinsic* x);
   void do_CompareAndSwap(Intrinsic* x, ValueType* type);
   void do_NIOCheckIndex(Intrinsic* x);
--- a/src/share/vm/c1/c1_LinearScan.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/c1/c1_LinearScan.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -6588,7 +6588,6 @@
         case lir_log10:
         case lir_log:
         case lir_pow:
-        case lir_exp:
         case lir_logic_and:
         case lir_logic_or:
         case lir_logic_xor:
--- a/src/share/vm/c1/c1_Runtime1.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/c1/c1_Runtime1.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -317,6 +317,7 @@
   FUNCTION_CASE(entry, TRACE_TIME_METHOD);
 #endif
   FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32());
+  FUNCTION_CASE(entry, StubRoutines::dexp());
 
 #undef FUNCTION_CASE
 
--- a/src/share/vm/ci/ciMethod.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/ci/ciMethod.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -1044,18 +1044,6 @@
 }
 
 // ------------------------------------------------------------------
-// ciMethod::should_exclude
-//
-// Should this method be excluded from compilation?
-bool ciMethod::should_exclude() {
-  check_is_loaded();
-  VM_ENTRY_MARK;
-  methodHandle mh(THREAD, get_Method());
-  bool ignore;
-  return CompilerOracle::should_exclude(mh, ignore);
-}
-
-// ------------------------------------------------------------------
 // ciMethod::should_inline
 //
 // Should this method be inlined during compilation?
--- a/src/share/vm/ci/ciMethod.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/ci/ciMethod.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -266,7 +266,6 @@
   int resolve_vtable_index(ciKlass* caller, ciKlass* receiver);
 
   // Compilation directives
-  bool should_exclude();
   bool should_inline();
   bool should_not_inline();
   bool should_print_assembly();
--- a/src/share/vm/compiler/compileBroker.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/compiler/compileBroker.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -1157,7 +1157,7 @@
       method->print_short_name(tty);
       tty->cr();
     }
-    method->set_not_compilable(CompLevel_all, !quietly, "excluded by CompilerOracle");
+    method->set_not_compilable(CompLevel_all, !quietly, "excluded by CompileCommand");
   }
 
   return false;
--- a/src/share/vm/compiler/compilerOracle.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/compiler/compilerOracle.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -24,149 +24,17 @@
 
 #include "precompiled.hpp"
 #include "compiler/compilerOracle.hpp"
+#include "compiler/methodMatcher.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/oopFactory.hpp"
 #include "memory/resourceArea.hpp"
 #include "oops/klass.hpp"
 #include "oops/method.hpp"
-#include "oops/oop.inline.hpp"
 #include "oops/symbol.hpp"
 #include "runtime/handles.inline.hpp"
 #include "runtime/jniHandles.hpp"
 #include "runtime/os.hpp"
 
-class MethodMatcher : public CHeapObj<mtCompiler> {
- public:
-  enum Mode {
-    Exact,
-    Prefix = 1,
-    Suffix = 2,
-    Substring = Prefix | Suffix,
-    Any,
-    Unknown = -1
-  };
-
- protected:
-  Symbol*        _class_name;
-  Symbol*        _method_name;
-  Symbol*        _signature;
-  Mode           _class_mode;
-  Mode           _method_mode;
-  MethodMatcher* _next;
-
-  static bool match(Symbol* candidate, Symbol* match, Mode match_mode);
-
-  Symbol* class_name() const { return _class_name; }
-  Symbol* method_name() const { return _method_name; }
-  Symbol* signature() const { return _signature; }
-
- public:
-  MethodMatcher(Symbol* class_name, Mode class_mode,
-                Symbol* method_name, Mode method_mode,
-                Symbol* signature, MethodMatcher* next);
-  MethodMatcher(Symbol* class_name, Symbol* method_name, MethodMatcher* next);
-
-  // utility method
-  MethodMatcher* find(methodHandle method) {
-    Symbol* class_name  = method->method_holder()->name();
-    Symbol* method_name = method->name();
-    for (MethodMatcher* current = this; current != NULL; current = current->_next) {
-      if (match(class_name, current->class_name(), current->_class_mode) &&
-          match(method_name, current->method_name(), current->_method_mode) &&
-          (current->signature() == NULL || current->signature() == method->signature())) {
-        return current;
-      }
-    }
-    return NULL;
-  }
-
-  bool match(methodHandle method) {
-    return find(method) != NULL;
-  }
-
-  MethodMatcher* next() const { return _next; }
-
-  static void print_symbol(Symbol* h, Mode mode) {
-    ResourceMark rm;
-
-    if (mode == Suffix || mode == Substring || mode == Any) {
-      tty->print("*");
-    }
-    if (mode != Any) {
-      h->print_symbol_on(tty);
-    }
-    if (mode == Prefix || mode == Substring) {
-      tty->print("*");
-    }
-  }
-
-  void print_base() {
-    print_symbol(class_name(), _class_mode);
-    tty->print(".");
-    print_symbol(method_name(), _method_mode);
-    if (signature() != NULL) {
-      signature()->print_symbol_on(tty);
-    }
-  }
-
-  virtual void print() {
-    print_base();
-    tty->cr();
-  }
-};
-
-MethodMatcher::MethodMatcher(Symbol* class_name, Symbol* method_name, MethodMatcher* next) {
-  _class_name  = class_name;
-  _method_name = method_name;
-  _next        = next;
-  _class_mode  = MethodMatcher::Exact;
-  _method_mode = MethodMatcher::Exact;
-  _signature   = NULL;
-}
-
-
-MethodMatcher::MethodMatcher(Symbol* class_name, Mode class_mode,
-                             Symbol* method_name, Mode method_mode,
-                             Symbol* signature, MethodMatcher* next):
-    _class_mode(class_mode)
-  , _method_mode(method_mode)
-  , _next(next)
-  , _class_name(class_name)
-  , _method_name(method_name)
-  , _signature(signature) {
-}
-
-bool MethodMatcher::match(Symbol* candidate, Symbol* match, Mode match_mode) {
-  if (match_mode == Any) {
-    return true;
-  }
-
-  if (match_mode == Exact) {
-    return candidate == match;
-  }
-
-  ResourceMark rm;
-  const char * candidate_string = candidate->as_C_string();
-  const char * match_string = match->as_C_string();
-
-  switch (match_mode) {
-  case Prefix:
-    return strstr(candidate_string, match_string) == candidate_string;
-
-  case Suffix: {
-    size_t clen = strlen(candidate_string);
-    size_t mlen = strlen(match_string);
-    return clen >= mlen && strcmp(candidate_string + clen - mlen, match_string) == 0;
-  }
-
-  case Substring:
-    return strstr(candidate_string, match_string) != NULL;
-
-  default:
-    return false;
-  }
-}
-
 enum OptionType {
   IntxType,
   UintxType,
@@ -202,114 +70,6 @@
   return DoubleType;
 }
 
-template<typename T>
-static const T copy_value(const T value) {
-  return value;
-}
-
-template<> const ccstr copy_value<ccstr>(const ccstr value) {
-  return (const ccstr)os::strdup_check_oom(value);
-}
-
-template <typename T>
-class TypedMethodOptionMatcher : public MethodMatcher {
-  const char* _option;
-  OptionType _type;
-  const T _value;
-
-public:
-  TypedMethodOptionMatcher(Symbol* class_name, Mode class_mode,
-                           Symbol* method_name, Mode method_mode,
-                           Symbol* signature, const char* opt,
-                           const T value,  MethodMatcher* next) :
-    MethodMatcher(class_name, class_mode, method_name, method_mode, signature, next),
-                  _type(get_type_for<T>()), _value(copy_value<T>(value)) {
-    _option = os::strdup_check_oom(opt);
-  }
-
-  ~TypedMethodOptionMatcher() {
-    os::free((void*)_option);
-  }
-
-  TypedMethodOptionMatcher* match(methodHandle method, const char* opt) {
-    TypedMethodOptionMatcher* current = this;
-    while (current != NULL) {
-      current = (TypedMethodOptionMatcher*)current->find(method);
-      if (current == NULL) {
-        return NULL;
-      }
-      if (strcmp(current->_option, opt) == 0) {
-        return current;
-      }
-      current = current->next();
-    }
-    return NULL;
-  }
-
-  TypedMethodOptionMatcher* next() {
-    return (TypedMethodOptionMatcher*)_next;
-  }
-
-  OptionType get_type(void) {
-      return _type;
-  };
-
-  T value() { return _value; }
-
-  void print() {
-    ttyLocker ttyl;
-    print_base();
-    tty->print(" %s", _option);
-    tty->print(" <unknown option type>");
-    tty->cr();
-  }
-};
-
-template<>
-void TypedMethodOptionMatcher<intx>::print() {
-  ttyLocker ttyl;
-  print_base();
-  tty->print(" intx %s", _option);
-  tty->print(" = " INTX_FORMAT, _value);
-  tty->cr();
-};
-
-template<>
-void TypedMethodOptionMatcher<uintx>::print() {
-  ttyLocker ttyl;
-  print_base();
-  tty->print(" uintx %s", _option);
-  tty->print(" = " UINTX_FORMAT, _value);
-  tty->cr();
-};
-
-template<>
-void TypedMethodOptionMatcher<bool>::print() {
-  ttyLocker ttyl;
-  print_base();
-  tty->print(" bool %s", _option);
-  tty->print(" = %s", _value ? "true" : "false");
-  tty->cr();
-};
-
-template<>
-void TypedMethodOptionMatcher<ccstr>::print() {
-  ttyLocker ttyl;
-  print_base();
-  tty->print(" const char* %s", _option);
-  tty->print(" = '%s'", _value);
-  tty->cr();
-};
-
-template<>
-void TypedMethodOptionMatcher<double>::print() {
-  ttyLocker ttyl;
-  print_base();
-  tty->print(" double %s", _option);
-  tty->print(" = %f", _value);
-  tty->cr();
-};
-
 // this must parallel the command_names below
 enum OracleCommand {
   UnknownCommand = -1,
@@ -342,8 +102,198 @@
 };
 
 class MethodMatcher;
-static MethodMatcher* lists[OracleCommandCount] = { 0, };
+class TypedMethodOptionMatcher;
 
+static BasicMatcher* lists[OracleCommandCount] = { 0, };
+static TypedMethodOptionMatcher* option_list = NULL;
+
+class TypedMethodOptionMatcher : public MethodMatcher {  // inherited method pattern plus one named, typed option value
+ private:
+  TypedMethodOptionMatcher* _next;  // next entry in the list, NULL at the end
+  const char*   _option;  // option name: private copy made by init(), freed by the destructor
+  OptionType    _type;  // type tag for _u; UnknownType until init() is called
+ public:
+
+  union {
+    bool bool_value;
+    intx intx_value;
+    uintx uintx_value;
+    double double_value;
+    ccstr ccstr_value;
+  } _u;  // the option value; value<T>() and set_value<T>() read and write it
+
+  TypedMethodOptionMatcher() : MethodMatcher(),
+    _next(NULL),
+    _type(UnknownType) {
+      _option = NULL;
+      memset(&_u, 0, sizeof(_u));  // start with an all-zero value
+  }
+
+  static TypedMethodOptionMatcher* parse_method_pattern(char*& line, const char*& error_msg);
+  TypedMethodOptionMatcher* match(methodHandle method, const char* opt, OptionType type);
+
+  void init(const char* opt, OptionType type, TypedMethodOptionMatcher* next) {  // opt is copied, not adopted
+    _next = next;
+    _type = type;
+    _option = os::strdup_check_oom(opt);
+  }
+
+  void set_next(TypedMethodOptionMatcher* next) {_next = next; }
+  TypedMethodOptionMatcher* next() { return _next; }
+  OptionType type() { return _type; }
+  template<typename T> T value();  // specialized for intx, uintx, bool, double and ccstr
+  template<typename T> void set_value(T value);  // specialized for the same five types
+  void print();  // prints this entry only
+  void print_all();  // prints this entry and every entry after it
+  TypedMethodOptionMatcher* clone();  // copies the pattern only (no option name, type or value)
+  ~TypedMethodOptionMatcher();
+};
+
+// A few templated accessors instead of a full template class.
+template<> intx TypedMethodOptionMatcher::value<intx>() {  // getters return the raw union member; _type is not checked here
+  return _u.intx_value;
+}
+
+template<> uintx TypedMethodOptionMatcher::value<uintx>() {
+  return _u.uintx_value;
+}
+
+template<> bool TypedMethodOptionMatcher::value<bool>() {
+  return _u.bool_value;
+}
+
+template<> double TypedMethodOptionMatcher::value<double>() {
+  return _u.double_value;
+}
+
+template<> ccstr TypedMethodOptionMatcher::value<ccstr>() {  // returns the stored pointer, not a new copy
+  return _u.ccstr_value;
+}
+
+template<> void TypedMethodOptionMatcher::set_value(intx value) {  // setters write the union member only; _type is set by init()
+  _u.intx_value = value;
+}
+
+template<> void TypedMethodOptionMatcher::set_value(uintx value) {
+  _u.uintx_value = value;
+}
+
+template<> void TypedMethodOptionMatcher::set_value(double value) {
+  _u.double_value = value;
+}
+
+template<> void TypedMethodOptionMatcher::set_value(bool value) {
+  _u.bool_value = value;
+}
+
+template<> void TypedMethodOptionMatcher::set_value(ccstr value) {
+  _u.ccstr_value = (const ccstr)os::strdup_check_oom(value);  // keeps a private copy of the caller's string
+}  // NOTE(review): ~TypedMethodOptionMatcher frees _option but not this copy - confirm that is intended
+
+void TypedMethodOptionMatcher::print() {  // tty output: print_base() text, then " <type> <option> = <value>" and a newline
+  ttyLocker ttyl;
+  print_base(tty);
+  switch (_type) {  // the type tag decides which union member is read
+  case IntxType:
+    tty->print_cr(" intx %s = " INTX_FORMAT, _option, value<intx>());
+    break;
+  case UintxType:
+    tty->print_cr(" uintx %s = " UINTX_FORMAT, _option, value<uintx>());
+    break;
+  case BoolType:
+    tty->print_cr(" bool %s = %s", _option, value<bool>() ? "true" : "false");
+    break;
+  case DoubleType:
+    tty->print_cr(" double %s = %f", _option, value<double>());
+    break;
+  case CcstrType:
+    tty->print_cr(" const char* %s = '%s'", _option, value<ccstr>());
+    break;
+  default:  // e.g. UnknownType, which the default constructor leaves in place until init()
+    ShouldNotReachHere();
+  }
+}
+
+void TypedMethodOptionMatcher::print_all() {  // prints this entry, then recurses over the rest of the list
+   print();
+   if (_next != NULL) {
+     tty->print(" ");  // one space is written before each following entry
+     _next->print_all();  // recursion depth equals the number of remaining entries
+   }
+ }
+
+TypedMethodOptionMatcher* TypedMethodOptionMatcher::clone() {  // new matcher with the same pattern; caller owns the result
+  TypedMethodOptionMatcher* m = new TypedMethodOptionMatcher();  // _next, _option, _type and _u keep their constructor defaults
+  m->_class_mode = _class_mode;
+  m->_class_name = _class_name;
+  m->_method_mode = _method_mode;
+  m->_method_name = _method_name;
+  m->_signature = _signature;
+  // The clone shares the symbols, so take one extra reference on each non-NULL one
+  if (_class_name != NULL) {
+    _class_name->increment_refcount();
+  }
+  if (_method_name != NULL) {
+    _method_name->increment_refcount();
+  }
+  if (_signature != NULL) {
+    _signature->increment_refcount();
+  }
+  return m;
+}
+
+TypedMethodOptionMatcher::~TypedMethodOptionMatcher() {  // frees the option name and drops one reference per pattern symbol
+  if (_option != NULL) {  // NULL when init() was never called
+    os::free((void*)_option);
+  }
+  if (_class_name != NULL) {
+    _class_name->decrement_refcount();
+  }
+  if (_method_name != NULL) {
+    _method_name->decrement_refcount();
+  }
+  if (_signature != NULL) {
+    _signature->decrement_refcount();
+  }
+}  // NOTE(review): a string stored by set_value(ccstr) is not freed here - confirm that is intended
+
+TypedMethodOptionMatcher* TypedMethodOptionMatcher::parse_method_pattern(char*& line, const char*& error_msg) {
+  assert(error_msg == NULL, "Dont call here with error_msg already set");  // error_msg doubles as the failure signal below
+  TypedMethodOptionMatcher* tom = new TypedMethodOptionMatcher();
+  MethodMatcher::parse_method_pattern(line, error_msg, tom);  // presumably fills tom and advances line - confirm in MethodMatcher
+  if (error_msg != NULL) {  // parse failed: discard the half-built matcher
+    delete tom;
+    return NULL;
+  }
+  return tom;  // caller owns the returned matcher
+}
+
+TypedMethodOptionMatcher* TypedMethodOptionMatcher::match(methodHandle method, const char* opt, OptionType type) {
+  TypedMethodOptionMatcher* current = this;  // the search starts at this entry and follows next()
+  while (current != NULL) {
+    // Fastest compare first.
+    if (current->type() == type) {
+      if (strcmp(current->_option, opt) == 0) {  // option names are compared case-sensitively
+        if (current->matches(method)) {
+          return current;  // first entry passing all three checks wins
+        }
+      }
+    }
+    current = current->next();
+  }
+  return NULL;  // no entry with this type and option name matches the method
+}
+
+template<typename T>
+static void add_option_string(TypedMethodOptionMatcher* matcher,  // pushes matcher onto the front of option_list
+                                        const char* option,
+                                        T value) {
+  assert(matcher != option_list, "No circular lists please");  // only the current head is checked
+  matcher->init(option, get_type_for<T>(), option_list);  // old head becomes matcher's successor
+  matcher->set_value<T>(value);
+  option_list = matcher;  // matcher is the new head
+  return;
+}
 
 static bool check_predicate(OracleCommand command, methodHandle method) {
   return ((lists[command] != NULL) &&
@@ -351,51 +301,27 @@
           lists[command]->match(method));
 }
 
+static void add_predicate(OracleCommand command, BasicMatcher* bm) {
+  assert(command != OptionCommand, "must use add_option_string");
+  if (command == LogCommand && !LogCompilation && lists[LogCommand] == NULL) {
+    tty->print_cr("Warning:  +LogCompilation must be enabled in order for individual methods to be logged.");
+  }
+  bm->set_next(lists[command]);
+  lists[command] = bm;
 
-static MethodMatcher* add_predicate(OracleCommand command,
-                                    Symbol* class_name, MethodMatcher::Mode c_mode,
-                                    Symbol* method_name, MethodMatcher::Mode m_mode,
-                                    Symbol* signature) {
-  assert(command != OptionCommand, "must use add_option_string");
-  if (command == LogCommand && !LogCompilation && lists[LogCommand] == NULL)
-    tty->print_cr("Warning:  +LogCompilation must be enabled in order for individual methods to be logged.");
-  lists[command] = new MethodMatcher(class_name, c_mode, method_name, m_mode, signature, lists[command]);
-  return lists[command];
-}
-
-template<typename T>
-static MethodMatcher* add_option_string(Symbol* class_name, MethodMatcher::Mode c_mode,
-                                        Symbol* method_name, MethodMatcher::Mode m_mode,
-                                        Symbol* signature,
-                                        const char* option,
-                                        T value) {
-  lists[OptionCommand] = new TypedMethodOptionMatcher<T>(class_name, c_mode, method_name, m_mode,
-                                                         signature, option, value, lists[OptionCommand]);
-  return lists[OptionCommand];
-}
-
-template<typename T>
-static bool get_option_value(methodHandle method, const char* option, T& value) {
-   TypedMethodOptionMatcher<T>* m;
-   if (lists[OptionCommand] != NULL
-       && (m = ((TypedMethodOptionMatcher<T>*)lists[OptionCommand])->match(method, option)) != NULL
-       && m->get_type() == get_type_for<T>()) {
-       value = m->value();
-       return true;
-   } else {
-     return false;
-   }
-}
-
-bool CompilerOracle::has_option_string(methodHandle method, const char* option) {
-  bool value = false;
-  get_option_value(method, option, value);
-  return value;
+  return;
 }
 
 template<typename T>
 bool CompilerOracle::has_option_value(methodHandle method, const char* option, T& value) {
-  return ::get_option_value(method, option, value);
+  if (option_list != NULL) {  // an empty list cannot match anything
+    TypedMethodOptionMatcher* m = option_list->match(method, option, get_type_for<T>());  // T selects the option type searched for
+    if (m != NULL) {
+      value = m->value<T>();  // value is written only when an entry matched
+      return true;
+    }
+  }
+  return false;  // value is left untouched
 }
 
 // Explicit instantiation for all OptionTypes supported.
@@ -405,6 +331,12 @@
 template bool CompilerOracle::has_option_value<ccstr>(methodHandle method, const char* option, ccstr& value);
 template bool CompilerOracle::has_option_value<double>(methodHandle method, const char* option, double& value);
 
+bool CompilerOracle::has_option_string(methodHandle method, const char* option) {  // true only for a matching bool option set to true
+  bool value = false;  // stays false when nothing matches
+  has_option_value(method, option, value);  // T is deduced as bool, so only bool-typed entries are searched
+  return value;
+}
+
 bool CompilerOracle::should_exclude(methodHandle method, bool& quietly) {
   quietly = true;
   if (lists[ExcludeCommand] != NULL) {
@@ -420,19 +352,18 @@
   return false;
 }
 
-
 bool CompilerOracle::should_inline(methodHandle method) {
   return (check_predicate(InlineCommand, method));
 }
 
-
+// Check both DontInlineCommand and ExcludeCommand here
+// - consistent behavior for all compilers
 bool CompilerOracle::should_not_inline(methodHandle method) {
-  return (check_predicate(DontInlineCommand, method));
+  return check_predicate(DontInlineCommand, method) || check_predicate(ExcludeCommand, method);
 }
 
-
 bool CompilerOracle::should_print(methodHandle method) {
-  return (check_predicate(PrintCommand, method));
+  return check_predicate(PrintCommand, method);
 }
 
 bool CompilerOracle::should_print_methods() {
@@ -445,12 +376,10 @@
   return (check_predicate(LogCommand, method));
 }
 
-
 bool CompilerOracle::should_break_at(methodHandle method) {
   return check_predicate(BreakCommand, method);
 }
 
-
 static OracleCommand parse_command_name(const char * line, int* bytes_read) {
   assert(ARRAY_SIZE(command_names) == OracleCommandCount,
          "command_names size mismatch");
@@ -516,84 +445,12 @@
   tty->cr();
 };
 
-// The JVM specification defines the allowed characters.
-// Tokens that are disallowed by the JVM specification can have
-// a meaning to the parser so we need to include them here.
-// The parser does not enforce all rules of the JVMS - a successful parse
-// does not mean that it is an allowed name. Illegal names will
-// be ignored since they never can match a class or method.
-//
-// '\0' and 0xf0-0xff are disallowed in constant string values
-// 0x20 ' ', 0x09 '\t' and, 0x2c ',' are used in the matching
-// 0x5b '[' and 0x5d ']' can not be used because of the matcher
-// 0x28 '(' and 0x29 ')' are used for the signature
-// 0x2e '.' is always replaced before the matching
-// 0x2f '/' is only used in the class name as package separator
-
-#define RANGEBASE "\x1\x2\x3\x4\x5\x6\x7\x8\xa\xb\xc\xd\xe\xf" \
-    "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" \
-    "\x21\x22\x23\x24\x25\x26\x27\x2a\x2b\x2c\x2d" \
-    "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" \
-    "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" \
-    "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5c\x5e\x5f" \
-    "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" \
-    "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" \
-    "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" \
-    "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" \
-    "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" \
-    "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" \
-    "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" \
-    "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" \
-    "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
-
-#define RANGE0 "[*" RANGEBASE "]"
-#define RANGESLASH "[*" RANGEBASE "/]"
-
-static MethodMatcher::Mode check_mode(char name[], const char*& error_msg) {
-  int match = MethodMatcher::Exact;
-  while (name[0] == '*') {
-    match |= MethodMatcher::Suffix;
-    // Copy remaining string plus NUL to the beginning
-    memmove(name, name + 1, strlen(name + 1) + 1);
-  }
-
-  if (strcmp(name, "*") == 0) return MethodMatcher::Any;
-
-  size_t len = strlen(name);
-  while (len > 0 && name[len - 1] == '*') {
-    match |= MethodMatcher::Prefix;
-    name[--len] = '\0';
-  }
-
-  if (strstr(name, "*") != NULL) {
-    error_msg = "  Embedded * not allowed";
-    return MethodMatcher::Unknown;
-  }
-  return (MethodMatcher::Mode)match;
-}
-
-static bool scan_line(const char * line,
-                      char class_name[],  MethodMatcher::Mode* c_mode,
-                      char method_name[], MethodMatcher::Mode* m_mode,
-                      int* bytes_read, const char*& error_msg) {
-  *bytes_read = 0;
-  error_msg = NULL;
-  if (2 == sscanf(line, "%*[ \t]%255" RANGESLASH "%*[ ]" "%255" RANGE0 "%n", class_name, method_name, bytes_read)) {
-    *c_mode = check_mode(class_name, error_msg);
-    *m_mode = check_mode(method_name, error_msg);
-    return *c_mode != MethodMatcher::Unknown && *m_mode != MethodMatcher::Unknown;
-  }
-  return false;
-}
-
 // Scan next flag and value in line, return MethodMatcher object on success, NULL on failure.
 // On failure, error_msg contains description for the first error.
 // For future extensions: set error_msg on first error.
-static MethodMatcher* scan_flag_and_value(const char* type, const char* line, int& total_bytes_read,
-                                          Symbol* c_name, MethodMatcher::Mode c_match,
-                                          Symbol* m_name, MethodMatcher::Mode m_match,
-                                          Symbol* signature,
-                                          char* errorbuf, const int buf_size) {
+static void scan_flag_and_value(const char* type, const char* line, int& total_bytes_read,
+                                            TypedMethodOptionMatcher* matcher,
+                                            char* errorbuf, const int buf_size) {
   total_bytes_read = 0;
   int bytes_read = 0;
   char flag[256];
@@ -608,7 +465,8 @@
       intx value;
       if (sscanf(line, "%*[ \t]" INTX_FORMAT "%n", &value, &bytes_read) == 1) {
         total_bytes_read += bytes_read;
-        return add_option_string(c_name, c_match, m_name, m_match, signature, flag, value);
+        add_option_string(matcher, flag, value);
+        return;
       } else {
         jio_snprintf(errorbuf, buf_size, "  Value cannot be read for flag %s of type %s ", flag, type);
       }
@@ -616,7 +474,8 @@
       uintx value;
       if (sscanf(line, "%*[ \t]" UINTX_FORMAT "%n", &value, &bytes_read) == 1) {
         total_bytes_read += bytes_read;
-        return add_option_string(c_name, c_match, m_name, m_match, signature, flag, value);
+        add_option_string(matcher, flag, value);
+        return;
       } else {
         jio_snprintf(errorbuf, buf_size, "  Value cannot be read for flag %s of type %s", flag, type);
       }
@@ -625,7 +484,8 @@
       char* value = NEW_RESOURCE_ARRAY(char, strlen(line) + 1);
       if (sscanf(line, "%*[ \t]%255[_a-zA-Z0-9]%n", value, &bytes_read) == 1) {
         total_bytes_read += bytes_read;
-        return add_option_string(c_name, c_match, m_name, m_match, signature, flag, (ccstr)value);
+        add_option_string(matcher, flag, (ccstr)value);
+        return;
       } else {
         jio_snprintf(errorbuf, buf_size, "  Value cannot be read for flag %s of type %s", flag, type);
       }
@@ -646,7 +506,8 @@
           next_value += bytes_read;
           end_value = next_value-1;
         }
-        return add_option_string(c_name, c_match, m_name, m_match, signature, flag, (ccstr)value);
+        add_option_string(matcher, flag, (ccstr)value);
+        return;
       } else {
         jio_snprintf(errorbuf, buf_size, "  Value cannot be read for flag %s of type %s", flag, type);
       }
@@ -655,10 +516,12 @@
       if (sscanf(line, "%*[ \t]%255[a-zA-Z]%n", value, &bytes_read) == 1) {
         if (strcmp(value, "true") == 0) {
           total_bytes_read += bytes_read;
-          return add_option_string(c_name, c_match, m_name, m_match, signature, flag, true);
+          add_option_string(matcher, flag, true);
+          return;
         } else if (strcmp(value, "false") == 0) {
           total_bytes_read += bytes_read;
-          return add_option_string(c_name, c_match, m_name, m_match, signature, flag, false);
+          add_option_string(matcher, flag, false);
+          return;
         } else {
           jio_snprintf(errorbuf, buf_size, "  Value cannot be read for flag %s of type %s", flag, type);
         }
@@ -673,7 +536,8 @@
         char value[512] = "";
         jio_snprintf(value, sizeof(value), "%s.%s", buffer[0], buffer[1]);
         total_bytes_read += bytes_read;
-        return add_option_string(c_name, c_match, m_name, m_match, signature, flag, atof(value));
+        add_option_string(matcher, flag, atof(value));
+        return;
       } else {
         jio_snprintf(errorbuf, buf_size, "  Value cannot be read for flag %s of type %s", flag, type);
       }
@@ -683,7 +547,7 @@
   } else {
     jio_snprintf(errorbuf, buf_size, "  Flag name for type %s should be alphanumeric ", type);
   }
-  return NULL;
+  return;
 }
 
 int skip_whitespace(char* line) {
@@ -693,31 +557,20 @@
   return whitespace_read;
 }
 
+void CompilerOracle::print_parse_error(const char*&  error_msg, char* original_line) {
+  assert(error_msg != NULL, "Must have error_message");  // call this only after a parse error was recorded
+
+  ttyLocker ttyl;  // presumably keeps the report lines together on tty - confirm
+  tty->print_cr("CompileCommand: An error occurred during parsing");
+  tty->print_cr("Line: %s", original_line);  // the command line as passed in by the caller
+  tty->print_cr("Error: %s", error_msg);
+  CompilerOracle::print_tip();  // the report ends with print_tip() output
+}
+
 void CompilerOracle::parse_from_line(char* line) {
   if (line[0] == '\0') return;
   if (line[0] == '#')  return;
 
-  bool have_colon = (strstr(line, "::") != NULL);
-  for (char* lp = line; *lp != '\0'; lp++) {
-    // Allow '.' to separate the class name from the method name.
-    // This is the preferred spelling of methods:
-    //      exclude java/lang/String.indexOf(I)I
-    // Allow ',' for spaces (eases command line quoting).
-    //      exclude,java/lang/String.indexOf
-    // For backward compatibility, allow space as separator also.
-    //      exclude java/lang/String indexOf
-    //      exclude,java/lang/String,indexOf
-    // For easy cut-and-paste of method names, allow VM output format
-    // as produced by Method::print_short_name:
-    //      exclude java.lang.String::indexOf
-    // For simple implementation convenience here, convert them all to space.
-    if (have_colon) {
-      if (*lp == '.')  *lp = '/';   // dots build the package prefix
-      if (*lp == ':')  *lp = ' ';
-    }
-    if (*lp == ',' || *lp == '.')  *lp = ' ';
-  }
-
   char* original_line = line;
   int bytes_read;
   OracleCommand command = parse_command_name(line, &bytes_read);
@@ -742,109 +595,86 @@
     return;
   }
 
-  MethodMatcher::Mode c_match = MethodMatcher::Exact;
-  MethodMatcher::Mode m_match = MethodMatcher::Exact;
-  char class_name[256];
-  char method_name[256];
-  char sig[1024];
-  char errorbuf[1024];
-  const char* error_msg = NULL; // description of first error that appears
-  MethodMatcher* match = NULL;
+  const char* error_msg = NULL;
+  if (command == OptionCommand) {
+    // Look for trailing options.
+    //
+    // Two types of trailing options are
+    // supported:
+    //
+    // (1) CompileCommand=option,Klass::method,flag
+    // (2) CompileCommand=option,Klass::method,type,flag,value
+    //
+    // Type (1) is used to enable a boolean flag for a method.
+    //
+    // Type (2) is used to support options with a value. Values can have the
+    // following types: intx, uintx, bool, ccstr, ccstrlist, and double.
+    //
+    // For future extensions: extend scan_flag_and_value()
 
-  if (scan_line(line, class_name, &c_match, method_name, &m_match, &bytes_read, error_msg)) {
-    EXCEPTION_MARK;
-    Symbol* c_name = SymbolTable::new_symbol(class_name, CHECK);
-    Symbol* m_name = SymbolTable::new_symbol(method_name, CHECK);
-    Symbol* signature = NULL;
-
-    line += bytes_read;
-
-    // there might be a signature following the method.
-    // signatures always begin with ( so match that by hand
-    line += skip_whitespace(line);
-    if (1 == sscanf(line, "(%254[[);/" RANGEBASE "]%n", sig + 1, &bytes_read)) {
-      sig[0] = '(';
-      line += bytes_read;
-      signature = SymbolTable::new_symbol(sig, CHECK);
+    char option[256]; // stores flag for Type (1) and type of Type (2)
+    line++; // skip the ','
+    TypedMethodOptionMatcher* archetype = TypedMethodOptionMatcher::parse_method_pattern(line, error_msg);
+    if (archetype == NULL) {
+      assert(error_msg != NULL, "Must have error_message");
+      print_parse_error(error_msg, original_line);
+      return;
     }
 
-    if (command == OptionCommand) {
-      // Look for trailing options.
-      //
-      // Two types of trailing options are
-      // supported:
-      //
-      // (1) CompileCommand=option,Klass::method,flag
-      // (2) CompileCommand=option,Klass::method,type,flag,value
-      //
-      // Type (1) is used to enable a boolean flag for a method.
-      //
-      // Type (2) is used to support options with a value. Values can have the
-      // the following types: intx, uintx, bool, ccstr, ccstrlist, and double.
-      //
-      // For future extensions: extend scan_flag_and_value()
-      char option[256]; // stores flag for Type (1) and type of Type (2)
+    line += skip_whitespace(line);
 
-      line += skip_whitespace(line);
-      while (sscanf(line, "%255[a-zA-Z0-9]%n", option, &bytes_read) == 1) {
-        if (match != NULL && !_quiet) {
-          // Print out the last match added
-          ttyLocker ttyl;
-          tty->print("CompileCommand: %s ", command_names[command]);
-          match->print();
+    // This is unnecessarily complex. Should retire multi-option lines and skip while loop
+    while (sscanf(line, "%255[a-zA-Z0-9]%n", option, &bytes_read) == 1) {
+      line += bytes_read;
+
+      // archetype is the blueprint (deleted after the loop); each option gets its own clone
+      TypedMethodOptionMatcher* typed_matcher = archetype->clone();
+      if (strcmp(option, "intx") == 0
+          || strcmp(option, "uintx") == 0
+          || strcmp(option, "bool") == 0
+          || strcmp(option, "ccstr") == 0
+          || strcmp(option, "ccstrlist") == 0
+          || strcmp(option, "double") == 0
+          ) {
+        char errorbuf[1024] = {0};
+        // Type (2) option: parse flag name and value.
+        scan_flag_and_value(option, line, bytes_read, typed_matcher, errorbuf, sizeof(errorbuf));
+        if (*errorbuf != '\0') {
+          error_msg = errorbuf;
+          print_parse_error(error_msg, original_line);
+          return;
         }
         line += bytes_read;
+      } else {
+        // Type (1) option
+        add_option_string(typed_matcher, option, true);
+      }
+      if (typed_matcher != NULL && !_quiet) {
+        // Print out the last match added
+        assert(error_msg == NULL, "No error here");
+        ttyLocker ttyl;
+        tty->print("CompileCommand: %s ", command_names[command]);
+        typed_matcher->print();
+      }
+      line += skip_whitespace(line);
+    } // while(
+    delete archetype;
+  } else {  // not an OptionCommand
+    assert(error_msg == NULL, "Don't call here with error_msg already set");
 
-        if (strcmp(option, "intx") == 0
-            || strcmp(option, "uintx") == 0
-            || strcmp(option, "bool") == 0
-            || strcmp(option, "ccstr") == 0
-            || strcmp(option, "ccstrlist") == 0
-            || strcmp(option, "double") == 0
-            ) {
+    BasicMatcher* matcher = BasicMatcher::parse_method_pattern(line, error_msg);
+    if (error_msg != NULL) {
+      assert(matcher == NULL, "consistency");
+      print_parse_error(error_msg, original_line);
+      return;
+    }
 
-          // Type (2) option: parse flag name and value.
-          match = scan_flag_and_value(option, line, bytes_read,
-                                      c_name, c_match, m_name, m_match, signature,
-                                      errorbuf, sizeof(errorbuf));
-          if (match == NULL) {
-            error_msg = errorbuf;
-            break;
-          }
-          line += bytes_read;
-        } else {
-          // Type (1) option
-          match = add_option_string(c_name, c_match, m_name, m_match, signature, option, true);
-        }
-        line += skip_whitespace(line);
-      } // while(
-    } else {
-      match = add_predicate(command, c_name, c_match, m_name, m_match, signature);
-    }
-  }
-
-  ttyLocker ttyl;
-  if (error_msg != NULL) {
-    // an error has happened
-    tty->print_cr("CompileCommand: An error occured during parsing");
-    tty->print_cr("  \"%s\"", original_line);
-    if (error_msg != NULL) {
-      tty->print_cr("%s", error_msg);
-    }
-    CompilerOracle::print_tip();
-
-  } else {
-    // check for remaining characters
-    bytes_read = 0;
-    sscanf(line, "%*[ \t]%n", &bytes_read);
-    if (line[bytes_read] != '\0') {
-      tty->print_cr("CompileCommand: Bad pattern");
-      tty->print_cr("  \"%s\"", original_line);
-      tty->print_cr("  Unrecognized text %s after command ", line);
-      CompilerOracle::print_tip();
-    } else if (match != NULL && !_quiet) {
+    add_predicate(command, matcher);
+    if (!_quiet) {
+      ttyLocker ttyl;
       tty->print("CompileCommand: %s ", command_names[command]);
-      match->print();
+      matcher->print(tty);
+      tty->cr();
     }
   }
 }
@@ -1045,10 +875,12 @@
       Symbol* m_name = SymbolTable::new_symbol(methodName, CHECK);
       Symbol* signature = NULL;
 
-      add_predicate(CompileOnlyCommand, c_name, c_match, m_name, m_match, signature);
+      BasicMatcher* bm = new BasicMatcher();
+      bm->init(c_name, c_match, m_name, m_match, signature);
+      add_predicate(CompileOnlyCommand, bm);
       if (PrintVMOptions) {
         tty->print("CompileOnly: compileonly ");
-        lists[CompileOnlyCommand]->print();
+        lists[CompileOnlyCommand]->print_all(tty);
       }
 
       className = NULL;
--- a/src/share/vm/compiler/compilerOracle.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/compiler/compilerOracle.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -35,6 +35,7 @@
  private:
   static bool _quiet;
   static void print_tip();
+  static void print_parse_error(const char*&  error_msg, char* original_line);
 
  public:
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/compiler/methodMatcher.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "compiler/methodMatcher.hpp"
+#include "memory/oopFactory.hpp"
+#include "oops/oop.inline.hpp"
+
+// The JVM specification defines the allowed characters.
+// Tokens that are disallowed by the JVM specification can have
+// a meaning to the parser so we need to include them here.
+// The parser does not enforce all rules of the JVMS - a successful parse
+// does not mean that it is an allowed name. Illegal names will
+// be ignored since they never can match a class or method.
+//
+// '\0' and 0xf0-0xff are disallowed in constant string values
+// 0x20 ' ', 0x09 '\t', and 0x2c ',' are used in the matching
+// 0x5b '[' and 0x5d ']' can not be used because of the matcher
+// 0x28 '(' and 0x29 ')' are used for the signature
+// 0x2e '.' is always replaced before the matching
+// 0x2f '/' is only used in the class name as package separator
+
+// Body of a scanf scanset listing the characters accepted in a name token.
+// It contains every byte value from 0x01 to 0xef except 0x09 '\t', 0x20 ' ',
+// 0x28 '(', 0x29 ')', 0x2e '.', 0x2f '/', 0x5b '[' and 0x5d ']'.
+// Note that 0x2c ',' is part of the set; canonicalize() rewrites ',' to ' '
+// before the scansets below are applied.
+#define RANGEBASE "\x1\x2\x3\x4\x5\x6\x7\x8\xa\xb\xc\xd\xe\xf" \
+    "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" \
+    "\x21\x22\x23\x24\x25\x26\x27\x2a\x2b\x2c\x2d" \
+    "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" \
+    "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" \
+    "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5c\x5e\x5f" \
+    "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" \
+    "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" \
+    "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" \
+    "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" \
+    "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" \
+    "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" \
+    "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" \
+    "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" \
+    "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
+
+// Complete scansets built from RANGEBASE:
+//   RANGE0     - '*' plus RANGEBASE; used for the method name.
+//   RANGESLASH - the same with '/' added; used for the class name, where '/'
+//                separates packages.
+#define RANGE0 "[*" RANGEBASE "]"
+#define RANGESLASH "[*" RANGEBASE "/]"
+
+// Creates a matcher with no class name, method name or signature and with
+// both match modes set to Exact.
+// The initializers are listed in member declaration order, which is the
+// order in which they are actually run.
+MethodMatcher::MethodMatcher()
+  : _class_name(NULL)
+  , _method_name(NULL)
+  , _signature(NULL)
+  , _class_mode(Exact)
+  , _method_mode(Exact) {
+}
+
+// Decrements the refcount of every symbol that is set (class name, method
+// name, signature - in that order); NULL entries are skipped.
+MethodMatcher::~MethodMatcher() {
+  Symbol* held[] = { _class_name, _method_name, _signature };
+  for (size_t i = 0; i < sizeof(held) / sizeof(held[0]); i++) {
+    if (held[i] != NULL) {
+      held[i]->decrement_refcount();
+    }
+  }
+}
+
+// Stores the given names, signature and match modes in this matcher.
+// Whatever was stored before is simply overwritten; no refcount is touched
+// here.
+void MethodMatcher::init(Symbol* class_name, Mode class_mode,
+                         Symbol* method_name, Mode method_mode,
+                         Symbol* signature) {
+  _class_name  = class_name;
+  _class_mode  = class_mode;
+  _method_name = method_name;
+  _method_mode = method_mode;
+  _signature   = signature;
+}
+
+// Rewrites a method pattern in place so that the class name and the method
+// name are separated by spaces, which is the form parse_method_pattern()
+// scans.
+//
+// line      - NUL-terminated, writable pattern text; modified in place.
+// error_msg - set to a static message when the pattern is rejected.
+//
+// Returns true when the pattern was rewritten, false (with error_msg set)
+// when it combines separators in a way that is not allowed.
+bool MethodMatcher::canonicalize(char * line, const char *& error_msg) {
+  char* colon = strstr(line, "::");
+  bool have_colon = (colon != NULL);
+  if (have_colon) {
+    // Don't allow multiple '::'. Look at the character that follows the
+    // first "::"; comparing the pointer itself would always be true.
+    if (colon[2] != '\0') {
+      if (strstr(colon + 2, "::") != NULL) {
+        error_msg = "Method pattern only allows one '::' allowed";
+        return false;
+      }
+    }
+
+    // '/' must not be combined with '::'. The scan starts at the second
+    // character (line holds at least "::" here) and stops where a
+    // signature begins.
+    for (char* lp = line + 1; *lp != '\0'; lp++) {
+      if (*lp == '(') {
+        break;
+      }
+
+      if (*lp == '/') {
+        error_msg = "Method pattern uses '/' together with '::'";
+        return false;
+      }
+    }
+  } else {
+    // Don't allow mixed package separators
+    char* pos = strchr(line, '.');
+    bool in_signature = false;
+    if (pos != NULL) {
+      // Inspect everything after the first '.'.
+      for (char* lp = pos + 1; *lp != '\0'; lp++) {
+        if (*lp == '(') {
+          in_signature = true;
+        }
+
+        // After any comma the method pattern has ended
+        if (*lp == ',') {
+          break;
+        }
+
+        // '/' is still accepted once the signature has started.
+        if (!in_signature && (*lp == '/')) {
+          error_msg = "Method pattern uses mixed '/' and '.' package separators";
+          return false;
+        }
+
+        // A second '.' is rejected wherever it appears before a ','.
+        if (*lp == '.') {
+          error_msg = "Method pattern uses multiple '.' in pattern";
+          return false;
+        }
+      }
+    }
+  }
+
+  for (char* lp = line; *lp != '\0'; lp++) {
+    // Allow '.' to separate the class name from the method name.
+    // This is the preferred spelling of methods:
+    //      exclude java/lang/String.indexOf(I)I
+    // Allow ',' for spaces (eases command line quoting).
+    //      exclude,java/lang/String.indexOf
+    // For backward compatibility, allow space as separator also.
+    //      exclude java/lang/String indexOf
+    //      exclude,java/lang/String,indexOf
+    // For easy cut-and-paste of method names, allow VM output format
+    // as produced by Method::print_short_name:
+    //      exclude java.lang.String::indexOf
+    // For simple implementation convenience here, convert them all to space.
+
+    if (have_colon) {
+      if (*lp == '.')  *lp = '/';   // dots build the package prefix
+      if (*lp == ':')  *lp = ' ';
+    }
+    if (*lp == ',' || *lp == '.')  *lp = ' ';
+  }
+  return true;
+}
+
+// Tests one symbol of a method against one symbol of the pattern.
+//
+// candidate  - symbol taken from the method being tested.
+// match      - symbol taken from the pattern; not looked at for Any.
+// match_mode - how the two are compared.
+//
+// Returns true when candidate satisfies the pattern under match_mode.
+bool MethodMatcher::match(Symbol* candidate, Symbol* match, Mode match_mode) const {
+  if (match_mode == Any) {
+    return true;
+  }
+  if (match_mode == Exact) {
+    // Exact mode compares the two Symbol pointers.
+    return candidate == match;
+  }
+
+  ResourceMark rm;
+  const char* const subject = candidate->as_C_string();
+  const char* const pattern = match->as_C_string();
+
+  if (match_mode == Prefix) {
+    // The first occurrence of pattern has to be at the very start.
+    return strstr(subject, pattern) == subject;
+  }
+  if (match_mode == Suffix) {
+    const size_t subject_len = strlen(subject);
+    const size_t pattern_len = strlen(pattern);
+    if (subject_len < pattern_len) {
+      return false;
+    }
+    return strcmp(subject + subject_len - pattern_len, pattern) == 0;
+  }
+  if (match_mode == Substring) {
+    return strstr(subject, pattern) != NULL;
+  }
+
+  // Any other mode value never matches.
+  return false;
+}
+
+// Derives the match mode from the '*' wildcards at either end of name and
+// strips those wildcards from name in place.
+//
+// Returns Any for a lone "*", otherwise Exact, Prefix, Suffix or Substring
+// depending on which ends carried a '*'. When nothing is left after
+// stripping, error_msg is set and Any is returned; when a '*' remains inside
+// the name, error_msg is set and Unknown is returned.
+static MethodMatcher::Mode check_mode(char name[], const char*& error_msg) {
+  int mode_bits = MethodMatcher::Exact;
+
+  if (name[0] == '*') {
+    if (name[1] == '\0') {
+      return MethodMatcher::Any;
+    }
+    mode_bits |= MethodMatcher::Suffix;
+    // Shift the rest of the string, terminating NUL included, one to the left.
+    memmove(name, name + 1, strlen(name + 1) + 1);
+  }
+
+  size_t remaining = strlen(name);
+  if (remaining > 0 && name[remaining - 1] == '*') {
+    mode_bits |= MethodMatcher::Prefix;
+    remaining--;
+    name[remaining] = '\0';
+  }
+
+  if (remaining == 0) {
+    error_msg = "** Not a valid pattern";
+    return MethodMatcher::Any;
+  }
+
+  if (strchr(name, '*') != NULL) {
+    error_msg = " Embedded * not allowed";
+    return MethodMatcher::Unknown;
+  }
+  return (MethodMatcher::Mode)mode_bits;
+}
+
+// Advances line past any leading spaces and tabs and adds the number of
+// characters skipped to *total_bytes_read. Nothing changes when line does
+// not start with a space or tab.
+void skip_leading_spaces(char*& line, int* total_bytes_read ) {
+  char* cursor = line;
+  while (*cursor == ' ' || *cursor == '\t') {
+    cursor++;
+  }
+
+  const int skipped = (int)(cursor - line);
+  if (skipped > 0) {
+    line = cursor;
+    *total_bytes_read += skipped;
+  }
+}
+
+// Parses "<class> <method> [<signature>]" from line and stores the result in
+// matcher through init().
+//
+// line      - writable pattern text. It is canonicalized in place and the
+//             reference is advanced past the text that was consumed, so the
+//             caller can inspect what follows.
+// error_msg - must be NULL on entry; set to a static message on failure.
+// matcher   - receives the parsed names, modes and optional signature.
+//
+// The early-return error paths leave matcher untouched. Note that
+// check_mode() can set error_msg while returning a mode other than Unknown;
+// in that case parsing carries on and matcher is still initialized, so
+// callers have to test error_msg rather than rely on the matcher state.
+void MethodMatcher::parse_method_pattern(char*& line, const char*& error_msg, MethodMatcher* matcher) {
+  MethodMatcher::Mode c_match;
+  MethodMatcher::Mode m_match;
+  char class_name[256] = {0};
+  char method_name[256] = {0};
+  char sig[1024] = {0};
+  int bytes_read = 0;
+  // Accumulated by skip_leading_spaces() but not read in this function.
+  int total_bytes_read = 0;
+
+  assert(error_msg == NULL, "Dont call here with error_msg already set");
+
+  if (!MethodMatcher::canonicalize(line, error_msg)) {
+    assert(error_msg != NULL, "Message must be set if parsing failed");
+    return;
+  }
+
+  skip_leading_spaces(line, &total_bytes_read);
+
+  // Class name (may contain '/'), one or more spaces, then the method name.
+  // The field widths of 255 match the 256 byte buffers above.
+  if (2 == sscanf(line, "%255" RANGESLASH "%*[ ]" "%255"  RANGE0 "%n", class_name, method_name, &bytes_read)) {
+    // check_mode() also strips leading/trailing '*' from the buffers.
+    c_match = check_mode(class_name, error_msg);
+    m_match = check_mode(method_name, error_msg);
+
+    if ((strchr(class_name, '<') != NULL) || (strchr(class_name, '>') != NULL)) {
+      error_msg = "Chars '<' and '>' not allowed in class name";
+      return;
+    }
+    if ((strchr(method_name, '<') != NULL) || (strchr(method_name, '>') != NULL)) {
+      if ((strncmp("<init>", method_name, 255) != 0) && (strncmp("<clinit>", method_name, 255) != 0)) {
+        error_msg = "Chars '<' and '>' only allowed in <init> and <clinit>";
+        return;
+      }
+    }
+
+    if (c_match == MethodMatcher::Unknown || m_match == MethodMatcher::Unknown) {
+      assert(error_msg != NULL, "Must have been set by check_mode()");
+      return;
+    }
+
+    EXCEPTION_MARK;
+    Symbol* signature = NULL;
+    // Step over the class and method names that were just scanned.
+    line += bytes_read;
+    bytes_read = 0;
+
+    skip_leading_spaces(line, &total_bytes_read);
+
+    // there might be a signature following the method.
+    // signatures always begin with ( so match that by hand
+    if (line[0] == '(') {
+      line++;
+      sig[0] = '(';
+      // scan the rest
+      if (1 == sscanf(line, "%254[[);/" RANGEBASE "]%n", sig+1, &bytes_read)) {
+        if (strchr(sig, '*') != NULL) {
+          error_msg = " Wildcard * not allowed in signature";
+          return;
+        }
+        line += bytes_read;
+      }
+      // If the scan above matched nothing, sig is just "(" at this point.
+      signature = SymbolTable::new_symbol(sig, CHECK);
+    }
+    Symbol* c_name = SymbolTable::new_symbol(class_name, CHECK);
+    Symbol* m_name = SymbolTable::new_symbol(method_name, CHECK);
+
+    matcher->init(c_name, c_match, m_name, m_match, signature);
+    return;
+  } else {
+    error_msg = "Could not parse method pattern";
+  }
+}
+
+// Returns true when the holder class name and the method name of the given
+// method satisfy this matcher and, if a signature was given, the method's
+// signature starts with it.
+bool MethodMatcher::matches(methodHandle method) const {
+  Symbol* holder_name      = method->method_holder()->name();
+  Symbol* candidate_name   = method->name();
+  Symbol* candidate_sig    = method->signature();
+
+  if (!match(holder_name, this->class_name(), _class_mode)) {
+    return false;
+  }
+  if (!match(candidate_name, this->method_name(), _method_mode)) {
+    return false;
+  }
+  // Without a signature in the pattern every signature is accepted.
+  if (this->signature() == NULL) {
+    return true;
+  }
+  return match(candidate_sig, this->signature(), Prefix);
+}
+
+// Prints symbol h to st in pattern notation: a '*' is written before the
+// name for Suffix, Substring and Any, and after it for Prefix and Substring.
+// For Any only the '*' is printed and h is not used.
+void MethodMatcher::print_symbol(outputStream* st, Symbol* h, Mode mode) {
+  ResourceMark rm;
+
+  const bool leading_star  = (mode == Suffix || mode == Substring || mode == Any);
+  const bool trailing_star = (mode == Prefix || mode == Substring);
+
+  if (leading_star) {
+    st->print("*");
+  }
+  if (mode != Any) {
+    h->print_symbol_on(st);
+  }
+  if (trailing_star) {
+    st->print("*");
+  }
+}
+
+// Prints this matcher to st as "<class>.<method>", followed by the signature
+// when one is set.
+void MethodMatcher::print_base(outputStream* st) {
+  print_symbol(st, class_name(), _class_mode);
+  st->print(".");
+  print_symbol(st, method_name(), _method_mode);
+
+  Symbol* sig = signature();
+  if (sig == NULL) {
+    return;
+  }
+  sig->print_symbol_on(st);
+}
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/compiler/methodMatcher.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_COMPILER_METHODMATCHER_HPP
+#define SHARE_VM_COMPILER_METHODMATCHER_HPP
+
+#include "memory/allocation.inline.hpp"
+#include "runtime/handles.inline.hpp"
+#include "memory/resourceArea.hpp"
+
+// Matches methods against a pattern made of a class name, a method name and
+// an optional signature. The two names each carry a Mode that says how they
+// are compared.
+class MethodMatcher : public CHeapObj<mtCompiler> {
+ public:
+  // How a name in the pattern is compared with a candidate name.
+  // Prefix and Suffix are bit flags that check_mode() ORs together;
+  // Substring is both of them.
+  enum Mode {
+    Exact,                        // names must be the same symbol
+    Prefix = 1,                   // pattern ended in '*'
+    Suffix = 2,                   // pattern started with '*'
+    Substring = Prefix | Suffix,  // '*' at both ends
+    Any,                          // pattern was a lone '*'
+    Unknown = -1                  // returned by check_mode() for an embedded '*'
+  };
+
+ protected:
+  // Symbols of the pattern; the destructor decrements the refcount of each
+  // one that is not NULL.
+  Symbol*        _class_name;
+  Symbol*        _method_name;
+  Symbol*        _signature;     // NULL when the pattern has no signature
+  Mode           _class_mode;
+  Mode           _method_mode;
+
+ public:
+  Symbol* class_name() const { return _class_name; }
+  Mode class_mode() const { return _class_mode; }
+  Symbol* method_name() const { return _method_name; }
+  Mode method_mode() const { return _method_mode; }
+  Symbol* signature() const { return _signature; }
+
+  MethodMatcher();
+  ~MethodMatcher();
+
+  // Stores the given symbols and modes in this matcher.
+  void init(Symbol* class_name, Mode class_mode, Symbol* method_name, Mode method_mode, Symbol* signature);
+  // Parses a pattern from line into m; error_msg is set on failure and line
+  // is advanced past the consumed text.
+  static void parse_method_pattern(char*& line, const char*& error_msg, MethodMatcher* m);
+  // Prints h with the '*' decoration that corresponds to mode.
+  static void print_symbol(outputStream* st, Symbol* h, Mode mode);
+  // Returns true if method satisfies this pattern.
+  bool matches(methodHandle method) const;
+  // Prints the pattern as "<class>.<method>" plus the signature, if any.
+  void print_base(outputStream* st);
+
+ private:
+  // Rewrites the separators in line to spaces; false plus error_msg on error.
+  static bool canonicalize(char * line, const char *& error_msg);
+  // Compares a single candidate symbol with a pattern symbol.
+  bool match(Symbol* candidate, Symbol* match, Mode match_mode) const;
+};
+
+// A MethodMatcher that can be chained into a singly linked list through
+// _next, so that a method can be tested against several patterns.
+class BasicMatcher : public MethodMatcher {
+private:
+  BasicMatcher* _next;   // following matcher in the list, or NULL
+public:
+
+  BasicMatcher() : MethodMatcher(), _next(NULL) {
+  }
+
+  BasicMatcher(BasicMatcher* next) : _next(next) {
+  }
+
+  // Parses one complete method pattern from line.
+  // error_msg must be NULL on entry. Returns a newly allocated matcher, or
+  // NULL with error_msg set when the pattern is invalid or is followed by
+  // anything other than spaces and tabs.
+  static BasicMatcher* parse_method_pattern(char* line, const char*& error_msg) {
+    assert(error_msg == NULL, "Dont call here with error_msg already set");
+    BasicMatcher* parsed = new BasicMatcher();
+    MethodMatcher::parse_method_pattern(line, error_msg, parsed);
+
+    if (error_msg == NULL) {
+      // check for bad trailing characters
+      int trailing_blanks = 0;
+      sscanf(line, "%*[ \t]%n", &trailing_blanks);
+      if (line[trailing_blanks] == '\0') {
+        return parsed;
+      }
+      error_msg = "Unrecognized trailing text after method pattern";
+    }
+
+    // Either the parse or the trailing-text check failed.
+    delete parsed;
+    return NULL;
+  }
+
+  // Returns true if this matcher or any matcher after it in the list
+  // matches method.
+  bool match(methodHandle method) {
+    BasicMatcher* current = this;
+    while (current != NULL) {
+      if (current->matches(method)) {
+        return true;
+      }
+      current = current->next();
+    }
+    return false;
+  }
+
+  void set_next(BasicMatcher* next) { _next = next; }
+  BasicMatcher* next() { return _next; }
+
+  // Prints only this matcher.
+  void print(outputStream* st) { print_base(st); }
+
+  // Prints this matcher and then every matcher after it in the list.
+  void print_all(outputStream* st) {
+    for (BasicMatcher* current = this; current != NULL; current = current->_next) {
+      current->print_base(st);
+    }
+  }
+};
+
+#endif // SHARE_VM_COMPILER_METHODMATCHER_HPP
+
--- a/src/share/vm/compiler/oopMap.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/compiler/oopMap.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -58,7 +58,6 @@
   _valid_omv = false;
 }
 
-
 void OopMapStream::find_next() {
   while(_position++ < _size) {
     _omv.read_from(_stream);
@@ -156,9 +155,7 @@
 
 
 void OopMap::set_value(VMReg reg) {
-  // At this time, we only need value entries in our OopMap when ZapDeadCompiledLocals is active.
-  if (ZapDeadCompiledLocals)
-    set_xxx(reg, OopMapValue::value_value, VMRegImpl::Bad());
+  // At this time, we don't need value entries in our OopMap.
 }
 
 
@@ -199,7 +196,6 @@
   set_om_data(new_data);
 }
 
-
 void OopMapSet::add_gc_map(int pc_offset, OopMap *map ) {
   assert(om_size() != -1,"Cannot grow a fixed OopMapSet");
 
@@ -345,72 +341,73 @@
       do {
         omv = oms.current();
         oop* loc = fr->oopmapreg_to_location(omv.reg(),reg_map);
-        if ( loc != NULL ) {
-          oop *base_loc    = fr->oopmapreg_to_location(omv.content_reg(), reg_map);
-          oop *derived_loc = loc;
-          oop val = *base_loc;
-          if (val == (oop)NULL || Universe::is_narrow_oop_base(val)) {
-            // Ignore NULL oops and decoded NULL narrow oops which
-            // equal to Universe::narrow_oop_base when a narrow oop
-            // implicit null check is used in compiled code.
-            // The narrow_oop_base could be NULL or be the address
-            // of the page below heap depending on compressed oops mode.
-          } else
-            derived_oop_fn(base_loc, derived_loc);
+        guarantee(loc != NULL, "missing saved register");
+        oop *base_loc    = fr->oopmapreg_to_location(omv.content_reg(), reg_map);
+        oop *derived_loc = loc;
+        oop val = *base_loc;
+        if (val == (oop)NULL || Universe::is_narrow_oop_base(val)) {
+          // Ignore NULL oops and decoded NULL narrow oops which
+          // equal to Universe::narrow_oop_base when a narrow oop
+          // implicit null check is used in compiled code.
+          // The narrow_oop_base could be NULL or be the address
+          // of the page below heap depending on compressed oops mode.
+        } else {
+          derived_oop_fn(base_loc, derived_loc);
         }
         oms.next();
       }  while (!oms.is_done());
     }
   }
 
-  // We want coop, value and oop oop_types
-  int mask = OopMapValue::oop_value | OopMapValue::value_value | OopMapValue::narrowoop_value;
+  // We want coop and oop oop_types
+  int mask = OopMapValue::oop_value | OopMapValue::narrowoop_value;
   {
     for (OopMapStream oms(map,mask); !oms.is_done(); oms.next()) {
       omv = oms.current();
       oop* loc = fr->oopmapreg_to_location(omv.reg(),reg_map);
-      if ( loc != NULL ) {
-        if ( omv.type() == OopMapValue::oop_value ) {
-          oop val = *loc;
-          if (val == (oop)NULL || Universe::is_narrow_oop_base(val)) {
-            // Ignore NULL oops and decoded NULL narrow oops which
-            // equal to Universe::narrow_oop_base when a narrow oop
-            // implicit null check is used in compiled code.
-            // The narrow_oop_base could be NULL or be the address
-            // of the page below heap depending on compressed oops mode.
-            continue;
-          }
+      // It should be an error if no location can be found for a
+      // register mentioned as contained an oop of some kind.  Maybe
+      // this was allowed previously because value_value items might
+      // be missing?
+      guarantee(loc != NULL, "missing saved register");
+      if ( omv.type() == OopMapValue::oop_value ) {
+        oop val = *loc;
+        if (val == (oop)NULL || Universe::is_narrow_oop_base(val)) {
+          // Ignore NULL oops and decoded NULL narrow oops which
+          // equal to Universe::narrow_oop_base when a narrow oop
+          // implicit null check is used in compiled code.
+          // The narrow_oop_base could be NULL or be the address
+          // of the page below heap depending on compressed oops mode.
+          continue;
+        }
 #ifdef ASSERT
-          if ((((uintptr_t)loc & (sizeof(*loc)-1)) != 0) ||
-             !Universe::heap()->is_in_or_null(*loc)) {
-            tty->print_cr("# Found non oop pointer.  Dumping state at failure");
-            // try to dump out some helpful debugging information
-            trace_codeblob_maps(fr, reg_map);
-            omv.print();
-            tty->print_cr("register r");
-            omv.reg()->print();
-            tty->print_cr("loc = %p *loc = %p\n", loc, (address)*loc);
-            // do the real assert.
-            assert(Universe::heap()->is_in_or_null(*loc), "found non oop pointer");
-          }
+        if ((((uintptr_t)loc & (sizeof(*loc)-1)) != 0) ||
+            !Universe::heap()->is_in_or_null(*loc)) {
+          tty->print_cr("# Found non oop pointer.  Dumping state at failure");
+          // try to dump out some helpful debugging information
+          trace_codeblob_maps(fr, reg_map);
+          omv.print();
+          tty->print_cr("register r");
+          omv.reg()->print();
+          tty->print_cr("loc = %p *loc = %p\n", loc, (address)*loc);
+          // do the real assert.
+          assert(Universe::heap()->is_in_or_null(*loc), "found non oop pointer");
+        }
 #endif // ASSERT
-          oop_fn->do_oop(loc);
-        } else if ( omv.type() == OopMapValue::value_value ) {
-          assert((*loc) == (oop)NULL || !Universe::is_narrow_oop_base(*loc),
-                 "found invalid value pointer");
-          value_fn->do_oop(loc);
-        } else if ( omv.type() == OopMapValue::narrowoop_value ) {
-          narrowOop *nl = (narrowOop*)loc;
+        oop_fn->do_oop(loc);
+      } else if ( omv.type() == OopMapValue::narrowoop_value ) {
+        narrowOop *nl = (narrowOop*)loc;
 #ifndef VM_LITTLE_ENDIAN
-          if (!omv.reg()->is_stack()) {
-            // compressed oops in registers only take up 4 bytes of an
-            // 8 byte register but they are in the wrong part of the
-            // word so adjust loc to point at the right place.
-            nl = (narrowOop*)((address)nl + 4);
-          }
+        VMReg vmReg = omv.reg();
+        // Don't do this on SPARC float registers as they can be individually addressed
+        if (!vmReg->is_stack() SPARC_ONLY(&& !vmReg->is_FloatRegister())) {
+          // compressed oops in registers only take up 4 bytes of an
+          // 8 byte register but they are in the wrong part of the
+          // word so adjust loc to point at the right place.
+          nl = (narrowOop*)((address)nl + 4);
+        }
 #endif
-          oop_fn->do_oop(nl);
-        }
+        oop_fn->do_oop(nl);
       }
     }
   }
@@ -485,9 +482,6 @@
   case OopMapValue::oop_value:
     st->print("Oop");
     break;
-  case OopMapValue::value_value:
-    st->print("Value");
-    break;
   case OopMapValue::narrowoop_value:
     st->print("NarrowOop");
     break;
--- a/src/share/vm/compiler/oopMap.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/compiler/oopMap.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -33,7 +33,6 @@
 // Interface for generating the frame map for compiled code.  A frame map
 // describes for a specific pc whether each register and frame stack slot is:
 //   Oop         - A GC root for current frame
-//   Value       - Live non-oop, non-float value: int, either half of double
 //   Dead        - Dead; can be Zapped for debugging
 //   CalleeXX    - Callee saved; also describes which caller register is saved
 //   DerivedXX   - A derived oop; original oop is described.
@@ -54,7 +53,7 @@
 
 public:
   // Constants
-  enum { type_bits                = 5,
+  enum { type_bits                = 4,
          register_bits            = BitsPerShort - type_bits };
 
   enum { type_shift               = 0,
@@ -68,10 +67,9 @@
   enum oop_types {              // must fit in type_bits
          unused_value =0,       // powers of 2, for masking OopMapStream
          oop_value = 1,
-         value_value = 2,
-         narrowoop_value = 4,
-         callee_saved_value = 8,
-         derived_oop_value= 16 };
+         narrowoop_value = 2,
+         callee_saved_value = 4,
+         derived_oop_value= 8 };
 
   // Constructors
   OopMapValue () { set_value(0); set_content_reg(VMRegImpl::Bad()); }
@@ -96,13 +94,11 @@
 
   // Querying
   bool is_oop()               { return mask_bits(value(), type_mask_in_place) == oop_value; }
-  bool is_value()             { return mask_bits(value(), type_mask_in_place) == value_value; }
   bool is_narrowoop()           { return mask_bits(value(), type_mask_in_place) == narrowoop_value; }
   bool is_callee_saved()      { return mask_bits(value(), type_mask_in_place) == callee_saved_value; }
   bool is_derived_oop()       { return mask_bits(value(), type_mask_in_place) == derived_oop_value; }
 
   void set_oop()              { set_value((value() & register_mask_in_place) | oop_value); }
-  void set_value()            { set_value((value() & register_mask_in_place) | value_value); }
   void set_narrowoop()          { set_value((value() & register_mask_in_place) | narrowoop_value); }
   void set_callee_saved()     { set_value((value() & register_mask_in_place) | callee_saved_value); }
   void set_derived_oop()      { set_value((value() & register_mask_in_place) | derived_oop_value); }
--- a/src/share/vm/interpreter/abstractInterpreter.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/interpreter/abstractInterpreter.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -90,6 +90,8 @@
     java_util_zip_CRC32_update,                                 // implementation of java.util.zip.CRC32.update()
     java_util_zip_CRC32_updateBytes,                            // implementation of java.util.zip.CRC32.updateBytes()
     java_util_zip_CRC32_updateByteBuffer,                       // implementation of java.util.zip.CRC32.updateByteBuffer()
+    java_util_zip_CRC32C_updateBytes,                           // implementation of java.util.zip.CRC32C.updateBytes(crc, b[], off, end)
+    java_util_zip_CRC32C_updateDirectByteBuffer,                // implementation of java.util.zip.CRC32C.updateDirectByteBuffer(crc, address, off, end)
     java_lang_Float_intBitsToFloat,                             // implementation of java.lang.Float.intBitsToFloat()
     java_lang_Float_floatToRawIntBits,                          // implementation of java.lang.Float.floatToRawIntBits()
     java_lang_Double_longBitsToDouble,                          // implementation of java.lang.Double.longBitsToDouble()
--- a/src/share/vm/interpreter/interpreter.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/interpreter/interpreter.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -104,7 +104,10 @@
   (*_masm)->flush();
 
   // Commit Codelet.
-  AbstractInterpreter::code()->commit((*_masm)->code()->pure_insts_size(), (*_masm)->code()->strings());
+  int committed_code_size = (*_masm)->code()->pure_insts_size();
+  if (committed_code_size) {
+    AbstractInterpreter::code()->commit(committed_code_size, (*_masm)->code()->strings());
+  }
   // Make sure nobody can use _masm outside a CodeletMark lifespan.
   *_masm = NULL;
 }
@@ -234,6 +237,13 @@
       case vmIntrinsics::_updateByteBufferCRC32  : return java_util_zip_CRC32_updateByteBuffer;
     }
   }
+  if (UseCRC32CIntrinsics) {
+    // Use optimized stub code for CRC32C methods.
+    switch (m->intrinsic_id()) {
+      case vmIntrinsics::_updateBytesCRC32C             : return java_util_zip_CRC32C_updateBytes;
+      case vmIntrinsics::_updateDirectByteBufferCRC32C  : return java_util_zip_CRC32C_updateDirectByteBuffer;
+    }
+  }
 
   switch(m->intrinsic_id()) {
   case vmIntrinsics::_intBitsToFloat:      return java_lang_Float_intBitsToFloat;
@@ -349,6 +359,8 @@
     case java_util_zip_CRC32_update           : tty->print("java_util_zip_CRC32_update"); break;
     case java_util_zip_CRC32_updateBytes      : tty->print("java_util_zip_CRC32_updateBytes"); break;
     case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;
+    case java_util_zip_CRC32C_updateBytes     : tty->print("java_util_zip_CRC32C_updateBytes"); break;
+    case java_util_zip_CRC32C_updateDirectByteBuffer: tty->print("java_util_zip_CRC32C_updateDirectByteByffer"); break;
     default:
       if (kind >= method_handle_invoke_FIRST &&
           kind <= method_handle_invoke_LAST) {
@@ -537,17 +549,18 @@
 address InterpreterGenerator::generate_method_entry(
                                         AbstractInterpreter::MethodKind kind) {
   // determine code generation flags
+  bool native = false;
   bool synchronized = false;
   address entry_point = NULL;
 
   switch (kind) {
-  case Interpreter::zerolocals             :                                                      break;
-  case Interpreter::zerolocals_synchronized: synchronized = true;                                 break;
-  case Interpreter::native                 : entry_point = generate_native_entry(false); break;
-  case Interpreter::native_synchronized    : entry_point = generate_native_entry(true);  break;
-  case Interpreter::empty                  : entry_point = generate_empty_entry(); break;
+  case Interpreter::zerolocals             :                                          break;
+  case Interpreter::zerolocals_synchronized:                synchronized = true;      break;
+  case Interpreter::native                 : native = true;                           break;
+  case Interpreter::native_synchronized    : native = true; synchronized = true;      break;
+  case Interpreter::empty                  : entry_point = generate_empty_entry();    break;
   case Interpreter::accessor               : entry_point = generate_accessor_entry(); break;
-  case Interpreter::abstract               : entry_point = generate_abstract_entry();    break;
+  case Interpreter::abstract               : entry_point = generate_abstract_entry(); break;
 
   case Interpreter::java_lang_math_sin     : // fall thru
   case Interpreter::java_lang_math_cos     : // fall thru
@@ -562,28 +575,32 @@
                                            : entry_point = generate_Reference_get_entry(); break;
 #ifndef CC_INTERP
   case Interpreter::java_util_zip_CRC32_update
-                                           : entry_point = generate_CRC32_update_entry();  break;
+                                           : native = true; entry_point = generate_CRC32_update_entry();  break;
   case Interpreter::java_util_zip_CRC32_updateBytes
                                            : // fall thru
   case Interpreter::java_util_zip_CRC32_updateByteBuffer
-                                           : entry_point = generate_CRC32_updateBytes_entry(kind); break;
+                                           : native = true; entry_point = generate_CRC32_updateBytes_entry(kind); break;
+  case Interpreter::java_util_zip_CRC32C_updateBytes
+                                           : // fall thru
+  case Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer
+                                           : entry_point = generate_CRC32C_updateBytes_entry(kind); break;
 #if defined(TARGET_ARCH_x86) && !defined(_LP64)
   // On x86_32 platforms, a special entry is generated for the following four methods.
   // On other platforms the normal entry is used to enter these methods.
   case Interpreter::java_lang_Float_intBitsToFloat
-                                           : entry_point = generate_Float_intBitsToFloat_entry(); break;
+                                           : native = true; entry_point = generate_Float_intBitsToFloat_entry(); break;
   case Interpreter::java_lang_Float_floatToRawIntBits
-                                           : entry_point = generate_Float_floatToRawIntBits_entry(); break;
+                                           : native = true; entry_point = generate_Float_floatToRawIntBits_entry(); break;
   case Interpreter::java_lang_Double_longBitsToDouble
-                                           : entry_point = generate_Double_longBitsToDouble_entry(); break;
+                                           : native = true; entry_point = generate_Double_longBitsToDouble_entry(); break;
   case Interpreter::java_lang_Double_doubleToRawLongBits
-                                           : entry_point = generate_Double_doubleToRawLongBits_entry(); break;
+                                           : native = true; entry_point = generate_Double_doubleToRawLongBits_entry(); break;
 #else
   case Interpreter::java_lang_Float_intBitsToFloat:
   case Interpreter::java_lang_Float_floatToRawIntBits:
   case Interpreter::java_lang_Double_longBitsToDouble:
   case Interpreter::java_lang_Double_doubleToRawLongBits:
-    entry_point = generate_native_entry(false);
+    native = true;
     break;
 #endif // defined(TARGET_ARCH_x86) && !defined(_LP64)
 #endif // CC_INTERP
@@ -596,5 +613,18 @@
     return entry_point;
   }
 
-  return generate_normal_entry(synchronized);
+  // We expect the normal and native entry points to be generated first so we can reuse them.
+  if (native) {
+    entry_point = Interpreter::entry_for_kind(synchronized ? Interpreter::native_synchronized : Interpreter::native);
+    if (entry_point == NULL) {
+      entry_point = generate_native_entry(synchronized);
+    }
+  } else {
+    entry_point = Interpreter::entry_for_kind(synchronized ? Interpreter::zerolocals_synchronized : Interpreter::zerolocals);
+    if (entry_point == NULL) {
+      entry_point = generate_normal_entry(synchronized);
+    }
+  }
+
+  return entry_point;
 }
--- a/src/share/vm/interpreter/oopMapCache.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/interpreter/oopMapCache.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -213,31 +213,6 @@
   }
 }
 
-
-#ifdef ENABLE_ZAP_DEAD_LOCALS
-
-void InterpreterOopMap::iterate_all(OffsetClosure* oop_closure, OffsetClosure* value_closure, OffsetClosure* dead_closure) {
-  int n = number_of_entries();
-  int word_index = 0;
-  uintptr_t value = 0;
-  uintptr_t mask = 0;
-  // iterate over entries
-  for (int i = 0; i < n; i++, mask <<= bits_per_entry) {
-    // get current word
-    if (mask == 0) {
-      value = bit_mask()[word_index++];
-      mask = 1;
-    }
-    // test for dead values  & oops, and for live values
-         if ((value & (mask << dead_bit_number)) != 0)  dead_closure->offset_do(i); // call this for all dead values or oops
-    else if ((value & (mask <<  oop_bit_number)) != 0)   oop_closure->offset_do(i); // call this for all live oops
-    else                                               value_closure->offset_do(i); // call this for all live values
-  }
-}
-
-#endif
-
-
 void InterpreterOopMap::print() const {
   int n = number_of_entries();
   tty->print("oop map for ");
@@ -297,12 +272,6 @@
     bool v2 = vars[i].is_reference()  ? true : false;
     assert(v1 == v2, "locals oop mask generation error");
     if (TraceOopMapGeneration && Verbose) tty->print("%d", v1 ? 1 : 0);
-#ifdef ENABLE_ZAP_DEAD_LOCALS
-    bool v3 = is_dead(i)              ? true : false;
-    bool v4 = !vars[i].is_live()      ? true : false;
-    assert(v3 == v4, "locals live mask generation error");
-    assert(!(v1 && v3), "dead value marked as oop");
-#endif
   }
 
   if (TraceOopMapGeneration && Verbose) { tty->cr(); tty->print("Stack (%d): ", stack_top); }
@@ -311,12 +280,6 @@
     bool v2 = stack[j].is_reference() ? true : false;
     assert(v1 == v2, "stack oop mask generation error");
     if (TraceOopMapGeneration && Verbose) tty->print("%d", v1 ? 1 : 0);
-#ifdef ENABLE_ZAP_DEAD_LOCALS
-    bool v3 = is_dead(max_locals + j) ? true : false;
-    bool v4 = !stack[j].is_live()     ? true : false;
-    assert(v3 == v4, "stack live mask generation error");
-    assert(!(v1 && v3), "dead value marked as oop");
-#endif
   }
   if (TraceOopMapGeneration && Verbose) tty->cr();
   return true;
--- a/src/share/vm/interpreter/oopMapCache.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/interpreter/oopMapCache.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -141,9 +141,6 @@
 
   int expression_stack_size() const              { return _expression_stack_size; }
 
-#ifdef ENABLE_ZAP_DEAD_LOCALS
-  void iterate_all(OffsetClosure* oop_closure, OffsetClosure* value_closure, OffsetClosure* dead_closure);
-#endif
 };
 
 class OopMapCache : public CHeapObj<mtClass> {
--- a/src/share/vm/interpreter/templateInterpreter.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/interpreter/templateInterpreter.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -412,17 +412,6 @@
       method_entry(java_lang_math_pow  )
       method_entry(java_lang_ref_reference_get)
 
-      if (UseCRC32Intrinsics) {
-        method_entry(java_util_zip_CRC32_update)
-        method_entry(java_util_zip_CRC32_updateBytes)
-        method_entry(java_util_zip_CRC32_updateByteBuffer)
-      }
-
-      method_entry(java_lang_Float_intBitsToFloat);
-      method_entry(java_lang_Float_floatToRawIntBits);
-      method_entry(java_lang_Double_longBitsToDouble);
-      method_entry(java_lang_Double_doubleToRawLongBits);
-
       initialize_method_handle_entries();
 
       // all native method kinds (must be one contiguous block)
@@ -431,6 +420,22 @@
       method_entry(native_synchronized)
       Interpreter::_native_entry_end = Interpreter::code()->code_end();
 
+      if (UseCRC32Intrinsics) {
+        method_entry(java_util_zip_CRC32_update)
+        method_entry(java_util_zip_CRC32_updateBytes)
+        method_entry(java_util_zip_CRC32_updateByteBuffer)
+      }
+
+      if (UseCRC32CIntrinsics) {
+        method_entry(java_util_zip_CRC32C_updateBytes)
+        method_entry(java_util_zip_CRC32C_updateDirectByteBuffer)
+      }
+
+      method_entry(java_lang_Float_intBitsToFloat);
+      method_entry(java_lang_Float_floatToRawIntBits);
+      method_entry(java_lang_Double_longBitsToDouble);
+      method_entry(java_lang_Double_doubleToRawLongBits);
+
 #undef method_entry
 
       // Bytecodes
--- a/src/share/vm/opto/block.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/block.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -358,6 +358,8 @@
 PhaseCFG::PhaseCFG(Arena* arena, RootNode* root, Matcher& matcher)
 : Phase(CFG)
 , _block_arena(arena)
+, _regalloc(NULL)
+, _scheduling_for_pressure(false)
 , _root(root)
 , _matcher(matcher)
 , _node_to_block_mapping(arena)
--- a/src/share/vm/opto/block.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/block.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -37,6 +37,7 @@
 class Matcher;
 class RootNode;
 class VectorSet;
+class PhaseChaitin;
 struct Tarjan;
 
 //------------------------------Block_Array------------------------------------
@@ -383,6 +384,12 @@
   // Arena for the blocks to be stored in
   Arena* _block_arena;
 
+  // Info used for scheduling
+  PhaseChaitin* _regalloc;
+
+  // Register pressure heuristic used?
+  bool _scheduling_for_pressure;
+
   // The matcher for this compilation
   Matcher& _matcher;
 
@@ -433,12 +440,14 @@
   // to late. Helper for schedule_late.
   Block* hoist_to_cheaper_block(Block* LCA, Block* early, Node* self);
 
-  bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call);
+  bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t* recacl_pressure_nodes);
   void set_next_call(Block* block, Node* n, VectorSet& next_call);
   void needed_for_next_call(Block* block, Node* this_call, VectorSet& next_call);
 
   // Perform basic-block local scheduling
-  Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot);
+  Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot,
+               intptr_t* recacl_pressure_nodes);
+  void adjust_register_pressure(Node* n, Block* block, intptr_t *recalc_pressure_nodes, bool finalize_mode);
 
   // Schedule a call next in the block
   uint sched_call(Block* block, uint node_cnt, Node_List& worklist, GrowableArray<int>& ready_cnt, MachCallNode* mcall, VectorSet& next_call);
--- a/src/share/vm/opto/bytecodeInfo.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/bytecodeInfo.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -114,7 +114,7 @@
       CompileTask::print_inline_indent(inline_level());
       tty->print_cr("Inlined method is hot: ");
     }
-    set_msg("force inline by CompilerOracle");
+    set_msg("force inline by CompileCommand");
     _forced_inline = true;
     return true;
   }
@@ -223,12 +223,12 @@
 
   // ignore heuristic controls on inlining
   if (callee_method->should_inline()) {
-    set_msg("force inline by CompilerOracle");
+    set_msg("force inline by CompileCommand");
     return false;
   }
 
   if (callee_method->should_not_inline()) {
-    set_msg("disallowed by CompilerOracle");
+    set_msg("disallowed by CompileCommand");
     return true;
   }
 
@@ -470,11 +470,6 @@
       }
     }
   }
-  // We will attempt to see if a class/field/etc got properly loaded.  If it
-  // did not, it may attempt to throw an exception during our probing.  Catch
-  // and ignore such exceptions and do not attempt to compile the method.
-  if( callee_method->should_exclude() )  return false;
-
   return true;
 }
 
--- a/src/share/vm/opto/c2_globals.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/c2_globals.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -69,22 +69,6 @@
   develop(bool, StressGCM, false,                                           \
           "Randomize instruction scheduling in GCM")                        \
                                                                             \
-  notproduct(intx, CompileZapFirst, 0,                                      \
-          "If +ZapDeadCompiledLocals, "                                     \
-          "skip this many before compiling in zap calls")                   \
-                                                                            \
-  notproduct(intx, CompileZapLast, -1,                                      \
-          "If +ZapDeadCompiledLocals, "                                     \
-          "compile this many after skipping (incl. skip count, -1 = all)")  \
-                                                                            \
-  notproduct(intx, ZapDeadCompiledLocalsFirst, 0,                           \
-          "If +ZapDeadCompiledLocals, "                                     \
-          "skip this many before really doing it")                          \
-                                                                            \
-  notproduct(intx, ZapDeadCompiledLocalsLast, -1,                           \
-          "If +ZapDeadCompiledLocals, "                                     \
-          "do this many after skipping (incl. skip count, -1 = all)")       \
-                                                                            \
   develop(intx, OptoPrologueNops, 0,                                        \
           "Insert this many extra nop instructions "                        \
           "in the prologue of every nmethod")                               \
@@ -306,6 +290,9 @@
   product_pd(bool, OptoScheduling,                                          \
           "Instruction Scheduling after register allocation")               \
                                                                             \
+  product_pd(bool, OptoRegScheduling,                                       \
+          "Instruction Scheduling before register allocation for pressure") \
+                                                                            \
   product(bool, PartialPeelLoop, true,                                      \
           "Partial peel (rotate) loops")                                    \
                                                                             \
--- a/src/share/vm/opto/callnode.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/callnode.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -907,6 +907,18 @@
 
   // Convenience for initialization->maybe_set_complete(phase)
   bool maybe_set_complete(PhaseGVN* phase);
+
+  // Return true if allocation doesn't escape thread, its escape state
+  // needs to be noEscape or ArgEscape. InitializeNode._does_not_escape
+  // is true when its allocation's escape state is noEscape or
+  // ArgEscape. In case allocation's InitializeNode is NULL, check
+  // AllocateNode._is_non_escaping flag.
+  // AllocateNode._is_non_escaping is true when its escape state is
+  // noEscape.
+  bool does_not_escape_thread() {
+    InitializeNode* init = NULL;
+    return _is_non_escaping || (((init = initialization()) != NULL) && init->does_not_escape());
+  }
 };
 
 //------------------------------AllocateArray---------------------------------
--- a/src/share/vm/opto/chaitin.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/chaitin.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -191,7 +191,7 @@
   return next;
 }
 
-PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher)
+PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool scheduling_info_generated)
   : PhaseRegAlloc(unique, cfg, matcher,
 #ifndef PRODUCT
        print_chaitin_statistics
@@ -205,6 +205,11 @@
   , _spilled_twice(Thread::current()->resource_area())
   , _lo_degree(0), _lo_stk_degree(0), _hi_degree(0), _simplified(0)
   , _oldphi(unique)
+  , _scheduling_info_generated(scheduling_info_generated)
+  , _sched_int_pressure(0, INTPRESSURE)
+  , _sched_float_pressure(0, FLOATPRESSURE)
+  , _scratch_int_pressure(0, INTPRESSURE)
+  , _scratch_float_pressure(0, FLOATPRESSURE)
 #ifndef PRODUCT
   , _trace_spilling(TraceSpilling || C->method_has_option("TraceSpilling"))
 #endif
@@ -350,7 +355,7 @@
   // all copy-related live ranges low and then using the max copy-related
   // live range as a cut-off for LIVE and the IFG.  In other words, I can
   // build a subset of LIVE and IFG just for copies.
-  PhaseLive live(_cfg, _lrg_map.names(), &live_arena);
+  PhaseLive live(_cfg, _lrg_map.names(), &live_arena, false);
 
   // Need IFG for coalescing and coloring
   PhaseIFG ifg(&live_arena);
@@ -690,6 +695,29 @@
   _lrg_map.reset_uf_map(lr_counter);
 }
 
+void PhaseChaitin::mark_ssa() {
+  // Use ssa names to populate the live range maps or if no mask
+  // is available, use the 0 entry.
+  uint max_idx = 0;
+  for ( uint i = 0; i < _cfg.number_of_blocks(); i++ ) {
+    Block* block = _cfg.get_block(i);
+    uint cnt = block->number_of_nodes();
+
+    // Handle all the normal Nodes in the block
+    for ( uint j = 0; j < cnt; j++ ) {
+      Node *n = block->get_node(j);
+      // Pre-color to the zero live range, or pick virtual register
+      const RegMask &rm = n->out_RegMask();
+      _lrg_map.map(n->_idx, rm.is_NotEmpty() ? n->_idx : 0);
+      max_idx = (n->_idx > max_idx) ? n->_idx : max_idx;
+    }
+  }
+  _lrg_map.set_max_lrg_id(max_idx+1);
+
+  // Reset the Union-Find mapping to be identity
+  _lrg_map.reset_uf_map(max_idx+1);
+}
+
 
 // Gather LiveRanGe information, including register masks.  Modification of
 // cisc spillable in_RegMasks should not be done before AggressiveCoalesce.
@@ -707,7 +735,9 @@
     for (uint j = 1; j < block->number_of_nodes(); j++) {
       Node* n = block->get_node(j);
       uint input_edge_start =1; // Skip control most nodes
+      bool is_machine_node = false;
       if (n->is_Mach()) {
+        is_machine_node = true;
         input_edge_start = n->as_Mach()->oper_input_base();
       }
       uint idx = n->is_Copy();
@@ -929,6 +959,7 @@
           // Convert operand number to edge index number
           inp = n->as_Mach()->operand_index(inp);
       }
+
       // Prepare register mask for each input
       for( uint k = input_edge_start; k < cnt; k++ ) {
         uint vreg = _lrg_map.live_range_id(n->in(k));
@@ -948,6 +979,12 @@
           n->as_Mach()->use_cisc_RegMask();
         }
 
+        if (is_machine_node && _scheduling_info_generated) {
+          MachNode* cur_node = n->as_Mach();
+          // this is cleaned up by register allocation
+          if (k >= cur_node->num_opnds()) continue;
+        }
+
         LRG &lrg = lrgs(vreg);
         // // Testing for floating point code shape
         // Node *test = n->in(k);
@@ -989,7 +1026,7 @@
         // double can interfere with TWO aligned pairs, or effectively
         // FOUR registers!
 #ifdef ASSERT
-        if (is_vect) {
+        if (is_vect && !_scheduling_info_generated) {
           if (lrg.num_regs() != 0) {
             assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
             assert(!lrg._fat_proj, "sanity");
--- a/src/share/vm/opto/chaitin.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/chaitin.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -399,7 +399,6 @@
   int _trip_cnt;
   int _alternate;
 
-  LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
   PhaseLive *_live;             // Liveness, used in the interference graph
   PhaseIFG *_ifg;               // Interference graph (for original chunk)
   Node_List **_lrg_nodes;       // Array of node; lists for lrgs which spill
@@ -464,16 +463,28 @@
 #endif
 
 public:
-  PhaseChaitin( uint unique, PhaseCFG &cfg, Matcher &matcher );
+  PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool track_liveout_pressure);
   ~PhaseChaitin() {}
 
   LiveRangeMap _lrg_map;
 
+  LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
+
   // Do all the real work of allocate
   void Register_Allocate();
 
   float high_frequency_lrg() const { return _high_frequency_lrg; }
 
+  // Used when scheduling info generated, not in general register allocation
+  bool _scheduling_info_generated;
+
+  void set_ifg(PhaseIFG &ifg) { _ifg = &ifg;  }
+  void set_live(PhaseLive &live) { _live = &live; }
+  PhaseLive* get_live() { return _live; }
+
+  // Populate the live range maps with ssa info for scheduling
+  void mark_ssa();
+
 #ifndef PRODUCT
   bool trace_spilling() const { return _trace_spilling; }
 #endif
@@ -516,7 +527,11 @@
       uint _final_pressure;
 
       // number of live ranges that constitute high register pressure
-      const uint _high_pressure_limit;
+      uint _high_pressure_limit;
+
+      // initial pressure observed
+      uint _start_pressure;
+
     public:
 
       // lower the register pressure and look for a low to high pressure
@@ -537,6 +552,14 @@
         }
       }
 
+      void init(int limit) {
+        _current_pressure = 0;
+        _high_pressure_index = 0;
+        _final_pressure = 0;
+        _high_pressure_limit = limit;
+        _start_pressure = 0;
+      }
+
       uint high_pressure_index() const {
         return _high_pressure_index;
       }
@@ -545,6 +568,10 @@
         return _final_pressure;
       }
 
+      uint start_pressure() const {
+        return _start_pressure;
+      }
+
       uint current_pressure() const {
         return _current_pressure;
       }
@@ -561,6 +588,15 @@
         _high_pressure_index = 0;
       }
 
+      void set_start_pressure(int value) {
+        _start_pressure = value;
+        _final_pressure = value;
+      }
+
+      void set_current_pressure(int value) {
+        _current_pressure = value;
+      }
+
       void check_pressure_at_fatproj(uint fatproj_location, RegMask& fatproj_mask) {
         // this pressure is only valid at this instruction, i.e. we don't need to lower
         // the register pressure since the fat proj was never live before (going backwards)
@@ -577,14 +613,13 @@
       }
 
       Pressure(uint high_pressure_index, uint high_pressure_limit)
-      : _current_pressure(0)
-      , _high_pressure_index(high_pressure_index)
-      , _high_pressure_limit(high_pressure_limit)
-      , _final_pressure(0) {}
+        : _current_pressure(0)
+        , _high_pressure_index(high_pressure_index)
+        , _final_pressure(0)
+        , _high_pressure_limit(high_pressure_limit)
+        , _start_pressure(0) {}
   };
 
-  void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
-  void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
   void check_for_high_pressure_transition_at_fatproj(uint& block_reg_pressure, uint location, LRG& lrg, Pressure& pressure, const int op_regtype);
   void add_input_to_liveout(Block* b, Node* n, IndexSet* liveout, double cost, Pressure& int_pressure, Pressure& float_pressure);
   void compute_initial_block_pressure(Block* b, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure, double cost);
@@ -600,10 +635,25 @@
   // acceptable register sets do not overlap, then they do not interfere.
   uint build_ifg_physical( ResourceArea *a );
 
+public:
   // Gather LiveRanGe information, including register masks and base pointer/
   // derived pointer relationships.
   void gather_lrg_masks( bool mod_cisc_masks );
 
+  // user visible pressure variables for scheduling
+  Pressure _sched_int_pressure;
+  Pressure _sched_float_pressure;
+  Pressure _scratch_int_pressure;
+  Pressure _scratch_float_pressure;
+
+  // Pressure functions for user context
+  void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
+  void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
+  void compute_entry_block_pressure(Block* b);
+  void compute_exit_block_pressure(Block* b);
+  void print_pressure_info(Pressure& pressure, const char *str);
+
+private:
   // Force the bases of derived pointers to be alive at GC points.
   bool stretch_base_pointer_live_ranges( ResourceArea *a );
   // Helper to stretch above; recursively discover the base Node for
--- a/src/share/vm/opto/classes.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/classes.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -131,7 +131,6 @@
 macro(EncodeISOArray)
 macro(EncodeP)
 macro(EncodePKlass)
-macro(ExpD)
 macro(FastLock)
 macro(FastUnlock)
 macro(Goto)
@@ -290,6 +289,10 @@
 macro(MulReductionVD)
 macro(DivVF)
 macro(DivVD)
+macro(AbsVF)
+macro(AbsVD)
+macro(NegVF)
+macro(NegVD)
 macro(SqrtVD)
 macro(LShiftCntV)
 macro(RShiftCntV)
--- a/src/share/vm/opto/compile.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/compile.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -2336,7 +2336,7 @@
     debug_only( cfg.verify(); )
   }
 
-  PhaseChaitin regalloc(unique(), cfg, matcher);
+  PhaseChaitin regalloc(unique(), cfg, matcher, false);
   _regalloc = &regalloc;
   {
     TracePhase tp("regalloc", &timers[_t_registerAllocation]);
--- a/src/share/vm/opto/compile.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/compile.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -1208,12 +1208,6 @@
   // Compute the name of old_SP.  See <arch>.ad for frame layout.
   OptoReg::Name compute_old_SP();
 
-#ifdef ENABLE_ZAP_DEAD_LOCALS
-  static bool is_node_getting_a_safepoint(Node*);
-  void Insert_zap_nodes();
-  Node* call_zap_node(MachSafePointNode* n, int block_no);
-#endif
-
  private:
   // Phase control:
   void Init(int aliaslevel);                     // Prepare for a single compilation
--- a/src/share/vm/opto/gcm.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/gcm.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -34,6 +34,7 @@
 #include "opto/phaseX.hpp"
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
+#include "opto/chaitin.hpp"
 #include "runtime/deoptimization.hpp"
 
 // Portions of code courtesy of Clifford Click
@@ -1363,6 +1364,44 @@
     }
   }
 
+  bool block_size_threshold_ok = false;
+  intptr_t *recalc_pressure_nodes = NULL;
+  if (OptoRegScheduling) {
+    for (uint i = 0; i < number_of_blocks(); i++) {
+      Block* block = get_block(i);
+      if (block->number_of_nodes() > 10) {
+        block_size_threshold_ok = true;
+        break;
+      }
+    }
+  }
+
+  // Enabling the scheduler for register pressure plus finding blocks of size to schedule for it
+  // is key to enabling this feature.
+  PhaseChaitin regalloc(C->unique(), *this, _matcher, true);
+  ResourceArea live_arena;      // Arena for liveness
+  ResourceMark rm_live(&live_arena);
+  PhaseLive live(*this, regalloc._lrg_map.names(), &live_arena, true);
+  PhaseIFG ifg(&live_arena);
+  if (OptoRegScheduling && block_size_threshold_ok) {
+    regalloc.mark_ssa();
+    Compile::TracePhase tp("computeLive", &timers[_t_computeLive]);
+    rm_live.reset_to_mark();           // Reclaim working storage
+    IndexSet::reset_memory(C, &live_arena);
+    uint node_size = regalloc._lrg_map.max_lrg_id();
+    ifg.init(node_size); // Empty IFG
+    regalloc.set_ifg(ifg);
+    regalloc.set_live(live);
+    regalloc.gather_lrg_masks(false);    // Collect LRG masks
+    live.compute(node_size); // Compute liveness
+
+    recalc_pressure_nodes = NEW_RESOURCE_ARRAY(intptr_t, node_size);
+    for (uint i = 0; i < node_size; i++) {
+      recalc_pressure_nodes[i] = 0;
+    }
+  }
+  _regalloc = &regalloc;
+
 #ifndef PRODUCT
   if (trace_opto_pipelining()) {
     tty->print("\n---- Start Local Scheduling ----\n");
@@ -1375,13 +1414,15 @@
   visited.Clear();
   for (uint i = 0; i < number_of_blocks(); i++) {
     Block* block = get_block(i);
-    if (!schedule_local(block, ready_cnt, visited)) {
+    if (!schedule_local(block, ready_cnt, visited, recalc_pressure_nodes)) {
       if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
         C->record_method_not_compilable("local schedule failed");
       }
+      _regalloc = NULL;
       return;
     }
   }
+  _regalloc = NULL;
 
   // If we inserted any instructions between a Call and his CatchNode,
   // clone the instructions on all paths below the Catch.
--- a/src/share/vm/opto/ifg.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/ifg.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -439,8 +439,10 @@
       }
     }
   }
-  assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect");
-  assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
+  if (_scheduling_info_generated == false) {
+    assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect");
+    assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
+  }
 }
 
 /* Go to the first non-phi index in a block */
@@ -518,6 +520,58 @@
 }
 
 /*
+* Computes the entry register pressure of a block, looking at all live
+* ranges in the livein. The register pressure is computed for both float
+* and int/pointer registers.
+*/
+void PhaseChaitin::compute_entry_block_pressure(Block* b) {
+  IndexSet* livein = _live->livein(b);
+  IndexSetIterator elements(livein);
+  uint lid = elements.next();
+  while (lid != 0) {
+    LRG& lrg = lrgs(lid);
+    raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
+    lid = elements.next();
+  }
+  // Now check phis for locally defined inputs
+  for (uint j = 0; j < b->number_of_nodes(); j++) {
+    Node* n = b->get_node(j);
+    if (n->is_Phi()) {
+      for (uint k = 1; k < n->req(); k++) {
+        Node* phi_in = n->in(k);
+        // Because we are talking about phis, raise register pressure once for each
+        // instance of a phi to account for a single value
+        if (_cfg.get_block_for_node(phi_in) == b) {
+          LRG& lrg = lrgs(phi_in->_idx);
+          raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
+          break;
+        }
+      }
+    }
+  }
+  _sched_int_pressure.set_start_pressure(_sched_int_pressure.current_pressure());
+  _sched_float_pressure.set_start_pressure(_sched_float_pressure.current_pressure());
+}
+
+/*
+* Computes the exit register pressure of a block, looking at all live
+* ranges in the liveout. The register pressure is computed for both float
+* and int/pointer registers.
+*/
+void PhaseChaitin::compute_exit_block_pressure(Block* b) {
+  IndexSet* livein = _live->live(b);
+  IndexSetIterator elements(livein);
+  _sched_int_pressure.set_current_pressure(0);
+  _sched_float_pressure.set_current_pressure(0);
+  uint lid = elements.next();
+  while (lid != 0) {
+    LRG& lrg = lrgs(lid);
+    raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
+    lid = elements.next();
+  }
+}
+
+/*
  * Remove dead node if it's not used.
  * We only remove projection nodes if the node "defining" the projection is
  * dead, for example on x86, if we have a dead Add node we remove its
@@ -737,6 +791,16 @@
   block_hrp_index = i;
 }
 
+void PhaseChaitin::print_pressure_info(Pressure& pressure, const char *str) {
+  if (str != NULL) {
+    tty->print_cr("#  *** %s ***", str);
+  }
+  tty->print_cr("#     start pressure is = %d", pressure.start_pressure());
+  tty->print_cr("#     max pressure is = %d", pressure.final_pressure());
+  tty->print_cr("#     end pressure is = %d", pressure.current_pressure());
+  tty->print_cr("#");
+}
+
 /* Build an interference graph:
  *   That is, if 2 live ranges are simultaneously alive but in their acceptable
  *   register sets do not overlap, then they do not interfere. The IFG is built
--- a/src/share/vm/opto/lcm.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/lcm.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -31,6 +31,7 @@
 #include "opto/cfgnode.hpp"
 #include "opto/machnode.hpp"
 #include "opto/runtime.hpp"
+#include "opto/chaitin.hpp"
 #include "runtime/sharedRuntime.hpp"
 
 // Optimization - Graph Style
@@ -443,7 +444,13 @@
 // remaining cases (most), choose the instruction with the greatest latency
 // (that is, the most number of pseudo-cycles required to the end of the
 // routine). If there is a tie, choose the instruction with the most inputs.
-Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &ready_cnt, VectorSet &next_call, uint sched_slot) {
+Node* PhaseCFG::select(
+  Block* block,
+  Node_List &worklist,
+  GrowableArray<int> &ready_cnt,
+  VectorSet &next_call,
+  uint sched_slot,
+  intptr_t* recalc_pressure_nodes) {
 
   // If only a single entry on the stack, use it
   uint cnt = worklist.size();
@@ -458,6 +465,7 @@
   uint score   = 0; // Bigger is better
   int idx = -1;     // Index in worklist
   int cand_cnt = 0; // Candidate count
+  bool block_size_threshold_ok = (block->number_of_nodes() > 10) ? true : false;
 
   for( uint i=0; i<cnt; i++ ) { // Inspect entire worklist
     // Order in worklist is used to break ties.
@@ -537,7 +545,47 @@
     }
 
     uint n_latency = get_latency_for_node(n);
-    uint n_score   = n->req();   // Many inputs get high score to break ties
+    uint n_score = n->req();   // Many inputs get high score to break ties
+
+    if (OptoRegScheduling && block_size_threshold_ok) {
+      if (recalc_pressure_nodes[n->_idx] == 0x7fff7fff) {
+        _regalloc->_scratch_int_pressure.init(_regalloc->_sched_int_pressure.high_pressure_limit());
+        _regalloc->_scratch_float_pressure.init(_regalloc->_sched_float_pressure.high_pressure_limit());
+        // simulate the notion that we just picked this node to schedule
+        n->add_flag(Node::Flag_is_scheduled);
+        // now calculate its effect upon the graph if we did
+        adjust_register_pressure(n, block, recalc_pressure_nodes, false);
+        // return its state for finalize in case somebody else wins
+        n->remove_flag(Node::Flag_is_scheduled);
+        // now save the two final pressure components of register pressure, limiting pressure calcs to short size
+        short int_pressure = (short)_regalloc->_scratch_int_pressure.current_pressure();
+        short float_pressure = (short)_regalloc->_scratch_float_pressure.current_pressure();
+        recalc_pressure_nodes[n->_idx] = int_pressure;
+        recalc_pressure_nodes[n->_idx] |= (float_pressure << 16);
+      }
+
+      if (_scheduling_for_pressure) {
+        latency = n_latency;
+        if (n_choice != 3) {
+          // Now evaluate each register pressure component based on threshold in the score.
+          // In general the defining register type will dominate the score, ergo we will not see register pressure grow on both banks
+          // on a single instruction, but we might see it shrink on both banks.
+          // For each use of register that has a register class that is over the high pressure limit, we build n_score up for
+          // live ranges that terminate on this instruction.
+          if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
+            short int_pressure = (short)recalc_pressure_nodes[n->_idx];
+            n_score = (int_pressure < 0) ? ((score + n_score) - int_pressure) : (int_pressure > 0) ? 1 : n_score;
+          }
+          if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
+            short float_pressure = (short)(recalc_pressure_nodes[n->_idx] >> 16);
+            n_score = (float_pressure < 0) ? ((score + n_score) - float_pressure) : (float_pressure > 0) ? 1 : n_score;
+          }
+        } else {
+          // make sure we choose these candidates
+          score = 0;
+        }
+      }
+    }
 
     // Keep best latency found
     cand_cnt++;
@@ -562,6 +610,100 @@
   return n;
 }
 
+//-------------------------adjust_register_pressure----------------------------
+void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_pressure_nodes, bool finalize_mode) { // Apply the register pressure effect of scheduling n in block
+  PhaseLive* liveinfo = _regalloc->get_live();
+  IndexSet* liveout = liveinfo->live(block);
+  // First lower the pressure for every input of n whose live range ends at n (finalize_mode updates the _sched_* trackers, otherwise the _scratch_* trackers)
+  for (uint i = 1; i < n->req(); i++) {
+    bool lrg_ends = false;
+    Node *src_n = n->in(i);
+    if (src_n == NULL) continue;
+    if (!src_n->is_Mach()) continue;
+    uint src = _regalloc->_lrg_map.find(src_n);
+    if (src == 0) continue;
+    LRG& lrg_src = _regalloc->lrgs(src);
+    // The live range can only end here if it is not in the block's live-out set; then inspect the other users of src_n
+    if (liveout->member(src) == false) {
+      lrg_ends = true;
+      for (DUIterator_Fast jmax, j = src_n->fast_outs(jmax); j < jmax; j++) {
+        Node* m = src_n->fast_out(j); // Get user
+        if (m == n) continue;
+        if (!m->is_Mach()) continue;
+        MachNode *mach = m->as_Mach();
+        bool src_matches = false;
+        int iop = mach->ideal_Opcode();
+
+        switch (iop) {
+        case Op_StoreB:
+        case Op_StoreC:
+        case Op_StoreCM:
+        case Op_StoreD:
+        case Op_StoreF:
+        case Op_StoreI:
+        case Op_StoreL:
+        case Op_StoreP:
+        case Op_StoreN:
+        case Op_StoreVector:
+        case Op_StoreNKlass:
+          for (uint k = 1; k < m->req(); k++) {
+            Node *in = m->in(k);
+            if (in == src_n) {
+              src_matches = true;
+              break;
+            }
+          }
+          break;
+
+        default:
+          src_matches = true;
+          break;
+        }
+
+        // A store user only counts when src_n is one of its required inputs 1..req()-1; otherwise skip this user
+        if (src_matches == false) continue;
+
+        // An unscheduled user in this block keeps the live range alive; in finalize mode that user (unless a Phi) is also flagged for pressure recalculation
+        if ((get_block_for_node(m) == block) && (!m->is_scheduled())) {
+          if (finalize_mode && !m->is_Phi()) {
+            recalc_pressure_nodes[m->_idx] = 0x7fff7fff;
+          }
+          lrg_ends = false;
+        }
+      }
+    }
+    // No unscheduled user remains in this block: the live range ends at n, so lower the pressure
+    if (lrg_ends) {
+      if (finalize_mode) {
+        _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
+      } else {
+        _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
+      }
+    }
+  }
+
+  // Now add the register pressure from the dest and, in finalize mode, evaluate which heuristic we should use:
+  // 1.) The default, latency scheduling
+  // 2.) Register pressure scheduling based on the high pressure limit threshold for int or float register stacks
+  uint dst = _regalloc->_lrg_map.find(n);
+  if (dst != 0) {
+    LRG& lrg_dst = _regalloc->lrgs(dst);
+    if (finalize_mode) {
+      _regalloc->raise_pressure(block, lrg_dst, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
+      // Check whether either bank is now above its high pressure limit; if so switch to pressure scheduling
+      if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
+        _scheduling_for_pressure = true;
+      } else if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
+        _scheduling_for_pressure = true;
+      } else {
+        // restore latency scheduling mode
+        _scheduling_for_pressure = false;
+      }
+    } else {
+      _regalloc->raise_pressure(block, lrg_dst, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
+    }
+  }
+}
 
 //------------------------------set_next_call----------------------------------
 void PhaseCFG::set_next_call(Block* block, Node* n, VectorSet& next_call) {
@@ -644,7 +786,7 @@
         continue;
       }
       if( m->is_Phi() ) continue;
-      int m_cnt = ready_cnt.at(m->_idx)-1;
+      int m_cnt = ready_cnt.at(m->_idx) - 1;
       ready_cnt.at_put(m->_idx, m_cnt);
       if( m_cnt == 0 )
         worklist.push(m);
@@ -711,7 +853,7 @@
 
 //------------------------------schedule_local---------------------------------
 // Topological sort within a block.  Someday become a real scheduler.
-bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call) {
+bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t *recalc_pressure_nodes) {
   // Already "sorted" are the block start Node (as the first entry), and
   // the block-ending Node and any trailing control projections.  We leave
   // these alone.  PhiNodes and ParmNodes are made to follow the block start
@@ -733,10 +875,24 @@
     return true;
   }
 
+  bool block_size_threshold_ok = (block->number_of_nodes() > 10) ? true : false;
+
+  // We track the uses of local definitions as input dependences so that
+  // we know when a given instruction is available to be scheduled.
+  uint i;
+  if (OptoRegScheduling && block_size_threshold_ok) {
+    for (i = 1; i < block->number_of_nodes(); i++) { // setup nodes for pressure calc
+      Node *n = block->get_node(i);
+      n->remove_flag(Node::Flag_is_scheduled);
+      if (!n->is_Phi()) {
+        recalc_pressure_nodes[n->_idx] = 0x7fff7fff;
+      }
+    }
+  }
+
   // Move PhiNodes and ParmNodes from 1 to cnt up to the start
   uint node_cnt = block->end_idx();
   uint phi_cnt = 1;
-  uint i;
   for( i = 1; i<node_cnt; i++ ) { // Scan for Phi
     Node *n = block->get_node(i);
     if( n->is_Phi() ||          // Found a PhiNode or ParmNode
@@ -744,6 +900,10 @@
       // Move guy at 'phi_cnt' to the end; makes a hole at phi_cnt
       block->map_node(block->get_node(phi_cnt), i);
       block->map_node(n, phi_cnt++);  // swap Phi/Parm up front
+      if (OptoRegScheduling && block_size_threshold_ok) {
+        // mark n as scheduled
+        n->add_flag(Node::Flag_is_scheduled);
+      }
     } else {                    // All others
       // Count block-local inputs to 'n'
       uint cnt = n->len();      // Input count
@@ -791,12 +951,18 @@
 
   // All the prescheduled guys do not hold back internal nodes
   uint i3;
-  for(i3 = 0; i3<phi_cnt; i3++ ) {  // For all pre-scheduled
+  for (i3 = 0; i3 < phi_cnt; i3++) {  // For all pre-scheduled
     Node *n = block->get_node(i3);       // Get pre-scheduled
     for (DUIterator_Fast jmax, j = n->fast_outs(jmax); j < jmax; j++) {
       Node* m = n->fast_out(j);
       if (get_block_for_node(m) == block) { // Local-block user
         int m_cnt = ready_cnt.at(m->_idx)-1;
+        if (OptoRegScheduling && block_size_threshold_ok) {
+          // mark m as scheduled
+          if (m_cnt < 0) {
+            m->add_flag(Node::Flag_is_scheduled);
+          }
+        }
         ready_cnt.at_put(m->_idx, m_cnt);   // Fix ready count
       }
     }
@@ -827,6 +993,18 @@
     worklist.push(d);
   }
 
+  if (OptoRegScheduling && block_size_threshold_ok) {
+    // To stage register pressure calculations we need to examine the live set variables
+    // breaking them up by register class to compartmentalize the calculations.
+    uint float_pressure = Matcher::float_pressure(FLOATPRESSURE);
+    _regalloc->_sched_int_pressure.init(INTPRESSURE);
+    _regalloc->_sched_float_pressure.init(float_pressure);
+    _regalloc->_scratch_int_pressure.init(INTPRESSURE);
+    _regalloc->_scratch_float_pressure.init(float_pressure);
+
+    _regalloc->compute_entry_block_pressure(block);
+  }
+
   // Warm up the 'next_call' heuristic bits
   needed_for_next_call(block, block->head(), next_call);
 
@@ -858,9 +1036,18 @@
 #endif
 
     // Select and pop a ready guy from worklist
-    Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt);
+    Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt, recalc_pressure_nodes);
     block->map_node(n, phi_cnt++);    // Schedule him next
 
+    if (OptoRegScheduling && block_size_threshold_ok) {
+      n->add_flag(Node::Flag_is_scheduled);
+
+      // Now adjust the register pressure with the node we selected
+      if (!n->is_Phi()) {
+        adjust_register_pressure(n, block, recalc_pressure_nodes, true);
+      }
+    }
+
 #ifndef PRODUCT
     if (trace_opto_pipelining()) {
       tty->print("#    select %d: %s", n->_idx, n->Name());
@@ -906,7 +1093,7 @@
         assert(m->is_MachProj() && n->is_Mach() && n->as_Mach()->has_call(), "unexpected node types");
         continue;
       }
-      int m_cnt = ready_cnt.at(m->_idx)-1;
+      int m_cnt = ready_cnt.at(m->_idx) - 1;
       ready_cnt.at_put(m->_idx, m_cnt);
       if( m_cnt == 0 )
         worklist.push(m);
@@ -925,6 +1112,12 @@
     return false;
   }
 
+  if (OptoRegScheduling && block_size_threshold_ok) {
+    _regalloc->compute_exit_block_pressure(block);
+    block->_reg_pressure = _regalloc->_sched_int_pressure.final_pressure();
+    block->_freg_pressure = _regalloc->_sched_float_pressure.final_pressure();
+  }
+
 #ifndef PRODUCT
   if (trace_opto_pipelining()) {
     tty->print_cr("#");
@@ -933,11 +1126,17 @@
       tty->print("# ");
       block->get_node(i)->fast_dump();
     }
+    tty->print_cr("# ");
+
+    if (OptoRegScheduling && block_size_threshold_ok) {
+      tty->print_cr("# pressure info : %d", block->_pre_order);
+      _regalloc->print_pressure_info(_regalloc->_sched_int_pressure, "int register info");
+      _regalloc->print_pressure_info(_regalloc->_sched_float_pressure, "float register info");
+    }
     tty->cr();
   }
 #endif
 
-
   return true;
 }
 
--- a/src/share/vm/opto/library_call.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/library_call.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -222,7 +222,6 @@
   bool inline_math_negateExactL();
   bool inline_math_subtractExactI(bool is_decrement);
   bool inline_math_subtractExactL(bool is_decrement);
-  bool inline_exp();
   bool inline_pow();
   Node* finish_pow_exp(Node* result, Node* x, Node* y, const TypeFunc* call_type, address funcAddr, const char* funcName);
   bool inline_min_max(vmIntrinsics::ID id);
@@ -1535,20 +1534,6 @@
   }
 }
 
-//------------------------------inline_exp-------------------------------------
-// Inline exp instructions, if possible.  The Intel hardware only misses
-// really odd corner cases (+/- Infinity).  Just uncommon-trap them.
-bool LibraryCallKit::inline_exp() {
-  Node* arg = round_double_node(argument(0));
-  Node* n   = _gvn.transform(new ExpDNode(C, control(), arg));
-
-  n = finish_pow_exp(n, arg, NULL, OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dexp), "EXP");
-  set_result(n);
-
-  C->set_has_split_ifs(true); // Has chance for split-if optimization
-  return true;
-}
-
 //------------------------------inline_pow-------------------------------------
 // Inline power instructions, if possible.
 bool LibraryCallKit::inline_pow() {
@@ -1776,8 +1761,9 @@
   case vmIntrinsics::_dsqrt:  return Matcher::match_rule_supported(Op_SqrtD) ? inline_math(id) : false;
   case vmIntrinsics::_dabs:   return Matcher::has_match_rule(Op_AbsD)   ? inline_math(id) : false;
 
-  case vmIntrinsics::_dexp:   return Matcher::has_match_rule(Op_ExpD)   ? inline_exp()    :
-    runtime_math(OptoRuntime::Math_D_D_Type(),  FN_PTR(SharedRuntime::dexp),  "EXP");
+  case vmIntrinsics::_dexp:
+    return (UseSSE >= 2) ? runtime_math(OptoRuntime::Math_D_D_Type(), StubRoutines::dexp(),  "dexp") :
+    runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dexp),  "EXP");
   case vmIntrinsics::_dpow:   return Matcher::has_match_rule(Op_PowD)   ? inline_pow()    :
     runtime_math(OptoRuntime::Math_DD_D_Type(), FN_PTR(SharedRuntime::dpow),  "POW");
 #undef FN_PTR
--- a/src/share/vm/opto/live.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/live.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -41,7 +41,14 @@
 // block is put on the worklist.
 //   The locally live-in stuff is computed once and added to predecessor
 // live-out sets.  This separate compilation is done in the outer loop below.
-PhaseLive::PhaseLive( const PhaseCFG &cfg, const LRG_List &names, Arena *arena ) : Phase(LIVE), _cfg(cfg), _names(names), _arena(arena), _live(0) {
+PhaseLive::PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas)
+  : Phase(LIVE),
+  _cfg(cfg),
+  _names(names),
+  _arena(arena),
+  _live(0),
+  _livein(0), // live-in sets start out null; they are only allocated when _keep_deltas is set
+  _keep_deltas(keep_deltas) { // when true, live-in information is retained per block
 }
 
 void PhaseLive::compute(uint maxlrg) {
@@ -56,6 +63,13 @@
     _live[i].initialize(_maxlrg);
   }
 
+  if (_keep_deltas) {
+    _livein = (IndexSet*)_arena->Amalloc(sizeof(IndexSet) * _cfg.number_of_blocks());
+    for (i = 0; i < _cfg.number_of_blocks(); i++) {
+      _livein[i].initialize(_maxlrg);
+    }
+  }
+
   // Init the sparse arrays for delta-sets.
   ResourceMark rm;              // Nuke temp storage on exit
 
@@ -124,7 +138,10 @@
 
       // PhiNode uses go in the live-out set of prior blocks.
       for (uint k = i; k > 0; k--) {
-        add_liveout(p, _names.at(block->get_node(k-1)->in(l)->_idx), first_pass);
+        Node *phi = block->get_node(k - 1);
+        if (l < phi->req()) {
+          add_liveout(p, _names.at(phi->in(l)->_idx), first_pass);
+        }
       }
     }
     freeset(block);
@@ -200,8 +217,11 @@
 }
 
 // Free an IndexSet from a block.
-void PhaseLive::freeset( const Block *p ) {
+void PhaseLive::freeset( Block *p ) {
   IndexSet *f = _deltas[p->_pre_order-1];
+  if ( _keep_deltas ) {
+    add_livein(p, f);
+  }
   f->set_next(_free_IndexSet);
   _free_IndexSet = f;           // Drop onto free list
   _deltas[p->_pre_order-1] = NULL;
@@ -249,10 +269,23 @@
   }
 }
 
+// Add every element of the set 'lo' to the live-in set of block 'p' (the _livein entry at index _pre_order-1).
+void PhaseLive::add_livein(Block *p, IndexSet *lo) {
+  IndexSet *livein = &_livein[p->_pre_order-1];
+  IndexSetIterator elements(lo);
+  uint r;
+  while ((r = elements.next()) != 0) { // a zero from next() ends the walk
+    livein->insert(r);         // Add to live-in set
+  }
+}
+
 #ifndef PRODUCT
 // Dump the live-out set for a block
 void PhaseLive::dump( const Block *b ) const {
   tty->print("Block %d: ",b->_pre_order);
+  if ( _keep_deltas ) {
+    tty->print("LiveIn: ");  _livein[b->_pre_order-1].dump();
+  }
   tty->print("LiveOut: ");  _live[b->_pre_order-1].dump();
   uint cnt = b->number_of_nodes();
   for( uint i=0; i<cnt; i++ ) {
--- a/src/share/vm/opto/live.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/live.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -46,7 +46,8 @@
 class PhaseLive : public Phase {
   // Array of Sets of values live at the start of a block.
   // Indexed by block pre-order number.
-  IndexSet *_live;
+  IndexSet *_live; // live out
+  IndexSet *_livein; // live in
 
   // Array of Sets of values defined locally in the block
   // Indexed by block pre-order number.
@@ -62,15 +63,17 @@
   const LRG_List &_names;       // Mapping from Nodes to live ranges
   uint _maxlrg;                 // Largest live-range number
   Arena *_arena;
+  bool _keep_deltas;            // Retain live in information
 
   IndexSet *getset( Block *p );
   IndexSet *getfreeset( );
-  void freeset( const Block *p );
+  void freeset( Block *p );
   void add_liveout( Block *p, uint r, VectorSet &first_pass );
   void add_liveout( Block *p, IndexSet *lo, VectorSet &first_pass );
+  void add_livein( Block *p, IndexSet *lo );
 
 public:
-  PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena);
+  PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas);
   ~PhaseLive() {}
   // Compute liveness info
   void compute(uint maxlrg);
@@ -79,6 +82,7 @@
 
   // Return the live-out set for this block
   IndexSet *live( const Block * b ) { return &_live[b->_pre_order-1]; }
+  IndexSet *livein( const Block * b ) { return &_livein[b->_pre_order - 1]; } // Return the live-in set for this block; _livein is only allocated when keep_deltas was requested
 
 #ifndef PRODUCT
   void dump( const Block *b ) const;
--- a/src/share/vm/opto/loopnode.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/loopnode.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -290,6 +290,7 @@
     if (phi() == NULL) {
       return NULL;
     }
+    assert(phi()->is_Phi(), "should be PhiNode");
     Node *ln = phi()->in(0);
     if (ln->is_CountedLoop() && ln->as_CountedLoop()->loopexit() == this) {
       return (CountedLoopNode*)ln;
--- a/src/share/vm/opto/loopopts.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/loopopts.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -447,21 +447,21 @@
     }
 
     // Replace (I1 +p (I2 + V)) with ((I1 +p I2) +p V)
-    if( n2_loop != n_loop && n3_loop == n_loop ) {
-      if( n->in(3)->Opcode() == Op_AddI ) {
+    if (n2_loop != n_loop && n3_loop == n_loop) {
+      if (n->in(3)->Opcode() == Op_AddX) {
         Node *V = n->in(3)->in(1);
         Node *I = n->in(3)->in(2);
-        if( is_member(n_loop,get_ctrl(V)) ) {
+        if (is_member(n_loop,get_ctrl(V))) {
         } else {
           Node *tmp = V; V = I; I = tmp;
         }
-        if( !is_member(n_loop,get_ctrl(I)) ) {
-          Node *add1 = new AddPNode( n->in(1), n->in(2), I );
+        if (!is_member(n_loop,get_ctrl(I))) {
+          Node *add1 = new AddPNode(n->in(1), n->in(2), I);
           // Stuff new AddP in the loop preheader
-          register_new_node( add1, n_loop->_head->in(LoopNode::EntryControl) );
-          Node *add2 = new AddPNode( n->in(1), add1, V );
-          register_new_node( add2, n_ctrl );
-          _igvn.replace_node( n, add2 );
+          register_new_node(add1, n_loop->_head->in(LoopNode::EntryControl));
+          Node *add2 = new AddPNode(n->in(1), add1, V);
+          register_new_node(add2, n_ctrl);
+          _igvn.replace_node(n, add2);
           return add2;
         }
       }
@@ -653,7 +653,6 @@
   return iff->in(1);
 }
 
-#ifdef ASSERT
 static void enqueue_cfg_uses(Node* m, Unique_Node_List& wq) {
   for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax; i++) {
     Node* u = m->fast_out(i);
@@ -667,7 +666,6 @@
     }
   }
 }
-#endif
 
 // Try moving a store out of a loop, right before the loop
 Node* PhaseIdealLoop::try_move_store_before_loop(Node* n, Node *n_ctrl) {
@@ -687,11 +685,15 @@
     // written at iteration i by the second store could be overwritten
     // at iteration i+n by the first store: it's not safe to move the
     // first store out of the loop
-    // - nothing must observe the Phi memory: it guarantees no read
-    // before the store and no early exit out of the loop
-    // With those conditions, we are also guaranteed the store post
-    // dominates the loop head. Otherwise there would be extra Phi
-    // involved between the loop's Phi and the store.
+    // - nothing must observe the memory Phi: it guarantees no read
+    // before the store, we are also guaranteed the store post
+    // dominates the loop head (ignoring a possible early
+    // exit). Otherwise there would be extra Phi involved between the
+    // loop's Phi and the store.
+    // - there must be no early exit from the loop before the Store
+    // (such an exit most of the time would be an extra use of the
+    // memory Phi but sometimes is a bottom memory Phi that takes the
+    // store as input).
 
     if (!n_loop->is_member(address_loop) &&
         !n_loop->is_member(value_loop) &&
@@ -699,9 +701,10 @@
         mem->outcnt() == 1 &&
         mem->in(LoopNode::LoopBackControl) == n) {
 
-#ifdef ASSERT
-      // Verify that store's control does post dominate loop entry and
-      // that there's no early exit of the loop before the store.
+      assert(n_loop->_tail != NULL, "need a tail");
+      assert(is_dominator(n_ctrl, n_loop->_tail), "store control must not be in a branch in the loop");
+
+      // Verify that there's no early exit of the loop before the store.
       bool ctrl_ok = false;
       {
         // Follow control from loop head until n, we exit the loop or
@@ -709,7 +712,7 @@
         ResourceMark rm;
         Unique_Node_List wq;
         wq.push(n_loop->_head);
-        assert(n_loop->_tail != NULL, "need a tail");
+
         for (uint next = 0; next < wq.size(); ++next) {
           Node *m = wq.at(next);
           if (m == n->in(0)) {
@@ -722,24 +725,27 @@
             break;
           }
           enqueue_cfg_uses(m, wq);
+          if (wq.size() > 10) {
+            ctrl_ok = false;
+            break;
+          }
         }
       }
-      assert(ctrl_ok, "bad control");
-#endif
+      if (ctrl_ok) {
+        // move the Store
+        _igvn.replace_input_of(mem, LoopNode::LoopBackControl, mem);
+        _igvn.replace_input_of(n, 0, n_loop->_head->in(LoopNode::EntryControl));
+        _igvn.replace_input_of(n, MemNode::Memory, mem->in(LoopNode::EntryControl));
+        // Disconnect the phi now. An empty phi can confuse other
+        // optimizations in this pass of loop opts.
+        _igvn.replace_node(mem, mem->in(LoopNode::EntryControl));
+        n_loop->_body.yank(mem);
 
-      // move the Store
-      _igvn.replace_input_of(mem, LoopNode::LoopBackControl, mem);
-      _igvn.replace_input_of(n, 0, n_loop->_head->in(LoopNode::EntryControl));
-      _igvn.replace_input_of(n, MemNode::Memory, mem->in(LoopNode::EntryControl));
-      // Disconnect the phi now. An empty phi can confuse other
-      // optimizations in this pass of loop opts.
-      _igvn.replace_node(mem, mem->in(LoopNode::EntryControl));
-      n_loop->_body.yank(mem);
+        IdealLoopTree* new_loop = get_loop(n->in(0));
+        set_ctrl_and_loop(n, n->in(0));
 
-      IdealLoopTree* new_loop = get_loop(n->in(0));
-      set_ctrl_and_loop(n, n->in(0));
-
-      return n;
+        return n;
+      }
     }
   }
   return NULL;
@@ -769,13 +775,15 @@
             }
             if (u->is_Phi() && u->in(0) == n_loop->_head) {
               assert(_igvn.type(u) == Type::MEMORY, "bad phi");
-              assert(phi == NULL, "already found");
+              // multiple phis on the same slice are possible
+              if (phi != NULL) {
+                return;
+              }
               phi = u;
               continue;
             }
           }
-          phi = NULL;
-          break;
+          return;
         }
         if (phi != NULL) {
           // Nothing in the loop before the store (next iteration)
--- a/src/share/vm/opto/macro.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/macro.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -1512,7 +1512,8 @@
     // MemBarStoreStore so that stores that initialize this object
     // can't be reordered with a subsequent store that makes this
     // object accessible by other threads.
-    if (init == NULL || (!init->is_complete_with_arraycopy() && !init->does_not_escape())) {
+    if (!alloc->does_not_escape_thread() &&
+        (init == NULL || !init->is_complete_with_arraycopy())) {
       if (init == NULL || init->req() < InitializeNode::RawStores) {
         // No InitializeNode or no stores captured by zeroing
         // elimination. Simply add the MemBarStoreStore after object
--- a/src/share/vm/opto/matcher.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/matcher.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -2045,6 +2045,33 @@
 // and then expanded into the inline_cache_reg and a method_oop register
 //   defined in ad_<arch>.cpp
 
+// Check for a not-yet-visited shift by a small constant (LShiftX by a constant <= 3): if found, flag it as address_visited, push its inputs on mstack and return true; otherwise push nothing and return false.
+static bool clone_shift(Node* shift, Matcher* matcher, MStack& mstack, VectorSet& address_visited) {
+  if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
+      shift->in(2)->get_int() <= 3 &&
+      // Are there other uses besides address expressions?
+      !matcher->is_visited(shift)) {
+    address_visited.set(shift->_idx); // Flag as address_visited
+    mstack.push(shift->in(2), Visit);
+    Node *conv = shift->in(1);
+#ifdef _LP64
+    // Allow Matcher to match the rule which bypasses the
+    // ConvI2L operation for an array index on LP64
+    // if the index value is known to be non-negative (_lo >= 0).
+    if (conv->Opcode() == Op_ConvI2L &&
+        conv->as_Type()->type()->is_long()->_lo >= 0 &&
+        // Are there other uses besides address expressions?
+        !matcher->is_visited(conv)) {
+      address_visited.set(conv->_idx); // Flag as address_visited
+      mstack.push(conv->in(1), Pre_Visit);
+    } else
+#endif
+      mstack.push(conv, Pre_Visit); // else-branch on LP64; executed unconditionally when _LP64 is not defined
+    return true;
+  }
+  return false;
+}
+
 
 //------------------------------find_shared------------------------------------
 // Set bits if Node is shared or otherwise a root
@@ -2205,7 +2232,10 @@
 #endif
 
         // Clone addressing expressions as they are "free" in memory access instructions
-        if( mem_op && i == MemNode::Address && mop == Op_AddP ) {
+        if (mem_op && i == MemNode::Address && mop == Op_AddP &&
+            // When there are other uses besides address expressions
+            // put it on stack and mark as shared.
+            !is_visited(m)) {
           // Some inputs for address expression are not put on stack
           // to avoid marking them as shared and forcing them into register
           // if they are used only in address expressions.
@@ -2213,10 +2243,7 @@
           // besides address expressions.
 
           Node *off = m->in(AddPNode::Offset);
-          if( off->is_Con() &&
-              // When there are other uses besides address expressions
-              // put it on stack and mark as shared.
-              !is_visited(m) ) {
+          if (off->is_Con()) {
             address_visited.test_set(m->_idx); // Flag as address_visited
             Node *adr = m->in(AddPNode::Address);
 
@@ -2229,28 +2256,7 @@
                 !is_visited(adr) ) {
               address_visited.set(adr->_idx); // Flag as address_visited
               Node *shift = adr->in(AddPNode::Offset);
-              // Check for shift by small constant as well
-              if( shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
-                  shift->in(2)->get_int() <= 3 &&
-                  // Are there other uses besides address expressions?
-                  !is_visited(shift) ) {
-                address_visited.set(shift->_idx); // Flag as address_visited
-                mstack.push(shift->in(2), Visit);
-                Node *conv = shift->in(1);
-#ifdef _LP64
-                // Allow Matcher to match the rule which bypass
-                // ConvI2L operation for an array index on LP64
-                // if the index value is positive.
-                if( conv->Opcode() == Op_ConvI2L &&
-                    conv->as_Type()->type()->is_long()->_lo >= 0 &&
-                    // Are there other uses besides address expressions?
-                    !is_visited(conv) ) {
-                  address_visited.set(conv->_idx); // Flag as address_visited
-                  mstack.push(conv->in(1), Pre_Visit);
-                } else
-#endif
-                mstack.push(conv, Pre_Visit);
-              } else {
+              if (!clone_shift(shift, this, mstack, address_visited)) {
                 mstack.push(shift, Pre_Visit);
               }
               mstack.push(adr->in(AddPNode::Address), Pre_Visit);
@@ -2263,6 +2269,12 @@
             mstack.push(off, Visit);
             mstack.push(m->in(AddPNode::Base), Pre_Visit);
             continue; // for(int i = ...)
+          } else if (clone_shift_expressions &&
+                     clone_shift(off, this, mstack, address_visited)) {
+              address_visited.test_set(m->_idx); // Flag as address_visited
+              mstack.push(m->in(AddPNode::Address), Pre_Visit);
+              mstack.push(m->in(AddPNode::Base), Pre_Visit);
+              continue;
           } // if( off->is_Con() )
         }   // if( mem_op &&
         mstack.push(m, Pre_Visit);
--- a/src/share/vm/opto/matcher.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/matcher.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -269,6 +269,9 @@
   // should generate this one.
   static const bool match_rule_supported(int opcode);
 
+  // Some uarchs have different sized float register resources
+  static const int float_pressure(int default_pressure_threshold);
+
   // Used to determine if we have fast l2f conversion
   // USII has it, USIII doesn't
   static const bool convL2FSupported(void);
--- a/src/share/vm/opto/memnode.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/memnode.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -2945,7 +2945,7 @@
       // Final field stores.
       Node* alloc = AllocateNode::Ideal_allocation(in(MemBarNode::Precedent), phase);
       if ((alloc != NULL) && alloc->is_Allocate() &&
-          alloc->as_Allocate()->_is_non_escaping) {
+          alloc->as_Allocate()->does_not_escape_thread()) {
         // The allocated object does not escape.
         eliminate = true;
       }
--- a/src/share/vm/opto/node.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/node.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -674,7 +674,8 @@
     Flag_avoid_back_to_back_after    = Flag_avoid_back_to_back_before << 1,
     Flag_has_call                    = Flag_avoid_back_to_back_after << 1,
     Flag_is_reduction                = Flag_has_call << 1,
-    Flag_is_expensive                = Flag_is_reduction << 1,
+    Flag_is_scheduled                = Flag_is_reduction,
+    Flag_is_expensive                = Flag_is_scheduled << 1,
     _max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination
   };
 
@@ -861,6 +862,9 @@
   // It must have the loop's phi as input and provide a def to the phi.
   bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
 
+  // Used in lcm to mark nodes that have been scheduled
+  bool is_scheduled() const { return (_flags & Flag_is_scheduled) != 0; }
+
 //----------------- Optimization
 
   // Get the worst-case Type output for this Node.
--- a/src/share/vm/opto/output.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/output.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -116,12 +116,6 @@
     }
   }
 
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-  if (ZapDeadCompiledLocals) {
-    Insert_zap_nodes();
-  }
-# endif
-
   uint* blk_starts = NEW_RESOURCE_ARRAY(uint, _cfg->number_of_blocks() + 1);
   blk_starts[0] = 0;
 
@@ -184,113 +178,6 @@
   return (stub_function() == NULL && has_java_calls());
 }
 
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-
-
-// In order to catch compiler oop-map bugs, we have implemented
-// a debugging mode called ZapDeadCompilerLocals.
-// This mode causes the compiler to insert a call to a runtime routine,
-// "zap_dead_locals", right before each place in compiled code
-// that could potentially be a gc-point (i.e., a safepoint or oop map point).
-// The runtime routine checks that locations mapped as oops are really
-// oops, that locations mapped as values do not look like oops,
-// and that locations mapped as dead are not used later
-// (by zapping them to an invalid address).
-
-int Compile::_CompiledZap_count = 0;
-
-void Compile::Insert_zap_nodes() {
-  bool skip = false;
-
-
-  // Dink with static counts because code code without the extra
-  // runtime calls is MUCH faster for debugging purposes
-
-       if ( CompileZapFirst  ==  0  ) ; // nothing special
-  else if ( CompileZapFirst  >  CompiledZap_count() )  skip = true;
-  else if ( CompileZapFirst  == CompiledZap_count() )
-    warning("starting zap compilation after skipping");
-
-       if ( CompileZapLast  ==  -1  ) ; // nothing special
-  else if ( CompileZapLast  <   CompiledZap_count() )  skip = true;
-  else if ( CompileZapLast  ==  CompiledZap_count() )
-    warning("about to compile last zap");
-
-  ++_CompiledZap_count; // counts skipped zaps, too
-
-  if ( skip )  return;
-
-
-  if ( _method == NULL )
-    return; // no safepoints/oopmaps emitted for calls in stubs,so we don't care
-
-  // Insert call to zap runtime stub before every node with an oop map
-  for( uint i=0; i<_cfg->number_of_blocks(); i++ ) {
-    Block *b = _cfg->get_block(i);
-    for ( uint j = 0;  j < b->number_of_nodes();  ++j ) {
-      Node *n = b->get_node(j);
-
-      // Determining if we should insert a zap-a-lot node in output.
-      // We do that for all nodes that has oopmap info, except for calls
-      // to allocation.  Calls to allocation passes in the old top-of-eden pointer
-      // and expect the C code to reset it.  Hence, there can be no safepoints between
-      // the inlined-allocation and the call to new_Java, etc.
-      // We also cannot zap monitor calls, as they must hold the microlock
-      // during the call to Zap, which also wants to grab the microlock.
-      bool insert = n->is_MachSafePoint() && (n->as_MachSafePoint()->oop_map() != NULL);
-      if ( insert ) { // it is MachSafePoint
-        if ( !n->is_MachCall() ) {
-          insert = false;
-        } else if ( n->is_MachCall() ) {
-          MachCallNode* call = n->as_MachCall();
-          if (call->entry_point() == OptoRuntime::new_instance_Java() ||
-              call->entry_point() == OptoRuntime::new_array_Java() ||
-              call->entry_point() == OptoRuntime::multianewarray2_Java() ||
-              call->entry_point() == OptoRuntime::multianewarray3_Java() ||
-              call->entry_point() == OptoRuntime::multianewarray4_Java() ||
-              call->entry_point() == OptoRuntime::multianewarray5_Java() ||
-              call->entry_point() == OptoRuntime::slow_arraycopy_Java() ||
-              call->entry_point() == OptoRuntime::complete_monitor_locking_Java()
-              ) {
-            insert = false;
-          }
-        }
-        if (insert) {
-          Node *zap = call_zap_node(n->as_MachSafePoint(), i);
-          b->insert_node(zap, j);
-          _cfg->map_node_to_block(zap, b);
-          ++j;
-        }
-      }
-    }
-  }
-}
-
-
-Node* Compile::call_zap_node(MachSafePointNode* node_to_check, int block_no) {
-  const TypeFunc *tf = OptoRuntime::zap_dead_locals_Type();
-  CallStaticJavaNode* ideal_node =
-    new CallStaticJavaNode( tf,
-         OptoRuntime::zap_dead_locals_stub(_method->flags().is_native()),
-                       "call zap dead locals stub", 0, TypePtr::BOTTOM);
-  // We need to copy the OopMap from the site we're zapping at.
-  // We have to make a copy, because the zap site might not be
-  // a call site, and zap_dead is a call site.
-  OopMap* clone = node_to_check->oop_map()->deep_copy();
-
-  // Add the cloned OopMap to the zap node
-  ideal_node->set_oop_map(clone);
-  return _matcher->match_sfpt(ideal_node);
-}
-
-bool Compile::is_node_getting_a_safepoint( Node* n) {
-  // This code duplicates the logic prior to the call of add_safepoint
-  // below in this file.
-  if( n->is_MachSafePoint() ) return true;
-  return false;
-}
-
-# endif // ENABLE_ZAP_DEAD_LOCALS
 
 // Compute the size of first NumberOfLoopInstrToAlign instructions at the top
 // of a loop. When aligning a loop we need to provide enough instructions
@@ -834,10 +721,6 @@
   MachSafePointNode *sfn   = mach->as_MachSafePoint();
   MachCallNode      *mcall;
 
-#ifdef ENABLE_ZAP_DEAD_LOCALS
-  assert( is_node_getting_a_safepoint(mach),  "logic does not match; false negative");
-#endif
-
   int safepoint_pc_offset = current_offset;
   bool is_method_handle_invoke = false;
   bool return_oop = false;
@@ -1294,10 +1177,6 @@
       if (Pipeline::requires_bundling() && starts_bundle(n))
         cb->flush_bundle(false);
 
-      // The following logic is duplicated in the code ifdeffed for
-      // ENABLE_ZAP_DEAD_LOCALS which appears above in this file.  It
-      // should be factored out.  Or maybe dispersed to the nodes?
-
       // Special handling for SafePoint/Call Nodes
       bool is_mcall = false;
       if (n->is_Mach()) {
@@ -1364,9 +1243,6 @@
             // !!!!! Stubs only need an oopmap right now, so bail out
             if (sfn->jvms()->method() == NULL) {
               // Write the oopmap directly to the code blob??!!
-#             ifdef ENABLE_ZAP_DEAD_LOCALS
-              assert( !is_node_getting_a_safepoint(sfn),  "logic does not match; false positive");
-#             endif
               continue;
             }
           } // End synchronization
@@ -1554,9 +1430,6 @@
           // !!!!! Stubs only need an oopmap right now, so bail out
           if (!mach->is_MachCall() && mach->as_MachSafePoint()->jvms()->method() == NULL) {
             // Write the oopmap directly to the code blob??!!
-#           ifdef ENABLE_ZAP_DEAD_LOCALS
-            assert( !is_node_getting_a_safepoint(mach),  "logic does not match; false positive");
-#           endif
             delay_slot = NULL;
             continue;
           }
--- a/src/share/vm/opto/runtime.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/runtime.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -102,11 +102,6 @@
 address OptoRuntime::_slow_arraycopy_Java                         = NULL;
 address OptoRuntime::_register_finalizer_Java                     = NULL;
 
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-address OptoRuntime::_zap_dead_Java_locals_Java                   = NULL;
-address OptoRuntime::_zap_dead_native_locals_Java                 = NULL;
-# endif
-
 ExceptionBlob* OptoRuntime::_exception_blob;
 
 // This should be called in an assertion at the start of OptoRuntime routines
@@ -152,10 +147,6 @@
   gen(env, _slow_arraycopy_Java            , slow_arraycopy_Type          , SharedRuntime::slow_arraycopy_C ,    0 , false, false, false);
   gen(env, _register_finalizer_Java        , register_finalizer_Type      , register_finalizer              ,    0 , false, false, false);
 
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-  gen(env, _zap_dead_Java_locals_Java      , zap_dead_locals_Type         , zap_dead_Java_locals_C          ,    0 , false, true , false );
-  gen(env, _zap_dead_native_locals_Java    , zap_dead_locals_Type         , zap_dead_native_locals_C        ,    0 , false, true , false );
-# endif
   return true;
 }
 
@@ -604,23 +595,6 @@
   return TypeFunc::make(domain, range);
 }
 
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-// Type used for stub generation for zap_dead_locals.
-// No inputs or outputs
-const TypeFunc *OptoRuntime::zap_dead_locals_Type() {
-  // create input type (domain)
-  const Type **fields = TypeTuple::fields(0);
-  const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms,fields);
-
-  // create result type (range)
-  fields = TypeTuple::fields(0);
-  const TypeTuple *range = TypeTuple::make(TypeFunc::Parms,fields);
-
-  return TypeFunc::make(domain,range);
-}
-# endif
-
-
 //-----------------------------------------------------------------------------
 // Monitor Handling
 const TypeFunc *OptoRuntime::complete_monitor_enter_Type() {
@@ -1648,67 +1622,3 @@
 
 #endif  // PRODUCT
 
-
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-// Called from call sites in compiled code with oop maps (actually safepoints)
-// Zaps dead locals in first java frame.
-// Is entry because may need to lock to generate oop maps
-// Currently, only used for compiler frames, but someday may be used
-// for interpreter frames, too.
-
-int OptoRuntime::ZapDeadCompiledLocals_count = 0;
-
-// avoid pointers to member funcs with these helpers
-static bool is_java_frame(  frame* f) { return f->is_java_frame();   }
-static bool is_native_frame(frame* f) { return f->is_native_frame(); }
-
-
-void OptoRuntime::zap_dead_java_or_native_locals(JavaThread* thread,
-                                                bool (*is_this_the_right_frame_to_zap)(frame*)) {
-  assert(JavaThread::current() == thread, "is this needed?");
-
-  if ( !ZapDeadCompiledLocals )  return;
-
-  bool skip = false;
-
-       if ( ZapDeadCompiledLocalsFirst  ==  0  ) ; // nothing special
-  else if ( ZapDeadCompiledLocalsFirst  >  ZapDeadCompiledLocals_count )  skip = true;
-  else if ( ZapDeadCompiledLocalsFirst  == ZapDeadCompiledLocals_count )
-    warning("starting zapping after skipping");
-
-       if ( ZapDeadCompiledLocalsLast  ==  -1  ) ; // nothing special
-  else if ( ZapDeadCompiledLocalsLast  <   ZapDeadCompiledLocals_count )  skip = true;
-  else if ( ZapDeadCompiledLocalsLast  ==  ZapDeadCompiledLocals_count )
-    warning("about to zap last zap");
-
-  ++ZapDeadCompiledLocals_count; // counts skipped zaps, too
-
-  if ( skip )  return;
-
-  // find java frame and zap it
-
-  for (StackFrameStream sfs(thread);  !sfs.is_done();  sfs.next()) {
-    if (is_this_the_right_frame_to_zap(sfs.current()) ) {
-      sfs.current()->zap_dead_locals(thread, sfs.register_map());
-      return;
-    }
-  }
-  warning("no frame found to zap in zap_dead_Java_locals_C");
-}
-
-JRT_LEAF(void, OptoRuntime::zap_dead_Java_locals_C(JavaThread* thread))
-  zap_dead_java_or_native_locals(thread, is_java_frame);
-JRT_END
-
-// The following does not work because for one thing, the
-// thread state is wrong; it expects java, but it is native.
-// Also, the invariants in a native stub are different and
-// I'm not sure it is safe to have a MachCalRuntimeDirectNode
-// in there.
-// So for now, we do not zap in native stubs.
-
-JRT_LEAF(void, OptoRuntime::zap_dead_native_locals_C(JavaThread* thread))
-  zap_dead_java_or_native_locals(thread, is_native_frame);
-JRT_END
-
-# endif
--- a/src/share/vm/opto/runtime.hpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/runtime.hpp	Thu Oct 08 14:28:55 2015 -0700
@@ -152,12 +152,6 @@
   static address _slow_arraycopy_Java;
   static address _register_finalizer_Java;
 
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-  static address _zap_dead_Java_locals_Java;
-  static address _zap_dead_native_locals_Java;
-# endif
-
-
   //
   // Implementation of runtime methods
   // =================================
@@ -212,19 +206,6 @@
 
   static void register_finalizer(oopDesc* obj, JavaThread* thread);
 
-  // zaping dead locals, either from Java frames or from native frames
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-  static void zap_dead_Java_locals_C(   JavaThread* thread);
-  static void zap_dead_native_locals_C( JavaThread* thread);
-
-  static void zap_dead_java_or_native_locals( JavaThread*, bool (*)(frame*));
-
- public:
-   static int ZapDeadCompiledLocals_count;
-
-# endif
-
-
  public:
 
   static bool is_callee_saved_register(MachRegisterNumbers reg);
@@ -256,14 +237,6 @@
   static address slow_arraycopy_Java()                   { return _slow_arraycopy_Java; }
   static address register_finalizer_Java()               { return _register_finalizer_Java; }
 
-
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-  static address zap_dead_locals_stub(bool is_native)    { return is_native
-                                                                  ? _zap_dead_native_locals_Java
-                                                                  : _zap_dead_Java_locals_Java; }
-  static MachNode* node_to_call_zap_dead_locals(Node* n, int block_num, bool is_native);
-# endif
-
   static ExceptionBlob*    exception_blob()                      { return _exception_blob; }
 
   // Leaf routines helping with method data update
@@ -353,10 +326,6 @@
   static const TypeFunc* dtrace_method_entry_exit_Type();
   static const TypeFunc* dtrace_object_alloc_Type();
 
-# ifdef ENABLE_ZAP_DEAD_LOCALS
-  static const TypeFunc* zap_dead_locals_Type();
-# endif
-
  private:
  static NamedCounter * volatile _named_counters;
 
--- a/src/share/vm/opto/subnode.cpp	Tue Oct 06 08:41:31 2015 -0700
+++ b/src/share/vm/opto/subnode.cpp	Thu Oct 08 14:28:55 2015 -0700
@@ -1532,18 +1532,6 @@
 
 //=============================================================================
 //------------------------------Value------------------------------------------
-// Compute exp
-const Type *ExpDNode::Value( PhaseTransform *phase ) const {
-  const Type *t1 = phase->type( in(1) );
-  if( t1 == Type::TOP ) return Type::TOP;
-  if( t1->base() != Type::DoubleCon ) return Type::DOUBLE;
-  double d = t1->getd();
-  return TypeD::make( StubRoutines::intrinsic_exp( d ) );
-}
-
-
-//=============================================================================
-//------------------------------Value------------------------------------------
 // Compute pow
 const Type *PowDNode::Value( PhaseTransform *phase ) const {
   const Type *t1 = phase->type( in(1) );
--- a/src/share/vm/opto/subnode.hpp	Tue Oct 06 08:41:31 2015 -0700