changeset 8888:8bc4eb358829

Merge
author roland
date Fri, 21 Aug 2015 09:12:42 +0200
parents 5ca290fb5d6e dbc1274bac56
children c0ea5537dc8b
files src/cpu/x86/vm/templateTable_x86.cpp
diffstat 31 files changed, 1923 insertions(+), 842 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/aarch64/vm/aarch64.ad	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/aarch64/vm/aarch64.ad	Fri Aug 21 09:12:42 2015 +0200
@@ -1033,27 +1033,39 @@
 };
 
   // graph traversal helpers
-  MemBarNode *has_parent_membar(const Node *n,
-				ProjNode *&ctl, ProjNode *&mem);
-  MemBarNode *has_child_membar(const MemBarNode *n,
-			       ProjNode *&ctl, ProjNode *&mem);
+
+  MemBarNode *parent_membar(const Node *n);
+  MemBarNode *child_membar(const MemBarNode *n);
+  bool leading_membar(const MemBarNode *barrier);
+
+  bool is_card_mark_membar(const MemBarNode *barrier);
+
+  MemBarNode *leading_to_normal(MemBarNode *leading);
+  MemBarNode *normal_to_leading(const MemBarNode *barrier);
+  MemBarNode *card_mark_to_trailing(const MemBarNode *barrier);
+  MemBarNode *trailing_to_card_mark(const MemBarNode *trailing);
+  MemBarNode *trailing_to_leading(const MemBarNode *trailing);
 
   // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
+
   bool unnecessary_acquire(const Node *barrier);
   bool needs_acquiring_load(const Node *load);
 
   // predicates controlling emit of str<x>/stlr<x> and associated dmbs
+
   bool unnecessary_release(const Node *barrier);
   bool unnecessary_volatile(const Node *barrier);
   bool needs_releasing_store(const Node *store);
 
-  // Use barrier instructions for unsafe volatile gets rather than
-  // trying to identify an exact signature for them
-  const bool UseBarriersForUnsafeVolatileGet = false;
+  // predicate controlling translation of StoreCM
+  bool unnecessary_storestore(const Node *storecm);
 %}
 
 source %{
 
+  // Optimizaton of volatile gets and puts
+  // -------------------------------------
+  //
   // AArch64 has ldar<x> and stlr<x> instructions which we can safely
   // use to implement volatile reads and writes. For a volatile read
   // we simply need
@@ -1102,15 +1114,19 @@
   // A volatile write is translated to the node sequence
   //
   //   MemBarRelease
-  //   StoreX[mo_release]
+  //   StoreX[mo_release] {CardMark}-optional
   //   MemBarVolatile
   //
   // n.b. the above node patterns are generated with a strict
   // 'signature' configuration of input and output dependencies (see
-  // the predicates below for exact details). The two signatures are
-  // unique to translated volatile reads/stores -- they will not
-  // appear as a result of any other bytecode translation or inlining
-  // nor as a consequence of optimizing transforms.
+  // the predicates below for exact details). The card mark may be as
+  // simple as a few extra nodes or, in a few GC configurations, may
+  // include more complex control flow between the leading and
+  // trailing memory barriers. However, whatever the card mark
+  // configuration these signatures are unique to translated volatile
+  // reads/stores -- they will not appear as a result of any other
+  // bytecode translation or inlining nor as a consequence of
+  // optimizing transforms.
   //
   // We also want to catch inlined unsafe volatile gets and puts and
   // be able to implement them using either ldar<x>/stlr<x> or some
@@ -1122,7 +1138,7 @@
   //
   //   MemBarRelease
   //   MemBarCPUOrder
-  //   StoreX[mo_release]
+  //   StoreX[mo_release] {CardMark}-optional
   //   MemBarVolatile
   //
   // n.b. as an aside, the cpuorder membar is not itself subject to
@@ -1130,7 +1146,7 @@
   // predicates need to detect its presence in order to correctly
   // select the desired adlc rules.
   //
-  // Inlined unsafe volatiles gets manifest as a somewhat different
+  // Inlined unsafe volatile gets manifest as a somewhat different
   // node sequence to a normal volatile get
   //
   //   MemBarCPUOrder
@@ -1173,33 +1189,22 @@
   // n.b. the translation rules below which rely on detection of the
   // volatile signatures and insert ldar<x> or stlr<x> are failsafe.
   // If we see anything other than the signature configurations we
-  // always just translate the loads and stors to ldr<x> and str<x>
+  // always just translate the loads and stores to ldr<x> and str<x>
   // and translate acquire, release and volatile membars to the
   // relevant dmb instructions.
   //
-  // n.b.b as a case in point for the above comment, the current
-  // predicates don't detect the precise signature for certain types
-  // of volatile object stores (where the heap_base input type is not
-  // known at compile-time to be non-NULL). In those cases the
-  // MemBarRelease and MemBarVolatile bracket an if-then-else sequence
-  // with a store in each branch (we need a different store depending
-  // on whether heap_base is actually NULL). In such a case we will
-  // just plant a dmb both before and after the branch/merge. The
-  // predicate could (and probably should) be fixed later to also
-  // detect this case.
-
-  // graph traversal helpers
+
+  // graph traversal helpers used for volatile put/get optimization
+
+  // 1) general purpose helpers
 
   // if node n is linked to a parent MemBarNode by an intervening
-  // Control or Memory ProjNode return the MemBarNode otherwise return
+  // Control and Memory ProjNode return the MemBarNode otherwise return
   // NULL.
   //
   // n may only be a Load or a MemBar.
-  //
-  // The ProjNode* references c and m are used to return the relevant
-  // nodes.
-
-  MemBarNode *has_parent_membar(const Node *n, ProjNode *&c, ProjNode *&m)
+
+  MemBarNode *parent_membar(const Node *n)
   {
     Node *ctl = NULL;
     Node *mem = NULL;
@@ -1218,15 +1223,11 @@
     if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj())
       return NULL;
 
-    c = ctl->as_Proj();
-
     membar = ctl->lookup(0);
 
     if (!membar || !membar->is_MemBar())
       return NULL;
 
-    m = mem->as_Proj();
-
     if (mem->lookup(0) != membar)
       return NULL;
 
@@ -1235,12 +1236,8 @@
 
   // if n is linked to a child MemBarNode by intervening Control and
   // Memory ProjNodes return the MemBarNode otherwise return NULL.
-  //
-  // The ProjNode** arguments c and m are used to return pointers to
-  // the relevant nodes. A null argument means don't don't return a
-  // value.
-
-  MemBarNode *has_child_membar(const MemBarNode *n, ProjNode *&c, ProjNode *&m)
+
+  MemBarNode *child_membar(const MemBarNode *n)
   {
     ProjNode *ctl = n->proj_out(TypeFunc::Control);
     ProjNode *mem = n->proj_out(TypeFunc::Memory);
@@ -1249,9 +1246,6 @@
     if (! ctl || ! mem)
       return NULL;
 
-    c = ctl;
-    m = mem;
-
     MemBarNode *child = NULL;
     Node *x;
 
@@ -1279,9 +1273,838 @@
     return NULL;
   }
 
+  // helper predicate use to filter candidates for a leading memory
+  // barrier
+  //
+  // returns true if barrier is a MemBarRelease or a MemBarCPUOrder
+  // whose Ctl and Mem feeds come from a MemBarRelease otherwise false
+
+  bool leading_membar(const MemBarNode *barrier)
+  {
+    int opcode = barrier->Opcode();
+    // if this is a release membar we are ok
+    if (opcode == Op_MemBarRelease)
+      return true;
+    // if its a cpuorder membar . . .
+    if (opcode != Op_MemBarCPUOrder)
+      return false;
+    // then the parent has to be a release membar
+    MemBarNode *parent = parent_membar(barrier);
+    if (!parent)
+      return false;
+    opcode = parent->Opcode();
+    return opcode == Op_MemBarRelease;
+  }
+ 
+  // 2) card mark detection helper
+
+  // helper predicate which can be used to detect a volatile membar
+  // introduced as part of a conditional card mark sequence either by
+  // G1 or by CMS when UseCondCardMark is true.
+  //
+  // membar can be definitively determined to be part of a card mark
+  // sequence if and only if all the following hold
+  //
+  // i) it is a MemBarVolatile
+  //
+  // ii) either UseG1GC or (UseConcMarkSweepGC && UseCondCardMark) is
+  // true
+  //
+  // iii) the node's Mem projection feeds a StoreCM node.
+  
+  bool is_card_mark_membar(const MemBarNode *barrier)
+  {
+    if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark))
+      return false;
+
+    if (barrier->Opcode() != Op_MemBarVolatile)
+      return false;
+
+    ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax ; i++) {
+      Node *y = mem->fast_out(i);
+      if (y->Opcode() == Op_StoreCM) {
+	return true;
+      }
+    }
+  
+    return false;
+  }
+
+
+  // 3) helper predicates to traverse volatile put graphs which may
+  // contain GC barrier subgraphs
+
+  // Preamble
+  // --------
+  //
+  // for volatile writes we can omit generating barriers and employ a
+  // releasing store when we see a node sequence sequence with a
+  // leading MemBarRelease and a trailing MemBarVolatile as follows
+  //
+  //   MemBarRelease
+  //  {      ||      } -- optional
+  //  {MemBarCPUOrder}
+  //         ||     \\
+  //         ||     StoreX[mo_release]
+  //         | \     /
+  //         | MergeMem
+  //         | /
+  //   MemBarVolatile
+  //
+  // where
+  //  || and \\ represent Ctl and Mem feeds via Proj nodes
+  //  | \ and / indicate further routing of the Ctl and Mem feeds
+  // 
+  // this is the graph we see for non-object stores. however, for a
+  // volatile Object store (StoreN/P) we may see other nodes below the
+  // leading membar because of the need for a GC pre- or post-write
+  // barrier.
+  //
+  // with most GC configurations we with see this simple variant which
+  // includes a post-write barrier card mark.
+  //
+  //   MemBarRelease______________________________
+  //         ||    \\               Ctl \        \\
+  //         ||    StoreN/P[mo_release] CastP2X  StoreB/CM
+  //         | \     /                       . . .  /
+  //         | MergeMem
+  //         | /
+  //         ||      /
+  //   MemBarVolatile
+  //
+  // i.e. the leading membar feeds Ctl to a CastP2X (which converts
+  // the object address to an int used to compute the card offset) and
+  // Ctl+Mem to a StoreB node (which does the actual card mark).
+  //
+  // n.b. a StoreCM node will only appear in this configuration when
+  // using CMS. StoreCM differs from a normal card mark write (StoreB)
+  // because it implies a requirement to order visibility of the card
+  // mark (StoreCM) relative to the object put (StoreP/N) using a
+  // StoreStore memory barrier (arguably this ought to be represented
+  // explicitly in the ideal graph but that is not how it works). This
+  // ordering is required for both non-volatile and volatile
+  // puts. Normally that means we need to translate a StoreCM using
+  // the sequence
+  //
+  //   dmb ishst
+  //   stlrb
+  //
+  // However, in the case of a volatile put if we can recognise this
+  // configuration and plant an stlr for the object write then we can
+  // omit the dmb and just plant an strb since visibility of the stlr
+  // is ordered before visibility of subsequent stores. StoreCM nodes
+  // also arise when using G1 or using CMS with conditional card
+  // marking. In these cases (as we shall see) we don't need to insert
+  // the dmb when translating StoreCM because there is already an
+  // intervening StoreLoad barrier between it and the StoreP/N.
+  //
+  // It is also possible to perform the card mark conditionally on it
+  // currently being unmarked in which case the volatile put graph
+  // will look slightly different
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder___________________________________________
+  //         ||    \\               Ctl \     Ctl \     \\  Mem \
+  //         ||    StoreN/P[mo_release] CastP2X   If   LoadB     |
+  //         | \     /                              \            |
+  //         | MergeMem                            . . .      StoreB
+  //         | /                                                /
+  //         ||     /
+  //   MemBarVolatile
+  //
+  // It is worth noting at this stage that both the above
+  // configurations can be uniquely identified by checking that the
+  // memory flow includes the following subgraph:
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //          |  \      . . .
+  //          |  StoreX[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile
+  //
+  // This is referred to as a *normal* subgraph. It can easily be
+  // detected starting from any candidate MemBarRelease,
+  // StoreX[mo_release] or MemBarVolatile.
+  //
+  // the code below uses two helper predicates, leading_to_normal and
+  // normal_to_leading to identify this configuration, one validating
+  // the layout starting from the top membar and searching down and
+  // the other validating the layout starting from the lower membar
+  // and searching up.
+  //
+  // There are two special case GC configurations when a normal graph
+  // may not be generated: when using G1 (which always employs a
+  // conditional card mark); and when using CMS with conditional card
+  // marking configured. These GCs are both concurrent rather than
+  // stop-the world GCs. So they introduce extra Ctl+Mem flow into the
+  // graph between the leading and trailing membar nodes, in
+  // particular enforcing stronger memory serialisation beween the
+  // object put and the corresponding conditional card mark. CMS
+  // employs a post-write GC barrier while G1 employs both a pre- and
+  // post-write GC barrier. Of course the extra nodes may be absent --
+  // they are only inserted for object puts. This significantly
+  // complicates the task of identifying whether a MemBarRelease,
+  // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
+  // when using these GC configurations (see below).
+  //
+  // In both cases the post-write subtree includes an auxiliary
+  // MemBarVolatile (StoreLoad barrier) separating the object put and
+  // the read of the corresponding card. This poses two additional
+  // problems.
+  //
+  // Firstly, a card mark MemBarVolatile needs to be distinguished
+  // from a normal trailing MemBarVolatile. Resolving this first
+  // problem is straightforward: a card mark MemBarVolatile always
+  // projects a Mem feed to a StoreCM node and that is a unique marker
+  //
+  //      MemBarVolatile (card mark)
+  //       C |    \     . . .
+  //         |   StoreCM   . . .
+  //       . . .
+  //
+  // The second problem is how the code generator is to translate the
+  // card mark barrier? It always needs to be translated to a "dmb
+  // ish" instruction whether or not it occurs as part of a volatile
+  // put. A StoreLoad barrier is needed after the object put to ensure
+  // i) visibility to GC threads of the object put and ii) visibility
+  // to the mutator thread of any card clearing write by a GC
+  // thread. Clearly a normal store (str) will not guarantee this
+  // ordering but neither will a releasing store (stlr). The latter
+  // guarantees that the object put is visible but does not guarantee
+  // that writes by other threads have also been observed.
+  // 
+  // So, returning to the task of translating the object put and the
+  // leading/trailing membar nodes: what do the non-normal node graph
+  // look like for these 2 special cases? and how can we determine the
+  // status of a MemBarRelease, StoreX[mo_release] or MemBarVolatile
+  // in both normal and non-normal cases?
+  //
+  // A CMS GC post-barrier wraps its card write (StoreCM) inside an If
+  // which selects conditonal execution based on the value loaded
+  // (LoadB) from the card. Ctl and Mem are fed to the If via an
+  // intervening StoreLoad barrier (MemBarVolatile).
+  //
+  // So, with CMS we may see a node graph which looks like this
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder_(leading)__________________
+  //     C |    M \       \\                   C \
+  //       |       \    StoreN/P[mo_release]  CastP2X
+  //       |    Bot \    /
+  //       |       MergeMem
+  //       |         /
+  //      MemBarVolatile (card mark)
+  //     C |  ||    M |
+  //       | LoadB    |
+  //       |   |      |
+  //       | Cmp      |\
+  //       | /        | \
+  //       If         |  \
+  //       | \        |   \
+  // IfFalse  IfTrue  |    \
+  //       \     / \  |     \
+  //        \   / StoreCM    |
+  //         \ /      |      |
+  //        Region   . . .   |
+  //          | \           /
+  //          |  . . .  \  / Bot
+  //          |       MergeMem
+  //          |          |
+  //        MemBarVolatile (trailing)
+  //
+  // The first MergeMem merges the AliasIdxBot Mem slice from the
+  // leading membar and the oopptr Mem slice from the Store into the
+  // card mark membar. The trailing MergeMem merges the AliasIdxBot
+  // Mem slice from the card mark membar and the AliasIdxRaw slice
+  // from the StoreCM into the trailing membar (n.b. the latter
+  // proceeds via a Phi associated with the If region).
+  //
+  // G1 is quite a lot more complicated. The nodes inserted on behalf
+  // of G1 may comprise: a pre-write graph which adds the old value to
+  // the SATB queue; the releasing store itself; and, finally, a
+  // post-write graph which performs a card mark.
+  //
+  // The pre-write graph may be omitted, but only when the put is
+  // writing to a newly allocated (young gen) object and then only if
+  // there is a direct memory chain to the Initialize node for the
+  // object allocation. This will not happen for a volatile put since
+  // any memory chain passes through the leading membar.
+  //
+  // The pre-write graph includes a series of 3 If tests. The outermost
+  // If tests whether SATB is enabled (no else case). The next If tests
+  // whether the old value is non-NULL (no else case). The third tests
+  // whether the SATB queue index is > 0, if so updating the queue. The
+  // else case for this third If calls out to the runtime to allocate a
+  // new queue buffer.
+  //
+  // So with G1 the pre-write and releasing store subgraph looks like
+  // this (the nested Ifs are omitted).
+  //
+  //  MemBarRelease (leading)____________
+  //     C |  ||  M \   M \    M \  M \ . . .
+  //       | LoadB   \  LoadL  LoadN   \
+  //       | /        \                 \
+  //       If         |\                 \
+  //       | \        | \                 \
+  //  IfFalse  IfTrue |  \                 \
+  //       |     |    |   \                 |
+  //       |     If   |   /\                |
+  //       |     |          \               |
+  //       |                 \              |
+  //       |    . . .         \             |
+  //       | /       | /       |            |
+  //      Region  Phi[M]       |            |
+  //       | \       |         |            |
+  //       |  \_____ | ___     |            |
+  //     C | C \     |   C \ M |            |
+  //       | CastP2X | StoreN/P[mo_release] |
+  //       |         |         |            |
+  //     C |       M |       M |          M |
+  //        \        |         |           /
+  //                  . . . 
+  //          (post write subtree elided)
+  //                    . . .
+  //             C \         M /
+  //         MemBarVolatile (trailing)
+  //
+  // n.b. the LoadB in this subgraph is not the card read -- it's a
+  // read of the SATB queue active flag.
+  //
+  // The G1 post-write subtree is also optional, this time when the
+  // new value being written is either null or can be identified as a
+  // newly allocated (young gen) object with no intervening control
+  // flow. The latter cannot happen but the former may, in which case
+  // the card mark membar is omitted and the memory feeds from the
+  // leading membar and the StoreN/P are merged direct into the
+  // trailing membar as per the normal subgraph. So, the only special
+  // case which arises is when the post-write subgraph is generated.
+  //
+  // The kernel of the post-write G1 subgraph is the card mark itself
+  // which includes a card mark memory barrier (MemBarVolatile), a
+  // card test (LoadB), and a conditional update (If feeding a
+  // StoreCM). These nodes are surrounded by a series of nested Ifs
+  // which try to avoid doing the card mark. The top level If skips if
+  // the object reference does not cross regions (i.e. it tests if
+  // (adr ^ val) >> log2(regsize) != 0) -- intra-region references
+  // need not be recorded. The next If, which skips on a NULL value,
+  // may be absent (it is not generated if the type of value is >=
+  // OopPtr::NotNull). The 3rd If skips writes to young regions (by
+  // checking if card_val != young).  n.b. although this test requires
+  // a pre-read of the card it can safely be done before the StoreLoad
+  // barrier. However that does not bypass the need to reread the card
+  // after the barrier.
+  //
+  //                (pre-write subtree elided)
+  //        . . .                  . . .    . . .  . . .
+  //        C |                    M |     M |    M |
+  //       Region                  Phi[M] StoreN    |
+  //          |                     / \      |      |
+  //         / \_______            /   \     |      |
+  //      C / C \      . . .            \    |      |
+  //       If   CastP2X . . .            |   |      |
+  //       / \                           |   |      |
+  //      /   \                          |   |      |
+  // IfFalse IfTrue                      |   |      |
+  //   |       |                         |   |     /|
+  //   |       If                        |   |    / |
+  //   |      / \                        |   |   /  |
+  //   |     /   \                        \  |  /   |
+  //   | IfFalse IfTrue                   MergeMem  |
+  //   |  . . .    / \                       /      |
+  //   |          /   \                     /       |
+  //   |     IfFalse IfTrue                /        |
+  //   |      . . .    |                  /         |
+  //   |               If                /          |
+  //   |               / \              /           |
+  //   |              /   \            /            |
+  //   |         IfFalse IfTrue       /             |
+  //   |           . . .   |         /              |
+  //   |                    \       /               |
+  //   |                     \     /                |
+  //   |             MemBarVolatile__(card mark)    |
+  //   |                ||   C |  M \  M \          |
+  //   |               LoadB   If    |    |         |
+  //   |                      / \    |    |         |
+  //   |                     . . .   |    |         |
+  //   |                          \  |    |        /
+  //   |                        StoreCM   |       /
+  //   |                          . . .   |      /
+  //   |                        _________/      /
+  //   |                       /  _____________/
+  //   |   . . .       . . .  |  /            /
+  //   |    |                 | /   _________/
+  //   |    |               Phi[M] /        /
+  //   |    |                 |   /        /
+  //   |    |                 |  /        /
+  //   |  Region  . . .     Phi[M]  _____/
+  //   |    /                 |    /
+  //   |                      |   /   
+  //   | . . .   . . .        |  /
+  //   | /                    | /
+  // Region           |  |  Phi[M]
+  //   |              |  |  / Bot
+  //    \            MergeMem 
+  //     \            /
+  //     MemBarVolatile
+  //
+  // As with CMS the initial MergeMem merges the AliasIdxBot Mem slice
+  // from the leading membar and the oopptr Mem slice from the Store
+  // into the card mark membar i.e. the memory flow to the card mark
+  // membar still looks like a normal graph.
+  //
+  // The trailing MergeMem merges an AliasIdxBot Mem slice with other
+  // Mem slices (from the StoreCM and other card mark queue stores).
+  // However in this case the AliasIdxBot Mem slice does not come
+  // direct from the card mark membar. It is merged through a series
+  // of Phi nodes. These are needed to merge the AliasIdxBot Mem flow
+  // from the leading membar with the Mem feed from the card mark
+  // membar. Each Phi corresponds to one of the Ifs which may skip
+  // around the card mark membar. So when the If implementing the NULL
+  // value check has been elided the total number of Phis is 2
+  // otherwise it is 3.
+  //
+  // So, the upshot is that in all cases the volatile put graph will
+  // include a *normal* memory subgraph betwen the leading membar and
+  // its child membar. When that child is not a card mark membar then
+  // it marks the end of a volatile put subgraph. If the child is a
+  // card mark membar then the normal subgraph will form part of a
+  // volatile put subgraph if and only if the child feeds an
+  // AliasIdxBot Mem feed to a trailing barrier via a MergeMem. That
+  // feed is either direct (for CMS) or via 2 or 3 Phi nodes merging
+  // the leading barrier memory flow (for G1).
+  // 
+  // The predicates controlling generation of instructions for store
+  // and barrier nodes employ a few simple helper functions (described
+  // below) which identify the presence or absence of these subgraph
+  // configurations and provide a means of traversing from one node in
+  // the subgraph to another.
+
+  // leading_to_normal
+  //
+  //graph traversal helper which detects the normal case Mem feed
+  // from a release membar (or, optionally, its cpuorder child) to a
+  // dependent volatile membar i.e. it ensures that the following Mem
+  // flow subgraph is present.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //          |  \      . . .
+  //          |  StoreN/P[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile
+  //
+  // if the correct configuration is present returns the volatile
+  // membar otherwise NULL.
+  //
+  // the input membar is expected to be either a cpuorder membar or a
+  // release membar. in the latter case it should not have a cpu membar
+  // child.
+  //
+  // the returned membar may be a card mark membar rather than a
+  // trailing membar.
+
+  MemBarNode *leading_to_normal(MemBarNode *leading)
+  {
+    assert((leading->Opcode() == Op_MemBarRelease ||
+	    leading->Opcode() == Op_MemBarCPUOrder),
+	   "expecting a volatile or cpuroder membar!");
+
+    // check the mem flow
+    ProjNode *mem = leading->proj_out(TypeFunc::Memory);
+
+    if (!mem)
+      return NULL;
+
+    Node *x = NULL;
+    StoreNode * st = NULL;
+    MergeMemNode *mm = NULL;
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      if (x->is_MergeMem()) {
+	if (mm != NULL)
+	  return NULL;
+	// two merge mems is one too many
+	mm = x->as_MergeMem();
+      } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+	// two releasing stores is one too many
+	if (st != NULL)
+	  return NULL;
+	st = x->as_Store();
+      }
+    }
+
+    if (!mm || !st)
+      return NULL;
+
+    bool found = false;
+    // ensure the store feeds the merge
+    for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+      if (st->fast_out(i) == mm) {
+	found = true;
+	break;
+      }
+    }
+
+    if (!found)
+      return NULL;
+
+    MemBarNode *mbvol = NULL;
+    // ensure the merge feeds a volatile membar
+    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+      x = mm->fast_out(i);
+      if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
+	mbvol = x->as_MemBar();
+	break;
+      }
+    }
+
+    return mbvol;
+  }
+
+  // normal_to_leading
+  //
+  // graph traversal helper which detects the normal case Mem feed
+  // from either a card mark or a trailing membar to a preceding
+  // release membar (optionally its cpuorder child) i.e. it ensures
+  // that the following Mem flow subgraph is present.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //          |  \      . . .
+  //          |  StoreN/P[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile
+  //
+  // this predicate checks for the same flow as the previous predicate
+  // but starting from the bottom rather than the top.
+  //
+  // if the configuration is present returns the cpuorder member for
+  // preference or when absent the release membar otherwise NULL.
+  //
+  // n.b. the input membar is expected to be a MemBarVolatile but
+  // need not be a card mark membar.
+
+  MemBarNode *normal_to_leading(const MemBarNode *barrier)
+  {
+    // input must be a volatile membar
+    assert(barrier->Opcode() == Op_MemBarVolatile, "expecting a volatile membar");
+    Node *x;
+
+    // the Mem feed to the membar should be a merge
+    x = barrier->in(TypeFunc::Memory);
+    if (!x->is_MergeMem())
+      return NULL;
+
+    MergeMemNode *mm = x->as_MergeMem();
+
+    // the AliasIdxBot slice should be another MemBar projection
+    x = mm->in(Compile::AliasIdxBot);
+    // ensure this is a non control projection
+    if (!x->is_Proj() || x->is_CFG())
+      return NULL;
+    // if it is fed by a membar that's the one we want
+    x = x->in(0);
+
+    if (!x->is_MemBar())
+      return NULL;
+
+    MemBarNode *leading = x->as_MemBar();
+    // reject invalid candidates
+    if (!leading_membar(leading))
+      return NULL;
+
+    // ok, we have a leading ReleaseMembar, now for the sanity clauses
+
+    // the leading membar must feed Mem to a releasing store
+    ProjNode *mem = leading->proj_out(TypeFunc::Memory);
+    StoreNode *st = NULL;
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+	st = x->as_Store();
+	break;
+      }
+    }
+    if (st == NULL)
+      return NULL;
+
+    // the releasing store has to feed the same merge
+    for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+      if (st->fast_out(i) == mm)
+	return leading;
+    }
+
+    return NULL;
+  }
+
+  // card_mark_to_trailing
+  //
+  // graph traversal helper which detects extra, non-normal Mem feed
+  // from a card mark volatile membar to a trailing membar i.e. it
+  // ensures that one of the following three GC post-write Mem flow
+  // subgraphs is present.
+  //
+  // 1)
+  //     . . .
+  //       |
+  //   MemBarVolatile (card mark)
+  //      |          |     
+  //      |        StoreCM
+  //      |          |
+  //      |        . . .
+  //  Bot |  / 
+  //   MergeMem 
+  //      |
+  //   MemBarVolatile (trailing)
+  //
+  //
+  // 2)
+  //   MemBarRelease/CPUOrder (leading)
+  //    |
+  //    | 
+  //    |\       . . .
+  //    | \        | 
+  //    |  \  MemBarVolatile (card mark) 
+  //    |   \   |     |
+  //     \   \  |   StoreCM    . . .
+  //      \   \ |
+  //       \  Phi
+  //        \ /
+  //        Phi  . . .
+  //     Bot |   /
+  //       MergeMem
+  //         |
+  //   MemBarVolatile (trailing)
+  //
+  // 3)
+  //   MemBarRelease/CPUOrder (leading)
+  //    |
+  //    |\
+  //    | \
+  //    |  \      . . .
+  //    |   \       |
+  //    |\   \  MemBarVolatile (card mark)
+  //    | \   \   |     |
+  //    |  \   \  |   StoreCM    . . .
+  //    |   \   \ |
+  //     \   \  Phi
+  //      \   \ /  
+  //       \  Phi
+  //        \ /
+  //        Phi  . . .
+  //     Bot |   /
+  //       MergeMem
+  //         |
+  //   MemBarVolatile (trailing)
+  //
+  // configuration 1 is only valid if UseConcMarkSweepGC &&
+  // UseCondCardMark
+  //
+  // configurations 2 and 3 are only valid if UseG1GC.
+  //
+  // if a valid configuration is present returns the trailing membar
+  // otherwise NULL.
+  //
+  // n.b. the supplied membar is expected to be a card mark
+  // MemBarVolatile i.e. the caller must ensure the input node has the
+  // correct operand and feeds Mem to a StoreCM node
+
+  MemBarNode *card_mark_to_trailing(const MemBarNode *barrier)
+  {
+    // input must be a card mark volatile membar
+    assert(is_card_mark_membar(barrier), "expecting a card mark membar");
+
+    Node *feed = barrier->proj_out(TypeFunc::Memory);
+    Node *x;
+    MergeMemNode *mm = NULL;
+
+    const int MAX_PHIS = 3;	// max phis we will search through
+    int phicount = 0; 		// current search count
+
+    bool retry_feed = true;
+    while (retry_feed) {
+      // see if we have a direct MergeMem feed
+      for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
+	x = feed->fast_out(i);
+	// the correct Phi will be merging a Bot memory slice
+	if (x->is_MergeMem()) {
+	  mm = x->as_MergeMem();
+	  break;
+	}
+      }
+      if (mm) {
+	retry_feed = false;
+      } else if (UseG1GC & phicount++ < MAX_PHIS) {
+	// the barrier may feed indirectly via one or two Phi nodes
+	PhiNode *phi = NULL;
+	for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
+	  x = feed->fast_out(i);
+	  // the correct Phi will be merging a Bot memory slice
+	  if (x->is_Phi() && x->adr_type() == TypePtr::BOTTOM) {
+	    phi = x->as_Phi();
+	    break;
+	  }
+	}
+	if (!phi)
+	  return NULL;
+	// look for another merge below this phi
+	feed = phi;
+      } else {
+	// couldn't find a merge
+	return NULL;
+      }
+    }
+
+    // sanity check this feed turns up as the expected slice
+    assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
+
+    MemBarNode *trailing = NULL;
+    // be sure we have a volatile membar below the merge
+    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+      x = mm->fast_out(i);
+      if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
+	trailing = x->as_MemBar();
+	break;
+      }
+    }
+
+    return trailing;
+  }
+
+  // trailing_to_card_mark
+  //
+  // graph traversal helper which detects extra, non-normal Mem feed
+  // from a trailing membar to a preceding card mark volatile membar
+  // i.e. it identifies whether one of the three possible extra GC
+  // post-write Mem flow subgraphs is present
+  //
+  // this predicate checks for the same flow as the previous predicate
+  // but starting from the bottom rather than the top.
+  //
+  // if the configurationis present returns the card mark membar
+  // otherwise NULL
+
+  MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
+  {
+    assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
+
+    Node *x = trailing->in(TypeFunc::Memory);
+    // the Mem feed to the membar should be a merge
+    if (!x->is_MergeMem())
+      return NULL;
+
+    MergeMemNode *mm = x->as_MergeMem();
+
+    x = mm->in(Compile::AliasIdxBot);
+    // with G1 we may possibly see a Phi or two before we see a Memory
+    // Proj from the card mark membar
+
+    const int MAX_PHIS = 3;	// max phis we will search through
+    int phicount = 0; 		// current search count
+
+    bool retry_feed = !x->is_Proj();
+
+    while (retry_feed) {
+      if (UseG1GC && x->is_Phi() && phicount++ < MAX_PHIS) {
+	PhiNode *phi = x->as_Phi();
+	ProjNode *proj = NULL;
+	PhiNode *nextphi = NULL;
+	bool found_leading = false;
+	for (uint i = 1; i < phi->req(); i++) {
+	  x = phi->in(i);
+	  if (x->is_Phi()) {
+	    nextphi = x->as_Phi();
+	  } else if (x->is_Proj()) {
+	    int opcode = x->in(0)->Opcode();
+	    if (opcode == Op_MemBarVolatile) {
+	      proj = x->as_Proj();
+	    } else if (opcode == Op_MemBarRelease ||
+		       opcode == Op_MemBarCPUOrder) {
+	      // probably a leading membar
+	      found_leading = true;
+	    }
+	  }
+	}
+	// if we found a correct looking proj then retry from there
+	// otherwise we must see a leading and a phi or this the
+	// wrong config
+	if (proj != NULL) {
+	  x = proj;
+	  retry_feed = false;
+	} else if (found_leading && nextphi != NULL) {
+	  // retry from this phi to check phi2
+	  x = nextphi;
+	} else {
+	  // not what we were looking for
+	  return NULL;
+	}
+      } else {
+	return NULL;
+      }
+    }
+    // the proj has to come from the card mark membar
+    x = x->in(0);
+    if (!x->is_MemBar())
+      return NULL;
+
+    MemBarNode *card_mark_membar = x->as_MemBar();
+
+    if (!is_card_mark_membar(card_mark_membar))
+      return NULL;
+
+    return card_mark_membar;
+  }
+
+  // trailing_to_leading
+  //
+  // graph traversal helper which checks the Mem flow up the graph
+  // from a (non-card mark) volatile membar attempting to locate and
+  // return an associated leading membar. it first looks for a
+  // subgraph in the normal configuration (relying on helper
+  // normal_to_leading). failing that it then looks for one of the
+  // possible post-write card mark subgraphs linking the trailing node
+  // to a the card mark membar (relying on helper
+  // trailing_to_card_mark), and then checks that the card mark membar
+  // is fed by a leading membar (once again relying on auxiliary
+  // predicate normal_to_leading).
+  //
+  // if the configuration is valid returns the cpuorder member for
+  // preference or when absent the release membar otherwise NULL.
+  //
+  // n.b. the input membar is expected to be a volatile membar but
+  // must *not* be a card mark membar.
+
+  MemBarNode *trailing_to_leading(const MemBarNode *trailing)
+  {
+    assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
+
+    MemBarNode *leading = normal_to_leading(trailing);
+
+    if (leading)
+      return leading;
+
+    MemBarNode *card_mark_membar = trailing_to_card_mark(trailing);
+
+    if (!card_mark_membar)
+      return NULL;
+
+    return normal_to_leading(card_mark_membar);
+  }
+
   // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
 
-bool unnecessary_acquire(const Node *barrier) {
+bool unnecessary_acquire(const Node *barrier)
+{
   // assert barrier->is_MemBar();
   if (UseBarriersForVolatile)
     // we need to plant a dmb
@@ -1323,13 +2146,11 @@
     return (x->is_Load() && x->as_Load()->is_acquire());
   }
   
-  // only continue if we want to try to match unsafe volatile gets
-  if (UseBarriersForUnsafeVolatileGet)
-    return false;
+  // now check for an unsafe volatile get
 
   // need to check for
   //
-  //     MemBarCPUOrder
+  //   MemBarCPUOrder
   //        ||       \\
   //   MemBarAcquire* LoadX[mo_acquire]
   //        ||
@@ -1341,9 +2162,13 @@
   // check for a parent MemBarCPUOrder
   ProjNode *ctl;
   ProjNode *mem;
-  MemBarNode *parent = has_parent_membar(barrier, ctl, mem);
+  MemBarNode *parent = parent_membar(barrier);
   if (!parent || parent->Opcode() != Op_MemBarCPUOrder)
     return false;
+  ctl = parent->proj_out(TypeFunc::Control);
+  mem = parent->proj_out(TypeFunc::Memory);
+  if (!ctl || !mem)
+    return false;
   // ensure the proj nodes both feed a LoadX[mo_acquire]
   LoadNode *ld = NULL;
   for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
@@ -1369,7 +2194,7 @@
   if (ld)
     return false;
   // check for a child cpuorder membar
-  MemBarNode *child  = has_child_membar(barrier->as_MemBar(), ctl, mem);
+  MemBarNode *child  = child_membar(barrier->as_MemBar());
   if (!child || child->Opcode() != Op_MemBarCPUOrder)
     return false;
 
@@ -1422,9 +2247,7 @@
     return true;
   }
 
-  // only continue if we want to try to match unsafe volatile gets
-  if (UseBarriersForUnsafeVolatileGet)
-    return false;
+  // now check for an unsafe volatile get
 
   // check if Ctl and Proj feed comes from a MemBarCPUOrder
   //
@@ -1435,22 +2258,20 @@
   //   MemBarCPUOrder
 
   MemBarNode *membar;
-  ProjNode *ctl;
-  ProjNode *mem;
-
-  membar = has_parent_membar(ld, ctl, mem);
+
+  membar = parent_membar(ld);
 
   if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
     return false;
 
   // ensure that there is a CPUOrder->Acquire->CPUOrder membar chain
 
-  membar = has_child_membar(membar, ctl, mem);
+  membar = child_membar(membar);
 
   if (!membar || !membar->Opcode() == Op_MemBarAcquire)
     return false;
 
-  membar = has_child_membar(membar, ctl, mem);
+  membar = child_membar(membar);
   
   if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
     return false;
@@ -1458,282 +2279,64 @@
   return true;
 }
 
-bool unnecessary_release(const Node *n) {
+bool unnecessary_release(const Node *n)
+{
+  assert((n->is_MemBar() &&
+	  n->Opcode() == Op_MemBarRelease),
+	 "expecting a release membar");
+
+  if (UseBarriersForVolatile)
+    // we need to plant a dmb
+    return false;
+
+  // if there is a dependent CPUOrder barrier then use that as the
+  // leading
+
+  MemBarNode *barrier = n->as_MemBar();
+  // check for an intervening cpuorder membar
+  MemBarNode *b = child_membar(barrier);
+  if (b && b->Opcode() == Op_MemBarCPUOrder) {
+    // ok, so start the check from the dependent cpuorder barrier
+    barrier = b;
+  }
+
+  // must start with a normal feed
+  MemBarNode *child_barrier = leading_to_normal(barrier);
+
+  if (!child_barrier)
+    return false;
+
+  if (!is_card_mark_membar(child_barrier))
+    // this is the trailing membar and we are done
+    return true;
+
+  // must be sure this card mark feeds a trailing membar
+  MemBarNode *trailing = card_mark_to_trailing(child_barrier);
+  return (trailing != NULL);
+}
+
+bool unnecessary_volatile(const Node *n)
+{
   // assert n->is_MemBar();
   if (UseBarriersForVolatile)
     // we need to plant a dmb
     return false;
 
-  // ok, so we can omit this release barrier if it has been inserted
-  // as part of a volatile store sequence
-  //
-  //   MemBarRelease
-  //  {      ||      }
-  //  {MemBarCPUOrder} -- optional
-  //         ||     \\
-  //         ||     StoreX[mo_release]
-  //         | \     /
-  //         | MergeMem
-  //         | /
-  //   MemBarVolatile
-  //
-  // where
-  //  || and \\ represent Ctl and Mem feeds via Proj nodes
-  //  | \ and / indicate further routing of the Ctl and Mem feeds
-  // 
-  // so we need to check that
-  //
-  // ia) the release membar (or its dependent cpuorder membar) feeds
-  // control to a store node (via a Control project node)
-  //
-  // ii) the store is ordered release
-  //
-  // iii) the release membar (or its dependent cpuorder membar) feeds
-  // control to a volatile membar (via the same Control project node)
-  //
-  // iv) the release membar feeds memory to a merge mem and to the
-  // same store (both via a single Memory proj node)
-  //
-  // v) the store outputs to the merge mem
-  //
-  // vi) the merge mem outputs to the same volatile membar
-  //
-  // n.b. if this is an inlined unsafe node then the release membar
-  // may feed its control and memory links via an intervening cpuorder
-  // membar. this case can be dealt with when we check the release
-  // membar projections. if they both feed a single cpuorder membar
-  // node continue to make the same checks as above but with the
-  // cpuorder membar substituted for the release membar. if they don't
-  // both feed a cpuorder membar then the check fails.
-  //
-  // n.b.b. for an inlined unsafe store of an object in the case where
-  // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see
-  // an embedded if then else where we expect the store. this is
-  // needed to do the right type of store depending on whether
-  // heap_base is NULL. We could check for that but for now we can
-  // just take the hit of on inserting a redundant dmb for this
-  // redundant volatile membar
-
-  MemBarNode *barrier = n->as_MemBar();
-  ProjNode *ctl;
-  ProjNode *mem;
-  // check for an intervening cpuorder membar
-  MemBarNode *b = has_child_membar(barrier, ctl, mem);
-  if (b && b->Opcode() == Op_MemBarCPUOrder) {
-    // ok, so start form the dependent cpuorder barrier
-    barrier = b;
-  }
-  // check the ctl and mem flow
-  ctl = barrier->proj_out(TypeFunc::Control);
-  mem = barrier->proj_out(TypeFunc::Memory);
-
-  // the barrier needs to have both a Ctl and Mem projection
-  if (! ctl || ! mem)
-    return false;
-
-  Node *x = NULL;
-  Node *mbvol = NULL;
-  StoreNode * st = NULL;
-
-  // For a normal volatile write the Ctl ProjNode should have output
-  // to a MemBarVolatile and a Store marked as releasing
-  //
-  // n.b. for an inlined unsafe store of an object in the case where
-  // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see
-  // an embedded if then else where we expect the store. this is
-  // needed to do the right type of store depending on whether
-  // heap_base is NULL. We could check for that case too but for now
-  // we can just take the hit of inserting a dmb and a non-volatile
-  // store to implement the volatile store
-
-  for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
-    x = ctl->fast_out(i);
-    if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-      if (mbvol) {
-	return false;
-      }
-      mbvol = x;
-    } else if (x->is_Store()) {
-      st = x->as_Store();
-      if (! st->is_release()) {
-	return false;
-      }
-    } else if (!x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
+  MemBarNode *mbvol = n->as_MemBar();
+
+  // first we check if this is part of a card mark. if so then we have
+  // to generate a StoreLoad barrier
+  
+  if (is_card_mark_membar(mbvol))
       return false;
-    }
-  }
-
-  if (!mbvol || !st)
-    return false;
-
-  // the Mem ProjNode should output to a MergeMem and the same Store
-  Node *mm = NULL;
-  for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
-    x = mem->fast_out(i);
-    if (!mm && x->is_MergeMem()) {
-      mm = x;
-    } else if (x != st && !x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  if (!mm)
-    return false;
-
-  // the MergeMem should output to the MemBarVolatile
-  for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
-    x = mm->fast_out(i);
-    if (x != mbvol && !x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  return true;
-}
-
-bool unnecessary_volatile(const Node *n) {
-  // assert n->is_MemBar();
-  if (UseBarriersForVolatile)
-    // we need to plant a dmb
-    return false;
-
-  // ok, so we can omit this volatile barrier if it has been inserted
-  // as part of a volatile store sequence
-  //
-  //   MemBarRelease
-  //  {      ||      }
-  //  {MemBarCPUOrder} -- optional
-  //         ||     \\
-  //         ||     StoreX[mo_release]
-  //         | \     /
-  //         | MergeMem
-  //         | /
-  //   MemBarVolatile
-  //
-  // where
-  //  || and \\ represent Ctl and Mem feeds via Proj nodes
-  //  | \ and / indicate further routing of the Ctl and Mem feeds
-  // 
-  // we need to check that
-  //
-  // i) the volatile membar gets its control feed from a release
-  // membar (or its dependent cpuorder membar) via a Control project
-  // node
-  //
-  // ii) the release membar (or its dependent cpuorder membar) also
-  // feeds control to a store node via the same proj node
-  //
-  // iii) the store is ordered release
-  //
-  // iv) the release membar (or its dependent cpuorder membar) feeds
-  // memory to a merge mem and to the same store (both via a single
-  // Memory proj node)
-  //
-  // v) the store outputs to the merge mem
-  //
-  // vi) the merge mem outputs to the volatile membar
-  //
-  // n.b. for an inlined unsafe store of an object in the case where
-  // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see
-  // an embedded if then else where we expect the store. this is
-  // needed to do the right type of store depending on whether
-  // heap_base is NULL. We could check for that but for now we can
-  // just take the hit of on inserting a redundant dmb for this
-  // redundant volatile membar
-
-  MemBarNode *mbvol = n->as_MemBar();
-  Node *x = n->lookup(TypeFunc::Control);
-
-  if (! x || !x->is_Proj())
-    return false;
-
-  ProjNode *proj = x->as_Proj();
-
-  x = proj->lookup(0);
-
-  if (!x || !x->is_MemBar())
-    return false;
-
-  MemBarNode *barrier = x->as_MemBar();
-
-  // if the barrier is a release membar we have what we want. if it is
-  // a cpuorder membar then we need to ensure that it is fed by a
-  // release membar in which case we proceed to check the graph below
-  // this cpuorder membar as the feed
-
-  if (x->Opcode() != Op_MemBarRelease) {
-    if (x->Opcode() != Op_MemBarCPUOrder)
-      return false;
-    ProjNode *ctl;
-    ProjNode *mem;
-    MemBarNode *b = has_parent_membar(x, ctl, mem);
-    if (!b || !b->Opcode() == Op_MemBarRelease)
-      return false;
-  }
-
-  ProjNode *ctl = barrier->proj_out(TypeFunc::Control);
-  ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
-
-  // barrier needs to have both a Ctl and Mem projection
-  // and we need to have reached it via the Ctl projection
-  if (! ctl || ! mem || ctl != proj)
-    return false;
-
-  StoreNode * st = NULL;
-
-  // The Ctl ProjNode should have output to a MemBarVolatile and
-  // a Store marked as releasing
-  for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
-    x = ctl->fast_out(i);
-    if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-      if (x != mbvol) {
-	return false;
-      }
-    } else if (x->is_Store()) {
-      st = x->as_Store();
-      if (! st->is_release()) {
-	return false;
-      }
-    } else if (!x->is_Mach()){
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  if (!st)
-    return false;
-
-  // the Mem ProjNode should output to a MergeMem and the same Store
-  Node *mm = NULL;
-  for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
-    x = mem->fast_out(i);
-    if (!mm && x->is_MergeMem()) {
-      mm = x;
-    } else if (x != st && !x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  if (!mm)
-    return false;
-
-  // the MergeMem should output to the MemBarVolatile
-  for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
-    x = mm->fast_out(i);
-    if (x != mbvol && !x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  return true;
-}
-
-
+
+  // ok, if it's not a card mark then we still need to check if it is
+  // a trailing membar of a volatile put hgraph.
+
+  return (trailing_to_leading(mbvol) != NULL);
+}
+
+// predicates controlling emit of str<x>/stlr<x> and associated dmbs
 
 bool needs_releasing_store(const Node *n)
 {
@@ -1744,29 +2347,13 @@
 
   StoreNode *st = n->as_Store();
 
+  // the store must be marked as releasing
   if (!st->is_release())
     return false;
 
-  // check if this store is bracketed by a release (or its dependent
-  // cpuorder membar) and a volatile membar
-  //
-  //   MemBarRelease
-  //  {      ||      }
-  //  {MemBarCPUOrder} -- optional
-  //         ||     \\
-  //         ||     StoreX[mo_release]
-  //         | \     /
-  //         | MergeMem
-  //         | /
-  //   MemBarVolatile
-  //
-  // where
-  //  || and \\ represent Ctl and Mem feeds via Proj nodes
-  //  | \ and / indicate further routing of the Ctl and Mem feeds
-  // 
-
-
-  Node *x = st->lookup(TypeFunc::Control);
+  // the store must be fed by a membar
+
+  Node *x = st->lookup(StoreNode::Memory);
 
   if (! x || !x->is_Proj())
     return false;
@@ -1780,79 +2367,78 @@
 
   MemBarNode *barrier = x->as_MemBar();
 
-  // if the barrier is a release membar we have what we want. if it is
-  // a cpuorder membar then we need to ensure that it is fed by a
-  // release membar in which case we proceed to check the graph below
-  // this cpuorder membar as the feed
-
-  if (x->Opcode() != Op_MemBarRelease) {
-    if (x->Opcode() != Op_MemBarCPUOrder)
-      return false;
-    Node *ctl = x->lookup(TypeFunc::Control);
-    Node *mem = x->lookup(TypeFunc::Memory);
-    if (!ctl || !ctl->is_Proj() || !mem || !mem->is_Proj())
-      return false;
-    x = ctl->lookup(0);
-    if (!x || !x->is_MemBar() || !x->Opcode() == Op_MemBarRelease)
-      return false;
-    Node *y = mem->lookup(0);
-    if (!y || y != x)
-      return false;
-  }
-
-  ProjNode *ctl = barrier->proj_out(TypeFunc::Control);
-  ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
-
-  // MemBarRelease needs to have both a Ctl and Mem projection
-  // and we need to have reached it via the Ctl projection
-  if (! ctl || ! mem || ctl != proj)
+  // if the barrier is a release membar or a cpuorder mmebar fed by a
+  // release membar then we need to check whether that forms part of a
+  // volatile put graph.
+
+  // reject invalid candidates
+  if (!leading_membar(barrier))
     return false;
 
-  MemBarNode *mbvol = NULL;
-
-  // The Ctl ProjNode should have output to a MemBarVolatile and
-  // a Store marked as releasing
-  for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
-    x = ctl->fast_out(i);
-    if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-      mbvol = x->as_MemBar();
-    } else if (x->is_Store()) {
-      if (x != st) {
-	return false;
-      }
-    } else if (!x->is_Mach()){
-      return false;
-    }
-  }
+  // does this lead a normal subgraph?
+  MemBarNode *mbvol = leading_to_normal(barrier);
 
   if (!mbvol)
     return false;
 
-  // the Mem ProjNode should output to a MergeMem and the same Store
-  Node *mm = NULL;
-  for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
-    x = mem->fast_out(i);
-    if (!mm && x->is_MergeMem()) {
-      mm = x;
-    } else if (x != st && !x->is_Mach()) {
-      return false;
-    }
-  }
-
-  if (!mm)
+  // all done unless this is a card mark
+  if (!is_card_mark_membar(mbvol))
+    return true;
+  
+  // we found a card mark -- just make sure we have a trailing barrier
+
+  return (card_mark_to_trailing(mbvol) != NULL);
+}
+
+// predicate controlling translation of StoreCM
+//
+// returns true if a StoreStore must precede the card write otherwise
+// false
+
+bool unnecessary_storestore(const Node *storecm)
+{
+  assert(storecm->Opcode()  == Op_StoreCM, "expecting a StoreCM");
+
+  // we only ever need to generate a dmb ishst between an object put
+  // and the associated card mark when we are using CMS without
+  // conditional card marking
+
+  if (!UseConcMarkSweepGC || UseCondCardMark)
+    return true;
+
+  // if we are implementing volatile puts using barriers then the
+  // object put as an str so we must insert the dmb ishst
+
+  if (UseBarriersForVolatile)
     return false;
 
-  // the MergeMem should output to the MemBarVolatile
-  for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
-    x = mm->fast_out(i);
-    if (x != mbvol && !x->is_Mach()) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
+  // we can omit the dmb ishst if this StoreCM is part of a volatile
+  // put because in thta case the put will be implemented by stlr
+  //
+  // we need to check for a normal subgraph feeding this StoreCM.
+  // that means the StoreCM must be fed Memory from a leading membar,
+  // either a MemBarRelease or its dependent MemBarCPUOrder, and the
+  // leading membar must be part of a normal subgraph
+
+  Node *x = storecm->in(StoreNode::Memory);
+
+  if (!x->is_Proj())
+    return false;
+
+  x = x->in(0);
+
+  if (!x->is_MemBar())
+    return false;
+
+  MemBarNode *leading = x->as_MemBar();
+
+  // reject invalid candidates
+  if (!leading_membar(leading))
+    return false;
+
+  // we can omit the StoreStore if it is the head of a normal subgraph
+  return (leading_to_normal(leading) != NULL);
+}
 
 
 #define __ _masm.
@@ -2944,6 +3530,13 @@
                as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}
 
+  enc_class aarch64_enc_strb0_ordered(memory mem) %{
+    MacroAssembler _masm(&cbuf);
+    __ membar(Assembler::StoreStore);
+    loadStore(_masm, &MacroAssembler::strb, zr, $mem->opcode(),
+               as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
   enc_class aarch64_enc_strh(iRegI src, memory mem) %{
     Register src_reg = as_Register($src$$reg);
     loadStore(MacroAssembler(&cbuf), &MacroAssembler::strh, src_reg, $mem->opcode(),
@@ -6613,6 +7206,7 @@
 instruct storeimmCM0(immI0 zero, memory mem)
 %{
   match(Set mem (StoreCM mem zero));
+  predicate(unnecessary_storestore(n));
 
   ins_cost(INSN_COST);
   format %{ "strb zr, $mem\t# byte" %}
@@ -6622,6 +7216,21 @@
   ins_pipe(istore_mem);
 %}
 
+// Store CMS card-mark Immediate with intervening StoreStore
+// needed when using CMS with no conditional card marking
+instruct storeimmCM0_ordered(immI0 zero, memory mem)
+%{
+  match(Set mem (StoreCM mem zero));
+
+  ins_cost(INSN_COST * 2);
+  format %{ "dmb ishst"
+      "\n\tstrb zr, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_strb0_ordered(mem));
+
+  ins_pipe(istore_mem);
+%}
+
 // Store Byte
 instruct storeB(iRegIorL2I src, memory mem)
 %{
@@ -6643,7 +7252,7 @@
   predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
-  format %{ "strb zr, $mem\t# byte" %}
+  format %{ "strb rscractch2, $mem\t# byte" %}
 
   ins_encode(aarch64_enc_strb0(mem));
 
@@ -7396,6 +8005,7 @@
   format %{ "membar_acquire" %}
 
   ins_encode %{
+    __ block_comment("membar_acquire");
     __ membar(Assembler::LoadLoad|Assembler::LoadStore);
   %}
 
@@ -7448,6 +8058,7 @@
   format %{ "membar_release" %}
 
   ins_encode %{
+    __ block_comment("membar_release");
     __ membar(Assembler::LoadStore|Assembler::StoreStore);
   %}
   ins_pipe(pipe_serial);
@@ -7499,6 +8110,7 @@
   format %{ "membar_volatile" %}
 
   ins_encode %{
+    __ block_comment("membar_volatile");
     __ membar(Assembler::StoreLoad);
   %}
 
@@ -9429,7 +10041,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9465,7 +10077,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9501,7 +10113,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9537,7 +10149,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9573,7 +10185,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9609,7 +10221,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9645,7 +10257,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9681,7 +10293,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9717,7 +10329,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9754,7 +10366,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9792,7 +10404,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9830,7 +10442,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9868,7 +10480,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9906,7 +10518,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9944,7 +10556,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9982,7 +10594,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10020,7 +10632,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10058,7 +10670,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10096,7 +10708,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10134,7 +10746,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10172,7 +10784,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10210,7 +10822,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10248,7 +10860,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10286,7 +10898,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
--- a/src/cpu/aarch64/vm/aarch64_ad.m4	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/aarch64/vm/aarch64_ad.m4	Fri Aug 21 09:12:42 2015 +0200
@@ -42,7 +42,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::$5,
-              $src3$$constant & 0x3f);
+              $src3$$constant & ifelse($1,I,0x1f,0x3f));
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -87,7 +87,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::$5,
-              $src3$$constant & 0x3f);
+              $src3$$constant & ifelse($1,I,0x1f,0x3f));
   %}
 
   ins_pipe(ialu_reg_reg_shift);
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -268,7 +268,7 @@
     __ ldar(r21, r28);                                 //       ldar    x21, [x28]
 
 // LoadStoreExclusiveOp
-    __ stxrw(r24, r24, r7);                            //       stxr    w24, w24, [x7]
+    __ stxrw(r21, r24, r7);                            //       stxr    w21, w24, [x7]
     __ stlxrw(r21, r26, r28);                          //       stlxr   w21, w26, [x28]
     __ ldxrw(r21, r6);                                 //       ldxr    w21, [x6]
     __ ldaxrw(r15, r30);                               //       ldaxr   w15, [x30]
@@ -299,7 +299,7 @@
 
 // LoadStoreExclusiveOp
     __ ldxpw(r25, r4, r22);                            //       ldxp    w25, w4, [x22]
-    __ ldaxpw(r14, r14, r15);                          //       ldaxp   w14, w14, [x15]
+    __ ldaxpw(r13, r14, r15);                          //       ldaxp   w13, w14, [x15]
     __ stxpw(r20, r26, r8, r10);                       //       stxp    w20, w26, w8, [x10]
     __ stlxpw(r23, r18, r18, r18);                     //       stlxp   w23, w18, w18, [x18]
 
@@ -773,7 +773,7 @@
  260:   c85fffbb        ldaxr   x27, [x29]
  264:   c89fffa0        stlr    x0, [x29]
  268:   c8dfff95        ldar    x21, [x28]
- 26c:   88187cf8        stxr    w24, w24, [x7]
+ 26c:   88157cf8        stxr    w21, w24, [x7]
  270:   8815ff9a        stlxr   w21, w26, [x28]
  274:   885f7cd5        ldxr    w21, [x6]
  278:   885fffcf        ldaxr   w15, [x30]
@@ -796,7 +796,7 @@
  2bc:   c82870bb        stxp    w8, x27, x28, [x5]
  2c0:   c825b8c8        stlxp   w5, x8, x14, [x6]
  2c4:   887f12d9        ldxp    w25, w4, [x22]
- 2c8:   887fb9ee        ldaxp   w14, w14, [x15]
+ 2c8:   887fb9ed        ldaxp   w13, w14, [x15]
  2cc:   8834215a        stxp    w20, w26, w8, [x10]
  2d0:   8837ca52        stlxp   w23, w18, w18, [x18]
  2d4:   f806317e        str     x30, [x11,#99]
@@ -1085,13 +1085,13 @@
     0xd444c320,     0xd503201f,     0xd69f03e0,     0xd6bf03e0,
     0xd5033fdf,     0xd5033f9f,     0xd5033abf,     0xd61f0040,
     0xd63f00a0,     0xc8147c55,     0xc805fcfd,     0xc85f7e05,
-    0xc85fffbb,     0xc89fffa0,     0xc8dfff95,     0x88187cf8,
+    0xc85fffbb,     0xc89fffa0,     0xc8dfff95,     0x88157cf8,
     0x8815ff9a,     0x885f7cd5,     0x885fffcf,     0x889ffc73,
     0x88dffc56,     0x48127c0f,     0x480bff85,     0x485f7cdd,
     0x485ffcf2,     0x489fff99,     0x48dffe62,     0x080a7c3e,
     0x0814fed5,     0x085f7c59,     0x085ffcb8,     0x089ffc70,
     0x08dfffb6,     0xc87f0a68,     0xc87fcdc7,     0xc82870bb,
-    0xc825b8c8,     0x887f12d9,     0x887fb9ee,     0x8834215a,
+    0xc825b8c8,     0x887f12d9,     0x887fb9ed,     0x8834215a,
     0x8837ca52,     0xf806317e,     0xb81b3337,     0x39000dc2,
     0x78005149,     0xf84391f4,     0xb85b220c,     0x385fd356,
     0x785d127e,     0x389f4149,     0x79801e3c,     0x79c014a3,
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -1106,13 +1106,13 @@
 
 #define INSN4(NAME, sz, op, o0) /* Four registers */                    \
   void NAME(Register Rs, Register Rt1, Register Rt2, Register Rn) {     \
-    assert(Rs != Rn, "unpredictable instruction");                  \
+    guarantee(Rs != Rn && Rs != Rt1 && Rs != Rt2, "unpredictable instruction"); \
     load_store_exclusive(Rs, Rt1, Rt2, Rn, sz, op, o0);                 \
   }
 
 #define INSN3(NAME, sz, op, o0) /* Three registers */                   \
   void NAME(Register Rs, Register Rt, Register Rn) {                    \
-    assert(Rs != Rn, "unpredictable instruction");                  \
+    guarantee(Rs != Rn && Rs != Rt, "unpredictable instruction");       \
     load_store_exclusive(Rs, Rt, (Register)0b11111, Rn, sz, op, o0);    \
   }
 
@@ -1124,6 +1124,7 @@
 
 #define INSN_FOO(NAME, sz, op, o0) /* Three registers, encoded differently */ \
   void NAME(Register Rt1, Register Rt2, Register Rn) {                  \
+    guarantee(Rt1 != Rt2, "unpredictable instruction");                 \
     load_store_exclusive((Register)0b11111, Rt1, Rt2, Rn, sz, op, o0);  \
   }
 
--- a/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -611,6 +611,7 @@
     Label done;
 
     const Register swap_reg = r0;
+    const Register tmp = c_rarg2;
     const Register obj_reg = c_rarg3; // Will contain the oop
 
     const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
@@ -624,7 +625,7 @@
     ldr(obj_reg, Address(lock_reg, obj_offset));
 
     if (UseBiasedLocking) {
-      biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, done, &slow_case);
+      biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, done, &slow_case);
     }
 
     // Load (object->mark() | 1) into swap_reg
@@ -643,7 +644,7 @@
       cmpxchgptr(swap_reg, lock_reg, obj_reg, rscratch1, fast, &fail);
       bind(fast);
       atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
-                  rscratch2, rscratch1);
+                  rscratch2, rscratch1, tmp);
       b(done);
       bind(fail);
     } else {
@@ -671,7 +672,7 @@
     if (PrintBiasedLockingStatistics) {
       br(Assembler::NE, slow_case);
       atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
-                  rscratch2, rscratch1);
+                  rscratch2, rscratch1, tmp);
     }
     br(Assembler::EQ, done);
 
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -34,6 +34,7 @@
 #include "memory/resourceArea.hpp"
 #include "nativeInst_aarch64.hpp"
 #include "oops/klass.inline.hpp"
+#include "oops/oop.inline.hpp"
 #include "opto/compile.hpp"
 #include "opto/node.hpp"
 #include "runtime/biasedLocking.hpp"
@@ -398,11 +399,7 @@
   if (PrintBiasedLockingStatistics && counters == NULL)
     counters = BiasedLocking::counters();
 
-  bool need_tmp_reg = false;
-  if (tmp_reg == noreg) {
-    tmp_reg = rscratch2;
-  }
-  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
+  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
@@ -432,7 +429,7 @@
   if (counters != NULL) {
     Label around;
     cbnz(tmp_reg, around);
-    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
+    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
     b(done);
     bind(around);
   } else {
@@ -485,7 +482,7 @@
     bind(here);
     if (counters != NULL) {
       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
-                  tmp_reg, rscratch1);
+                  tmp_reg, rscratch1, rscratch2);
     }
   }
   b(done);
@@ -511,7 +508,7 @@
     bind(here);
     if (counters != NULL) {
       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
-                  tmp_reg, rscratch1);
+                  tmp_reg, rscratch1, rscratch2);
     }
   }
   b(done);
@@ -539,7 +536,7 @@
     // removing the bias bit from the object's header.
     if (counters != NULL) {
       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
-                  rscratch1);
+                  rscratch1, rscratch2);
     }
     bind(nope);
   }
@@ -1640,15 +1637,15 @@
   return Address(Rd);
 }
 
-void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
+void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
   Label retry_load;
   bind(retry_load);
   // flush and load exclusive from the memory location
   ldxrw(tmp, counter_addr);
   addw(tmp, tmp, 1);
   // if we store+flush with no intervening write tmp wil be zero
-  stxrw(tmp, tmp, counter_addr);
-  cbnzw(tmp, retry_load);
+  stxrw(tmp2, tmp, counter_addr);
+  cbnzw(tmp2, retry_load);
 }
 
 
@@ -2021,6 +2018,14 @@
   }
 }
 
+void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
+  if (decrement.is_register()) {
+    subw(Rd, Rn, decrement.as_register());
+  } else {
+    subw(Rd, Rn, decrement.as_constant());
+  }
+}
+
 void MacroAssembler::reinit_heapbase()
 {
   if (UseCompressedOops) {
@@ -2110,7 +2115,7 @@
     return a != b.as_register() && a != c && b.as_register() != c;
 }
 
-#define ATOMIC_OP(LDXR, OP, STXR)                                       \
+#define ATOMIC_OP(LDXR, OP, IOP, STXR)                                       \
 void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
   Register result = rscratch2;                                          \
   if (prev->is_valid())                                                 \
@@ -2120,14 +2125,15 @@
   bind(retry_load);                                                     \
   LDXR(result, addr);                                                   \
   OP(rscratch1, result, incr);                                          \
-  STXR(rscratch1, rscratch1, addr);                                     \
-  cbnzw(rscratch1, retry_load);                                         \
-  if (prev->is_valid() && prev != result)                               \
-    mov(prev, result);                                                  \
+  STXR(rscratch2, rscratch1, addr);                                     \
+  cbnzw(rscratch2, retry_load);                                         \
+  if (prev->is_valid() && prev != result) {                             \
+    IOP(prev, rscratch1, incr);                                         \
+  }                                                                     \
 }
 
-ATOMIC_OP(ldxr, add, stxr)
-ATOMIC_OP(ldxrw, addw, stxrw)
+ATOMIC_OP(ldxr, add, sub, stxr)
+ATOMIC_OP(ldxrw, addw, subw, stxrw)
 
 #undef ATOMIC_OP
 
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -107,9 +107,7 @@
   // Biased locking support
   // lock_reg and obj_reg must be loaded up with the appropriate values.
   // swap_reg is killed.
-  // tmp_reg is optional. If it is supplied (i.e., != noreg) it will
-  // be killed; if not supplied, push/pop will be used internally to
-  // allocate a temporary (inefficient, avoid if possible).
+  // tmp_reg must be supplied and must not be rscratch1 or rscratch2
   // Optional slow case is for implementations (interpreter and C1) which branch to
   // slow case directly. Leaves condition codes set for C2's Fast_Lock node.
   // Returns offset of first potentially-faulting instruction for null
@@ -126,10 +124,10 @@
 
   // Helper functions for statistics gathering.
   // Unconditional atomic increment.
-  void atomic_incw(Register counter_addr, Register tmp);
-  void atomic_incw(Address counter_addr, Register tmp1, Register tmp2) {
+  void atomic_incw(Register counter_addr, Register tmp, Register tmp2);
+  void atomic_incw(Address counter_addr, Register tmp1, Register tmp2, Register tmp3) {
     lea(tmp1, counter_addr);
-    atomic_incw(tmp1, tmp2);
+    atomic_incw(tmp1, tmp2, tmp3);
   }
   // Load Effective Address
   void lea(Register r, const Address &a) {
@@ -1057,6 +1055,7 @@
   void add(Register Rd, Register Rn, RegisterOrConstant increment);
   void addw(Register Rd, Register Rn, RegisterOrConstant increment);
   void sub(Register Rd, Register Rn, RegisterOrConstant decrement);
+  void subw(Register Rd, Register Rn, RegisterOrConstant decrement);
 
   void adrp(Register reg1, const Address &dest, unsigned long &byte_offset);
 
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -1774,6 +1774,7 @@
   const Register obj_reg  = r19;  // Will contain the oop
   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
   const Register old_hdr  = r13;  // value of old header at unlock time
+  const Register tmp = c_rarg3;
 
   Label slow_path_lock;
   Label lock_done;
@@ -1795,7 +1796,7 @@
     __ ldr(obj_reg, Address(oop_handle_reg, 0));
 
     if (UseBiasedLocking) {
-      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, lock_done, &slow_path_lock);
+      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, lock_done, &slow_path_lock);
     }
 
     // Load (object->mark() | 1) into swap_reg %r0
--- a/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -1913,15 +1913,18 @@
 }
 
 void TemplateInterpreterGenerator::count_bytecode() {
+  Register rscratch3 = r0;
   __ push(rscratch1);
   __ push(rscratch2);
+  __ push(rscratch3);
   Label L;
   __ mov(rscratch2, (address) &BytecodeCounter::_counter_value);
   __ bind(L);
   __ ldxr(rscratch1, rscratch2);
   __ add(rscratch1, rscratch1, 1);
-  __ stxr(rscratch1, rscratch1, rscratch2);
-  __ cbnzw(rscratch1, L);
+  __ stxr(rscratch3, rscratch1, rscratch2);
+  __ cbnzw(rscratch3, L);
+  __ pop(rscratch3);
   __ pop(rscratch2);
   __ pop(rscratch1);
 }
--- a/src/cpu/x86/vm/assembler_x86.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -1674,6 +1674,13 @@
   emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true);
 }
 
+void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
+  emit_int8(0x2A);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
@@ -6604,13 +6611,6 @@
   emit_operand(dst, src);
 }
 
-void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
-  emit_int8(0x2A);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
 void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
--- a/src/cpu/x86/vm/interp_masm_x86.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/x86/vm/interp_masm_x86.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -355,8 +355,8 @@
     case ctos:                                   // fall through
     case stos:                                   // fall through
     case itos: movl(rax, val_addr);                 break;
-    case ftos: movflt(xmm0, val_addr);              break;
-    case dtos: movdbl(xmm0, val_addr);              break;
+    case ftos: load_float(val_addr);                break;
+    case dtos: load_double(val_addr);               break;
     case vtos: /* nothing to do */                  break;
     default  : ShouldNotReachHere();
   }
@@ -376,8 +376,8 @@
     case ctos:                                     // fall through
     case stos:                                     // fall through
     case itos: movl(rax, val_addr);                   break;
-    case ftos: fld_s(val_addr);                       break;
-    case dtos: fld_d(val_addr);                       break;
+    case ftos: load_float(val_addr);                  break;
+    case dtos: load_double(val_addr);                 break;
     case vtos: /* nothing to do */                    break;
     default  : ShouldNotReachHere();
   }
@@ -578,6 +578,26 @@
   push(r);
 }
 
+void InterpreterMacroAssembler::push_f(XMMRegister r) {
+  subptr(rsp, wordSize);
+  movflt(Address(rsp, 0), r);
+}
+
+void InterpreterMacroAssembler::pop_f(XMMRegister r) {
+  movflt(r, Address(rsp, 0));
+  addptr(rsp, wordSize);
+}
+
+void InterpreterMacroAssembler::push_d(XMMRegister r) {
+  subptr(rsp, 2 * wordSize);
+  movdbl(Address(rsp, 0), r);
+}
+
+void InterpreterMacroAssembler::pop_d(XMMRegister r) {
+  movdbl(r, Address(rsp, 0));
+  addptr(rsp, 2 * Interpreter::stackElementSize);
+}
+
 #ifdef _LP64
 void InterpreterMacroAssembler::pop_i(Register r) {
   // XXX can't use pop currently, upper half non clean
@@ -590,31 +610,11 @@
   addptr(rsp, 2 * Interpreter::stackElementSize);
 }
 
-void InterpreterMacroAssembler::pop_f(XMMRegister r) {
-  movflt(r, Address(rsp, 0));
-  addptr(rsp, wordSize);
-}
-
-void InterpreterMacroAssembler::pop_d(XMMRegister r) {
-  movdbl(r, Address(rsp, 0));
-  addptr(rsp, 2 * Interpreter::stackElementSize);
-}
-
 void InterpreterMacroAssembler::push_l(Register r) {
   subptr(rsp, 2 * wordSize);
   movq(Address(rsp, 0), r);
 }
 
-void InterpreterMacroAssembler::push_f(XMMRegister r) {
-  subptr(rsp, wordSize);
-  movflt(Address(rsp, 0), r);
-}
-
-void InterpreterMacroAssembler::push_d(XMMRegister r) {
-  subptr(rsp, 2 * wordSize);
-  movdbl(Address(rsp, 0), r);
-}
-
 void InterpreterMacroAssembler::pop(TosState state) {
   switch (state) {
   case atos: pop_ptr();                 break;
@@ -623,8 +623,8 @@
   case stos:
   case itos: pop_i();                   break;
   case ltos: pop_l();                   break;
-  case ftos: pop_f();                   break;
-  case dtos: pop_d();                   break;
+  case ftos: pop_f(xmm0);               break;
+  case dtos: pop_d(xmm0);               break;
   case vtos: /* nothing to do */        break;
   default:   ShouldNotReachHere();
   }
@@ -640,8 +640,8 @@
   case stos:
   case itos: push_i();                  break;
   case ltos: push_l();                  break;
-  case ftos: push_f();                  break;
-  case dtos: push_d();                  break;
+  case ftos: push_f(xmm0);              break;
+  case dtos: push_d(xmm0);              break;
   case vtos: /* nothing to do */        break;
   default  : ShouldNotReachHere();
   }
@@ -675,8 +675,20 @@
     case stos:                                               // fall through
     case itos: pop_i(rax);                                   break;
     case ltos: pop_l(rax, rdx);                              break;
-    case ftos: pop_f();                                      break;
-    case dtos: pop_d();                                      break;
+    case ftos:
+      if (UseSSE >= 1) {
+        pop_f(xmm0);
+      } else {
+        pop_f();
+      }
+      break;
+    case dtos:
+      if (UseSSE >= 2) {
+        pop_d(xmm0);
+      } else {
+        pop_d();
+      }
+      break;
     case vtos: /* nothing to do */                           break;
     default  : ShouldNotReachHere();
   }
@@ -695,7 +707,7 @@
   fstp_s(Address(rsp, 0));
 }
 
-void InterpreterMacroAssembler::push_d(Register r) {
+void InterpreterMacroAssembler::push_d() {
   // Do not schedule for no AGI! Never write beyond rsp!
   subptr(rsp, 2 * wordSize);
   fstp_d(Address(rsp, 0));
@@ -711,8 +723,20 @@
     case stos:                                               // fall through
     case itos: push_i(rax);                                    break;
     case ltos: push_l(rax, rdx);                               break;
-    case ftos: push_f();                                       break;
-    case dtos: push_d(rax);                                    break;
+    case ftos:
+      if (UseSSE >= 1) {
+        push_f(xmm0);
+      } else {
+        push_f();
+      }
+      break;
+    case dtos:
+      if (UseSSE >= 2) {
+        push_d(xmm0);
+      } else {
+        push_d();
+      }
+      break;
     case vtos: /* nothing to do */                             break;
     default  : ShouldNotReachHere();
   }
@@ -995,22 +1019,6 @@
   leave();                           // remove frame anchor
   pop(ret_addr);                     // get return address
   mov(rsp, rbx);                     // set sp to sender sp
-#ifndef _LP64
-  if (UseSSE) {
-    // float and double are returned in xmm register in SSE-mode
-    if (state == ftos && UseSSE >= 1) {
-      subptr(rsp, wordSize);
-      fstp_s(Address(rsp, 0));
-      movflt(xmm0, Address(rsp, 0));
-      addptr(rsp, wordSize);
-    } else if (state == dtos && UseSSE >= 2) {
-      subptr(rsp, 2*wordSize);
-      fstp_d(Address(rsp, 0));
-      movdbl(xmm0, Address(rsp, 0));
-      addptr(rsp, 2*wordSize);
-    }
-  }
-#endif // _LP64
 }
 #endif // !CC_INTERP
 
@@ -1783,7 +1791,10 @@
 
 void InterpreterMacroAssembler::verify_FPU(int stack_depth, TosState state) {
 #ifndef _LP64
-  if (state == ftos || state == dtos) MacroAssembler::verify_FPU(stack_depth);
+  if ((state == ftos && UseSSE < 1) ||
+      (state == dtos && UseSSE < 2)) {
+    MacroAssembler::verify_FPU(stack_depth);
+  }
 #endif
 }
 
--- a/src/cpu/x86/vm/interp_masm_x86.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/x86/vm/interp_masm_x86.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -140,20 +140,20 @@
   void push_ptr(Register r = rax);
   void push_i(Register r = rax);
 
+  void push_f(XMMRegister r);
+  void pop_f(XMMRegister r);
+  void pop_d(XMMRegister r);
+  void push_d(XMMRegister r);
 #ifdef _LP64
   void pop_l(Register r = rax);
-  void pop_f(XMMRegister r = xmm0);
-  void pop_d(XMMRegister r = xmm0);
   void push_l(Register r = rax);
-  void push_f(XMMRegister r = xmm0);
-  void push_d(XMMRegister r = xmm0);
 #else
   void pop_l(Register lo = rax, Register hi = rdx);
   void pop_f();
   void pop_d();
 
   void push_l(Register lo = rax, Register hi = rdx);
-  void push_d(Register r = rax);
+  void push_d();
   void push_f();
 #endif // _LP64
 
--- a/src/cpu/x86/vm/interpreterGenerator_x86.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/x86/vm/interpreterGenerator_x86.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -42,6 +42,12 @@
   address generate_Reference_get_entry();
   address generate_CRC32_update_entry();
   address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+#ifndef _LP64
+  address generate_Float_intBitsToFloat_entry();
+  address generate_Float_floatToRawIntBits_entry();
+  address generate_Double_longBitsToDouble_entry();
+  address generate_Double_doubleToRawLongBits_entry();
+#endif
   void lock_method(void);
   void generate_stack_overflow_check(void);
 
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -3314,6 +3314,42 @@
   fincstp();
 }
 
+void MacroAssembler::load_float(Address src) {
+  if (UseSSE >= 1) {
+    movflt(xmm0, src);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fld_s(src));
+  }
+}
+
+void MacroAssembler::store_float(Address dst) {
+  if (UseSSE >= 1) {
+    movflt(dst, xmm0);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fstp_s(dst));
+  }
+}
+
+void MacroAssembler::load_double(Address src) {
+  if (UseSSE >= 2) {
+    movdbl(xmm0, src);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fld_d(src));
+  }
+}
+
+void MacroAssembler::store_double(Address dst) {
+  if (UseSSE >= 2) {
+    movdbl(dst, xmm0);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fstp_d(dst));
+  }
+}
+
 void MacroAssembler::fremr(Register tmp) {
   save_rax(tmp);
   { Label L;
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -471,6 +471,22 @@
   // Pop ST (ffree & fincstp combined)
   void fpop();
 
+  // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
+  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
+  void load_float(Address src);
+
+  // Store float value to 'address'. If UseSSE >= 1, the value is stored
+  // from register xmm0. Otherwise, the value is stored from the FPU stack.
+  void store_float(Address dst);
+
+  // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
+  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
+  void load_double(Address src);
+
+  // Store double value to 'address'. If UseSSE >= 2, the value is stored
+  // from register xmm0. Otherwise, the value is stored from the FPU stack.
+  void store_double(Address dst);
+
   // pushes double TOS element of FPU stack on CPU stack; pops from FPU stack
   void push_fTOS();
 
--- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -170,22 +170,12 @@
     __ MacroAssembler::verify_FPU(0, "generate_return_entry_for compiled");
   }
 
-  // In SSE mode, interpreter returns FP results in xmm0 but they need
-  // to end up back on the FPU so it can operate on them.
-  if (state == ftos && UseSSE >= 1) {
-    __ subptr(rsp, wordSize);
-    __ movflt(Address(rsp, 0), xmm0);
-    __ fld_s(Address(rsp, 0));
-    __ addptr(rsp, wordSize);
-  } else if (state == dtos && UseSSE >= 2) {
-    __ subptr(rsp, 2*wordSize);
-    __ movdbl(Address(rsp, 0), xmm0);
-    __ fld_d(Address(rsp, 0));
-    __ addptr(rsp, 2*wordSize);
+  if (state == ftos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_return_entry_for in interpreter");
+  } else if (state == dtos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_return_entry_for in interpreter");
   }
 
-  __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_return_entry_for in interpreter");
-
   // Restore stack bottom in case i2c adjusted stack
   __ movptr(rsp, Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize));
   // and NULL it as marker that rsp is now tos until next java call
@@ -217,21 +207,12 @@
 address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state, int step) {
   address entry = __ pc();
 
-  // In SSE mode, FP results are in xmm0
-  if (state == ftos && UseSSE > 0) {
-    __ subptr(rsp, wordSize);
-    __ movflt(Address(rsp, 0), xmm0);
-    __ fld_s(Address(rsp, 0));
-    __ addptr(rsp, wordSize);
-  } else if (state == dtos && UseSSE >= 2) {
-    __ subptr(rsp, 2*wordSize);
-    __ movdbl(Address(rsp, 0), xmm0);
-    __ fld_d(Address(rsp, 0));
-    __ addptr(rsp, 2*wordSize);
+  if (state == ftos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_deopt_entry_for in interpreter");
+  } else if (state == dtos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_deopt_entry_for in interpreter");
   }
 
-  __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_deopt_entry_for in interpreter");
-
   // The stack is not extended by deopt but we must NULL last_sp as this
   // entry is like a "return".
   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
@@ -735,7 +716,7 @@
   if (UseCRC32Intrinsics) {
     address entry = __ pc();
 
-    // rbx,: Method*
+    // rbx: Method*
     // rsi: senderSP must preserved for slow path, set SP to it on fast path
     // rdx: scratch
     // rdi: scratch
@@ -841,6 +822,124 @@
   return generate_native_entry(false);
 }
 
+/**
+ * Method entry for static native method:
+ *    java.lang.Float.intBitsToFloat(int bits)
+ */
+address InterpreterGenerator::generate_Float_intBitsToFloat_entry() {
+  address entry;
+
+  if (UseSSE >= 1) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load 'bits' into xmm0 (interpreter returns results in xmm0)
+    __ movflt(xmm0, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
+/**
+ * Method entry for static native method:
+ *    java.lang.Float.floatToRawIntBits(float value)
+ */
+address InterpreterGenerator::generate_Float_floatToRawIntBits_entry() {
+  address entry;
+
+  if (UseSSE >= 1) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load the parameter (a floating-point value) into rax.
+    __ movl(rax, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
+
+/**
+ * Method entry for static native method:
+ *    java.lang.Double.longBitsToDouble(long bits)
+ */
+address InterpreterGenerator::generate_Double_longBitsToDouble_entry() {
+  address entry;
+
+   if (UseSSE >= 2) {
+     entry = __ pc();
+
+     // rsi: the sender's SP
+
+     // Skip safepoint check (compiler intrinsic versions of this method
+     // do not perform safepoint checks either).
+
+     // Load 'bits' into xmm0 (interpreter returns results in xmm0)
+     __ movdbl(xmm0, Address(rsp, wordSize));
+
+     // Return
+     __ pop(rdi); // get return address
+     __ mov(rsp, rsi); // set rsp to the sender's SP
+     __ jmp(rdi);
+   } else {
+     entry = generate_native_entry(false);
+   }
+
+   return entry;
+}
+
+/**
+ * Method entry for static native method:
+ *    java.lang.Double.doubleToRawLongBits(double value)
+ */
+address InterpreterGenerator::generate_Double_doubleToRawLongBits_entry() {
+  address entry;
+
+  if (UseSSE >= 2) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load the parameter (a floating-point value) into rax.
+    __ movl(rdx, Address(rsp, 2*wordSize));
+    __ movl(rax, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
 //
 // Interpreter stub for calling a native method. (asm interpreter)
 // This sets up a somewhat different looking stack for calling the native method
@@ -1090,7 +1189,7 @@
               double_handler.addr());
     __ jcc(Assembler::notEqual, L);
     __ bind(push_double);
-    __ push(dtos);
+    __ push_d(); // FP values are returned using the FPU, so push FPU contents (even if UseSSE > 0).
     __ bind(L);
   }
   __ push(ltos);
--- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -1707,10 +1707,10 @@
                                                          address& vep) {
   assert(t->is_valid() && t->tos_in() == vtos, "illegal template");
   Label L;
-  aep = __ pc();  __ push_ptr();  __ jmp(L);
-  fep = __ pc();  __ push_f();    __ jmp(L);
-  dep = __ pc();  __ push_d();    __ jmp(L);
-  lep = __ pc();  __ push_l();    __ jmp(L);
+  aep = __ pc();  __ push_ptr();   __ jmp(L);
+  fep = __ pc();  __ push_f(xmm0); __ jmp(L);
+  dep = __ pc();  __ push_d(xmm0); __ jmp(L);
+  lep = __ pc();  __ push_l();     __ jmp(L);
   bep = cep = sep =
   iep = __ pc();  __ push_i();
   vep = __ pc();
--- a/src/cpu/x86/vm/templateTable_x86.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/cpu/x86/vm/templateTable_x86.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -349,53 +349,60 @@
 
 void TemplateTable::fconst(int value) {
   transition(vtos, ftos);
+  if (UseSSE >= 1) {
+    static float one = 1.0f, two = 2.0f;
+    switch (value) {
+    case 0:
+      __ xorps(xmm0, xmm0);
+      break;
+    case 1:
+      __ movflt(xmm0, ExternalAddress((address) &one));
+      break;
+    case 2:
+      __ movflt(xmm0, ExternalAddress((address) &two));
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
 #ifdef _LP64
-  static float one = 1.0f, two = 2.0f;
-  switch (value) {
-  case 0:
-    __ xorps(xmm0, xmm0);
-    break;
-  case 1:
-    __ movflt(xmm0, ExternalAddress((address) &one));
-    break;
-  case 2:
-    __ movflt(xmm0, ExternalAddress((address) &two));
-    break;
-  default:
     ShouldNotReachHere();
-    break;
+#else
+           if (value == 0) { __ fldz();
+    } else if (value == 1) { __ fld1();
+    } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a better solution here
+    } else                 { ShouldNotReachHere();
+    }
+#endif // _LP64
   }
-#else
-         if (value == 0) { __ fldz();
-  } else if (value == 1) { __ fld1();
-  } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a better solution here
-  } else                 { ShouldNotReachHere();
-  }
-#endif
 }
 
 void TemplateTable::dconst(int value) {
   transition(vtos, dtos);
+  if (UseSSE >= 2) {
+    static double one = 1.0;
+    switch (value) {
+    case 0:
+      __ xorpd(xmm0, xmm0);
+      break;
+    case 1:
+      __ movdbl(xmm0, ExternalAddress((address) &one));
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
 #ifdef _LP64
-  static double one = 1.0;
-  switch (value) {
-  case 0:
-    __ xorpd(xmm0, xmm0);
-    break;
-  case 1:
-    __ movdbl(xmm0, ExternalAddress((address) &one));
-    break;
-  default:
     ShouldNotReachHere();
-    break;
+#else
+           if (value == 0) { __ fldz();
+    } else if (value == 1) { __ fld1();
+    } else                 { ShouldNotReachHere();
+    }
+#endif
   }
-
-#else
-         if (value == 0) { __ fldz();
-  } else if (value == 1) { __ fld1();
-  } else                 { ShouldNotReachHere();
-  }
-#endif
 }
 
 void TemplateTable::bipush() {
@@ -454,8 +461,7 @@
   __ jccb(Assembler::notEqual, notFloat);
 
   // ftos
-  LP64_ONLY(__ movflt(xmm0, Address(rcx, rbx, Address::times_8, base_offset)));
-  NOT_LP64(__ fld_s(    Address(rcx, rbx, Address::times_ptr, base_offset)));
+  __ load_float(Address(rcx, rbx, Address::times_ptr, base_offset));
   __ push(ftos);
   __ jmp(Done);
 
@@ -522,8 +528,7 @@
   __ jccb(Assembler::notEqual, Long);
 
   // dtos
-  LP64_ONLY(__ movdbl(xmm0, Address(rcx, rbx, Address::times_8, base_offset)));
-  NOT_LP64(__ fld_d(    Address(rcx, rbx, Address::times_ptr, base_offset)));
+  __ load_double(Address(rcx, rbx, Address::times_ptr, base_offset));
   __ push(dtos);
 
   __ jmpb(Done);
@@ -617,15 +622,13 @@
 void TemplateTable::fload() {
   transition(vtos, ftos);
   locals_index(rbx);
-  LP64_ONLY(__ movflt(xmm0, faddress(rbx)));
-  NOT_LP64(__ fld_s(faddress(rbx)));
+  __ load_float(faddress(rbx));
 }
 
 void TemplateTable::dload() {
   transition(vtos, dtos);
   locals_index(rbx);
-  LP64_ONLY(__ movdbl(xmm0, daddress(rbx)));
-  NOT_LP64(__ fld_d(daddress(rbx)));
+  __ load_double(daddress(rbx));
 }
 
 void TemplateTable::aload() {
@@ -657,15 +660,13 @@
 void TemplateTable::wide_fload() {
   transition(vtos, ftos);
   locals_index_wide(rbx);
-  LP64_ONLY(__ movflt(xmm0, faddress(rbx)));
-  NOT_LP64(__ fld_s(faddress(rbx)));
+  __ load_float(faddress(rbx));
 }
 
 void TemplateTable::wide_dload() {
   transition(vtos, dtos);
   locals_index_wide(rbx);
-  LP64_ONLY(__ movdbl(xmm0, daddress(rbx)));
-  NOT_LP64(__ fld_d(daddress(rbx)));
+  __ load_double(daddress(rbx));
 }
 
 void TemplateTable::wide_aload() {
@@ -726,10 +727,9 @@
   // rax: index
   // rdx: array
   index_check(rdx, rax); // kills rbx
-  LP64_ONLY(__ movflt(xmm0, Address(rdx, rax,
-                         Address::times_4,
-                         arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
-  NOT_LP64(__ fld_s(Address(rdx, rax, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
+  __ load_float(Address(rdx, rax,
+                        Address::times_4,
+                        arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
 }
 
 void TemplateTable::daload() {
@@ -737,10 +737,9 @@
   // rax: index
   // rdx: array
   index_check(rdx, rax); // kills rbx
-  LP64_ONLY(__ movdbl(xmm0, Address(rdx, rax,
-                          Address::times_8,
-                          arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
-  NOT_LP64(__ fld_d(Address(rdx, rax, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
+  __ load_double(Address(rdx, rax,
+                         Address::times_8,
+                         arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
 }
 
 void TemplateTable::aaload() {
@@ -807,14 +806,12 @@
 
 void TemplateTable::fload(int n) {
   transition(vtos, ftos);
-  LP64_ONLY(__ movflt(xmm0, faddress(n)));
-  NOT_LP64(__ fld_s(faddress(n)));
+  __ load_float(faddress(n));
 }
 
 void TemplateTable::dload(int n) {
   transition(vtos, dtos);
-  LP64_ONLY(__ movdbl(xmm0, daddress(n)));
-  NOT_LP64(__ fld_d(daddress(n)));
+  __ load_double(daddress(n));
 }
 
 void TemplateTable::aload(int n) {
@@ -919,15 +916,13 @@
 void TemplateTable::fstore() {
   transition(ftos, vtos);
   locals_index(rbx);
-  LP64_ONLY(__ movflt(faddress(rbx), xmm0));
-  NOT_LP64(__ fstp_s(faddress(rbx)));
+  __ store_float(faddress(rbx));
 }
 
 void TemplateTable::dstore() {
   transition(dtos, vtos);
   locals_index(rbx);
-  LP64_ONLY(__ movdbl(daddress(rbx), xmm0));
-  NOT_LP64(__ fstp_d(daddress(rbx)));
+  __ store_double(daddress(rbx));
 }
 
 void TemplateTable::astore() {
@@ -956,7 +951,7 @@
 void TemplateTable::wide_fstore() {
 #ifdef _LP64
   transition(vtos, vtos);
-  __ pop_f();
+  __ pop_f(xmm0);
   locals_index_wide(rbx);
   __ movflt(faddress(rbx), xmm0);
 #else
@@ -967,7 +962,7 @@
 void TemplateTable::wide_dstore() {
 #ifdef _LP64
   transition(vtos, vtos);
-  __ pop_d();
+  __ pop_d(xmm0);
   locals_index_wide(rbx);
   __ movdbl(daddress(rbx), xmm0);
 #else
@@ -1011,29 +1006,21 @@
 void TemplateTable::fastore() {
   transition(ftos, vtos);
   __ pop_i(rbx);
-  // xmm0: value
+  // value is in UseSSE >= 1 ? xmm0 : ST(0)
   // rbx:  index
   // rdx:  array
   index_check(rdx, rbx); // prefer index in rbx
-  LP64_ONLY(__ movflt(Address(rdx, rbx,
-                   Address::times_4,
-                   arrayOopDesc::base_offset_in_bytes(T_FLOAT)),
-           xmm0));
-  NOT_LP64(__ fstp_s(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
+  __ store_float(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
 }
 
 void TemplateTable::dastore() {
   transition(dtos, vtos);
   __ pop_i(rbx);
-  // xmm0: value
+  // value is in UseSSE >= 2 ? xmm0 : ST(0)
   // rbx:  index
   // rdx:  array
   index_check(rdx, rbx); // prefer index in rbx
-  LP64_ONLY(__ movdbl(Address(rdx, rbx,
-                   Address::times_8,
-                   arrayOopDesc::base_offset_in_bytes(T_DOUBLE)),
-           xmm0));
-  NOT_LP64(__ fstp_d(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
+  __ store_double(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
 }
 
 void TemplateTable::aastore() {
@@ -1134,14 +1121,12 @@
 
 void TemplateTable::fstore(int n) {
   transition(ftos, vtos);
-  LP64_ONLY(__ movflt(faddress(n), xmm0));
-  NOT_LP64(__ fstp_s(faddress(n)));
+  __ store_float(faddress(n));
 }
 
 void TemplateTable::dstore(int n) {
   transition(dtos, vtos);
-  LP64_ONLY(__ movdbl(daddress(n), xmm0));
-  NOT_LP64(__ fstp_d(daddress(n)));
+  __ store_double(daddress(n));
 }
 
 
@@ -1425,82 +1410,127 @@
 
 void TemplateTable::fop2(Operation op) {
   transition(ftos, ftos);
+
+  if (UseSSE >= 1) {
+    switch (op) {
+    case add:
+      __ addss(xmm0, at_rsp());
+      __ addptr(rsp, Interpreter::stackElementSize);
+      break;
+    case sub:
+      __ movflt(xmm1, xmm0);
+      __ pop_f(xmm0);
+      __ subss(xmm0, xmm1);
+      break;
+    case mul:
+      __ mulss(xmm0, at_rsp());
+      __ addptr(rsp, Interpreter::stackElementSize);
+      break;
+    case div:
+      __ movflt(xmm1, xmm0);
+      __ pop_f(xmm0);
+      __ divss(xmm0, xmm1);
+      break;
+    case rem:
+      // On x86_64 platforms the SharedRuntime::frem method is called to perform the
+      // modulo operation. The frem method calls the function
+      // double fmod(double x, double y) in math.h. The documentation of fmod states:
+      // "If x or y is a NaN, a NaN is returned." without specifying what type of NaN
+      // (signalling or quiet) is returned.
+      //
+      // On x86_32 platforms the FPU is used to perform the modulo operation. The
+      // reason is that on 32-bit Windows the sign of modulo operations diverges from
+      // what is considered the standard (e.g., -0.0f % -3.14f is 0.0f (and not -0.0f).
+      // The fprem instruction used on x86_32 is functionally equivalent to
+      // SharedRuntime::frem in that it returns a NaN.
 #ifdef _LP64
-  switch (op) {
-  case add:
-    __ addss(xmm0, at_rsp());
-    __ addptr(rsp, Interpreter::stackElementSize);
-    break;
-  case sub:
-    __ movflt(xmm1, xmm0);
-    __ pop_f(xmm0);
-    __ subss(xmm0, xmm1);
-    break;
-  case mul:
-    __ mulss(xmm0, at_rsp());
-    __ addptr(rsp, Interpreter::stackElementSize);
-    break;
-  case div:
-    __ movflt(xmm1, xmm0);
-    __ pop_f(xmm0);
-    __ divss(xmm0, xmm1);
-    break;
-  case rem:
-    __ movflt(xmm1, xmm0);
-    __ pop_f(xmm0);
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2);
-    break;
-  default:
+      __ movflt(xmm1, xmm0);
+      __ pop_f(xmm0);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2);
+#else
+      __ push_f(xmm0);
+      __ pop_f();
+      __ fld_s(at_rsp());
+      __ fremr(rax);
+      __ f2ieee();
+      __ pop(rax);  // pop second operand off the stack
+      __ push_f();
+      __ pop_f(xmm0);
+#endif
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
+#ifdef _LP64
     ShouldNotReachHere();
-    break;
-  }
 #else
-  switch (op) {
+    switch (op) {
     case add: __ fadd_s (at_rsp());                break;
     case sub: __ fsubr_s(at_rsp());                break;
     case mul: __ fmul_s (at_rsp());                break;
     case div: __ fdivr_s(at_rsp());                break;
     case rem: __ fld_s  (at_rsp()); __ fremr(rax); break;
     default : ShouldNotReachHere();
+    }
+    __ f2ieee();
+    __ pop(rax);  // pop second operand off the stack
+#endif // _LP64
   }
-  __ f2ieee();
-  __ pop(rax);  // pop float thing off
-#endif
 }
 
 void TemplateTable::dop2(Operation op) {
   transition(dtos, dtos);
+  if (UseSSE >= 2) {
+    switch (op) {
+    case add:
+      __ addsd(xmm0, at_rsp());
+      __ addptr(rsp, 2 * Interpreter::stackElementSize);
+      break;
+    case sub:
+      __ movdbl(xmm1, xmm0);
+      __ pop_d(xmm0);
+      __ subsd(xmm0, xmm1);
+      break;
+    case mul:
+      __ mulsd(xmm0, at_rsp());
+      __ addptr(rsp, 2 * Interpreter::stackElementSize);
+      break;
+    case div:
+      __ movdbl(xmm1, xmm0);
+      __ pop_d(xmm0);
+      __ divsd(xmm0, xmm1);
+      break;
+    case rem:
+      // Similar to fop2(), the modulo operation is performed using the
+      // SharedRuntime::drem method (on x86_64 platforms) or using the
+      // FPU (on x86_32 platforms) for the same reasons as mentioned in fop2().
 #ifdef _LP64
-  switch (op) {
-  case add:
-    __ addsd(xmm0, at_rsp());
-    __ addptr(rsp, 2 * Interpreter::stackElementSize);
-    break;
-  case sub:
-    __ movdbl(xmm1, xmm0);
-    __ pop_d(xmm0);
-    __ subsd(xmm0, xmm1);
-    break;
-  case mul:
-    __ mulsd(xmm0, at_rsp());
-    __ addptr(rsp, 2 * Interpreter::stackElementSize);
-    break;
-  case div:
-    __ movdbl(xmm1, xmm0);
-    __ pop_d(xmm0);
-    __ divsd(xmm0, xmm1);
-    break;
-  case rem:
-    __ movdbl(xmm1, xmm0);
-    __ pop_d(xmm0);
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2);
-    break;
-  default:
+      __ movdbl(xmm1, xmm0);
+      __ pop_d(xmm0);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2);
+#else
+      __ push_d(xmm0);
+      __ pop_d();
+      __ fld_d(at_rsp());
+      __ fremr(rax);
+      __ d2ieee();
+      __ pop(rax);
+      __ pop(rdx);
+      __ push_d();
+      __ pop_d(xmm0);
+#endif
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
+#ifdef _LP64
     ShouldNotReachHere();
-    break;
-  }
 #else
-  switch (op) {
+    switch (op) {
     case add: __ fadd_d (at_rsp());                break;
     case sub: __ fsubr_d(at_rsp());                break;
     case mul: {
@@ -1543,12 +1573,13 @@
     }
     case rem: __ fld_d  (at_rsp()); __ fremr(rax); break;
     default : ShouldNotReachHere();
+    }
+    __ d2ieee();
+    // Pop double precision number from rsp.
+    __ pop(rax);
+    __ pop(rdx);
+#endif
   }
-  __ d2ieee();
-  // Pop double precision number from rsp.
-  __ pop(rax);
-  __ pop(rdx);
-#endif
 }
 
 void TemplateTable::ineg() {
@@ -1562,7 +1593,6 @@
   NOT_LP64(__ lneg(rdx, rax));
 }
 
-#ifdef _LP64
 // Note: 'double' and 'long long' have 32-bits alignment on x86.
 static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) {
   // Use the expression (adr)&(~0xF) to provide 128-bits aligned address
@@ -1577,26 +1607,30 @@
 // Buffer for 128-bits masks used by SSE instructions.
 static jlong float_signflip_pool[2*2];
 static jlong double_signflip_pool[2*2];
-#endif
 
 void TemplateTable::fneg() {
   transition(ftos, ftos);
-#ifdef _LP64
-  static jlong *float_signflip  = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000);
-  __ xorps(xmm0, ExternalAddress((address) float_signflip));
-#else
-  __ fchs();
-#endif
+  if (UseSSE >= 1) {
+    static jlong *float_signflip  = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000);
+    __ xorps(xmm0, ExternalAddress((address) float_signflip));
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(__ fchs());
+  }
 }
 
 void TemplateTable::dneg() {
   transition(dtos, dtos);
+  if (UseSSE >= 2) {
+    static jlong *double_signflip  = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000);
+    __ xorpd(xmm0, ExternalAddress((address) double_signflip));
+  } else {
 #ifdef _LP64
-  static jlong *double_signflip  = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000);
-  __ xorpd(xmm0, ExternalAddress((address) double_signflip));
+    ShouldNotReachHere();
 #else
-  __ fchs();
+    __ fchs();
 #endif
+  }
 }
 
 void TemplateTable::iinc() {
@@ -1798,18 +1832,26 @@
       __ extend_sign(rdx, rax);
       break;
     case Bytecodes::_i2f:
-      __ push(rax);          // store int on tos
-      __ fild_s(at_rsp());   // load int to ST0
-      __ f2ieee();           // truncate to float size
-      __ pop(rcx);           // adjust rsp
+      if (UseSSE >= 1) {
+        __ cvtsi2ssl(xmm0, rax);
+      } else {
+        __ push(rax);          // store int on tos
+        __ fild_s(at_rsp());   // load int to ST0
+        __ f2ieee();           // truncate to float size
+        __ pop(rcx);           // adjust rsp
+      }
       break;
     case Bytecodes::_i2d:
+      if (UseSSE >= 2) {
+        __ cvtsi2sdl(xmm0, rax);
+      } else {
       __ push(rax);          // add one slot for d2ieee()
       __ push(rax);          // store int on tos
       __ fild_s(at_rsp());   // load int to ST0
       __ d2ieee();           // truncate to double size
       __ pop(rcx);           // adjust rsp
       __ pop(rcx);
+      }
       break;
     case Bytecodes::_i2b:
       __ shll(rax, 24);      // truncate upper 24 bits
@@ -1829,50 +1871,102 @@
       /* nothing to do */
       break;
     case Bytecodes::_l2f:
+      // On 64-bit platforms, the cvtsi2ssq instruction is used to convert
+      // 64-bit long values to floats. On 32-bit platforms it is not possible
+      // to use that instruction with 64-bit operands, therefore the FPU is
+      // used to perform the conversion.
       __ push(rdx);          // store long on tos
       __ push(rax);
       __ fild_d(at_rsp());   // load long to ST0
       __ f2ieee();           // truncate to float size
       __ pop(rcx);           // adjust rsp
       __ pop(rcx);
+      if (UseSSE >= 1) {
+        __ push_f();
+        __ pop_f(xmm0);
+      }
       break;
     case Bytecodes::_l2d:
+      // On 32-bit platforms the FPU is used for conversion because on
+      // 32-bit platforms it is not not possible to use the cvtsi2sdq
+      // instruction with 64-bit operands.
       __ push(rdx);          // store long on tos
       __ push(rax);
       __ fild_d(at_rsp());   // load long to ST0
       __ d2ieee();           // truncate to double size
       __ pop(rcx);           // adjust rsp
       __ pop(rcx);
+      if (UseSSE >= 2) {
+        __ push_d();
+        __ pop_d(xmm0);
+      }
       break;
     case Bytecodes::_f2i:
-      __ push(rcx);          // reserve space for argument
-      __ fstp_s(at_rsp());   // pass float argument on stack
+      // SharedRuntime::f2i does not differentiate between sNaNs and qNaNs
+      // as it returns 0 for any NaN.
+      if (UseSSE >= 1) {
+        __ push_f(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ fstp_s(at_rsp());   // pass float argument on stack
+      }
       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2i), 1);
       break;
     case Bytecodes::_f2l:
-      __ push(rcx);          // reserve space for argument
-      __ fstp_s(at_rsp());   // pass float argument on stack
+      // SharedRuntime::f2l does not differentiate between sNaNs and qNaNs
+      // as it returns 0 for any NaN.
+      if (UseSSE >= 1) {
+       __ push_f(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ fstp_s(at_rsp());   // pass float argument on stack
+      }
       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2l), 1);
       break;
     case Bytecodes::_f2d:
-      /* nothing to do */
+      if (UseSSE < 1) {
+        /* nothing to do */
+      } else if (UseSSE == 1) {
+        __ push_f(xmm0);
+        __ pop_f();
+      } else { // UseSSE >= 2
+        __ cvtss2sd(xmm0, xmm0);
+      }
       break;
     case Bytecodes::_d2i:
-      __ push(rcx);          // reserve space for argument
-      __ push(rcx);
-      __ fstp_d(at_rsp());   // pass double argument on stack
+      if (UseSSE >= 2) {
+        __ push_d(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ push(rcx);
+        __ fstp_d(at_rsp());   // pass double argument on stack
+      }
       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2i), 2);
       break;
     case Bytecodes::_d2l:
-      __ push(rcx);          // reserve space for argument
-      __ push(rcx);
-      __ fstp_d(at_rsp());   // pass double argument on stack
+      if (UseSSE >= 2) {
+        __ push_d(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ push(rcx);
+        __ fstp_d(at_rsp());   // pass double argument on stack
+      }
       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2l), 2);
       break;
     case Bytecodes::_d2f:
-      __ push(rcx);          // reserve space for f2ieee()
-      __ f2ieee();           // truncate to float size
-      __ pop(rcx);           // adjust rsp
+      if (UseSSE <= 1) {
+        __ push(rcx);          // reserve space for f2ieee()
+        __ f2ieee();           // truncate to float size
+        __ pop(rcx);           // adjust rsp
+        if (UseSSE == 1) {
+          // The cvtsd2ss instruction is not available if UseSSE==1, therefore
+          // the conversion is performed using the FPU in this case.
+          __ push_f();
+          __ pop_f(xmm0);
+        }
+      } else { // UseSSE >= 2
+        __ cvtsd2ss(xmm0, xmm0);
+      }
       break;
     default             :
       ShouldNotReachHere();
@@ -1901,42 +1995,47 @@
 }
 
 void TemplateTable::float_cmp(bool is_float, int unordered_result) {
+  if ((is_float && UseSSE >= 1) ||
+      (!is_float && UseSSE >= 2)) {
+    Label done;
+    if (is_float) {
+      // XXX get rid of pop here, use ... reg, mem32
+      __ pop_f(xmm1);
+      __ ucomiss(xmm1, xmm0);
+    } else {
+      // XXX get rid of pop here, use ... reg, mem64
+      __ pop_d(xmm1);
+      __ ucomisd(xmm1, xmm0);
+    }
+    if (unordered_result < 0) {
+      __ movl(rax, -1);
+      __ jccb(Assembler::parity, done);
+      __ jccb(Assembler::below, done);
+      __ setb(Assembler::notEqual, rdx);
+      __ movzbl(rax, rdx);
+    } else {
+      __ movl(rax, 1);
+      __ jccb(Assembler::parity, done);
+      __ jccb(Assembler::above, done);
+      __ movl(rax, 0);
+      __ jccb(Assembler::equal, done);
+      __ decrementl(rax);
+    }
+    __ bind(done);
+  } else {
 #ifdef _LP64
-  Label done;
-  if (is_float) {
-    // XXX get rid of pop here, use ... reg, mem32
-    __ pop_f(xmm1);
-    __ ucomiss(xmm1, xmm0);
-  } else {
-    // XXX get rid of pop here, use ... reg, mem64
-    __ pop_d(xmm1);
-    __ ucomisd(xmm1, xmm0);
+    ShouldNotReachHere();
+#else
+    if (is_float) {
+      __ fld_s(at_rsp());
+    } else {
+      __ fld_d(at_rsp());
+      __ pop(rdx);
+    }
+    __ pop(rcx);
+    __ fcmp2int(rax, unordered_result < 0);
+#endif // _LP64
   }
-  if (unordered_result < 0) {
-    __ movl(rax, -1);
-    __ jccb(Assembler::parity, done);
-    __ jccb(Assembler::below, done);
-    __ setb(Assembler::notEqual, rdx);
-    __ movzbl(rax, rdx);
-  } else {
-    __ movl(rax, 1);
-    __ jccb(Assembler::parity, done);
-    __ jccb(Assembler::above, done);
-    __ movl(rax, 0);
-    __ jccb(Assembler::equal, done);
-    __ decrementl(rax);
-  }
-  __ bind(done);
-#else
-  if (is_float) {
-    __ fld_s(at_rsp());
-  } else {
-    __ fld_d(at_rsp());
-    __ pop(rdx);
-  }
-  __ pop(rcx);
-  __ fcmp2int(rax, unordered_result < 0);
-#endif
 }
 
 void TemplateTable::branch(bool is_jsr, bool is_wide) {
@@ -2748,8 +2847,7 @@
   __ jcc(Assembler::notEqual, notFloat);
   // ftos
 
-  LP64_ONLY(__ movflt(xmm0, field));
-  NOT_LP64(__ fld_s(field));
+  __ load_float(field);
   __ push(ftos);
   // Rewrite bytecode to be faster
   if (!is_static && rc == may_rewrite) {
@@ -2763,8 +2861,7 @@
   __ jcc(Assembler::notEqual, notDouble);
 #endif
   // dtos
-  LP64_ONLY(__ movdbl(xmm0, field));
-  NOT_LP64(__ fld_d(field));
+  __ load_double(field);
   __ push(dtos);
   // Rewrite bytecode to be faster
   if (!is_static && rc == may_rewrite) {
@@ -3046,8 +3143,7 @@
   {
     __ pop(ftos);
     if (!is_static) pop_and_check_object(obj);
-    NOT_LP64( __ fstp_s(field);)
-    LP64_ONLY( __ movflt(field, xmm0);)
+    __ store_float(field);
     if (!is_static && rc == may_rewrite) {
       patch_bytecode(Bytecodes::_fast_fputfield, bc, rbx, true, byte_no);
     }
@@ -3064,8 +3160,7 @@
   {
     __ pop(dtos);
     if (!is_static) pop_and_check_object(obj);
-    NOT_LP64( __ fstp_d(field);)
-    LP64_ONLY( __ movdbl(field, xmm0);)
+    __ store_double(field);
     if (!is_static && rc == may_rewrite) {
       patch_bytecode(Bytecodes::_fast_dputfield, bc, rbx, true, byte_no);
     }
@@ -3123,8 +3218,8 @@
     case Bytecodes::_fast_sputfield: // fall through
     case Bytecodes::_fast_cputfield: // fall through
     case Bytecodes::_fast_iputfield: __ push_i(rax); break;
-    case Bytecodes::_fast_dputfield: __ push_d(); break;
-    case Bytecodes::_fast_fputfield: __ push_f(); break;
+    case Bytecodes::_fast_dputfield: __ push(dtos); break;
+    case Bytecodes::_fast_fputfield: __ push(ftos); break;
     case Bytecodes::_fast_lputfield: __ push_l(rax); break;
 
     default:
@@ -3147,8 +3242,8 @@
     case Bytecodes::_fast_sputfield: // fall through
     case Bytecodes::_fast_cputfield: // fall through
     case Bytecodes::_fast_iputfield: __ pop_i(rax); break;
-    case Bytecodes::_fast_dputfield: __ pop_d(); break;
-    case Bytecodes::_fast_fputfield: __ pop_f(); break;
+    case Bytecodes::_fast_dputfield: __ pop(dtos); break;
+    case Bytecodes::_fast_fputfield: __ pop(ftos); break;
     case Bytecodes::_fast_lputfield: __ pop_l(rax); break;
     }
     __ bind(L2);
@@ -3212,12 +3307,10 @@
     __ movw(field, rax);
     break;
   case Bytecodes::_fast_fputfield:
-    NOT_LP64( __ fstp_s(field); )
-    LP64_ONLY( __ movflt(field, xmm0);)
+    __ store_float(field);
     break;
   case Bytecodes::_fast_dputfield:
-    NOT_LP64( __ fstp_d(field); )
-    LP64_ONLY( __ movdbl(field, xmm0);)
+    __ store_double(field);
     break;
   default:
     ShouldNotReachHere();
@@ -3302,12 +3395,10 @@
     __ load_unsigned_short(rax, field);
     break;
   case Bytecodes::_fast_fgetfield:
-    LP64_ONLY(__ movflt(xmm0, field));
-    NOT_LP64(__ fld_s(field));
+    __ load_float(field);
     break;
   case Bytecodes::_fast_dgetfield:
-    LP64_ONLY(__ movdbl(xmm0, field));
-    NOT_LP64(__ fld_d(field));
+    __ load_double(field);
     break;
   default:
     ShouldNotReachHere();
@@ -3347,8 +3438,7 @@
     __ verify_oop(rax);
     break;
   case ftos:
-    LP64_ONLY(__ movflt(xmm0, field));
-    NOT_LP64(__ fld_s(field));
+    __ load_float(field);
     break;
   default:
     ShouldNotReachHere();
--- a/src/share/vm/compiler/compileBroker.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/compiler/compileBroker.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -1399,6 +1399,28 @@
   // do the compilation
   if (method->is_native()) {
     if (!PreferInterpreterNativeStubs || method->is_method_handle_intrinsic()) {
+      // The following native methods:
+      //
+      // java.lang.Float.intBitsToFloat
+      // java.lang.Float.floatToRawIntBits
+      // java.lang.Double.longBitsToDouble
+      // java.lang.Double.doubleToRawLongBits
+      //
+      // are called through the interpreter even if interpreter native stubs
+      // are not preferred (i.e., calling through adapter handlers is preferred).
+      // The reason is that on x86_32 signaling NaNs (sNaNs) are not preserved
+      // if the version of the methods from the native libraries is called.
+      // As the interpreter and the C2-intrinsified version of the methods preserves
+      // sNaNs, that would result in an inconsistent way of handling of sNaNs.
+      if ((UseSSE >= 1 &&
+          (method->intrinsic_id() == vmIntrinsics::_intBitsToFloat ||
+           method->intrinsic_id() == vmIntrinsics::_floatToRawIntBits)) ||
+          (UseSSE >= 2 &&
+           (method->intrinsic_id() == vmIntrinsics::_longBitsToDouble ||
+            method->intrinsic_id() == vmIntrinsics::_doubleToRawLongBits))) {
+        return NULL;
+      }
+
       // To properly handle the appendix argument for out-of-line calls we are using a small trampoline that
       // pops off the appendix argument and jumps to the target (see gen_special_dispatch in SharedRuntime).
       //
--- a/src/share/vm/interpreter/abstractInterpreter.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/interpreter/abstractInterpreter.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -90,6 +90,10 @@
     java_util_zip_CRC32_update,                                 // implementation of java.util.zip.CRC32.update()
     java_util_zip_CRC32_updateBytes,                            // implementation of java.util.zip.CRC32.updateBytes()
     java_util_zip_CRC32_updateByteBuffer,                       // implementation of java.util.zip.CRC32.updateByteBuffer()
+    java_lang_Float_intBitsToFloat,                             // implementation of java.lang.Float.intBitsToFloat()
+    java_lang_Float_floatToRawIntBits,                          // implementation of java.lang.Float.floatToRawIntBits()
+    java_lang_Double_longBitsToDouble,                          // implementation of java.lang.Double.longBitsToDouble()
+    java_lang_Double_doubleToRawLongBits,                       // implementation of java.lang.Double.doubleToRawLongBits()
     number_of_method_entries,
     invalid = -1
   };
--- a/src/share/vm/interpreter/interpreter.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/interpreter/interpreter.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -234,7 +234,15 @@
       case vmIntrinsics::_updateByteBufferCRC32  : return java_util_zip_CRC32_updateByteBuffer;
     }
   }
-#endif
+
+  switch(m->intrinsic_id()) {
+  case vmIntrinsics::_intBitsToFloat:      return java_lang_Float_intBitsToFloat;
+  case vmIntrinsics::_floatToRawIntBits:   return java_lang_Float_floatToRawIntBits;
+  case vmIntrinsics::_longBitsToDouble:    return java_lang_Double_longBitsToDouble;
+  case vmIntrinsics::_doubleToRawLongBits: return java_lang_Double_doubleToRawLongBits;
+  }
+
+#endif // CC_INTERP
 
   // Native method?
   // Note: This test must come _before_ the test for intrinsic
@@ -559,6 +567,25 @@
                                            : // fall thru
   case Interpreter::java_util_zip_CRC32_updateByteBuffer
                                            : entry_point = generate_CRC32_updateBytes_entry(kind); break;
+#if defined(TARGET_ARCH_x86) && !defined(_LP64)
+  // On x86_32 platforms, a special entry is generated for the following four methods.
+  // On other platforms the normal entry is used to enter these methods.
+  case Interpreter::java_lang_Float_intBitsToFloat
+                                           : entry_point = generate_Float_intBitsToFloat_entry(); break;
+  case Interpreter::java_lang_Float_floatToRawIntBits
+                                           : entry_point = generate_Float_floatToRawIntBits_entry(); break;
+  case Interpreter::java_lang_Double_longBitsToDouble
+                                           : entry_point = generate_Double_longBitsToDouble_entry(); break;
+  case Interpreter::java_lang_Double_doubleToRawLongBits
+                                           : entry_point = generate_Double_doubleToRawLongBits_entry(); break;
+#else
+  case Interpreter::java_lang_Float_intBitsToFloat:
+  case Interpreter::java_lang_Float_floatToRawIntBits:
+  case Interpreter::java_lang_Double_longBitsToDouble:
+  case Interpreter::java_lang_Double_doubleToRawLongBits:
+    entry_point = generate_native_entry(false);
+    break;
+#endif // defined(TARGET_ARCH_x86) && !defined(_LP64)
 #endif // CC_INTERP
   default:
     fatal(err_msg("unexpected method kind: %d", kind));
--- a/src/share/vm/interpreter/templateInterpreter.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/interpreter/templateInterpreter.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -397,34 +397,39 @@
 
       // all non-native method kinds
       method_entry(zerolocals)
-        method_entry(zerolocals_synchronized)
-        method_entry(empty)
-        method_entry(accessor)
-        method_entry(abstract)
-        method_entry(java_lang_math_sin  )
-        method_entry(java_lang_math_cos  )
-        method_entry(java_lang_math_tan  )
-        method_entry(java_lang_math_abs  )
-        method_entry(java_lang_math_sqrt )
-        method_entry(java_lang_math_log  )
-        method_entry(java_lang_math_log10)
-        method_entry(java_lang_math_exp  )
-        method_entry(java_lang_math_pow  )
-        method_entry(java_lang_ref_reference_get)
+      method_entry(zerolocals_synchronized)
+      method_entry(empty)
+      method_entry(accessor)
+      method_entry(abstract)
+      method_entry(java_lang_math_sin  )
+      method_entry(java_lang_math_cos  )
+      method_entry(java_lang_math_tan  )
+      method_entry(java_lang_math_abs  )
+      method_entry(java_lang_math_sqrt )
+      method_entry(java_lang_math_log  )
+      method_entry(java_lang_math_log10)
+      method_entry(java_lang_math_exp  )
+      method_entry(java_lang_math_pow  )
+      method_entry(java_lang_ref_reference_get)
 
-        if (UseCRC32Intrinsics) {
-          method_entry(java_util_zip_CRC32_update)
-            method_entry(java_util_zip_CRC32_updateBytes)
-            method_entry(java_util_zip_CRC32_updateByteBuffer)
-            }
+      if (UseCRC32Intrinsics) {
+        method_entry(java_util_zip_CRC32_update)
+        method_entry(java_util_zip_CRC32_updateBytes)
+        method_entry(java_util_zip_CRC32_updateByteBuffer)
+      }
+
+      method_entry(java_lang_Float_intBitsToFloat);
+      method_entry(java_lang_Float_floatToRawIntBits);
+      method_entry(java_lang_Double_longBitsToDouble);
+      method_entry(java_lang_Double_doubleToRawLongBits);
 
       initialize_method_handle_entries();
 
       // all native method kinds (must be one contiguous block)
       Interpreter::_native_entry_begin = Interpreter::code()->code_end();
       method_entry(native)
-        method_entry(native_synchronized)
-        Interpreter::_native_entry_end = Interpreter::code()->code_end();
+      method_entry(native_synchronized)
+      Interpreter::_native_entry_end = Interpreter::code()->code_end();
 
 #undef method_entry
 
--- a/src/share/vm/memory/metaspace.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/memory/metaspace.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -254,7 +254,7 @@
   // Debugging support
   void verify();
 
-  static void print_compressed_class_space(outputStream* st, const char* requested_addr = 0);
+  static void print_compressed_class_space(outputStream* st, const char* requested_addr = 0) NOT_LP64({});
 
   class AllocRecordClosure :  public StackObj {
   public:
--- a/src/share/vm/opto/chaitin.cpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/opto/chaitin.cpp	Fri Aug 21 09:12:42 2015 +0200
@@ -990,9 +990,13 @@
         // FOUR registers!
 #ifdef ASSERT
         if (is_vect) {
-          assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
-          assert(!lrg._fat_proj, "sanity");
-          assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+          if (lrg.num_regs() != 0) {
+            assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
+            assert(!lrg._fat_proj, "sanity");
+            assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+          } else {
+            assert(n->is_Phi(), "not all inputs processed only if Phi");
+          }
         }
 #endif
         if (!is_vect && lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_pair()) {
--- a/src/share/vm/opto/compile.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/opto/compile.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -93,7 +93,7 @@
  public:
 
   void set_idx(node_idx_t idx) {
-    _idx_clone_orig = _idx_clone_orig & 0xFFFFFFFF00000000 | idx;
+    _idx_clone_orig = _idx_clone_orig & CONST64(0xFFFFFFFF00000000) | idx;
   }
   node_idx_t idx() const { return (node_idx_t)(_idx_clone_orig & 0xFFFFFFFF); }
 
--- a/src/share/vm/utilities/globalDefinitions_gcc.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/utilities/globalDefinitions_gcc.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -161,7 +161,7 @@
 
 
 //----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long canstant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)
 
 // Build a 64bit integer constant
 #define CONST64(x)  (x ## LL)
--- a/src/share/vm/utilities/globalDefinitions_sparcWorks.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/utilities/globalDefinitions_sparcWorks.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -178,7 +178,7 @@
 
 
 //----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long constant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)
 
 // Build a 64bit integer constant
 #define CONST64(x)  (x ## LL)
--- a/src/share/vm/utilities/globalDefinitions_visCPP.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/utilities/globalDefinitions_visCPP.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -148,9 +148,9 @@
 inline int g_isfinite(jdouble f)                 { return _finite(f); }
 
 //----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long constant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)
 
-// Build a 64bit integer constant on with Visual C++
+// Build a 64bit integer constant with Visual C++
 #define  CONST64(x) (x ##  i64)
 #define UCONST64(x) (x ## ui64)
 
--- a/src/share/vm/utilities/globalDefinitions_xlc.hpp	Thu Aug 20 09:31:28 2015 +0200
+++ b/src/share/vm/utilities/globalDefinitions_xlc.hpp	Fri Aug 21 09:12:42 2015 +0200
@@ -108,7 +108,7 @@
 
 
 //----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long canstant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)
 
 // Build a 64bit integer constant
 #define CONST64(x)  (x ## LL)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/floatingpoint/NaNTest.java	Fri Aug 21 09:12:42 2015 +0200
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/**
+ * @test
+ * @bug 8076373
+ * @summary Verify if signaling NaNs are preserved.
+ * @run main NaNTest
+ */
+public class NaNTest {
+    static void testFloat() {
+        int originalValue = 0x7f800001;
+        int readBackValue = Float.floatToRawIntBits(Float.intBitsToFloat(originalValue));
+        if (originalValue != readBackValue) {
+            String errorMessage = String.format("Original and read back float values mismatch\n0x%X 0x%X\n",
+                                                originalValue,
+                                                readBackValue);
+            throw new RuntimeException(errorMessage);
+        } else {
+            System.out.printf("Written and read back float values match\n0x%X 0x%X\n",
+                              originalValue,
+                              readBackValue);
+        }
+    }
+
+    static void testDouble() {
+        long originalValue = 0xFFF0000000000001L;
+        long readBackValue = Double.doubleToRawLongBits(Double.longBitsToDouble(originalValue));
+        if (originalValue != readBackValue) {
+            String errorMessage = String.format("Original and read back double values mismatch\n0x%X 0x%X\n",
+                                                originalValue,
+                                                readBackValue);
+            throw new RuntimeException(errorMessage);
+        } else {
+            System.out.printf("Written and read back double values match\n0x%X 0x%X\n",
+                              originalValue,
+                              readBackValue);
+        }
+
+    }
+
+    public static void main(String args[]) {
+        System.out.println("### NanTest started");
+
+        testFloat();
+        testDouble();
+
+        System.out.println("### NanTest ended");
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/regalloc/TestVectorRegAlloc.java	Fri Aug 21 09:12:42 2015 +0200
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8131969
+ * @summary assert in register allocation code when vector Phi for a loop is processed because code assumes all inputs already processed
+ * @run main/othervm -Xbatch TestVectorRegAlloc
+ *
+ */
+
+public class TestVectorRegAlloc {
+
+    static int test_helper_i;
+    static boolean test_helper() {
+        test_helper_i++;
+        return (test_helper_i & 7) != 0;
+    }
+
+    static void test(double[] src, double[] dst, boolean flag) {
+        double j = 0.0;
+        while(test_helper()) {
+            for (int i = 0; i < src.length; i++) {
+                dst[i] = src[i] + j;
+            }
+            // Loop will be unswitched and ReplicateD of zero will be
+            // split through the Phi of outer loop
+            for (int i = 0; i < src.length; i++) {
+                double k;
+                if (flag) {
+                    k = j;
+                } else {
+                    k = 0;
+                }
+                dst[i] = src[i] + k;
+            }
+            j++;
+        }
+    }
+
+    static public void main(String[] args) {
+        double[] src = new double[10];
+        double[] dst = new double[10];
+        for (int i = 0; i < 20000; i++) {
+            test(src, dst, (i % 2) == 0);
+        }
+    }
+}