changeset 10418:2f5d1578b240 jdk-9+110

Merge
author lana
date Thu, 10 Mar 2016 09:50:18 -0800
parents 407003fcbdb9 797e6aac6d53
children 0de4d895a5c8 4d4f3f5b215a 07536fb80fad
files src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/development/Server16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/development/Server24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/About16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/About24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Delete16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Delete24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Find16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Help16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Help24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/History16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/History24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Information16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Information24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/New16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/New24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Open16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Open24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Save24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/SaveAs16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/SaveAs24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Zoom16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/ZoomIn16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/ZoomIn24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/navigation/Down16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/navigation/Up16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignCenter16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignCenter24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignLeft16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignLeft24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignRight16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignRight24.gif src/share/vm/jvmci/commandLineFlagConstraintsJVMCI.cpp src/share/vm/jvmci/commandLineFlagConstraintsJVMCI.hpp test/serviceability/dcmd/jvmti/LoadJavaAgentDcmdTest.java
diffstat 306 files changed, 8962 insertions(+), 4117 deletions(-) [+]
line wrap: on
line diff
--- a/make/aix/makefiles/trace.make	Thu Mar 10 09:28:10 2016 -0800
+++ b/make/aix/makefiles/trace.make	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -57,11 +57,6 @@
 TraceGeneratedNames +=  \
 	traceRequestables.hpp \
     traceEventControl.hpp
-
-ifneq ($(INCLUDE_TRACE), false)
-TraceGeneratedNames += traceProducer.cpp
-endif
-
 endif
 
 TraceGeneratedFiles = $(TraceGeneratedNames:%=$(TraceOutDir)/%)
@@ -100,9 +95,6 @@
 $(TraceOutDir)/traceEventClasses.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceEventClasses.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	$(GENERATE_CODE)
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
--- a/make/bsd/makefiles/trace.make	Thu Mar 10 09:28:10 2016 -0800
+++ b/make/bsd/makefiles/trace.make	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -57,11 +57,6 @@
 TraceGeneratedNames +=  \
 	traceRequestables.hpp \
     traceEventControl.hpp
-
-ifneq ($(INCLUDE_TRACE), false)
-TraceGeneratedNames += traceProducer.cpp
-endif
-
 endif
 
 
@@ -101,9 +96,6 @@
 $(TraceOutDir)/traceEventClasses.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceEventClasses.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	$(GENERATE_CODE)
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
--- a/make/linux/makefiles/trace.make	Thu Mar 10 09:28:10 2016 -0800
+++ b/make/linux/makefiles/trace.make	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -57,11 +57,6 @@
 TraceGeneratedNames +=  \
 	traceRequestables.hpp \
     traceEventControl.hpp
-
-ifneq ($(INCLUDE_TRACE), false)
-TraceGeneratedNames += traceProducer.cpp
-endif
-
 endif
 
 TraceGeneratedFiles = $(TraceGeneratedNames:%=$(TraceOutDir)/%)
@@ -100,9 +95,6 @@
 $(TraceOutDir)/traceEventClasses.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceEventClasses.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	$(GENERATE_CODE)
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
--- a/make/solaris/makefiles/amd64.make	Thu Mar 10 09:28:10 2016 -0800
+++ b/make/solaris/makefiles/amd64.make	Thu Mar 10 09:50:18 2016 -0800
@@ -39,7 +39,7 @@
 # of OPT_CFLAGS. Restore it here.
 ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
   OPT_CFLAGS/generateOptoStub.o += -g0 -xs
-  OPT_CFLAGS/LinearScan.o += -g0 -xs
+  OPT_CFLAGS/c1_LinearScan.o += -g0 -xs
 endif
 
 else
--- a/make/solaris/makefiles/trace.make	Thu Mar 10 09:28:10 2016 -0800
+++ b/make/solaris/makefiles/trace.make	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -56,8 +56,7 @@
 ifeq ($(HAS_ALT_SRC), true)
 TraceGeneratedNames +=  \
 	traceRequestables.hpp \
-    traceEventControl.hpp \
-    traceProducer.cpp
+    traceEventControl.hpp
 endif
 
 TraceGeneratedFiles = $(TraceGeneratedNames:%=$(TraceOutDir)/%)
@@ -96,9 +95,6 @@
 $(TraceOutDir)/traceEventClasses.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceEventClasses.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	$(GENERATE_CODE)
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
--- a/make/test/JtregNative.gmk	Thu Mar 10 09:28:10 2016 -0800
+++ b/make/test/JtregNative.gmk	Thu Mar 10 09:50:18 2016 -0800
@@ -48,6 +48,7 @@
     $(HOTSPOT_TOPDIR)/test/runtime/SameObject \
     $(HOTSPOT_TOPDIR)/test/compiler/floatingpoint/ \
     $(HOTSPOT_TOPDIR)/test/compiler/calls \
+    $(HOTSPOT_TOPDIR)/test/compiler/native \
     #
 
 # Add conditional directories here when needed.
--- a/make/windows/makefiles/trace.make	Thu Mar 10 09:28:10 2016 -0800
+++ b/make/windows/makefiles/trace.make	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -43,8 +43,7 @@
 !if EXISTS($(TraceAltSrcDir))
 TraceGeneratedNames = $(TraceGeneratedNames) \
     traceRequestables.hpp \
-    traceEventControl.hpp \
-    traceProducer.cpp
+    traceEventControl.hpp
 !endif
 
 
@@ -58,8 +57,7 @@
 !if EXISTS($(TraceAltSrcDir))
 TraceGeneratedFiles = $(TraceGeneratedFiles) \
 	$(TraceOutDir)/traceRequestables.hpp \
-    $(TraceOutDir)/traceEventControl.hpp \
-	$(TraceOutDir)/traceProducer.cpp
+    $(TraceOutDir)/traceEventControl.hpp
 !endif
 
 XSLT = $(QUIETLY) $(REMOTE) $(RUN_JAVA) -classpath $(JvmtiOutDir) jvmtiGen
@@ -98,10 +96,6 @@
 	@echo Generating AltSrc $@
 	@$(XSLT) -IN $(TraceSrcDir)/trace.xml -XSL $(TraceAltSrcDir)/traceEventClasses.xsl -OUT $(TraceOutDir)/traceEventClasses.hpp
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	@echo Generating AltSrc $@
-	@$(XSLT) -IN $(TraceSrcDir)/trace.xml -XSL $(TraceAltSrcDir)/traceProducer.xsl -OUT $(TraceOutDir)/traceProducer.cpp
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	@echo Generating AltSrc $@
 	@$(XSLT) -IN $(TraceSrcDir)/trace.xml -XSL $(TraceAltSrcDir)/traceRequestables.xsl -OUT $(TraceOutDir)/traceRequestables.hpp
--- a/src/cpu/aarch64/vm/aarch64.ad	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/aarch64/vm/aarch64.ad	Thu Mar 10 09:50:18 2016 -0800
@@ -1041,10 +1041,8 @@
   bool is_card_mark_membar(const MemBarNode *barrier);
   bool is_CAS(int opcode);
 
-  MemBarNode *leading_to_normal(MemBarNode *leading);
-  MemBarNode *normal_to_leading(const MemBarNode *barrier);
-  MemBarNode *card_mark_to_trailing(const MemBarNode *barrier);
-  MemBarNode *trailing_to_card_mark(const MemBarNode *trailing);
+  MemBarNode *leading_to_trailing(MemBarNode *leading);
+  MemBarNode *card_mark_to_leading(const MemBarNode *barrier);
   MemBarNode *trailing_to_leading(const MemBarNode *trailing);
 
   // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
@@ -1418,23 +1416,28 @@
   // leading MemBarRelease and a trailing MemBarVolatile as follows
   //
   //   MemBarRelease
-  //  {      ||      } -- optional
+  //  {    ||        } -- optional
   //  {MemBarCPUOrder}
-  //         ||     \\
-  //         ||     StoreX[mo_release]
-  //         | \     /
-  //         | MergeMem
-  //         | /
+  //       ||       \\
+  //       ||     StoreX[mo_release]
+  //       | \ Bot    / ???
+  //       | MergeMem
+  //       | /
   //   MemBarVolatile
   //
   // where
   //  || and \\ represent Ctl and Mem feeds via Proj nodes
   //  | \ and / indicate further routing of the Ctl and Mem feeds
   //
-  // this is the graph we see for non-object stores. however, for a
-  // volatile Object store (StoreN/P) we may see other nodes below the
-  // leading membar because of the need for a GC pre- or post-write
-  // barrier.
+  // Note that the memory feed from the CPUOrder membar to the
+  // MergeMem node is an AliasIdxBot slice while the feed from the
+  // StoreX is for a slice determined by the type of value being
+  // written.
+  //
+  // the diagram above shows the graph we see for non-object stores.
+  // for a volatile Object store (StoreN/P) we may see other nodes
+  // below the leading membar because of the need for a GC pre- or
+  // post-write barrier.
   //
   // with most GC configurations we with see this simple variant which
   // includes a post-write barrier card mark.
@@ -1442,7 +1445,7 @@
   //   MemBarRelease______________________________
   //         ||    \\               Ctl \        \\
   //         ||    StoreN/P[mo_release] CastP2X  StoreB/CM
-  //         | \     /                       . . .  /
+  //         | \ Bot  / oop                 . . .  /
   //         | MergeMem
   //         | /
   //         ||      /
@@ -1452,152 +1455,142 @@
   // the object address to an int used to compute the card offset) and
   // Ctl+Mem to a StoreB node (which does the actual card mark).
   //
-  // n.b. a StoreCM node will only appear in this configuration when
-  // using CMS. StoreCM differs from a normal card mark write (StoreB)
-  // because it implies a requirement to order visibility of the card
-  // mark (StoreCM) relative to the object put (StoreP/N) using a
-  // StoreStore memory barrier (arguably this ought to be represented
-  // explicitly in the ideal graph but that is not how it works). This
-  // ordering is required for both non-volatile and volatile
-  // puts. Normally that means we need to translate a StoreCM using
-  // the sequence
+  // n.b. a StoreCM node is only ever used when CMS (with or without
+  // CondCardMark) or G1 is configured. This abstract instruction
+  // differs from a normal card mark write (StoreB) because it implies
+  // a requirement to order visibility of the card mark (StoreCM)
+  // after that of the object put (StoreP/N) using a StoreStore memory
+  // barrier. Note that this is /not/ a requirement to order the
+  // instructions in the generated code (that is already guaranteed by
+  // the order of memory dependencies). Rather it is a requirement to
+  // ensure visibility order which only applies on architectures like
+  // AArch64 which do not implement TSO. This ordering is required for
+  // both non-volatile and volatile puts.
+  //
+  // That implies that we need to translate a StoreCM using the
+  // sequence
   //
   //   dmb ishst
   //   stlrb
   //
-  // However, in the case of a volatile put if we can recognise this
-  // configuration and plant an stlr for the object write then we can
-  // omit the dmb and just plant an strb since visibility of the stlr
-  // is ordered before visibility of subsequent stores. StoreCM nodes
-  // also arise when using G1 or using CMS with conditional card
-  // marking. In these cases (as we shall see) we don't need to insert
-  // the dmb when translating StoreCM because there is already an
-  // intervening StoreLoad barrier between it and the StoreP/N.
-  //
-  // It is also possible to perform the card mark conditionally on it
-  // currently being unmarked in which case the volatile put graph
-  // will look slightly different
+  // This dmb cannot be omitted even when the associated StoreX or
+  // CompareAndSwapX is implemented using stlr. However, as described
+  // below there are circumstances where a specific GC configuration
+  // requires a stronger barrier in which case it can be omitted.
+  // 
+  // With the Serial or Parallel GC using +CondCardMark the card mark
+  // is performed conditionally on it currently being unmarked in
+  // which case the volatile put graph looks slightly different
   //
   //   MemBarRelease____________________________________________
   //         ||    \\               Ctl \     Ctl \     \\  Mem \
   //         ||    StoreN/P[mo_release] CastP2X   If   LoadB     |
-  //         | \     /                              \            |
+  //         | \ Bot / oop                          \            |
   //         | MergeMem                            . . .      StoreB
   //         | /                                                /
   //         ||     /
   //   MemBarVolatile
   //
-  // It is worth noting at this stage that both the above
+  // It is worth noting at this stage that all the above
   // configurations can be uniquely identified by checking that the
   // memory flow includes the following subgraph:
   //
   //   MemBarRelease
   //  {MemBarCPUOrder}
-  //          |  \      . . .
-  //          |  StoreX[mo_release]  . . .
-  //          |   /
-  //         MergeMem
-  //          |
+  //      |  \      . . .
+  //      |  StoreX[mo_release]  . . .
+  //  Bot |   / oop
+  //     MergeMem
+  //      |
   //   MemBarVolatile
   //
-  // This is referred to as a *normal* subgraph. It can easily be
-  // detected starting from any candidate MemBarRelease,
-  // StoreX[mo_release] or MemBarVolatile.
-  //
-  // A simple variation on this normal case occurs for an unsafe CAS
-  // operation. The basic graph for a non-object CAS is
+  // This is referred to as a *normal* volatile store subgraph. It can
+  // easily be detected starting from any candidate MemBarRelease,
+  // StoreX[mo_release] or MemBarVolatile node.
+  //
+  // A small variation on this normal case occurs for an unsafe CAS
+  // operation. The basic memory flow subgraph for a non-object CAS is
+  // as follows
   //
   //   MemBarRelease
   //         ||
   //   MemBarCPUOrder
-  //         ||     \\   . . .
-  //         ||     CompareAndSwapX
-  //         ||       |
-  //         ||     SCMemProj
-  //         | \     /
-  //         | MergeMem
-  //         | /
+  //          |     \\   . . .
+  //          |     CompareAndSwapX
+  //          |       |
+  //      Bot |     SCMemProj
+  //           \     / Bot
+  //           MergeMem
+  //           /
   //   MemBarCPUOrder
   //         ||
   //   MemBarAcquire
   //
   // The same basic variations on this arrangement (mutatis mutandis)
-  // occur when a card mark is introduced. i.e. we se the same basic
-  // shape but the StoreP/N is replaced with CompareAndSawpP/N and the
-  // tail of the graph is a pair comprising a MemBarCPUOrder +
-  // MemBarAcquire.
-  //
-  // So, in the case of a CAS the normal graph has the variant form
-  //
-  //   MemBarRelease
-  //   MemBarCPUOrder
-  //          |   \      . . .
-  //          |  CompareAndSwapX  . . .
-  //          |    |
-  //          |   SCMemProj
-  //          |   /  . . .
-  //         MergeMem
-  //          |
-  //   MemBarCPUOrder
-  //   MemBarAcquire
-  //
-  // This graph can also easily be detected starting from any
-  // candidate MemBarRelease, CompareAndSwapX or MemBarAcquire.
-  //
-  // the code below uses two helper predicates, leading_to_normal and
-  // normal_to_leading to identify these normal graphs, one validating
-  // the layout starting from the top membar and searching down and
-  // the other validating the layout starting from the lower membar
-  // and searching up.
-  //
-  // There are two special case GC configurations when a normal graph
-  // may not be generated: when using G1 (which always employs a
-  // conditional card mark); and when using CMS with conditional card
-  // marking configured. These GCs are both concurrent rather than
-  // stop-the world GCs. So they introduce extra Ctl+Mem flow into the
-  // graph between the leading and trailing membar nodes, in
-  // particular enforcing stronger memory serialisation beween the
-  // object put and the corresponding conditional card mark. CMS
-  // employs a post-write GC barrier while G1 employs both a pre- and
-  // post-write GC barrier. Of course the extra nodes may be absent --
-  // they are only inserted for object puts. This significantly
-  // complicates the task of identifying whether a MemBarRelease,
-  // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
-  // when using these GC configurations (see below). It adds similar
-  // complexity to the task of identifying whether a MemBarRelease,
-  // CompareAndSwapX or MemBarAcquire forms part of a CAS.
-  //
-  // In both cases the post-write subtree includes an auxiliary
-  // MemBarVolatile (StoreLoad barrier) separating the object put and
-  // the read of the corresponding card. This poses two additional
-  // problems.
-  //
-  // Firstly, a card mark MemBarVolatile needs to be distinguished
-  // from a normal trailing MemBarVolatile. Resolving this first
-  // problem is straightforward: a card mark MemBarVolatile always
-  // projects a Mem feed to a StoreCM node and that is a unique marker
+  // occur when a card mark is introduced. i.e. the CPUOrder MemBar
+  // feeds the extra CastP2X, LoadB etc nodes but the above memory
+  // flow subgraph is still present.
+  // 
+  // This is referred to as a *normal* CAS subgraph. It can easily be
+  // detected starting from any candidate MemBarRelease,
+  // StoreX[mo_release] or MemBarAcquire node.
+  //
+  // The code below uses two helper predicates, leading_to_trailing
+  // and trailing_to_leading to identify these normal graphs, one
+  // validating the layout starting from the top membar and searching
+  // down and the other validating the layout starting from the lower
+  // membar and searching up.
+  //
+  // There are two special case GC configurations when the simple
+  // normal graphs above may not be generated: when using G1 (which
+  // always employs a conditional card mark); and when using CMS with
+  // conditional card marking (+CondCardMark) configured. These GCs
+  // are both concurrent rather than stop-the world GCs. So they
+  // introduce extra Ctl+Mem flow into the graph between the leading
+  // and trailing membar nodes, in particular enforcing stronger
+  // memory serialisation beween the object put and the corresponding
+  // conditional card mark. CMS employs a post-write GC barrier while
+  // G1 employs both a pre- and post-write GC barrier.
+  //
+  // The post-write barrier subgraph for these configurations includes
+  // a MemBarVolatile node -- referred to as a card mark membar --
+  // which is needed to order the card write (StoreCM) operation in
+  // the barrier, the preceding StoreX (or CompareAndSwapX) and Store
+  // operations performed by GC threads i.e. a card mark membar
+  // constitutes a StoreLoad barrier hence must be translated to a dmb
+  // ish (whether or not it sits inside a volatile store sequence).
+  //
+  // Of course, the use of the dmb ish for the card mark membar also
+  // implies theat the StoreCM which follows can omit the dmb ishst
+  // instruction. The necessary visibility ordering will already be
+  // guaranteed by the dmb ish. In sum, the dmb ishst instruction only
+  // needs to be generated for as part of the StoreCM sequence with GC
+  // configuration +CMS -CondCardMark.
+  // 
+  // Of course all these extra barrier nodes may well be absent --
+  // they are only inserted for object puts. Their potential presence
+  // significantly complicates the task of identifying whether a
+  // MemBarRelease, StoreX[mo_release], MemBarVolatile or
+  // MemBarAcquire forms part of a volatile put or CAS when using
+  // these GC configurations (see below) and also complicates the
+  // decision as to how to translate a MemBarVolatile and StoreCM.
+  //
+  // So, thjis means that a card mark MemBarVolatile occurring in the
+  // post-barrier graph it needs to be distinguished from a normal
+  // trailing MemBarVolatile. Resolving this is straightforward: a
+  // card mark MemBarVolatile always projects a Mem feed to a StoreCM
+  // node and that is a unique marker
   //
   //      MemBarVolatile (card mark)
   //       C |    \     . . .
   //         |   StoreCM   . . .
   //       . . .
   //
-  // The second problem is how the code generator is to translate the
-  // card mark barrier? It always needs to be translated to a "dmb
-  // ish" instruction whether or not it occurs as part of a volatile
-  // put. A StoreLoad barrier is needed after the object put to ensure
-  // i) visibility to GC threads of the object put and ii) visibility
-  // to the mutator thread of any card clearing write by a GC
-  // thread. Clearly a normal store (str) will not guarantee this
-  // ordering but neither will a releasing store (stlr). The latter
-  // guarantees that the object put is visible but does not guarantee
-  // that writes by other threads have also been observed.
-  //
-  // So, returning to the task of translating the object put and the
-  // leading/trailing membar nodes: what do the non-normal node graph
-  // look like for these 2 special cases? and how can we determine the
-  // status of a MemBarRelease, StoreX[mo_release] or MemBarVolatile
-  // in both normal and non-normal cases?
+  // Returning to the task of translating the object put and the
+  // leading/trailing membar nodes: what do the node graphs look like
+  // for these 2 special cases? and how can we determine the status of
+  // a MemBarRelease, StoreX[mo_release] or MemBarVolatile in both
+  // normal and non-normal cases?
   //
   // A CMS GC post-barrier wraps its card write (StoreCM) inside an If
   // which selects conditonal execution based on the value loaded
@@ -1608,91 +1601,117 @@
   // which looks like this
   //
   //   MemBarRelease
-  //   MemBarCPUOrder_(leading)__________________
-  //     C |    M \       \\                   C \
-  //       |       \    StoreN/P[mo_release]  CastP2X
-  //       |    Bot \    /
-  //       |       MergeMem
-  //       |         /
-  //      MemBarVolatile (card mark)
-  //     C |  ||    M |
-  //       | LoadB    |
-  //       |   |      |
-  //       | Cmp      |\
-  //       | /        | \
-  //       If         |  \
-  //       | \        |   \
-  // IfFalse  IfTrue  |    \
-  //       \     / \  |     \
-  //        \   / StoreCM    |
-  //         \ /      |      |
-  //        Region   . . .   |
-  //          | \           /
-  //          |  . . .  \  / Bot
+  //   MemBarCPUOrder_(leading)____________________
+  //     C |  | M \       \\               M |   C \
+  //       |  |    \    StoreN/P[mo_release] |  CastP2X
+  //       |  | Bot \    / oop      \        |
+  //       |  |    MergeMem          \      / 
+  //       |  |      /                |    /
+  //     MemBarVolatile (card mark)   |   /
+  //     C |  ||    M |               |  /
+  //       | LoadB    | Bot       oop | / Bot
+  //       |   |      |              / /
+  //       | Cmp      |\            / /
+  //       | /        | \          / /
+  //       If         |  \        / /
+  //       | \        |   \      / /
+  // IfFalse  IfTrue  |    \    / /
+  //       \     / \  |    |   / /
+  //        \   / StoreCM  |  / /
+  //         \ /      \   /  / /
+  //        Region     Phi  / /
+  //          | \   Raw |  / /
+  //          |  . . .  | / /
   //          |       MergeMem
-  //          |          |
+  //          |           |
   //        MemBarVolatile (trailing)
   //
-  // The first MergeMem merges the AliasIdxBot Mem slice from the
-  // leading membar and the oopptr Mem slice from the Store into the
-  // card mark membar. The trailing MergeMem merges the AliasIdxBot
-  // Mem slice from the card mark membar and the AliasIdxRaw slice
-  // from the StoreCM into the trailing membar (n.b. the latter
-  // proceeds via a Phi associated with the If region).
-  //
-  // The graph for a CAS varies slightly, the obvious difference being
-  // that the StoreN/P node is replaced by a CompareAndSwapP/N node
-  // and the trailing MemBarVolatile by a MemBarCPUOrder +
-  // MemBarAcquire pair. The other important difference is that the
-  // CompareAndSwap node's SCMemProj is not merged into the card mark
-  // membar - it still feeds the trailing MergeMem. This also means
-  // that the card mark membar receives its Mem feed directly from the
-  // leading membar rather than via a MergeMem.
+  // Notice that there are two MergeMem nodes below the leading
+  // membar. The first MergeMem merges the AliasIdxBot Mem slice from
+  // the leading membar and the oopptr Mem slice from the Store into
+  // the card mark membar. The trailing MergeMem merges the
+  // AliasIdxBot Mem slice from the leading membar, the AliasIdxRaw
+  // slice from the StoreCM and an oop slice from the StoreN/P node
+  // into the trailing membar (n.b. the raw slice proceeds via a Phi
+  // associated with the If region).
+  //
+  // So, in the case of CMS + CondCardMark the volatile object store
+  // graph still includes a normal volatile store subgraph from the
+  // leading membar to the trailing membar. However, it also contains
+  // the same shape memory flow to the card mark membar. The two flows
+  // can be distinguished by testing whether or not the downstream
+  // membar is a card mark membar.
+  //
+  // The graph for a CAS also varies with CMS + CondCardMark, in
+  // particular employing a control feed from the CompareAndSwapX node
+  // through a CmpI and If to the card mark membar and StoreCM which
+  // updates the associated card. This avoids executing the card mark
+  // if the CAS fails. However, it can be seen from the diagram below
+  // that the presence of the barrier does not alter the normal CAS
+  // memory subgraph where the leading membar feeds a CompareAndSwapX,
+  // an SCMemProj, a MergeMem then a final trailing MemBarCPUOrder and
+  // MemBarAcquire pair.
   //
   //   MemBarRelease
-  //   MemBarCPUOrder__(leading)_________________________
-  //       ||                       \\                 C \
-  //   MemBarVolatile (card mark)  CompareAndSwapN/P  CastP2X
-  //     C |  ||    M |              |
-  //       | LoadB    |       ______/|
-  //       |   |      |      /       |
-  //       | Cmp      |     /      SCMemProj
-  //       | /        |    /         |
-  //       If         |   /         /
-  //       | \        |  /         /
-  // IfFalse  IfTrue  | /         /
-  //       \     / \  |/ prec    /
-  //        \   / StoreCM       /
-  //         \ /      |        /
-  //        Region   . . .    /
-  //          | \            /
-  //          |  . . .  \   / Bot
-  //          |       MergeMem
-  //          |          |
-  //        MemBarCPUOrder
-  //        MemBarAcquire (trailing)
+  //   MemBarCPUOrder__(leading)_______________________
+  //   C /  M |                        \\            C \
+  //  . . .   | Bot                CompareAndSwapN/P   CastP2X
+  //          |                  C /  M |
+  //          |                 CmpI    |
+  //          |                  /      |
+  //          |               . . .     |
+  //          |              IfTrue     |
+  //          |              /          |
+  //       MemBarVolatile (card mark)   |
+  //        C |  ||    M |              |
+  //          | LoadB    | Bot   ______/|
+  //          |   |      |      /       |
+  //          | Cmp      |     /      SCMemProj
+  //          | /        |    /         |
+  //          If         |   /         /
+  //          | \        |  /         / Bot
+  //     IfFalse  IfTrue | /         /
+  //          |   / \   / / prec    /
+  //   . . .  |  /  StoreCM        /
+  //        \ | /      | raw      /
+  //        Region    . . .      /
+  //           | \              /
+  //           |   . . .   \    / Bot
+  //           |        MergeMem
+  //           |          /
+  //         MemBarCPUOrder
+  //         MemBarAcquire (trailing)
   //
   // This has a slightly different memory subgraph to the one seen
-  // previously but the core of it is the same as for the CAS normal
-  // sungraph
+  // previously but the core of it has a similar memory flow to the
+  // CAS normal subgraph:
   //
   //   MemBarRelease
   //   MemBarCPUOrder____
-  //      ||             \      . . .
-  //   MemBarVolatile  CompareAndSwapX  . . .
-  //      |  \            |
-  //        . . .   SCMemProj
-  //          |     /  . . .
-  //         MergeMem
-  //          |
+  //         |          \      . . .
+  //         |       CompareAndSwapX  . . .
+  //         |       C /  M |
+  //         |      CmpI    |
+  //         |       /      |
+  //         |      . .    /
+  //     Bot |   IfTrue   /
+  //         |   /       /
+  //    MemBarVolatile  /
+  //         | ...     /
+  //      StoreCM ... /
+  //         |       / 
+  //       . . .  SCMemProj
+  //      Raw \    / Bot
+  //        MergeMem
+  //           |
   //   MemBarCPUOrder
   //   MemBarAcquire
   //
-  //
-  // G1 is quite a lot more complicated. The nodes inserted on behalf
-  // of G1 may comprise: a pre-write graph which adds the old value to
-  // the SATB queue; the releasing store itself; and, finally, a
-  // post-write graph which performs a card mark.
+  // The G1 graph for a volatile object put is a lot more complicated.
+  // Nodes inserted on behalf of G1 may comprise: a pre-write graph
+  // which adds the old value to the SATB queue; the releasing store
+  // itself; and, finally, a post-write graph which performs a card
+  // mark.
   //
   // The pre-write graph may be omitted, but only when the put is
   // writing to a newly allocated (young gen) object and then only if
@@ -1730,25 +1749,60 @@
   //       | CastP2X | StoreN/P[mo_release] |
   //       |         |         |            |
   //     C |       M |       M |          M |
-  //        \        |         |           /
+  //        \        | Raw     | oop       / Bot
   //                  . . .
   //          (post write subtree elided)
   //                    . . .
   //             C \         M /
   //         MemBarVolatile (trailing)
   //
+  // Note that the three memory feeds into the post-write tree are an
+  // AliasRawIdx slice associated with the writes in the pre-write
+  // tree, an oop type slice from the StoreX specific to the type of
+  // the volatile field and the AliasBotIdx slice emanating from the
+  // leading membar.
+  //
   // n.b. the LoadB in this subgraph is not the card read -- it's a
   // read of the SATB queue active flag.
   //
-  // Once again the CAS graph is a minor variant on the above with the
-  // expected substitutions of CompareAndSawpX for StoreN/P and
-  // MemBarCPUOrder + MemBarAcquire for trailing MemBarVolatile.
+  // The CAS graph is once again a variant of the above with a
+  // CompareAndSwapX node and SCMemProj in place of the StoreX.  The
+  // value from the CompareAndSwapX node is fed into the post-write
+  // graph aling with the AliasIdxRaw feed from the pre-barrier and
+  // the AliasIdxBot feeds from the leading membar and the ScMemProj.
+  //
+  //  MemBarRelease (leading)____________
+  //     C |  ||  M \   M \    M \  M \ . . .
+  //       | LoadB   \  LoadL  LoadN   \
+  //       | /        \                 \
+  //       If         |\                 \
+  //       | \        | \                 \
+  //  IfFalse  IfTrue |  \                 \
+  //       |     |    |   \                 \
+  //       |     If   |    \                 |
+  //       |     |          \                |
+  //       |                 \               |
+  //       |    . . .         \              |
+  //       | /       | /       \             |
+  //      Region  Phi[M]        \            |
+  //       | \       |           \           |
+  //       |  \_____ |            |          |
+  //     C | C \     |            |          |
+  //       | CastP2X |     CompareAndSwapX   |
+  //       |         |   res |     |         |
+  //     C |       M |       |  SCMemProj  M |
+  //        \        | Raw   |     | Bot    / Bot
+  //                  . . .
+  //          (post write subtree elided)
+  //                    . . .
+  //             C \         M /
+  //         MemBarVolatile (trailing)
   //
   // The G1 post-write subtree is also optional, this time when the
   // new value being written is either null or can be identified as a
   // newly allocated (young gen) object with no intervening control
   // flow. The latter cannot happen but the former may, in which case
-  // the card mark membar is omitted and the memory feeds form the
+  // the card mark membar is omitted and the memory feeds from the
   // leading membar and the SToreN/P are merged direct into the
   // trailing membar as per the normal subgraph. So, the only special
   // case which arises is when the post-write subgraph is generated.
@@ -1770,94 +1824,106 @@
   //
   //                (pre-write subtree elided)
   //        . . .                  . . .    . . .  . . .
-  //        C |                    M |     M |    M |
-  //       Region                  Phi[M] StoreN    |
-  //          |                     / \      |      |
-  //         / \_______            /   \     |      |
-  //      C / C \      . . .            \    |      |
-  //       If   CastP2X . . .            |   |      |
-  //       / \                           |   |      |
-  //      /   \                          |   |      |
-  // IfFalse IfTrue                      |   |      |
-  //   |       |                         |   |     /|
-  //   |       If                        |   |    / |
-  //   |      / \                        |   |   /  |
-  //   |     /   \                        \  |  /   |
-  //   | IfFalse IfTrue                   MergeMem  |
-  //   |  . . .    / \                       /      |
-  //   |          /   \                     /       |
-  //   |     IfFalse IfTrue                /        |
-  //   |      . . .    |                  /         |
-  //   |               If                /          |
-  //   |               / \              /           |
-  //   |              /   \            /            |
-  //   |         IfFalse IfTrue       /             |
-  //   |           . . .   |         /              |
-  //   |                    \       /               |
-  //   |                     \     /                |
-  //   |             MemBarVolatile__(card mark)    |
-  //   |                ||   C |  M \  M \          |
-  //   |               LoadB   If    |    |         |
-  //   |                      / \    |    |         |
-  //   |                     . . .   |    |         |
-  //   |                          \  |    |        /
-  //   |                        StoreCM   |       /
-  //   |                          . . .   |      /
-  //   |                        _________/      /
-  //   |                       /  _____________/
-  //   |   . . .       . . .  |  /            /
-  //   |    |                 | /   _________/
-  //   |    |               Phi[M] /        /
-  //   |    |                 |   /        /
-  //   |    |                 |  /        /
-  //   |  Region  . . .     Phi[M]  _____/
-  //   |    /                 |    /
-  //   |                      |   /
-  //   | . . .   . . .        |  /
-  //   | /                    | /
-  // Region           |  |  Phi[M]
-  //   |              |  |  / Bot
-  //    \            MergeMem
-  //     \            /
-  //     MemBarVolatile
-  //
-  // As with CMS the initial MergeMem merges the AliasIdxBot Mem slice
-  // from the leading membar and the oopptr Mem slice from the Store
-  // into the card mark membar i.e. the memory flow to the card mark
-  // membar still looks like a normal graph.
-  //
-  // The trailing MergeMem merges an AliasIdxBot Mem slice with other
-  // Mem slices (from the StoreCM and other card mark queue stores).
-  // However in this case the AliasIdxBot Mem slice does not come
-  // direct from the card mark membar. It is merged through a series
-  // of Phi nodes. These are needed to merge the AliasIdxBot Mem flow
-  // from the leading membar with the Mem feed from the card mark
-  // membar. Each Phi corresponds to one of the Ifs which may skip
-  // around the card mark membar. So when the If implementing the NULL
-  // value check has been elided the total number of Phis is 2
-  // otherwise it is 3.
-  //
-  // The CAS graph when using G1GC also includes a pre-write subgraph
-  // and an optional post-write subgraph. Teh sam evarioations are
-  // introduced as for CMS with conditional card marking i.e. the
-  // StoreP/N is swapped for a CompareAndSwapP/N, the tariling
-  // MemBarVolatile for a MemBarCPUOrder + MemBarAcquire pair and the
-  // Mem feed from the CompareAndSwapP/N includes a precedence
-  // dependency feed to the StoreCM and a feed via an SCMemProj to the
-  // trailing membar. So, as before the configuration includes the
-  // normal CAS graph as a subgraph of the memory flow.
-  //
-  // So, the upshot is that in all cases the volatile put graph will
-  // include a *normal* memory subgraph betwen the leading membar and
-  // its child membar, either a volatile put graph (including a
-  // releasing StoreX) or a CAS graph (including a CompareAndSwapX).
-  // When that child is not a card mark membar then it marks the end
-  // of the volatile put or CAS subgraph. If the child is a card mark
-  // membar then the normal subgraph will form part of a volatile put
-  // subgraph if and only if the child feeds an AliasIdxBot Mem feed
-  // to a trailing barrier via a MergeMem. That feed is either direct
-  // (for CMS) or via 2 or 3 Phi nodes merging the leading barrier
-  // memory flow (for G1).
+  //        C |               M |    M |    M |
+  //       Region            Phi[M] StoreN    |
+  //          |            Raw  |  oop |  Bot |
+  //         / \_______         |\     |\     |\
+  //      C / C \      . . .    | \    | \    | \
+  //       If   CastP2X . . .   |  \   |  \   |  \
+  //       / \                  |   \  |   \  |   \
+  //      /   \                 |    \ |    \ |    \
+  // IfFalse IfTrue             |      |      |     \
+  //   |       |                 \     |     /       |
+  //   |       If                 \    | \  /   \    |
+  //   |      / \                  \   |   /     \   |
+  //   |     /   \                  \  |  / \     |  |
+  //   | IfFalse IfTrue           MergeMem   \    |  |
+  //   |  . . .    / \                 |      \   |  |
+  //   |          /   \                |       |  |  |
+  //   |     IfFalse IfTrue            |       |  |  |
+  //   |      . . .    |               |       |  |  |
+  //   |               If             /        |  |  |
+  //   |               / \           /         |  |  |
+  //   |              /   \         /          |  |  |
+  //   |         IfFalse IfTrue    /           |  |  |
+  //   |           . . .   |      /            |  |  |
+  //   |                    \    /             |  |  |
+  //   |                     \  /              |  |  |
+  //   |         MemBarVolatile__(card mark  ) |  |  |
+  //   |              ||   C |     \           |  |  |
+  //   |             LoadB   If     |         /   |  |
+  //   |                    / \ Raw |        /   /  /
+  //   |                   . . .    |       /   /  /
+  //   |                        \   |      /   /  /
+  //   |                        StoreCM   /   /  /
+  //   |                           |     /   /  /
+  //   |                            . . .   /  /
+  //   |                                   /  /
+  //   |   . . .                          /  /
+  //   |    |             | /            /  /
+  //   |    |           Phi[M] /        /  /
+  //   |    |             |   /        /  /
+  //   |    |             |  /        /  /
+  //   |  Region  . . .  Phi[M]      /  /
+  //   |    |             |         /  /
+  //    \   |             |        /  /
+  //     \  | . . .       |       /  /
+  //      \ |             |      /  /
+  //      Region         Phi[M] /  /
+  //        |               \  /  /
+  //         \             MergeMem
+  //          \            /
+  //          MemBarVolatile
+  //
+  // As with CMS + CondCardMark the first MergeMem merges the
+  // AliasIdxBot Mem slice from the leading membar and the oopptr Mem
+  // slice from the Store into the card mark membar. However, in this
+  // case it may also merge an AliasRawIdx mem slice from the pre
+  // barrier write.
+  //
+  // The trailing MergeMem merges an AliasIdxBot Mem slice from the
+  // leading membar with an oop slice from the StoreN and an
+  // AliasRawIdx slice from the post barrier writes. In this case the
+  // AliasIdxRaw Mem slice is merged through a series of Phi nodes
+  // which combine feeds from the If regions in the post barrier
+  // subgraph.
+  //
+  // So, for G1 the same characteristic subgraph arises as for CMS +
+  // CondCardMark. There is a normal subgraph feeding the card mark
+  // membar and a normal subgraph feeding the trailing membar.
+  //
+  // The CAS graph when using G1GC also includes an optional
+  // post-write subgraph. It is very similar to the above graph except
+  // for a few details.
+  // 
+  // - The control flow is gated by an additonal If which tests the
+  // result from the CompareAndSwapX node
+  // 
+  //  - The MergeMem which feeds the card mark membar only merges the
+  // AliasIdxBot slice from the leading membar and the AliasIdxRaw
+  // slice from the pre-barrier. It does not merge the SCMemProj
+  // AliasIdxBot slice. So, this subgraph does not look like the
+  // normal CAS subgraph.
+  //
+  // - The MergeMem which feeds the trailing membar merges the
+  // AliasIdxBot slice from the leading membar, the AliasIdxRaw slice
+  // from the post-barrier and the SCMemProj AliasIdxBot slice i.e. it
+  // has two AliasIdxBot input slices. However, this subgraph does
+  // still look like the normal CAS subgraph.
+  //
+  // So, the upshot is:
+  //
+  // In all cases a volatile put graph will include a *normal*
+  // volatile store subgraph betwen the leading membar and the
+  // trailing membar. It may also include a normal volatile store
+  // subgraph betwen the leading membar and the card mark membar.
+  //
+  // In all cases a CAS graph will contain a unique normal CAS graph
+  // feeding the trailing membar.
+  //
+  // In all cases where there is a card mark membar (either as part of
+  // a volatile object put or CAS) it will be fed by a MergeMem whose
+  // AliasIdxBot slice feed will be a leading membar.
   //
   // The predicates controlling generation of instructions for store
   // and barrier nodes employ a few simple helper functions (described
@@ -1878,24 +1944,24 @@
 	    opcode == Op_CompareAndSwapP);
   }
 
-  // leading_to_normal
+  // leading_to_trailing
   //
   //graph traversal helper which detects the normal case Mem feed from
   // a release membar (or, optionally, its cpuorder child) to a
   // dependent volatile membar i.e. it ensures that one or other of
   // the following Mem flow subgraph is present.
   //
-  //   MemBarRelease
-  //   MemBarCPUOrder {leading}
-  //          |  \      . . .
-  //          |  StoreN/P[mo_release]  . . .
-  //          |   /
-  //         MergeMem
-  //          |
-  //   MemBarVolatile {trailing or card mark}
-  //
-  //   MemBarRelease
-  //   MemBarCPUOrder {leading}
+  //   MemBarRelease {leading}
+  //   {MemBarCPUOrder} {optional}
+  //     Bot |  \      . . .
+  //         |  StoreN/P[mo_release]  . . .
+  //         |   /
+  //        MergeMem
+  //         |
+  //   MemBarVolatile {not card mark}
+  //
+  //   MemBarRelease {leading}
+  //   {MemBarCPUOrder} {optional}
   //      |       \      . . .
   //      |     CompareAndSwapX  . . .
   //               |
@@ -1906,6 +1972,23 @@
   //    MemBarCPUOrder
   //    MemBarAcquire {trailing}
   //
+  // the predicate needs to be capable of distinguishing the following
+  // volatile put graph which may arises when a GC post barrier
+  // inserts a card mark membar
+  //
+  //   MemBarRelease {leading}
+  //   {MemBarCPUOrder}__
+  //     Bot |   \       \
+  //         |   StoreN/P \
+  //         |    / \     |
+  //        MergeMem \    |
+  //         |        \   |
+  //   MemBarVolatile  \  |
+  //    {card mark}     \ |
+  //                  MergeMem
+  //                      |
+  // {not card mark} MemBarVolatile
+  //
   // if the correct configuration is present returns the trailing
   // membar otherwise NULL.
   //
@@ -1916,7 +1999,7 @@
   // the returned value may be a card mark or trailing membar
   //
 
-  MemBarNode *leading_to_normal(MemBarNode *leading)
+  MemBarNode *leading_to_trailing(MemBarNode *leading)
   {
     assert((leading->Opcode() == Op_MemBarRelease ||
 	    leading->Opcode() == Op_MemBarCPUOrder),
@@ -1933,15 +2016,21 @@
     StoreNode * st = NULL;
     LoadStoreNode *cas = NULL;
     MergeMemNode *mm = NULL;
+    MergeMemNode *mm2 = NULL;
 
     for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
       x = mem->fast_out(i);
       if (x->is_MergeMem()) {
 	if (mm != NULL) {
-	  return NULL;
+	  if (mm2 != NULL) {
+	  // should not see more than 2 merge mems
+	    return NULL;
+	  } else {
+	    mm2 = x->as_MergeMem();
+	  }
+	} else {
+	  mm = x->as_MergeMem();
 	}
-	// two merge mems is one too many
-	mm = x->as_MergeMem();
       } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
 	// two releasing stores/CAS nodes is one too many
 	if (st != NULL || cas != NULL) {
@@ -1961,13 +2050,13 @@
       return NULL;
     }
 
-    // must have a merge if we also have st
+    // must have at least one merge if we also have st
     if (st && !mm) {
       return NULL;
     }
 
-    Node *y = NULL;
     if (cas) {
+      Node *y = NULL;
       // look for an SCMemProj
       for (DUIterator_Fast imax, i = cas->fast_outs(imax); i < imax; i++) {
 	x = cas->fast_out(i);
@@ -1987,10 +2076,29 @@
 	  break;
 	}
       }
-      if (mm == NULL)
+      if (mm == NULL) {
 	return NULL;
+      }
+      MemBarNode *mbar = NULL;
+      // ensure the merge feeds a trailing membar cpuorder + acquire pair
+      for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+	x = mm->fast_out(i);
+	if (x->is_MemBar()) {
+	  int opcode = x->Opcode();
+	  if (opcode == Op_MemBarCPUOrder) {
+	    MemBarNode *z =  x->as_MemBar();
+	    z = child_membar(z);
+	    if (z != NULL && z->Opcode() == Op_MemBarAcquire) {
+	      mbar = z;
+	    }
+	  }
+	  break;
+	}
+      }
+      return mbar;
     } else {
-      // ensure the store feeds the existing mergemem;
+      Node *y = NULL;
+      // ensure the store feeds the first mergemem;
       for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
 	if (st->fast_out(i) == mm) {
 	  y = st;
@@ -2000,55 +2108,89 @@
       if (y == NULL) {
 	return NULL;
       }
-    }
-
-    MemBarNode *mbar = NULL;
-    // ensure the merge feeds to the expected type of membar
-    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
-      x = mm->fast_out(i);
-      if (x->is_MemBar()) {
-	int opcode = x->Opcode();
-	if (opcode == Op_MemBarVolatile && st) {
-	  mbar = x->as_MemBar();
-	} else if (cas && opcode == Op_MemBarCPUOrder) {
-	  MemBarNode *y =  x->as_MemBar();
-	  y = child_membar(y);
-	  if (y != NULL && y->Opcode() == Op_MemBarAcquire) {
-	    mbar = y;
+      if (mm2 != NULL) {
+	// ensure the store feeds the second mergemem;
+	y = NULL;
+	for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+	  if (st->fast_out(i) == mm2) {
+	    y = st;
 	  }
 	}
-	break;
+	if (y == NULL) {
+	  return NULL;
+	}
       }
-    }
-
-    return mbar;
-  }
-
-  // normal_to_leading
+
+      MemBarNode *mbar = NULL;
+      // ensure the first mergemem feeds a volatile membar
+      for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+	x = mm->fast_out(i);
+	if (x->is_MemBar()) {
+	  int opcode = x->Opcode();
+	  if (opcode == Op_MemBarVolatile) {
+	    mbar = x->as_MemBar();
+	  }
+	  break;
+	}
+      }
+      if (mm2 == NULL) {
+	// this is our only option for a trailing membar
+	return mbar;
+      }
+      // ensure the second mergemem feeds a volatile membar
+      MemBarNode *mbar2 = NULL;
+      for (DUIterator_Fast imax, i = mm2->fast_outs(imax); i < imax; i++) {
+	x = mm2->fast_out(i);
+	if (x->is_MemBar()) {
+	  int opcode = x->Opcode();
+	  if (opcode == Op_MemBarVolatile) {
+	    mbar2 = x->as_MemBar();
+	  }
+	  break;
+	}
+      }
+      // if we have two merge mems we must have two volatile membars
+      if (mbar == NULL || mbar2 == NULL) {
+	return NULL;
+      }
+      // return the trailing membar
+      if (is_card_mark_membar(mbar2)) {
+	return mbar;
+      } else {
+	if (is_card_mark_membar(mbar)) {
+	  return mbar2;
+	} else {
+	  return NULL;
+	}
+      }
+    }
+  }
+
+  // trailing_to_leading
   //
   // graph traversal helper which detects the normal case Mem feed
-  // from either a card mark or a trailing membar to a preceding
-  // release membar (optionally its cpuorder child) i.e. it ensures
-  // that one or other of the following Mem flow subgraphs is present.
-  //
-  //   MemBarRelease
-  //   MemBarCPUOrder {leading}
-  //          |  \      . . .
-  //          |  StoreN/P[mo_release]  . . .
-  //          |   /
-  //         MergeMem
-  //          |
-  //   MemBarVolatile {card mark or trailing}
-  //
-  //   MemBarRelease
-  //   MemBarCPUOrder {leading}
+  // from a trailing membar to a preceding release membar (optionally
+  // its cpuorder child) i.e. it ensures that one or other of the
+  // following Mem flow subgraphs is present.
+  //
+  //   MemBarRelease {leading}
+  //   MemBarCPUOrder {optional}
+  //    | Bot |  \      . . .
+  //    |     |  StoreN/P[mo_release]  . . .
+  //    |     |   /
+  //    |    MergeMem
+  //    |     |
+  //   MemBarVolatile {not card mark}
+  //
+  //   MemBarRelease {leading}
+  //   MemBarCPUOrder {optional}
   //      |       \      . . .
   //      |     CompareAndSwapX  . . .
   //               |
   //     . . .    SCMemProj
   //           \   |
   //      |    MergeMem
-  //      |        /
+  //      |       |
   //    MemBarCPUOrder
   //    MemBarAcquire {trailing}
   //
@@ -2058,15 +2200,20 @@
   // if the configuration is present returns the cpuorder member for
   // preference or when absent the release membar otherwise NULL.
   //
-  // n.b. the input membar is expected to be a MemBarVolatile but
-  // need not be a card mark membar.
-
-  MemBarNode *normal_to_leading(const MemBarNode *barrier)
+  // n.b. the input membar is expected to be a MemBarVolatile or
+  // MemBarAcquire. if it is a MemBarVolatile it must *not* be a card
+  // mark membar.
+
+  MemBarNode *trailing_to_leading(const MemBarNode *barrier)
   {
     // input must be a volatile membar
     assert((barrier->Opcode() == Op_MemBarVolatile ||
 	    barrier->Opcode() == Op_MemBarAcquire),
 	   "expecting a volatile or an acquire membar");
+
+    assert((barrier->Opcode() != Op_MemBarVolatile) ||
+	   !is_card_mark_membar(barrier),
+	   "not expecting a card mark membar");
     Node *x;
     bool is_cas = barrier->Opcode() == Op_MemBarAcquire;
 
@@ -2179,169 +2326,35 @@
     return NULL;
   }
 
-  // card_mark_to_trailing
-  //
-  // graph traversal helper which detects extra, non-normal Mem feed
-  // from a card mark volatile membar to a trailing membar i.e. it
-  // ensures that one of the following three GC post-write Mem flow
-  // subgraphs is present.
-  //
-  // 1)
-  //     . . .
-  //       |
-  //   MemBarVolatile (card mark)
-  //      |          |
-  //      |        StoreCM
-  //      |          |
-  //      |        . . .
-  //  Bot |  /
-  //   MergeMem
-  //      |
-  //      |
-  //    MemBarVolatile {trailing}
-  //
-  // 2)
-  //   MemBarRelease/CPUOrder (leading)
-  //    |
-  //    |
-  //    |\       . . .
-  //    | \        |
-  //    |  \  MemBarVolatile (card mark)
-  //    |   \   |     |
-  //     \   \  |   StoreCM    . . .
-  //      \   \ |
-  //       \  Phi
-  //        \ /
-  //        Phi  . . .
+  // card_mark_to_leading
+  //
+  // graph traversal helper which traverses from a card mark volatile
+  // membar to a leading membar i.e. it ensures that the following Mem
+  // flow subgraph is present.
+  //
+  //    MemBarRelease {leading}
+  //   {MemBarCPUOrder} {optional}
+  //         |   . . .
   //     Bot |   /
-  //       MergeMem
+  //      MergeMem
   //         |
-  //    MemBarVolatile {trailing}
-  //
-  //
-  // 3)
-  //   MemBarRelease/CPUOrder (leading)
-  //    |
-  //    |\
-  //    | \
-  //    |  \      . . .
-  //    |   \       |
-  //    |\   \  MemBarVolatile (card mark)
-  //    | \   \   |     |
-  //    |  \   \  |   StoreCM    . . .
-  //    |   \   \ |
-  //     \   \  Phi
-  //      \   \ /
-  //       \  Phi
-  //        \ /
-  //        Phi  . . .
-  //     Bot |   /
-  //       MergeMem
-  //         |
-  //         |
-  //    MemBarVolatile {trailing}
-  //
-  // configuration 1 is only valid if UseConcMarkSweepGC &&
-  // UseCondCardMark
-  //
-  // configurations 2 and 3 are only valid if UseG1GC.
-  //
-  // if a valid configuration is present returns the trailing membar
-  // otherwise NULL.
-  //
-  // n.b. the supplied membar is expected to be a card mark
-  // MemBarVolatile i.e. the caller must ensure the input node has the
-  // correct operand and feeds Mem to a StoreCM node
-
-  MemBarNode *card_mark_to_trailing(const MemBarNode *barrier)
+  //     MemBarVolatile (card mark)
+  //        |     \
+  //      . . .   StoreCM
+  //
+  // if the configuration is present returns the cpuorder member for
+  // preference or when absent the release membar otherwise NULL.
+  //
+  // n.b. the input membar is expected to be a MemBarVolatile amd must
+  // be a card mark membar.
+
+  MemBarNode *card_mark_to_leading(const MemBarNode *barrier)
   {
     // input must be a card mark volatile membar
     assert(is_card_mark_membar(barrier), "expecting a card mark membar");
 
-    Node *feed = barrier->proj_out(TypeFunc::Memory);
-    Node *x;
-    MergeMemNode *mm = NULL;
-
-    const int MAX_PHIS = 3;	// max phis we will search through
-    int phicount = 0; 		// current search count
-
-    bool retry_feed = true;
-    while (retry_feed) {
-      // see if we have a direct MergeMem feed
-      for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
-	x = feed->fast_out(i);
-	// the correct Phi will be merging a Bot memory slice
-	if (x->is_MergeMem()) {
-	  mm = x->as_MergeMem();
-	  break;
-	}
-      }
-      if (mm) {
-	retry_feed = false;
-      } else if (UseG1GC & phicount++ < MAX_PHIS) {
-	// the barrier may feed indirectly via one or two Phi nodes
-	PhiNode *phi = NULL;
-	for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
-	  x = feed->fast_out(i);
-	  // the correct Phi will be merging a Bot memory slice
-	  if (x->is_Phi() && x->adr_type() == TypePtr::BOTTOM) {
-	    phi = x->as_Phi();
-	    break;
-	  }
-	}
-	if (!phi) {
-	  return NULL;
-	}
-	// look for another merge below this phi
-	feed = phi;
-      } else {
-	// couldn't find a merge
-	return NULL;
-      }
-    }
-
-    // sanity check this feed turns up as the expected slice
-    assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
-
-    MemBarNode *trailing = NULL;
-    // be sure we have a trailing membar the merge
-    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
-      x = mm->fast_out(i);
-      if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-	trailing = x->as_MemBar();
-	break;
-      }
-    }
-
-    return trailing;
-  }
-
-  // trailing_to_card_mark
-  //
-  // graph traversal helper which detects extra, non-normal Mem feed
-  // from a trailing volatile membar to a preceding card mark volatile
-  // membar i.e. it identifies whether one of the three possible extra
-  // GC post-write Mem flow subgraphs is present
-  //
-  // this predicate checks for the same flow as the previous predicate
-  // but starting from the bottom rather than the top.
-  //
-  // if the configuration is present returns the card mark membar
-  // otherwise NULL
-  //
-  // n.b. the supplied membar is expected to be a trailing
-  // MemBarVolatile i.e. the caller must ensure the input node has the
-  // correct opcode
-
-  MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
-  {
-    assert(trailing->Opcode() == Op_MemBarVolatile,
-	   "expecting a volatile membar");
-    assert(!is_card_mark_membar(trailing),
-	   "not expecting a card mark membar");
-
     // the Mem feed to the membar should be a merge
-    Node *x = trailing->in(TypeFunc::Memory);
+    Node *x = barrier->in(TypeFunc::Memory);
     if (!x->is_MergeMem()) {
       return NULL;
     }
@@ -2349,117 +2362,19 @@
     MergeMemNode *mm = x->as_MergeMem();
 
     x = mm->in(Compile::AliasIdxBot);
-    // with G1 we may possibly see a Phi or two before we see a Memory
-    // Proj from the card mark membar
-
-    const int MAX_PHIS = 3;	// max phis we will search through
-    int phicount = 0; 		// current search count
-
-    bool retry_feed = !x->is_Proj();
-
-    while (retry_feed) {
-      if (UseG1GC && x->is_Phi() && phicount++ < MAX_PHIS) {
-	PhiNode *phi = x->as_Phi();
-	ProjNode *proj = NULL;
-	PhiNode *nextphi = NULL;
-	bool found_leading = false;
-	for (uint i = 1; i < phi->req(); i++) {
-	  x = phi->in(i);
-	  if (x->is_Phi()) {
-	    nextphi = x->as_Phi();
-	  } else if (x->is_Proj()) {
-	    int opcode = x->in(0)->Opcode();
-	    if (opcode == Op_MemBarVolatile) {
-	      proj = x->as_Proj();
-	    } else if (opcode == Op_MemBarRelease ||
-		       opcode == Op_MemBarCPUOrder) {
-	      // probably a leading membar
-	      found_leading = true;
-	    }
-	  }
-	}
-	// if we found a correct looking proj then retry from there
-	// otherwise we must see a leading and a phi or this the
-	// wrong config
-	if (proj != NULL) {
-	  x = proj;
-	  retry_feed = false;
-	} else if (found_leading && nextphi != NULL) {
-	  // retry from this phi to check phi2
-	  x = nextphi;
-	} else {
-	  // not what we were looking for
-	  return NULL;
-	}
-      } else {
-	return NULL;
-      }
-    }
-    // the proj has to come from the card mark membar
-    x = x->in(0);
+
     if (!x->is_MemBar()) {
       return NULL;
     }
 
-    MemBarNode *card_mark_membar = x->as_MemBar();
-
-    if (!is_card_mark_membar(card_mark_membar)) {
-      return NULL;
-    }
-
-    return card_mark_membar;
-  }
-
-  // trailing_to_leading
-  //
-  // graph traversal helper which checks the Mem flow up the graph
-  // from a (non-card mark) trailing membar attempting to locate and
-  // return an associated leading membar. it first looks for a
-  // subgraph in the normal configuration (relying on helper
-  // normal_to_leading). failing that it then looks for one of the
-  // possible post-write card mark subgraphs linking the trailing node
-  // to a the card mark membar (relying on helper
-  // trailing_to_card_mark), and then checks that the card mark membar
-  // is fed by a leading membar (once again relying on auxiliary
-  // predicate normal_to_leading).
-  //
-  // if the configuration is valid returns the cpuorder member for
-  // preference or when absent the release membar otherwise NULL.
-  //
-  // n.b. the input membar is expected to be either a volatile or
-  // acquire membar but in the former case must *not* be a card mark
-  // membar.
-
-  MemBarNode *trailing_to_leading(const MemBarNode *trailing)
-  {
-    assert((trailing->Opcode() == Op_MemBarAcquire ||
-	    trailing->Opcode() == Op_MemBarVolatile),
-	   "expecting an acquire or volatile membar");
-    assert((trailing->Opcode() != Op_MemBarVolatile ||
-	    !is_card_mark_membar(trailing)),
-	   "not expecting a card mark membar");
-
-    MemBarNode *leading = normal_to_leading(trailing);
-
-    if (leading) {
+    MemBarNode *leading = x->as_MemBar();
+
+    if (leading_membar(leading)) {
       return leading;
     }
 
-    // nothing more to do if this is an acquire
-    if (trailing->Opcode() == Op_MemBarAcquire) {
-      return NULL;
-    }
-
-    MemBarNode *card_mark_membar = trailing_to_card_mark(trailing);
-
-    if (!card_mark_membar) {
-      return NULL;
-    }
-
-    return normal_to_leading(card_mark_membar);
-  }
-
-  // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
+    return NULL;
+  }
 
 bool unnecessary_acquire(const Node *barrier)
 {
@@ -2675,19 +2590,8 @@
   }
 
   // must start with a normal feed
-  MemBarNode *child_barrier = leading_to_normal(barrier);
-
-  if (!child_barrier) {
-    return false;
-  }
-
-  if (!is_card_mark_membar(child_barrier)) {
-    // this is the trailing membar and we are done
-    return true;
-  }
-
-  // must be sure this card mark feeds a trailing membar
-  MemBarNode *trailing = card_mark_to_trailing(child_barrier);
+  MemBarNode *trailing = leading_to_trailing(barrier);
+
   return (trailing != NULL);
 }
 
@@ -2709,7 +2613,7 @@
   }
 
   // ok, if it's not a card mark then we still need to check if it is
-  // a trailing membar of a volatile put hgraph.
+  // a trailing membar of a volatile put graph.
 
   return (trailing_to_leading(mbvol) != NULL);
 }
@@ -2759,20 +2663,9 @@
   }
 
   // does this lead a normal subgraph?
-  MemBarNode *mbvol = leading_to_normal(barrier);
-
-  if (!mbvol) {
-    return false;
-  }
-
-  // all done unless this is a card mark
-  if (!is_card_mark_membar(mbvol)) {
-    return true;
-  }
-
-  // we found a card mark -- just make sure we have a trailing barrier
-
-  return (card_mark_to_trailing(mbvol) != NULL);
+  MemBarNode *trailing = leading_to_trailing(barrier);
+
+  return (trailing != NULL);
 }
 
 // predicate controlling translation of CAS
@@ -2814,7 +2707,7 @@
 	  "CAS not fed by cpuorder+release membar pair!");
 
   // does this lead a normal subgraph?
-  MemBarNode *mbar = leading_to_normal(barrier);
+  MemBarNode *mbar = leading_to_trailing(barrier);
 
   assert(mbar != NULL, "CAS not embedded in normal graph!");
 
@@ -2835,48 +2728,27 @@
 
   // we only ever need to generate a dmb ishst between an object put
   // and the associated card mark when we are using CMS without
-  // conditional card marking
+  // conditional card marking. Any other occurence will happen when
+  // performing a card mark using CMS with conditional card marking or
+  // G1. In those cases the preceding MamBarVolatile will be
+  // translated to a dmb ish which guarantes visibility of the
+  // preceding StoreN/P before this StoreCM
 
   if (!UseConcMarkSweepGC || UseCondCardMark) {
     return true;
   }
 
-  // if we are implementing volatile puts using barriers then the
-  // object put as an str so we must insert the dmb ishst
+  // if we are implementing volatile puts using barriers then we must
+  // insert the dmb ishst
 
   if (UseBarriersForVolatile) {
     return false;
   }
 
-  // we can omit the dmb ishst if this StoreCM is part of a volatile
-  // put because in thta case the put will be implemented by stlr
-  //
-  // we need to check for a normal subgraph feeding this StoreCM.
-  // that means the StoreCM must be fed Memory from a leading membar,
-  // either a MemBarRelease or its dependent MemBarCPUOrder, and the
-  // leading membar must be part of a normal subgraph
-
-  Node *x = storecm->in(StoreNode::Memory);
-
-  if (!x->is_Proj()) {
-    return false;
-  }
-
-  x = x->in(0);
-
-  if (!x->is_MemBar()) {
-    return false;
-  }
-
-  MemBarNode *leading = x->as_MemBar();
-
-  // reject invalid candidates
-  if (!leading_membar(leading)) {
-    return false;
-  }
-
-  // we can omit the StoreStore if it is the head of a normal subgraph
-  return (leading_to_normal(leading) != NULL);
+  // we must be using CMS with conditional card marking so we ahve to
+  // generate the StoreStore
+
+  return false;
 }
 
 
@@ -13409,7 +13281,7 @@
     __ fmovs($dst$$Register, as_FloatRegister($src$$reg));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(fp_f2i);
 
 %}
 
@@ -13427,7 +13299,7 @@
     __ fmovs(as_FloatRegister($dst$$reg), $src$$Register);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(fp_i2f);
 
 %}
 
@@ -13445,7 +13317,7 @@
     __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(fp_d2l);
 
 %}
 
@@ -13463,7 +13335,7 @@
     __ fmovd(as_FloatRegister($dst$$reg), $src$$Register);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(fp_l2d);
 
 %}
 
@@ -14319,6 +14191,25 @@
   ins_pipe(pipe_cmp_branch);
 %}
 
+instruct cmpN_imm0_branch(cmpOp cmp, iRegN op1, immN0 op2, label labl, rFlagsReg cr) %{
+  match(If cmp (CmpN op1 op2));
+  predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
+	    || n->in(1)->as_Bool()->_test._test == BoolTest::eq);
+  effect(USE labl);
+
+  ins_cost(BRANCH_COST);
+  format %{ "cbw$cmp   $op1, $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
+    if (cond == Assembler::EQ)
+      __ cbzw($op1$$Register, *L);
+    else
+      __ cbnzw($op1$$Register, *L);
+  %}
+  ins_pipe(pipe_cmp_branch);
+%}
+
 instruct cmpP_narrowOop_imm0_branch(cmpOp cmp, iRegN oop, immP0 zero, label labl, rFlagsReg cr) %{
   match(If cmp (CmpP (DecodeN oop) zero));
   predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
@@ -14911,19 +14802,19 @@
 %}
 
 instruct string_equals(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
-                        iRegI_R0 result, iRegP_R10 tmp, rFlagsReg cr)
+                        iRegI_R0 result, rFlagsReg cr)
 %{
   predicate(!CompactStrings);
   match(Set result (StrEquals (Binary str1 str2) cnt));
-  effect(KILL tmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL cr);
-
-  format %{ "String Equals $str1,$str2,$cnt -> $result    // KILL $tmp" %}
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL cr);
+
+  format %{ "String Equals $str1,$str2,$cnt -> $result" %}
   ins_encode %{
     // Count is in 8-bit bytes; non-Compact chars are 16 bits.
     __ asrw($cnt$$Register, $cnt$$Register, 1);
-    __ string_equals($str1$$Register, $str2$$Register,
-                      $cnt$$Register, $result$$Register,
-                      $tmp$$Register);
+    __ arrays_equals($str1$$Register, $str2$$Register,
+                     $result$$Register, $cnt$$Register,
+                     2, /*is_string*/true);
   %}
   ins_pipe(pipe_class_memory);
 %}
@@ -14937,9 +14828,10 @@
 
   format %{ "Array Equals $ary1,ary2 -> $result    // KILL $tmp" %}
   ins_encode %{
-    __ byte_arrays_equals($ary1$$Register, $ary2$$Register,
-                          $result$$Register, $tmp$$Register);
-  %}
+    __ arrays_equals($ary1$$Register, $ary2$$Register,
+                     $result$$Register, $tmp$$Register,
+                     1, /*is_string*/false);
+    %}
   ins_pipe(pipe_class_memory);
 %}
 
@@ -14952,12 +14844,14 @@
 
   format %{ "Array Equals $ary1,ary2 -> $result    // KILL $tmp" %}
   ins_encode %{
-    __ char_arrays_equals($ary1$$Register, $ary2$$Register,
-                          $result$$Register, $tmp$$Register);
+    __ arrays_equals($ary1$$Register, $ary2$$Register,
+                     $result$$Register, $tmp$$Register,
+                     2, /*is_string*/false);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
+
 // encode char[] to byte[] in ISO_8859_1
 instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
                           vRegD_V0 Vtmp1, vRegD_V1 Vtmp2,
@@ -16608,7 +16502,7 @@
             as_FloatRegister($src$$reg),
             as_FloatRegister($shift$$reg));
   %}
-  ins_pipe(vshift64_imm);
+  ins_pipe(vshift64);
 %}
 
 instruct vsll4I(vecX dst, vecX src, vecX shift) %{
@@ -16622,7 +16516,7 @@
             as_FloatRegister($src$$reg),
             as_FloatRegister($shift$$reg));
   %}
-  ins_pipe(vshift128_imm);
+  ins_pipe(vshift128);
 %}
 
 instruct vsrl2I(vecD dst, vecD src, vecX shift) %{
@@ -16635,7 +16529,7 @@
             as_FloatRegister($src$$reg),
             as_FloatRegister($shift$$reg));
   %}
-  ins_pipe(vshift64_imm);
+  ins_pipe(vshift64);
 %}
 
 instruct vsrl4I(vecX dst, vecX src, vecX shift) %{
@@ -16648,7 +16542,7 @@
             as_FloatRegister($src$$reg),
             as_FloatRegister($shift$$reg));
   %}
-  ins_pipe(vshift128_imm);
+  ins_pipe(vshift128);
 %}
 
 instruct vsll2I_imm(vecD dst, vecD src, immI shift) %{
@@ -16766,7 +16660,7 @@
            as_FloatRegister($src$$reg),
            (int)$shift$$constant & 63);
   %}
-  ins_pipe(vshift128);
+  ins_pipe(vshift128_imm);
 %}
 
 instruct vsra2L_imm(vecX dst, vecX src, immI shift) %{
--- a/src/cpu/aarch64/vm/jvmciCodeInstaller_aarch64.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/aarch64/vm/jvmciCodeInstaller_aarch64.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -74,7 +74,7 @@
 void CodeInstaller::pd_patch_DataSectionReference(int pc_offset, int data_offset, TRAPS) {
   address pc = _instructions->start() + pc_offset;
   NativeInstruction* inst = nativeInstruction_at(pc);
-  if (inst->is_adr_aligned()) {
+  if (inst->is_adr_aligned() || inst->is_ldr_literal()) {
     address dest = _constants->start() + data_offset;
     _instructions->relocate(pc, section_word_Relocation::spec((address) dest, CodeBuffer::SECT_CONSTS));
     TRACE_jvmci_3("relocating at " PTR_FORMAT " (+%d) with destination at %d", p2i(pc), pc_offset, data_offset);
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -4481,225 +4481,126 @@
   BLOCK_COMMENT("} string_compare");
 }
 
-
-void MacroAssembler::string_equals(Register str1, Register str2,
-                                   Register cnt, Register result,
-                                   Register tmp1) {
-  Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
-    NEXT_WORD;
-
-  const Register tmp2 = rscratch1;
-  assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);
-
-  BLOCK_COMMENT("string_equals {");
-
-  // Start by assuming that the strings are not equal.
-  mov(result, zr);
-
-  // A very short string
-  cmpw(cnt, 4);
-  br(Assembler::LT, SHORT_STRING);
-
-  // Check if the strings start at the same location.
-  cmp(str1, str2);
-  br(Assembler::EQ, SAME_CHARS);
-
-  // Compare longwords
+// Compare Strings or char/byte arrays.
+
+// is_string is true iff this is a string comparison.
+
+// For Strings we're passed the address of the first characters in a1
+// and a2 and the length in cnt1.
+
+// For byte and char arrays we're passed the arrays themselves and we
+// have to extract length fields and do null checks here.
+
+// elem_size is the element size in bytes: either 1 or 2.
+
+// There are two implementations.  For arrays >= 8 bytes, all
+// comparisons (including the final one, which may overlap) are
+// performed 8 bytes at a time.  For arrays < 8 bytes, we compare a
+// halfword, then a short, and then a byte.
+
+void MacroAssembler::arrays_equals(Register a1, Register a2,
+                                   Register result, Register cnt1,
+                                   int elem_size, bool is_string)
+{
+  Label SAME, DONE, SHORT, NEXT_WORD, ONE;
+  Register tmp1 = rscratch1;
+  Register tmp2 = rscratch2;
+  Register cnt2 = tmp2;  // cnt2 only used in array length compare
+  int elem_per_word = wordSize/elem_size;
+  int log_elem_size = exact_log2(elem_size);
+  int length_offset = arrayOopDesc::length_offset_in_bytes();
+  int base_offset
+    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
+
+  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
+  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
+
+  BLOCK_COMMENT(is_string ? "string_equals {" : "array_equals {");
+
+  mov(result, false);
+
+  if (!is_string) {
+    // if (a==a2)
+    //     return true;
+    eor(rscratch1, a1, a2);
+    cbz(rscratch1, SAME);
+    // if (a==null || a2==null)
+    //     return false;
+    cbz(a1, DONE);
+    cbz(a2, DONE);
+    // if (a1.length != a2.length)
+    //      return false;
+    ldrw(cnt1, Address(a1, length_offset));
+    ldrw(cnt2, Address(a2, length_offset));
+    eorw(tmp1, cnt1, cnt2);
+    cbnzw(tmp1, DONE);
+
+    lea(a1, Address(a1, base_offset));
+    lea(a2, Address(a2, base_offset));
+  }
+
+  // Check for short strings, i.e. smaller than wordSize.
+  subs(cnt1, cnt1, elem_per_word);
+  br(Assembler::LT, SHORT);
+  // Main 8 byte comparison loop.
+  bind(NEXT_WORD); {
+    ldr(tmp1, Address(post(a1, wordSize)));
+    ldr(tmp2, Address(post(a2, wordSize)));
+    subs(cnt1, cnt1, elem_per_word);
+    eor(tmp1, tmp1, tmp2);
+    cbnz(tmp1, DONE);
+  } br(GT, NEXT_WORD);
+  // Last longword.  In the case where length == 4 we compare the
+  // same longword twice, but that's still faster than another
+  // conditional branch.
+  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
+  // length == 4.
+  if (log_elem_size > 0)
+    lsl(cnt1, cnt1, log_elem_size);
+  ldr(tmp1, Address(a1, cnt1));
+  ldr(tmp2, Address(a2, cnt1));
+  eor(tmp1, tmp1, tmp2);
+  cbnz(tmp1, DONE);
+  b(SAME);
+
+  bind(SHORT);
+  Label TAIL03, TAIL01;
+
+  tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
   {
-    subw(cnt, cnt, 4); // The last longword is a special case
-
-    // Move both string pointers to the last longword of their
-    // strings, negate the remaining count, and convert it to bytes.
-    lea(str1, Address(str1, cnt, Address::uxtw(1)));
-    lea(str2, Address(str2, cnt, Address::uxtw(1)));
-    sub(cnt, zr, cnt, LSL, 1);
-
-    // Loop, loading longwords and comparing them into rscratch2.
-    bind(NEXT_WORD);
-    ldr(tmp1, Address(str1, cnt));
-    ldr(tmp2, Address(str2, cnt));
-    adds(cnt, cnt, wordSize);
-    eor(rscratch2, tmp1, tmp2);
-    cbnz(rscratch2, DONE);
-    br(Assembler::LT, NEXT_WORD);
-
-    // Last longword.  In the case where length == 4 we compare the
-    // same longword twice, but that's still faster than another
-    // conditional branch.
-
-    ldr(tmp1, Address(str1));
-    ldr(tmp2, Address(str2));
-    eor(rscratch2, tmp1, tmp2);
-    cbz(rscratch2, SAME_CHARS);
-    b(DONE);
+    ldrw(tmp1, Address(post(a1, 4)));
+    ldrw(tmp2, Address(post(a2, 4)));
+    eorw(tmp1, tmp1, tmp2);
+    cbnzw(tmp1, DONE);
   }
-
-  bind(SHORT_STRING);
-  // Is the length zero?
-  cbz(cnt, SAME_CHARS);
-
-  bind(SHORT_LOOP);
-  load_unsigned_short(tmp1, Address(post(str1, 2)));
-  load_unsigned_short(tmp2, Address(post(str2, 2)));
-  subw(tmp1, tmp1, tmp2);
-  cbnz(tmp1, DONE);
-  sub(cnt, cnt, 1);
-  cbnz(cnt, SHORT_LOOP);
-
-  // Strings are equal.
-  bind(SAME_CHARS);
+  bind(TAIL03);
+  tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
+  {
+    ldrh(tmp1, Address(post(a1, 2)));
+    ldrh(tmp2, Address(post(a2, 2)));
+    eorw(tmp1, tmp1, tmp2);
+    cbnzw(tmp1, DONE);
+  }
+  bind(TAIL01);
+  if (elem_size == 1) { // Only needed when comparing byte arrays.
+    tbz(cnt1, 0, SAME); // 0-1 bytes left.
+    {
+      ldrb(tmp1, a1);
+      ldrb(tmp2, a2);
+      eorw(tmp1, tmp1, tmp2);
+      cbnzw(tmp1, DONE);
+    }
+  }
+  // Arrays are equal.
+  bind(SAME);
   mov(result, true);
 
-  // That's it
+  // That's it.
   bind(DONE);
-
-  BLOCK_COMMENT("} string_equals");
+  BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
 }
 
 
-void MacroAssembler::byte_arrays_equals(Register ary1, Register ary2,
-                                        Register result, Register tmp1)
-{
-  Register cnt1 = rscratch1;
-  Register cnt2 = rscratch2;
-  Register tmp2 = rscratch2;
-
-  Label SAME, DIFFER, NEXT, TAIL07, TAIL03, TAIL01;
-
-  int length_offset  = arrayOopDesc::length_offset_in_bytes();
-  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_BYTE);
-
-  BLOCK_COMMENT("byte_arrays_equals  {");
-
-    // different until proven equal
-    mov(result, false);
-
-    // same array?
-    cmp(ary1, ary2);
-    br(Assembler::EQ, SAME);
-
-    // ne if either null
-    cbz(ary1, DIFFER);
-    cbz(ary2, DIFFER);
-
-    // lengths ne?
-    ldrw(cnt1, Address(ary1, length_offset));
-    ldrw(cnt2, Address(ary2, length_offset));
-    cmp(cnt1, cnt2);
-    br(Assembler::NE, DIFFER);
-
-    lea(ary1, Address(ary1, base_offset));
-    lea(ary2, Address(ary2, base_offset));
-
-    subs(cnt1, cnt1, 8);
-    br(LT, TAIL07);
-
-  BIND(NEXT);
-    ldr(tmp1, Address(post(ary1, 8)));
-    ldr(tmp2, Address(post(ary2, 8)));
-    subs(cnt1, cnt1, 8);
-    eor(tmp1, tmp1, tmp2);
-    cbnz(tmp1, DIFFER);
-    br(GE, NEXT);
-
-  BIND(TAIL07);  // 0-7 bytes left, cnt1 = #bytes left - 4
-    tst(cnt1, 0b100);
-    br(EQ, TAIL03);
-    ldrw(tmp1, Address(post(ary1, 4)));
-    ldrw(tmp2, Address(post(ary2, 4)));
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-
-  BIND(TAIL03);  // 0-3 bytes left, cnt1 = #bytes left - 4
-    tst(cnt1, 0b10);
-    br(EQ, TAIL01);
-    ldrh(tmp1, Address(post(ary1, 2)));
-    ldrh(tmp2, Address(post(ary2, 2)));
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-  BIND(TAIL01);  // 0-1 byte left
-    tst(cnt1, 0b01);
-    br(EQ, SAME);
-    ldrb(tmp1, ary1);
-    ldrb(tmp2, ary2);
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-
-  BIND(SAME);
-    mov(result, true);
-  BIND(DIFFER); // result already set
-
-  BLOCK_COMMENT("} byte_arrays_equals");
-}
-
-// Compare char[] arrays aligned to 4 bytes
-void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
-                                        Register result, Register tmp1)
-{
-  Register cnt1 = rscratch1;
-  Register cnt2 = rscratch2;
-  Register tmp2 = rscratch2;
-
-  Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
-
-  int length_offset  = arrayOopDesc::length_offset_in_bytes();
-  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
-
-  BLOCK_COMMENT("char_arrays_equals  {");
-
-    // different until proven equal
-    mov(result, false);
-
-    // same array?
-    cmp(ary1, ary2);
-    br(Assembler::EQ, SAME);
-
-    // ne if either null
-    cbz(ary1, DIFFER);
-    cbz(ary2, DIFFER);
-
-    // lengths ne?
-    ldrw(cnt1, Address(ary1, length_offset));
-    ldrw(cnt2, Address(ary2, length_offset));
-    cmp(cnt1, cnt2);
-    br(Assembler::NE, DIFFER);
-
-    lea(ary1, Address(ary1, base_offset));
-    lea(ary2, Address(ary2, base_offset));
-
-    subs(cnt1, cnt1, 4);
-    br(LT, TAIL03);
-
-  BIND(NEXT);
-    ldr(tmp1, Address(post(ary1, 8)));
-    ldr(tmp2, Address(post(ary2, 8)));
-    subs(cnt1, cnt1, 4);
-    eor(tmp1, tmp1, tmp2);
-    cbnz(tmp1, DIFFER);
-    br(GE, NEXT);
-
-  BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
-    tst(cnt1, 0b10);
-    br(EQ, TAIL01);
-    ldrw(tmp1, Address(post(ary1, 4)));
-    ldrw(tmp2, Address(post(ary2, 4)));
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-  BIND(TAIL01);  // 0-1 chars left
-    tst(cnt1, 0b01);
-    br(EQ, SAME);
-    ldrh(tmp1, ary1);
-    ldrh(tmp2, ary2);
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-
-  BIND(SAME);
-    mov(result, true);
-  BIND(DIFFER); // result already set
-
-  BLOCK_COMMENT("} char_arrays_equals");
-}
-
 // encode char[] to byte[] in ISO_8859_1
 void MacroAssembler::encode_iso_array(Register src, Register dst,
                       Register len, Register result,
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1186,13 +1186,11 @@
   void string_compare(Register str1, Register str2,
                       Register cnt1, Register cnt2, Register result,
                       Register tmp1);
-  void string_equals(Register str1, Register str2,
-                     Register cnt, Register result,
-                     Register tmp1);
-  void char_arrays_equals(Register ary1, Register ary2,
-                          Register result, Register tmp1);
-  void byte_arrays_equals(Register ary1, Register ary2,
-                          Register result, Register tmp1);
+
+  void arrays_equals(Register a1, Register a2,
+                     Register result, Register cnt1,
+                     int elem_size, bool is_string);
+
   void encode_iso_array(Register src, Register dst,
                         Register len, Register result,
                         FloatRegister Vtmp1, FloatRegister Vtmp2,
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -105,13 +105,20 @@
   inline friend NativeInstruction* nativeInstruction_at(address address);
 
   static bool is_adrp_at(address instr);
+
   static bool is_ldr_literal_at(address instr);
+
+  bool is_ldr_literal() {
+    return is_ldr_literal_at(addr_at(0));
+  }
+
   static bool is_ldrw_to_zr(address instr);
 
   static bool is_call_at(address instr) {
     const uint32_t insn = (*(uint32_t*)instr);
     return (insn >> 26) == 0b100101;
   }
+
   bool is_call() {
     return is_call_at(addr_at(0));
   }
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -163,30 +163,20 @@
     sp_after_call_off = -26,
 
     d15_off            = -26,
-    d14_off            = -25,
     d13_off            = -24,
-    d12_off            = -23,
     d11_off            = -22,
-    d10_off            = -21,
     d9_off             = -20,
-    d8_off             = -19,
 
     r28_off            = -18,
-    r27_off            = -17,
     r26_off            = -16,
-    r25_off            = -15,
     r24_off            = -14,
-    r23_off            = -13,
     r22_off            = -12,
-    r21_off            = -11,
     r20_off            = -10,
-    r19_off            =  -9,
     call_wrapper_off   =  -8,
     result_off         =  -7,
     result_type_off    =  -6,
     method_off         =  -5,
     entry_point_off    =  -4,
-    parameters_off     =  -3,
     parameter_size_off =  -2,
     thread_off         =  -1,
     fp_f               =   0,
@@ -208,30 +198,20 @@
     const Address result_type   (rfp, result_type_off    * wordSize);
     const Address method        (rfp, method_off         * wordSize);
     const Address entry_point   (rfp, entry_point_off    * wordSize);
-    const Address parameters    (rfp, parameters_off     * wordSize);
     const Address parameter_size(rfp, parameter_size_off * wordSize);
 
     const Address thread        (rfp, thread_off         * wordSize);
 
     const Address d15_save      (rfp, d15_off * wordSize);
-    const Address d14_save      (rfp, d14_off * wordSize);
     const Address d13_save      (rfp, d13_off * wordSize);
-    const Address d12_save      (rfp, d12_off * wordSize);
     const Address d11_save      (rfp, d11_off * wordSize);
-    const Address d10_save      (rfp, d10_off * wordSize);
     const Address d9_save       (rfp, d9_off * wordSize);
-    const Address d8_save       (rfp, d8_off * wordSize);
 
     const Address r28_save      (rfp, r28_off * wordSize);
-    const Address r27_save      (rfp, r27_off * wordSize);
     const Address r26_save      (rfp, r26_off * wordSize);
-    const Address r25_save      (rfp, r25_off * wordSize);
     const Address r24_save      (rfp, r24_off * wordSize);
-    const Address r23_save      (rfp, r23_off * wordSize);
     const Address r22_save      (rfp, r22_off * wordSize);
-    const Address r21_save      (rfp, r21_off * wordSize);
     const Address r20_save      (rfp, r20_off * wordSize);
-    const Address r19_save      (rfp, r19_off * wordSize);
 
     // stub code
 
@@ -254,31 +234,20 @@
     // rthread because we want to sanity check rthread later
     __ str(c_rarg7,  thread);
     __ strw(c_rarg6, parameter_size);
-    __ str(c_rarg5,  parameters);
-    __ str(c_rarg4,  entry_point);
-    __ str(c_rarg3,  method);
-    __ str(c_rarg2,  result_type);
-    __ str(c_rarg1,  result);
-    __ str(c_rarg0,  call_wrapper);
-    __ str(r19,      r19_save);
-    __ str(r20,      r20_save);
-    __ str(r21,      r21_save);
-    __ str(r22,      r22_save);
-    __ str(r23,      r23_save);
-    __ str(r24,      r24_save);
-    __ str(r25,      r25_save);
-    __ str(r26,      r26_save);
-    __ str(r27,      r27_save);
-    __ str(r28,      r28_save);
-
-    __ strd(v8,      d8_save);
-    __ strd(v9,      d9_save);
-    __ strd(v10,     d10_save);
-    __ strd(v11,     d11_save);
-    __ strd(v12,     d12_save);
-    __ strd(v13,     d13_save);
-    __ strd(v14,     d14_save);
-    __ strd(v15,     d15_save);
+    __ stp(c_rarg4, c_rarg5,  entry_point);
+    __ stp(c_rarg2, c_rarg3,  result_type);
+    __ stp(c_rarg0, c_rarg1,  call_wrapper);
+
+    __ stp(r20, r19,   r20_save);
+    __ stp(r22, r21,   r22_save);
+    __ stp(r24, r23,   r24_save);
+    __ stp(r26, r25,   r26_save);
+    __ stp(r28, r27,   r28_save);
+
+    __ stpd(v9,  v8,   d9_save);
+    __ stpd(v11, v10,  d11_save);
+    __ stpd(v13, v12,  d13_save);
+    __ stpd(v15, v14,  d15_save);
 
     // install Java thread in global register now we have saved
     // whatever value it held
@@ -385,33 +354,22 @@
 #endif
 
     // restore callee-save registers
-    __ ldrd(v15,      d15_save);
-    __ ldrd(v14,      d14_save);
-    __ ldrd(v13,      d13_save);
-    __ ldrd(v12,      d12_save);
-    __ ldrd(v11,      d11_save);
-    __ ldrd(v10,      d10_save);
-    __ ldrd(v9,       d9_save);
-    __ ldrd(v8,       d8_save);
-
-    __ ldr(r28,      r28_save);
-    __ ldr(r27,      r27_save);
-    __ ldr(r26,      r26_save);
-    __ ldr(r25,      r25_save);
-    __ ldr(r24,      r24_save);
-    __ ldr(r23,      r23_save);
-    __ ldr(r22,      r22_save);
-    __ ldr(r21,      r21_save);
-    __ ldr(r20,      r20_save);
-    __ ldr(r19,      r19_save);
-    __ ldr(c_rarg0,  call_wrapper);
-    __ ldr(c_rarg1,  result);
+    __ ldpd(v15, v14,  d15_save);
+    __ ldpd(v13, v12,  d13_save);
+    __ ldpd(v11, v10,  d11_save);
+    __ ldpd(v9,  v8,   d9_save);
+
+    __ ldp(r28, r27,   r28_save);
+    __ ldp(r26, r25,   r26_save);
+    __ ldp(r24, r23,   r24_save);
+    __ ldp(r22, r21,   r22_save);
+    __ ldp(r20, r19,   r20_save);
+
+    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
     __ ldrw(c_rarg2, result_type);
     __ ldr(c_rarg3,  method);
-    __ ldr(c_rarg4,  entry_point);
-    __ ldr(c_rarg5,  parameters);
-    __ ldr(c_rarg6,  parameter_size);
-    __ ldr(c_rarg7,  thread);
+    __ ldp(c_rarg4, c_rarg5,  entry_point);
+    __ ldp(c_rarg6, c_rarg7,  parameter_size);
 
 #ifndef PRODUCT
     // tell the simulator we are about to end Java execution
@@ -666,7 +624,7 @@
   //     count   -  element count
   //     tmp     - scratch register
   //
-  //     Destroy no registers!
+  //     Destroy no registers except rscratch1 and rscratch2
   //
   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
     BarrierSet* bs = Universe::heap()->barrier_set();
@@ -674,12 +632,13 @@
     case BarrierSet::G1SATBCTLogging:
       // With G1, don't generate the call if we statically know that the target in uninitialized
       if (!dest_uninitialized) {
-        __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
+        __ push_call_clobbered_registers();
         if (count == c_rarg0) {
           if (addr == c_rarg1) {
             // exactly backwards!!
-            __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
-            __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize));
+            __ mov(rscratch1, c_rarg0);
+            __ mov(c_rarg0, c_rarg1);
+            __ mov(c_rarg1, rscratch1);
           } else {
             __ mov(c_rarg1, count);
             __ mov(c_rarg0, addr);
@@ -689,7 +648,7 @@
           __ mov(c_rarg1, count);
         }
         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
-        __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp        }
+        __ pop_call_clobbered_registers();
         break;
       case BarrierSet::CardTableForRS:
       case BarrierSet::CardTableExtension:
@@ -719,7 +678,7 @@
       case BarrierSet::G1SATBCTLogging:
 
         {
-          __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
+          __ push_call_clobbered_registers();
           // must compute element count unless barrier set interface is changed (other platforms supply count)
           assert_different_registers(start, end, scratch);
           __ lea(scratch, Address(end, BytesPerHeapOop));
@@ -728,7 +687,7 @@
           __ mov(c_rarg0, start);
           __ mov(c_rarg1, scratch);
           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
-          __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp        }
+          __ pop_call_clobbered_registers();
         }
         break;
       case BarrierSet::CardTableForRS:
@@ -1394,10 +1353,10 @@
   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
   //
   address generate_disjoint_oop_copy(bool aligned, address *entry,
-                                     const char *name, bool dest_uninitialized = false) {
+                                     const char *name, bool dest_uninitialized) {
     const bool is_oop = true;
     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
-    return generate_disjoint_copy(size, aligned, is_oop, entry, name);
+    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
   }
 
   // Arguments:
@@ -1412,10 +1371,11 @@
   //
   address generate_conjoint_oop_copy(bool aligned,
                                      address nooverlap_target, address *entry,
-                                     const char *name, bool dest_uninitialized = false) {
+                                     const char *name, bool dest_uninitialized) {
     const bool is_oop = true;
     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
-    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
+    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
+                                  name, dest_uninitialized);
   }
 
 
@@ -1522,6 +1482,8 @@
     }
 #endif //ASSERT
 
+    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
+
     // save the original count
     __ mov(count_save, count);
 
@@ -1988,9 +1950,11 @@
       bool aligned = !UseCompressedOops;
 
       StubRoutines::_arrayof_oop_disjoint_arraycopy
-        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
+        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
+                                     /*dest_uninitialized*/false);
       StubRoutines::_arrayof_oop_arraycopy
-        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
+        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
+                                     /*dest_uninitialized*/false);
       // Aligned versions without pre-barriers
       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
--- a/src/cpu/ppc/vm/globals_ppc.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/ppc/vm/globals_ppc.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2002, 2016, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
+ * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -74,8 +74,7 @@
 
 define_pd_global(uintx, TypeProfileLevel, 111);
 
-// No performance work done here yet.
-define_pd_global(bool, CompactStrings, false);
+define_pd_global(bool, CompactStrings, true);
 
 // Platform dependent flag handling: flags only defined on this platform.
 #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint)  \
--- a/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -45,6 +45,9 @@
 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
 #include "gc/g1/heapRegion.hpp"
 #endif // INCLUDE_ALL_GCS
+#ifdef COMPILER2
+#include "opto/intrinsicnode.hpp"
+#endif
 
 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) // nothing
@@ -3168,6 +3171,553 @@
 
 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
 
+#ifdef COMPILER2
+// Intrinsics for CompactStrings
+
+// Compress char[] to byte[] by compressing 16 bytes at once.
+void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
+                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
+                                        Label& Lfailure) {
+
+  const Register tmp0 = R0;
+  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
+  Label Lloop, Lslow;
+
+  // Check if cnt >= 8 (= 16 bytes)
+  lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
+  srwi_(tmp2, cnt, 3);
+  beq(CCR0, Lslow);
+  ori(tmp1, tmp1, 0xFF);
+  rldimi(tmp1, tmp1, 32, 0);
+  mtctr(tmp2);
+
+  // 2x unrolled loop
+  bind(Lloop);
+  ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
+  ld(tmp4, 8, src);               // _4_5_6_7
+
+  orr(tmp0, tmp2, tmp4);
+  rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
+  rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
+  rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
+  rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
+
+  andc_(tmp0, tmp0, tmp1);
+  bne(CCR0, Lfailure);            // Not latin1.
+  addi(src, src, 16);
+
+  rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
+  srdi(tmp2, tmp2, 3*8);          // ____0_2_
+  rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
+  srdi(tmp4, tmp4, 3*8);          // ____4_6_
+
+  orr(tmp2, tmp2, tmp3);          // ____0123
+  orr(tmp4, tmp4, tmp5);          // ____4567
+
+  stw(tmp2, 0, dst);
+  stw(tmp4, 4, dst);
+  addi(dst, dst, 8);
+  bdnz(Lloop);
+
+  bind(Lslow);                    // Fallback to slow version
+}
+
+// Compress char[] to byte[]. cnt must be positive int.
+void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
+  Label Lloop;
+  mtctr(cnt);
+
+  bind(Lloop);
+  lhz(tmp, 0, src);
+  cmplwi(CCR0, tmp, 0xff);
+  bgt(CCR0, Lfailure);            // Not latin1.
+  addi(src, src, 2);
+  stb(tmp, 0, dst);
+  addi(dst, dst, 1);
+  bdnz(Lloop);
+}
+
+// Inflate byte[] to char[] by inflating 16 bytes at once.
+void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
+                                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
+  const Register tmp0 = R0;
+  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
+  Label Lloop, Lslow;
+
+  // Check if cnt >= 8
+  srwi_(tmp2, cnt, 3);
+  beq(CCR0, Lslow);
+  lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
+  ori(tmp1, tmp1, 0xFF);
+  mtctr(tmp2);
+
+  // 2x unrolled loop
+  bind(Lloop);
+  lwz(tmp2, 0, src);              // ____0123 (Big Endian)
+  lwz(tmp4, 4, src);              // ____4567
+  addi(src, src, 8);
+
+  rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
+  rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
+  rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
+  rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
+
+  andc(tmp0, tmp2, tmp1);         // ____0_1_
+  rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
+  andc(tmp3, tmp4, tmp1);         // ____4_5_
+  rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
+
+  rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
+  rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
+
+  std(tmp2, 0, dst);
+  std(tmp4, 8, dst);
+  addi(dst, dst, 16);
+  bdnz(Lloop);
+
+  bind(Lslow);                    // Fallback to slow version
+}
+
+// Inflate byte[] to char[]. cnt must be positive int.
+void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
+  Label Lloop;
+  mtctr(cnt);
+
+  bind(Lloop);
+  lbz(tmp, 0, src);
+  addi(src, src, 1);
+  sth(tmp, 0, dst);
+  addi(dst, dst, 2);
+  bdnz(Lloop);
+}
+
+void MacroAssembler::string_compare(Register str1, Register str2,
+                                    Register cnt1, Register cnt2,
+                                    Register tmp1, Register result, int ae) {
+  const Register tmp0 = R0,
+                 diff = tmp1;
+
+  assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
+  Label Ldone, Lslow, Lloop, Lreturn_diff;
+
+  // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
+  // we interchange str1 and str2 in the UL case and negate the result.
+  // Like this, str1 is always latin1 encoded, except for the UU case.
+  // In addition, we need 0 (or sign which is 0) extend.
+
+  if (ae == StrIntrinsicNode::UU) {
+    srwi(cnt1, cnt1, 1);
+  } else {
+    clrldi(cnt1, cnt1, 32);
+  }
+
+  if (ae != StrIntrinsicNode::LL) {
+    srwi(cnt2, cnt2, 1);
+  } else {
+    clrldi(cnt2, cnt2, 32);
+  }
+
+  // See if the lengths are different, and calculate min in cnt1.
+  // Save diff in case we need it for a tie-breaker.
+  subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
+  // if (diff > 0) { cnt1 = cnt2; }
+  if (VM_Version::has_isel()) {
+    isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
+  } else {
+    Label Lskip;
+    blt(CCR0, Lskip);
+    mr(cnt1, cnt2);
+    bind(Lskip);
+  }
+
+  // Rename registers
+  Register chr1 = result;
+  Register chr2 = tmp0;
+
+  // Compare multiple characters in fast loop (only implemented for same encoding).
+  int stride1 = 8, stride2 = 8;
+  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
+    int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
+    Label Lfastloop, Lskipfast;
+
+    srwi_(tmp0, cnt1, log2_chars_per_iter);
+    beq(CCR0, Lskipfast);
+    rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
+    li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
+    mtctr(tmp0);
+
+    bind(Lfastloop);
+    ld(chr1, 0, str1);
+    ld(chr2, 0, str2);
+    cmpd(CCR0, chr1, chr2);
+    bne(CCR0, Lslow);
+    addi(str1, str1, stride1);
+    addi(str2, str2, stride2);
+    bdnz(Lfastloop);
+    mr(cnt1, cnt2); // Remaining characters.
+    bind(Lskipfast);
+  }
+
+  // Loop which searches the first difference character by character.
+  cmpwi(CCR0, cnt1, 0);
+  beq(CCR0, Lreturn_diff);
+  bind(Lslow);
+  mtctr(cnt1);
+
+  switch (ae) {
+    case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
+    case StrIntrinsicNode::UL: // fallthru (see comment above)
+    case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
+    case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
+    default: ShouldNotReachHere(); break;
+  }
+
+  bind(Lloop);
+  if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
+  if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
+  subf_(result, chr2, chr1); // result = chr1 - chr2
+  bne(CCR0, Ldone);
+  addi(str1, str1, stride1);
+  addi(str2, str2, stride2);
+  bdnz(Lloop);
+
+  // If strings are equal up to min length, return the length difference.
+  bind(Lreturn_diff);
+  mr(result, diff);
+
+  // Otherwise, return the difference between the first mismatched chars.
+  bind(Ldone);
+  if (ae == StrIntrinsicNode::UL) {
+    neg(result, result); // Negate result (see note above).
+  }
+}
+
+void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
+                                  Register limit, Register tmp1, Register result, bool is_byte) {
+  const Register tmp0 = R0;
+  assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
+  Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
+  bool limit_needs_shift = false;
+
+  if (is_array_equ) {
+    const int length_offset = arrayOopDesc::length_offset_in_bytes();
+    const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
+
+    // Return true if the same array.
+    cmpd(CCR0, ary1, ary2);
+    beq(CCR0, Lskiploop);
+
+    // Return false if one of them is NULL.
+    cmpdi(CCR0, ary1, 0);
+    cmpdi(CCR1, ary2, 0);
+    li(result, 0);
+    cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
+    beq(CCR0, Ldone);
+
+    // Load the lengths of arrays.
+    lwz(limit, length_offset, ary1);
+    lwz(tmp0, length_offset, ary2);
+
+    // Return false if the two arrays are not equal length.
+    cmpw(CCR0, limit, tmp0);
+    bne(CCR0, Ldone);
+
+    // Load array addresses.
+    addi(ary1, ary1, base_offset);
+    addi(ary2, ary2, base_offset);
+  } else {
+    limit_needs_shift = !is_byte;
+    li(result, 0); // Assume not equal.
+  }
+
+  // Rename registers
+  Register chr1 = tmp0;
+  Register chr2 = tmp1;
+
+  // Compare 8 bytes per iteration in fast loop.
+  const int log2_chars_per_iter = is_byte ? 3 : 2;
+
+  srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
+  beq(CCR0, Lskipfast);
+  mtctr(tmp0);
+
+  bind(Lfastloop);
+  ld(chr1, 0, ary1);
+  ld(chr2, 0, ary2);
+  addi(ary1, ary1, 8);
+  addi(ary2, ary2, 8);
+  cmpd(CCR0, chr1, chr2);
+  bne(CCR0, Ldone);
+  bdnz(Lfastloop);
+
+  bind(Lskipfast);
+  rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
+  beq(CCR0, Lskiploop);
+  mtctr(limit);
+
+  // Character by character.
+  bind(Lloop);
+  if (is_byte) {
+    lbz(chr1, 0, ary1);
+    lbz(chr2, 0, ary2);
+    addi(ary1, ary1, 1);
+    addi(ary2, ary2, 1);
+  } else {
+    lhz(chr1, 0, ary1);
+    lhz(chr2, 0, ary2);
+    addi(ary1, ary1, 2);
+    addi(ary2, ary2, 2);
+  }
+  cmpw(CCR0, chr1, chr2);
+  bne(CCR0, Ldone);
+  bdnz(Lloop);
+
+  bind(Lskiploop);
+  li(result, 1); // All characters are equal.
+  bind(Ldone);
+}
+
+void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
+                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
+                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
+
+  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
+  Label L_TooShort, L_Found, L_NotFound, L_End;
+  Register last_addr = haycnt, // Kill haycnt at the beginning.
+  addr      = tmp1,
+  n_start   = tmp2,
+  ch1       = tmp3,
+  ch2       = R0;
+
+  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
+  const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
+  const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
+
+  // **************************************************************************************************
+  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
+  // **************************************************************************************************
+
+  // Compute last haystack addr to use if no match gets found.
+  clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
+  addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
+  if (needlecntval == 0) { // variable needlecnt
+   cmpwi(CCR6, needlecnt, 2);
+   clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
+   blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
+  }
+
+  if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
+
+  if (needlecntval == 0) { // variable needlecnt
+   subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
+   addi(needlecnt, needlecnt, -2);    // Rest of needle.
+  } else { // constant needlecnt
+  guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
+  assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
+   addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
+   if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
+  }
+
+  if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
+
+  if (ae ==StrIntrinsicNode::UL) {
+   srwi(tmp4, n_start, 1*8);          // ___0
+   rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
+  }
+
+  add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
+
+  // Main Loop (now we have at least 2 characters).
+  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
+  bind(L_OuterLoop); // Search for 1st 2 characters.
+  Register addr_diff = tmp4;
+   subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
+   addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
+   srdi_(ch2, addr_diff, h_csize);
+   beq(CCR0, L_FinalCheck);           // 2 characters left?
+   mtctr(ch2);                        // num of characters / 2
+  bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
+   if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
+    lwz(ch1, 0, addr);
+    lwz(ch2, 2, addr);
+   } else {
+    lhz(ch1, 0, addr);
+    lhz(ch2, 1, addr);
+   }
+   cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
+   cmpw(CCR1, ch2, n_start);
+   beq(CCR0, L_Comp1);                // Did we find the needle start?
+   beq(CCR1, L_Comp2);
+   addi(addr, addr, 2 * h_csize);
+   bdnz(L_InnerLoop);
+  bind(L_FinalCheck);
+   andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
+   beq(CCR0, L_NotFound);
+   if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
+   cmpw(CCR1, ch1, n_start);
+   beq(CCR1, L_Comp1);
+  bind(L_NotFound);
+   li(result, -1);                    // not found
+   b(L_End);
+
+   // **************************************************************************************************
+   // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
+   // **************************************************************************************************
+  if (needlecntval == 0) {           // We have to handle these cases separately.
+  Label L_OneCharLoop;
+  bind(L_TooShort);
+   mtctr(haycnt);
+   if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
+  bind(L_OneCharLoop);
+   if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
+   cmpw(CCR1, ch1, n_start);
+   beq(CCR1, L_Found);               // Did we find the one character needle?
+   bdnz(L_OneCharLoop);
+   li(result, -1);                   // Not found.
+   b(L_End);
+  }
+
+  // **************************************************************************************************
+  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
+  // **************************************************************************************************
+
+  // Compare the rest
+  bind(L_Comp2);
+   addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
+  bind(L_Comp1);                     // Addr points to possible needle start.
+  if (needlecntval != 2) {           // Const needlecnt==2?
+   if (needlecntval != 3) {
+    if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
+    Register n_ind = tmp4,
+             h_ind = n_ind;
+    li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
+    mtctr(needlecnt);                // Decremented by 2, still > 0.
+   Label L_CompLoop;
+   bind(L_CompLoop);
+    if (ae ==StrIntrinsicNode::UL) {
+      h_ind = ch1;
+      sldi(h_ind, n_ind, 1);
+    }
+    if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
+    if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
+    cmpw(CCR1, ch1, ch2);
+    bne(CCR1, L_OuterLoop);
+    addi(n_ind, n_ind, n_csize);
+    bdnz(L_CompLoop);
+   } else { // No loop required if there's only one needle character left.
+    if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
+    if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
+    cmpw(CCR1, ch1, ch2);
+    bne(CCR1, L_OuterLoop);
+   }
+  }
+  // Return index ...
+  bind(L_Found);
+   subf(result, haystack, addr);     // relative to haystack, ...
+   if (h_csize == 2) { srdi(result, result, 1); } // in characters.
+  bind(L_End);
+} // string_indexof
+
+void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
+                                         Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
+  assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
+
+  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
+  Register addr = tmp1,
+           ch1 = tmp2,
+           ch2 = R0;
+
+  const int h_csize = is_byte ? 1 : 2;
+
+//4:
+   srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
+   mr(addr, haystack);
+   beq(CCR0, L_FinalCheck);
+   mtctr(tmp2);              // Move to count register.
+//8:
+  bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
+   if (!is_byte) {
+    lhz(ch1, 0, addr);
+    lhz(ch2, 2, addr);
+   } else {
+    lbz(ch1, 0, addr);
+    lbz(ch2, 1, addr);
+   }
+   (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
+   (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
+   beq(CCR0, L_Found1);      // Did we find the needle?
+   beq(CCR1, L_Found2);
+   addi(addr, addr, 2 * h_csize);
+   bdnz(L_InnerLoop);
+//16:
+  bind(L_FinalCheck);
+   andi_(R0, haycnt, 1);
+   beq(CCR0, L_NotFound);
+   if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
+   (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
+   beq(CCR1, L_Found1);
+//21:
+  bind(L_NotFound);
+   li(result, -1);           // Not found.
+   b(L_End);
+
+  bind(L_Found2);
+   addi(addr, addr, h_csize);
+//24:
+  bind(L_Found1);            // Return index ...
+   subf(result, haystack, addr); // relative to haystack, ...
+   if (!is_byte) { srdi(result, result, 1); } // in characters.
+  bind(L_End);
+} // string_indexof_char
+
+
+void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
+                                   Register tmp1, Register tmp2) {
+  const Register tmp0 = R0;
+  assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
+  Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
+
+  // Check if cnt >= 8 (= 16 bytes)
+  lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
+  srwi_(tmp2, cnt, 4);
+  li(result, 1);                  // Assume there's a negative byte.
+  beq(CCR0, Lslow);
+  ori(tmp1, tmp1, 0x8080);
+  rldimi(tmp1, tmp1, 32, 0);
+  mtctr(tmp2);
+
+  // 2x unrolled loop
+  bind(Lfastloop);
+  ld(tmp2, 0, src);
+  ld(tmp0, 8, src);
+
+  orr(tmp0, tmp2, tmp0);
+
+  and_(tmp0, tmp0, tmp1);
+  bne(CCR0, Ldone);               // Found negative byte.
+  addi(src, src, 16);
+
+  bdnz(Lfastloop);
+
+  bind(Lslow);                    // Fallback to slow version
+  rldicl_(tmp0, cnt, 0, 64-4);
+  beq(CCR0, Lnoneg);
+  mtctr(tmp0);
+  bind(Lloop);
+  lbz(tmp0, 0, src);
+  addi(src, src, 1);
+  andi_(tmp0, tmp0, 0x80);
+  bne(CCR0, Ldone);               // Found negative byte.
+  bdnz(Lloop);
+  bind(Lnoneg);
+  li(result, 0);
+
+  bind(Ldone);
+}
+
+
+// Intrinsics for non-CompactStrings
+
 // Search for a single jchar in an jchar[].
 //
 // Assumes that result differs from all other registers.
@@ -3613,6 +4163,8 @@
   bind(Ldone_false);
 }
 
+#endif // Compiler2
+
 // Helpers for Intrinsic Emitters
 //
 // Revert the byte order of a 32bit value in a register
--- a/src/cpu/ppc/vm/macroAssembler_ppc.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/ppc/vm/macroAssembler_ppc.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
+ * Copyright (c) 2002, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -679,6 +679,39 @@
 
   void clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp = R0);
 
+#ifdef COMPILER2
+  // Intrinsics for CompactStrings
+  // Compress char[] to byte[] by compressing 16 bytes at once.
+  void string_compress_16(Register src, Register dst, Register cnt,
+                          Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
+                          Label& Lfailure);
+
+  // Compress char[] to byte[]. cnt must be positive int.
+  void string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure);
+
+  // Inflate byte[] to char[] by inflating 16 bytes at once.
+  void string_inflate_16(Register src, Register dst, Register cnt,
+                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
+
+  // Inflate byte[] to char[]. cnt must be positive int.
+  void string_inflate(Register src, Register dst, Register cnt, Register tmp);
+
+  void string_compare(Register str1, Register str2, Register cnt1, Register cnt2,
+                      Register tmp1, Register result, int ae);
+
+  void array_equals(bool is_array_equ, Register ary1, Register ary2,
+                    Register limit, Register tmp1, Register result, bool is_byte);
+
+  void string_indexof(Register result, Register haystack, Register haycnt,
+                      Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
+                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae);
+
+  void string_indexof_char(Register result, Register haystack, Register haycnt,
+                           Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte);
+
+  void has_negatives(Register src, Register cnt, Register result, Register tmp1, Register tmp2);
+
+  // Intrinsics for non-CompactStrings
   // Needle of length 1.
   void string_indexof_1(Register result, Register haystack, Register haycnt,
                         Register needle, jchar needleChar,
@@ -694,6 +727,7 @@
                           Register tmp5_reg);
   void char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
                              Register tmp1_reg, Register tmp2_reg);
+#endif
 
   // Emitters for BigInteger.multiplyToLen intrinsic.
   inline void multiply64(Register dest_hi, Register dest_lo,
--- a/src/cpu/ppc/vm/ppc.ad	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/ppc/vm/ppc.ad	Thu Mar 10 09:50:18 2016 -0800
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
-// Copyright (c) 2012, 2015 SAP SE. All rights reserved.
+// Copyright (c) 2012, 2016 SAP SE. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -2024,13 +2024,13 @@
     return (UsePopCountInstruction && VM_Version::has_popcntw());
 
   case Op_StrComp:
-    return SpecialStringCompareTo && !CompactStrings;
+    return SpecialStringCompareTo;
   case Op_StrEquals:
-    return SpecialStringEquals && !CompactStrings;
+    return SpecialStringEquals;
   case Op_StrIndexOf:
-    return SpecialStringIndexOf && !CompactStrings;
+    return SpecialStringIndexOf;
   case Op_StrIndexOfChar:
-    return SpecialStringIndexOf && !CompactStrings;
+    return SpecialStringIndexOf;
   }
 
   return true;  // Per default match rules are supported.
@@ -11022,6 +11022,584 @@
   ins_pipe(pipe_class_default);
 %}
 
+instruct string_compareL(rarg1RegP str1, rarg2RegP str2, rarg3RegI cnt1, rarg4RegI cnt2, iRegIdst result,
+                         iRegIdst tmp, regCTR ctr, flagsRegCR0 cr0) %{
+  predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
+  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
+  effect(TEMP_DEF result, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL ctr, KILL cr0, TEMP tmp);
+  ins_cost(300);
+  format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result \t// KILL $tmp" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ string_compare($str1$$Register, $str2$$Register,
+                      $cnt1$$Register, $cnt2$$Register,
+                      $tmp$$Register,
+                      $result$$Register, StrIntrinsicNode::LL);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct string_compareU(rarg1RegP str1, rarg2RegP str2, rarg3RegI cnt1, rarg4RegI cnt2, iRegIdst result,
+                         iRegIdst tmp, regCTR ctr, flagsRegCR0 cr0) %{
+  predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU);
+  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
+  effect(TEMP_DEF result, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL ctr, KILL cr0, TEMP tmp);
+  ins_cost(300);
+  format %{ "String Compare char[] $str1,$cnt1,$str2,$cnt2 -> $result \t// KILL $tmp" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ string_compare($str1$$Register, $str2$$Register,
+                      $cnt1$$Register, $cnt2$$Register,
+                      $tmp$$Register,
+                      $result$$Register, StrIntrinsicNode::UU);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct string_compareLU(rarg1RegP str1, rarg2RegP str2, rarg3RegI cnt1, rarg4RegI cnt2, iRegIdst result,
+                          iRegIdst tmp, regCTR ctr, flagsRegCR0 cr0) %{
+  predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU);
+  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
+  effect(TEMP_DEF result, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL ctr, KILL cr0, TEMP tmp);
+  ins_cost(300);
+  format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result \t// KILL $tmp" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ string_compare($str1$$Register, $str2$$Register,
+                      $cnt1$$Register, $cnt2$$Register,
+                      $tmp$$Register,
+                      $result$$Register, StrIntrinsicNode::LU);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct string_compareUL(rarg1RegP str1, rarg2RegP str2, rarg3RegI cnt1, rarg4RegI cnt2, iRegIdst result,
+                          iRegIdst tmp, regCTR ctr, flagsRegCR0 cr0) %{
+  predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL);
+  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
+  effect(TEMP_DEF result, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL ctr, KILL cr0, TEMP tmp);
+  ins_cost(300);
+  format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result \t// KILL $tmp" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ string_compare($str2$$Register, $str1$$Register,
+                      $cnt2$$Register, $cnt1$$Register,
+                      $tmp$$Register,
+                      $result$$Register, StrIntrinsicNode::UL);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct string_equalsL(rarg1RegP str1, rarg2RegP str2, rarg3RegI cnt, iRegIdst result,
+                        iRegIdst tmp, regCTR ctr, flagsRegCR0 cr0) %{
+  predicate(((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::LL);
+  match(Set result (StrEquals (Binary str1 str2) cnt));
+  effect(TEMP_DEF result, USE_KILL str1, USE_KILL str2, USE_KILL cnt, TEMP tmp, KILL ctr, KILL cr0);
+  ins_cost(300);
+  format %{ "String Equals byte[] $str1,$str2,$cnt -> $result \t// KILL $tmp" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ array_equals(false, $str1$$Register, $str2$$Register,
+                    $cnt$$Register, $tmp$$Register,
+                    $result$$Register, true /* byte */);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct string_equalsU(rarg1RegP str1, rarg2RegP str2, rarg3RegI cnt, iRegIdst result,
+                        iRegIdst tmp, regCTR ctr, flagsRegCR0 cr0) %{
+  predicate(((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::UU);
+  match(Set result (StrEquals (Binary str1 str2) cnt));
+  effect(TEMP_DEF result, USE_KILL str1, USE_KILL str2, USE_KILL cnt, TEMP tmp, KILL ctr, KILL cr0);
+  ins_cost(300);
+  format %{ "String Equals char[]  $str1,$str2,$cnt -> $result \t// KILL $tmp" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ array_equals(false, $str1$$Register, $str2$$Register,
+                    $cnt$$Register, $tmp$$Register,
+                    $result$$Register, false /* byte */);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct array_equalsB(rarg1RegP ary1, rarg2RegP ary2, iRegIdst result,
+                       iRegIdst tmp1, iRegIdst tmp2, regCTR ctr, flagsRegCR0 cr0, flagsRegCR0 cr1) %{
+  predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL);
+  match(Set result (AryEq ary1 ary2));
+  effect(TEMP_DEF result, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, KILL ctr, KILL cr0, KILL cr1);
+  ins_cost(300);
+  format %{ "Array Equals $ary1,$ary2 -> $result \t// KILL $tmp1,$tmp2" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ array_equals(true, $ary1$$Register, $ary2$$Register,
+                    $tmp1$$Register, $tmp2$$Register,
+                    $result$$Register, true /* byte */);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct array_equalsC(rarg1RegP ary1, rarg2RegP ary2, iRegIdst result,
+                       iRegIdst tmp1, iRegIdst tmp2, regCTR ctr, flagsRegCR0 cr0, flagsRegCR0 cr1) %{
+  predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU);
+  match(Set result (AryEq ary1 ary2));
+  effect(TEMP_DEF result, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, KILL ctr, KILL cr0, KILL cr1);
+  ins_cost(300);
+  format %{ "Array Equals $ary1,$ary2 -> $result \t// KILL $tmp1,$tmp2" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ array_equals(true, $ary1$$Register, $ary2$$Register,
+                    $tmp1$$Register, $tmp2$$Register,
+                    $result$$Register, false /* byte */);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct indexOf_imm1_char_U(iRegIdst result, iRegPsrc haystack, iRegIsrc haycnt,
+                             immP needleImm, immL offsetImm, immI_1 needlecntImm,
+                             iRegIdst tmp1, iRegIdst tmp2,
+                             flagsRegCR0 cr0, flagsRegCR1 cr1, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary (AddP needleImm offsetImm) needlecntImm)));
+  effect(TEMP tmp1, TEMP tmp2, KILL cr0, KILL cr1, KILL ctr);
+  // Required for EA: check if it is still a type_array.
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU);
+  ins_cost(150);
+
+  format %{ "String IndexOf CSCL1 $haystack[0..$haycnt], $needleImm+$offsetImm[0..$needlecntImm]"
+            "-> $result \t// KILL $haycnt, $tmp1, $tmp2, $cr0, $cr1" %}
+
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    immPOper *needleOper = (immPOper *)$needleImm;
+    const TypeOopPtr *t = needleOper->type()->isa_oopptr();
+    ciTypeArray* needle_values = t->const_oop()->as_type_array();  // Pointer to live char *
+    jchar chr;
+#ifdef VM_LITTLE_ENDIAN
+    chr = (((jchar)(unsigned char)needle_values->element_value(1).as_byte()) << 8) |
+           ((jchar)(unsigned char)needle_values->element_value(0).as_byte());
+#else
+    chr = (((jchar)(unsigned char)needle_values->element_value(0).as_byte()) << 8) |
+           ((jchar)(unsigned char)needle_values->element_value(1).as_byte());
+#endif
+    __ string_indexof_char($result$$Register,
+                           $haystack$$Register, $haycnt$$Register,
+                           R0, chr,
+                           $tmp1$$Register, $tmp2$$Register, false /*is_byte*/);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_imm1_char_L(iRegIdst result, iRegPsrc haystack, iRegIsrc haycnt,
+                             immP needleImm, immL offsetImm, immI_1 needlecntImm,
+                             iRegIdst tmp1, iRegIdst tmp2,
+                             flagsRegCR0 cr0, flagsRegCR1 cr1, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary (AddP needleImm offsetImm) needlecntImm)));
+  effect(TEMP tmp1, TEMP tmp2, KILL cr0, KILL cr1, KILL ctr);
+  // Required for EA: check if it is still a type_array.
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL);
+  ins_cost(150);
+
+  format %{ "String IndexOf CSCL1 $haystack[0..$haycnt], $needleImm+$offsetImm[0..$needlecntImm]"
+            "-> $result \t// KILL $haycnt, $tmp1, $tmp2, $cr0, $cr1" %}
+
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    immPOper *needleOper = (immPOper *)$needleImm;
+    const TypeOopPtr *t = needleOper->type()->isa_oopptr();
+    ciTypeArray* needle_values = t->const_oop()->as_type_array();  // Pointer to live char *
+    jchar chr = (jchar)needle_values->element_value(0).as_byte();
+    __ string_indexof_char($result$$Register,
+                           $haystack$$Register, $haycnt$$Register,
+                           R0, chr,
+                           $tmp1$$Register, $tmp2$$Register, true /*is_byte*/);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_imm1_char_UL(iRegIdst result, iRegPsrc haystack, iRegIsrc haycnt,
+                              immP needleImm, immL offsetImm, immI_1 needlecntImm,
+                              iRegIdst tmp1, iRegIdst tmp2,
+                              flagsRegCR0 cr0, flagsRegCR1 cr1, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary (AddP needleImm offsetImm) needlecntImm)));
+  effect(TEMP tmp1, TEMP tmp2, KILL cr0, KILL cr1, KILL ctr);
+  // Required for EA: check if it is still a type_array.
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL);
+  ins_cost(150);
+
+  format %{ "String IndexOf CSCL1 $haystack[0..$haycnt], $needleImm+$offsetImm[0..$needlecntImm]"
+            "-> $result \t// KILL $haycnt, $tmp1, $tmp2, $cr0, $cr1" %}
+
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    immPOper *needleOper = (immPOper *)$needleImm;
+    const TypeOopPtr *t = needleOper->type()->isa_oopptr();
+    ciTypeArray* needle_values = t->const_oop()->as_type_array();  // Pointer to live char *
+    jchar chr = (jchar)needle_values->element_value(0).as_byte();
+    __ string_indexof_char($result$$Register,
+                           $haystack$$Register, $haycnt$$Register,
+                           R0, chr,
+                           $tmp1$$Register, $tmp2$$Register, false /*is_byte*/);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_imm1_U(iRegIdst result, iRegPsrc haystack, iRegIsrc haycnt,
+                        rscratch2RegP needle, immI_1 needlecntImm,
+                        iRegIdst tmp1, iRegIdst tmp2,
+                        flagsRegCR0 cr0, flagsRegCR1 cr1, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary needle needlecntImm)));
+  effect(USE_KILL needle, TEMP tmp1, TEMP tmp2, KILL cr0, KILL cr1, KILL ctr);
+  // Required for EA: check if it is still a type_array.
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop() &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop()->is_type_array());
+  ins_cost(180);
+
+  format %{ "String IndexOf SCL1 $haystack[0..$haycnt], $needle[0..$needlecntImm]"
+            " -> $result \t// KILL $haycnt, $needle, $tmp1, $tmp2, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    Node *ndl = in(operand_index($needle));  // The node that defines needle.
+    ciTypeArray* needle_values = ndl->bottom_type()->is_aryptr()->const_oop()->as_type_array();
+    guarantee(needle_values, "sanity");
+    jchar chr;
+#ifdef VM_LITTLE_ENDIAN
+    chr = (((jchar)(unsigned char)needle_values->element_value(1).as_byte()) << 8) |
+           ((jchar)(unsigned char)needle_values->element_value(0).as_byte());
+#else
+    chr = (((jchar)(unsigned char)needle_values->element_value(0).as_byte()) << 8) |
+           ((jchar)(unsigned char)needle_values->element_value(1).as_byte());
+#endif
+    __ string_indexof_char($result$$Register,
+                           $haystack$$Register, $haycnt$$Register,
+                           R0, chr,
+                           $tmp1$$Register, $tmp2$$Register, false /*is_byte*/);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_imm1_L(iRegIdst result, iRegPsrc haystack, iRegIsrc haycnt,
+                        rscratch2RegP needle, immI_1 needlecntImm,
+                        iRegIdst tmp1, iRegIdst tmp2,
+                        flagsRegCR0 cr0, flagsRegCR1 cr1, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary needle needlecntImm)));
+  effect(USE_KILL needle, TEMP tmp1, TEMP tmp2, KILL cr0, KILL cr1, KILL ctr);
+  // Required for EA: check if it is still a type_array.
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop() &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop()->is_type_array());
+  ins_cost(180);
+
+  format %{ "String IndexOf SCL1 $haystack[0..$haycnt], $needle[0..$needlecntImm]"
+            " -> $result \t// KILL $haycnt, $needle, $tmp1, $tmp2, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    Node *ndl = in(operand_index($needle));  // The node that defines needle.
+    ciTypeArray* needle_values = ndl->bottom_type()->is_aryptr()->const_oop()->as_type_array();
+    guarantee(needle_values, "sanity");
+    jchar chr = (jchar)needle_values->element_value(0).as_byte();
+    __ string_indexof_char($result$$Register,
+                           $haystack$$Register, $haycnt$$Register,
+                           R0, chr,
+                           $tmp1$$Register, $tmp2$$Register, true /*is_byte*/);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_imm1_UL(iRegIdst result, iRegPsrc haystack, iRegIsrc haycnt,
+                         rscratch2RegP needle, immI_1 needlecntImm,
+                         iRegIdst tmp1, iRegIdst tmp2,
+                         flagsRegCR0 cr0, flagsRegCR1 cr1, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary needle needlecntImm)));
+  effect(USE_KILL needle, TEMP tmp1, TEMP tmp2, KILL cr0, KILL cr1, KILL ctr);
+  // Required for EA: check if it is still a type_array.
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop() &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop()->is_type_array());
+  ins_cost(180);
+
+  format %{ "String IndexOf SCL1 $haystack[0..$haycnt], $needle[0..$needlecntImm]"
+            " -> $result \t// KILL $haycnt, $needle, $tmp1, $tmp2, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    Node *ndl = in(operand_index($needle));  // The node that defines needle.
+    ciTypeArray* needle_values = ndl->bottom_type()->is_aryptr()->const_oop()->as_type_array();
+    guarantee(needle_values, "sanity");
+    jchar chr = (jchar)needle_values->element_value(0).as_byte();
+    __ string_indexof_char($result$$Register,
+                           $haystack$$Register, $haycnt$$Register,
+                           R0, chr,
+                           $tmp1$$Register, $tmp2$$Register, false /*is_byte*/);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOfChar_U(iRegIdst result, iRegPsrc haystack, iRegIsrc haycnt,
+                       iRegIsrc ch, iRegIdst tmp1, iRegIdst tmp2,
+                       flagsRegCR0 cr0, flagsRegCR1 cr1, regCTR ctr) %{
+  match(Set result (StrIndexOfChar (Binary haystack haycnt) ch));
+  effect(TEMP tmp1, TEMP tmp2, KILL cr0, KILL cr1, KILL ctr);
+  predicate(CompactStrings);
+  ins_cost(180);
+
+  format %{ "String IndexOfChar $haystack[0..$haycnt], $ch"
+            " -> $result \t// KILL $haycnt, $tmp1, $tmp2, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ string_indexof_char($result$$Register,
+                           $haystack$$Register, $haycnt$$Register,
+                           $ch$$Register, 0 /* this is not used if the character is already in a register */,
+                           $tmp1$$Register, $tmp2$$Register, false /*is_byte*/);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_imm_U(iRegIdst result, iRegPsrc haystack, rscratch1RegI haycnt,
+                       iRegPsrc needle, uimmI15 needlecntImm,
+                       iRegIdst tmp1, iRegIdst tmp2, iRegIdst tmp3, iRegIdst tmp4, iRegIdst tmp5,
+                       flagsRegCR0 cr0, flagsRegCR1 cr1, flagsRegCR6 cr6, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary needle needlecntImm)));
+  effect(USE_KILL haycnt, /* better: TDEF haycnt, */ TEMP_DEF result,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr0, KILL cr1, KILL cr6, KILL ctr);
+  // Required for EA: check if it is still a type_array.
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop() &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop()->is_type_array());
+  ins_cost(250);
+
+  format %{ "String IndexOf SCL $haystack[0..$haycnt], $needle[0..$needlecntImm]"
+            " -> $result \t// KILL $haycnt, $tmp1, $tmp2, $tmp3, $tmp4, $tmp5, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    Node *ndl = in(operand_index($needle));  // The node that defines needle.
+    ciTypeArray* needle_values = ndl->bottom_type()->is_aryptr()->const_oop()->as_type_array();
+
+    __ string_indexof($result$$Register,
+                      $haystack$$Register, $haycnt$$Register,
+                      $needle$$Register, needle_values, $tmp5$$Register, $needlecntImm$$constant,
+                      $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, StrIntrinsicNode::UU);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_imm_L(iRegIdst result, iRegPsrc haystack, rscratch1RegI haycnt,
+                       iRegPsrc needle, uimmI15 needlecntImm,
+                       iRegIdst tmp1, iRegIdst tmp2, iRegIdst tmp3, iRegIdst tmp4, iRegIdst tmp5,
+                       flagsRegCR0 cr0, flagsRegCR1 cr1, flagsRegCR6 cr6, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary needle needlecntImm)));
+  effect(USE_KILL haycnt, /* better: TDEF haycnt, */ TEMP_DEF result,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr0, KILL cr1, KILL cr6, KILL ctr);
+  // Required for EA: check if it is still a type_array.
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop() &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop()->is_type_array());
+  ins_cost(250);
+
+  format %{ "String IndexOf SCL $haystack[0..$haycnt], $needle[0..$needlecntImm]"
+            " -> $result \t// KILL $haycnt, $tmp1, $tmp2, $tmp3, $tmp4, $tmp5, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    Node *ndl = in(operand_index($needle));  // The node that defines needle.
+    ciTypeArray* needle_values = ndl->bottom_type()->is_aryptr()->const_oop()->as_type_array();
+
+    __ string_indexof($result$$Register,
+                      $haystack$$Register, $haycnt$$Register,
+                      $needle$$Register, needle_values, $tmp5$$Register, $needlecntImm$$constant,
+                      $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, StrIntrinsicNode::LL);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_imm_UL(iRegIdst result, iRegPsrc haystack, rscratch1RegI haycnt,
+                        iRegPsrc needle, uimmI15 needlecntImm,
+                        iRegIdst tmp1, iRegIdst tmp2, iRegIdst tmp3, iRegIdst tmp4, iRegIdst tmp5,
+                        flagsRegCR0 cr0, flagsRegCR1 cr1, flagsRegCR6 cr6, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary needle needlecntImm)));
+  effect(USE_KILL haycnt, /* better: TDEF haycnt, */ TEMP_DEF result,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr0, KILL cr1, KILL cr6, KILL ctr);
+  // Required for EA: check if it is still a type_array.
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop() &&
+            n->in(3)->in(1)->bottom_type()->is_aryptr()->const_oop()->is_type_array());
+  ins_cost(250);
+
+  format %{ "String IndexOf SCL $haystack[0..$haycnt], $needle[0..$needlecntImm]"
+            " -> $result \t// KILL $haycnt, $tmp1, $tmp2, $tmp3, $tmp4, $tmp5, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    Node *ndl = in(operand_index($needle));  // The node that defines needle.
+    ciTypeArray* needle_values = ndl->bottom_type()->is_aryptr()->const_oop()->as_type_array();
+
+    __ string_indexof($result$$Register,
+                      $haystack$$Register, $haycnt$$Register,
+                      $needle$$Register, needle_values, $tmp5$$Register, $needlecntImm$$constant,
+                      $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, StrIntrinsicNode::UL);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_U(iRegIdst result, iRegPsrc haystack, rscratch1RegI haycnt, iRegPsrc needle, rscratch2RegI needlecnt,
+                   iRegLdst tmp1, iRegLdst tmp2, iRegLdst tmp3, iRegLdst tmp4,
+                   flagsRegCR0 cr0, flagsRegCR1 cr1, flagsRegCR6 cr6, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary needle needlecnt)));
+  effect(USE_KILL haycnt, USE_KILL needlecnt, /*better: TDEF haycnt, TDEF needlecnt,*/
+         TEMP_DEF result,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr0, KILL cr1, KILL cr6, KILL ctr);
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU);
+  ins_cost(300);
+
+  format %{ "String IndexOf $haystack[0..$haycnt], $needle[0..$needlecnt]"
+             " -> $result \t// KILL $haycnt, $needlecnt, $tmp1, $tmp2, $tmp3, $tmp4, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ string_indexof($result$$Register,
+                      $haystack$$Register, $haycnt$$Register,
+                      $needle$$Register, NULL, $needlecnt$$Register, 0,  // needlecnt not constant.
+                      $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, StrIntrinsicNode::UU);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_L(iRegIdst result, iRegPsrc haystack, rscratch1RegI haycnt, iRegPsrc needle, rscratch2RegI needlecnt,
+                   iRegLdst tmp1, iRegLdst tmp2, iRegLdst tmp3, iRegLdst tmp4,
+                   flagsRegCR0 cr0, flagsRegCR1 cr1, flagsRegCR6 cr6, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary needle needlecnt)));
+  effect(USE_KILL haycnt, USE_KILL needlecnt, /*better: TDEF haycnt, TDEF needlecnt,*/
+         TEMP_DEF result,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr0, KILL cr1, KILL cr6, KILL ctr);
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL);
+  ins_cost(300);
+
+  format %{ "String IndexOf $haystack[0..$haycnt], $needle[0..$needlecnt]"
+             " -> $result \t// KILL $haycnt, $needlecnt, $tmp1, $tmp2, $tmp3, $tmp4, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ string_indexof($result$$Register,
+                      $haystack$$Register, $haycnt$$Register,
+                      $needle$$Register, NULL, $needlecnt$$Register, 0,  // needlecnt not constant.
+                      $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, StrIntrinsicNode::LL);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+instruct indexOf_UL(iRegIdst result, iRegPsrc haystack, rscratch1RegI haycnt, iRegPsrc needle, rscratch2RegI needlecnt,
+                    iRegLdst tmp1, iRegLdst tmp2, iRegLdst tmp3, iRegLdst tmp4,
+                    flagsRegCR0 cr0, flagsRegCR1 cr1, flagsRegCR6 cr6, regCTR ctr) %{
+  match(Set result (StrIndexOf (Binary haystack haycnt) (Binary needle needlecnt)));
+  effect(USE_KILL haycnt, USE_KILL needlecnt, /*better: TDEF haycnt, TDEF needlecnt,*/
+         TEMP_DEF result,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr0, KILL cr1, KILL cr6, KILL ctr);
+  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL);
+  ins_cost(300);
+
+  format %{ "String IndexOf $haystack[0..$haycnt], $needle[0..$needlecnt]"
+             " -> $result \t// KILL $haycnt, $needlecnt, $tmp1, $tmp2, $tmp3, $tmp4, $cr0, $cr1" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ string_indexof($result$$Register,
+                      $haystack$$Register, $haycnt$$Register,
+                      $needle$$Register, NULL, $needlecnt$$Register, 0,  // needlecnt not constant.
+                      $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, StrIntrinsicNode::UL);
+  %}
+  ins_pipe(pipe_class_compare);
+%}
+
+// char[] to byte[] compression
+instruct string_compress(rarg1RegP src, rarg2RegP dst, iRegIsrc len, iRegIdst result, iRegLdst tmp1,
+                         iRegLdst tmp2, iRegLdst tmp3, iRegLdst tmp4, iRegLdst tmp5, regCTR ctr, flagsRegCR0 cr0) %{
+  match(Set result (StrCompressedCopy src (Binary dst len)));
+  effect(TEMP_DEF result, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5,
+         USE_KILL src, USE_KILL dst, KILL ctr, KILL cr0);
+  ins_cost(300);
+  format %{ "String Compress $src,$dst,$len -> $result \t// KILL $tmp1, $tmp2, $tmp3, $tmp4, $tmp5" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    Label Lskip, Ldone;
+    __ li($result$$Register, 0);
+    __ string_compress_16($src$$Register, $dst$$Register, $len$$Register, $tmp1$$Register,
+                          $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, $tmp5$$Register, Ldone);
+    __ rldicl_($tmp1$$Register, $len$$Register, 0, 64-3); // Remaining characters.
+    __ beq(CCR0, Lskip);
+    __ string_compress($src$$Register, $dst$$Register, $tmp1$$Register, $tmp2$$Register, Ldone);
+    __ bind(Lskip);
+    __ mr($result$$Register, $len$$Register);
+    __ bind(Ldone);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// byte[] to char[] inflation
+instruct string_inflate(Universe dummy, rarg1RegP src, rarg2RegP dst, iRegIsrc len, iRegLdst tmp1,
+                        iRegLdst tmp2, iRegLdst tmp3, iRegLdst tmp4, iRegLdst tmp5, regCTR ctr, flagsRegCR0 cr0) %{
+  match(Set dummy (StrInflatedCopy src (Binary dst len)));
+  effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, USE_KILL src, USE_KILL dst, KILL ctr, KILL cr0);
+  ins_cost(300);
+  format %{ "String Inflate $src,$dst,$len \t// KILL $tmp1, $tmp2, $tmp3, $tmp4, $tmp5" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    Label Ldone;
+    __ string_inflate_16($src$$Register, $dst$$Register, $len$$Register, $tmp1$$Register,
+                         $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, $tmp5$$Register);
+    __ rldicl_($tmp1$$Register, $len$$Register, 0, 64-3); // Remaining characters.
+    __ beq(CCR0, Ldone);
+    __ string_inflate($src$$Register, $dst$$Register, $tmp1$$Register, $tmp2$$Register);
+    __ bind(Ldone);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// StringCoding.java intrinsics
+instruct has_negatives(rarg1RegP ary1, iRegIsrc len, iRegIdst result, iRegLdst tmp1, iRegLdst tmp2,
+                       regCTR ctr, flagsRegCR0 cr0)
+%{
+  match(Set result (HasNegatives ary1 len));
+  effect(TEMP_DEF result, USE_KILL ary1, TEMP tmp1, TEMP tmp2, KILL ctr, KILL cr0);
+  ins_cost(300);
+  format %{ "has negatives byte[] $ary1,$len -> $result \t// KILL $tmp1, $tmp2" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ has_negatives($ary1$$Register, $len$$Register, $result$$Register,
+                     $tmp1$$Register, $tmp2$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// encode char[] to byte[] in ISO_8859_1
+instruct encode_iso_array(rarg1RegP src, rarg2RegP dst, iRegIsrc len, iRegIdst result, iRegLdst tmp1,
+                          iRegLdst tmp2, iRegLdst tmp3, iRegLdst tmp4, iRegLdst tmp5, regCTR ctr, flagsRegCR0 cr0) %{
+  match(Set result (EncodeISOArray src (Binary dst len)));
+  effect(TEMP_DEF result, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5,
+         USE_KILL src, USE_KILL dst, KILL ctr, KILL cr0);
+  ins_cost(300);
+  format %{ "Encode array $src,$dst,$len -> $result \t// KILL $tmp1, $tmp2, $tmp3, $tmp4, $tmp5" %}
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    Label Lslow, Lfailure1, Lfailure2, Ldone;
+    __ string_compress_16($src$$Register, $dst$$Register, $len$$Register, $tmp1$$Register,
+                          $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, $tmp5$$Register, Lfailure1);
+    __ rldicl_($result$$Register, $len$$Register, 0, 64-3); // Remaining characters.
+    __ beq(CCR0, Ldone);
+    __ bind(Lslow);
+    __ string_compress($src$$Register, $dst$$Register, $result$$Register, $tmp2$$Register, Lfailure2);
+    __ li($result$$Register, 0);
+    __ b(Ldone);
+
+    __ bind(Lfailure1);
+    __ mr($result$$Register, $len$$Register);
+    __ mfctr($tmp1$$Register);
+    __ rldimi_($result$$Register, $tmp1$$Register, 3, 0); // Remaining characters.
+    __ beq(CCR0, Ldone);
+    __ b(Lslow);
+
+    __ bind(Lfailure2);
+    __ mfctr($result$$Register); // Remaining characters.
+
+    __ bind(Ldone);
+    __ subf($result$$Register, $result$$Register, $len$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+
 // String_IndexOf for needle of length 1.
 //
 // Match needle into immediate operands: no loadConP node needed. Saves one
@@ -11060,11 +11638,11 @@
     if (java_lang_String::has_coder_field()) {
       // New compact strings byte array strings
 #ifdef VM_LITTLE_ENDIAN
-      chr = (((jchar)needle_values->element_value(1).as_byte()) << 8) |
-              (jchar)needle_values->element_value(0).as_byte();
+    chr = (((jchar)(unsigned char)needle_values->element_value(1).as_byte()) << 8) |
+           ((jchar)(unsigned char)needle_values->element_value(0).as_byte());
 #else
-      chr = (((jchar)needle_values->element_value(0).as_byte()) << 8) |
-              (jchar)needle_values->element_value(1).as_byte();
+    chr = (((jchar)(unsigned char)needle_values->element_value(0).as_byte()) << 8) |
+           ((jchar)(unsigned char)needle_values->element_value(1).as_byte());
 #endif
     } else {
       // Old char array strings
@@ -11115,11 +11693,11 @@
     if (java_lang_String::has_coder_field()) {
       // New compact strings byte array strings
 #ifdef VM_LITTLE_ENDIAN
-      chr = (((jchar)needle_values->element_value(1).as_byte()) << 8) |
-              (jchar)needle_values->element_value(0).as_byte();
+    chr = (((jchar)(unsigned char)needle_values->element_value(1).as_byte()) << 8) |
+           ((jchar)(unsigned char)needle_values->element_value(0).as_byte());
 #else
-      chr = (((jchar)needle_values->element_value(0).as_byte()) << 8) |
-              (jchar)needle_values->element_value(1).as_byte();
+    chr = (((jchar)(unsigned char)needle_values->element_value(0).as_byte()) << 8) |
+           ((jchar)(unsigned char)needle_values->element_value(1).as_byte());
 #endif
     } else {
       // Old char array strings
@@ -11321,6 +11899,20 @@
   %}
 %}
 
+instruct minI_reg_reg_isel(iRegIdst dst, iRegIsrc src1, iRegIsrc src2, flagsRegCR0 cr0) %{
+  match(Set dst (MinI src1 src2));
+  effect(KILL cr0);
+  predicate(VM_Version::has_isel());
+  ins_cost(DEFAULT_COST*2);
+
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ cmpw(CCR0, $src1$$Register, $src2$$Register);
+    __ isel($dst$$Register, CCR0, Assembler::less, /*invert*/false, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 instruct maxI_reg_reg_Ex(iRegIdst dst, iRegIsrc src1, iRegIsrc src2) %{
   match(Set dst (MaxI src1 src2));
   ins_cost(DEFAULT_COST*6);
@@ -11341,6 +11933,20 @@
   %}
 %}
 
+instruct maxI_reg_reg_isel(iRegIdst dst, iRegIsrc src1, iRegIsrc src2, flagsRegCR0 cr0) %{
+  match(Set dst (MaxI src1 src2));
+  effect(KILL cr0);
+  predicate(VM_Version::has_isel());
+  ins_cost(DEFAULT_COST*2);
+
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    __ cmpw(CCR0, $src1$$Register, $src2$$Register);
+    __ isel($dst$$Register, CCR0, Assembler::greater, /*invert*/false, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 //---------- Population Count Instructions ------------------------------------
 
 // Popcnt for Power7.
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -2609,9 +2609,7 @@
    *   R5_ARG3    - int   length (of buffer)
    *
    * scratch:
-   *   R6_ARG4    - crc table address
-   *   R7_ARG5    - tmp1
-   *   R8_ARG6    - tmp2
+   *   R2, R6-R12
    *
    * Ouput:
    *   R3_RET     - int   crc result
@@ -2623,22 +2621,25 @@
     address start = __ function_entry();  // Remember stub start address (is rtn value).
 
     // arguments to kernel_crc32:
-    Register       crc     = R3_ARG1;  // Current checksum, preset by caller or result from previous call.
-    Register       data    = R4_ARG2;  // source byte array
-    Register       dataLen = R5_ARG3;  // #bytes to process
-    Register       table   = R6_ARG4;  // crc table address
-
-    Register       t0      = R9;       // work reg for kernel* emitters
-    Register       t1      = R10;      // work reg for kernel* emitters
-    Register       t2      = R11;      // work reg for kernel* emitters
-    Register       t3      = R12;      // work reg for kernel* emitters
+    const Register crc     = R3_ARG1;  // Current checksum, preset by caller or result from previous call.
+    const Register data    = R4_ARG2;  // source byte array
+    const Register dataLen = R5_ARG3;  // #bytes to process
+    const Register table   = R6_ARG4;  // crc table address
+
+    const Register t0      = R2;
+    const Register t1      = R7;
+    const Register t2      = R8;
+    const Register t3      = R9;
+    const Register tc0     = R10;
+    const Register tc1     = R11;
+    const Register tc2     = R12;
 
     BLOCK_COMMENT("Stub body {");
     assert_different_registers(crc, data, dataLen, table);
 
     StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
 
-    __ kernel_crc32_1byte(crc, data, dataLen, table, t0, t1, t2, t3);
+    __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
 
     BLOCK_COMMENT("return");
     __ mr_if_needed(R3_RET, crc);      // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -53,7 +53,7 @@
 
   // If PowerArchitecturePPC64 hasn't been specified explicitly determine from features.
   if (FLAG_IS_DEFAULT(PowerArchitecturePPC64)) {
-    if (VM_Version::has_tcheck() && VM_Version::has_lqarx()) {
+    if (VM_Version::has_lqarx()) {
       FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 8);
     } else if (VM_Version::has_popcntw()) {
       FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 7);
@@ -68,8 +68,7 @@
 
   bool PowerArchitecturePPC64_ok = false;
   switch (PowerArchitecturePPC64) {
-    case 8: if (!VM_Version::has_tcheck() ) break;
-            if (!VM_Version::has_lqarx()  ) break;
+    case 8: if (!VM_Version::has_lqarx()  ) break;
     case 7: if (!VM_Version::has_popcntw()) break;
     case 6: if (!VM_Version::has_cmpb()   ) break;
     case 5: if (!VM_Version::has_popcntb()) break;
@@ -80,7 +79,7 @@
             UINTX_FORMAT " on this machine", PowerArchitecturePPC64);
 
   // Power 8: Configure Data Stream Control Register.
-  if (PowerArchitecturePPC64 >= 8) {
+  if (has_mfdscr()) {
     config_dscr();
   }
 
@@ -112,7 +111,7 @@
   // Create and print feature-string.
   char buf[(num_features+1) * 16]; // Max 16 chars per feature.
   jio_snprintf(buf, sizeof(buf),
-               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s",
+               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s",
                (has_fsqrt()   ? " fsqrt"   : ""),
                (has_isel()    ? " isel"    : ""),
                (has_lxarxeh() ? " lxarxeh" : ""),
@@ -125,7 +124,8 @@
                (has_lqarx()   ? " lqarx"   : ""),
                (has_vcipher() ? " vcipher" : ""),
                (has_vpmsumb() ? " vpmsumb" : ""),
-               (has_tcheck()  ? " tcheck"  : "")
+               (has_tcheck()  ? " tcheck"  : ""),
+               (has_mfdscr()  ? " mfdscr"  : "")
                // Make sure number of %s matches num_features!
               );
   _features_string = os::strdup(buf);
@@ -610,6 +610,7 @@
   a->vcipher(VR0, VR1, VR2);                   // code[10] -> vcipher
   a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
   a->tcheck(0);                                // code[12] -> tcheck
+  a->mfdscr(R0);                               // code[13] -> mfdscr
   a->blr();
 
   // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@@ -657,6 +658,7 @@
   if (code[feature_cntr++]) features |= vcipher_m;
   if (code[feature_cntr++]) features |= vpmsumb_m;
   if (code[feature_cntr++]) features |= tcheck_m;
+  if (code[feature_cntr++]) features |= mfdscr_m;
 
   // Print the detection code.
   if (PrintAssembly) {
@@ -670,8 +672,6 @@
 
 // Power 8: Configure Data Stream Control Register.
 void VM_Version::config_dscr() {
-  assert(has_tcheck(), "Only execute on Power 8 or later!");
-
   // 7 InstWords for each call (function descriptor + blr instruction).
   const int code_size = (2+2*7)*BytesPerInstWord;
 
--- a/src/cpu/ppc/vm/vm_version_ppc.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/ppc/vm/vm_version_ppc.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -45,6 +45,7 @@
     vcipher,
     vpmsumb,
     tcheck,
+    mfdscr,
     num_features // last entry to count features
   };
   enum Feature_Flag_Set {
@@ -62,6 +63,7 @@
     vcipher_m             = (1 << vcipher),
     vpmsumb_m             = (1 << vpmsumb),
     tcheck_m              = (1 << tcheck ),
+    mfdscr_m              = (1 << mfdscr ),
     all_features_m        = (unsigned long)-1
   };
 
@@ -94,6 +96,7 @@
   static bool has_vcipher() { return (_features & vcipher_m) != 0; }
   static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
   static bool has_tcheck()  { return (_features & tcheck_m) != 0; }
+  static bool has_mfdscr()  { return (_features & mfdscr_m) != 0; }
 
   // Assembler testing
   static void allow_all();
--- a/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1349,9 +1349,12 @@
     }
   } else if (dst.first()->is_stack()) {
     // reg to stack
-    __ st_ptr(src.first()->as_Register(), SP, reg2offset(dst.first()) + STACK_BIAS);
+    // Some compilers (gcc) expect a clean 32 bit value on function entry
+    __ signx(src.first()->as_Register(), L5);
+    __ st_ptr(L5, SP, reg2offset(dst.first()) + STACK_BIAS);
   } else {
-    __ mov(src.first()->as_Register(), dst.first()->as_Register());
+    // Some compilers (gcc) expect a clean 32 bit value on function entry
+    __ signx(src.first()->as_Register(), dst.first()->as_Register());
   }
 }
 
--- a/src/cpu/sparc/vm/sparc.ad	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/sparc/vm/sparc.ad	Thu Mar 10 09:50:18 2016 -0800
@@ -948,28 +948,28 @@
   }
 #endif
 
-  uint instr;
-  instr = (Assembler::ldst_op << 30)
-        | (dst_enc        << 25)
-        | (primary        << 19)
-        | (src1_enc       << 14);
+  uint instr = (Assembler::ldst_op << 30)
+             | (dst_enc        << 25)
+             | (primary        << 19)
+             | (src1_enc       << 14);
 
   uint index = src2_enc;
   int disp = disp32;
 
   if (src1_enc == R_SP_enc || src1_enc == R_FP_enc) {
     disp += STACK_BIAS;
-    // Quick fix for JDK-8029668: check that stack offset fits, bailout if not
+    // Check that stack offset fits, load into O7 if not
     if (!Assembler::is_simm13(disp)) {
-      ra->C->record_method_not_compilable("unable to handle large constant offsets");
-      return;
+      MacroAssembler _masm(&cbuf);
+      __ set(disp, O7);
+      if (index != R_G0_enc) {
+        __ add(O7, reg_to_register_object(index), O7);
+      }
+      index = R_O7_enc;
+      disp = 0;
     }
   }
 
-  // We should have a compiler bailout here rather than a guarantee.
-  // Better yet would be some mechanism to handle variable-size matches correctly.
-  guarantee(Assembler::is_simm13(disp), "Do not match large constant offsets" );
-
   if( disp == 0 ) {
     // use reg-reg form
     // bit 13 is already zero
@@ -983,7 +983,7 @@
   cbuf.insts()->emit_int32(instr);
 
 #ifdef ASSERT
-  {
+  if (VerifyOops) {
     MacroAssembler _masm(&cbuf);
     if (is_verified_oop_base) {
       __ verify_oop(reg_to_register_object(src1_enc));
@@ -1342,7 +1342,7 @@
 // Figure out which register class each belongs in: rc_int, rc_float, rc_stack
 enum RC { rc_bad, rc_int, rc_float, rc_stack };
 static enum RC rc_class( OptoReg::Name reg ) {
-  if( !OptoReg::is_valid(reg)  ) return rc_bad;
+  if (!OptoReg::is_valid(reg)) return rc_bad;
   if (OptoReg::is_stack(reg)) return rc_stack;
   VMReg r = OptoReg::as_VMReg(reg);
   if (r->is_Register()) return rc_int;
@@ -1350,66 +1350,79 @@
   return rc_float;
 }
 
-static int impl_helper(const MachNode* mach, CodeBuffer* cbuf, PhaseRegAlloc* ra, bool do_size, bool is_load, int offset, int reg, int opcode, const char *op_str, int size, outputStream* st ) {
+#ifndef PRODUCT
+ATTRIBUTE_PRINTF(2, 3)
+static void print_helper(outputStream* st, const char* format, ...) {
+  if (st->position() > 0) {
+    st->cr();
+    st->sp();
+  }
+  va_list ap;
+  va_start(ap, format);
+  st->vprint(format, ap);
+  va_end(ap);
+}
+#endif // !PRODUCT
+
+static void impl_helper(const MachNode* mach, CodeBuffer* cbuf, PhaseRegAlloc* ra, bool is_load, int offset, int reg, int opcode, const char *op_str, outputStream* st) {
   if (cbuf) {
     emit_form3_mem_reg(*cbuf, ra, mach, opcode, -1, R_SP_enc, offset, 0, Matcher::_regEncode[reg]);
   }
 #ifndef PRODUCT
-  else if (!do_size) {
-    if (size != 0) st->print("\n\t");
-    if (is_load) st->print("%s   [R_SP + #%d],R_%s\t! spill",op_str,offset,OptoReg::regname(reg));
-    else         st->print("%s   R_%s,[R_SP + #%d]\t! spill",op_str,OptoReg::regname(reg),offset);
+  else {
+    if (is_load) {
+      print_helper(st, "%s   [R_SP + #%d],R_%s\t! spill", op_str, offset, OptoReg::regname(reg));
+    } else {
+      print_helper(st, "%s   R_%s,[R_SP + #%d]\t! spill", op_str, OptoReg::regname(reg), offset);
+    }
   }
 #endif
-  return size+4;
 }
 
-static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int op1, int op2, const char *op_str, int size, outputStream* st ) {
-  if( cbuf ) emit3( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst], op1, 0, op2, Matcher::_regEncode[src] );
+static void impl_mov_helper(CodeBuffer *cbuf, int src, int dst, int op1, int op2, const char *op_str, outputStream* st) {
+  if (cbuf) {
+    emit3(*cbuf, Assembler::arith_op, Matcher::_regEncode[dst], op1, 0, op2, Matcher::_regEncode[src]);
+  }
 #ifndef PRODUCT
-  else if( !do_size ) {
-    if( size != 0 ) st->print("\n\t");
-    st->print("%s  R_%s,R_%s\t! spill",op_str,OptoReg::regname(src),OptoReg::regname(dst));
+  else {
+    print_helper(st, "%s  R_%s,R_%s\t! spill", op_str, OptoReg::regname(src), OptoReg::regname(dst));
   }
 #endif
-  return size+4;
 }
 
-uint MachSpillCopyNode::implementation( CodeBuffer *cbuf,
-                                        PhaseRegAlloc *ra_,
-                                        bool do_size,
-                                        outputStream* st ) const {
+static void mach_spill_copy_implementation_helper(const MachNode* mach,
+                                                  CodeBuffer *cbuf,
+                                                  PhaseRegAlloc *ra_,
+                                                  outputStream* st) {
   // Get registers to move
-  OptoReg::Name src_second = ra_->get_reg_second(in(1));
-  OptoReg::Name src_first = ra_->get_reg_first(in(1));
-  OptoReg::Name dst_second = ra_->get_reg_second(this );
-  OptoReg::Name dst_first = ra_->get_reg_first(this );
+  OptoReg::Name src_second = ra_->get_reg_second(mach->in(1));
+  OptoReg::Name src_first  = ra_->get_reg_first(mach->in(1));
+  OptoReg::Name dst_second = ra_->get_reg_second(mach);
+  OptoReg::Name dst_first  = ra_->get_reg_first(mach);
 
   enum RC src_second_rc = rc_class(src_second);
-  enum RC src_first_rc = rc_class(src_first);
+  enum RC src_first_rc  = rc_class(src_first);
   enum RC dst_second_rc = rc_class(dst_second);
-  enum RC dst_first_rc = rc_class(dst_first);
-
-  assert( OptoReg::is_valid(src_first) && OptoReg::is_valid(dst_first), "must move at least 1 register" );
-
-  // Generate spill code!
-  int size = 0;
-
-  if( src_first == dst_first && src_second == dst_second )
-    return size;            // Self copy, no move
+  enum RC dst_first_rc  = rc_class(dst_first);
+
+  assert(OptoReg::is_valid(src_first) && OptoReg::is_valid(dst_first), "must move at least 1 register");
+
+  if (src_first == dst_first && src_second == dst_second) {
+    return; // Self copy, no move
+  }
 
   // --------------------------------------
   // Check for mem-mem move.  Load into unused float registers and fall into
   // the float-store case.
-  if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
+  if (src_first_rc == rc_stack && dst_first_rc == rc_stack) {
     int offset = ra_->reg2offset(src_first);
     // Further check for aligned-adjacent pair, so we can use a double load
-    if( (src_first&1)==0 && src_first+1 == src_second ) {
+    if ((src_first&1) == 0 && src_first+1 == src_second) {
       src_second    = OptoReg::Name(R_F31_num);
       src_second_rc = rc_float;
-      size = impl_helper(this,cbuf,ra_,do_size,true,offset,R_F30_num,Assembler::lddf_op3,"LDDF",size, st);
+      impl_helper(mach, cbuf, ra_, true, offset, R_F30_num, Assembler::lddf_op3, "LDDF", st);
     } else {
-      size = impl_helper(this,cbuf,ra_,do_size,true,offset,R_F30_num,Assembler::ldf_op3 ,"LDF ",size, st);
+      impl_helper(mach, cbuf, ra_, true, offset, R_F30_num, Assembler::ldf_op3, "LDF ", st);
     }
     src_first    = OptoReg::Name(R_F30_num);
     src_first_rc = rc_float;
@@ -1417,7 +1430,7 @@
 
   if( src_second_rc == rc_stack && dst_second_rc == rc_stack ) {
     int offset = ra_->reg2offset(src_second);
-    size = impl_helper(this,cbuf,ra_,do_size,true,offset,R_F31_num,Assembler::ldf_op3,"LDF ",size, st);
+    impl_helper(mach, cbuf, ra_, true, offset, R_F31_num, Assembler::ldf_op3, "LDF ", st);
     src_second    = OptoReg::Name(R_F31_num);
     src_second_rc = rc_float;
   }
@@ -1427,36 +1440,38 @@
   if (src_first_rc == rc_float && dst_first_rc == rc_int && UseVIS < 3) {
     int offset = frame::register_save_words*wordSize;
     if (cbuf) {
-      emit3_simm13( *cbuf, Assembler::arith_op, R_SP_enc, Assembler::sub_op3, R_SP_enc, 16 );
-      impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stf_op3 ,"STF ",size, st);
-      impl_helper(this,cbuf,ra_,do_size,true ,offset,dst_first,Assembler::lduw_op3,"LDUW",size, st);
-      emit3_simm13( *cbuf, Assembler::arith_op, R_SP_enc, Assembler::add_op3, R_SP_enc, 16 );
+      emit3_simm13(*cbuf, Assembler::arith_op, R_SP_enc, Assembler::sub_op3, R_SP_enc, 16);
+      impl_helper(mach, cbuf, ra_, false, offset, src_first,  Assembler::stf_op3, "STF ", st);
+      impl_helper(mach, cbuf, ra_,  true, offset, dst_first, Assembler::lduw_op3, "LDUW", st);
+      emit3_simm13(*cbuf, Assembler::arith_op, R_SP_enc, Assembler::add_op3, R_SP_enc, 16);
     }
 #ifndef PRODUCT
-    else if (!do_size) {
-      if (size != 0) st->print("\n\t");
-      st->print(  "SUB    R_SP,16,R_SP\n");
-      impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stf_op3 ,"STF ",size, st);
-      impl_helper(this,cbuf,ra_,do_size,true ,offset,dst_first,Assembler::lduw_op3,"LDUW",size, st);
-      st->print("\tADD    R_SP,16,R_SP\n");
+    else {
+      print_helper(st, "SUB    R_SP,16,R_SP");
+      impl_helper(mach, cbuf, ra_, false, offset, src_first,  Assembler::stf_op3, "STF ", st);
+      impl_helper(mach, cbuf, ra_,  true, offset, dst_first, Assembler::lduw_op3, "LDUW", st);
+      print_helper(st, "ADD    R_SP,16,R_SP");
     }
 #endif
-    size += 16;
   }
 
   // Check for float->int copy on T4
   if (src_first_rc == rc_float && dst_first_rc == rc_int && UseVIS >= 3) {
     // Further check for aligned-adjacent pair, so we can use a double move
-    if ((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second)
-      return impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::mftoi_op3,Assembler::mdtox_opf,"MOVDTOX",size, st);
-    size  =  impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::mftoi_op3,Assembler::mstouw_opf,"MOVSTOUW",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_mov_helper(cbuf, src_first, dst_first, Assembler::mftoi_op3, Assembler::mdtox_opf, "MOVDTOX", st);
+      return;
+    }
+    impl_mov_helper(cbuf, src_first, dst_first, Assembler::mftoi_op3, Assembler::mstouw_opf, "MOVSTOUW", st);
   }
   // Check for int->float copy on T4
   if (src_first_rc == rc_int && dst_first_rc == rc_float && UseVIS >= 3) {
     // Further check for aligned-adjacent pair, so we can use a double move
-    if ((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second)
-      return impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::mftoi_op3,Assembler::mxtod_opf,"MOVXTOD",size, st);
-    size  =  impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::mftoi_op3,Assembler::mwtos_opf,"MOVWTOS",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_mov_helper(cbuf, src_first, dst_first, Assembler::mftoi_op3, Assembler::mxtod_opf, "MOVXTOD", st);
+      return;
+    }
+    impl_mov_helper(cbuf, src_first, dst_first, Assembler::mftoi_op3, Assembler::mwtos_opf, "MOVWTOS", st);
   }
 
   // --------------------------------------
@@ -1466,10 +1481,10 @@
   // there.  Misaligned sources only come from native-long-returns (handled
   // special below).
 #ifndef _LP64
-  if( src_first_rc == rc_int &&     // source is already big-endian
+  if (src_first_rc == rc_int &&     // source is already big-endian
       src_second_rc != rc_bad &&    // 64-bit move
-      ((dst_first&1)!=0 || dst_second != dst_first+1) ) { // misaligned dst
-    assert( (src_first&1)==0 && src_second == src_first+1, "source must be aligned" );
+      ((dst_first & 1) != 0 || dst_second != dst_first + 1)) { // misaligned dst
+    assert((src_first & 1) == 0 && src_second == src_first + 1, "source must be aligned");
     // Do the big-endian flop.
     OptoReg::Name tmp    = dst_first   ; dst_first    = dst_second   ; dst_second    = tmp   ;
     enum RC       tmp_rc = dst_first_rc; dst_first_rc = dst_second_rc; dst_second_rc = tmp_rc;
@@ -1478,30 +1493,28 @@
 
   // --------------------------------------
   // Check for integer reg-reg copy
-  if( src_first_rc == rc_int && dst_first_rc == rc_int ) {
+  if (src_first_rc == rc_int && dst_first_rc == rc_int) {
 #ifndef _LP64
-    if( src_first == R_O0_num && src_second == R_O1_num ) {  // Check for the evil O0/O1 native long-return case
+    if (src_first == R_O0_num && src_second == R_O1_num) {  // Check for the evil O0/O1 native long-return case
       // Note: The _first and _second suffixes refer to the addresses of the the 2 halves of the 64-bit value
       //       as stored in memory.  On a big-endian machine like SPARC, this means that the _second
       //       operand contains the least significant word of the 64-bit value and vice versa.
       OptoReg::Name tmp = OptoReg::Name(R_O7_num);
-      assert( (dst_first&1)==0 && dst_second == dst_first+1, "return a native O0/O1 long to an aligned-adjacent 64-bit reg" );
+      assert((dst_first & 1) == 0 && dst_second == dst_first + 1, "return a native O0/O1 long to an aligned-adjacent 64-bit reg" );
       // Shift O0 left in-place, zero-extend O1, then OR them into the dst
-      if( cbuf ) {
-        emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[tmp], Assembler::sllx_op3, Matcher::_regEncode[src_first], 0x1020 );
-        emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[src_second], Assembler::srl_op3, Matcher::_regEncode[src_second], 0x0000 );
-        emit3       ( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst_first], Assembler:: or_op3, Matcher::_regEncode[tmp], 0, Matcher::_regEncode[src_second] );
+      if ( cbuf ) {
+        emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[tmp], Assembler::sllx_op3, Matcher::_regEncode[src_first], 0x1020);
+        emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[src_second], Assembler::srl_op3, Matcher::_regEncode[src_second], 0x0000);
+        emit3       (*cbuf, Assembler::arith_op, Matcher::_regEncode[dst_first], Assembler:: or_op3, Matcher::_regEncode[tmp], 0, Matcher::_regEncode[src_second]);
 #ifndef PRODUCT
-      } else if( !do_size ) {
-        if( size != 0 ) st->print("\n\t");
-        st->print("SLLX   R_%s,32,R_%s\t! Move O0-first to O7-high\n\t", OptoReg::regname(src_first), OptoReg::regname(tmp));
-        st->print("SRL    R_%s, 0,R_%s\t! Zero-extend O1\n\t", OptoReg::regname(src_second), OptoReg::regname(src_second));
-        st->print("OR     R_%s,R_%s,R_%s\t! spill",OptoReg::regname(tmp), OptoReg::regname(src_second), OptoReg::regname(dst_first));
+      } else {
+        print_helper(st, "SLLX   R_%s,32,R_%s\t! Move O0-first to O7-high\n\t", OptoReg::regname(src_first), OptoReg::regname(tmp));
+        print_helper(st, "SRL    R_%s, 0,R_%s\t! Zero-extend O1\n\t", OptoReg::regname(src_second), OptoReg::regname(src_second));
+        print_helper(st, "OR     R_%s,R_%s,R_%s\t! spill",OptoReg::regname(tmp), OptoReg::regname(src_second), OptoReg::regname(dst_first));
 #endif
       }
-      return size+12;
-    }
-    else if( dst_first == R_I0_num && dst_second == R_I1_num ) {
+      return;
+    } else if (dst_first == R_I0_num && dst_second == R_I1_num) {
       // returning a long value in I0/I1
       // a SpillCopy must be able to target a return instruction's reg_class
       // Note: The _first and _second suffixes refer to the addresses of the the 2 halves of the 64-bit value
@@ -1511,27 +1524,25 @@
 
       if (src_first == dst_first) {
         tdest = OptoReg::Name(R_O7_num);
-        size += 4;
       }
 
-      if( cbuf ) {
-        assert( (src_first&1) == 0 && (src_first+1) == src_second, "return value was in an aligned-adjacent 64-bit reg");
+      if (cbuf) {
+        assert((src_first & 1) == 0 && (src_first + 1) == src_second, "return value was in an aligned-adjacent 64-bit reg");
         // Shift value in upper 32-bits of src to lower 32-bits of I0; move lower 32-bits to I1
         // ShrL_reg_imm6
-        emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[tdest], Assembler::srlx_op3, Matcher::_regEncode[src_second], 32 | 0x1000 );
+        emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[tdest], Assembler::srlx_op3, Matcher::_regEncode[src_second], 32 | 0x1000);
         // ShrR_reg_imm6  src, 0, dst
-        emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst_second], Assembler::srl_op3, Matcher::_regEncode[src_first], 0x0000 );
+        emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[dst_second], Assembler::srl_op3, Matcher::_regEncode[src_first], 0x0000);
         if (tdest != dst_first) {
-          emit3     ( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst_first], Assembler::or_op3, 0/*G0*/, 0/*op2*/, Matcher::_regEncode[tdest] );
+          emit3     (*cbuf, Assembler::arith_op, Matcher::_regEncode[dst_first], Assembler::or_op3, 0/*G0*/, 0/*op2*/, Matcher::_regEncode[tdest]);
         }
       }
 #ifndef PRODUCT
-      else if( !do_size ) {
-        if( size != 0 ) st->print("\n\t");  // %%%%% !!!!!
-        st->print("SRLX   R_%s,32,R_%s\t! Extract MSW\n\t",OptoReg::regname(src_second),OptoReg::regname(tdest));
-        st->print("SRL    R_%s, 0,R_%s\t! Extract LSW\n\t",OptoReg::regname(src_first),OptoReg::regname(dst_second));
+      else {
+        print_helper(st, "SRLX   R_%s,32,R_%s\t! Extract MSW\n\t",OptoReg::regname(src_second),OptoReg::regname(tdest));
+        print_helper(st, "SRL    R_%s, 0,R_%s\t! Extract LSW\n\t",OptoReg::regname(src_first),OptoReg::regname(dst_second));
         if (tdest != dst_first) {
-          st->print("MOV    R_%s,R_%s\t! spill\n\t", OptoReg::regname(tdest), OptoReg::regname(dst_first));
+          print_helper(st, "MOV    R_%s,R_%s\t! spill\n\t", OptoReg::regname(tdest), OptoReg::regname(dst_first));
         }
       }
 #endif // PRODUCT
@@ -1539,65 +1550,77 @@
     }
 #endif // !_LP64
     // Else normal reg-reg copy
-    assert( src_second != dst_first, "smashed second before evacuating it" );
-    size = impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::or_op3,0,"MOV  ",size, st);
-    assert( (src_first&1) == 0 && (dst_first&1) == 0, "never move second-halves of int registers" );
+    assert(src_second != dst_first, "smashed second before evacuating it");
+    impl_mov_helper(cbuf, src_first, dst_first, Assembler::or_op3, 0, "MOV  ", st);
+    assert((src_first & 1) == 0 && (dst_first & 1) == 0, "never move second-halves of int registers");
     // This moves an aligned adjacent pair.
     // See if we are done.
-    if( src_first+1 == src_second && dst_first+1 == dst_second )
-      return size;
+    if (src_first + 1 == src_second && dst_first + 1 == dst_second) {
+      return;
+    }
   }
 
   // Check for integer store
-  if( src_first_rc == rc_int && dst_first_rc == rc_stack ) {
+  if (src_first_rc == rc_int && dst_first_rc == rc_stack) {
     int offset = ra_->reg2offset(dst_first);
     // Further check for aligned-adjacent pair, so we can use a double store
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stx_op3,"STX ",size, st);
-    size  =  impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stw_op3,"STW ",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_helper(mach, cbuf, ra_, false, offset, src_first, Assembler::stx_op3, "STX ", st);
+      return;
+    }
+    impl_helper(mach, cbuf, ra_, false, offset, src_first, Assembler::stw_op3, "STW ", st);
   }
 
   // Check for integer load
-  if( dst_first_rc == rc_int && src_first_rc == rc_stack ) {
+  if (dst_first_rc == rc_int && src_first_rc == rc_stack) {
     int offset = ra_->reg2offset(src_first);
     // Further check for aligned-adjacent pair, so we can use a double load
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_helper(this,cbuf,ra_,do_size,true,offset,dst_first,Assembler::ldx_op3 ,"LDX ",size, st);
-    size  =  impl_helper(this,cbuf,ra_,do_size,true,offset,dst_first,Assembler::lduw_op3,"LDUW",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_helper(mach, cbuf, ra_, true, offset, dst_first, Assembler::ldx_op3, "LDX ", st);
+      return;
+    }
+    impl_helper(mach, cbuf, ra_, true, offset, dst_first, Assembler::lduw_op3, "LDUW", st);
   }
 
   // Check for float reg-reg copy
-  if( src_first_rc == rc_float && dst_first_rc == rc_float ) {
+  if (src_first_rc == rc_float && dst_first_rc == rc_float) {
     // Further check for aligned-adjacent pair, so we can use a double move
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::fpop1_op3,Assembler::fmovd_opf,"FMOVD",size, st);
-    size  =  impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::fpop1_op3,Assembler::fmovs_opf,"FMOVS",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_mov_helper(cbuf, src_first, dst_first, Assembler::fpop1_op3, Assembler::fmovd_opf, "FMOVD", st);
+      return;
+    }
+    impl_mov_helper(cbuf, src_first, dst_first, Assembler::fpop1_op3, Assembler::fmovs_opf, "FMOVS", st);
   }
 
   // Check for float store
-  if( src_first_rc == rc_float && dst_first_rc == rc_stack ) {
+  if (src_first_rc == rc_float && dst_first_rc == rc_stack) {
     int offset = ra_->reg2offset(dst_first);
     // Further check for aligned-adjacent pair, so we can use a double store
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stdf_op3,"STDF",size, st);
-    size  =  impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stf_op3 ,"STF ",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_helper(mach, cbuf, ra_, false, offset, src_first, Assembler::stdf_op3, "STDF", st);
+      return;
+    }
+    impl_helper(mach, cbuf, ra_, false, offset, src_first, Assembler::stf_op3, "STF ", st);
   }
 
   // Check for float load
-  if( dst_first_rc == rc_float && src_first_rc == rc_stack ) {
+  if (dst_first_rc == rc_float && src_first_rc == rc_stack) {
     int offset = ra_->reg2offset(src_first);
     // Further check for aligned-adjacent pair, so we can use a double load
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_helper(this,cbuf,ra_,do_size,true,offset,dst_first,Assembler::lddf_op3,"LDDF",size, st);
-    size  =  impl_helper(this,cbuf,ra_,do_size,true,offset,dst_first,Assembler::ldf_op3 ,"LDF ",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_helper(mach, cbuf, ra_, true, offset, dst_first, Assembler::lddf_op3, "LDDF", st);
+      return;
+    }
+    impl_helper(mach, cbuf, ra_, true, offset, dst_first, Assembler::ldf_op3, "LDF ", st);
   }
 
   // --------------------------------------------------------------------
   // Check for hi bits still needing moving.  Only happens for misaligned
   // arguments to native calls.
-  if( src_second == dst_second )
-    return size;               // Self copy; no move
-  assert( src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad" );
+  if (src_second == dst_second) {
+    return; // Self copy; no move
+  }
+  assert(src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad");
 
 #ifndef _LP64
   // In the LP64 build, all registers can be moved as aligned/adjacent
@@ -1609,52 +1632,57 @@
   // 32-bits of a 64-bit register, but are needed in low bits of another
   // register (else it's a hi-bits-to-hi-bits copy which should have
   // happened already as part of a 64-bit move)
-  if( src_second_rc == rc_int && dst_second_rc == rc_int ) {
-    assert( (src_second&1)==1, "its the evil O0/O1 native return case" );
-    assert( (dst_second&1)==0, "should have moved with 1 64-bit move" );
+  if (src_second_rc == rc_int && dst_second_rc == rc_int) {
+    assert((src_second & 1) == 1, "its the evil O0/O1 native return case");
+    assert((dst_second & 1) == 0, "should have moved with 1 64-bit move");
     // Shift src_second down to dst_second's low bits.
-    if( cbuf ) {
-      emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst_second], Assembler::srlx_op3, Matcher::_regEncode[src_second-1], 0x1020 );
+    if (cbuf) {
+      emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[dst_second], Assembler::srlx_op3, Matcher::_regEncode[src_second-1], 0x1020);
 #ifndef PRODUCT
-    } else if( !do_size ) {
-      if( size != 0 ) st->print("\n\t");
-      st->print("SRLX   R_%s,32,R_%s\t! spill: Move high bits down low",OptoReg::regname(src_second-1),OptoReg::regname(dst_second));
+    } else  {
+      print_helper(st, "SRLX   R_%s,32,R_%s\t! spill: Move high bits down low", OptoReg::regname(src_second - 1), OptoReg::regname(dst_second));
 #endif
     }
-    return size+4;
+    return;
   }
 
   // Check for high word integer store.  Must down-shift the hi bits
   // into a temp register, then fall into the case of storing int bits.
-  if( src_second_rc == rc_int && dst_second_rc == rc_stack && (src_second&1)==1 ) {
+  if (src_second_rc == rc_int && dst_second_rc == rc_stack && (src_second & 1) == 1) {
     // Shift src_second down to dst_second's low bits.
-    if( cbuf ) {
-      emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[R_O7_num], Assembler::srlx_op3, Matcher::_regEncode[src_second-1], 0x1020 );
+    if (cbuf) {
+      emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[R_O7_num], Assembler::srlx_op3, Matcher::_regEncode[src_second-1], 0x1020);
 #ifndef PRODUCT
-    } else if( !do_size ) {
-      if( size != 0 ) st->print("\n\t");
-      st->print("SRLX   R_%s,32,R_%s\t! spill: Move high bits down low",OptoReg::regname(src_second-1),OptoReg::regname(R_O7_num));
+    } else {
+      print_helper(st, "SRLX   R_%s,32,R_%s\t! spill: Move high bits down low", OptoReg::regname(src_second-1), OptoReg::regname(R_O7_num));
 #endif
     }
-    size+=4;
     src_second = OptoReg::Name(R_O7_num); // Not R_O7H_num!
   }
 
   // Check for high word integer load
-  if( dst_second_rc == rc_int && src_second_rc == rc_stack )
-    return impl_helper(this,cbuf,ra_,do_size,true ,ra_->reg2offset(src_second),dst_second,Assembler::lduw_op3,"LDUW",size, st);
+  if (dst_second_rc == rc_int && src_second_rc == rc_stack)
+    return impl_helper(this, cbuf, ra_, true, ra_->reg2offset(src_second), dst_second, Assembler::lduw_op3, "LDUW", size, st);
 
   // Check for high word integer store
-  if( src_second_rc == rc_int && dst_second_rc == rc_stack )
-    return impl_helper(this,cbuf,ra_,do_size,false,ra_->reg2offset(dst_second),src_second,Assembler::stw_op3 ,"STW ",size, st);
+  if (src_second_rc == rc_int && dst_second_rc == rc_stack)
+    return impl_helper(this, cbuf, ra_, false, ra_->reg2offset(dst_second), src_second, Assembler::stw_op3, "STW ", size, st);
 
   // Check for high word float store
-  if( src_second_rc == rc_float && dst_second_rc == rc_stack )
-    return impl_helper(this,cbuf,ra_,do_size,false,ra_->reg2offset(dst_second),src_second,Assembler::stf_op3 ,"STF ",size, st);
+  if (src_second_rc == rc_float && dst_second_rc == rc_stack)
+    return impl_helper(this, cbuf, ra_, false, ra_->reg2offset(dst_second), src_second, Assembler::stf_op3, "STF ", size, st);
 
 #endif // !_LP64
 
   Unimplemented();
+}
+
+uint MachSpillCopyNode::implementation(CodeBuffer *cbuf,
+                                       PhaseRegAlloc *ra_,
+                                       bool do_size,
+                                       outputStream* st) const {
+  assert(!do_size, "not supported");
+  mach_spill_copy_implementation_helper(this, cbuf, ra_, st);
   return 0;
 }
 
@@ -1669,19 +1697,19 @@
 }
 
 uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
-  return implementation( NULL, ra_, true, NULL );
+  return MachNode::size(ra_);
 }
 
 //=============================================================================
 #ifndef PRODUCT
-void MachNopNode::format( PhaseRegAlloc *, outputStream *st ) const {
+void MachNopNode::format(PhaseRegAlloc *, outputStream *st) const {
   st->print("NOP \t# %d bytes pad for loops and calls", 4 * _count);
 }
 #endif
 
-void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc * ) const {
+void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *) const {
   MacroAssembler _masm(&cbuf);
-  for(int i = 0; i < _count; i += 1) {
+  for (int i = 0; i < _count; i += 1) {
     __ nop();
   }
 }
@@ -5197,7 +5225,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "LDF    $src,$dst\t! stkI to regF" %}
   opcode(Assembler::ldf_op3);
   ins_encode(simple_form3_mem_reg(src, dst));
@@ -5208,7 +5235,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "LDDF   $src,$dst\t! stkL to regD" %}
   opcode(Assembler::lddf_op3);
   ins_encode(simple_form3_mem_reg(src, dst));
@@ -5219,7 +5245,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STF    $src,$dst\t! regF to stkI" %}
   opcode(Assembler::stf_op3);
   ins_encode(simple_form3_mem_reg(dst, src));
@@ -5230,7 +5255,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STDF   $src,$dst\t! regD to stkL" %}
   opcode(Assembler::stdf_op3);
   ins_encode(simple_form3_mem_reg(dst, src));
@@ -5240,7 +5264,6 @@
 instruct regI_to_stkLHi(stackSlotL dst, iRegI src) %{
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST*2);
-  size(8);
   format %{ "STW    $src,$dst.hi\t! long\n\t"
             "STW    R_G0,$dst.lo" %}
   opcode(Assembler::stw_op3);
@@ -5252,7 +5275,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STX    $src,$dst\t! regL to stkD" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -5266,7 +5288,6 @@
   match(Set dst src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDUW   $src,$dst\t!stk" %}
   opcode(Assembler::lduw_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -5278,7 +5299,6 @@
   match(Set dst src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$dst\t!stk" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -5290,7 +5310,6 @@
   match(Set dst src);
 
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "LDX    $src,$dst\t! long" %}
   opcode(Assembler::ldx_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -5302,7 +5321,6 @@
   match(Set dst src);
 
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STX    $src,$dst\t! long" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -5314,7 +5332,6 @@
 instruct stkP_to_regP( iRegP dst, stackSlotP src ) %{
   match(Set dst src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "LDX    $src,$dst\t!ptr" %}
   opcode(Assembler::ldx_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -5325,7 +5342,6 @@
 instruct regP_to_stkP(stackSlotP dst, iRegP src) %{
   match(Set dst src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STX    $src,$dst\t!ptr" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -5771,7 +5787,6 @@
   match(Set dst (LoadL_unaligned mem));
   effect(KILL tmp);
   ins_cost(MEMORY_REF_COST*2+DEFAULT_COST);
-  size(16);
   format %{ "LDUW   $mem+4,R_O7\t! misaligned long\n"
           "\tLDUW   $mem  ,$dst\n"
           "\tSLLX   #32, $dst, $dst\n"
@@ -5786,7 +5801,6 @@
   match(Set dst (LoadRange mem));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDUW   $mem,$dst\t! range" %}
   opcode(Assembler::lduw_op3);
   ins_encode(simple_form3_mem_reg( mem, dst ) );
@@ -5797,7 +5811,6 @@
 instruct loadI_freg(regF dst, memory mem) %{
   match(Set dst (LoadI mem));
   ins_cost(MEMORY_REF_COST);
-  size(4);
 
   format %{ "LDF    $mem,$dst\t! for fitos/fitod" %}
   opcode(Assembler::ldf_op3);
@@ -5876,7 +5889,6 @@
   match(Set dst (LoadD mem));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDDF   $mem,$dst" %}
   opcode(Assembler::lddf_op3);
   ins_encode(simple_form3_mem_reg( mem, dst ) );
@@ -5887,7 +5899,6 @@
 instruct loadD_unaligned(regD_low dst, memory mem ) %{
   match(Set dst (LoadD_unaligned mem));
   ins_cost(MEMORY_REF_COST*2+DEFAULT_COST);
-  size(8);
   format %{ "LDF    $mem  ,$dst.hi\t! misaligned double\n"
           "\tLDF    $mem+4,$dst.lo\t!" %}
   opcode(Assembler::ldf_op3);
@@ -5900,7 +5911,6 @@
   match(Set dst (LoadF mem));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDF    $mem,$dst" %}
   opcode(Assembler::ldf_op3);
   ins_encode(simple_form3_mem_reg( mem, dst ) );
@@ -6119,7 +6129,6 @@
   predicate(AllocatePrefetchInstr == 0);
   match( PrefetchAllocation mem );
   ins_cost(MEMORY_REF_COST);
-  size(4);
 
   format %{ "PREFETCH $mem,2\t! Prefetch allocation" %}
   opcode(Assembler::prefetch_op3);
@@ -6175,7 +6184,6 @@
   match(Set mem (StoreB mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STB    $src,$mem\t! byte" %}
   opcode(Assembler::stb_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6186,7 +6194,6 @@
   match(Set mem (StoreB mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STB    $src,$mem\t! byte" %}
   opcode(Assembler::stb_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6197,7 +6204,6 @@
   match(Set mem (StoreCM mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STB    $src,$mem\t! CMS card-mark byte 0" %}
   opcode(Assembler::stb_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6209,7 +6215,6 @@
   match(Set mem (StoreC mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STH    $src,$mem\t! short" %}
   opcode(Assembler::sth_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6220,7 +6225,6 @@
   match(Set mem (StoreC mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STH    $src,$mem\t! short" %}
   opcode(Assembler::sth_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6232,7 +6236,6 @@
   match(Set mem (StoreI mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$mem" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6243,7 +6246,6 @@
 instruct storeL(memory mem, iRegL src) %{
   match(Set mem (StoreL mem src));
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STX    $src,$mem\t! long" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6254,7 +6256,6 @@
   match(Set mem (StoreI mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$mem" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6265,7 +6266,6 @@
   match(Set mem (StoreL mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STX    $src,$mem" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6277,7 +6277,6 @@
   match(Set mem (StoreI mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STF    $src,$mem\t! after fstoi/fdtoi" %}
   opcode(Assembler::stf_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6288,7 +6287,6 @@
 instruct storeP(memory dst, sp_ptr_RegP src) %{
   match(Set dst (StoreP dst src));
   ins_cost(MEMORY_REF_COST);
-  size(4);
 
 #ifndef _LP64
   format %{ "STW    $src,$dst\t! ptr" %}
@@ -6304,7 +6302,6 @@
 instruct storeP0(memory dst, immP0 src) %{
   match(Set dst (StoreP dst src));
   ins_cost(MEMORY_REF_COST);
-  size(4);
 
 #ifndef _LP64
   format %{ "STW    $src,$dst\t! ptr" %}
@@ -6379,7 +6376,6 @@
   match(Set mem (StoreD mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STDF   $src,$mem" %}
   opcode(Assembler::stdf_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6390,7 +6386,6 @@
   match(Set mem (StoreD mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STX    $src,$mem" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6402,7 +6397,6 @@
   match(Set mem (StoreF mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STF    $src,$mem" %}
   opcode(Assembler::stf_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6413,7 +6407,6 @@
   match(Set mem (StoreF mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$mem\t! storeF0" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -7068,7 +7061,6 @@
   ins_cost(MEMORY_REF_COST);
 
 #ifndef _LP64
-  size(4);
   format %{ "LDUW   $mem,$dst\t! ptr" %}
   opcode(Assembler::lduw_op3, 0, REGP_OP);
 #else
@@ -8138,7 +8130,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDUW   $src,$dst\t! MoveF2I" %}
   opcode(Assembler::lduw_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -8150,7 +8141,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDF    $src,$dst\t! MoveI2F" %}
   opcode(Assembler::ldf_op3);
   ins_encode(simple_form3_mem_reg(src, dst));
@@ -8162,7 +8152,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDX    $src,$dst\t! MoveD2L" %}
   opcode(Assembler::ldx_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -8174,7 +8163,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDDF   $src,$dst\t! MoveL2D" %}
   opcode(Assembler::lddf_op3);
   ins_encode(simple_form3_mem_reg(src, dst));
@@ -8186,7 +8174,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STF   $src,$dst\t! MoveF2I" %}
   opcode(Assembler::stf_op3);
   ins_encode(simple_form3_mem_reg(dst, src));
@@ -8198,7 +8185,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$dst\t! MoveI2F" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -8210,7 +8196,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STDF   $src,$dst\t! MoveD2L" %}
   opcode(Assembler::stdf_op3);
   ins_encode(simple_form3_mem_reg(dst, src));
@@ -8222,7 +8207,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STX    $src,$dst\t! MoveL2D" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -8427,7 +8411,6 @@
 instruct convI2D_mem(regD_low dst, memory mem) %{
   match(Set dst (ConvI2D (LoadI mem)));
   ins_cost(DEFAULT_COST + MEMORY_REF_COST);
-  size(8);
   format %{ "LDF    $mem,$dst\n\t"
             "FITOD  $dst,$dst" %}
   opcode(Assembler::ldf_op3, Assembler::fitod_opf);
@@ -8468,7 +8451,6 @@
 instruct convI2F_mem( regF dst, memory mem ) %{
   match(Set dst (ConvI2F (LoadI mem)));
   ins_cost(DEFAULT_COST + MEMORY_REF_COST);
-  size(8);
   format %{ "LDF    $mem,$dst\n\t"
             "FITOS  $dst,$dst" %}
   opcode(Assembler::ldf_op3, Assembler::fitos_opf);
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -463,3 +463,37 @@
   }
   return result;
 }
+
+
+int VM_Version::parse_features(const char* implementation) {
+  int features = unknown_m;
+  // Convert to UPPER case before compare.
+  char* impl = os::strdup_check_oom(implementation);
+
+  for (int i = 0; impl[i] != 0; i++)
+    impl[i] = (char)toupper((uint)impl[i]);
+
+  if (strstr(impl, "SPARC64") != NULL) {
+    features |= sparc64_family_m;
+  } else if (strstr(impl, "SPARC-M") != NULL) {
+    // M-series SPARC is based on T-series.
+    features |= (M_family_m | T_family_m);
+  } else if (strstr(impl, "SPARC-T") != NULL) {
+    features |= T_family_m;
+    if (strstr(impl, "SPARC-T1") != NULL) {
+      features |= T1_model_m;
+    }
+  } else {
+    if (strstr(impl, "SPARC") == NULL) {
+#ifndef PRODUCT
+      // kstat on Solaris 8 virtual machines (branded zones)
+      // returns "(unsupported)" implementation. Solaris 8 is not
+      // supported anymore, but include this check to be on the
+      // safe side.
+      warning("Can't parse CPU implementation = '%s', assume generic SPARC", impl);
+#endif
+    }
+  }
+  os::free((void*)impl);
+  return features;
+}
--- a/src/cpu/sparc/vm/vm_version_sparc.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/sparc/vm/vm_version_sparc.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -121,7 +121,7 @@
   static bool is_T1_model(int features) { return is_T_family(features) && ((features & T1_model_m) != 0); }
 
   static int maximum_niagara1_processor_count() { return 32; }
-
+  static int parse_features(const char* implementation);
 public:
   // Initialization
   static void initialize();
--- a/src/cpu/x86/vm/templateInterpreterGenerator_x86.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/x86/vm/templateInterpreterGenerator_x86.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -161,13 +161,7 @@
                                      create_klass_exception),
                rarg, rarg2);
   } else {
-    // kind of lame ExternalAddress can't take NULL because
-    // external_word_Relocation will assert.
-    if (message != NULL) {
-      __ lea(rarg2, ExternalAddress((address)message));
-    } else {
-      __ movptr(rarg2, NULL_WORD);
-    }
+    __ lea(rarg2, ExternalAddress((address)message));
     __ call_VM(rax,
                CAST_FROM_FN_PTR(address, InterpreterRuntime::create_exception),
                rarg, rarg2);
--- a/src/cpu/x86/vm/x86_32.ad	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/x86/vm/x86_32.ad	Thu Mar 10 09:50:18 2016 -0800
@@ -7236,6 +7236,7 @@
 instruct compareAndSwapL( rRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
   predicate(VM_Version::supports_cx8());
   match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
+  match(Set res (WeakCompareAndSwapL mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
             "MOV    $res,0\n\t"
@@ -7249,6 +7250,7 @@
 
 instruct compareAndSwapP( rRegI res,  pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
   match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
+  match(Set res (WeakCompareAndSwapP mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
             "MOV    $res,0\n\t"
@@ -7261,6 +7263,7 @@
 
 instruct compareAndSwapI( rRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
   match(Set res (CompareAndSwapI mem_ptr (Binary oldval newval)));
+  match(Set res (WeakCompareAndSwapI mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
             "MOV    $res,0\n\t"
@@ -7271,6 +7274,31 @@
   ins_pipe( pipe_cmpxchg );
 %}
 
+instruct compareAndExchangeL( eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
+  predicate(VM_Version::supports_cx8());
+  match(Set oldval (CompareAndExchangeL mem_ptr (Binary oldval newval)));
+  effect(KILL cr);
+  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t" %}
+  ins_encode( enc_cmpxchg8(mem_ptr) );
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct compareAndExchangeP( pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
+  match(Set oldval (CompareAndExchangeP mem_ptr (Binary oldval newval)));
+  effect(KILL cr);
+  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t" %}
+  ins_encode( enc_cmpxchg(mem_ptr) );
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct compareAndExchangeI( pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
+  match(Set oldval (CompareAndExchangeI mem_ptr (Binary oldval newval)));
+  effect(KILL cr);
+  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t" %}
+  ins_encode( enc_cmpxchg(mem_ptr) );
+  ins_pipe( pipe_cmpxchg );
+%}
+
 instruct xaddI_no_res( memory mem, Universe dummy, immI add, eFlagsReg cr) %{
   predicate(n->as_LoadStore()->result_not_used());
   match(Set dummy (GetAndAddI mem add));
--- a/src/cpu/x86/vm/x86_64.ad	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/cpu/x86/vm/x86_64.ad	Thu Mar 10 09:50:18 2016 -0800
@@ -7281,6 +7281,7 @@
 %{
   predicate(VM_Version::supports_cx8());
   match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
+  match(Set res (WeakCompareAndSwapP mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
 
   format %{ "cmpxchgq $mem_ptr,$newval\t# "
@@ -7305,6 +7306,7 @@
 %{
   predicate(VM_Version::supports_cx8());
   match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
+  match(Set res (WeakCompareAndSwapL mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
 
   format %{ "cmpxchgq $mem_ptr,$newval\t# "
@@ -7328,6 +7330,7 @@
                          rFlagsReg cr)
 %{
   match(Set res (CompareAndSwapI mem_ptr (Binary oldval newval)));
+  match(Set res (WeakCompareAndSwapI mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
 
   format %{ "cmpxchgl $mem_ptr,$newval\t# "
@@ -7351,6 +7354,7 @@
                           rax_RegN oldval, rRegN newval,
                           rFlagsReg cr) %{
   match(Set res (CompareAndSwapN mem_ptr (Binary oldval newval)));
+  match(Set res (WeakCompareAndSwapN mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
 
   format %{ "cmpxchgl $mem_ptr,$newval\t# "
@@ -7368,6 +7372,83 @@
   ins_pipe( pipe_cmpxchg );
 %}
 
+instruct compareAndExchangeI(
+                         memory mem_ptr,
+                         rax_RegI oldval, rRegI newval,
+                         rFlagsReg cr)
+%{
+  match(Set oldval (CompareAndExchangeI mem_ptr (Binary oldval newval)));
+  effect(KILL cr);
+
+  format %{ "cmpxchgl $mem_ptr,$newval\t# "
+            "If rax == $mem_ptr then store $newval into $mem_ptr\n\t"  %}
+  opcode(0x0F, 0xB1);
+  ins_encode(lock_prefix,
+             REX_reg_mem(newval, mem_ptr),
+             OpcP, OpcS,
+             reg_mem(newval, mem_ptr) // lock cmpxchg
+             );
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct compareAndExchangeL(
+                         memory mem_ptr,
+                         rax_RegL oldval, rRegL newval,
+                         rFlagsReg cr)
+%{
+  predicate(VM_Version::supports_cx8());
+  match(Set oldval (CompareAndExchangeL mem_ptr (Binary oldval newval)));
+  effect(KILL cr);
+
+  format %{ "cmpxchgq $mem_ptr,$newval\t# "
+            "If rax == $mem_ptr then store $newval into $mem_ptr\n\t"  %}
+  opcode(0x0F, 0xB1);
+  ins_encode(lock_prefix,
+             REX_reg_mem_wide(newval, mem_ptr),
+             OpcP, OpcS,
+             reg_mem(newval, mem_ptr)  // lock cmpxchg
+            );
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct compareAndExchangeN(
+                          memory mem_ptr,
+                          rax_RegN oldval, rRegN newval,
+                          rFlagsReg cr) %{
+  match(Set oldval (CompareAndExchangeN mem_ptr (Binary oldval newval)));
+  effect(KILL cr);
+
+  format %{ "cmpxchgl $mem_ptr,$newval\t# "
+            "If rax == $mem_ptr then store $newval into $mem_ptr\n\t" %}
+  opcode(0x0F, 0xB1);
+  ins_encode(lock_prefix,
+             REX_reg_mem(newval, mem_ptr),
+             OpcP, OpcS,
+             reg_mem(newval, mem_ptr)  // lock cmpxchg
+          );
+  ins_pipe( pipe_cmpxchg );
+%}
+
+instruct compareAndExchangeP(
+                         memory mem_ptr,
+                         rax_RegP oldval, rRegP newval,
+                         rFlagsReg cr)
+%{
+  predicate(VM_Version::supports_cx8());
+  match(Set oldval (CompareAndExchangeP mem_ptr (Binary oldval newval)));
+  effect(KILL cr);
+
+  format %{ "cmpxchgq $mem_ptr,$newval\t# "
+            "If rax == $mem_ptr then store $newval into $mem_ptr\n\t" %}
+  opcode(0x0F, 0xB1);
+  ins_encode(lock_prefix,
+             REX_reg_mem_wide(newval, mem_ptr),
+             OpcP, OpcS,
+             reg_mem(newval, mem_ptr)  // lock cmpxchg
+          );
+  ins_pipe( pipe_cmpxchg );
+%}
+
 instruct xaddI_no_res( memory mem, Universe dummy, immI add, rFlagsReg cr) %{
   predicate(n->as_LoadStore()->result_not_used());
   match(Set dummy (GetAndAddI mem add));
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/development/Server16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/development/Server24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/About16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/About24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Delete16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Delete24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Find16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Help16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Help24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/History16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/History24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Information16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Information24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/New16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/New24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Open16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Open24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Save24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/SaveAs16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/SaveAs24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Zoom16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/ZoomIn16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/ZoomIn24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/navigation/Down16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/navigation/Up16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignCenter16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignCenter24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignLeft16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignLeft24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignRight16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignRight24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/development/Server16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/development/Server24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/About16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/About24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Delete16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Delete24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Find16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Help16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Help24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/History16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/History24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Information16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Information24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/New16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/New24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Open16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Open24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Save24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/SaveAs16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/SaveAs24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Zoom16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/ZoomIn16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/ZoomIn24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/navigation/Down16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/navigation/Up16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignCenter16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignCenter24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignLeft16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignLeft24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignRight16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignRight24.gif has changed
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java	Thu Mar 10 09:50:18 2016 -0800
@@ -22,6 +22,7 @@
  */
 package jdk.vm.ci.amd64;
 
+import static jdk.vm.ci.code.MemoryBarriers.LOAD_LOAD;
 import static jdk.vm.ci.code.MemoryBarriers.LOAD_STORE;
 import static jdk.vm.ci.code.MemoryBarriers.STORE_STORE;
 import static jdk.vm.ci.code.Register.SPECIAL;
@@ -220,7 +221,7 @@
     private final AMD64Kind largestKind;
 
     public AMD64(EnumSet<CPUFeature> features, EnumSet<Flag> flags) {
-        super("AMD64", AMD64Kind.QWORD, ByteOrder.LITTLE_ENDIAN, true, allRegisters, LOAD_STORE | STORE_STORE, 1, 8);
+        super("AMD64", AMD64Kind.QWORD, ByteOrder.LITTLE_ENDIAN, true, allRegisters, LOAD_LOAD | LOAD_STORE | STORE_STORE, 1, 8);
         this.features = features;
         this.flags = flags;
         assert features.contains(CPUFeature.SSE2) : "minimum config for x64";
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotVMConfig.java	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotVMConfig.java	Thu Mar 10 09:50:18 2016 -0800
@@ -1141,7 +1141,7 @@
 
     @HotSpotVMField(name = "JavaFrameAnchor::_last_Java_sp", type = "intptr_t*", get = HotSpotVMField.Type.OFFSET) @Stable private int javaFrameAnchorLastJavaSpOffset;
     @HotSpotVMField(name = "JavaFrameAnchor::_last_Java_pc", type = "address", get = HotSpotVMField.Type.OFFSET) @Stable private int javaFrameAnchorLastJavaPcOffset;
-    @HotSpotVMField(name = "JavaFrameAnchor::_last_Java_fp", type = "intptr_t*", get = HotSpotVMField.Type.OFFSET, archs = {"amd64"}) @Stable private int javaFrameAnchorLastJavaFpOffset;
+    @HotSpotVMField(name = "JavaFrameAnchor::_last_Java_fp", type = "intptr_t*", get = HotSpotVMField.Type.OFFSET, archs = {"aarch64, amd64"}) @Stable private int javaFrameAnchorLastJavaFpOffset;
     @HotSpotVMField(name = "JavaFrameAnchor::_flags", type = "int", get = HotSpotVMField.Type.OFFSET, archs = {"sparc"}) @Stable private int javaFrameAnchorFlagsOffset;
 
     public int threadLastJavaSpOffset() {
@@ -1152,11 +1152,8 @@
         return javaThreadAnchorOffset + javaFrameAnchorLastJavaPcOffset;
     }
 
-    /**
-     * This value is only valid on AMD64.
-     */
     public int threadLastJavaFpOffset() {
-        // TODO add an assert for AMD64
+        assert getHostArchitectureName().equals("aarch64") || getHostArchitectureName().equals("amd64");
         return javaThreadAnchorOffset + javaFrameAnchorLastJavaFpOffset;
     }
 
--- a/src/os/aix/vm/os_aix.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/os/aix/vm/os_aix.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
+ * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,6 +36,7 @@
 #include "compiler/compileBroker.hpp"
 #include "interpreter/interpreter.hpp"
 #include "jvm_aix.h"
+#include "logging/log.hpp"
 #include "libo4.hpp"
 #include "libperfstat_aix.hpp"
 #include "libodm_aix.hpp"
@@ -791,13 +792,8 @@
   const pthread_t pthread_id = ::pthread_self();
   const tid_t kernel_thread_id = ::thread_self();
 
-  trcVerbose("newborn Thread : pthread-id %u, ktid " UINT64_FORMAT
-    ", stack %p ... %p, stacksize 0x%IX (%IB)",
-    pthread_id, kernel_thread_id,
-    thread->stack_end(),
-    thread->stack_base(),
-    thread->stack_size(),
-    thread->stack_size());
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", kernel thread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) kernel_thread_id);
 
   // Normally, pthread stacks on AIX live in the data segment (are allocated with malloc()
   // by the pthread library). In rare cases, this may not be the case, e.g. when third-party
@@ -805,7 +801,7 @@
   // guard pages on those stacks, because the stacks may reside in memory which is not
   // protectable (shmated).
   if (thread->stack_base() > ::sbrk(0)) {
-    trcVerbose("Thread " UINT64_FORMAT ": stack not in data segment.", (uint64_t) pthread_id);
+    log_warning(os, thread)("Thread stack not in data segment.");
   }
 
   // Try to randomize the cache line index of hot stack frames.
@@ -839,8 +835,8 @@
   // Call one more level start routine.
   thread->run();
 
-  trcVerbose("Thread finished : pthread-id %u, ktid " UINT64_FORMAT ".",
-    pthread_id, kernel_thread_id);
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", kernel thread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) kernel_thread_id);
 
   return 0;
 }
@@ -908,20 +904,19 @@
   pthread_t tid;
   int ret = pthread_create(&tid, &attr, (void* (*)(void*)) java_start, thread);
 
+
+  char buf[64];
+  if (ret == 0) {
+    log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
+      (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+  } else {
+    log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
+      strerror(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+  }
+
   pthread_attr_destroy(&attr);
 
-  if (ret == 0) {
-    trcVerbose("Created New Thread : pthread-id %u", tid);
-  } else {
-    if (os::Aix::on_pase()) {
-      // QIBM_MULTI_THREADED=Y is needed when the launcher is started on iSeries
-      // using QSH. Otherwise pthread_create fails with errno=11.
-      trcVerbose("(Please make sure you set the environment variable "
-              "QIBM_MULTI_THREADED=Y before running this program.)");
-    }
-    if (PrintMiscellaneous && (Verbose || WizardMode)) {
-      perror("pthread_create()");
-    }
+  if (ret != 0) {
     // Need to clean up stuff we've allocated so far
     thread->set_osthread(NULL);
     delete osthread;
@@ -958,13 +953,6 @@
   const pthread_t pthread_id = ::pthread_self();
   const tid_t kernel_thread_id = ::thread_self();
 
-  trcVerbose("attaching Thread : pthread-id %u, ktid " UINT64_FORMAT ", stack %p ... %p, stacksize 0x%IX (%IB)",
-    pthread_id, kernel_thread_id,
-    thread->stack_end(),
-    thread->stack_base(),
-    thread->stack_size(),
-    thread->stack_size());
-
   // OSThread::thread_id is the pthread id.
   osthread->set_thread_id(pthread_id);
 
@@ -990,6 +978,9 @@
   // and save the caller's signal mask
   os::Aix::hotspot_sigmask(thread);
 
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", kernel thread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) kernel_thread_id);
+
   return true;
 }
 
--- a/src/os/bsd/vm/os_bsd.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/os/bsd/vm/os_bsd.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -32,6 +32,7 @@
 #include "compiler/disassembler.hpp"
 #include "interpreter/interpreter.hpp"
 #include "jvm_bsd.h"
+#include "logging/log.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/filemap.hpp"
 #include "mutex_bsd.inline.hpp"
@@ -681,6 +682,9 @@
 
   osthread->set_thread_id(os::Bsd::gettid());
 
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
 #ifdef __APPLE__
   uint64_t unique_thread_id = locate_unique_thread_id(osthread->thread_id());
   guarantee(unique_thread_id != 0, "unique thread id was not found");
@@ -716,6 +720,9 @@
   // call one more level start routine
   thread->run();
 
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   return 0;
 }
 
@@ -776,12 +783,18 @@
     pthread_t tid;
     int ret = pthread_create(&tid, &attr, (void* (*)(void*)) java_start, thread);
 
+    char buf[64];
+    if (ret == 0) {
+      log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
+        (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+    } else {
+      log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
+        strerror(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+    }
+
     pthread_attr_destroy(&attr);
 
     if (ret != 0) {
-      if (PrintMiscellaneous && (Verbose || WizardMode)) {
-        perror("pthread_create()");
-      }
       // Need to clean up stuff we've allocated so far
       thread->set_osthread(NULL);
       delete osthread;
@@ -858,6 +871,9 @@
   // and save the caller's signal mask
   os::Bsd::hotspot_sigmask(thread);
 
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   return true;
 }
 
--- a/src/os/linux/vm/os_linux.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/os/linux/vm/os_linux.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -662,6 +662,9 @@
 
   osthread->set_thread_id(os::current_thread_id());
 
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   if (UseNUMA) {
     int lgrp_id = os::numa_get_group_id();
     if (lgrp_id != -1) {
@@ -691,6 +694,9 @@
   // call one more level start routine
   thread->run();
 
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   return 0;
 }
 
@@ -756,12 +762,18 @@
     pthread_t tid;
     int ret = pthread_create(&tid, &attr, (void* (*)(void*)) java_start, thread);
 
+    char buf[64];
+    if (ret == 0) {
+      log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
+        (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+    } else {
+      log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
+        strerror(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+    }
+
     pthread_attr_destroy(&attr);
 
     if (ret != 0) {
-      if (PrintMiscellaneous && (Verbose || WizardMode)) {
-        perror("pthread_create()");
-      }
       // Need to clean up stuff we've allocated so far
       thread->set_osthread(NULL);
       delete osthread;
@@ -858,6 +870,9 @@
   // and save the caller's signal mask
   os::Linux::hotspot_sigmask(thread);
 
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   return true;
 }
 
--- a/src/os/posix/vm/os_posix.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/os/posix/vm/os_posix.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1071,6 +1071,19 @@
 #endif
 }
 
+char* os::Posix::describe_pthread_attr(char* buf, size_t buflen, const pthread_attr_t* attr) {
+  size_t stack_size = 0;
+  size_t guard_size = 0;
+  int detachstate = 0;
+  pthread_attr_getstacksize(attr, &stack_size);
+  pthread_attr_getguardsize(attr, &guard_size);
+  pthread_attr_getdetachstate(attr, &detachstate);
+  jio_snprintf(buf, buflen, "stacksize: " SIZE_FORMAT "k, guardsize: " SIZE_FORMAT "k, %s",
+    stack_size / 1024, guard_size / 1024,
+    (detachstate == PTHREAD_CREATE_DETACHED ? "detached" : "joinable"));
+  return buf;
+}
+
 
 os::WatcherThreadCrashProtection::WatcherThreadCrashProtection() {
   assert(Thread::current()->is_Watcher_thread(), "Must be WatcherThread");
--- a/src/os/posix/vm/os_posix.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/os/posix/vm/os_posix.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -76,6 +76,11 @@
   static address ucontext_get_pc(const ucontext_t* ctx);
   // Set PC into context. Needed for continuation after signal.
   static void ucontext_set_pc(ucontext_t* ctx, address pc);
+
+  // Helper function; describes pthread attributes as short string. String is written
+  // to buf with len buflen; buf is returned.
+  static char* describe_pthread_attr(char* buf, size_t buflen, const pthread_attr_t* attr);
+
 };
 
 /*
--- a/src/os/solaris/vm/os_solaris.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/os/solaris/vm/os_solaris.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -32,6 +32,7 @@
 #include "compiler/disassembler.hpp"
 #include "interpreter/interpreter.hpp"
 #include "jvm_solaris.h"
+#include "logging/log.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/filemap.hpp"
 #include "mutex_solaris.inline.hpp"
@@ -68,6 +69,7 @@
 #include "utilities/defaultStream.hpp"
 #include "utilities/events.hpp"
 #include "utilities/growableArray.hpp"
+#include "utilities/macros.hpp"
 #include "utilities/vmError.hpp"
 
 // put OS-includes here
@@ -736,6 +738,9 @@
   osthr->set_lwp_id(_lwp_self());  // Store lwp in case we are bound
   thread->_schedctl = (void *) schedctl_init();
 
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ").",
+    os::current_thread_id());
+
   if (UseNUMA) {
     int lgrp_id = os::numa_get_group_id();
     if (lgrp_id != -1) {
@@ -781,6 +786,8 @@
     Atomic::dec(&os::Solaris::_os_thread_count);
   }
 
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ").", os::current_thread_id());
+
   if (UseDetachedThreads) {
     thr_exit(NULL);
     ShouldNotReachHere();
@@ -853,6 +860,9 @@
   // and save the caller's signal mask
   os::Solaris::hotspot_sigmask(thread);
 
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ").",
+    os::current_thread_id());
+
   return true;
 }
 
@@ -879,6 +889,24 @@
   return true;
 }
 
+// Helper function to trace thread attributes, similar to os::Posix::describe_pthread_attr()
+static char* describe_thr_create_attributes(char* buf, size_t buflen,
+                                            size_t stacksize, long flags) {
+  stringStream ss(buf, buflen);
+  ss.print("stacksize: " SIZE_FORMAT "k, ", stacksize / 1024);
+  ss.print("flags: ");
+  #define PRINT_FLAG(f) if (flags & f) ss.print( #f " ");
+  #define ALL(X) \
+    X(THR_SUSPENDED) \
+    X(THR_DETACHED) \
+    X(THR_BOUND) \
+    X(THR_NEW_LWP) \
+    X(THR_DAEMON)
+  ALL(PRINT_FLAG)
+  #undef ALL
+  #undef PRINT_FLAG
+  return buf;
+}
 
 bool os::create_thread(Thread* thread, ThreadType thr_type,
                        size_t stack_size) {
@@ -974,10 +1002,17 @@
   osthread->set_thread_id(-1);
 
   status = thr_create(NULL, stack_size, java_start, thread, flags, &tid);
+
+  char buf[64];
+  if (status == 0) {
+    log_info(os, thread)("Thread started (tid: " UINTX_FORMAT ", attributes: %s). ",
+      (uintx) tid, describe_thr_create_attributes(buf, sizeof(buf), stack_size, flags));
+  } else {
+    log_warning(os, thread)("Failed to start thread - thr_create failed (%s) for attributes: %s.",
+      strerror(status), describe_thr_create_attributes(buf, sizeof(buf), stack_size, flags));
+  }
+
   if (status != 0) {
-    if (PrintMiscellaneous && (Verbose || WizardMode)) {
-      perror("os::create_thread");
-    }
     thread->set_osthread(NULL);
     // Need to clean up stuff we've allocated so far
     delete osthread;
--- a/src/os/windows/vm/os_windows.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/os/windows/vm/os_windows.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -35,6 +35,7 @@
 #include "compiler/disassembler.hpp"
 #include "interpreter/interpreter.hpp"
 #include "jvm_windows.h"
+#include "logging/log.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/filemap.hpp"
 #include "mutex_windows.inline.hpp"
@@ -71,6 +72,7 @@
 #include "utilities/defaultStream.hpp"
 #include "utilities/events.hpp"
 #include "utilities/growableArray.hpp"
+#include "utilities/macros.hpp"
 #include "utilities/vmError.hpp"
 
 #ifdef _DEBUG
@@ -436,6 +438,8 @@
     res = 20115;    // java thread
   }
 
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ").", os::current_thread_id());
+
   // Install a win32 structured exception handler around every thread created
   // by VM, so VM can generate error dump when an exception occurred in non-
   // Java thread (e.g. VM thread).
@@ -446,6 +450,8 @@
     // Nothing to do.
   }
 
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ").", os::current_thread_id());
+
   // One less thread is executing
   // When the VMThread gets here, the main thread may have already exited
   // which frees the CodeHeap containing the Atomic::add code
@@ -509,6 +515,10 @@
   osthread->set_state(RUNNABLE);
 
   thread->set_osthread(osthread);
+
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ").",
+    os::current_thread_id());
+
   return true;
 }
 
@@ -530,6 +540,27 @@
   return true;
 }
 
+// Helper function to trace _beginthreadex attributes,
+//  similar to os::Posix::describe_pthread_attr()
+static char* describe_beginthreadex_attributes(char* buf, size_t buflen,
+                                               size_t stacksize, unsigned initflag) {
+  stringStream ss(buf, buflen);
+  if (stacksize == 0) {
+    ss.print("stacksize: default, ");
+  } else {
+    ss.print("stacksize: " SIZE_FORMAT "k, ", stacksize / 1024);
+  }
+  ss.print("flags: ");
+  #define PRINT_FLAG(f) if (initflag & f) ss.print( #f " ");
+  #define ALL(X) \
+    X(CREATE_SUSPENDED) \
+    X(STACK_SIZE_PARAM_IS_A_RESERVATION)
+  ALL(PRINT_FLAG)
+  #undef ALL
+  #undef PRINT_FLAG
+  return buf;
+}
+
 // Allocate and initialize a new OSThread
 bool os::create_thread(Thread* thread, ThreadType thr_type,
                        size_t stack_size) {
@@ -596,14 +627,24 @@
   // document because JVM uses C runtime library. The good news is that the
   // flag appears to work with _beginthredex() as well.
 
+  const unsigned initflag = CREATE_SUSPENDED | STACK_SIZE_PARAM_IS_A_RESERVATION;
   HANDLE thread_handle =
     (HANDLE)_beginthreadex(NULL,
                            (unsigned)stack_size,
                            (unsigned (__stdcall *)(void*)) java_start,
                            thread,
-                           CREATE_SUSPENDED | STACK_SIZE_PARAM_IS_A_RESERVATION,
+                           initflag,
                            &thread_id);
 
+  char buf[64];
+  if (thread_handle != NULL) {
+    log_info(os, thread)("Thread started (tid: %u, attributes: %s)",
+      thread_id, describe_beginthreadex_attributes(buf, sizeof(buf), stack_size, initflag));
+  } else {
+    log_warning(os, thread)("Failed to start thread - _beginthreadex failed (%s) for attributes: %s.",
+      strerror(errno), describe_beginthreadex_attributes(buf, sizeof(buf), stack_size, initflag));
+  }
+
   if (thread_handle == NULL) {
     // Need to clean up stuff we've allocated so far
     CloseHandle(osthread->interrupt_event());
@@ -1668,8 +1709,7 @@
     if (is_workstation) {
       st->print("10");
     } else {
-      // The server version name of Windows 10 is not known at this time
-      st->print("%d.%d", major_version, minor_version);
+      st->print("Server 2016");
     }
     break;
 
--- a/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -264,6 +264,7 @@
 
 // We need to keep these here as long as we have to build on Solaris
 // versions before 10.
+
 #ifndef SI_ARCHITECTURE_32
 #define SI_ARCHITECTURE_32      516     /* basic 32-bit SI_ARCHITECTURE */
 #endif
@@ -272,36 +273,87 @@
 #define SI_ARCHITECTURE_64      517     /* basic 64-bit SI_ARCHITECTURE */
 #endif
 
-static void do_sysinfo(int si, const char* string, int* features, int mask) {
-  char   tmp;
-  size_t bufsize = sysinfo(si, &tmp, 1);
+#ifndef SI_CPUBRAND
+#define SI_CPUBRAND             523     /* return cpu brand string */
+#endif
 
-  // All SI defines used below must be supported.
-  guarantee(bufsize != -1, "must be supported");
+class Sysinfo {
+  char* _string;
+public:
+  Sysinfo(int si) : _string(NULL) {
+    char   tmp;
+    size_t bufsize = sysinfo(si, &tmp, 1);
 
-  char* buf = (char*) os::malloc(bufsize, mtInternal);
-
-  if (buf == NULL)
-    return;
-
-  if (sysinfo(si, buf, bufsize) == bufsize) {
-    // Compare the string.
-    if (strcmp(buf, string) == 0) {
-      *features |= mask;
+    if (bufsize != -1) {
+      char* buf = (char*) os::malloc(bufsize, mtInternal);
+      if (buf != NULL) {
+        if (sysinfo(si, buf, bufsize) == bufsize) {
+          _string = buf;
+        } else {
+          os::free(buf);
+        }
+      }
     }
   }
 
-  os::free(buf);
-}
+  ~Sysinfo() {
+    if (_string != NULL) {
+      os::free(_string);
+    }
+  }
+
+  const char* value() const {
+    return _string;
+  }
+
+  bool valid() const {
+    return _string != NULL;
+  }
+
+  bool match(const char* s) const {
+    return valid() ? strcmp(_string, s) == 0 : false;
+  }
+
+  bool match_substring(const char* s) const {
+    return valid() ? strstr(_string, s) != NULL : false;
+  }
+};
+
+class Sysconf {
+  int _value;
+public:
+  Sysconf(int sc) : _value(-1) {
+    _value = sysconf(sc);
+  }
+  bool valid() const {
+    return _value != -1;
+  }
+  int value() const {
+    return _value;
+  }
+};
+
+
+#ifndef _SC_DCACHE_LINESZ
+#define _SC_DCACHE_LINESZ       508     /* Data cache line size */
+#endif
+
+#ifndef _SC_L2CACHE_LINESZ
+#define _SC_L2CACHE_LINESZ      527     /* Size of L2 cache line */
+#endif
 
 int VM_Version::platform_features(int features) {
   assert(os::Solaris::supports_getisax(), "getisax() must be available");
 
   // Check 32-bit architecture.
-  do_sysinfo(SI_ARCHITECTURE_32, "sparc", &features, v8_instructions_m);
+  if (Sysinfo(SI_ARCHITECTURE_32).match("sparc")) {
+    features |= v8_instructions_m;
+  }
 
   // Check 64-bit architecture.
-  do_sysinfo(SI_ARCHITECTURE_64, "sparcv9", &features, generic_v9_m);
+  if (Sysinfo(SI_ARCHITECTURE_64).match("sparcv9")) {
+    features |= generic_v9_m;
+  }
 
   // Extract valid instruction set extensions.
   uint_t avs[2];
@@ -388,67 +440,63 @@
   if (av & AV_SPARC_SHA512)       features |= sha512_instruction_m;
 
   // Determine the machine type.
-  do_sysinfo(SI_MACHINE, "sun4v", &features, sun4v_m);
+  if (Sysinfo(SI_MACHINE).match("sun4v")) {
+    features |= sun4v_m;
+  }
 
-  {
-    // Using kstat to determine the machine type.
+  bool use_solaris_12_api = false;
+  Sysinfo impl(SI_CPUBRAND);
+  if (impl.valid()) {
+    // If SI_CPUBRAND works, that means Solaris 12 API to get the cache line sizes
+    // is available to us as well
+    use_solaris_12_api = true;
+    features |= parse_features(impl.value());
+  } else {
+    // Otherwise use kstat to determine the machine type.
     kstat_ctl_t* kc = kstat_open();
     kstat_t* ksp = kstat_lookup(kc, (char*)"cpu_info", -1, NULL);
-    const char* implementation = "UNKNOWN";
+    const char* implementation;
+    bool has_implementation = false;
     if (ksp != NULL) {
       if (kstat_read(kc, ksp, NULL) != -1 && ksp->ks_data != NULL) {
         kstat_named_t* knm = (kstat_named_t *)ksp->ks_data;
         for (int i = 0; i < ksp->ks_ndata; i++) {
           if (strcmp((const char*)&(knm[i].name),"implementation") == 0) {
             implementation = KSTAT_NAMED_STR_PTR(&knm[i]);
+            has_implementation = true;
 #ifndef PRODUCT
             if (PrintMiscellaneous && Verbose) {
               tty->print_cr("cpu_info.implementation: %s", implementation);
             }
 #endif
-            // Convert to UPPER case before compare.
-            char* impl = os::strdup_check_oom(implementation);
-
-            for (int i = 0; impl[i] != 0; i++)
-              impl[i] = (char)toupper((uint)impl[i]);
-
-            if (strstr(impl, "SPARC64") != NULL) {
-              features |= sparc64_family_m;
-            } else if (strstr(impl, "SPARC-M") != NULL) {
-              // M-series SPARC is based on T-series.
-              features |= (M_family_m | T_family_m);
-            } else if (strstr(impl, "SPARC-T") != NULL) {
-              features |= T_family_m;
-              if (strstr(impl, "SPARC-T1") != NULL) {
-                features |= T1_model_m;
-              }
-            } else {
-              if (strstr(impl, "SPARC") == NULL) {
-#ifndef PRODUCT
-                // kstat on Solaris 8 virtual machines (branded zones)
-                // returns "(unsupported)" implementation. Solaris 8 is not
-                // supported anymore, but include this check to be on the
-                // safe side.
-                warning("kstat cpu_info implementation = '%s', assume generic SPARC", impl);
-#endif
-                implementation = "SPARC";
-              }
-            }
-            os::free((void*)impl);
+            features |= parse_features(implementation);
             break;
           }
         } // for(
       }
     }
-    assert(strcmp(implementation, "UNKNOWN") != 0,
-           "unknown cpu info (changed kstat interface?)");
+    assert(has_implementation, "unknown cpu info (changed kstat interface?)");
     kstat_close(kc);
   }
 
-  // Figure out cache line sizes using PICL
-  PICL picl((features & sparc64_family_m) != 0, (features & sun4v_m) != 0);
-  _L1_data_cache_line_size = picl.L1_data_cache_line_size();
-  _L2_data_cache_line_size = picl.L2_data_cache_line_size();
+  bool is_sun4v = (features & sun4v_m) != 0;
+  if (use_solaris_12_api && is_sun4v) {
+    // If Solaris 12 API is supported and it's sun4v use sysconf() to get the cache line sizes
+    Sysconf l1_dcache_line_size(_SC_DCACHE_LINESZ);
+    if (l1_dcache_line_size.valid()) {
+      _L1_data_cache_line_size =  l1_dcache_line_size.value();
+    }
 
+    Sysconf l2_dcache_line_size(_SC_L2CACHE_LINESZ);
+    if (l2_dcache_line_size.valid()) {
+      _L2_data_cache_line_size = l2_dcache_line_size.value();
+    }
+  } else {
+    // Otherwise figure out the cache line sizes using PICL
+    bool is_fujitsu = (features & sparc64_family_m) != 0;
+    PICL picl(is_fujitsu, is_sun4v);
+    _L1_data_cache_line_size = picl.L1_data_cache_line_size();
+    _L2_data_cache_line_size = picl.L2_data_cache_line_size();
+  }
   return features;
 }
--- a/src/os_cpu/windows_x86/vm/assembler_windows_x86.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/os_cpu/windows_x86/vm/assembler_windows_x86.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -51,11 +51,8 @@
 // movl reg, [reg + thread_ptr_offset]     Load thread
 //
 void MacroAssembler::get_thread(Register thread) {
-  // can't use ExternalAddress because it can't take NULL
-  AddressLiteral null(0, relocInfo::none);
-
   prefix(FS_segment);
-  movptr(thread, null);
+  movptr(thread, ExternalAddress(NULL));
   assert(os::win32::get_thread_ptr_offset() != 0,
          "Thread Pointer Offset has not been initialized");
   movl(thread, Address(thread, os::win32::get_thread_ptr_offset()));
--- a/src/share/vm/adlc/formssel.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/adlc/formssel.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -3491,6 +3491,8 @@
     "LoadPLocked",
     "StorePConditional", "StoreIConditional", "StoreLConditional",
     "CompareAndSwapI", "CompareAndSwapL", "CompareAndSwapP", "CompareAndSwapN",
+    "WeakCompareAndSwapI", "WeakCompareAndSwapL", "WeakCompareAndSwapP", "WeakCompareAndSwapN",
+    "CompareAndExchangeI", "CompareAndExchangeL", "CompareAndExchangeP", "CompareAndExchangeN",
     "StoreCM",
     "ClearArray",
     "GetAndAddI", "GetAndSetI", "GetAndSetP",
--- a/src/share/vm/c1/c1_CFGPrinter.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/c1/c1_CFGPrinter.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,7 +34,9 @@
 // compilation for later analysis.
 
 class CFGPrinterOutput;
-class IntervalList;
+class Interval;
+
+typedef GrowableArray<Interval*> IntervalList;
 
 class CFGPrinter : public AllStatic {
 private:
--- a/src/share/vm/c1/c1_Canonicalizer.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/c1/c1_Canonicalizer.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -222,27 +222,36 @@
 }
 
 void Canonicalizer::do_ArrayLength    (ArrayLength*     x) {
-  NewArray* array = x->array()->as_NewArray();
-  if (array != NULL && array->length() != NULL) {
-    Constant* length = array->length()->as_Constant();
-    if (length != NULL) {
-      // do not use the Constant itself, but create a new Constant
-      // with same value Otherwise a Constant is live over multiple
-      // blocks without being registered in a state array.
+  NewArray*  na;
+  Constant*  ct;
+  LoadField* lf;
+
+  if ((na = x->array()->as_NewArray()) != NULL) {
+    // New arrays might have the known length.
+    // Do not use the Constant itself, but create a new Constant
+    // with same value Otherwise a Constant is live over multiple
+    // blocks without being registered in a state array.
+    Constant* length;
+    if (na->length() != NULL &&
+        (length = na->length()->as_Constant()) != NULL) {
       assert(length->type()->as_IntConstant() != NULL, "array length must be integer");
       set_constant(length->type()->as_IntConstant()->value());
     }
-  } else {
-    LoadField* lf = x->array()->as_LoadField();
-    if (lf != NULL) {
-      ciField* field = lf->field();
-      if (field->is_constant() && field->is_static()) {
-        // final static field
-        ciObject* c = field->constant_value().as_object();
-        if (c->is_array()) {
-          ciArray* array = (ciArray*) c;
-          set_constant(array->length());
-        }
+
+  } else if ((ct = x->array()->as_Constant()) != NULL) {
+    // Constant arrays have constant lengths.
+    ArrayConstant* cnst = ct->type()->as_ArrayConstant();
+    if (cnst != NULL) {
+      set_constant(cnst->value()->length());
+    }
+
+  } else if ((lf = x->array()->as_LoadField()) != NULL) {
+    ciField* field = lf->field();
+    if (field->is_constant() && field->is_static()) {
+      assert(PatchALot || ScavengeRootsInCode < 2, "Constant field loads are folded during parsing");
+      ciObject* c = field->constant_value().as_object();
+      if (!c->is_null_object()) {
+        set_constant(c->as_array()->length());
       }
     }
   }
--- a/src/share/vm/c1/c1_Compiler.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/c1/c1_Compiler.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -228,8 +228,6 @@
   case vmIntrinsics::_getCharStringU:
   case vmIntrinsics::_putCharStringU:
 #ifdef TRACE_HAVE_INTRINSICS
-  case vmIntrinsics::_classID:
-  case vmIntrinsics::_threadID:
   case vmIntrinsics::_counterTime:
 #endif
     break;
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -43,6 +43,9 @@
 #if INCLUDE_ALL_GCS
 #include "gc/g1/heapRegion.hpp"
 #endif // INCLUDE_ALL_GCS
+#ifdef TRACE_HAVE_INTRINSICS
+#include "trace/traceMacros.hpp"
+#endif
 
 #ifdef ASSERT
 #define __ gen()->lir(__FILE__, __LINE__)->
@@ -3067,42 +3070,7 @@
   __ move(reg, result);
 }
 
-#ifdef TRACE_HAVE_INTRINSICS
-void LIRGenerator::do_ThreadIDIntrinsic(Intrinsic* x) {
-    LIR_Opr thread = getThreadPointer();
-    LIR_Opr osthread = new_pointer_register();
-    __ move(new LIR_Address(thread, in_bytes(JavaThread::osthread_offset()), osthread->type()), osthread);
-    size_t thread_id_size = OSThread::thread_id_size();
-    if (thread_id_size == (size_t) BytesPerLong) {
-      LIR_Opr id = new_register(T_LONG);
-      __ move(new LIR_Address(osthread, in_bytes(OSThread::thread_id_offset()), T_LONG), id);
-      __ convert(Bytecodes::_l2i, id, rlock_result(x));
-    } else if (thread_id_size == (size_t) BytesPerInt) {
-      __ move(new LIR_Address(osthread, in_bytes(OSThread::thread_id_offset()), T_INT), rlock_result(x));
-    } else {
-      ShouldNotReachHere();
-    }
-}
-
-void LIRGenerator::do_ClassIDIntrinsic(Intrinsic* x) {
-    CodeEmitInfo* info = state_for(x);
-    CodeEmitInfo* info2 = new CodeEmitInfo(info); // Clone for the second null check
-    BasicType klass_pointer_type = NOT_LP64(T_INT) LP64_ONLY(T_LONG);
-    assert(info != NULL, "must have info");
-    LIRItem arg(x->argument_at(1), this);
-    arg.load_item();
-    LIR_Opr klass = new_pointer_register();
-    __ move(new LIR_Address(arg.result(), java_lang_Class::klass_offset_in_bytes(), klass_pointer_type), klass, info);
-    LIR_Opr id = new_register(T_LONG);
-    ByteSize offset = TRACE_ID_OFFSET;
-    LIR_Address* trace_id_addr = new LIR_Address(klass, in_bytes(offset), T_LONG);
-    __ move(trace_id_addr, id);
-    __ logical_or(id, LIR_OprFact::longConst(0x01l), id);
-    __ store(id, trace_id_addr);
-    __ logical_and(id, LIR_OprFact::longConst(~0x3l), id);
-    __ move(id, rlock_result(x));
-}
-#endif
+
 
 void LIRGenerator::do_Intrinsic(Intrinsic* x) {
   switch (x->id()) {
@@ -3115,8 +3083,6 @@
   }
 
 #ifdef TRACE_HAVE_INTRINSICS
-  case vmIntrinsics::_threadID: do_ThreadIDIntrinsic(x); break;
-  case vmIntrinsics::_classID: do_ClassIDIntrinsic(x); break;
   case vmIntrinsics::_counterTime:
     do_RuntimeCall(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), x);
     break;
--- a/src/share/vm/c1/c1_LIRGenerator.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/c1/c1_LIRGenerator.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -440,10 +440,7 @@
   void do_SwitchRanges(SwitchRangeArray* x, LIR_Opr value, BlockBegin* default_sux);
 
   void do_RuntimeCall(address routine, Intrinsic* x);
-#ifdef TRACE_HAVE_INTRINSICS
-  void do_ThreadIDIntrinsic(Intrinsic* x);
-  void do_ClassIDIntrinsic(Intrinsic* x);
-#endif
+
   ciKlass* profile_type(ciMethodData* md, int md_first_offset, int md_offset, intptr_t profiled_k,
                         Value arg, LIR_Opr& mdp, bool not_null, ciKlass* signature_at_call_k,
                         ciKlass* callee_signature_k);
--- a/src/share/vm/c1/c1_LinearScan.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/c1/c1_LinearScan.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1434,41 +1434,73 @@
 }
 
 #ifndef PRODUCT
+int interval_cmp(Interval* const& l, Interval* const& r) {
+  return l->from() - r->from();
+}
+
+bool find_interval(Interval* interval, IntervalArray* intervals) {
+  bool found;
+  int idx = intervals->find_sorted<Interval*, interval_cmp>(interval, found);
+
+  if (!found) {
+    return false;
+  }
+
+  int from = interval->from();
+
+  // The index we've found using binary search is pointing to an interval
+  // that is defined in the same place as the interval we were looking for.
+  // So now we have to look around that index and find exact interval.
+  for (int i = idx; i >= 0; i--) {
+    if (intervals->at(i) == interval) {
+      return true;
+    }
+    if (intervals->at(i)->from() != from) {
+      break;
+    }
+  }
+
+  for (int i = idx + 1; i < intervals->length(); i++) {
+    if (intervals->at(i) == interval) {
+      return true;
+    }
+    if (intervals->at(i)->from() != from) {
+      break;
+    }
+  }
+
+  return false;
+}
+
 bool LinearScan::is_sorted(IntervalArray* intervals) {
   int from = -1;
-  int i, j;
-  for (i = 0; i < intervals->length(); i ++) {
+  int null_count = 0;
+
+  for (int i = 0; i < intervals->length(); i++) {
     Interval* it = intervals->at(i);
     if (it != NULL) {
-      if (from > it->from()) {
-        assert(false, "");
-        return false;
-      }
+      assert(from <= it->from(), "Intervals are unordered");
       from = it->from();
-    }
-  }
-
-  // check in both directions if sorted list and unsorted list contain same intervals
-  for (i = 0; i < interval_count(); i++) {
-    if (interval_at(i) != NULL) {
-      int num_found = 0;
-      for (j = 0; j < intervals->length(); j++) {
-        if (interval_at(i) == intervals->at(j)) {
-          num_found++;
-        }
-      }
-      assert(num_found == 1, "lists do not contain same intervals");
-    }
-  }
-  for (j = 0; j < intervals->length(); j++) {
-    int num_found = 0;
-    for (i = 0; i < interval_count(); i++) {
-      if (interval_at(i) == intervals->at(j)) {
-        num_found++;
-      }
-    }
-    assert(num_found == 1, "lists do not contain same intervals");
-  }
+    } else {
+      null_count++;
+    }
+  }
+
+  assert(null_count == 0, "Sorted intervals should not contain nulls");
+
+  null_count = 0;
+
+  for (int i = 0; i < interval_count(); i++) {
+    Interval* interval = interval_at(i);
+    if (interval != NULL) {
+      assert(find_interval(interval, intervals), "Lists do not contain same intervals");
+    } else {
+      null_count++;
+    }
+  }
+
+  assert(interval_count() - null_count == intervals->length(),
+      "Sorted list should contain the same amount of non-NULL intervals as unsorted list");
 
   return true;
 }
@@ -1536,7 +1568,7 @@
       sorted_len++;
     }
   }
-  IntervalArray* sorted_list = new IntervalArray(sorted_len);
+  IntervalArray* sorted_list = new IntervalArray(sorted_len, sorted_len, NULL);
 
   // special sorting algorithm: the original interval-list is almost sorted,
   // only some intervals are swapped. So this is much faster than a complete QuickSort
@@ -1574,8 +1606,8 @@
     _needs_full_resort = false;
   }
 
-  IntervalArray* old_list      = _sorted_intervals;
-  IntervalList*  new_list      = _new_intervals_from_allocation;
+  IntervalArray* old_list = _sorted_intervals;
+  IntervalList* new_list = _new_intervals_from_allocation;
   int old_len = old_list->length();
   int new_len = new_list->length();
 
@@ -1589,7 +1621,8 @@
   new_list->sort(interval_cmp);
 
   // merge old and new list (both already sorted) into one combined list
-  IntervalArray* combined_list = new IntervalArray(old_len + new_len);
+  int combined_list_len = old_len + new_len;
+  IntervalArray* combined_list = new IntervalArray(combined_list_len, combined_list_len, NULL);
   int old_idx = 0;
   int new_idx = 0;
 
@@ -3211,6 +3244,10 @@
       has_error = true;
     }
 
+    // special intervals that are created in MoveResolver
+    // -> ignore them because the range information has no meaning there
+    if (i1->from() == 1 && i1->to() == 2) continue;
+
     if (i1->first() == Range::end()) {
       tty->print_cr("Interval %d has no Range", i1->reg_num()); i1->print(); tty->cr();
       has_error = true;
@@ -3225,18 +3262,13 @@
 
     for (int j = i + 1; j < len; j++) {
       Interval* i2 = interval_at(j);
-      if (i2 == NULL) continue;
-
-      // special intervals that are created in MoveResolver
-      // -> ignore them because the range information has no meaning there
-      if (i1->from() == 1 && i1->to() == 2) continue;
-      if (i2->from() == 1 && i2->to() == 2) continue;
+      if (i2 == NULL || (i2->from() == 1 && i2->to() == 2)) continue;
 
       int r1 = i1->assigned_reg();
       int r1Hi = i1->assigned_regHi();
       int r2 = i2->assigned_reg();
       int r2Hi = i2->assigned_regHi();
-      if (i1->intersects(i2) && (r1 == r2 || r1 == r2Hi || (r1Hi != any_reg && (r1Hi == r2 || r1Hi == r2Hi)))) {
+      if ((r1 == r2 || r1 == r2Hi || (r1Hi != any_reg && (r1Hi == r2 || r1Hi == r2Hi))) && i1->intersects(i2)) {
         tty->print_cr("Intervals %d and %d overlap and have the same register assigned", i1->reg_num(), i2->reg_num());
         i1->print(); tty->cr();
         i2->print(); tty->cr();
@@ -3429,7 +3461,8 @@
 
 void RegisterVerifier::verify(BlockBegin* start) {
   // setup input registers (method arguments) for first block
-  IntervalList* input_state = new IntervalList(state_size(), NULL);
+  int input_state_len = state_size();
+  IntervalList* input_state = new IntervalList(input_state_len, input_state_len, NULL);
   CallingConvention* args = compilation()->frame_map()->incoming_arguments();
   for (int n = 0; n < args->length(); n++) {
     LIR_Opr opr = args->at(n);
@@ -3543,7 +3576,7 @@
 
 IntervalList* RegisterVerifier::copy(IntervalList* input_state) {
   IntervalList* copy_state = new IntervalList(input_state->length());
-  copy_state->push_all(input_state);
+  copy_state->appendAll(input_state);
   return copy_state;
 }
 
@@ -5506,7 +5539,7 @@
     IntervalList* processed = _spill_intervals[reg];
     for (int i = 0; i < _spill_intervals[regHi]->length(); i++) {
       Interval* it = _spill_intervals[regHi]->at(i);
-      if (processed->index_of(it) == -1) {
+      if (processed->find_from_end(it) == -1) {
         remove_from_list(it);
         split_and_spill_interval(it);
       }
--- a/src/share/vm/c1/c1_LinearScan.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/c1/c1_LinearScan.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -42,8 +42,8 @@
 class MoveResolver;
 class Range;
 
-define_array(IntervalArray, Interval*)
-define_stack(IntervalList, IntervalArray)
+typedef GrowableArray<Interval*> IntervalArray;
+typedef GrowableArray<Interval*> IntervalList;
 
 define_array(IntervalsArray, IntervalList*)
 define_stack(IntervalsList, IntervalsArray)
--- a/src/share/vm/ci/ciArray.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/ci/ciArray.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -107,8 +107,9 @@
   intptr_t header = arrayOopDesc::base_offset_in_bytes(elembt);
   intptr_t index = (element_offset - header) >> shift;
   intptr_t offset = header + ((intptr_t)index << shift);
-  if (offset != element_offset || index != (jint)index)
+  if (offset != element_offset || index != (jint)index || index < 0 || index >= length()) {
     return ciConstant();
+  }
   return element_value((jint) index);
 }
 
--- a/src/share/vm/ci/ciConstantPoolCache.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/ci/ciConstantPoolCache.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -41,15 +41,21 @@
   _keys = new (arena) GrowableArray<int>(arena, expected_size, 0, 0);
 }
 
+int ciConstantPoolCache::key_compare(const int& key, const int& elt) {
+  if (key < elt)      return -1;
+  else if (key > elt) return 1;
+  else                  return 0;
+}
+
 // ------------------------------------------------------------------
 // ciConstantPoolCache::get
 //
 // Get the entry at some index
 void* ciConstantPoolCache::get(int index) {
   ASSERT_IN_VM;
-  int pos = find(index);
-  if (pos >= _keys->length() ||
-      _keys->at(pos) != index) {
+  bool found = false;
+  int pos = _keys->find_sorted<int, ciConstantPoolCache::key_compare>(index, found);
+  if (!found) {
     // This element is not present in the cache.
     return NULL;
   }
@@ -57,42 +63,15 @@
 }
 
 // ------------------------------------------------------------------
-// ciConstantPoolCache::find
-//
-// Use binary search to find the position of this index in the cache.
-// If there is no entry in the cache corresponding to this oop, return
-// the position at which the index would be inserted.
-int ciConstantPoolCache::find(int key) {
-  int min = 0;
-  int max = _keys->length()-1;
-
-  while (max >= min) {
-    int mid = (max + min) / 2;
-    int value = _keys->at(mid);
-    if (value < key) {
-      min = mid + 1;
-    } else if (value > key) {
-      max = mid - 1;
-    } else {
-      return mid;
-    }
-  }
-  return min;
-}
-
-// ------------------------------------------------------------------
 // ciConstantPoolCache::insert
 //
 // Insert a ciObject into the table at some index.
 void ciConstantPoolCache::insert(int index, void* elem) {
-  int i;
-  int pos = find(index);
-  for (i = _keys->length()-1; i >= pos; i--) {
-    _keys->at_put_grow(i+1, _keys->at(i));
-    _elements->at_put_grow(i+1, _elements->at(i));
-  }
-  _keys->at_put_grow(pos, index);
-  _elements->at_put_grow(pos, elem);
+  bool found = false;
+  int pos = _keys->find_sorted<int, ciConstantPoolCache::key_compare>(index, found);
+  assert(!found, "duplicate");
+  _keys->insert_before(pos, index);
+  _elements->insert_before(pos, elem);
 }
 
 // ------------------------------------------------------------------
--- a/src/share/vm/ci/ciConstantPoolCache.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/ci/ciConstantPoolCache.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -38,7 +38,7 @@
   GrowableArray<int>*   _keys;
   GrowableArray<void*>* _elements;
 
-  int find(int index);
+  static int key_compare(const int& key, const int& elt);
 
 public:
   ciConstantPoolCache(Arena* arena, int expected_size);
--- a/src/share/vm/ci/ciObjectFactory.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/ci/ciObjectFactory.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -260,6 +260,13 @@
   return new_object;
 }
 
+int ciObjectFactory::metadata_compare(Metadata* const& key, ciMetadata* const& elt) {
+  Metadata* value = elt->constant_encoding();
+  if (key < value)      return -1;
+  else if (key > value) return 1;
+  else                  return 0;
+}
+
 // ------------------------------------------------------------------
 // ciObjectFactory::get_metadata
 //
@@ -280,7 +287,8 @@
   }
 #endif // ASSERT
   int len = _ci_metadata->length();
-  int index = find(key, _ci_metadata);
+  bool found = false;
+  int index = _ci_metadata->find_sorted<Metadata*, ciObjectFactory::metadata_compare>(key, found);
 #ifdef ASSERT
   if (CIObjectFactoryVerify) {
     for (int i=0; i<_ci_metadata->length(); i++) {
@@ -290,7 +298,8 @@
     }
   }
 #endif
-  if (!is_found_at(index, key, _ci_metadata)) {
+
+  if (!found) {
     // The ciMetadata does not yet exist. Create it and insert it
     // into the cache.
     ciMetadata* new_object = create_new_metadata(key);
@@ -300,10 +309,10 @@
     if (len != _ci_metadata->length()) {
       // creating the new object has recursively entered new objects
       // into the table.  We need to recompute our index.
-      index = find(key, _ci_metadata);
+      index = _ci_metadata->find_sorted<Metadata*, ciObjectFactory::metadata_compare>(key, found);
     }
-    assert(!is_found_at(index, key, _ci_metadata), "no double insert");
-    insert(index, new_object, _ci_metadata);
+    assert(!found, "no double insert");
+    _ci_metadata->insert_before(index, new_object);
     return new_object;
   }
   return _ci_metadata->at(index)->as_metadata();
@@ -655,60 +664,6 @@
   obj->set_ident(_next_ident++);
 }
 
-// ------------------------------------------------------------------
-// ciObjectFactory::find
-//
-// Use binary search to find the position of this oop in the cache.
-// If there is no entry in the cache corresponding to this oop, return
-// the position at which the oop should be inserted.
-int ciObjectFactory::find(Metadata* key, GrowableArray<ciMetadata*>* objects) {
-  int min = 0;
-  int max = objects->length()-1;
-
-  // print_contents();
-
-  while (max >= min) {
-    int mid = (max + min) / 2;
-    Metadata* value = objects->at(mid)->constant_encoding();
-    if (value < key) {
-      min = mid + 1;
-    } else if (value > key) {
-      max = mid - 1;
-    } else {
-      return mid;
-    }
-  }
-  return min;
-}
-
-// ------------------------------------------------------------------
-// ciObjectFactory::is_found_at
-//
-// Verify that the binary seach found the given key.
-bool ciObjectFactory::is_found_at(int index, Metadata* key, GrowableArray<ciMetadata*>* objects) {
-  return (index < objects->length() &&
-          objects->at(index)->constant_encoding() == key);
-}
-
-
-// ------------------------------------------------------------------
-// ciObjectFactory::insert
-//
-// Insert a ciObject into the table at some index.
-void ciObjectFactory::insert(int index, ciMetadata* obj, GrowableArray<ciMetadata*>* objects) {
-  int len = objects->length();
-  if (len == index) {
-    objects->append(obj);
-  } else {
-    objects->append(objects->at(len-1));
-    int pos;
-    for (pos = len-2; pos >= index; pos--) {
-      objects->at_put(pos+1,objects->at(pos));
-    }
-    objects->at_put(index, obj);
-  }
-}
-
 static ciObjectFactory::NonPermObject* emptyBucket = NULL;
 
 // ------------------------------------------------------------------
--- a/src/share/vm/ci/ciObjectFactory.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/ci/ciObjectFactory.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -68,9 +68,7 @@
   NonPermObject* _non_perm_bucket[NON_PERM_BUCKETS];
   int _non_perm_count;
 
-  int find(Metadata* key, GrowableArray<ciMetadata*>* objects);
-  bool is_found_at(int index, Metadata* key, GrowableArray<ciMetadata*>* objects);
-  void insert(int index, ciMetadata* obj, GrowableArray<ciMetadata*>* objects);
+  static int metadata_compare(Metadata* const& key, ciMetadata* const& elt);
 
   ciObject* create_new_object(oop o);
   ciMetadata* create_new_metadata(Metadata* o);
--- a/src/share/vm/classfile/classFileParser.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/classFileParser.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -5380,7 +5380,7 @@
     }
   }
 
-  TRACE_INIT_ID(ik);
+  TRACE_INIT_KLASS_ID(ik);
 
   // If we reach here, all is well.
   // Now remove the InstanceKlass* from the _klass_to_deallocate field
--- a/src/share/vm/classfile/classLoader.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/classLoader.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -37,6 +37,7 @@
 #include "gc/shared/generation.hpp"
 #include "interpreter/bytecodeStream.hpp"
 #include "interpreter/oopMapCache.hpp"
+#include "logging/logTag.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/filemap.hpp"
 #include "memory/oopFactory.hpp"
@@ -417,34 +418,30 @@
 #if INCLUDE_CDS
 void ClassLoader::exit_with_path_failure(const char* error, const char* message) {
   assert(DumpSharedSpaces, "only called at dump time");
-  tty->print_cr("Hint: enable -XX:+TraceClassPaths to diagnose the failure");
+  tty->print_cr("Hint: enable -Xlog:classpath=info to diagnose the failure");
   vm_exit_during_initialization(error, message);
 }
 #endif
 
-void ClassLoader::trace_class_path(outputStream* out, const char* msg, const char* name) {
-  if (!TraceClassPaths) {
-    return;
-  }
-
-  if (msg) {
-    out->print("%s", msg);
-  }
-  if (name) {
-    if (strlen(name) < 256) {
-      out->print("%s", name);
-    } else {
-      // For very long paths, we need to print each character separately,
-      // as print_cr() has a length limit
-      while (name[0] != '\0') {
-        out->print("%c", name[0]);
-        name++;
+void ClassLoader::trace_class_path(const char* msg, const char* name) {
+  if (log_is_enabled(Info, classpath)) {
+    ResourceMark rm;
+    outputStream* out = LogHandle(classpath)::info_stream();
+    if (msg) {
+      out->print("%s", msg);
+    }
+    if (name) {
+      if (strlen(name) < 256) {
+        out->print("%s", name);
+      } else {
+        // For very long paths, we need to print each character separately,
+        // as print_cr() has a length limit
+        while (name[0] != '\0') {
+          out->print("%c", name[0]);
+          name++;
+        }
       }
     }
-  }
-  if (msg && msg[0] == '[') {
-    out->print_cr("]");
-  } else {
     out->cr();
   }
 }
@@ -470,11 +467,13 @@
 void ClassLoader::setup_bootstrap_search_path() {
   assert(_first_entry == NULL, "should not setup bootstrap class search path twice");
   const char* sys_class_path = Arguments::get_sysclasspath();
+  const char* java_class_path = Arguments::get_appclasspath();
   if (PrintSharedArchiveAndExit) {
     // Don't print sys_class_path - this is the bootcp of this current VM process, not necessarily
     // the same as the bootcp of the shared archive.
   } else {
-    trace_class_path(tty, "[Bootstrap loader class path=", sys_class_path);
+    trace_class_path("bootstrap loader class path=", sys_class_path);
+    trace_class_path("classpath: ", java_class_path);
   }
 #if INCLUDE_CDS
   if (DumpSharedSpaces) {
@@ -578,9 +577,7 @@
         }
       }
     }
-    if (TraceClassPaths) {
-      tty->print_cr("[Opened %s]", path);
-    }
+    log_info(classpath)("opened: %s", path);
     log_info(classload)("opened: %s", path);
   } else {
     // Directory
--- a/src/share/vm/classfile/classLoader.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/classLoader.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -331,7 +331,7 @@
   static void  exit_with_path_failure(const char* error, const char* message);
 #endif
 
-  static void  trace_class_path(outputStream* out, const char* msg, const char* name = NULL);
+  static void  trace_class_path(const char* msg, const char* name = NULL);
 
   // VM monitoring and management support
   static jlong classloader_time_ms();
--- a/src/share/vm/classfile/dictionary.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/dictionary.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -135,8 +135,10 @@
     //          via a store to _pd_set.
     OrderAccess::release_store_ptr(&_pd_set, new_head);
   }
-  if (TraceProtectionDomainVerification && WizardMode) {
-    print();
+  if (log_is_enabled(Trace, protectiondomain)) {
+    ResourceMark rm;
+    outputStream* log = LogHandle(protectiondomain)::trace_stream();
+    print_count(log);
   }
 }
 
--- a/src/share/vm/classfile/dictionary.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/dictionary.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -29,6 +29,7 @@
 #include "oops/instanceKlass.hpp"
 #include "oops/oop.hpp"
 #include "utilities/hashtable.hpp"
+#include "utilities/ostream.hpp"
 
 class DictionaryEntry;
 class PSPromotionManager;
@@ -323,14 +324,14 @@
     return (klass->name() == class_name && _loader_data == loader_data);
   }
 
-  void print() {
+  void print_count(outputStream *st) {
     int count = 0;
     for (ProtectionDomainEntry* current = _pd_set;
                                 current != NULL;
                                 current = current->_next) {
       count++;
     }
-    tty->print_cr("pd set = #%d", count);
+    st->print_cr("pd set count = #%d", count);
   }
 };
 
--- a/src/share/vm/classfile/klassFactory.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/klassFactory.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
-* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 2015, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -29,6 +29,7 @@
 #include "classfile/klassFactory.hpp"
 #include "memory/resourceArea.hpp"
 #include "prims/jvmtiEnvBase.hpp"
+#include "trace/traceMacros.hpp"
 
 static ClassFileStream* prologue(ClassFileStream* stream,
                                  Symbol* name,
@@ -136,5 +137,7 @@
     result->set_cached_class_file(cached_class_file);
   }
 
+  TRACE_KLASS_CREATION(result, parser, THREAD);
+
   return result;
 }
--- a/src/share/vm/classfile/sharedPathsMiscInfo.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/sharedPathsMiscInfo.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -26,15 +26,15 @@
 #include "classfile/classLoader.hpp"
 #include "classfile/classLoaderData.inline.hpp"
 #include "classfile/sharedPathsMiscInfo.hpp"
+#include "logging/log.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/metaspaceShared.hpp"
 #include "runtime/arguments.hpp"
+#include "utilities/ostream.hpp"
 
 void SharedPathsMiscInfo::add_path(const char* path, int type) {
-  if (TraceClassPaths) {
-    tty->print("[type=%s] ", type_name(type));
-    trace_class_path("[Add misc shared path ", path);
-  }
+  log_info(classpath)("type=%s ", type_name(type));
+  ClassLoader::trace_class_path("add misc shared path ", path);
   write(path, strlen(path) + 1);
   write_jint(jint(type));
 }
@@ -67,11 +67,29 @@
 }
 
 bool SharedPathsMiscInfo::fail(const char* msg, const char* name) {
-  ClassLoader::trace_class_path(tty, msg, name);
+  ClassLoader::trace_class_path(msg, name);
   MetaspaceShared::set_archive_loading_failed();
   return false;
 }
 
+void SharedPathsMiscInfo::print_path(int type, const char* path) {
+  ResourceMark rm;
+  outputStream* out = LogHandle(classpath)::info_stream();
+  switch (type) {
+  case BOOT:
+    out->print("Expecting -Dsun.boot.class.path=%s", path);
+    break;
+  case NON_EXIST:
+    out->print("Expecting that %s does not exist", path);
+    break;
+  case REQUIRED:
+    out->print("Expecting that file %s must exist and is not altered", path);
+    break;
+  default:
+    ShouldNotReachHere();
+  }
+}
+
 bool SharedPathsMiscInfo::check() {
   // The whole buffer must be 0 terminated so that we can use strlen and strcmp
   // without fear.
@@ -90,17 +108,14 @@
     if (!read_jint(&type)) {
       return fail("Corrupted archive file header");
     }
-    if (TraceClassPaths) {
-      tty->print("[type=%s ", type_name(type));
-      print_path(tty, type, path);
-      tty->print_cr("]");
-    }
+    log_info(classpath)("type=%s ", type_name(type));
+    print_path(type, path);
     if (!check(type, path)) {
       if (!PrintSharedArchiveAndExit) {
         return false;
       }
     } else {
-      trace_class_path("[ok");
+      ClassLoader::trace_class_path("ok");
     }
   }
 
--- a/src/share/vm/classfile/sharedPathsMiscInfo.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/sharedPathsMiscInfo.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -64,9 +64,6 @@
   void write(const void* ptr, size_t size);
   bool read(void* ptr, size_t size);
 
-  static void trace_class_path(const char* msg, const char* name = NULL) {
-    ClassLoader::trace_class_path(tty, msg, name);
-  }
 protected:
   static bool fail(const char* msg, const char* name = NULL);
   virtual bool check(jint type, const char* path);
@@ -144,21 +141,7 @@
     }
   }
 
-  virtual void print_path(outputStream* out, int type, const char* path) {
-    switch (type) {
-    case BOOT:
-      out->print("Expecting -Dsun.boot.class.path=%s", path);
-      break;
-    case NON_EXIST:
-      out->print("Expecting that %s does not exist", path);
-      break;
-    case REQUIRED:
-      out->print("Expecting that file %s must exist and is not altered", path);
-      break;
-    default:
-      ShouldNotReachHere();
-    }
-  }
+  virtual void print_path(int type, const char* path);
 
   bool check();
   bool read_jint(jint *ptr) {
--- a/src/share/vm/classfile/systemDictionary.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/systemDictionary.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -430,12 +430,15 @@
 
   // Now we have to call back to java to check if the initating class has access
   JavaValue result(T_VOID);
-  if (TraceProtectionDomainVerification) {
+  if (log_is_enabled(Debug, protectiondomain)) {
+    ResourceMark rm;
     // Print out trace information
-    tty->print_cr("Checking package access");
-    tty->print(" - class loader:      "); class_loader()->print_value_on(tty);      tty->cr();
-    tty->print(" - protection domain: "); protection_domain()->print_value_on(tty); tty->cr();
-    tty->print(" - loading:           "); klass()->print_value_on(tty);             tty->cr();
+    outputStream* log = LogHandle(protectiondomain)::debug_stream();
+    log->print_cr("Checking package access");
+    log->print("class loader: "); class_loader()->print_value_on(log);
+    log->print(" protection domain: "); protection_domain()->print_value_on(log);
+    log->print(" loading: "); klass()->print_value_on(log);
+    log->cr();
   }
 
   KlassHandle system_loader(THREAD, SystemDictionary::ClassLoader_klass());
@@ -448,13 +451,10 @@
                          protection_domain,
                          THREAD);
 
-  if (TraceProtectionDomainVerification) {
-    if (HAS_PENDING_EXCEPTION) {
-      tty->print_cr(" -> DENIED !!!!!!!!!!!!!!!!!!!!!");
-    } else {
-     tty->print_cr(" -> granted");
-    }
-    tty->cr();
+  if (HAS_PENDING_EXCEPTION) {
+    log_debug(protectiondomain)("DENIED !!!!!!!!!!!!!!!!!!!!!");
+  } else {
+   log_debug(protectiondomain)("granted");
   }
 
   if (HAS_PENDING_EXCEPTION) return;
--- a/src/share/vm/classfile/vmSymbols.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/vmSymbols.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -328,8 +328,6 @@
   assert(id != vmIntrinsics::_none, "must be a VM intrinsic");
   switch(id) {
 #ifdef TRACE_HAVE_INTRINSICS
-  case vmIntrinsics::_classID:
-  case vmIntrinsics::_threadID:
   case vmIntrinsics::_counterTime:
 #endif
   case vmIntrinsics::_currentTimeMillis:
@@ -544,6 +542,42 @@
   case vmIntrinsics::_putLongVolatile:
   case vmIntrinsics::_putFloatVolatile:
   case vmIntrinsics::_putDoubleVolatile:
+  case vmIntrinsics::_getObjectAcquire:
+  case vmIntrinsics::_getBooleanAcquire:
+  case vmIntrinsics::_getByteAcquire:
+  case vmIntrinsics::_getShortAcquire:
+  case vmIntrinsics::_getCharAcquire:
+  case vmIntrinsics::_getIntAcquire:
+  case vmIntrinsics::_getLongAcquire:
+  case vmIntrinsics::_getFloatAcquire:
+  case vmIntrinsics::_getDoubleAcquire:
+  case vmIntrinsics::_putObjectRelease:
+  case vmIntrinsics::_putBooleanRelease:
+  case vmIntrinsics::_putByteRelease:
+  case vmIntrinsics::_putShortRelease:
+  case vmIntrinsics::_putCharRelease:
+  case vmIntrinsics::_putIntRelease:
+  case vmIntrinsics::_putLongRelease:
+  case vmIntrinsics::_putFloatRelease:
+  case vmIntrinsics::_putDoubleRelease:
+  case vmIntrinsics::_getObjectOpaque:
+  case vmIntrinsics::_getBooleanOpaque:
+  case vmIntrinsics::_getByteOpaque:
+  case vmIntrinsics::_getShortOpaque:
+  case vmIntrinsics::_getCharOpaque:
+  case vmIntrinsics::_getIntOpaque:
+  case vmIntrinsics::_getLongOpaque:
+  case vmIntrinsics::_getFloatOpaque:
+  case vmIntrinsics::_getDoubleOpaque:
+  case vmIntrinsics::_putObjectOpaque:
+  case vmIntrinsics::_putBooleanOpaque:
+  case vmIntrinsics::_putByteOpaque:
+  case vmIntrinsics::_putShortOpaque:
+  case vmIntrinsics::_putCharOpaque:
+  case vmIntrinsics::_putIntOpaque:
+  case vmIntrinsics::_putLongOpaque:
+  case vmIntrinsics::_putFloatOpaque:
+  case vmIntrinsics::_putDoubleOpaque:
   case vmIntrinsics::_getByte_raw:
   case vmIntrinsics::_getShort_raw:
   case vmIntrinsics::_getChar_raw:
@@ -569,9 +603,27 @@
   case vmIntrinsics::_loadFence:
   case vmIntrinsics::_storeFence:
   case vmIntrinsics::_fullFence:
+  case vmIntrinsics::_compareAndSwapLong:
+  case vmIntrinsics::_weakCompareAndSwapLong:
+  case vmIntrinsics::_weakCompareAndSwapLongAcquire:
+  case vmIntrinsics::_weakCompareAndSwapLongRelease:
+  case vmIntrinsics::_compareAndSwapInt:
+  case vmIntrinsics::_weakCompareAndSwapInt:
+  case vmIntrinsics::_weakCompareAndSwapIntAcquire:
+  case vmIntrinsics::_weakCompareAndSwapIntRelease:
   case vmIntrinsics::_compareAndSwapObject:
-  case vmIntrinsics::_compareAndSwapLong:
-  case vmIntrinsics::_compareAndSwapInt:
+  case vmIntrinsics::_weakCompareAndSwapObject:
+  case vmIntrinsics::_weakCompareAndSwapObjectAcquire:
+  case vmIntrinsics::_weakCompareAndSwapObjectRelease:
+  case vmIntrinsics::_compareAndExchangeIntVolatile:
+  case vmIntrinsics::_compareAndExchangeIntAcquire:
+  case vmIntrinsics::_compareAndExchangeIntRelease:
+  case vmIntrinsics::_compareAndExchangeLongVolatile:
+  case vmIntrinsics::_compareAndExchangeLongAcquire:
+  case vmIntrinsics::_compareAndExchangeLongRelease:
+  case vmIntrinsics::_compareAndExchangeObjectVolatile:
+  case vmIntrinsics::_compareAndExchangeObjectAcquire:
+  case vmIntrinsics::_compareAndExchangeObjectRelease:
     if (!InlineUnsafeOps) return true;
     break;
   case vmIntrinsics::_getShortUnaligned:
--- a/src/share/vm/classfile/vmSymbols.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/classfile/vmSymbols.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1146,6 +1146,64 @@
   do_intrinsic(_putFloatVolatile,         jdk_internal_misc_Unsafe,     putFloatVolatile_name, putFloat_signature,     F_RN)  \
   do_intrinsic(_putDoubleVolatile,        jdk_internal_misc_Unsafe,     putDoubleVolatile_name, putDouble_signature,   F_RN)  \
                                                                                                                         \
+  do_name(getObjectOpaque_name,"getObjectOpaque")     do_name(putObjectOpaque_name,"putObjectOpaque")                   \
+  do_name(getBooleanOpaque_name,"getBooleanOpaque")   do_name(putBooleanOpaque_name,"putBooleanOpaque")                 \
+  do_name(getByteOpaque_name,"getByteOpaque")         do_name(putByteOpaque_name,"putByteOpaque")                       \
+  do_name(getShortOpaque_name,"getShortOpaque")       do_name(putShortOpaque_name,"putShortOpaque")                     \
+  do_name(getCharOpaque_name,"getCharOpaque")         do_name(putCharOpaque_name,"putCharOpaque")                       \
+  do_name(getIntOpaque_name,"getIntOpaque")           do_name(putIntOpaque_name,"putIntOpaque")                         \
+  do_name(getLongOpaque_name,"getLongOpaque")         do_name(putLongOpaque_name,"putLongOpaque")                       \
+  do_name(getFloatOpaque_name,"getFloatOpaque")       do_name(putFloatOpaque_name,"putFloatOpaque")                     \
+  do_name(getDoubleOpaque_name,"getDoubleOpaque")     do_name(putDoubleOpaque_name,"putDoubleOpaque")                   \
+                                                                                                                        \
+  do_intrinsic(_getObjectOpaque,          jdk_internal_misc_Unsafe,        getObjectOpaque_name, getObject_signature,   F_R)  \
+  do_intrinsic(_getBooleanOpaque,         jdk_internal_misc_Unsafe,        getBooleanOpaque_name, getBoolean_signature, F_R)  \
+  do_intrinsic(_getByteOpaque,            jdk_internal_misc_Unsafe,        getByteOpaque_name, getByte_signature,       F_R)  \
+  do_intrinsic(_getShortOpaque,           jdk_internal_misc_Unsafe,        getShortOpaque_name, getShort_signature,     F_R)  \
+  do_intrinsic(_getCharOpaque,            jdk_internal_misc_Unsafe,        getCharOpaque_name, getChar_signature,       F_R)  \
+  do_intrinsic(_getIntOpaque,             jdk_internal_misc_Unsafe,        getIntOpaque_name, getInt_signature,         F_R)  \
+  do_intrinsic(_getLongOpaque,            jdk_internal_misc_Unsafe,        getLongOpaque_name, getLong_signature,       F_R)  \
+  do_intrinsic(_getFloatOpaque,           jdk_internal_misc_Unsafe,        getFloatOpaque_name, getFloat_signature,     F_R)  \
+  do_intrinsic(_getDoubleOpaque,          jdk_internal_misc_Unsafe,        getDoubleOpaque_name, getDouble_signature,   F_R)  \
+  do_intrinsic(_putObjectOpaque,          jdk_internal_misc_Unsafe,        putObjectOpaque_name, putObject_signature,   F_R)  \
+  do_intrinsic(_putBooleanOpaque,         jdk_internal_misc_Unsafe,        putBooleanOpaque_name, putBoolean_signature, F_R)  \
+  do_intrinsic(_putByteOpaque,            jdk_internal_misc_Unsafe,        putByteOpaque_name, putByte_signature,       F_R)  \
+  do_intrinsic(_putShortOpaque,           jdk_internal_misc_Unsafe,        putShortOpaque_name, putShort_signature,     F_R)  \
+  do_intrinsic(_putCharOpaque,            jdk_internal_misc_Unsafe,        putCharOpaque_name, putChar_signature,       F_R)  \
+  do_intrinsic(_putIntOpaque,             jdk_internal_misc_Unsafe,        putIntOpaque_name, putInt_signature,         F_R)  \
+  do_intrinsic(_putLongOpaque,            jdk_internal_misc_Unsafe,        putLongOpaque_name, putLong_signature,       F_R)  \
+  do_intrinsic(_putFloatOpaque,           jdk_internal_misc_Unsafe,        putFloatOpaque_name, putFloat_signature,     F_R)  \
+  do_intrinsic(_putDoubleOpaque,          jdk_internal_misc_Unsafe,        putDoubleOpaque_name, putDouble_signature,   F_R)  \
+                                                                                                                        \
+  do_name(getObjectAcquire_name,  "getObjectAcquire")    do_name(putObjectRelease_name,  "putObjectRelease")            \
+  do_name(getBooleanAcquire_name, "getBooleanAcquire")   do_name(putBooleanRelease_name, "putBooleanRelease")           \
+  do_name(getByteAcquire_name,    "getByteAcquire")      do_name(putByteRelease_name,    "putByteRelease")              \
+  do_name(getShortAcquire_name,   "getShortAcquire")     do_name(putShortRelease_name,   "putShortRelease")             \
+  do_name(getCharAcquire_name,    "getCharAcquire")      do_name(putCharRelease_name,    "putCharRelease")              \
+  do_name(getIntAcquire_name,     "getIntAcquire")       do_name(putIntRelease_name,     "putIntRelease")               \
+  do_name(getLongAcquire_name,    "getLongAcquire")      do_name(putLongRelease_name,    "putLongRelease")              \
+  do_name(getFloatAcquire_name,   "getFloatAcquire")     do_name(putFloatRelease_name,   "putFloatRelease")             \
+  do_name(getDoubleAcquire_name,  "getDoubleAcquire")    do_name(putDoubleRelease_name,  "putDoubleRelease")            \
+                                                                                                                        \
+  do_intrinsic(_getObjectAcquire,        jdk_internal_misc_Unsafe,        getObjectAcquire_name, getObject_signature,   F_R)  \
+  do_intrinsic(_getBooleanAcquire,       jdk_internal_misc_Unsafe,        getBooleanAcquire_name, getBoolean_signature, F_R)  \
+  do_intrinsic(_getByteAcquire,          jdk_internal_misc_Unsafe,        getByteAcquire_name, getByte_signature,       F_R)  \
+  do_intrinsic(_getShortAcquire,         jdk_internal_misc_Unsafe,        getShortAcquire_name, getShort_signature,     F_R)  \
+  do_intrinsic(_getCharAcquire,          jdk_internal_misc_Unsafe,        getCharAcquire_name, getChar_signature,       F_R)  \
+  do_intrinsic(_getIntAcquire,           jdk_internal_misc_Unsafe,        getIntAcquire_name, getInt_signature,         F_R)  \
+  do_intrinsic(_getLongAcquire,          jdk_internal_misc_Unsafe,        getLongAcquire_name, getLong_signature,       F_R)  \
+  do_intrinsic(_getFloatAcquire,         jdk_internal_misc_Unsafe,        getFloatAcquire_name, getFloat_signature,     F_R)  \
+  do_intrinsic(_getDoubleAcquire,        jdk_internal_misc_Unsafe,        getDoubleAcquire_name, getDouble_signature,   F_R)  \
+  do_intrinsic(_putObjectRelease,        jdk_internal_misc_Unsafe,        putObjectRelease_name, putObject_signature,   F_R)  \
+  do_intrinsic(_putBooleanRelease,       jdk_internal_misc_Unsafe,        putBooleanRelease_name, putBoolean_signature, F_R)  \
+  do_intrinsic(_putByteRelease,          jdk_internal_misc_Unsafe,        putByteRelease_name, putByte_signature,       F_R)  \
+  do_intrinsic(_putShortRelease,         jdk_internal_misc_Unsafe,        putShortRelease_name, putShort_signature,     F_R)  \
+  do_intrinsic(_putCharRelease,          jdk_internal_misc_Unsafe,        putCharRelease_name, putChar_signature,       F_R)  \
+  do_intrinsic(_putIntRelease,           jdk_internal_misc_Unsafe,        putIntRelease_name, putInt_signature,         F_R)  \
+  do_intrinsic(_putLongRelease,          jdk_internal_misc_Unsafe,        putLongRelease_name, putLong_signature,       F_R)  \
+  do_intrinsic(_putFloatRelease,         jdk_internal_misc_Unsafe,        putFloatRelease_name, putFloat_signature,     F_R)  \
+  do_intrinsic(_putDoubleRelease,        jdk_internal_misc_Unsafe,        putDoubleRelease_name, putDouble_signature,   F_R)  \
+                                                                                                                        \
   do_name(getShortUnaligned_name,"getShortUnaligned")     do_name(putShortUnaligned_name,"putShortUnaligned")           \
   do_name(getCharUnaligned_name,"getCharUnaligned")       do_name(putCharUnaligned_name,"putCharUnaligned")             \
   do_name(getIntUnaligned_name,"getIntUnaligned")         do_name(putIntUnaligned_name,"putIntUnaligned")               \
@@ -1197,24 +1255,68 @@
   do_intrinsic(_putDouble_raw,            jdk_internal_misc_Unsafe,     putDouble_name, putDouble_raw_signature,       F_R)  \
   do_intrinsic(_putAddress_raw,           jdk_internal_misc_Unsafe,     putAddress_name, putAddress_raw_signature,     F_R)  \
                                                                                                                         \
-  do_intrinsic(_compareAndSwapObject,     jdk_internal_misc_Unsafe,     compareAndSwapObject_name, compareAndSwapObject_signature, F_R) \
-   do_name(     compareAndSwapObject_name,                              "compareAndSwapObject")                                \
-   do_signature(compareAndSwapObject_signature,                         "(Ljava/lang/Object;JLjava/lang/Object;Ljava/lang/Object;)Z")          \
-  do_intrinsic(_compareAndSwapLong,       jdk_internal_misc_Unsafe,     compareAndSwapLong_name, compareAndSwapLong_signature, F_R) \
-   do_name(     compareAndSwapLong_name,                                "compareAndSwapLong")                                  \
-   do_signature(compareAndSwapLong_signature,                           "(Ljava/lang/Object;JJJ)Z")                            \
-  do_intrinsic(_compareAndSwapInt,        jdk_internal_misc_Unsafe,     compareAndSwapInt_name, compareAndSwapInt_signature, F_R) \
-   do_name(     compareAndSwapInt_name,                                 "compareAndSwapInt")                                   \
-   do_signature(compareAndSwapInt_signature,                            "(Ljava/lang/Object;JII)Z")                            \
-  do_intrinsic(_putOrderedObject,         jdk_internal_misc_Unsafe,     putOrderedObject_name, putOrderedObject_signature, F_R) \
-   do_name(     putOrderedObject_name,                                  "putOrderedObject")                                    \
-   do_alias(    putOrderedObject_signature,                             /*(LObject;JLObject;)V*/ putObject_signature)           \
-  do_intrinsic(_putOrderedLong,           jdk_internal_misc_Unsafe,     putOrderedLong_name, putOrderedLong_signature, F_R)  \
-   do_name(     putOrderedLong_name,                                    "putOrderedLong")                                      \
-   do_alias(    putOrderedLong_signature,                               /*(Ljava/lang/Object;JJ)V*/ putLong_signature)          \
-  do_intrinsic(_putOrderedInt,            jdk_internal_misc_Unsafe,     putOrderedInt_name, putOrderedInt_signature,   F_R)  \
-   do_name(     putOrderedInt_name,                                     "putOrderedInt")                                       \
-   do_alias(    putOrderedInt_signature,                                 /*(Ljava/lang/Object;JI)V*/ putInt_signature)           \
+  do_signature(compareAndSwapObject_signature,     "(Ljava/lang/Object;JLjava/lang/Object;Ljava/lang/Object;)Z")        \
+  do_signature(compareAndExchangeObject_signature, "(Ljava/lang/Object;JLjava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;") \
+  do_signature(compareAndSwapLong_signature,       "(Ljava/lang/Object;JJJ)Z")                                          \
+  do_signature(compareAndExchangeLong_signature,   "(Ljava/lang/Object;JJJ)J")                                          \
+  do_signature(compareAndSwapInt_signature,        "(Ljava/lang/Object;JII)Z")                                          \
+  do_signature(compareAndExchangeInt_signature,    "(Ljava/lang/Object;JII)I")                                          \
+                                                                                                                        \
+  do_name(compareAndSwapObject_name,             "compareAndSwapObject")                                                \
+  do_name(compareAndExchangeObjectVolatile_name, "compareAndExchangeObjectVolatile")                                    \
+  do_name(compareAndExchangeObjectAcquire_name,  "compareAndExchangeObjectAcquire")                                     \
+  do_name(compareAndExchangeObjectRelease_name,  "compareAndExchangeObjectRelease")                                     \
+  do_name(compareAndSwapLong_name,               "compareAndSwapLong")                                                  \
+  do_name(compareAndExchangeLongVolatile_name,   "compareAndExchangeLongVolatile")                                      \
+  do_name(compareAndExchangeLongAcquire_name,    "compareAndExchangeLongAcquire")                                       \
+  do_name(compareAndExchangeLongRelease_name,    "compareAndExchangeLongRelease")                                       \
+  do_name(compareAndSwapInt_name,                "compareAndSwapInt")                                                   \
+  do_name(compareAndExchangeIntVolatile_name,    "compareAndExchangeIntVolatile")                                       \
+  do_name(compareAndExchangeIntAcquire_name,     "compareAndExchangeIntAcquire")                                        \
+  do_name(compareAndExchangeIntRelease_name,     "compareAndExchangeIntRelease")                                        \
+                                                                                                                        \
+  do_name(weakCompareAndSwapObject_name,         "weakCompareAndSwapObject")                                            \
+  do_name(weakCompareAndSwapObjectAcquire_name,  "weakCompareAndSwapObjectAcquire")                                     \
+  do_name(weakCompareAndSwapObjectRelease_name,  "weakCompareAndSwapObjectRelease")                                     \
+  do_name(weakCompareAndSwapLong_name,           "weakCompareAndSwapLong")                                              \
+  do_name(weakCompareAndSwapLongAcquire_name,    "weakCompareAndSwapLongAcquire")                                       \
+  do_name(weakCompareAndSwapLongRelease_name,    "weakCompareAndSwapLongRelease")                                       \
+  do_name(weakCompareAndSwapInt_name,            "weakCompareAndSwapInt")                                               \
+  do_name(weakCompareAndSwapIntAcquire_name,     "weakCompareAndSwapIntAcquire")                                        \
+  do_name(weakCompareAndSwapIntRelease_name,     "weakCompareAndSwapIntRelease")                                        \
+                                                                                                                        \
+  do_intrinsic(_compareAndSwapObject,             jdk_internal_misc_Unsafe,  compareAndSwapObject_name,             compareAndSwapObject_signature,     F_RN) \
+  do_intrinsic(_compareAndExchangeObjectVolatile, jdk_internal_misc_Unsafe,  compareAndExchangeObjectVolatile_name, compareAndExchangeObject_signature, F_RN) \
+  do_intrinsic(_compareAndExchangeObjectAcquire,  jdk_internal_misc_Unsafe,  compareAndExchangeObjectAcquire_name,  compareAndExchangeObject_signature, F_R)  \
+  do_intrinsic(_compareAndExchangeObjectRelease,  jdk_internal_misc_Unsafe,  compareAndExchangeObjectRelease_name,  compareAndExchangeObject_signature, F_R)  \
+  do_intrinsic(_compareAndSwapLong,               jdk_internal_misc_Unsafe,  compareAndSwapLong_name,               compareAndSwapLong_signature,       F_RN) \
+  do_intrinsic(_compareAndExchangeLongVolatile,   jdk_internal_misc_Unsafe,  compareAndExchangeLongVolatile_name,   compareAndExchangeLong_signature,   F_RN) \
+  do_intrinsic(_compareAndExchangeLongAcquire,    jdk_internal_misc_Unsafe,  compareAndExchangeLongAcquire_name,    compareAndExchangeLong_signature,   F_R)  \
+  do_intrinsic(_compareAndExchangeLongRelease,    jdk_internal_misc_Unsafe,  compareAndExchangeLongRelease_name,    compareAndExchangeLong_signature,   F_R)  \
+  do_intrinsic(_compareAndSwapInt,                jdk_internal_misc_Unsafe,  compareAndSwapInt_name,                compareAndSwapInt_signature,        F_RN) \
+  do_intrinsic(_compareAndExchangeIntVolatile,    jdk_internal_misc_Unsafe,  compareAndExchangeIntVolatile_name,    compareAndExchangeInt_signature,    F_RN) \
+  do_intrinsic(_compareAndExchangeIntAcquire,     jdk_internal_misc_Unsafe,  compareAndExchangeIntAcquire_name,     compareAndExchangeInt_signature,    F_R)  \
+  do_intrinsic(_compareAndExchangeIntRelease,     jdk_internal_misc_Unsafe,  compareAndExchangeIntRelease_name,     compareAndExchangeInt_signature,    F_R)  \
+                                                                                                                                                              \
+  do_intrinsic(_weakCompareAndSwapObject,         jdk_internal_misc_Unsafe,  weakCompareAndSwapObject_name,         compareAndSwapObject_signature,     F_R) \
+  do_intrinsic(_weakCompareAndSwapObjectAcquire,  jdk_internal_misc_Unsafe,  weakCompareAndSwapObjectAcquire_name,  compareAndSwapObject_signature,     F_R) \
+  do_intrinsic(_weakCompareAndSwapObjectRelease,  jdk_internal_misc_Unsafe,  weakCompareAndSwapObjectRelease_name,  compareAndSwapObject_signature,     F_R) \
+  do_intrinsic(_weakCompareAndSwapLong,           jdk_internal_misc_Unsafe,  weakCompareAndSwapLong_name,           compareAndSwapLong_signature,       F_R) \
+  do_intrinsic(_weakCompareAndSwapLongAcquire,    jdk_internal_misc_Unsafe,  weakCompareAndSwapLongAcquire_name,    compareAndSwapLong_signature,       F_R) \
+  do_intrinsic(_weakCompareAndSwapLongRelease,    jdk_internal_misc_Unsafe,  weakCompareAndSwapLongRelease_name,    compareAndSwapLong_signature,       F_R) \
+  do_intrinsic(_weakCompareAndSwapInt,            jdk_internal_misc_Unsafe,  weakCompareAndSwapInt_name,            compareAndSwapInt_signature,        F_R) \
+  do_intrinsic(_weakCompareAndSwapIntAcquire,     jdk_internal_misc_Unsafe,  weakCompareAndSwapIntAcquire_name,     compareAndSwapInt_signature,        F_R) \
+  do_intrinsic(_weakCompareAndSwapIntRelease,     jdk_internal_misc_Unsafe,  weakCompareAndSwapIntRelease_name,     compareAndSwapInt_signature,        F_R) \
+                                                                                                                        \
+  do_intrinsic(_putOrderedObject,         jdk_internal_misc_Unsafe,        putOrderedObject_name, putOrderedObject_signature, F_RN) \
+   do_name(     putOrderedObject_name,                           "putOrderedObject")                                    \
+   do_alias(    putOrderedObject_signature,                     /*(LObject;JLObject;)V*/ putObject_signature)           \
+  do_intrinsic(_putOrderedLong,           jdk_internal_misc_Unsafe,        putOrderedLong_name, putOrderedLong_signature, F_RN)  \
+   do_name(     putOrderedLong_name,                             "putOrderedLong")                                      \
+   do_alias(    putOrderedLong_signature,                       /*(Ljava/lang/Object;JJ)V*/ putLong_signature)          \
+  do_intrinsic(_putOrderedInt,            jdk_internal_misc_Unsafe,        putOrderedInt_name, putOrderedInt_signature,   F_RN)  \
+   do_name(     putOrderedInt_name,                              "putOrderedInt")                                       \
+   do_alias(    putOrderedInt_signature,                        /*(Ljava/lang/Object;JI)V*/ putInt_signature)           \
                                                                                                                         \
   do_intrinsic(_getAndAddInt,             jdk_internal_misc_Unsafe,     getAndAddInt_name, getAndAddInt_signature, F_R)       \
    do_name(     getAndAddInt_name,                                      "getAndAddInt")                                       \
--- a/src/share/vm/code/codeCache.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/code/codeCache.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1023,7 +1023,7 @@
 // Keeps track of time spent for checking dependencies
 NOT_PRODUCT(static elapsedTimer dependentCheckTime;)
 
-int CodeCache::mark_for_deoptimization(DepChange& changes) {
+int CodeCache::mark_for_deoptimization(KlassDepChange& changes) {
   MutexLockerEx mu(CodeCache_lock, Mutex::_no_safepoint_check_flag);
   int number_of_marked_CodeBlobs = 0;
 
--- a/src/share/vm/code/codeCache.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/code/codeCache.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -72,7 +72,7 @@
 // Solaris and BSD.
 
 class OopClosure;
-class DepChange;
+class KlassDepChange;
 
 class CodeCache : AllStatic {
   friend class VMStructs;
@@ -223,7 +223,7 @@
 
   // Deoptimization
  private:
-  static int  mark_for_deoptimization(DepChange& changes);
+  static int  mark_for_deoptimization(KlassDepChange& changes);
 #ifdef HOTSWAP
   static int  mark_for_evol_deoptimization(instanceKlassHandle dependee);
 #endif // HOTSWAP
--- a/src/share/vm/code/dependencies.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/code/dependencies.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -664,6 +664,8 @@
   virtual bool is_klass_change()     const { return false; }
   virtual bool is_call_site_change() const { return false; }
 
+  virtual void mark_for_deoptimization(nmethod* nm) = 0;
+
   // Subclass casting with assertions.
   KlassDepChange*    as_klass_change() {
     assert(is_klass_change(), "bad cast");
@@ -753,6 +755,10 @@
   // What kind of DepChange is this?
   virtual bool is_klass_change() const { return true; }
 
+  virtual void mark_for_deoptimization(nmethod* nm) {
+    nm->mark_for_deoptimization(/*inc_recompile_counts=*/true);
+  }
+
   Klass* new_type() { return _new_type(); }
 
   // involves_context(k) is true if k is new_type or any of the super types
@@ -772,6 +778,10 @@
   // What kind of DepChange is this?
   virtual bool is_call_site_change() const { return true; }
 
+  virtual void mark_for_deoptimization(nmethod* nm) {
+    nm->mark_for_deoptimization(/*inc_recompile_counts=*/false);
+  }
+
   oop call_site()     const { return _call_site();     }
   oop method_handle() const { return _method_handle(); }
 };
--- a/src/share/vm/code/dependencyContext.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/code/dependencyContext.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -73,7 +73,7 @@
         nm->print();
         nm->print_dependencies();
       }
-      nm->mark_for_deoptimization();
+      changes.mark_for_deoptimization(nm);
       found++;
     }
   }
--- a/src/share/vm/code/nmethod.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/code/nmethod.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -536,7 +536,7 @@
   _has_method_handle_invokes  = 0;
   _lazy_critical_native       = 0;
   _has_wide_vectors           = 0;
-  _marked_for_deoptimization  = 0;
+  _mark_for_deoptimization_status = not_marked;
   _lock_count                 = 0;
   _stack_traversal_mark       = 0;
   _unload_reported            = false; // jvmti state
@@ -1459,7 +1459,7 @@
                   SharedRuntime::get_handle_wrong_method_stub());
     }
 
-    if (is_in_use()) {
+    if (is_in_use() && update_recompile_counts()) {
       // It's a true state change, so mark the method as decompiled.
       // Do it only for transition from alive.
       inc_decompile_count();
--- a/src/share/vm/code/nmethod.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/code/nmethod.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -107,6 +107,7 @@
 //  [Implicit Null Pointer exception table]
 //  - implicit null table array
 
+class DepChange;
 class Dependencies;
 class ExceptionHandlerTable;
 class ImplicitExceptionTable;
@@ -188,7 +189,13 @@
   bool _has_flushed_dependencies;            // Used for maintenance of dependencies (CodeCache_lock)
 
   bool _marked_for_reclamation;              // Used by NMethodSweeper (set only by sweeper)
-  bool _marked_for_deoptimization;           // Used for stack deoptimization
+
+  enum MarkForDeoptimizationStatus {
+    not_marked,
+    deoptimize,
+    deoptimize_noupdate };
+
+  MarkForDeoptimizationStatus _mark_for_deoptimization_status; // Used for stack deoptimization
 
   // used by jvmti to track if an unload event has been posted for this nmethod.
   bool _unload_reported;
@@ -462,8 +469,16 @@
   void set_unloading_clock(unsigned char unloading_clock);
   unsigned char unloading_clock();
 
-  bool  is_marked_for_deoptimization() const      { return _marked_for_deoptimization; }
-  void  mark_for_deoptimization()                 { _marked_for_deoptimization = true; }
+  bool  is_marked_for_deoptimization() const      { return _mark_for_deoptimization_status != not_marked; }
+  void  mark_for_deoptimization(bool inc_recompile_counts = true) {
+    _mark_for_deoptimization_status = (inc_recompile_counts ? deoptimize : deoptimize_noupdate);
+  }
+  bool update_recompile_counts() const {
+    // Update recompile counts when either the update is explicitly requested (deoptimize)
+    // or the nmethod is not marked for deoptimization at all (not_marked).
+    // The latter happens during uncommon traps when deoptimized nmethod is made not entrant.
+    return _mark_for_deoptimization_status != deoptimize_noupdate;
+  }
 
   void  make_unloaded(BoolObjectClosure* is_alive, oop cause);
 
--- a/src/share/vm/code/relocInfo.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/code/relocInfo.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -457,49 +457,6 @@
   return itr._rh;
 }
 
-int32_t Relocation::runtime_address_to_index(address runtime_address) {
-  assert(!is_reloc_index((intptr_t)runtime_address), "must not look like an index");
-
-  if (runtime_address == NULL)  return 0;
-
-  StubCodeDesc* p = StubCodeDesc::desc_for(runtime_address);
-  if (p != NULL && p->begin() == runtime_address) {
-    assert(is_reloc_index(p->index()), "there must not be too many stubs");
-    return (int32_t)p->index();
-  } else {
-    // Known "miscellaneous" non-stub pointers:
-    // os::get_polling_page(), SafepointSynchronize::address_of_state()
-    if (PrintRelocations) {
-      tty->print_cr("random unregistered address in relocInfo: " INTPTR_FORMAT, p2i(runtime_address));
-    }
-#ifndef _LP64
-    return (int32_t) (intptr_t)runtime_address;
-#else
-    // didn't fit return non-index
-    return -1;
-#endif /* _LP64 */
-  }
-}
-
-
-address Relocation::index_to_runtime_address(int32_t index) {
-  if (index == 0)  return NULL;
-
-  if (is_reloc_index(index)) {
-    StubCodeDesc* p = StubCodeDesc::desc_for_index(index);
-    assert(p != NULL, "there must be a stub for this index");
-    return p->begin();
-  } else {
-#ifndef _LP64
-    // this only works on 32bit machines
-    return (address) ((intptr_t) index);
-#else
-    fatal("Relocation::index_to_runtime_address, int32_t not pointer sized");
-    return NULL;
-#endif /* _LP64 */
-  }
-}
-
 address Relocation::old_addr_for(address newa,
                                  const CodeBuffer* src, CodeBuffer* dest) {
   int sect = dest->section_index_of(newa);
@@ -623,20 +580,13 @@
 
 void external_word_Relocation::pack_data_to(CodeSection* dest) {
   short* p = (short*) dest->locs_end();
-  int32_t index = runtime_address_to_index(_target);
 #ifndef _LP64
-  p = pack_1_int_to(p, index);
+  p = pack_1_int_to(p, (int32_t) (intptr_t)_target);
 #else
-  if (is_reloc_index(index)) {
-    p = pack_2_ints_to(p, index, 0);
-  } else {
-    jlong t = (jlong) _target;
-    int32_t lo = low(t);
-    int32_t hi = high(t);
-    p = pack_2_ints_to(p, lo, hi);
-    DEBUG_ONLY(jlong t1 = jlong_from(hi, lo));
-    assert(!is_reloc_index(t1) && (address) t1 == _target, "not symmetric");
-  }
+  jlong t = (jlong) _target;
+  int32_t lo = low(t);
+  int32_t hi = high(t);
+  p = pack_2_ints_to(p, lo, hi);
 #endif /* _LP64 */
   dest->set_locs_end((relocInfo*) p);
 }
@@ -644,16 +594,12 @@
 
 void external_word_Relocation::unpack_data() {
 #ifndef _LP64
-  _target = index_to_runtime_address(unpack_1_int());
+  _target = (address) (intptr_t)unpack_1_int();
 #else
   int32_t lo, hi;
   unpack_2_ints(lo, hi);
   jlong t = jlong_from(hi, lo);;
-  if (is_reloc_index(t)) {
-    _target = index_to_runtime_address(t);
-  } else {
-    _target = (address) t;
-  }
+  _target = (address) t;
 #endif /* _LP64 */
 }
 
--- a/src/share/vm/code/relocInfo.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/code/relocInfo.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -707,10 +707,6 @@
     assert(datalen()==0 || type()==relocInfo::none, "no data here");
   }
 
-  static bool is_reloc_index(intptr_t index) {
-    return 0 < index && index < os::vm_page_size();
-  }
-
  protected:
   // Helper functions for pack_data_to() and unpack_data().
 
@@ -806,10 +802,6 @@
     return base + byte_offset;
   }
 
-  // these convert between indexes and addresses in the runtime system
-  static int32_t runtime_address_to_index(address runtime_address);
-  static address index_to_runtime_address(int32_t index);
-
   // helpers for mapping between old and new addresses after a move or resize
   address old_addr_for(address newa, const CodeBuffer* src, CodeBuffer* dest);
   address new_addr_for(address olda, const CodeBuffer* src, CodeBuffer* dest);
@@ -1253,7 +1245,8 @@
   // Some address looking values aren't safe to treat as relocations
   // and should just be treated as constants.
   static bool can_be_relocated(address target) {
-    return target != NULL && !is_reloc_index((intptr_t)target);
+    assert(target == NULL || (uintptr_t)target >= (uintptr_t)os::vm_page_size(), INTPTR_FORMAT, (intptr_t)target);
+    return target != NULL;
   }
 
  private:
--- a/src/share/vm/compiler/compileBroker.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/compiler/compileBroker.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -469,7 +469,6 @@
 void CompileBroker::print_compile_queues(outputStream* st) {
   st->print_cr("Current compiles: ");
   MutexLocker locker(MethodCompileQueue_lock);
-  MutexLocker locker2(Threads_lock);
 
   char buf[2000];
   int buflen = sizeof(buf);
@@ -2152,18 +2151,33 @@
 
     if (CITime) {
       int bytes_compiled = method->code_size() + task->num_inlined_bytecodes();
-      JVMCI_ONLY(CompilerStatistics* stats = compiler(task->comp_level())->stats();)
       if (is_osr) {
         _t_osr_compilation.add(time);
         _sum_osr_bytes_compiled += bytes_compiled;
-        JVMCI_ONLY(stats->_osr.update(time, bytes_compiled);)
       } else {
         _t_standard_compilation.add(time);
         _sum_standard_bytes_compiled += method->code_size() + task->num_inlined_bytecodes();
-        JVMCI_ONLY(stats->_standard.update(time, bytes_compiled);)
       }
-      JVMCI_ONLY(stats->_nmethods_size += code->total_size();)
-      JVMCI_ONLY(stats->_nmethods_code_size += code->insts_size();)
+
+#if INCLUDE_JVMCI
+      AbstractCompiler* comp = compiler(task->comp_level());
+      if (comp) {
+        CompilerStatistics* stats = comp->stats();
+        if (stats) {
+          if (is_osr) {
+            stats->_osr.update(time, bytes_compiled);
+          } else {
+            stats->_standard.update(time, bytes_compiled);
+          }
+          stats->_nmethods_size += code->total_size();
+          stats->_nmethods_code_size += code->insts_size();
+        } else { // if (!stats)
+          assert(false, "Compiler statistics object must exist");
+        }
+      } else { // if (!comp)
+        assert(false, "Compiler object must exist");
+      }
+#endif // INCLUDE_JVMCI
     }
 
     if (UsePerfData) {
@@ -2222,11 +2236,15 @@
 #if INCLUDE_JVMCI
 void CompileBroker::print_times(AbstractCompiler* comp) {
   CompilerStatistics* stats = comp->stats();
-  tty->print_cr("  %s {speed: %d bytes/s; standard: %6.3f s, %d bytes, %d methods; osr: %6.3f s, %d bytes, %d methods; nmethods_size: %d bytes; nmethods_code_size: %d bytes}",
+  if (stats) {
+    tty->print_cr("  %s {speed: %d bytes/s; standard: %6.3f s, %d bytes, %d methods; osr: %6.3f s, %d bytes, %d methods; nmethods_size: %d bytes; nmethods_code_size: %d bytes}",
                 comp->name(), stats->bytes_per_second(),
                 stats->_standard._time.seconds(), stats->_standard._bytes, stats->_standard._count,
                 stats->_osr._time.seconds(), stats->_osr._bytes, stats->_osr._count,
                 stats->_nmethods_size, stats->_nmethods_code_size);
+  } else { // if (!stats)
+    assert(false, "Compiler statistics object must exist");
+  }
   comp->print_timers();
 }
 #endif // INCLUDE_JVMCI
@@ -2260,17 +2278,21 @@
       }
       CompilerStatistics* stats = comp->stats();
 
-      standard_compilation.add(stats->_standard._time);
-      osr_compilation.add(stats->_osr._time);
+      if (stats) {
+        standard_compilation.add(stats->_standard._time);
+        osr_compilation.add(stats->_osr._time);
 
-      standard_bytes_compiled += stats->_standard._bytes;
-      osr_bytes_compiled += stats->_osr._bytes;
+        standard_bytes_compiled += stats->_standard._bytes;
+        osr_bytes_compiled += stats->_osr._bytes;
 
-      standard_compile_count += stats->_standard._count;
-      osr_compile_count += stats->_osr._count;
+        standard_compile_count += stats->_standard._count;
+        osr_compile_count += stats->_osr._count;
 
-      nmethods_size += stats->_nmethods_size;
-      nmethods_code_size += stats->_nmethods_code_size;
+        nmethods_size += stats->_nmethods_size;
+        nmethods_code_size += stats->_nmethods_code_size;
+      } else { // if (!stats)
+        assert(false, "Compiler statistics object must exist");
+      }
 
       if (per_compiler) {
         print_times(comp);
--- a/src/share/vm/gc/cms/parNewGeneration.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/cms/parNewGeneration.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -615,7 +615,7 @@
   : DefNewGeneration(rs, initial_byte_size, "PCopy"),
   _overflow_list(NULL),
   _is_alive_closure(this),
-  _plab_stats(YoungPLABSize, PLABWeight)
+  _plab_stats("Young", YoungPLABSize, PLABWeight)
 {
   NOT_PRODUCT(_overflow_counter = ParGCWorkQueueOverflowInterval;)
   NOT_PRODUCT(_num_par_pushes = 0;)
@@ -1008,9 +1008,7 @@
   from()->set_concurrent_iteration_safe_limit(from()->top());
   to()->set_concurrent_iteration_safe_limit(to()->top());
 
-  if (ResizePLAB) {
-    plab_stats()->adjust_desired_plab_sz();
-  }
+  plab_stats()->adjust_desired_plab_sz();
 
   TASKQUEUE_STATS_ONLY(thread_state_set.print_termination_stats());
   TASKQUEUE_STATS_ONLY(thread_state_set.print_taskqueue_stats());
--- a/src/share/vm/gc/g1/concurrentG1Refine.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/concurrentG1Refine.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,19 +36,19 @@
 {
   // Ergonomically select initial concurrent refinement parameters
   if (FLAG_IS_DEFAULT(G1ConcRefinementGreenZone)) {
-    FLAG_SET_DEFAULT(G1ConcRefinementGreenZone, (intx)ParallelGCThreads);
+    FLAG_SET_DEFAULT(G1ConcRefinementGreenZone, ParallelGCThreads);
   }
   set_green_zone(G1ConcRefinementGreenZone);
 
   if (FLAG_IS_DEFAULT(G1ConcRefinementYellowZone)) {
     FLAG_SET_DEFAULT(G1ConcRefinementYellowZone, green_zone() * 3);
   }
-  set_yellow_zone(MAX2<int>(G1ConcRefinementYellowZone, green_zone()));
+  set_yellow_zone(MAX2(G1ConcRefinementYellowZone, green_zone()));
 
   if (FLAG_IS_DEFAULT(G1ConcRefinementRedZone)) {
     FLAG_SET_DEFAULT(G1ConcRefinementRedZone, yellow_zone() * 2);
   }
-  set_red_zone(MAX2<int>(G1ConcRefinementRedZone, yellow_zone()));
+  set_red_zone(MAX2(G1ConcRefinementRedZone, yellow_zone()));
 }
 
 ConcurrentG1Refine* ConcurrentG1Refine::create(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure, jint* ecode) {
--- a/src/share/vm/gc/g1/concurrentG1Refine.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/concurrentG1Refine.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -61,11 +61,11 @@
   * 2) green = 0. Means no caching. Can be a good way to minimize the
   *    amount of time spent updating rsets during a collection.
   */
-  int _green_zone;
-  int _yellow_zone;
-  int _red_zone;
+  size_t _green_zone;
+  size_t _yellow_zone;
+  size_t _red_zone;
 
-  int _thread_threshold_step;
+  size_t _thread_threshold_step;
 
   // We delay the refinement of 'hot' cards using the hot card cache.
   G1HotCardCache _hot_card_cache;
@@ -100,17 +100,17 @@
 
   void print_worker_threads_on(outputStream* st) const;
 
-  void set_green_zone(int x)  { _green_zone = x;  }
-  void set_yellow_zone(int x) { _yellow_zone = x; }
-  void set_red_zone(int x)    { _red_zone = x;    }
+  void set_green_zone(size_t x)  { _green_zone = x;  }
+  void set_yellow_zone(size_t x) { _yellow_zone = x; }
+  void set_red_zone(size_t x)    { _red_zone = x;    }
 
-  int green_zone() const      { return _green_zone;  }
-  int yellow_zone() const     { return _yellow_zone; }
-  int red_zone() const        { return _red_zone;    }
+  size_t green_zone() const      { return _green_zone;  }
+  size_t yellow_zone() const     { return _yellow_zone; }
+  size_t red_zone() const        { return _red_zone;    }
 
   uint worker_thread_num() const { return _n_worker_threads; }
 
-  int thread_threshold_step() const { return _thread_threshold_step; }
+  size_t thread_threshold_step() const { return _thread_threshold_step; }
 
   G1HotCardCache* hot_card_cache() { return &_hot_card_cache; }
 
--- a/src/share/vm/gc/g1/concurrentG1RefineThread.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/concurrentG1RefineThread.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -67,10 +67,12 @@
 
 void ConcurrentG1RefineThread::initialize() {
   // Current thread activation threshold
-  _threshold = MIN2<int>(cg1r()->thread_threshold_step() * (_worker_id + 1) + cg1r()->green_zone(),
-                         cg1r()->yellow_zone());
+  _threshold = MIN2(cg1r()->thread_threshold_step() * (_worker_id + 1) + cg1r()->green_zone(),
+                    cg1r()->yellow_zone());
   // A thread deactivates once the number of buffer reached a deactivation threshold
-  _deactivation_threshold = MAX2<int>(_threshold - cg1r()->thread_threshold_step(), cg1r()->green_zone());
+   _deactivation_threshold =
+     MAX2(_threshold - MIN2(_threshold, cg1r()->thread_threshold_step()),
+          cg1r()->green_zone());
 }
 
 void ConcurrentG1RefineThread::wait_for_completed_buffers() {
@@ -127,14 +129,14 @@
     }
 
     DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
-    log_debug(gc, refine)("Activated %d, on threshold: %d, current: %d",
+    log_debug(gc, refine)("Activated %d, on threshold: " SIZE_FORMAT ", current: " SIZE_FORMAT,
                           _worker_id, _threshold, dcqs.completed_buffers_num());
 
     {
       SuspendibleThreadSetJoiner sts_join;
 
       do {
-        int curr_buffer_num = (int)dcqs.completed_buffers_num();
+        size_t curr_buffer_num = dcqs.completed_buffers_num();
         // If the number of the buffers falls down into the yellow zone,
         // that means that the transition period after the evacuation pause has ended.
         if (dcqs.completed_queue_padding() > 0 && curr_buffer_num <= cg1r()->yellow_zone()) {
@@ -151,7 +153,7 @@
                                                       false /* during_pause */));
 
       deactivate();
-      log_debug(gc, refine)("Deactivated %d, off threshold: %d, current: %d",
+      log_debug(gc, refine)("Deactivated %d, off threshold: " SIZE_FORMAT ", current: " SIZE_FORMAT,
                             _worker_id, _deactivation_threshold,
                             dcqs.completed_buffers_num());
     }
--- a/src/share/vm/gc/g1/concurrentG1RefineThread.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/concurrentG1RefineThread.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -53,11 +53,11 @@
   // The closure applied to completed log buffers.
   CardTableEntryClosure* _refine_closure;
 
-  int _thread_threshold_step;
+  size_t _thread_threshold_step;
   // This thread activation threshold
-  int _threshold;
+  size_t _threshold;
   // This thread deactivation threshold
-  int _deactivation_threshold;
+  size_t _deactivation_threshold;
 
   void wait_for_completed_buffers();
 
--- a/src/share/vm/gc/g1/dirtyCardQueue.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/dirtyCardQueue.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -207,22 +207,24 @@
 }
 
 
-BufferNode* DirtyCardQueueSet::get_completed_buffer(int stop_at) {
+BufferNode* DirtyCardQueueSet::get_completed_buffer(size_t stop_at) {
   BufferNode* nd = NULL;
   MutexLockerEx x(_cbl_mon, Mutex::_no_safepoint_check_flag);
 
-  if ((int)_n_completed_buffers <= stop_at) {
+  if (_n_completed_buffers <= stop_at) {
     _process_completed = false;
     return NULL;
   }
 
   if (_completed_buffers_head != NULL) {
     nd = _completed_buffers_head;
+    assert(_n_completed_buffers > 0, "Invariant");
     _completed_buffers_head = nd->next();
-    if (_completed_buffers_head == NULL)
+    _n_completed_buffers--;
+    if (_completed_buffers_head == NULL) {
+      assert(_n_completed_buffers == 0, "Invariant");
       _completed_buffers_tail = NULL;
-    _n_completed_buffers--;
-    assert(_n_completed_buffers >= 0, "Invariant");
+    }
   }
   DEBUG_ONLY(assert_completed_buffer_list_len_correct_locked());
   return nd;
@@ -230,7 +232,7 @@
 
 bool DirtyCardQueueSet::apply_closure_to_completed_buffer(CardTableEntryClosure* cl,
                                                           uint worker_i,
-                                                          int stop_at,
+                                                          size_t stop_at,
                                                           bool during_pause) {
   assert(!during_pause || stop_at == 0, "Should not leave any completed buffers during a pause");
   BufferNode* nd = get_completed_buffer(stop_at);
--- a/src/share/vm/gc/g1/dirtyCardQueue.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/dirtyCardQueue.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -134,10 +134,10 @@
   // is returned to the completed buffer set, and this call returns false.
   bool apply_closure_to_completed_buffer(CardTableEntryClosure* cl,
                                          uint worker_i,
-                                         int stop_at,
+                                         size_t stop_at,
                                          bool during_pause);
 
-  BufferNode* get_completed_buffer(int stop_at);
+  BufferNode* get_completed_buffer(size_t stop_at);
 
   // Applies the current closure to all completed buffers,
   // non-consumptively.
--- a/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1400,7 +1400,6 @@
       JavaThread::dirty_card_queue_set().abandon_logs();
       assert(dirty_card_queue_set().completed_buffers_num() == 0, "DCQS should be empty");
 
-      _young_list->reset_sampled_info();
       // At this point there should be no regions in the
       // entire heap tagged as young.
       assert(check_young_list_empty(true /* check_heap */),
@@ -1761,8 +1760,8 @@
   _young_list(new YoungList(this)),
   _gc_time_stamp(0),
   _summary_bytes_used(0),
-  _survivor_evac_stats(YoungPLABSize, PLABWeight),
-  _old_evac_stats(OldPLABSize, PLABWeight),
+  _survivor_evac_stats("Young", YoungPLABSize, PLABWeight),
+  _old_evac_stats("Old", OldPLABSize, PLABWeight),
   _expand_heap_after_alloc_failure(true),
   _old_marking_cycles_started(0),
   _old_marking_cycles_completed(0),
@@ -1985,8 +1984,8 @@
   JavaThread::dirty_card_queue_set().initialize(_refine_cte_cl,
                                                 DirtyCardQ_CBL_mon,
                                                 DirtyCardQ_FL_lock,
-                                                concurrent_g1_refine()->yellow_zone(),
-                                                concurrent_g1_refine()->red_zone(),
+                                                (int)concurrent_g1_refine()->yellow_zone(),
+                                                (int)concurrent_g1_refine()->red_zone(),
                                                 Shared_DirtyCardQ_lock,
                                                 NULL,  // fl_owner
                                                 true); // init_free_ids
@@ -3385,13 +3384,15 @@
 
         g1_policy()->clear_collection_set();
 
+        record_obj_copy_mem_stats();
+        _survivor_evac_stats.adjust_desired_plab_sz();
+        _old_evac_stats.adjust_desired_plab_sz();
+
         // Start a new incremental collection set for the next pause.
         g1_policy()->start_incremental_cset_building();
 
         clear_cset_fast_test();
 
-        _young_list->reset_sampled_info();
-
         // Don't check the whole heap at this point as the
         // GC alloc regions from this pause have been tagged
         // as survivors and moved on to the survivor list.
@@ -4398,6 +4399,8 @@
   { }
 
   void work(uint worker_id) {
+    G1GCParPhaseTimesTracker x(_g1h->g1_policy()->phase_times(), G1GCPhaseTimes::PreserveCMReferents, worker_id);
+
     ResourceMark rm;
     HandleMark   hm;
 
@@ -4461,13 +4464,8 @@
   g1_policy()->phase_times()->record_ref_proc_time(ref_proc_time * 1000.0);
 }
 
-// Weak Reference processing during an evacuation pause (part 1).
-void G1CollectedHeap::process_discovered_references(G1ParScanThreadStateSet* per_thread_states) {
-  double ref_proc_start = os::elapsedTime();
-
-  ReferenceProcessor* rp = _ref_processor_stw;
-  assert(rp->discovery_enabled(), "should have been enabled");
-
+void G1CollectedHeap::preserve_cm_referents(G1ParScanThreadStateSet* per_thread_states) {
+  double preserve_cm_referents_start = os::elapsedTime();
   // Any reference objects, in the collection set, that were 'discovered'
   // by the CM ref processor should have already been copied (either by
   // applying the external root copy closure to the discovered lists, or
@@ -4495,9 +4493,18 @@
                                                  per_thread_states,
                                                  no_of_gc_workers,
                                                  _task_queues);
-
   workers()->run_task(&keep_cm_referents);
 
+  g1_policy()->phase_times()->record_preserve_cm_referents_time_ms((os::elapsedTime() - preserve_cm_referents_start) * 1000.0);
+}
+
+// Weak Reference processing during an evacuation pause (part 1).
+void G1CollectedHeap::process_discovered_references(G1ParScanThreadStateSet* per_thread_states) {
+  double ref_proc_start = os::elapsedTime();
+
+  ReferenceProcessor* rp = _ref_processor_stw;
+  assert(rp->discovery_enabled(), "should have been enabled");
+
   // Closure to test whether a referent is alive.
   G1STWIsAliveClosure is_alive(this);
 
@@ -4529,6 +4536,8 @@
                                               NULL,
                                               _gc_timer_stw);
   } else {
+    uint no_of_gc_workers = workers()->active_workers();
+
     // Parallel reference processing
     assert(rp->num_q() == no_of_gc_workers, "sanity");
     assert(no_of_gc_workers <= rp->max_num_q(), "sanity");
@@ -4586,6 +4595,12 @@
   g1_policy()->phase_times()->record_ref_enq_time(ref_enq_time * 1000.0);
 }
 
+void G1CollectedHeap::merge_per_thread_state_info(G1ParScanThreadStateSet* per_thread_states) {
+  double merge_pss_time_start = os::elapsedTime();
+  per_thread_states->flush();
+  g1_policy()->phase_times()->record_merge_pss_time_ms((os::elapsedTime() - merge_pss_time_start) * 1000.0);
+}
+
 void G1CollectedHeap::pre_evacuate_collection_set() {
   _expand_heap_after_alloc_failure = true;
   _evacuation_failed = false;
@@ -4644,6 +4659,7 @@
   // objects (and their reachable sub-graphs) that were
   // not copied during the pause.
   if (g1_policy()->should_process_references()) {
+    preserve_cm_referents(per_thread_states);
     process_discovered_references(per_thread_states);
   } else {
     ref_processor_stw()->verify_no_references_recorded();
@@ -4687,12 +4703,7 @@
 
   _allocator->release_gc_alloc_regions(evacuation_info);
 
-  per_thread_states->flush();
-
-  record_obj_copy_mem_stats();
-
-  _survivor_evac_stats.adjust_desired_plab_sz();
-  _old_evac_stats.adjust_desired_plab_sz();
+  merge_per_thread_state_info(per_thread_states);
 
   // Reset and re-enable the hot card cache.
   // Note the counts for the cards in the regions in the
@@ -5188,8 +5199,8 @@
   bool success() { return _success; }
 };
 
-bool G1CollectedHeap::check_young_list_empty(bool check_heap, bool check_sample) {
-  bool ret = _young_list->check_list_empty(check_sample);
+bool G1CollectedHeap::check_young_list_empty(bool check_heap) {
+  bool ret = _young_list->check_list_empty();
 
   if (check_heap) {
     NoYoungRegionsClosure closure;
--- a/src/share/vm/gc/g1/g1CollectedHeap.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/g1CollectedHeap.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -511,6 +511,9 @@
   // allocated block, or else "NULL".
   HeapWord* expand_and_allocate(size_t word_size, AllocationContext_t context);
 
+  // Preserve any referents discovered by concurrent marking that have not yet been
+  // copied by the STW pause.
+  void preserve_cm_referents(G1ParScanThreadStateSet* per_thread_states);
   // Process any reference objects discovered during
   // an incremental evacuation pause.
   void process_discovered_references(G1ParScanThreadStateSet* per_thread_states);
@@ -519,6 +522,9 @@
   // after processing.
   void enqueue_discovered_references(G1ParScanThreadStateSet* per_thread_states);
 
+  // Merges the information gathered on a per-thread basis for all worker threads
+  // during GC into global variables.
+  void merge_per_thread_state_info(G1ParScanThreadStateSet* per_thread_states);
 public:
   WorkGang* workers() const { return _workers; }
 
@@ -1333,8 +1339,7 @@
     return _young_list->check_list_well_formed();
   }
 
-  bool check_young_list_empty(bool check_heap,
-                              bool check_sample = true);
+  bool check_young_list_empty(bool check_heap);
 
   // *** Stuff related to concurrent marking.  It's not clear to me that so
   // many of these need to be public.
--- a/src/share/vm/gc/g1/g1CollectorPolicy.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/g1CollectorPolicy.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -787,10 +787,9 @@
   return survivor_regions_evac_time;
 }
 
-void G1CollectorPolicy::revise_young_list_target_length_if_necessary() {
+void G1CollectorPolicy::revise_young_list_target_length_if_necessary(size_t rs_lengths) {
   guarantee( adaptive_young_list_length(), "should not call this otherwise" );
 
-  size_t rs_lengths = _g1->young_list()->sampled_rs_lengths();
   if (rs_lengths > _rs_lengths_prediction) {
     // add 10% to avoid having to recalculate often
     size_t rs_lengths_prediction = rs_lengths * 1100 / 1000;
@@ -1118,14 +1117,15 @@
   _short_lived_surv_rate_group->start_adding_regions();
   // Do that for any other surv rate groups
 
+  double scan_hcc_time_ms = ConcurrentG1Refine::hot_card_cache_enabled() ? average_time_ms(G1GCPhaseTimes::ScanHCC) : 0.0;
+
   if (update_stats) {
     double cost_per_card_ms = 0.0;
-    double cost_scan_hcc = average_time_ms(G1GCPhaseTimes::ScanHCC);
     if (_pending_cards > 0) {
-      cost_per_card_ms = (average_time_ms(G1GCPhaseTimes::UpdateRS) - cost_scan_hcc) / (double) _pending_cards;
+      cost_per_card_ms = (average_time_ms(G1GCPhaseTimes::UpdateRS) - scan_hcc_time_ms) / (double) _pending_cards;
       _cost_per_card_ms_seq->add(cost_per_card_ms);
     }
-    _cost_scan_hcc_seq->add(cost_scan_hcc);
+    _cost_scan_hcc_seq->add(scan_hcc_time_ms);
 
     double cost_per_entry_ms = 0.0;
     if (cards_scanned > 10) {
@@ -1215,8 +1215,6 @@
   // Note that _mmu_tracker->max_gc_time() returns the time in seconds.
   double update_rs_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSetUpdatingPauseTimePercent / 100.0;
 
-  double scan_hcc_time_ms = average_time_ms(G1GCPhaseTimes::ScanHCC);
-
   if (update_rs_time_goal_ms < scan_hcc_time_ms) {
     log_debug(gc, ergo, refine)("Adjust concurrent refinement thresholds (scanning the HCC expected to take longer than Update RS time goal)."
                                 "Update RS time goal: %1.2fms Scan HCC time: %1.2fms",
@@ -1302,12 +1300,12 @@
     const int k_gy = 3, k_gr = 6;
     const double inc_k = 1.1, dec_k = 0.9;
 
-    int g = cg1r->green_zone();
+    size_t g = cg1r->green_zone();
     if (update_rs_time > goal_ms) {
-      g = (int)(g * dec_k);  // Can become 0, that's OK. That would mean a mutator-only processing.
+      g = (size_t)(g * dec_k);  // Can become 0, that's OK. That would mean a mutator-only processing.
     } else {
       if (update_rs_time < goal_ms && update_rs_processed_buffers > g) {
-        g = (int)MAX2(g * inc_k, g + 1.0);
+        g = (size_t)MAX2(g * inc_k, g + 1.0);
       }
     }
     // Change the refinement threads params
@@ -1316,15 +1314,15 @@
     cg1r->set_red_zone(g * k_gr);
     cg1r->reinitialize_threads();
 
-    int processing_threshold_delta = MAX2((int)(cg1r->green_zone() * _predictor.sigma()), 1);
-    int processing_threshold = MIN2(cg1r->green_zone() + processing_threshold_delta,
+    size_t processing_threshold_delta = MAX2<size_t>(cg1r->green_zone() * _predictor.sigma(), 1);
+    size_t processing_threshold = MIN2(cg1r->green_zone() + processing_threshold_delta,
                                     cg1r->yellow_zone());
     // Change the barrier params
-    dcqs.set_process_completed_threshold(processing_threshold);
-    dcqs.set_max_completed_queue(cg1r->red_zone());
+    dcqs.set_process_completed_threshold((int)processing_threshold);
+    dcqs.set_max_completed_queue((int)cg1r->red_zone());
   }
 
-  int curr_queue_size = dcqs.completed_buffers_num();
+  size_t curr_queue_size = dcqs.completed_buffers_num();
   if (curr_queue_size >= cg1r->yellow_zone()) {
     dcqs.set_completed_queue_padding(curr_queue_size);
   } else {
--- a/src/share/vm/gc/g1/g1CollectorPolicy.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/g1CollectorPolicy.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -471,7 +471,7 @@
   // Check the current value of the young list RSet lengths and
   // compare it against the last prediction. If the current value is
   // higher, recalculate the young list target length prediction.
-  void revise_young_list_target_length_if_necessary();
+  void revise_young_list_target_length_if_necessary(size_t rs_lengths);
 
   // This should be called after the heap is resized.
   void record_new_heap_size(uint new_number_of_regions);
--- a/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1097,7 +1097,7 @@
     reset_marking_state();
   } else {
     {
-      GCTraceTime(Debug, gc) trace("GC Aggregate Data", g1h->gc_timer_cm());
+      GCTraceTime(Debug, gc) trace("Aggregate Data", g1h->gc_timer_cm());
 
       // Aggregate the per-task counting data that we have accumulated
       // while marking.
@@ -2018,7 +2018,7 @@
   // Inner scope to exclude the cleaning of the string and symbol
   // tables from the displayed time.
   {
-    GCTraceTime(Debug, gc) trace("GC Ref Proc", g1h->gc_timer_cm());
+    GCTraceTime(Debug, gc) trace("Reference Processing", g1h->gc_timer_cm());
 
     ReferenceProcessor* rp = g1h->ref_processor_cm();
 
@@ -2271,7 +2271,7 @@
   SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
   guarantee(has_overflown() ||
             satb_mq_set.completed_buffers_num() == 0,
-            "Invariant: has_overflown = %s, num buffers = %d",
+            "Invariant: has_overflown = %s, num buffers = " SIZE_FORMAT,
             BOOL_TO_STR(has_overflown()),
             satb_mq_set.completed_buffers_num());
 
@@ -2702,11 +2702,8 @@
 };
 
 static ReferenceProcessor* get_cm_oop_closure_ref_processor(G1CollectedHeap* g1h) {
-  ReferenceProcessor* result = NULL;
-  if (G1UseConcMarkReferenceProcessing) {
-    result = g1h->ref_processor_cm();
-    assert(result != NULL, "should not be NULL");
-  }
+  ReferenceProcessor* result = g1h->ref_processor_cm();
+  assert(result != NULL, "CM reference processor should not be NULL");
   return result;
 }
 
--- a/src/share/vm/gc/g1/g1EvacStats.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/g1EvacStats.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -29,15 +29,26 @@
 #include "logging/log.hpp"
 #include "trace/tracing.hpp"
 
+void G1EvacStats::log_plab_allocation() {
+  PLABStats::log_plab_allocation();
+  log_debug(gc, plab)("%s other allocation: "
+                      "region end waste: " SIZE_FORMAT "B, "
+                      "regions filled: %u, "
+                      "direct allocated: " SIZE_FORMAT "B, "
+                      "failure used: " SIZE_FORMAT "B, "
+                      "failure wasted: " SIZE_FORMAT "B",
+                      _description,
+                      _region_end_waste * HeapWordSize,
+                      _regions_filled,
+                      _direct_allocated * HeapWordSize,
+                      _failure_used * HeapWordSize,
+                      _failure_waste * HeapWordSize);
+}
+
 void G1EvacStats::adjust_desired_plab_sz() {
+  log_plab_allocation();
+
   if (!ResizePLAB) {
-    log_debug(gc, plab)(" (allocated = " SIZE_FORMAT " wasted = " SIZE_FORMAT " "
-                        "unused = " SIZE_FORMAT " used = " SIZE_FORMAT " "
-                        "undo_waste = " SIZE_FORMAT " region_end_waste = " SIZE_FORMAT " "
-                        "regions filled = %u direct_allocated = " SIZE_FORMAT " "
-                        "failure_used = " SIZE_FORMAT " failure_waste = " SIZE_FORMAT ") ",
-                        _allocated, _wasted, _unused, used(), _undo_wasted, _region_end_waste,
-                        _regions_filled, _direct_allocated, _failure_used, _failure_waste);
     // Clear accumulators for next round.
     reset();
     return;
@@ -107,18 +118,19 @@
   // Latch the result
   _desired_net_plab_sz = plab_sz;
 
-  log_debug(gc, plab)(" (allocated = " SIZE_FORMAT " wasted = " SIZE_FORMAT " "
-                      "unused = " SIZE_FORMAT " used = " SIZE_FORMAT " "
-                      "undo_waste = " SIZE_FORMAT " region_end_waste = " SIZE_FORMAT " "
-                      "regions filled = %u direct_allocated = " SIZE_FORMAT " "
-                      "failure_used = " SIZE_FORMAT " failure_waste = " SIZE_FORMAT ") "
-                      " (plab_sz = " SIZE_FORMAT " desired_plab_sz = " SIZE_FORMAT ")",
-                      _allocated, _wasted, _unused, used(), _undo_wasted, _region_end_waste,
-                      _regions_filled, _direct_allocated, _failure_used, _failure_waste,
-                      cur_plab_sz, plab_sz);
-
+  log_sizing(cur_plab_sz, plab_sz);
   // Clear accumulators for next round.
   reset();
 }
 
+G1EvacStats::G1EvacStats(const char* description, size_t desired_plab_sz_, unsigned wt) :
+  PLABStats(description, desired_plab_sz_, wt),
+  _region_end_waste(0),
+  _regions_filled(0),
+  _direct_allocated(0),
+  _failure_used(0),
+  _failure_waste(0) {
+}
+
+
 G1EvacStats::~G1EvacStats() { }
--- a/src/share/vm/gc/g1/g1EvacStats.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/g1EvacStats.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -51,20 +51,15 @@
     _failure_waste = 0;
   }
 
+  virtual void log_plab_allocation();
+
  public:
-  G1EvacStats(size_t desired_plab_sz_, unsigned wt) : PLABStats(desired_plab_sz_, wt),
-    _region_end_waste(0), _regions_filled(0), _direct_allocated(0),
-    _failure_used(0), _failure_waste(0) {
-  }
+  G1EvacStats(const char* description, size_t desired_plab_sz_, unsigned wt);
+
+  ~G1EvacStats();
 
   virtual void adjust_desired_plab_sz();
 
-  size_t allocated() const { return _allocated; }
-  size_t wasted() const { return _wasted; }
-  size_t unused() const { return _unused; }
-  size_t used() const { return allocated() - (wasted() + unused()); }
-  size_t undo_wasted() const { return _undo_wasted; }
-
   uint regions_filled() const { return _regions_filled; }
   size_t region_end_waste() const { return _region_end_waste; }
   size_t direct_allocated() const { return _direct_allocated; }
@@ -77,8 +72,6 @@
   inline void add_direct_allocated(size_t value);
   inline void add_region_end_waste(size_t value);
   inline void add_failure_used_and_waste(size_t used, size_t waste);
-
-  ~G1EvacStats();
 };
 
 #endif // SHARE_VM_GC_G1_G1EVACSTATS_HPP
--- a/src/share/vm/gc/g1/g1GCPhaseTimes.cpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/g1GCPhaseTimes.cpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,107 +28,70 @@
 #include "gc/g1/g1GCPhaseTimes.hpp"
 #include "gc/g1/g1StringDedup.hpp"
 #include "gc/g1/workerDataArray.inline.hpp"
-#include "memory/allocation.hpp"
+#include "memory/resourceArea.hpp"
 #include "logging/log.hpp"
 #include "runtime/timer.hpp"
 #include "runtime/os.hpp"
 
-// Helper class for avoiding interleaved logging
-class LineBuffer: public StackObj {
-
-private:
-  static const int BUFFER_LEN = 1024;
-  static const int INDENT_CHARS = 3;
-  char _buffer[BUFFER_LEN];
-  int _indent_level;
-  int _cur;
-
-  void vappend(const char* format, va_list ap)  ATTRIBUTE_PRINTF(2, 0) {
-    int res = vsnprintf(&_buffer[_cur], BUFFER_LEN - _cur, format, ap);
-    if (res != -1) {
-      _cur += res;
-    } else {
-      DEBUG_ONLY(warning("buffer too small in LineBuffer");)
-      _buffer[BUFFER_LEN -1] = 0;
-      _cur = BUFFER_LEN; // vsnprintf above should not add to _buffer if we are called again
-    }
-  }
-
-public:
-  explicit LineBuffer(int indent_level): _indent_level(indent_level), _cur(0) {
-    for (; (_cur < BUFFER_LEN && _cur < (_indent_level * INDENT_CHARS)); _cur++) {
-      _buffer[_cur] = ' ';
-    }
-  }
-
-#ifndef PRODUCT
-  ~LineBuffer() {
-    assert(_cur == _indent_level * INDENT_CHARS, "pending data in buffer - append_and_print_cr() not called?");
-  }
-#endif
-
-  void append(const char* format, ...)  ATTRIBUTE_PRINTF(2, 3) {
-    va_list ap;
-    va_start(ap, format);
-    vappend(format, ap);
-    va_end(ap);
-  }
-
-  const char* to_string() {
-    _cur = _indent_level * INDENT_CHARS;
-    return _buffer;
-  }
-};
-
-static const char* Indents[4] = {"", "   ", "      ", "         "};
+static const char* Indents[5] = {"", "  ", "    ", "     ", "       "};
 
 G1GCPhaseTimes::G1GCPhaseTimes(uint max_gc_threads) :
   _max_gc_threads(max_gc_threads)
 {
   assert(max_gc_threads > 0, "Must have some GC threads");
 
-  _gc_par_phases[GCWorkerStart] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Start:", false, 2);
-  _gc_par_phases[ExtRootScan] = new WorkerDataArray<double>(max_gc_threads, "Ext Root Scanning:", true, 2);
+  _gc_par_phases[GCWorkerStart] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Start (ms):");
+  _gc_par_phases[ExtRootScan] = new WorkerDataArray<double>(max_gc_threads, "Ext Root Scanning (ms):");
 
   // Root scanning phases
-  _gc_par_phases[ThreadRoots] = new WorkerDataArray<double>(max_gc_threads, "Thread Roots:", true, 3);
-  _gc_par_phases[StringTableRoots] = new WorkerDataArray<double>(max_gc_threads, "StringTable Roots:", true, 3);
-  _gc_par_phases[UniverseRoots] = new WorkerDataArray<double>(max_gc_threads, "Universe Roots:", true, 3);
-  _gc_par_phases[JNIRoots] = new WorkerDataArray<double>(max_gc_threads, "JNI Handles Roots:", true, 3);
-  _gc_par_phases[ObjectSynchronizerRoots] = new WorkerDataArray<double>(max_gc_threads, "ObjectSynchronizer Roots:", true, 3);
-  _gc_par_phases[FlatProfilerRoots] = new WorkerDataArray<double>(max_gc_threads, "FlatProfiler Roots:", true, 3);
-  _gc_par_phases[ManagementRoots] = new WorkerDataArray<double>(max_gc_threads, "Management Roots:", true, 3);
-  _gc_par_phases[SystemDictionaryRoots] = new WorkerDataArray<double>(max_gc_threads, "SystemDictionary Roots:", true, 3);
-  _gc_par_phases[CLDGRoots] = new WorkerDataArray<double>(max_gc_threads, "CLDG Roots:", true, 3);
-  _gc_par_phases[JVMTIRoots] = new WorkerDataArray<double>(max_gc_threads, "JVMTI Roots:", true, 3);
-  _gc_par_phases[CMRefRoots] = new WorkerDataArray<double>(max_gc_threads, "CM RefProcessor Roots:", true, 3);
-  _gc_par_phases[WaitForStrongCLD] = new WorkerDataArray<double>(max_gc_threads, "Wait For Strong CLD:", true, 3);
-  _gc_par_phases[WeakCLDRoots] = new WorkerDataArray<double>(max_gc_threads, "Weak CLD Roots:", true, 3);
-  _gc_par_phases[SATBFiltering] = new WorkerDataArray<double>(max_gc_threads, "SATB Filtering:", true, 3);
+  _gc_par_phases[ThreadRoots] = new WorkerDataArray<double>(max_gc_threads, "Thread Roots (ms):");
+  _gc_par_phases[StringTableRoots] = new WorkerDataArray<double>(max_gc_threads, "StringTable Roots (ms):");
+  _gc_par_phases[UniverseRoots] = new WorkerDataArray<double>(max_gc_threads, "Universe Roots (ms):");
+  _gc_par_phases[JNIRoots] = new WorkerDataArray<double>(max_gc_threads, "JNI Handles Roots (ms):");
+  _gc_par_phases[ObjectSynchronizerRoots] = new WorkerDataArray<double>(max_gc_threads, "ObjectSynchronizer Roots (ms):");
+  _gc_par_phases[FlatProfilerRoots] = new WorkerDataArray<double>(max_gc_threads, "FlatProfiler Roots (ms):");
+  _gc_par_phases[ManagementRoots] = new WorkerDataArray<double>(max_gc_threads, "Management Roots (ms):");
+  _gc_par_phases[SystemDictionaryRoots] = new WorkerDataArray<double>(max_gc_threads, "SystemDictionary Roots (ms):");
+  _gc_par_phases[CLDGRoots] = new WorkerDataArray<double>(max_gc_threads, "CLDG Roots (ms):");
+  _gc_par_phases[JVMTIRoots] = new WorkerDataArray<double>(max_gc_threads, "JVMTI Roots (ms):");
+  _gc_par_phases[CMRefRoots] = new WorkerDataArray<double>(max_gc_threads, "CM RefProcessor Roots (ms):");
+  _gc_par_phases[WaitForStrongCLD] = new WorkerDataArray<double>(max_gc_threads, "Wait For Strong CLD (ms):");
+  _gc_par_phases[WeakCLDRoots] = new WorkerDataArray<double>(max_gc_threads, "Weak CLD Roots (ms):");
+  _gc_par_phases[SATBFiltering] = new WorkerDataArray<double>(max_gc_threads, "SATB Filtering (ms):");
 
-  _gc_par_phases[UpdateRS] = new WorkerDataArray<double>(max_gc_threads, "Update RS:", true, 2);
-  _gc_par_phases[ScanHCC] = new WorkerDataArray<double>(max_gc_threads, "Scan HCC:", true, 3);
-  _gc_par_phases[ScanHCC]->set_enabled(ConcurrentG1Refine::hot_card_cache_enabled());
-  _gc_par_phases[ScanRS] = new WorkerDataArray<double>(max_gc_threads, "Scan RS:", true, 2);
-  _gc_par_phases[CodeRoots] = new WorkerDataArray<double>(max_gc_threads, "Code Root Scanning:", true, 2);
-  _gc_par_phases[ObjCopy] = new WorkerDataArray<double>(max_gc_threads, "Object Copy:", true, 2);
-  _gc_par_phases[Termination] = new WorkerDataArray<double>(max_gc_threads, "Termination:", true, 2);
-  _gc_par_phases[GCWorkerTotal] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Total:", true, 2);
-  _gc_par_phases[GCWorkerEnd] = new WorkerDataArray<double>(max_gc_threads, "GC Worker End:", false, 2);
-  _gc_par_phases[Other] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Other:", true, 2);
+  _gc_par_phases[UpdateRS] = new WorkerDataArray<double>(max_gc_threads, "Update RS (ms):");
+  if (ConcurrentG1Refine::hot_card_cache_enabled()) {
+    _gc_par_phases[ScanHCC] = new WorkerDataArray<double>(max_gc_threads, "Scan HCC (ms):");
+  } else {
+    _gc_par_phases[ScanHCC] = NULL;
+  }
+  _gc_par_phases[ScanRS] = new WorkerDataArray<double>(max_gc_threads, "Scan RS (ms):");
+  _gc_par_phases[CodeRoots] = new WorkerDataArray<double>(max_gc_threads, "Code Root Scanning (ms):");
+  _gc_par_phases[ObjCopy] = new WorkerDataArray<double>(max_gc_threads, "Object Copy (ms):");
+  _gc_par_phases[Termination] = new WorkerDataArray<double>(max_gc_threads, "Termination (ms):");
+  _gc_par_phases[GCWorkerTotal] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Total (ms):");
+  _gc_par_phases[GCWorkerEnd] = new WorkerDataArray<double>(max_gc_threads, "GC Worker End (ms):");
+  _gc_par_phases[Other] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Other (ms):");
 
-  _update_rs_processed_buffers = new WorkerDataArray<size_t>(max_gc_threads, "Processed Buffers:", true, 3);
+  _update_rs_processed_buffers = new WorkerDataArray<size_t>(max_gc_threads, "Processed Buffers:");
   _gc_par_phases[UpdateRS]->link_thread_work_items(_update_rs_processed_buffers);
 
-  _termination_attempts = new WorkerDataArray<size_t>(max_gc_threads, "Termination Attempts:", true, 3);
+  _termination_attempts = new WorkerDataArray<size_t>(max_gc_threads, "Termination Attempts:");
   _gc_par_phases[Termination]->link_thread_work_items(_termination_attempts);
 
-  _gc_par_phases[StringDedupQueueFixup] = new WorkerDataArray<double>(max_gc_threads, "Queue Fixup:", true, 2);
-  _gc_par_phases[StringDedupTableFixup] = new WorkerDataArray<double>(max_gc_threads, "Table Fixup:", true, 2);
+  if (UseStringDeduplication) {
+    _gc_par_phases[StringDedupQueueFixup] = new WorkerDataArray<double>(max_gc_threads, "Queue Fixup (ms):");
+    _gc_par_phases[StringDedupTableFixup] = new WorkerDataArray<double>(max_gc_threads, "Table Fixup (ms):");
+  } else {
+    _gc_par_phases[StringDedupQueueFixup] = NULL;
+    _gc_par_phases[StringDedupTableFixup] = NULL;
+  }
 
-  _gc_par_phases[RedirtyCards] = new WorkerDataArray<double>(max_gc_threads, "Parallel Redirty:", true, 3);
-  _redirtied_cards = new WorkerDataArray<size_t>(max_gc_threads, "Redirtied Cards:", true, 3);
+  _gc_par_phases[RedirtyCards] = new WorkerDataArray<double>(max_gc_threads, "Parallel Redirty (ms):");
+  _redirtied_cards = new WorkerDataArray<size_t>(max_gc_threads, "Redirtied Cards:");
   _gc_par_phases[RedirtyCards]->link_thread_work_items(_redirtied_cards);
+
+  _gc_par_phases[PreserveCMReferents] = new WorkerDataArray<double>(max_gc_threads, "Parallel Preserve CM Refs (ms):");
 }
 
 void G1GCPhaseTimes::note_gc_start(uint active_gc_threads) {
@@ -140,11 +103,10 @@
   _external_accounted_time_ms = 0.0;
 
   for (int i = 0; i < GCParPhasesSentinel; i++) {
-    _gc_par_phases[i]->reset();
+    if (_gc_par_phases[i] != NULL) {
+      _gc_par_phases[i]->reset();
+    }
   }
-
-  _gc_par_phases[StringDedupQueueFixup]->set_enabled(G1StringDedup::is_enabled());
-  _gc_par_phases[StringDedupTableFixup]->set_enabled(G1StringDedup::is_enabled());
 }
 
 void G1GCPhaseTimes::note_gc_end() {
@@ -166,45 +128,12 @@
   }
 
   for (int i = 0; i < GCParPhasesSentinel; i++) {
-    _gc_par_phases[i]->verify(_active_gc_threads);
+    if (_gc_par_phases[i] != NULL) {
+      _gc_par_phases[i]->verify(_active_gc_threads);
+    }
   }
 }
 
-void G1GCPhaseTimes::print_stats(const char* indent, const char* str, double value) {
-  log_debug(gc, phases)("%s%s: %.1lf ms", indent, str, value);
-}
-
-double G1GCPhaseTimes::accounted_time_ms() {
-    // First subtract any externally accounted time
-    double misc_time_ms = _external_accounted_time_ms;
-
-    // Subtract the root region scanning wait time. It's initialized to
-    // zero at the start of the pause.
-    misc_time_ms += _root_region_scan_wait_time_ms;
-
-    misc_time_ms += _cur_collection_par_time_ms;
-
-    // Now subtract the time taken to fix up roots in generated code
-    misc_time_ms += _cur_collection_code_root_fixup_time_ms;
-
-    // Strong code root purge time
-    misc_time_ms += _cur_strong_code_root_purge_time_ms;
-
-    if (G1StringDedup::is_enabled()) {
-      // String dedup fixup time
-      misc_time_ms += _cur_string_dedup_fixup_time_ms;
-    }
-
-    // Subtract the time taken to clean the card table from the
-    // current value of "other time"
-    misc_time_ms += _cur_clear_ct_time_ms;
-
-    // Remove expand heap time from "other time"
-    misc_time_ms += _cur_expand_heap_time_ms;
-
-    return misc_time_ms;
-}
-
 // record the time a phase took in seconds
 void G1GCPhaseTimes::record_time_secs(GCParPhases phase, uint worker_i, double secs) {
   _gc_par_phases[phase]->set(worker_i, secs);
@@ -224,193 +153,144 @@
   return _gc_par_phases[phase]->average(_active_gc_threads) * 1000.0;
 }
 
-double G1GCPhaseTimes::get_time_ms(GCParPhases phase, uint worker_i) {
-  return _gc_par_phases[phase]->get(worker_i) * 1000.0;
-}
-
-double G1GCPhaseTimes::sum_time_ms(GCParPhases phase) {
-  return _gc_par_phases[phase]->sum(_active_gc_threads) * 1000.0;
-}
-
-double G1GCPhaseTimes::min_time_ms(GCParPhases phase) {
-  return _gc_par_phases[phase]->minimum(_active_gc_threads) * 1000.0;
-}
-
-double G1GCPhaseTimes::max_time_ms(GCParPhases phase) {
-  return _gc_par_phases[phase]->maximum(_active_gc_threads) * 1000.0;
-}
-
-size_t G1GCPhaseTimes::get_thread_work_item(GCParPhases phase, uint worker_i) {
-  assert(_gc_par_phases[phase]->thread_work_items() != NULL, "No sub count");
-  return _gc_par_phases[phase]->thread_work_items()->get(worker_i);
-}
-
 size_t G1GCPhaseTimes::sum_thread_work_items(GCParPhases phase) {
   assert(_gc_par_phases[phase]->thread_work_items() != NULL, "No sub count");
   return _gc_par_phases[phase]->thread_work_items()->sum(_active_gc_threads);
 }
 
-double G1GCPhaseTimes::average_thread_work_items(GCParPhases phase) {
-  assert(_gc_par_phases[phase]->thread_work_items() != NULL, "No sub count");
-  return _gc_par_phases[phase]->thread_work_items()->average(_active_gc_threads);
+template <class T>
+void G1GCPhaseTimes::details(T* phase, const char* indent) {
+  LogHandle(gc, phases, task) log;
+  if (log.is_level(LogLevel::Trace)) {
+    outputStream* trace_out = log.trace_stream();
+    trace_out->print("%s", indent);
+    phase->print_details_on(trace_out, _active_gc_threads);
+  }
 }
 
-size_t G1GCPhaseTimes::min_thread_work_items(GCParPhases phase) {
-  assert(_gc_par_phases[phase]->thread_work_items() != NULL, "No sub count");
-  return _gc_par_phases[phase]->thread_work_items()->minimum(_active_gc_threads);
+void G1GCPhaseTimes::log_phase(WorkerDataArray<double>* phase, uint indent, outputStream* out, bool print_sum) {
+  out->print("%s", Indents[indent]);
+  phase->print_summary_on(out, _active_gc_threads, print_sum);
+  details(phase, Indents[indent]);
+
+  WorkerDataArray<size_t>* work_items = phase->thread_work_items();
+  if (work_items != NULL) {
+    out->print("%s", Indents[indent + 1]);
+    work_items->print_summary_on(out, _active_gc_threads, true);
+    details(work_items, Indents[indent + 1]);
+  }
 }
 
-size_t G1GCPhaseTimes::max_thread_work_items(GCParPhases phase) {
-  assert(_gc_par_phases[phase]->thread_work_items() != NULL, "No sub count");
-  return _gc_par_phases[phase]->thread_work_items()->maximum(_active_gc_threads);
+void G1GCPhaseTimes::debug_phase(WorkerDataArray<double>* phase) {
+  LogHandle(gc, phases) log;
+  if (log.is_level(LogLevel::Debug)) {
+    ResourceMark rm;
+    log_phase(phase, 2, log.debug_stream(), true);
+  }
 }
 
-class G1GCParPhasePrinter : public StackObj {
-  G1GCPhaseTimes* _phase_times;
- public:
-  G1GCParPhasePrinter(G1GCPhaseTimes* phase_times) : _phase_times(phase_times) {}
+void G1GCPhaseTimes::trace_phase(WorkerDataArray<double>* phase, bool print_sum) {
+  LogHandle(gc, phases) log;
+  if (log.is_level(LogLevel::Trace)) {
+    ResourceMark rm;
+    log_phase(phase, 3, log.trace_stream(), print_sum);
+  }
+}
 
-  void print(G1GCPhaseTimes::GCParPhases phase_id) {
-    WorkerDataArray<double>* phase = _phase_times->_gc_par_phases[phase_id];
+#define PHASE_DOUBLE_FORMAT "%s%s: %.1lfms"
+#define PHASE_SIZE_FORMAT "%s%s: " SIZE_FORMAT
 
-    if (phase->_length == 1) {
-      print_single_length(phase_id, phase);
-    } else {
-      print_multi_length(phase_id, phase);
-    }
-  }
+#define info_line(str, value) \
+  log_info(gc, phases)(PHASE_DOUBLE_FORMAT, Indents[1], str, value);
 
+#define debug_line(str, value) \
+  log_debug(gc, phases)(PHASE_DOUBLE_FORMAT, Indents[2], str, value);
 
- private:
-  void print_single_length(G1GCPhaseTimes::GCParPhases phase_id, WorkerDataArray<double>* phase) {
-    // No need for min, max, average and sum for only one worker
-    log_debug(gc, phases)("%s%s:  %.1lf", Indents[phase->_indent_level], phase->_title, _phase_times->get_time_ms(phase_id, 0));
+#define trace_line(str, value) \
+  log_trace(gc, phases)(PHASE_DOUBLE_FORMAT, Indents[3], str, value);
 
-    WorkerDataArray<size_t>* work_items = phase->_thread_work_items;
-    if (work_items != NULL) {
-      log_debug(gc, phases)("%s%s:  " SIZE_FORMAT, Indents[work_items->_indent_level], work_items->_title, _phase_times->sum_thread_work_items(phase_id));
-    }
-  }
+#define trace_line_sz(str, value) \
+  log_trace(gc, phases)(PHASE_SIZE_FORMAT, Indents[3], str, value);
 
-  void print_time_values(const char* indent, G1GCPhaseTimes::GCParPhases phase_id) {
-    if (log_is_enabled(Trace, gc)) {
-      LineBuffer buf(0);
-      uint active_length = _phase_times->_active_gc_threads;
-      for (uint i = 0; i < active_length; ++i) {
-        buf.append(" %4.1lf", _phase_times->get_time_ms(phase_id, i));
-      }
-      const char* line = buf.to_string();
-      log_trace(gc, phases)("%s%-25s%s", indent, "", line);
-    }
-  }
+#define trace_line_ms(str, value) \
+  log_trace(gc, phases)(PHASE_SIZE_FORMAT, Indents[3], str, value);
 
-  void print_count_values(const char* indent, G1GCPhaseTimes::GCParPhases phase_id, WorkerDataArray<size_t>* thread_work_items) {
-    if (log_is_enabled(Trace, gc)) {
-      LineBuffer buf(0);
-      uint active_length = _phase_times->_active_gc_threads;
-      for (uint i = 0; i < active_length; ++i) {
-        buf.append("  " SIZE_FORMAT, _phase_times->get_thread_work_item(phase_id, i));
-      }
-      const char* line = buf.to_string();
-      log_trace(gc, phases)("%s%-25s%s", indent, "", line);
-    }
-  }
-
-  void print_thread_work_items(G1GCPhaseTimes::GCParPhases phase_id, WorkerDataArray<size_t>* thread_work_items) {
-    const char* indent = Indents[thread_work_items->_indent_level];
-
-    assert(thread_work_items->_print_sum, "%s does not have print sum true even though it is a count", thread_work_items->_title);
-
-    log_debug(gc, phases)("%s%-25s Min: " SIZE_FORMAT ", Avg: %4.1lf, Max: " SIZE_FORMAT ", Diff: " SIZE_FORMAT ", Sum: " SIZE_FORMAT,
-        indent, thread_work_items->_title,
-        _phase_times->min_thread_work_items(phase_id), _phase_times->average_thread_work_items(phase_id), _phase_times->max_thread_work_items(phase_id),
-        _phase_times->max_thread_work_items(phase_id) - _phase_times->min_thread_work_items(phase_id), _phase_times->sum_thread_work_items(phase_id));
-
-    print_count_values(indent, phase_id, thread_work_items);
-  }
-
-  void print_multi_length(G1GCPhaseTimes::GCParPhases phase_id, WorkerDataArray<double>* phase) {
-    const char* indent = Indents[phase->_indent_level];
-
-    if (phase->_print_sum) {
-      log_debug(gc, phases)("%s%-25s Min: %4.1lf, Avg: %4.1lf, Max: %4.1lf, Diff: %4.1lf, Sum: %4.1lf",
-          indent, phase->_title,
-          _phase_times->min_time_ms(phase_id), _phase_times->average_time_ms(phase_id), _phase_times->max_time_ms(phase_id),
-          _phase_times->max_time_ms(phase_id) - _phase_times->min_time_ms(phase_id), _phase_times->sum_time_ms(phase_id));
-    } else {
-      log_debug(gc, phases)("%s%-25s Min: %4.1lf, Avg: %4.1lf, Max: %4.1lf, Diff: %4.1lf",
-          indent, phase->_title,
-          _phase_times->min_time_ms(phase_id), _phase_times->average_time_ms(phase_id), _phase_times->max_time_ms(phase_id),
-          _phase_times->max_time_ms(phase_id) - _phase_times->min_time_ms(phase_id));
-    }
-
-    print_time_values(indent, phase_id);
-
-    if (phase->_thread_work_items != NULL) {
-      print_thread_work_items(phase_id, phase->_thread_work_items);
-    }
-  }
-};
+#define info_line_and_account(str, value) \
+  info_line(str, value);                  \
+  accounted_time_ms += value;
 
 void G1GCPhaseTimes::print() {
   note_gc_end();
 
-  G1GCParPhasePrinter par_phase_printer(this);
-
+  double accounted_time_ms = _external_accounted_time_ms;
   if (_root_region_scan_wait_time_ms > 0.0) {
-    print_stats(Indents[1], "Root Region Scan Waiting", _root_region_scan_wait_time_ms);
+    info_line_and_account("Root Region Scan Waiting", _root_region_scan_wait_time_ms);
   }
 
-  print_stats(Indents[1], "Parallel Time", _cur_collection_par_time_ms);
-  for (int i = 0; i <= GCMainParPhasesLast; i++) {
-    par_phase_printer.print((GCParPhases) i);
+  info_line_and_account("Evacuate Collection Set", _cur_collection_par_time_ms);
+  trace_phase(_gc_par_phases[GCWorkerStart], false);
+  debug_phase(_gc_par_phases[ExtRootScan]);
+  for (int i = ThreadRoots; i <= SATBFiltering; i++) {
+    trace_phase(_gc_par_phases[i]);
   }
+  debug_phase(_gc_par_phases[UpdateRS]);
+  if (ConcurrentG1Refine::hot_card_cache_enabled()) {
+    trace_phase(_gc_par_phases[ScanHCC]);
+  }
+  debug_phase(_gc_par_phases[ScanRS]);
+  debug_phase(_gc_par_phases[CodeRoots]);
+  debug_phase(_gc_par_phases[ObjCopy]);
+  debug_phase(_gc_par_phases[Termination]);
+  debug_phase(_gc_par_phases[Other]);
+  debug_phase(_gc_par_phases[GCWorkerTotal]);
+  trace_phase(_gc_par_phases[GCWorkerEnd], false);
 
-  print_stats(Indents[1], "Code Root Fixup", _cur_collection_code_root_fixup_time_ms);
-  print_stats(Indents[1], "Code Root Purge", _cur_strong_code_root_purge_time_ms);
+  info_line_and_account("Code Roots", _cur_collection_code_root_fixup_time_ms + _cur_strong_code_root_purge_time_ms);
+  debug_line("Code Roots Fixup", _cur_collection_code_root_fixup_time_ms);
+  debug_line("Code Roots Purge", _cur_strong_code_root_purge_time_ms);
+
   if (G1StringDedup::is_enabled()) {
-    print_stats(Indents[1], "String Dedup Fixup", _cur_string_dedup_fixup_time_ms);
-    for (int i = StringDedupPhasesFirst; i <= StringDedupPhasesLast; i++) {
-      par_phase_printer.print((GCParPhases) i);
-    }
+    info_line_and_account("String Dedup Fixup", _cur_string_dedup_fixup_time_ms);
+    debug_phase(_gc_par_phases[StringDedupQueueFixup]);
+    debug_phase(_gc_par_phases[StringDedupTableFixup]);
   }
-  print_stats(Indents[1], "Clear CT", _cur_clear_ct_time_ms);
-  print_stats(Indents[1], "Expand Heap After Collection", _cur_expand_heap_time_ms);
-  double misc_time_ms = _gc_pause_time_ms - accounted_time_ms();
-  print_stats(Indents[1], "Other", misc_time_ms);
+  info_line_and_account("Clear Card Table", _cur_clear_ct_time_ms);
+  info_line_and_account("Expand Heap After Collection", _cur_expand_heap_time_ms);
+
+  double free_cset_time = _recorded_young_free_cset_time_ms + _recorded_non_young_free_cset_time_ms;
+  info_line_and_account("Free Collection Set", free_cset_time);
+  debug_line("Young Free Collection Set", _recorded_young_free_cset_time_ms);
+  debug_line("Non-Young Free Collection Set", _recorded_non_young_free_cset_time_ms);
+  info_line_and_account("Merge Per-Thread State", _recorded_merge_pss_time_ms);
+
+  info_line("Other", _gc_pause_time_ms - accounted_time_ms);
   if (_cur_verify_before_time_ms > 0.0) {
-    print_stats(Indents[2], "Verify Before", _cur_verify_before_time_ms);
+    debug_line("Verify Before", _cur_verify_before_time_ms);
   }
   if (G1CollectedHeap::heap()->evacuation_failed()) {
     double evac_fail_handling = _cur_evac_fail_recalc_used + _cur_evac_fail_remove_self_forwards +
       _cur_evac_fail_restore_remsets;
-    print_stats(Indents[2], "Evacuation Failure", evac_fail_handling);
-    log_trace(gc, phases)("%sRecalculate Used: %.1lf ms", Indents[3], _cur_evac_fail_recalc_used);
-    log_trace(gc, phases)("%sRemove Self Forwards: %.1lf ms", Indents[3], _cur_evac_fail_remove_self_forwards);
-    log_trace(gc, phases)("%sRestore RemSet: %.1lf ms", Indents[3], _cur_evac_fail_restore_remsets);
+    debug_line("Evacuation Failure", evac_fail_handling);
+    trace_line("Recalculate Used", _cur_evac_fail_recalc_used);
+    trace_line("Remove Self Forwards",_cur_evac_fail_remove_self_forwards);
+    trace_line("Restore RemSet", _cur_evac_fail_restore_remsets);
   }
-  print_stats(Indents[2], "Choose CSet",
-    (_recorded_young_cset_choice_time_ms +
-    _recorded_non_young_cset_choice_time_ms));
-  print_stats(Indents[2], "Ref Proc", _cur_ref_proc_time_ms);
-  print_stats(Indents[2], "Ref Enq", _cur_ref_enq_time_ms);
-  print_stats(Indents[2], "Redirty Cards", _recorded_redirty_logged_cards_time_ms);
-  par_phase_printer.print(RedirtyCards);
+  debug_line("Choose CSet", (_recorded_young_cset_choice_time_ms + _recorded_non_young_cset_choice_time_ms));
+  debug_line("Preserve CM Refs", _recorded_preserve_cm_referents_time_ms);
+  debug_line("Reference Processing", _cur_ref_proc_time_ms);
+  debug_line("Reference Enqueuing", _cur_ref_enq_time_ms);
+  debug_line("Redirty Cards", _recorded_redirty_logged_cards_time_ms);
+  trace_phase(_gc_par_phases[RedirtyCards]);
+  trace_phase(_gc_par_phases[PreserveCMReferents]);
   if (G1EagerReclaimHumongousObjects) {
-    print_stats(Indents[2], "Humongous Register", _cur_fast_reclaim_humongous_register_time_ms);
-
-    log_trace(gc, phases)("%sHumongous Total: " SIZE_FORMAT, Indents[3], _cur_fast_reclaim_humongous_total);
-    log_trace(gc, phases)("%sHumongous Candidate: " SIZE_FORMAT, Indents[3], _cur_fast_reclaim_humongous_candidates);
-    print_stats(Indents[2], "Humongous Reclaim", _cur_fast_reclaim_humongous_time_ms);
-    log_trace(gc, phases)("%sHumongous Reclaimed: " SIZE_FORMAT, Indents[3], _cur_fast_reclaim_humongous_reclaimed);
+    debug_line("Humongous Register", _cur_fast_reclaim_humongous_register_time_ms);
+    trace_line_sz("Humongous Total", _cur_fast_reclaim_humongous_total);
+    trace_line_sz("Humongous Candidate", _cur_fast_reclaim_humongous_candidates);
+    debug_line("Humongous Reclaim", _cur_fast_reclaim_humongous_time_ms);
+    trace_line_sz("Humongous Reclaimed", _cur_fast_reclaim_humongous_reclaimed);
   }
-  print_stats(Indents[2], "Free CSet",
-    (_recorded_young_free_cset_time_ms +
-    _recorded_non_young_free_cset_time_ms));
-  log_trace(gc, phases)("%sYoung Free CSet: %.1lf ms", Indents[3], _recorded_young_free_cset_time_ms);
-  log_trace(gc, phases)("%sNon-Young Free CSet: %.1lf ms", Indents[3], _recorded_non_young_free_cset_time_ms);
   if (_cur_verify_after_time_ms > 0.0) {
-    print_stats(Indents[2], "Verify After", _cur_verify_after_time_ms);
+    debug_line("Verify After", _cur_verify_after_time_ms);
   }
 }
 
--- a/src/share/vm/gc/g1/g1GCPhaseTimes.hpp	Thu Mar 10 09:28:10 2016 -0800
+++ b/src/share/vm/gc/g1/g1GCPhaseTimes.hpp	Thu Mar 10 09:50:18 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -32,8 +32,6 @@
 template <class T> class WorkerDataArray;
 
 class G1GCPhaseTimes : public CHeapObj<mtGC> {
-  friend class G1GCParPhasePrinter;
-
   uint _active_gc_threads;
   uint _max_gc_threads;
   jlong _gc_start_counter;
@@ -69,6 +67,7 @@
     StringDedupQueueFixup,
     StringDedupTableFixup,
     RedirtyCards,
+    PreserveCMReferents,
     GCParPhasesSentinel
   };
 
@@ -108,6 +107,10 @@
 
   double _recorded_redirty_logged_cards_time_ms;
 
+  double _recorded_preserve_cm_referents_time_ms;
+
+  double _recorded_merge_pss_time_ms;
+
   double _recorded_young_free_cset_time_ms;
   double _recorded_non_young_free_cset_time_ms;
 
@@ -120,10 +123,13 @@
   double _cur_verify_before_time_ms;
   double _cur_verify_after_time_ms;
 
-  // Helper methods for detailed logging
-  void print_stats(const char*, const char* str, double value);
+  void note_gc_end();
 
-  void note_gc_end();
+  template <class T>
+  void details(T* phase, const char* indent);
+  void log_phase(WorkerDataArray<double>* phase, uint indent, outputStream* out, bool print_sum);
+  void debug_phase(WorkerDataArray<double>* phase);
+  void trace_phase(WorkerDataArray<double>* phase, bool print_sum = true);
 
  public:
   G1GCPhaseTimes(uint max_gc_threads);
@@ -143,16 +149,6 @@
 
   size_t sum_thread_work_items(GCParPhases phase);
 
- private:
-  double get_time_ms(GCParPhases phase, uint worker_i);
-  double sum_time_ms(GCParPhases phase);
-  double min_time_ms(GCParPhases phase);
-  double max_time_ms(GCParPhases phase);
-  size_t get_thread_work_item(GCParPhases phase, uint worker_i);
-  double average_thread_work_items(GCParPhases phase);
-  size_t min_thread_work_items(GCParPhases phase);
-  size_t max_thread_work_items(GCParPhases phase);
-
  public:
 
   void record_clear_ct_time(double ms) {
@@ -234,6 +230,14 @@
     _recorded_redirty_logged_cards_time_ms = time_ms;
   }
 
+  void record_preserve_cm_referents_time_ms(double time_ms) {
+    _recorded_preserve_cm_referents_time_ms = time_ms;
+  }