changeset 11000:7c31312c5725

Merge
author aph
date Thu, 17 Mar 2016 17:03:20 +0000
parents b7ba700456c5 27654175e92a
children d28f025a1f80
files src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/development/Server16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/development/Server24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/About16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/About24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Delete16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Delete24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Find16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Help16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Help24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/History16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/History24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Information16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Information24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/New16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/New24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Open16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Open24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Save24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/SaveAs16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/SaveAs24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Zoom16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/ZoomIn16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/ZoomIn24.gif 
src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/navigation/Down16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/navigation/Up16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignCenter16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignCenter24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignLeft16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignLeft24.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignRight16.gif src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignRight24.gif test/compiler/compilercontrol/jcmd/StressAddSequentiallyTest.java
diffstat 423 files changed, 8325 insertions(+), 4856 deletions(-) [+]
line wrap: on
line diff
--- a/.hgtags	Thu Feb 25 14:59:44 2016 +0000
+++ b/.hgtags	Thu Mar 17 17:03:20 2016 +0000
@@ -509,3 +509,6 @@
 534c50395957c6025fb6627e93b35756f8d48a08 jdk-9+104
 266fa9bb5297bf02cb2a7b038b10a109817d2b48 jdk-9+105
 7232de4c17c37f60aecec4f3191090bd3d41d334 jdk-9+106
+c5146d4da417f76edfc43097d2e2ced042a65b4e jdk-9+107
+934f6793f5f7dca44f69b4559d525fa64b31840d jdk-9+108
+7e7e50ac4faf19899fc811569e32cfa478759ebb jdk-9+109
--- a/make/aix/makefiles/trace.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/aix/makefiles/trace.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -57,11 +57,6 @@
 TraceGeneratedNames +=  \
 	traceRequestables.hpp \
     traceEventControl.hpp
-
-ifneq ($(INCLUDE_TRACE), false)
-TraceGeneratedNames += traceProducer.cpp
-endif
-
 endif
 
 TraceGeneratedFiles = $(TraceGeneratedNames:%=$(TraceOutDir)/%)
@@ -100,9 +95,6 @@
 $(TraceOutDir)/traceEventClasses.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceEventClasses.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	$(GENERATE_CODE)
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
--- a/make/bsd/makefiles/amd64.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/bsd/makefiles/amd64.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -37,3 +37,11 @@
 endif
 
 OPT_CFLAGS/compactingPermGenGen.o = -O1
+
+# The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+# of OPT_CFLAGS. Restore it here.
+ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+   OPT_CFLAGS/sharedRuntimeTrig.o += -g
+   OPT_CFLAGS/sharedRuntimeTrans.o += -g
+   OPT_CFLAGS/compactingPermGenGen.o += -g
+endif
--- a/make/bsd/makefiles/arm.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/bsd/makefiles/arm.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -24,8 +24,4 @@
 
 Obj_Files += bsd_arm.o
 
-ifneq ($(EXT_LIBS_PATH),)
-  LIBS += $(EXT_LIBS_PATH)/sflt_glibc.a 
-endif
-
 CFLAGS += -DVM_LITTLE_ENDIAN
--- a/make/bsd/makefiles/gcc.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/bsd/makefiles/gcc.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -330,6 +330,13 @@
     ), 1)
     OPT_CFLAGS/loopTransform.o += $(OPT_CFLAGS/NOOPT)
     OPT_CFLAGS/unsafe.o += -O1
+
+    # The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+    # of OPT_CFLAGS. Restore it here.
+    ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+      OPT_CFLAGS/loopTransform.o += -g
+      OPT_CFLAGS/unsafe.o += -g
+    endif
   else
     $(error "Update compiler workarounds for Clang $(CC_VER_MAJOR).$(CC_VER_MINOR)")
   endif
--- a/make/bsd/makefiles/trace.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/bsd/makefiles/trace.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -57,11 +57,6 @@
 TraceGeneratedNames +=  \
 	traceRequestables.hpp \
     traceEventControl.hpp
-
-ifneq ($(INCLUDE_TRACE), false)
-TraceGeneratedNames += traceProducer.cpp
-endif
-
 endif
 
 
@@ -101,9 +96,6 @@
 $(TraceOutDir)/traceEventClasses.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceEventClasses.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	$(GENERATE_CODE)
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
--- a/make/linux/makefiles/amd64.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/linux/makefiles/amd64.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -32,3 +32,11 @@
 CFLAGS += -D_LP64=1
 
 OPT_CFLAGS/compactingPermGenGen.o = -O1
+
+# The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+# of OPT_CFLAGS. Restore it here.
+ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+   OPT_CFLAGS/sharedRuntimeTrig.o += -g
+   OPT_CFLAGS/sharedRuntimeTrans.o += -g
+   OPT_CFLAGS/compactingPermGenGen.o += -g
+endif
--- a/make/linux/makefiles/gcc.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/linux/makefiles/gcc.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -265,6 +265,11 @@
   # GCC >= 4.3
   # Gcc 4.1.2 does not support this flag, nor does it have problems compiling the file.
   OPT_CFLAGS/vmStructs.o += -fno-var-tracking-assignments
+  # The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+  # of OPT_CFLAGS. Restore it here.
+  ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+    OPT_CFLAGS/vmStructs.o += -g
+  endif
 endif
 
 # The gcc compiler segv's on ia64 when compiling bytecodeInterpreter.cpp
@@ -277,6 +282,11 @@
 ifeq ($(USE_CLANG), true)
   ifeq ($(shell expr $(CC_VER_MAJOR) = 4 \& $(CC_VER_MINOR) = 2), 1)
     OPT_CFLAGS/loopTransform.o += $(OPT_CFLAGS/NOOPT)
+    # The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+    # of OPT_CFLAGS. Restore it here.
+    ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+      OPT_CFLAGS/loopTransform.o += -g
+    endif
   endif
 else
   # Do not allow GCC 4.1.1
--- a/make/linux/makefiles/i486.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/linux/makefiles/i486.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -32,3 +32,11 @@
 CFLAGS += -DVM_LITTLE_ENDIAN
 
 OPT_CFLAGS/compactingPermGenGen.o = -O1
+
+# The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+# of OPT_CFLAGS. Restore it here.
+ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+   OPT_CFLAGS/sharedRuntimeTrig.o += -g
+   OPT_CFLAGS/sharedRuntimeTrans.o += -g
+   OPT_CFLAGS/compactingPermGenGen.o += -g
+endif
--- a/make/linux/makefiles/trace.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/linux/makefiles/trace.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -57,11 +57,6 @@
 TraceGeneratedNames +=  \
 	traceRequestables.hpp \
     traceEventControl.hpp
-
-ifneq ($(INCLUDE_TRACE), false)
-TraceGeneratedNames += traceProducer.cpp
-endif
-
 endif
 
 TraceGeneratedFiles = $(TraceGeneratedNames:%=$(TraceOutDir)/%)
@@ -100,9 +95,6 @@
 $(TraceOutDir)/traceEventClasses.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceEventClasses.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	$(GENERATE_CODE)
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
--- a/make/linux/makefiles/zeroshark.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/linux/makefiles/zeroshark.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # Copyright 2007, 2008 Red Hat, Inc.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
@@ -29,12 +29,6 @@
 ifeq ($(USE_CLANG), true)
   WARNING_FLAGS += -Wno-undef
 endif
-# Suppress some warning flags that are normally turned on for hotspot,
-# because some of the zero code has not been updated accordingly.
-WARNING_FLAGS += -Wno-return-type \
-  -Wno-format-nonliteral -Wno-format-security \
-  -Wno-maybe-uninitialized
- 
 
 # The copied fdlibm routines in sharedRuntimeTrig.o must not be optimized
 OPT_CFLAGS/sharedRuntimeTrig.o = $(OPT_CFLAGS/NOOPT)
--- a/make/solaris/makefiles/amd64.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/solaris/makefiles/amd64.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2004, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -34,6 +34,14 @@
 OPT_CFLAGS/generateOptoStub.o = -xO2
 # Temporary util SS12u1 C++ compiler is fixed
 OPT_CFLAGS/c1_LinearScan.o = -xO2
+
+# The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+# of OPT_CFLAGS. Restore it here.
+ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+  OPT_CFLAGS/generateOptoStub.o += -g0 -xs
+  OPT_CFLAGS/c1_LinearScan.o += -g0 -xs
+endif
+
 else
 
 ifeq ("${Platform_compiler}", "gcc")
--- a/make/solaris/makefiles/product.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/solaris/makefiles/product.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -35,11 +35,21 @@
 # for this method for now. (fix this when dtrace bug 6258412 is fixed)
 ifndef USE_GCC
 OPT_CFLAGS/ciEnv.o = $(OPT_CFLAGS) -xinline=no%__1cFciEnvbFpost_compiled_method_load_event6MpnHnmethod__v_
+# The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+# of OPT_CFLAGS. Restore it here.
+ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+   OPT_CFLAGS/ciEnv.o += -g0 -xs
+endif
 endif
 
 # Need extra inlining to get oop_ps_push_contents functions to perform well enough.
 ifndef USE_GCC
 OPT_CFLAGS/psPromotionManager.o = $(OPT_CFLAGS) -W2,-Ainline:inc=1000
+# The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+# of OPT_CFLAGS. Restore it here.
+ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+   OPT_CFLAGS/psPromotionManager.o += -g0 -xs
+endif
 endif
 
 # (OPT_CFLAGS/SLOWER is also available, to alter compilation of buggy files)
@@ -55,6 +65,12 @@
 ifeq ($(shell expr $(COMPILER_REV_NUMERIC) \>= 509), 1)
 # dtrace cannot handle tail call optimization (6672627, 6693876)
 OPT_CFLAGS/jni.o = $(OPT_CFLAGS/DEFAULT) $(OPT_CCFLAGS/NO_TAIL_CALL_OPT)
+# The -g0 -xs flags are added to OPT_CFLAGS in sparcWorks.make, but lost in case of
+# per-file overrides of OPT_CFLAGS. Restore them here. This is mainly needed
+# to provide a good baseline to compare the new build against.
+ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+   OPT_CFLAGS/jni.o += -g0 -xs
+endif
 endif # COMPILER_NUMERIC_REV >= 509
 
 # Workaround SS11 bug 6345274 (all platforms) (Fixed in SS11 patch and SS12)
--- a/make/solaris/makefiles/sparcWorks.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/solaris/makefiles/sparcWorks.make	Thu Mar 17 17:03:20 2016 +0000
@@ -158,9 +158,20 @@
 OPT_CCFLAGS/NO_TAIL_CALL_OPT = -Qoption ube -O~yz
 OPT_CFLAGS/stubGenerator_x86_32.o = $(OPT_CFLAGS) -xspace
 OPT_CFLAGS/stubGenerator_x86_64.o = $(OPT_CFLAGS) -xspace
+# The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+# of OPT_CFLAGS. Restore it here.
+ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+   OPT_CFLAGS/stubGenerator_x86_32.o += -g0 -xs
+   OPT_CFLAGS/stubGenerator_x86_64.o += -g0 -xs
+endif
 endif # Platform_arch == x86
 ifeq ("${Platform_arch}", "sparc")
 OPT_CFLAGS/stubGenerator_sparc.o = $(OPT_CFLAGS) -xspace
+# The debug flag is added to OPT_CFLAGS, but lost in case of per-file overrides
+# of OPT_CFLAGS. Restore it here.
+ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
+   OPT_CFLAGS/stubGenerator_sparc.o += -g0 -xs
+endif
 endif
 endif # COMPILER_REV_NUMERIC >= 509
 
--- a/make/solaris/makefiles/trace.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/solaris/makefiles/trace.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -56,8 +56,7 @@
 ifeq ($(HAS_ALT_SRC), true)
 TraceGeneratedNames +=  \
 	traceRequestables.hpp \
-    traceEventControl.hpp \
-    traceProducer.cpp
+    traceEventControl.hpp
 endif
 
 TraceGeneratedFiles = $(TraceGeneratedNames:%=$(TraceOutDir)/%)
@@ -96,9 +95,6 @@
 $(TraceOutDir)/traceEventClasses.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceEventClasses.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	$(GENERATE_CODE)
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	$(GENERATE_CODE)
 
--- a/make/windows/makefiles/trace.make	Thu Feb 25 14:59:44 2016 +0000
+++ b/make/windows/makefiles/trace.make	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -43,8 +43,7 @@
 !if EXISTS($(TraceAltSrcDir))
 TraceGeneratedNames = $(TraceGeneratedNames) \
     traceRequestables.hpp \
-    traceEventControl.hpp \
-    traceProducer.cpp
+    traceEventControl.hpp
 !endif
 
 
@@ -58,8 +57,7 @@
 !if EXISTS($(TraceAltSrcDir))
 TraceGeneratedFiles = $(TraceGeneratedFiles) \
 	$(TraceOutDir)/traceRequestables.hpp \
-    $(TraceOutDir)/traceEventControl.hpp \
-	$(TraceOutDir)/traceProducer.cpp
+    $(TraceOutDir)/traceEventControl.hpp
 !endif
 
 XSLT = $(QUIETLY) $(REMOTE) $(RUN_JAVA) -classpath $(JvmtiOutDir) jvmtiGen
@@ -98,10 +96,6 @@
 	@echo Generating AltSrc $@
 	@$(XSLT) -IN $(TraceSrcDir)/trace.xml -XSL $(TraceAltSrcDir)/traceEventClasses.xsl -OUT $(TraceOutDir)/traceEventClasses.hpp
 
-$(TraceOutDir)/traceProducer.cpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceProducer.xsl $(XML_DEPS)
-	@echo Generating AltSrc $@
-	@$(XSLT) -IN $(TraceSrcDir)/trace.xml -XSL $(TraceAltSrcDir)/traceProducer.xsl -OUT $(TraceOutDir)/traceProducer.cpp
-
 $(TraceOutDir)/traceRequestables.hpp: $(TraceSrcDir)/trace.xml $(TraceAltSrcDir)/traceRequestables.xsl $(XML_DEPS)
 	@echo Generating AltSrc $@
 	@$(XSLT) -IN $(TraceSrcDir)/trace.xml -XSL $(TraceAltSrcDir)/traceRequestables.xsl -OUT $(TraceOutDir)/traceRequestables.hpp
--- a/src/cpu/aarch64/vm/aarch64.ad	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/aarch64.ad	Thu Mar 17 17:03:20 2016 +0000
@@ -3425,9 +3425,6 @@
 // false => size gets scaled to BytesPerLong, ok.
 const bool Matcher::init_array_count_is_in_bytes = false;
 
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 18 * BytesPerLong;
-
 // Use conditional move (CMOVL)
 const int Matcher::long_cmove_cost() {
   // long cmoves are no more expensive than int cmoves
@@ -4135,14 +4132,14 @@
     MacroAssembler _masm(&cbuf);
     guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
     __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
-               &Assembler::ldxr, &MacroAssembler::cmp, &Assembler::stlxr);
+               Assembler::xword, /*acquire*/ false, /*release*/ true);
   %}
 
   enc_class aarch64_enc_cmpxchgw(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
     MacroAssembler _masm(&cbuf);
     guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
     __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
-               &Assembler::ldxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
+               Assembler::word, /*acquire*/ false, /*release*/ true);
   %}
 
 
@@ -4154,14 +4151,14 @@
     MacroAssembler _masm(&cbuf);
     guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
     __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
-               &Assembler::ldaxr, &MacroAssembler::cmp, &Assembler::stlxr);
+               Assembler::xword, /*acquire*/ true, /*release*/ true);
   %}
 
   enc_class aarch64_enc_cmpxchgw_acq(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
     MacroAssembler _masm(&cbuf);
     guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
     __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
-               &Assembler::ldaxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
+               Assembler::word, /*acquire*/ true, /*release*/ true);
   %}
 
 
@@ -4679,8 +4676,14 @@
 
     // Compare object markOop with mark and if equal exchange scratch1
     // with object markOop.
-    {
+    if (UseLSE) {
+      __ mov(tmp, disp_hdr);
+      __ casal(Assembler::xword, tmp, box, oop);
+      __ cmp(tmp, disp_hdr);
+      __ br(Assembler::EQ, cont);
+    } else {
       Label retry_load;
+      __ prfm(Address(oop), PSTL1STRM);
       __ bind(retry_load);
       __ ldaxr(tmp, oop);
       __ cmp(tmp, disp_hdr);
@@ -4729,8 +4732,13 @@
       __ add(tmp, disp_hdr, (ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value));
       __ mov(disp_hdr, zr);
 
-      {
+      if (UseLSE) {
+        __ mov(rscratch1, disp_hdr);
+        __ casal(Assembler::xword, rscratch1, rthread, tmp);
+        __ cmp(rscratch1, disp_hdr);
+      } else {
         Label retry_load, fail;
+        __ prfm(Address(tmp), PSTL1STRM);
         __ bind(retry_load);
         __ ldaxr(rscratch1, tmp);
         __ cmp(disp_hdr, rscratch1);
@@ -4818,8 +4826,13 @@
     // see the stack address of the basicLock in the markOop of the
     // object.
 
-      {
+      if (UseLSE) {
+        __ mov(tmp, box);
+        __ casl(Assembler::xword, tmp, disp_hdr, oop);
+        __ cmp(tmp, box);
+      } else {
         Label retry_load;
+        __ prfm(Address(oop), PSTL1STRM);
         __ bind(retry_load);
         __ ldxr(tmp, oop);
         __ cmp(box, tmp);
@@ -13281,7 +13294,7 @@
     __ fmovs($dst$$Register, as_FloatRegister($src$$reg));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(fp_f2i);
 
 %}
 
@@ -13299,7 +13312,7 @@
     __ fmovs(as_FloatRegister($dst$$reg), $src$$Register);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(fp_i2f);
 
 %}
 
@@ -13317,7 +13330,7 @@
     __ fmovd($dst$$Register, as_FloatRegister($src$$reg));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(fp_d2l);
 
 %}
 
@@ -13335,7 +13348,7 @@
     __ fmovd(as_FloatRegister($dst$$reg), $src$$Register);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(fp_l2d);
 
 %}
 
@@ -14191,6 +14204,25 @@
   ins_pipe(pipe_cmp_branch);
 %}
 
+instruct cmpN_imm0_branch(cmpOp cmp, iRegN op1, immN0 op2, label labl, rFlagsReg cr) %{
+  match(If cmp (CmpN op1 op2));
+  predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
+	    || n->in(1)->as_Bool()->_test._test == BoolTest::eq);
+  effect(USE labl);
+
+  ins_cost(BRANCH_COST);
+  format %{ "cbw$cmp   $op1, $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
+    if (cond == Assembler::EQ)
+      __ cbzw($op1$$Register, *L);
+    else
+      __ cbnzw($op1$$Register, *L);
+  %}
+  ins_pipe(pipe_cmp_branch);
+%}
+
 instruct cmpP_narrowOop_imm0_branch(cmpOp cmp, iRegN oop, immP0 zero, label labl, rFlagsReg cr) %{
   match(If cmp (CmpP (DecodeN oop) zero));
   predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
@@ -14783,19 +14815,19 @@
 %}
 
 instruct string_equals(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
-                        iRegI_R0 result, iRegP_R10 tmp, rFlagsReg cr)
+                        iRegI_R0 result, rFlagsReg cr)
 %{
   predicate(!CompactStrings);
   match(Set result (StrEquals (Binary str1 str2) cnt));
-  effect(KILL tmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL cr);
-
-  format %{ "String Equals $str1,$str2,$cnt -> $result    // KILL $tmp" %}
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL cr);
+
+  format %{ "String Equals $str1,$str2,$cnt -> $result" %}
   ins_encode %{
     // Count is in 8-bit bytes; non-Compact chars are 16 bits.
     __ asrw($cnt$$Register, $cnt$$Register, 1);
-    __ string_equals($str1$$Register, $str2$$Register,
-                      $cnt$$Register, $result$$Register,
-                      $tmp$$Register);
+    __ arrays_equals($str1$$Register, $str2$$Register,
+                     $result$$Register, $cnt$$Register,
+                     2, /*is_string*/true);
   %}
   ins_pipe(pipe_class_memory);
 %}
@@ -14809,9 +14841,10 @@
 
   format %{ "Array Equals $ary1,ary2 -> $result    // KILL $tmp" %}
   ins_encode %{
-    __ byte_arrays_equals($ary1$$Register, $ary2$$Register,
-                          $result$$Register, $tmp$$Register);
-  %}
+    __ arrays_equals($ary1$$Register, $ary2$$Register,
+                     $result$$Register, $tmp$$Register,
+                     1, /*is_string*/false);
+    %}
   ins_pipe(pipe_class_memory);
 %}
 
@@ -14824,12 +14857,14 @@
 
   format %{ "Array Equals $ary1,ary2 -> $result    // KILL $tmp" %}
   ins_encode %{
-    __ char_arrays_equals($ary1$$Register, $ary2$$Register,
-                          $result$$Register, $tmp$$Register);
+    __ arrays_equals($ary1$$Register, $ary2$$Register,
+                     $result$$Register, $tmp$$Register,
+                     2, /*is_string*/false);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
+
 // encode char[] to byte[] in ISO_8859_1
 instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
                           vRegD_V0 Vtmp1, vRegD_V1 Vtmp2,
@@ -16480,7 +16515,7 @@
             as_FloatRegister($src$$reg),
             as_FloatRegister($shift$$reg));
   %}
-  ins_pipe(vshift64_imm);
+  ins_pipe(vshift64);
 %}
 
 instruct vsll4I(vecX dst, vecX src, vecX shift) %{
@@ -16494,7 +16529,7 @@
             as_FloatRegister($src$$reg),
             as_FloatRegister($shift$$reg));
   %}
-  ins_pipe(vshift128_imm);
+  ins_pipe(vshift128);
 %}
 
 instruct vsrl2I(vecD dst, vecD src, vecX shift) %{
@@ -16507,7 +16542,7 @@
             as_FloatRegister($src$$reg),
             as_FloatRegister($shift$$reg));
   %}
-  ins_pipe(vshift64_imm);
+  ins_pipe(vshift64);
 %}
 
 instruct vsrl4I(vecX dst, vecX src, vecX shift) %{
@@ -16520,7 +16555,7 @@
             as_FloatRegister($src$$reg),
             as_FloatRegister($shift$$reg));
   %}
-  ins_pipe(vshift128_imm);
+  ins_pipe(vshift128);
 %}
 
 instruct vsll2I_imm(vecD dst, vecD src, immI shift) %{
@@ -16638,7 +16673,7 @@
            as_FloatRegister($src$$reg),
            (int)$shift$$constant & 63);
   %}
-  ins_pipe(vshift128);
+  ins_pipe(vshift128_imm);
 %}
 
 instruct vsra2L_imm(vecX dst, vecX src, immI shift) %{
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -972,7 +972,7 @@
 
   // System
   void system(int op0, int op1, int CRn, int CRm, int op2,
-              Register rt = (Register)0b11111)
+              Register rt = dummy_reg)
   {
     starti;
     f(0b11010101000, 31, 21);
@@ -1082,7 +1082,7 @@
 
 #define INSN(NAME, opc)                         \
   void NAME() {                 \
-    branch_reg((Register)0b11111, opc);         \
+    branch_reg(dummy_reg, opc);         \
   }
 
   INSN(eret, 0b0100);
@@ -1094,10 +1094,22 @@
   enum operand_size { byte, halfword, word, xword };
 
   void load_store_exclusive(Register Rs, Register Rt1, Register Rt2,
-    Register Rn, enum operand_size sz, int op, int o0) {
+    Register Rn, enum operand_size sz, int op, bool ordered) {
     starti;
     f(sz, 31, 30), f(0b001000, 29, 24), f(op, 23, 21);
-    rf(Rs, 16), f(o0, 15), rf(Rt2, 10), rf(Rn, 5), rf(Rt1, 0);
+    rf(Rs, 16), f(ordered, 15), rf(Rt2, 10), rf(Rn, 5), rf(Rt1, 0);
+  }
+
+  void load_exclusive(Register dst, Register addr,
+                      enum operand_size sz, bool ordered) {
+    load_store_exclusive(dummy_reg, dst, dummy_reg, addr,
+                         sz, 0b010, ordered);
+  }
+
+  void store_exclusive(Register status, Register new_val, Register addr,
+                       enum operand_size sz, bool ordered) {
+    load_store_exclusive(status, new_val, dummy_reg, addr,
+                         sz, 0b000, ordered);
   }
 
 #define INSN4(NAME, sz, op, o0) /* Four registers */                    \
@@ -1109,19 +1121,19 @@
 #define INSN3(NAME, sz, op, o0) /* Three registers */                   \
   void NAME(Register Rs, Register Rt, Register Rn) {                    \
     guarantee(Rs != Rn && Rs != Rt, "unpredictable instruction");       \
-    load_store_exclusive(Rs, Rt, (Register)0b11111, Rn, sz, op, o0);    \
+    load_store_exclusive(Rs, Rt, dummy_reg, Rn, sz, op, o0); \
   }
 
 #define INSN2(NAME, sz, op, o0) /* Two registers */                     \
   void NAME(Register Rt, Register Rn) {                                 \
-    load_store_exclusive((Register)0b11111, Rt, (Register)0b11111,      \
+    load_store_exclusive(dummy_reg, Rt, dummy_reg, \
                          Rn, sz, op, o0);                               \
   }
 
 #define INSN_FOO(NAME, sz, op, o0) /* Three registers, encoded differently */ \
   void NAME(Register Rt1, Register Rt2, Register Rn) {                  \
     guarantee(Rt1 != Rt2, "unpredictable instruction");                 \
-    load_store_exclusive((Register)0b11111, Rt1, Rt2, Rn, sz, op, o0);  \
+    load_store_exclusive(dummy_reg, Rt1, Rt2, Rn, sz, op, o0);          \
   }
 
   // bytes
@@ -1169,6 +1181,46 @@
 #undef INSN4
 #undef INSN_FOO
 
+  // 8.1 Compare and swap extensions
+  void lse_cas(Register Rs, Register Rt, Register Rn,
+                        enum operand_size sz, bool a, bool r, bool not_pair) {
+    starti;
+    if (! not_pair) { // Pair
+      assert(sz == word || sz == xword, "invalid size");
+      /* The size bit is in bit 30, not 31 */
+      sz = (operand_size)(sz == word ? 0b00:0b01);
+    }
+    f(sz, 31, 30), f(0b001000, 29, 24), f(1, 23), f(a, 22), f(1, 21);
+    rf(Rs, 16), f(r, 15), f(0b11111, 14, 10), rf(Rn, 5), rf(Rt, 0);
+  }
+
+  // CAS
+#define INSN(NAME, a, r)                                                \
+  void NAME(operand_size sz, Register Rs, Register Rt, Register Rn) {   \
+    assert(Rs != Rn && Rs != Rt, "unpredictable instruction");          \
+    lse_cas(Rs, Rt, Rn, sz, a, r, true);                                \
+  }
+  INSN(cas,    false, false)
+  INSN(casa,   true,  false)
+  INSN(casl,   false, true)
+  INSN(casal,  true,  true)
+#undef INSN
+
+  // CASP
+#define INSN(NAME, a, r)                                                \
+  void NAME(operand_size sz, Register Rs, Register Rs1,                 \
+            Register Rt, Register Rt1, Register Rn) {                   \
+    assert((Rs->encoding() & 1) == 0 && (Rt->encoding() & 1) == 0 &&    \
+           Rs->successor() == Rs1 && Rt->successor() == Rt1 &&          \
+           Rs != Rn && Rs1 != Rn && Rs != Rt, "invalid registers");     \
+    lse_cas(Rs, Rt, Rn, sz, a, r, false);                               \
+  }
+  INSN(casp,    false, false)
+  INSN(caspa,   true,  false)
+  INSN(caspl,   false, true)
+  INSN(caspal,  true,  true)
+#undef INSN
+
   // Load register (literal)
 #define INSN(NAME, opc, V)                                              \
   void NAME(Register Rt, address dest) {                                \
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1556,38 +1556,54 @@
 }
 
 void LIR_Assembler::casw(Register addr, Register newval, Register cmpval) {
-  Label retry_load, nope;
-  // flush and load exclusive from the memory location
-  // and fail if it is not what we expect
-  __ bind(retry_load);
-  __ ldaxrw(rscratch1, addr);
-  __ cmpw(rscratch1, cmpval);
-  __ cset(rscratch1, Assembler::NE);
-  __ br(Assembler::NE, nope);
-  // if we store+flush with no intervening write rscratch1 wil be zero
-  __ stlxrw(rscratch1, newval, addr);
-  // retry so we only ever return after a load fails to compare
-  // ensures we don't return a stale value after a failed write.
-  __ cbnzw(rscratch1, retry_load);
-  __ bind(nope);
+  if (UseLSE) {
+    __ mov(rscratch1, cmpval);  // CAS on a copy so cmpval is still available for the check below
+    __ casal(Assembler::word, rscratch1, newval, addr);
+    __ cmpw(rscratch1, cmpval);
+    __ cset(rscratch1, Assembler::NE);  // rscratch1 = 0 when the values match, 1 otherwise
+  } else {
+    Label retry_load, nope;
+    // flush and load exclusive from the memory location
+    // and fail if it is not what we expect
+    __ prfm(Address(addr), PSTL1STRM);  // issued once, ahead of the retry loop
+    __ bind(retry_load);
+    __ ldaxrw(rscratch1, addr);
+    __ cmpw(rscratch1, cmpval);
+    __ cset(rscratch1, Assembler::NE);
+    __ br(Assembler::NE, nope);
+    // if we store+flush with no intervening write rscratch1 will be zero
+    __ stlxrw(rscratch1, newval, addr);
+    // retry so we only ever return after a load fails to compare
+    // ensures we don't return a stale value after a failed write.
+    __ cbnzw(rscratch1, retry_load);
+    __ bind(nope);
+  }
   __ membar(__ AnyAny);
 }
 
 void LIR_Assembler::casl(Register addr, Register newval, Register cmpval) {
-  Label retry_load, nope;
-  // flush and load exclusive from the memory location
-  // and fail if it is not what we expect
-  __ bind(retry_load);
-  __ ldaxr(rscratch1, addr);
-  __ cmp(rscratch1, cmpval);
-  __ cset(rscratch1, Assembler::NE);
-  __ br(Assembler::NE, nope);
-  // if we store+flush with no intervening write rscratch1 wil be zero
-  __ stlxr(rscratch1, newval, addr);
-  // retry so we only ever return after a load fails to compare
-  // ensures we don't return a stale value after a failed write.
-  __ cbnz(rscratch1, retry_load);
-  __ bind(nope);
+  if (UseLSE) {
+    __ mov(rscratch1, cmpval);  // CAS on a copy so cmpval is still available for the check below
+    __ casal(Assembler::xword, rscratch1, newval, addr);
+    __ cmp(rscratch1, cmpval);
+    __ cset(rscratch1, Assembler::NE);  // rscratch1 = 0 when the values match, 1 otherwise
+  } else {
+    Label retry_load, nope;
+    // flush and load exclusive from the memory location
+    // and fail if it is not what we expect
+    __ prfm(Address(addr), PSTL1STRM);  // issued once, ahead of the retry loop
+    __ bind(retry_load);
+    __ ldaxr(rscratch1, addr);
+    __ cmp(rscratch1, cmpval);
+    __ cset(rscratch1, Assembler::NE);
+    __ br(Assembler::NE, nope);
+    // if we store+flush with no intervening write rscratch1 will be zero
+    __ stlxr(rscratch1, newval, addr);
+    // retry so we only ever return after a load fails to compare
+    // ensures we don't return a stale value after a failed write.
+    __ cbnz(rscratch1, retry_load);
+    __ bind(nope);
+  }
   __ membar(__ AnyAny);
 }
 
@@ -3156,6 +3172,7 @@
       }
       Label again;
       __ lea(tmp, addr);
+      __ prfm(Address(tmp), PSTL1STRM);
       __ bind(again);
       (_masm->*lda)(dst, tmp);
       (_masm->*add)(rscratch1, dst, inc);
@@ -3175,6 +3192,7 @@
       assert_different_registers(obj, addr.base(), tmp, rscratch2, dst);
       Label again;
       __ lea(tmp, addr);
+      __ prfm(Address(tmp), PSTL1STRM);
       __ bind(again);
       (_masm->*lda)(dst, tmp);
       (_masm->*stl)(rscratch2, obj, tmp);
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -76,6 +76,8 @@
 // avoid biased locking while we are bootstrapping the aarch64 build
 define_pd_global(bool, UseBiasedLocking, false);
 
+define_pd_global(intx, InitArrayShortSize, 18*BytesPerLong);
+
 #if defined(COMPILER1) || defined(COMPILER2)
 define_pd_global(intx, InlineSmallCode,          1000);
 #endif
@@ -101,9 +103,13 @@
                                                                         \
   product(bool, UseCRC32, false,                                        \
           "Use CRC32 instructions for CRC32 computation")               \
+                                                                        \
+  product(bool, UseLSE, false,                                          \
+          "Use LSE instructions")                                       \
 
 // Don't attempt to use Neon on builtin sim until builtin sim supports it
 #define UseCRC32 false
+#define UseSIMDForMemoryOps    false
 
 #else
 #define UseBuiltinSim           false
@@ -121,6 +127,10 @@
           "Use Neon for CRC32 computation")                             \
   product(bool, UseCRC32, false,                                        \
           "Use CRC32 instructions for CRC32 computation")               \
+  product(bool, UseSIMDForMemoryOps, false,                             \
+          "Use SIMD instructions in generated memory move code")        \
+  product(bool, UseLSE, false,                                          \
+          "Use LSE instructions")                                       \
   product(bool, TraceTraps, false, "Trace all traps the signal handler")
 
 #endif
--- a/src/cpu/aarch64/vm/jvmciCodeInstaller_aarch64.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/jvmciCodeInstaller_aarch64.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -74,7 +74,7 @@
 void CodeInstaller::pd_patch_DataSectionReference(int pc_offset, int data_offset, TRAPS) {
   address pc = _instructions->start() + pc_offset;
   NativeInstruction* inst = nativeInstruction_at(pc);
-  if (inst->is_adr_aligned()) {
+  if (inst->is_adr_aligned() || inst->is_ldr_literal()) {
     address dest = _constants->start() + data_offset;
     _instructions->relocate(pc, section_word_Relocation::spec((address) dest, CodeBuffer::SECT_CONSTS));
     TRACE_jvmci_3("relocating at " PTR_FORMAT " (+%d) with destination at %d", p2i(pc), pc_offset, data_offset);
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1638,6 +1638,7 @@
 
 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
   Label retry_load;
+  prfm(Address(counter_addr), PSTL1STRM);
   bind(retry_load);
   // flush and load exclusive from the memory location
   ldxrw(tmp, counter_addr);
@@ -2070,25 +2071,32 @@
   // oldv holds comparison value
   // newv holds value to write in exchange
   // addr identifies memory word to compare against/update
-  // tmp returns 0/1 for success/failure
-  Label retry_load, nope;
-
-  bind(retry_load);
-  // flush and load exclusive from the memory location
-  // and fail if it is not what we expect
-  ldaxr(tmp, addr);
-  cmp(tmp, oldv);
-  br(Assembler::NE, nope);
-  // if we store+flush with no intervening write tmp wil be zero
-  stlxr(tmp, newv, addr);
-  cbzw(tmp, succeed);
-  // retry so we only ever return after a load fails to compare
-  // ensures we don't return a stale value after a failed write.
-  b(retry_load);
-  // if the memory word differs we return it in oldv and signal a fail
-  bind(nope);
-  membar(AnyAny);
-  mov(oldv, tmp);
+  if (UseLSE) {
+    mov(tmp, oldv);
+    casal(Assembler::xword, oldv, newv, addr);
+    cmp(tmp, oldv);
+    br(Assembler::EQ, succeed);
+    membar(AnyAny);
+  } else {
+    Label retry_load, nope;
+    prfm(Address(addr), PSTL1STRM);
+    bind(retry_load);
+    // flush and load exclusive from the memory location
+    // and fail if it is not what we expect
+    ldaxr(tmp, addr);
+    cmp(tmp, oldv);
+    br(Assembler::NE, nope);
+    // if we store+flush with no intervening write tmp will be zero
+    stlxr(tmp, newv, addr);
+    cbzw(tmp, succeed);
+    // retry so we only ever return after a load fails to compare
+    // ensures we don't return a stale value after a failed write.
+    b(retry_load);
+    // if the memory word differs we return it in oldv and signal a fail
+    bind(nope);
+    membar(AnyAny);
+    mov(oldv, tmp);
+  }
   if (fail)
     b(*fail);
 }
@@ -2099,28 +2107,64 @@
   // newv holds value to write in exchange
   // addr identifies memory word to compare against/update
   // tmp returns 0/1 for success/failure
-  Label retry_load, nope;
-
-  bind(retry_load);
-  // flush and load exclusive from the memory location
-  // and fail if it is not what we expect
-  ldaxrw(tmp, addr);
-  cmp(tmp, oldv);
-  br(Assembler::NE, nope);
-  // if we store+flush with no intervening write tmp wil be zero
-  stlxrw(tmp, newv, addr);
-  cbzw(tmp, succeed);
-  // retry so we only ever return after a load fails to compare
-  // ensures we don't return a stale value after a failed write.
-  b(retry_load);
-  // if the memory word differs we return it in oldv and signal a fail
-  bind(nope);
-  membar(AnyAny);
-  mov(oldv, tmp);
+  if (UseLSE) {
+    mov(tmp, oldv);
+    casal(Assembler::word, oldv, newv, addr);
+    cmp(tmp, oldv);
+    br(Assembler::EQ, succeed);
+    membar(AnyAny);
+  } else {
+    Label retry_load, nope;
+    prfm(Address(addr), PSTL1STRM);
+    bind(retry_load);
+    // flush and load exclusive from the memory location
+    // and fail if it is not what we expect
+    ldaxrw(tmp, addr);
+    cmp(tmp, oldv);
+    br(Assembler::NE, nope);
+    // if we store+flush with no intervening write tmp will be zero
+    stlxrw(tmp, newv, addr);
+    cbzw(tmp, succeed);
+    // retry so we only ever return after a load fails to compare
+    // ensures we don't return a stale value after a failed write.
+    b(retry_load);
+    // if the memory word differs we return it in oldv and signal a fail
+    bind(nope);
+    membar(AnyAny);
+    mov(oldv, tmp);
+  }
   if (fail)
     b(*fail);
 }
 
+// A generic CAS; success or failure is in the EQ flag.  Clobbers tmp.
+void MacroAssembler::cmpxchg(Register addr, Register expected, Register new_val,
+    enum operand_size size, bool acquire, bool release, Register tmp) {
+  if (UseLSE) {
+    mov(tmp, expected);  // the CAS works on tmp, leaving expected intact for the compare
+    lse_cas(tmp, new_val, addr, size, acquire, release, /*not_pair*/ true);
+    if (size == xword)
+      cmp(tmp, expected);
+    else
+      cmpw(tmp, expected);  // same compare width as the exclusive-access path below
+  } else {
+    BLOCK_COMMENT("cmpxchg {");
+    Label retry_load, done;
+    prfm(Address(addr), PSTL1STRM);
+    bind(retry_load);
+    load_exclusive(tmp, addr, size, acquire);
+    if (size == xword)
+      cmp(tmp, expected);
+    else
+      cmpw(tmp, expected);
+    br(Assembler::NE, done);
+    store_exclusive(tmp, new_val, addr, size, release);
+    cbnzw(tmp, retry_load);
+    bind(done);
+    BLOCK_COMMENT("} cmpxchg");
+  }
+}
+
 static bool different(Register a, RegisterOrConstant b, Register c) {
   if (b.is_constant())
     return a != c;
@@ -2135,6 +2179,7 @@
     result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                         \
   Label retry_load;                                                     \
+  prfm(Address(addr), PSTL1STRM);                                       \
   bind(retry_load);                                                     \
   LDXR(result, addr);                                                   \
   OP(rscratch1, result, incr);                                          \
@@ -2157,6 +2202,7 @@
     result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                         \
   Label retry_load;                                                     \
+  prfm(Address(addr), PSTL1STRM);                                       \
   bind(retry_load);                                                     \
   LDXR(result, addr);                                                   \
   STXR(rscratch1, newv, addr);                                          \
@@ -4481,225 +4527,126 @@
   BLOCK_COMMENT("} string_compare");
 }
 
-
-void MacroAssembler::string_equals(Register str1, Register str2,
-                                   Register cnt, Register result,
-                                   Register tmp1) {
-  Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
-    NEXT_WORD;
-
-  const Register tmp2 = rscratch1;
-  assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);
-
-  BLOCK_COMMENT("string_equals {");
-
-  // Start by assuming that the strings are not equal.
-  mov(result, zr);
-
-  // A very short string
-  cmpw(cnt, 4);
-  br(Assembler::LT, SHORT_STRING);
-
-  // Check if the strings start at the same location.
-  cmp(str1, str2);
-  br(Assembler::EQ, SAME_CHARS);
-
-  // Compare longwords
+// Compare Strings or char/byte arrays.
+
+// is_string is true iff this is a string comparison.
+
+// For Strings we're passed the address of the first characters in a1
+// and a2 and the length in cnt1.
+
+// For byte and char arrays we're passed the arrays themselves and we
+// have to extract length fields and do null checks here.
+
+// elem_size is the element size in bytes: either 1 or 2.
+
+// There are two implementations.  For arrays >= 8 bytes, all
+// comparisons (including the final one, which may overlap) are
+// performed 8 bytes at a time.  For arrays < 8 bytes, we compare a
+// word, then a halfword, and then a byte.
+
+void MacroAssembler::arrays_equals(Register a1, Register a2,
+                                   Register result, Register cnt1,
+                                   int elem_size, bool is_string)
+{
+  Label SAME, DONE, SHORT, NEXT_WORD, ONE;  // NOTE(review): ONE is never bound or branched to here
+  Register tmp1 = rscratch1;
+  Register tmp2 = rscratch2;
+  Register cnt2 = tmp2;  // cnt2 only used in array length compare
+  int elem_per_word = wordSize/elem_size;
+  int log_elem_size = exact_log2(elem_size);
+  int length_offset = arrayOopDesc::length_offset_in_bytes();
+  int base_offset
+    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
+
+  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
+  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
+
+  BLOCK_COMMENT(is_string ? "string_equals {" : "array_equals {");
+
+  mov(result, false);  // stays false on every path that reaches DONE without passing SAME
+
+  if (!is_string) {
+    // if (a1==a2)
+    //     return true;
+    eor(rscratch1, a1, a2);
+    cbz(rscratch1, SAME);
+    // if (a1==null || a2==null)
+    //     return false;
+    cbz(a1, DONE);
+    cbz(a2, DONE);
+    // if (a1.length != a2.length)
+    //      return false;
+    ldrw(cnt1, Address(a1, length_offset));
+    ldrw(cnt2, Address(a2, length_offset));
+    eorw(tmp1, cnt1, cnt2);
+    cbnzw(tmp1, DONE);
+
+    lea(a1, Address(a1, base_offset));  // from here on a1/a2 point at the first element
+    lea(a2, Address(a2, base_offset));
+  }
+
+  // Check for short strings, i.e. smaller than wordSize.
+  subs(cnt1, cnt1, elem_per_word);
+  br(Assembler::LT, SHORT);
+  // Main 8 byte comparison loop.
+  bind(NEXT_WORD); {
+    ldr(tmp1, Address(post(a1, wordSize)));
+    ldr(tmp2, Address(post(a2, wordSize)));
+    subs(cnt1, cnt1, elem_per_word);
+    eor(tmp1, tmp1, tmp2);
+    cbnz(tmp1, DONE);
+  } br(GT, NEXT_WORD);
+  // Last longword.  In the case where length == 4 we compare the
+  // same longword twice, but that's still faster than another
+  // conditional branch.
+  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
+  // length == 4.
+  if (log_elem_size > 0)
+    lsl(cnt1, cnt1, log_elem_size);  // element count -> byte offset
+  ldr(tmp1, Address(a1, cnt1));
+  ldr(tmp2, Address(a2, cnt1));
+  eor(tmp1, tmp1, tmp2);
+  cbnz(tmp1, DONE);
+  b(SAME);
+
+  bind(SHORT);
+  Label TAIL03, TAIL01;
+
+  tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
   {
-    subw(cnt, cnt, 4); // The last longword is a special case
-
-    // Move both string pointers to the last longword of their
-    // strings, negate the remaining count, and convert it to bytes.
-    lea(str1, Address(str1, cnt, Address::uxtw(1)));
-    lea(str2, Address(str2, cnt, Address::uxtw(1)));
-    sub(cnt, zr, cnt, LSL, 1);
-
-    // Loop, loading longwords and comparing them into rscratch2.
-    bind(NEXT_WORD);
-    ldr(tmp1, Address(str1, cnt));
-    ldr(tmp2, Address(str2, cnt));
-    adds(cnt, cnt, wordSize);
-    eor(rscratch2, tmp1, tmp2);
-    cbnz(rscratch2, DONE);
-    br(Assembler::LT, NEXT_WORD);
-
-    // Last longword.  In the case where length == 4 we compare the
-    // same longword twice, but that's still faster than another
-    // conditional branch.
-
-    ldr(tmp1, Address(str1));
-    ldr(tmp2, Address(str2));
-    eor(rscratch2, tmp1, tmp2);
-    cbz(rscratch2, SAME_CHARS);
-    b(DONE);
+    ldrw(tmp1, Address(post(a1, 4)));
+    ldrw(tmp2, Address(post(a2, 4)));
+    eorw(tmp1, tmp1, tmp2);
+    cbnzw(tmp1, DONE);
   }
-
-  bind(SHORT_STRING);
-  // Is the length zero?
-  cbz(cnt, SAME_CHARS);
-
-  bind(SHORT_LOOP);
-  load_unsigned_short(tmp1, Address(post(str1, 2)));
-  load_unsigned_short(tmp2, Address(post(str2, 2)));
-  subw(tmp1, tmp1, tmp2);
-  cbnz(tmp1, DONE);
-  sub(cnt, cnt, 1);
-  cbnz(cnt, SHORT_LOOP);
-
-  // Strings are equal.
-  bind(SAME_CHARS);
+  bind(TAIL03);
+  tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
+  {
+    ldrh(tmp1, Address(post(a1, 2)));
+    ldrh(tmp2, Address(post(a2, 2)));
+    eorw(tmp1, tmp1, tmp2);
+    cbnzw(tmp1, DONE);
+  }
+  bind(TAIL01);
+  if (elem_size == 1) { // Only needed when comparing byte arrays.
+    tbz(cnt1, 0, SAME); // 0-1 bytes left.
+    {
+      ldrb(tmp1, a1);
+      ldrb(tmp2, a2);
+      eorw(tmp1, tmp1, tmp2);
+      cbnzw(tmp1, DONE);
+    }
+  }
+  // Arrays are equal.
+  bind(SAME);
   mov(result, true);
 
-  // That's it
+  // That's it.
   bind(DONE);
-
-  BLOCK_COMMENT("} string_equals");
+  BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
 }
 
 
-void MacroAssembler::byte_arrays_equals(Register ary1, Register ary2,
-                                        Register result, Register tmp1)
-{
-  Register cnt1 = rscratch1;
-  Register cnt2 = rscratch2;
-  Register tmp2 = rscratch2;
-
-  Label SAME, DIFFER, NEXT, TAIL07, TAIL03, TAIL01;
-
-  int length_offset  = arrayOopDesc::length_offset_in_bytes();
-  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_BYTE);
-
-  BLOCK_COMMENT("byte_arrays_equals  {");
-
-    // different until proven equal
-    mov(result, false);
-
-    // same array?
-    cmp(ary1, ary2);
-    br(Assembler::EQ, SAME);
-
-    // ne if either null
-    cbz(ary1, DIFFER);
-    cbz(ary2, DIFFER);
-
-    // lengths ne?
-    ldrw(cnt1, Address(ary1, length_offset));
-    ldrw(cnt2, Address(ary2, length_offset));
-    cmp(cnt1, cnt2);
-    br(Assembler::NE, DIFFER);
-
-    lea(ary1, Address(ary1, base_offset));
-    lea(ary2, Address(ary2, base_offset));
-
-    subs(cnt1, cnt1, 8);
-    br(LT, TAIL07);
-
-  BIND(NEXT);
-    ldr(tmp1, Address(post(ary1, 8)));
-    ldr(tmp2, Address(post(ary2, 8)));
-    subs(cnt1, cnt1, 8);
-    eor(tmp1, tmp1, tmp2);
-    cbnz(tmp1, DIFFER);
-    br(GE, NEXT);
-
-  BIND(TAIL07);  // 0-7 bytes left, cnt1 = #bytes left - 4
-    tst(cnt1, 0b100);
-    br(EQ, TAIL03);
-    ldrw(tmp1, Address(post(ary1, 4)));
-    ldrw(tmp2, Address(post(ary2, 4)));
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-
-  BIND(TAIL03);  // 0-3 bytes left, cnt1 = #bytes left - 4
-    tst(cnt1, 0b10);
-    br(EQ, TAIL01);
-    ldrh(tmp1, Address(post(ary1, 2)));
-    ldrh(tmp2, Address(post(ary2, 2)));
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-  BIND(TAIL01);  // 0-1 byte left
-    tst(cnt1, 0b01);
-    br(EQ, SAME);
-    ldrb(tmp1, ary1);
-    ldrb(tmp2, ary2);
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-
-  BIND(SAME);
-    mov(result, true);
-  BIND(DIFFER); // result already set
-
-  BLOCK_COMMENT("} byte_arrays_equals");
-}
-
-// Compare char[] arrays aligned to 4 bytes
-void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
-                                        Register result, Register tmp1)
-{
-  Register cnt1 = rscratch1;
-  Register cnt2 = rscratch2;
-  Register tmp2 = rscratch2;
-
-  Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
-
-  int length_offset  = arrayOopDesc::length_offset_in_bytes();
-  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
-
-  BLOCK_COMMENT("char_arrays_equals  {");
-
-    // different until proven equal
-    mov(result, false);
-
-    // same array?
-    cmp(ary1, ary2);
-    br(Assembler::EQ, SAME);
-
-    // ne if either null
-    cbz(ary1, DIFFER);
-    cbz(ary2, DIFFER);
-
-    // lengths ne?
-    ldrw(cnt1, Address(ary1, length_offset));
-    ldrw(cnt2, Address(ary2, length_offset));
-    cmp(cnt1, cnt2);
-    br(Assembler::NE, DIFFER);
-
-    lea(ary1, Address(ary1, base_offset));
-    lea(ary2, Address(ary2, base_offset));
-
-    subs(cnt1, cnt1, 4);
-    br(LT, TAIL03);
-
-  BIND(NEXT);
-    ldr(tmp1, Address(post(ary1, 8)));
-    ldr(tmp2, Address(post(ary2, 8)));
-    subs(cnt1, cnt1, 4);
-    eor(tmp1, tmp1, tmp2);
-    cbnz(tmp1, DIFFER);
-    br(GE, NEXT);
-
-  BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
-    tst(cnt1, 0b10);
-    br(EQ, TAIL01);
-    ldrw(tmp1, Address(post(ary1, 4)));
-    ldrw(tmp2, Address(post(ary2, 4)));
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-  BIND(TAIL01);  // 0-1 chars left
-    tst(cnt1, 0b01);
-    br(EQ, SAME);
-    ldrh(tmp1, ary1);
-    ldrh(tmp2, ary2);
-    cmp(tmp1, tmp2);
-    br(NE, DIFFER);
-
-  BIND(SAME);
-    mov(result, true);
-  BIND(DIFFER); // result already set
-
-  BLOCK_COMMENT("} char_arrays_equals");
-}
-
 // encode char[] to byte[] in ISO_8859_1
 void MacroAssembler::encode_iso_array(Register src, Register dst,
                       Register len, Register result,
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -971,21 +971,10 @@
   }
 
   // A generic CAS; success or failure is in the EQ flag.
-  template <typename T1, typename T2>
   void cmpxchg(Register addr, Register expected, Register new_val,
-               T1 load_insn,
-               void (MacroAssembler::*cmp_insn)(Register, Register),
-               T2 store_insn,
-               Register tmp = rscratch1) {
-    Label retry_load, done;
-    bind(retry_load);
-    (this->*load_insn)(tmp, addr);
-    (this->*cmp_insn)(tmp, expected);
-    br(Assembler::NE, done);
-    (this->*store_insn)(tmp, new_val, addr);
-    cbnzw(tmp, retry_load);
-    bind(done);
-  }
+               enum operand_size size,
+               bool acquire, bool release,
+               Register tmp = rscratch1);
 
   // Calls
 
@@ -1186,13 +1175,11 @@
   void string_compare(Register str1, Register str2,
                       Register cnt1, Register cnt2, Register result,
                       Register tmp1);
-  void string_equals(Register str1, Register str2,
-                     Register cnt, Register result,
-                     Register tmp1);
-  void char_arrays_equals(Register ary1, Register ary2,
-                          Register result, Register tmp1);
-  void byte_arrays_equals(Register ary1, Register ary2,
-                          Register result, Register tmp1);
+
+  void arrays_equals(Register a1, Register a2,
+                     Register result, Register cnt1,
+                     int elem_size, bool is_string);
+
   void encode_iso_array(Register src, Register dst,
                         Register len, Register result,
                         FloatRegister Vtmp1, FloatRegister Vtmp2,
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -105,13 +105,20 @@
   inline friend NativeInstruction* nativeInstruction_at(address address);
 
   static bool is_adrp_at(address instr);
+
   static bool is_ldr_literal_at(address instr);
+
+  bool is_ldr_literal() {  // instance counterpart of the static is_ldr_literal_at()
+    return is_ldr_literal_at(addr_at(0));  // applies the static check to addr_at(0)
+  }
+
   static bool is_ldrw_to_zr(address instr);
 
   static bool is_call_at(address instr) {
     const uint32_t insn = (*(uint32_t*)instr);
     return (insn >> 26) == 0b100101;
   }
+
   bool is_call() {
     return is_call_at(addr_at(0));
   }
--- a/src/cpu/aarch64/vm/register_aarch64.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/register_aarch64.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -107,6 +107,9 @@
 CONSTANT_REGISTER_DECLARATION(Register, zr,  (32));
 CONSTANT_REGISTER_DECLARATION(Register, sp,  (33));
 
+// Used as a filler in instructions where a register field is unused; it is an alias of r31_sp.
+const Register dummy_reg = r31_sp;
+
 // Use FloatRegister as shortcut
 class FloatRegisterImpl;
 typedef FloatRegisterImpl* FloatRegister;
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -163,30 +163,20 @@
     sp_after_call_off = -26,
 
     d15_off            = -26,
-    d14_off            = -25,
     d13_off            = -24,
-    d12_off            = -23,
     d11_off            = -22,
-    d10_off            = -21,
     d9_off             = -20,
-    d8_off             = -19,
 
     r28_off            = -18,
-    r27_off            = -17,
     r26_off            = -16,
-    r25_off            = -15,
     r24_off            = -14,
-    r23_off            = -13,
     r22_off            = -12,
-    r21_off            = -11,
     r20_off            = -10,
-    r19_off            =  -9,
     call_wrapper_off   =  -8,
     result_off         =  -7,
     result_type_off    =  -6,
     method_off         =  -5,
     entry_point_off    =  -4,
-    parameters_off     =  -3,
     parameter_size_off =  -2,
     thread_off         =  -1,
     fp_f               =   0,
@@ -208,30 +198,20 @@
     const Address result_type   (rfp, result_type_off    * wordSize);
     const Address method        (rfp, method_off         * wordSize);
     const Address entry_point   (rfp, entry_point_off    * wordSize);
-    const Address parameters    (rfp, parameters_off     * wordSize);
     const Address parameter_size(rfp, parameter_size_off * wordSize);
 
     const Address thread        (rfp, thread_off         * wordSize);
 
     const Address d15_save      (rfp, d15_off * wordSize);
-    const Address d14_save      (rfp, d14_off * wordSize);
     const Address d13_save      (rfp, d13_off * wordSize);
-    const Address d12_save      (rfp, d12_off * wordSize);
     const Address d11_save      (rfp, d11_off * wordSize);
-    const Address d10_save      (rfp, d10_off * wordSize);
     const Address d9_save       (rfp, d9_off * wordSize);
-    const Address d8_save       (rfp, d8_off * wordSize);
 
     const Address r28_save      (rfp, r28_off * wordSize);
-    const Address r27_save      (rfp, r27_off * wordSize);
     const Address r26_save      (rfp, r26_off * wordSize);
-    const Address r25_save      (rfp, r25_off * wordSize);
     const Address r24_save      (rfp, r24_off * wordSize);
-    const Address r23_save      (rfp, r23_off * wordSize);
     const Address r22_save      (rfp, r22_off * wordSize);
-    const Address r21_save      (rfp, r21_off * wordSize);
     const Address r20_save      (rfp, r20_off * wordSize);
-    const Address r19_save      (rfp, r19_off * wordSize);
 
     // stub code
 
@@ -254,31 +234,20 @@
     // rthread because we want to sanity check rthread later
     __ str(c_rarg7,  thread);
     __ strw(c_rarg6, parameter_size);
-    __ str(c_rarg5,  parameters);
-    __ str(c_rarg4,  entry_point);
-    __ str(c_rarg3,  method);
-    __ str(c_rarg2,  result_type);
-    __ str(c_rarg1,  result);
-    __ str(c_rarg0,  call_wrapper);
-    __ str(r19,      r19_save);
-    __ str(r20,      r20_save);
-    __ str(r21,      r21_save);
-    __ str(r22,      r22_save);
-    __ str(r23,      r23_save);
-    __ str(r24,      r24_save);
-    __ str(r25,      r25_save);
-    __ str(r26,      r26_save);
-    __ str(r27,      r27_save);
-    __ str(r28,      r28_save);
-
-    __ strd(v8,      d8_save);
-    __ strd(v9,      d9_save);
-    __ strd(v10,     d10_save);
-    __ strd(v11,     d11_save);
-    __ strd(v12,     d12_save);
-    __ strd(v13,     d13_save);
-    __ strd(v14,     d14_save);
-    __ strd(v15,     d15_save);
+    __ stp(c_rarg4, c_rarg5,  entry_point);
+    __ stp(c_rarg2, c_rarg3,  result_type);
+    __ stp(c_rarg0, c_rarg1,  call_wrapper);
+
+    __ stp(r20, r19,   r20_save);
+    __ stp(r22, r21,   r22_save);
+    __ stp(r24, r23,   r24_save);
+    __ stp(r26, r25,   r26_save);
+    __ stp(r28, r27,   r28_save);
+
+    __ stpd(v9,  v8,   d9_save);
+    __ stpd(v11, v10,  d11_save);
+    __ stpd(v13, v12,  d13_save);
+    __ stpd(v15, v14,  d15_save);
 
     // install Java thread in global register now we have saved
     // whatever value it held
@@ -385,33 +354,22 @@
 #endif
 
     // restore callee-save registers
-    __ ldrd(v15,      d15_save);
-    __ ldrd(v14,      d14_save);
-    __ ldrd(v13,      d13_save);
-    __ ldrd(v12,      d12_save);
-    __ ldrd(v11,      d11_save);
-    __ ldrd(v10,      d10_save);
-    __ ldrd(v9,       d9_save);
-    __ ldrd(v8,       d8_save);
-
-    __ ldr(r28,      r28_save);
-    __ ldr(r27,      r27_save);
-    __ ldr(r26,      r26_save);
-    __ ldr(r25,      r25_save);
-    __ ldr(r24,      r24_save);
-    __ ldr(r23,      r23_save);
-    __ ldr(r22,      r22_save);
-    __ ldr(r21,      r21_save);
-    __ ldr(r20,      r20_save);
-    __ ldr(r19,      r19_save);
-    __ ldr(c_rarg0,  call_wrapper);
-    __ ldr(c_rarg1,  result);
+    __ ldpd(v15, v14,  d15_save);
+    __ ldpd(v13, v12,  d13_save);
+    __ ldpd(v11, v10,  d11_save);
+    __ ldpd(v9,  v8,   d9_save);
+
+    __ ldp(r28, r27,   r28_save);
+    __ ldp(r26, r25,   r26_save);
+    __ ldp(r24, r23,   r24_save);
+    __ ldp(r22, r21,   r22_save);
+    __ ldp(r20, r19,   r20_save);
+
+    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
     __ ldrw(c_rarg2, result_type);
     __ ldr(c_rarg3,  method);
-    __ ldr(c_rarg4,  entry_point);
-    __ ldr(c_rarg5,  parameters);
-    __ ldr(c_rarg6,  parameter_size);
-    __ ldr(c_rarg7,  thread);
+    __ ldp(c_rarg4, c_rarg5,  entry_point);
+    __ ldp(c_rarg6, c_rarg7,  parameter_size);
 
 #ifndef PRODUCT
     // tell the simulator we are about to end Java execution
@@ -771,7 +729,7 @@
   //
   // count is a count of words.
   //
-  // Precondition: count >= 2
+  // Precondition: count >= 8
   //
   // Postconditions:
   //
@@ -783,6 +741,7 @@
   void generate_copy_longs(Label &start, Register s, Register d, Register count,
                            copy_direction direction) {
     int unit = wordSize * direction;
+    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 
     int offset;
     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
@@ -792,7 +751,7 @@
     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
     assert_different_registers(s, d, count, rscratch1);
 
-    Label again, large, small;
+    Label again, drain;
     const char *stub_name;
     if (direction == copy_forwards)
       stub_name = "foward_copy_longs";
@@ -801,57 +760,35 @@
     StubCodeMark mark(this, "StubRoutines", stub_name);
     __ align(CodeEntryAlignment);
     __ bind(start);
-    __ cmp(count, 8);
-    __ br(Assembler::LO, small);
     if (direction == copy_forwards) {
-      __ sub(s, s, 2 * wordSize);
-      __ sub(d, d, 2 * wordSize);
+      __ sub(s, s, bias);
+      __ sub(d, d, bias);
     }
+
+#ifdef ASSERT
+    // Make sure we are never given < 8 words
+    {
+      Label L;
+      __ cmp(count, 8);
+      __ br(Assembler::GE, L);
+      __ stop("genrate_copy_longs called with < 8 words");
+      __ bind(L);
+    }
+#endif
+
+    // Fill 8 registers
+    if (UseSIMDForMemoryOps) {
+      __ ldpq(v0, v1, Address(s, 4 * unit));
+      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
+    } else {
+      __ ldp(t0, t1, Address(s, 2 * unit));
+      __ ldp(t2, t3, Address(s, 4 * unit));
+      __ ldp(t4, t5, Address(s, 6 * unit));
+      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+    }
+
     __ subs(count, count, 16);
-    __ br(Assembler::GE, large);
-
-    // 8 <= count < 16 words.  Copy 8.
-    __ ldp(t0, t1, Address(s, 2 * unit));
-    __ ldp(t2, t3, Address(s, 4 * unit));
-    __ ldp(t4, t5, Address(s, 6 * unit));
-    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
-
-    __ stp(t0, t1, Address(d, 2 * unit));
-    __ stp(t2, t3, Address(d, 4 * unit));
-    __ stp(t4, t5, Address(d, 6 * unit));
-    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
-
-    if (direction == copy_forwards) {
-      __ add(s, s, 2 * wordSize);
-      __ add(d, d, 2 * wordSize);
-    }
-
-    {
-      Label L1, L2;
-      __ bind(small);
-      __ tbz(count, exact_log2(4), L1);
-      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
-      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
-      __ bind(L1);
-
-      __ tbz(count, 1, L2);
-      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
-      __ bind(L2);
-    }
-
-    __ ret(lr);
-
-    __ align(CodeEntryAlignment);
-    __ bind(large);
-
-    // Fill 8 registers
-    __ ldp(t0, t1, Address(s, 2 * unit));
-    __ ldp(t2, t3, Address(s, 4 * unit));
-    __ ldp(t4, t5, Address(s, 6 * unit));
-    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+    __ br(Assembler::LO, drain);
 
     int prefetch = PrefetchCopyIntervalInBytes;
     bool use_stride = false;
@@ -866,38 +803,56 @@
     if (PrefetchCopyIntervalInBytes > 0)
       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 
-    __ stp(t0, t1, Address(d, 2 * unit));
-    __ ldp(t0, t1, Address(s, 2 * unit));
-    __ stp(t2, t3, Address(d, 4 * unit));
-    __ ldp(t2, t3, Address(s, 4 * unit));
-    __ stp(t4, t5, Address(d, 6 * unit));
-    __ ldp(t4, t5, Address(s, 6 * unit));
-    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
-    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+    if (UseSIMDForMemoryOps) {
+      __ stpq(v0, v1, Address(d, 4 * unit));
+      __ ldpq(v0, v1, Address(s, 4 * unit));
+      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
+      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
+    } else {
+      __ stp(t0, t1, Address(d, 2 * unit));
+      __ ldp(t0, t1, Address(s, 2 * unit));
+      __ stp(t2, t3, Address(d, 4 * unit));
+      __ ldp(t2, t3, Address(s, 4 * unit));
+      __ stp(t4, t5, Address(d, 6 * unit));
+      __ ldp(t4, t5, Address(s, 6 * unit));
+      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
+      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+    }
 
     __ subs(count, count, 8);
     __ br(Assembler::HS, again);
 
     // Drain
-    __ stp(t0, t1, Address(d, 2 * unit));
-    __ stp(t2, t3, Address(d, 4 * unit));
-    __ stp(t4, t5, Address(d, 6 * unit));
-    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
-
-    if (direction == copy_forwards) {
-      __ add(s, s, 2 * wordSize);
-      __ add(d, d, 2 * wordSize);
+    __ bind(drain);
+    if (UseSIMDForMemoryOps) {
+      __ stpq(v0, v1, Address(d, 4 * unit));
+      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
+    } else {
+      __ stp(t0, t1, Address(d, 2 * unit));
+      __ stp(t2, t3, Address(d, 4 * unit));
+      __ stp(t4, t5, Address(d, 6 * unit));
+      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
     }
 
     {
       Label L1, L2;
       __ tbz(count, exact_log2(4), L1);
-      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
-      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+      if (UseSIMDForMemoryOps) {
+        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
+        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
+      } else {
+        __ ldp(t0, t1, Address(s, 2 * unit));
+        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
+        __ stp(t0, t1, Address(d, 2 * unit));
+        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
+      }
       __ bind(L1);
 
+      if (direction == copy_forwards) {
+        __ add(s, s, bias);
+        __ add(d, d, bias);
+      }
+
       __ tbz(count, 1, L2);
       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
@@ -973,16 +928,135 @@
     int granularity = uabs(step);
     const Register t0 = r3, t1 = r4;
 
+    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter
+    // because we always load all the data before writing anything
+    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
+    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
+    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
+    const Register send = r17, dend = r18;
+
+    if (PrefetchCopyIntervalInBytes > 0)
+      __ prfm(Address(s, 0), PLDL1KEEP);
+    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
+    __ br(Assembler::HI, copy_big);
+
+    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
+    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
+
+    __ cmp(count, 16/granularity);
+    __ br(Assembler::LS, copy16);
+
+    __ cmp(count, 64/granularity);
+    __ br(Assembler::HI, copy80);
+
+    __ cmp(count, 32/granularity);
+    __ br(Assembler::LS, copy32);
+
+    // 33..64 bytes
+    if (UseSIMDForMemoryOps) {
+      __ ldpq(v0, v1, Address(s, 0));
+      __ ldpq(v2, v3, Address(send, -32));
+      __ stpq(v0, v1, Address(d, 0));
+      __ stpq(v2, v3, Address(dend, -32));
+    } else {
+      __ ldp(t0, t1, Address(s, 0));
+      __ ldp(t2, t3, Address(s, 16));
+      __ ldp(t4, t5, Address(send, -32));
+      __ ldp(t6, t7, Address(send, -16));
+
+      __ stp(t0, t1, Address(d, 0));
+      __ stp(t2, t3, Address(d, 16));
+      __ stp(t4, t5, Address(dend, -32));
+      __ stp(t6, t7, Address(dend, -16));
+    }
+    __ b(finish);
+
+    // 17..32 bytes
+    __ bind(copy32);
+    __ ldp(t0, t1, Address(s, 0));
+    __ ldp(t2, t3, Address(send, -16));
+    __ stp(t0, t1, Address(d, 0));
+    __ stp(t2, t3, Address(dend, -16));
+    __ b(finish);
+
+    // 65..80/96 bytes
+    // (96 bytes if SIMD because we do 32 bytes per instruction)
+    __ bind(copy80);
+    if (UseSIMDForMemoryOps) {
+      __ ldpq(v0, v1, Address(s, 0));
+      __ ldpq(v2, v3, Address(s, 32));
+      __ ldpq(v4, v5, Address(send, -32));
+      __ stpq(v0, v1, Address(d, 0));
+      __ stpq(v2, v3, Address(d, 32));
+      __ stpq(v4, v5, Address(dend, -32));
+    } else {
+      __ ldp(t0, t1, Address(s, 0));
+      __ ldp(t2, t3, Address(s, 16));
+      __ ldp(t4, t5, Address(s, 32));
+      __ ldp(t6, t7, Address(s, 48));
+      __ ldp(t8, t9, Address(send, -16));
+
+      __ stp(t0, t1, Address(d, 0));
+      __ stp(t2, t3, Address(d, 16));
+      __ stp(t4, t5, Address(d, 32));
+      __ stp(t6, t7, Address(d, 48));
+      __ stp(t8, t9, Address(dend, -16));
+    }
+    __ b(finish);
+
+    // 0..16 bytes
+    __ bind(copy16);
+    __ cmp(count, 8/granularity);
+    __ br(Assembler::LO, copy8);
+
+    // 8..16 bytes
+    __ ldr(t0, Address(s, 0));
+    __ ldr(t1, Address(send, -8));
+    __ str(t0, Address(d, 0));
+    __ str(t1, Address(dend, -8));
+    __ b(finish);
+
+    if (granularity < 8) {
+      // 4..7 bytes
+      __ bind(copy8);
+      __ tbz(count, 2 - exact_log2(granularity), copy4);
+      __ ldrw(t0, Address(s, 0));
+      __ ldrw(t1, Address(send, -4));
+      __ strw(t0, Address(d, 0));
+      __ strw(t1, Address(dend, -4));
+      __ b(finish);
+      if (granularity < 4) {
+        // 0..3 bytes
+        __ bind(copy4);
+        __ cbz(count, finish); // get rid of 0 case
+        if (granularity == 2) {
+          __ ldrh(t0, Address(s, 0));
+          __ strh(t0, Address(d, 0));
+        } else { // granularity == 1
+          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
+          // the first and last byte.
+          // Handle the 3 byte case by loading and storing base + count/2
+          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
+          // This does mean that in the 1 byte case we load/store the same
+          // byte 3 times.
+          __ lsr(count, count, 1);
+          __ ldrb(t0, Address(s, 0));
+          __ ldrb(t1, Address(send, -1));
+          __ ldrb(t2, Address(s, count));
+          __ strb(t0, Address(d, 0));
+          __ strb(t1, Address(dend, -1));
+          __ strb(t2, Address(d, count));
+        }
+        __ b(finish);
+      }
+    }
+
+    __ bind(copy_big);
     if (is_backwards) {
       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
     }
 
-    Label tail;
-
-    __ cmp(count, 16/granularity);
-    __ br(Assembler::LO, tail);
-
     // Now we've got the small case out of the way we can align the
     // source address on a 2-word boundary.
 
@@ -1028,8 +1102,6 @@
 #endif
     }
 
-    __ cmp(count, 16/granularity);
-    __ br(Assembler::LT, tail);
     __ bind(aligned);
 
     // s is now 2-word-aligned.
@@ -1043,9 +1115,11 @@
       __ bl(copy_b);
 
     // And the tail.
-
-    __ bind(tail);
     copy_memory_small(s, d, count, tmp, step);
+
+    if (granularity >= 8) __ bind(copy8);
+    if (granularity >= 4) __ bind(copy4);
+    __ bind(finish);
   }
 
 
--- a/src/cpu/aarch64/vm/templateInterpreterGenerator_aarch64.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/templateInterpreterGenerator_aarch64.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1984,6 +1984,7 @@
   __ push(rscratch3);
   Label L;
   __ mov(rscratch2, (address) &BytecodeCounter::_counter_value);
+  __ prfm(Address(rscratch2), PSTL1STRM);
   __ bind(L);
   __ ldxr(rscratch1, rscratch2);
   __ add(rscratch1, rscratch1, 1);
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -61,6 +61,10 @@
 #define HWCAP_CRC32 (1<<7)
 #endif
 
+#ifndef HWCAP_ATOMICS
+#define HWCAP_ATOMICS (1<<8)
+#endif
+
 int VM_Version::_cpu;
 int VM_Version::_model;
 int VM_Version::_model2;
@@ -172,6 +176,7 @@
   if (auxv & HWCAP_AES)   strcat(buf, ", aes");
   if (auxv & HWCAP_SHA1)  strcat(buf, ", sha1");
   if (auxv & HWCAP_SHA2)  strcat(buf, ", sha256");
+  if (auxv & HWCAP_ATOMICS) strcat(buf, ", lse");
 
   _features_string = os::strdup(buf);
 
@@ -191,6 +196,15 @@
     FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
   }
 
+  if (auxv & HWCAP_ATOMICS) {
+    if (FLAG_IS_DEFAULT(UseLSE))
+      FLAG_SET_DEFAULT(UseLSE, true);
+  } else {
+    if (UseLSE) {
+      warning("UseLSE specified, but not supported on this CPU");
+    }
+  }
+
   if (auxv & HWCAP_AES) {
     UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
     UseAESIntrinsics =
--- a/src/cpu/ppc/vm/globalDefinitions_ppc.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/ppc/vm/globalDefinitions_ppc.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -47,7 +47,7 @@
 // The expected size in bytes of a cache line, used to pad data structures.
 #define DEFAULT_CACHE_LINE_SIZE 128
 
-#if defined(COMPILER2) && defined(AIX)
+#if defined(COMPILER2) && (defined(AIX) || defined(linux))
 // Include Transactional Memory lock eliding optimization
 #define INCLUDE_RTM_OPT 1
 #endif
--- a/src/cpu/ppc/vm/globals_ppc.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/ppc/vm/globals_ppc.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -76,6 +76,8 @@
 
 define_pd_global(bool, CompactStrings, true);
 
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
+
 // Platform dependent flag handling: flags only defined on this platform.
 #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint)  \
                                                                             \
--- a/src/cpu/ppc/vm/ppc.ad	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/ppc/vm/ppc.ad	Thu Mar 17 17:03:20 2016 +0000
@@ -2137,8 +2137,6 @@
   return decode;
 }
 */
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
 
 // false => size gets scaled to BytesPerLong, ok.
 const bool Matcher::init_array_count_is_in_bytes = false;
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -255,7 +255,16 @@
     }
 #endif
 #ifdef linux
-    // TODO: check kernel version (we currently have too old versions only)
+    // Require at least Linux kernel 4.2, in which the problematic behavior of
+    // syscalls being called in the middle of a transaction was addressed.
+    // Please refer to commit b4b56f9ecab40f3b4ef53e130c9f6663be491894
+    // in the Linux kernel source tree: https://goo.gl/Kc5i7A
+    if (os::Linux::os_version_is_known()) {
+      if (os::Linux::os_version() >= 0x040200)
+        os_too_old = false;
+    } else {
+      vm_exit_during_initialization("RTM can not be enabled: kernel version is unknown.");
+    }
 #endif
     if (os_too_old) {
       vm_exit_during_initialization("RTM is not supported on this OS version.");
--- a/src/cpu/sparc/vm/globals_sparc.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/sparc/vm/globals_sparc.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -90,6 +90,8 @@
 
 define_pd_global(bool, CompactStrings, true);
 
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
+
 #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                             \
   product(intx, UseVIS, 99,                                                 \
--- a/src/cpu/sparc/vm/sparc.ad	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/sparc/vm/sparc.ad	Thu Mar 17 17:03:20 2016 +0000
@@ -948,28 +948,28 @@
   }
 #endif
 
-  uint instr;
-  instr = (Assembler::ldst_op << 30)
-        | (dst_enc        << 25)
-        | (primary        << 19)
-        | (src1_enc       << 14);
+  uint instr = (Assembler::ldst_op << 30)
+             | (dst_enc        << 25)
+             | (primary        << 19)
+             | (src1_enc       << 14);
 
   uint index = src2_enc;
   int disp = disp32;
 
   if (src1_enc == R_SP_enc || src1_enc == R_FP_enc) {
     disp += STACK_BIAS;
-    // Quick fix for JDK-8029668: check that stack offset fits, bailout if not
+    // Check that stack offset fits, load into O7 if not
     if (!Assembler::is_simm13(disp)) {
-      ra->C->record_method_not_compilable("unable to handle large constant offsets");
-      return;
+      MacroAssembler _masm(&cbuf);
+      __ set(disp, O7);
+      if (index != R_G0_enc) {
+        __ add(O7, reg_to_register_object(index), O7);
+      }
+      index = R_O7_enc;
+      disp = 0;
     }
   }
 
-  // We should have a compiler bailout here rather than a guarantee.
-  // Better yet would be some mechanism to handle variable-size matches correctly.
-  guarantee(Assembler::is_simm13(disp), "Do not match large constant offsets" );
-
   if( disp == 0 ) {
     // use reg-reg form
     // bit 13 is already zero
@@ -983,7 +983,7 @@
   cbuf.insts()->emit_int32(instr);
 
 #ifdef ASSERT
-  {
+  if (VerifyOops) {
     MacroAssembler _masm(&cbuf);
     if (is_verified_oop_base) {
       __ verify_oop(reg_to_register_object(src1_enc));
@@ -1342,7 +1342,7 @@
 // Figure out which register class each belongs in: rc_int, rc_float, rc_stack
 enum RC { rc_bad, rc_int, rc_float, rc_stack };
 static enum RC rc_class( OptoReg::Name reg ) {
-  if( !OptoReg::is_valid(reg)  ) return rc_bad;
+  if (!OptoReg::is_valid(reg)) return rc_bad;
   if (OptoReg::is_stack(reg)) return rc_stack;
   VMReg r = OptoReg::as_VMReg(reg);
   if (r->is_Register()) return rc_int;
@@ -1350,66 +1350,79 @@
   return rc_float;
 }
 
-static int impl_helper(const MachNode* mach, CodeBuffer* cbuf, PhaseRegAlloc* ra, bool do_size, bool is_load, int offset, int reg, int opcode, const char *op_str, int size, outputStream* st ) {
+#ifndef PRODUCT
+ATTRIBUTE_PRINTF(2, 3)
+static void print_helper(outputStream* st, const char* format, ...) {
+  if (st->position() > 0) {
+    st->cr();
+    st->sp();
+  }
+  va_list ap;
+  va_start(ap, format);
+  st->vprint(format, ap);
+  va_end(ap);
+}
+#endif // !PRODUCT
+
+static void impl_helper(const MachNode* mach, CodeBuffer* cbuf, PhaseRegAlloc* ra, bool is_load, int offset, int reg, int opcode, const char *op_str, outputStream* st) {
   if (cbuf) {
     emit_form3_mem_reg(*cbuf, ra, mach, opcode, -1, R_SP_enc, offset, 0, Matcher::_regEncode[reg]);
   }
 #ifndef PRODUCT
-  else if (!do_size) {
-    if (size != 0) st->print("\n\t");
-    if (is_load) st->print("%s   [R_SP + #%d],R_%s\t! spill",op_str,offset,OptoReg::regname(reg));
-    else         st->print("%s   R_%s,[R_SP + #%d]\t! spill",op_str,OptoReg::regname(reg),offset);
+  else {
+    if (is_load) {
+      print_helper(st, "%s   [R_SP + #%d],R_%s\t! spill", op_str, offset, OptoReg::regname(reg));
+    } else {
+      print_helper(st, "%s   R_%s,[R_SP + #%d]\t! spill", op_str, OptoReg::regname(reg), offset);
+    }
   }
 #endif
-  return size+4;
 }
 
-static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int op1, int op2, const char *op_str, int size, outputStream* st ) {
-  if( cbuf ) emit3( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst], op1, 0, op2, Matcher::_regEncode[src] );
+static void impl_mov_helper(CodeBuffer *cbuf, int src, int dst, int op1, int op2, const char *op_str, outputStream* st) {
+  if (cbuf) {
+    emit3(*cbuf, Assembler::arith_op, Matcher::_regEncode[dst], op1, 0, op2, Matcher::_regEncode[src]);
+  }
 #ifndef PRODUCT
-  else if( !do_size ) {
-    if( size != 0 ) st->print("\n\t");
-    st->print("%s  R_%s,R_%s\t! spill",op_str,OptoReg::regname(src),OptoReg::regname(dst));
+  else {
+    print_helper(st, "%s  R_%s,R_%s\t! spill", op_str, OptoReg::regname(src), OptoReg::regname(dst));
   }
 #endif
-  return size+4;
 }
 
-uint MachSpillCopyNode::implementation( CodeBuffer *cbuf,
-                                        PhaseRegAlloc *ra_,
-                                        bool do_size,
-                                        outputStream* st ) const {
+static void mach_spill_copy_implementation_helper(const MachNode* mach,
+                                                  CodeBuffer *cbuf,
+                                                  PhaseRegAlloc *ra_,
+                                                  outputStream* st) {
   // Get registers to move
-  OptoReg::Name src_second = ra_->get_reg_second(in(1));
-  OptoReg::Name src_first = ra_->get_reg_first(in(1));
-  OptoReg::Name dst_second = ra_->get_reg_second(this );
-  OptoReg::Name dst_first = ra_->get_reg_first(this );
+  OptoReg::Name src_second = ra_->get_reg_second(mach->in(1));
+  OptoReg::Name src_first  = ra_->get_reg_first(mach->in(1));
+  OptoReg::Name dst_second = ra_->get_reg_second(mach);
+  OptoReg::Name dst_first  = ra_->get_reg_first(mach);
 
   enum RC src_second_rc = rc_class(src_second);
-  enum RC src_first_rc = rc_class(src_first);
+  enum RC src_first_rc  = rc_class(src_first);
   enum RC dst_second_rc = rc_class(dst_second);
-  enum RC dst_first_rc = rc_class(dst_first);
-
-  assert( OptoReg::is_valid(src_first) && OptoReg::is_valid(dst_first), "must move at least 1 register" );
-
-  // Generate spill code!
-  int size = 0;
-
-  if( src_first == dst_first && src_second == dst_second )
-    return size;            // Self copy, no move
+  enum RC dst_first_rc  = rc_class(dst_first);
+
+  assert(OptoReg::is_valid(src_first) && OptoReg::is_valid(dst_first), "must move at least 1 register");
+
+  if (src_first == dst_first && src_second == dst_second) {
+    return; // Self copy, no move
+  }
 
   // --------------------------------------
   // Check for mem-mem move.  Load into unused float registers and fall into
   // the float-store case.
-  if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
+  if (src_first_rc == rc_stack && dst_first_rc == rc_stack) {
     int offset = ra_->reg2offset(src_first);
     // Further check for aligned-adjacent pair, so we can use a double load
-    if( (src_first&1)==0 && src_first+1 == src_second ) {
+    if ((src_first&1) == 0 && src_first+1 == src_second) {
       src_second    = OptoReg::Name(R_F31_num);
       src_second_rc = rc_float;
-      size = impl_helper(this,cbuf,ra_,do_size,true,offset,R_F30_num,Assembler::lddf_op3,"LDDF",size, st);
+      impl_helper(mach, cbuf, ra_, true, offset, R_F30_num, Assembler::lddf_op3, "LDDF", st);
     } else {
-      size = impl_helper(this,cbuf,ra_,do_size,true,offset,R_F30_num,Assembler::ldf_op3 ,"LDF ",size, st);
+      impl_helper(mach, cbuf, ra_, true, offset, R_F30_num, Assembler::ldf_op3, "LDF ", st);
     }
     src_first    = OptoReg::Name(R_F30_num);
     src_first_rc = rc_float;
@@ -1417,7 +1430,7 @@
 
   if( src_second_rc == rc_stack && dst_second_rc == rc_stack ) {
     int offset = ra_->reg2offset(src_second);
-    size = impl_helper(this,cbuf,ra_,do_size,true,offset,R_F31_num,Assembler::ldf_op3,"LDF ",size, st);
+    impl_helper(mach, cbuf, ra_, true, offset, R_F31_num, Assembler::ldf_op3, "LDF ", st);
     src_second    = OptoReg::Name(R_F31_num);
     src_second_rc = rc_float;
   }
@@ -1427,36 +1440,38 @@
   if (src_first_rc == rc_float && dst_first_rc == rc_int && UseVIS < 3) {
     int offset = frame::register_save_words*wordSize;
     if (cbuf) {
-      emit3_simm13( *cbuf, Assembler::arith_op, R_SP_enc, Assembler::sub_op3, R_SP_enc, 16 );
-      impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stf_op3 ,"STF ",size, st);
-      impl_helper(this,cbuf,ra_,do_size,true ,offset,dst_first,Assembler::lduw_op3,"LDUW",size, st);
-      emit3_simm13( *cbuf, Assembler::arith_op, R_SP_enc, Assembler::add_op3, R_SP_enc, 16 );
+      emit3_simm13(*cbuf, Assembler::arith_op, R_SP_enc, Assembler::sub_op3, R_SP_enc, 16);
+      impl_helper(mach, cbuf, ra_, false, offset, src_first,  Assembler::stf_op3, "STF ", st);
+      impl_helper(mach, cbuf, ra_,  true, offset, dst_first, Assembler::lduw_op3, "LDUW", st);
+      emit3_simm13(*cbuf, Assembler::arith_op, R_SP_enc, Assembler::add_op3, R_SP_enc, 16);
     }
 #ifndef PRODUCT
-    else if (!do_size) {
-      if (size != 0) st->print("\n\t");
-      st->print(  "SUB    R_SP,16,R_SP\n");
-      impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stf_op3 ,"STF ",size, st);
-      impl_helper(this,cbuf,ra_,do_size,true ,offset,dst_first,Assembler::lduw_op3,"LDUW",size, st);
-      st->print("\tADD    R_SP,16,R_SP\n");
+    else {
+      print_helper(st, "SUB    R_SP,16,R_SP");
+      impl_helper(mach, cbuf, ra_, false, offset, src_first,  Assembler::stf_op3, "STF ", st);
+      impl_helper(mach, cbuf, ra_,  true, offset, dst_first, Assembler::lduw_op3, "LDUW", st);
+      print_helper(st, "ADD    R_SP,16,R_SP");
     }
 #endif
-    size += 16;
   }
 
   // Check for float->int copy on T4
   if (src_first_rc == rc_float && dst_first_rc == rc_int && UseVIS >= 3) {
     // Further check for aligned-adjacent pair, so we can use a double move
-    if ((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second)
-      return impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::mftoi_op3,Assembler::mdtox_opf,"MOVDTOX",size, st);
-    size  =  impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::mftoi_op3,Assembler::mstouw_opf,"MOVSTOUW",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_mov_helper(cbuf, src_first, dst_first, Assembler::mftoi_op3, Assembler::mdtox_opf, "MOVDTOX", st);
+      return;
+    }
+    impl_mov_helper(cbuf, src_first, dst_first, Assembler::mftoi_op3, Assembler::mstouw_opf, "MOVSTOUW", st);
   }
   // Check for int->float copy on T4
   if (src_first_rc == rc_int && dst_first_rc == rc_float && UseVIS >= 3) {
     // Further check for aligned-adjacent pair, so we can use a double move
-    if ((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second)
-      return impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::mftoi_op3,Assembler::mxtod_opf,"MOVXTOD",size, st);
-    size  =  impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::mftoi_op3,Assembler::mwtos_opf,"MOVWTOS",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_mov_helper(cbuf, src_first, dst_first, Assembler::mftoi_op3, Assembler::mxtod_opf, "MOVXTOD", st);
+      return;
+    }
+    impl_mov_helper(cbuf, src_first, dst_first, Assembler::mftoi_op3, Assembler::mwtos_opf, "MOVWTOS", st);
   }
 
   // --------------------------------------
@@ -1466,10 +1481,10 @@
   // there.  Misaligned sources only come from native-long-returns (handled
   // special below).
 #ifndef _LP64
-  if( src_first_rc == rc_int &&     // source is already big-endian
+  if (src_first_rc == rc_int &&     // source is already big-endian
       src_second_rc != rc_bad &&    // 64-bit move
-      ((dst_first&1)!=0 || dst_second != dst_first+1) ) { // misaligned dst
-    assert( (src_first&1)==0 && src_second == src_first+1, "source must be aligned" );
+      ((dst_first & 1) != 0 || dst_second != dst_first + 1)) { // misaligned dst
+    assert((src_first & 1) == 0 && src_second == src_first + 1, "source must be aligned");
     // Do the big-endian flop.
     OptoReg::Name tmp    = dst_first   ; dst_first    = dst_second   ; dst_second    = tmp   ;
     enum RC       tmp_rc = dst_first_rc; dst_first_rc = dst_second_rc; dst_second_rc = tmp_rc;
@@ -1478,30 +1493,28 @@
 
   // --------------------------------------
   // Check for integer reg-reg copy
-  if( src_first_rc == rc_int && dst_first_rc == rc_int ) {
+  if (src_first_rc == rc_int && dst_first_rc == rc_int) {
 #ifndef _LP64
-    if( src_first == R_O0_num && src_second == R_O1_num ) {  // Check for the evil O0/O1 native long-return case
+    if (src_first == R_O0_num && src_second == R_O1_num) {  // Check for the evil O0/O1 native long-return case
       // Note: The _first and _second suffixes refer to the addresses of the the 2 halves of the 64-bit value
       //       as stored in memory.  On a big-endian machine like SPARC, this means that the _second
       //       operand contains the least significant word of the 64-bit value and vice versa.
       OptoReg::Name tmp = OptoReg::Name(R_O7_num);
-      assert( (dst_first&1)==0 && dst_second == dst_first+1, "return a native O0/O1 long to an aligned-adjacent 64-bit reg" );
+      assert((dst_first & 1) == 0 && dst_second == dst_first + 1, "return a native O0/O1 long to an aligned-adjacent 64-bit reg" );
       // Shift O0 left in-place, zero-extend O1, then OR them into the dst
-      if( cbuf ) {
-        emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[tmp], Assembler::sllx_op3, Matcher::_regEncode[src_first], 0x1020 );
-        emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[src_second], Assembler::srl_op3, Matcher::_regEncode[src_second], 0x0000 );
-        emit3       ( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst_first], Assembler:: or_op3, Matcher::_regEncode[tmp], 0, Matcher::_regEncode[src_second] );
+      if ( cbuf ) {
+        emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[tmp], Assembler::sllx_op3, Matcher::_regEncode[src_first], 0x1020);
+        emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[src_second], Assembler::srl_op3, Matcher::_regEncode[src_second], 0x0000);
+        emit3       (*cbuf, Assembler::arith_op, Matcher::_regEncode[dst_first], Assembler:: or_op3, Matcher::_regEncode[tmp], 0, Matcher::_regEncode[src_second]);
 #ifndef PRODUCT
-      } else if( !do_size ) {
-        if( size != 0 ) st->print("\n\t");
-        st->print("SLLX   R_%s,32,R_%s\t! Move O0-first to O7-high\n\t", OptoReg::regname(src_first), OptoReg::regname(tmp));
-        st->print("SRL    R_%s, 0,R_%s\t! Zero-extend O1\n\t", OptoReg::regname(src_second), OptoReg::regname(src_second));
-        st->print("OR     R_%s,R_%s,R_%s\t! spill",OptoReg::regname(tmp), OptoReg::regname(src_second), OptoReg::regname(dst_first));
+      } else {
+        print_helper(st, "SLLX   R_%s,32,R_%s\t! Move O0-first to O7-high\n\t", OptoReg::regname(src_first), OptoReg::regname(tmp));
+        print_helper(st, "SRL    R_%s, 0,R_%s\t! Zero-extend O1\n\t", OptoReg::regname(src_second), OptoReg::regname(src_second));
+        print_helper(st, "OR     R_%s,R_%s,R_%s\t! spill",OptoReg::regname(tmp), OptoReg::regname(src_second), OptoReg::regname(dst_first));
 #endif
       }
-      return size+12;
-    }
-    else if( dst_first == R_I0_num && dst_second == R_I1_num ) {
+      return;
+    } else if (dst_first == R_I0_num && dst_second == R_I1_num) {
       // returning a long value in I0/I1
       // a SpillCopy must be able to target a return instruction's reg_class
       // Note: The _first and _second suffixes refer to the addresses of the the 2 halves of the 64-bit value
@@ -1511,27 +1524,25 @@
 
       if (src_first == dst_first) {
         tdest = OptoReg::Name(R_O7_num);
-        size += 4;
       }
 
-      if( cbuf ) {
-        assert( (src_first&1) == 0 && (src_first+1) == src_second, "return value was in an aligned-adjacent 64-bit reg");
+      if (cbuf) {
+        assert((src_first & 1) == 0 && (src_first + 1) == src_second, "return value was in an aligned-adjacent 64-bit reg");
         // Shift value in upper 32-bits of src to lower 32-bits of I0; move lower 32-bits to I1
         // ShrL_reg_imm6
-        emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[tdest], Assembler::srlx_op3, Matcher::_regEncode[src_second], 32 | 0x1000 );
+        emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[tdest], Assembler::srlx_op3, Matcher::_regEncode[src_second], 32 | 0x1000);
         // ShrR_reg_imm6  src, 0, dst
-        emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst_second], Assembler::srl_op3, Matcher::_regEncode[src_first], 0x0000 );
+        emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[dst_second], Assembler::srl_op3, Matcher::_regEncode[src_first], 0x0000);
         if (tdest != dst_first) {
-          emit3     ( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst_first], Assembler::or_op3, 0/*G0*/, 0/*op2*/, Matcher::_regEncode[tdest] );
+          emit3     (*cbuf, Assembler::arith_op, Matcher::_regEncode[dst_first], Assembler::or_op3, 0/*G0*/, 0/*op2*/, Matcher::_regEncode[tdest]);
         }
       }
 #ifndef PRODUCT
-      else if( !do_size ) {
-        if( size != 0 ) st->print("\n\t");  // %%%%% !!!!!
-        st->print("SRLX   R_%s,32,R_%s\t! Extract MSW\n\t",OptoReg::regname(src_second),OptoReg::regname(tdest));
-        st->print("SRL    R_%s, 0,R_%s\t! Extract LSW\n\t",OptoReg::regname(src_first),OptoReg::regname(dst_second));
+      else {
+        print_helper(st, "SRLX   R_%s,32,R_%s\t! Extract MSW\n\t",OptoReg::regname(src_second),OptoReg::regname(tdest));
+        print_helper(st, "SRL    R_%s, 0,R_%s\t! Extract LSW\n\t",OptoReg::regname(src_first),OptoReg::regname(dst_second));
         if (tdest != dst_first) {
-          st->print("MOV    R_%s,R_%s\t! spill\n\t", OptoReg::regname(tdest), OptoReg::regname(dst_first));
+          print_helper(st, "MOV    R_%s,R_%s\t! spill\n\t", OptoReg::regname(tdest), OptoReg::regname(dst_first));
         }
       }
 #endif // PRODUCT
@@ -1539,65 +1550,77 @@
     }
 #endif // !_LP64
     // Else normal reg-reg copy
-    assert( src_second != dst_first, "smashed second before evacuating it" );
-    size = impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::or_op3,0,"MOV  ",size, st);
-    assert( (src_first&1) == 0 && (dst_first&1) == 0, "never move second-halves of int registers" );
+    assert(src_second != dst_first, "smashed second before evacuating it");
+    impl_mov_helper(cbuf, src_first, dst_first, Assembler::or_op3, 0, "MOV  ", st);
+    assert((src_first & 1) == 0 && (dst_first & 1) == 0, "never move second-halves of int registers");
     // This moves an aligned adjacent pair.
     // See if we are done.
-    if( src_first+1 == src_second && dst_first+1 == dst_second )
-      return size;
+    if (src_first + 1 == src_second && dst_first + 1 == dst_second) {
+      return;
+    }
   }
 
   // Check for integer store
-  if( src_first_rc == rc_int && dst_first_rc == rc_stack ) {
+  if (src_first_rc == rc_int && dst_first_rc == rc_stack) {
     int offset = ra_->reg2offset(dst_first);
     // Further check for aligned-adjacent pair, so we can use a double store
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stx_op3,"STX ",size, st);
-    size  =  impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stw_op3,"STW ",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_helper(mach, cbuf, ra_, false, offset, src_first, Assembler::stx_op3, "STX ", st);
+      return;
+    }
+    impl_helper(mach, cbuf, ra_, false, offset, src_first, Assembler::stw_op3, "STW ", st);
   }
 
   // Check for integer load
-  if( dst_first_rc == rc_int && src_first_rc == rc_stack ) {
+  if (dst_first_rc == rc_int && src_first_rc == rc_stack) {
     int offset = ra_->reg2offset(src_first);
     // Further check for aligned-adjacent pair, so we can use a double load
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_helper(this,cbuf,ra_,do_size,true,offset,dst_first,Assembler::ldx_op3 ,"LDX ",size, st);
-    size  =  impl_helper(this,cbuf,ra_,do_size,true,offset,dst_first,Assembler::lduw_op3,"LDUW",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_helper(mach, cbuf, ra_, true, offset, dst_first, Assembler::ldx_op3, "LDX ", st);
+      return;
+    }
+    impl_helper(mach, cbuf, ra_, true, offset, dst_first, Assembler::lduw_op3, "LDUW", st);
   }
 
   // Check for float reg-reg copy
-  if( src_first_rc == rc_float && dst_first_rc == rc_float ) {
+  if (src_first_rc == rc_float && dst_first_rc == rc_float) {
     // Further check for aligned-adjacent pair, so we can use a double move
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::fpop1_op3,Assembler::fmovd_opf,"FMOVD",size, st);
-    size  =  impl_mov_helper(cbuf,do_size,src_first,dst_first,Assembler::fpop1_op3,Assembler::fmovs_opf,"FMOVS",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_mov_helper(cbuf, src_first, dst_first, Assembler::fpop1_op3, Assembler::fmovd_opf, "FMOVD", st);
+      return;
+    }
+    impl_mov_helper(cbuf, src_first, dst_first, Assembler::fpop1_op3, Assembler::fmovs_opf, "FMOVS", st);
   }
 
   // Check for float store
-  if( src_first_rc == rc_float && dst_first_rc == rc_stack ) {
+  if (src_first_rc == rc_float && dst_first_rc == rc_stack) {
     int offset = ra_->reg2offset(dst_first);
     // Further check for aligned-adjacent pair, so we can use a double store
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stdf_op3,"STDF",size, st);
-    size  =  impl_helper(this,cbuf,ra_,do_size,false,offset,src_first,Assembler::stf_op3 ,"STF ",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_helper(mach, cbuf, ra_, false, offset, src_first, Assembler::stdf_op3, "STDF", st);
+      return;
+    }
+    impl_helper(mach, cbuf, ra_, false, offset, src_first, Assembler::stf_op3, "STF ", st);
   }
 
   // Check for float load
-  if( dst_first_rc == rc_float && src_first_rc == rc_stack ) {
+  if (dst_first_rc == rc_float && src_first_rc == rc_stack) {
     int offset = ra_->reg2offset(src_first);
     // Further check for aligned-adjacent pair, so we can use a double load
-    if( (src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second )
-      return impl_helper(this,cbuf,ra_,do_size,true,offset,dst_first,Assembler::lddf_op3,"LDDF",size, st);
-    size  =  impl_helper(this,cbuf,ra_,do_size,true,offset,dst_first,Assembler::ldf_op3 ,"LDF ",size, st);
+    if ((src_first & 1) == 0 && src_first + 1 == src_second && (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
+      impl_helper(mach, cbuf, ra_, true, offset, dst_first, Assembler::lddf_op3, "LDDF", st);
+      return;
+    }
+    impl_helper(mach, cbuf, ra_, true, offset, dst_first, Assembler::ldf_op3, "LDF ", st);
   }
 
   // --------------------------------------------------------------------
   // Check for hi bits still needing moving.  Only happens for misaligned
   // arguments to native calls.
-  if( src_second == dst_second )
-    return size;               // Self copy; no move
-  assert( src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad" );
+  if (src_second == dst_second) {
+    return; // Self copy; no move
+  }
+  assert(src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad");
 
 #ifndef _LP64
   // In the LP64 build, all registers can be moved as aligned/adjacent
@@ -1609,52 +1632,57 @@
   // 32-bits of a 64-bit register, but are needed in low bits of another
   // register (else it's a hi-bits-to-hi-bits copy which should have
   // happened already as part of a 64-bit move)
-  if( src_second_rc == rc_int && dst_second_rc == rc_int ) {
-    assert( (src_second&1)==1, "its the evil O0/O1 native return case" );
-    assert( (dst_second&1)==0, "should have moved with 1 64-bit move" );
+  if (src_second_rc == rc_int && dst_second_rc == rc_int) {
+    assert((src_second & 1) == 1, "its the evil O0/O1 native return case");
+    assert((dst_second & 1) == 0, "should have moved with 1 64-bit move");
     // Shift src_second down to dst_second's low bits.
-    if( cbuf ) {
-      emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[dst_second], Assembler::srlx_op3, Matcher::_regEncode[src_second-1], 0x1020 );
+    if (cbuf) {
+      emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[dst_second], Assembler::srlx_op3, Matcher::_regEncode[src_second-1], 0x1020);
 #ifndef PRODUCT
-    } else if( !do_size ) {
-      if( size != 0 ) st->print("\n\t");
-      st->print("SRLX   R_%s,32,R_%s\t! spill: Move high bits down low",OptoReg::regname(src_second-1),OptoReg::regname(dst_second));
+    } else  {
+      print_helper(st, "SRLX   R_%s,32,R_%s\t! spill: Move high bits down low", OptoReg::regname(src_second - 1), OptoReg::regname(dst_second));
 #endif
     }
-    return size+4;
+    return;
   }
 
   // Check for high word integer store.  Must down-shift the hi bits
   // into a temp register, then fall into the case of storing int bits.
-  if( src_second_rc == rc_int && dst_second_rc == rc_stack && (src_second&1)==1 ) {
+  if (src_second_rc == rc_int && dst_second_rc == rc_stack && (src_second & 1) == 1) {
     // Shift src_second down to dst_second's low bits.
-    if( cbuf ) {
-      emit3_simm13( *cbuf, Assembler::arith_op, Matcher::_regEncode[R_O7_num], Assembler::srlx_op3, Matcher::_regEncode[src_second-1], 0x1020 );
+    if (cbuf) {
+      emit3_simm13(*cbuf, Assembler::arith_op, Matcher::_regEncode[R_O7_num], Assembler::srlx_op3, Matcher::_regEncode[src_second-1], 0x1020);
 #ifndef PRODUCT
-    } else if( !do_size ) {
-      if( size != 0 ) st->print("\n\t");
-      st->print("SRLX   R_%s,32,R_%s\t! spill: Move high bits down low",OptoReg::regname(src_second-1),OptoReg::regname(R_O7_num));
+    } else {
+      print_helper(st, "SRLX   R_%s,32,R_%s\t! spill: Move high bits down low", OptoReg::regname(src_second-1), OptoReg::regname(R_O7_num));
 #endif
     }
-    size+=4;
     src_second = OptoReg::Name(R_O7_num); // Not R_O7H_num!
   }
 
   // Check for high word integer load
-  if( dst_second_rc == rc_int && src_second_rc == rc_stack )
-    return impl_helper(this,cbuf,ra_,do_size,true ,ra_->reg2offset(src_second),dst_second,Assembler::lduw_op3,"LDUW",size, st);
+  if (dst_second_rc == rc_int && src_second_rc == rc_stack) {  // 'this' and 'size' do not exist in this helper; call it like the cases above
+    impl_helper(mach, cbuf, ra_, true, ra_->reg2offset(src_second), dst_second, Assembler::lduw_op3, "LDUW", st); return; }
 
   // Check for high word integer store
-  if( src_second_rc == rc_int && dst_second_rc == rc_stack )
-    return impl_helper(this,cbuf,ra_,do_size,false,ra_->reg2offset(dst_second),src_second,Assembler::stw_op3 ,"STW ",size, st);
+  if (src_second_rc == rc_int && dst_second_rc == rc_stack) {  // src_second may have been redirected to R_O7 by the shift above
+    impl_helper(mach, cbuf, ra_, false, ra_->reg2offset(dst_second), src_second, Assembler::stw_op3, "STW ", st); return; }
 
   // Check for high word float store
-  if( src_second_rc == rc_float && dst_second_rc == rc_stack )
-    return impl_helper(this,cbuf,ra_,do_size,false,ra_->reg2offset(dst_second),src_second,Assembler::stf_op3 ,"STF ",size, st);
+  if (src_second_rc == rc_float && dst_second_rc == rc_stack) {  // anything not handled here falls through to Unimplemented()
+    impl_helper(mach, cbuf, ra_, false, ra_->reg2offset(dst_second), src_second, Assembler::stf_op3, "STF ", st); return; }
 
 #endif // !_LP64
 
   Unimplemented();
+}
+
+uint MachSpillCopyNode::implementation(CodeBuffer *cbuf,  // runs the spill-copy helper; the result is always 0
+                                       PhaseRegAlloc *ra_,
+                                       bool do_size,  // must be false, see the assert below
+                                       outputStream* st) const {
+  assert(!do_size, "not supported");  // size-only requests are rejected here
+  mach_spill_copy_implementation_helper(this, cbuf, ra_, st);  // the helper emits when cbuf is set, otherwise prints to st
   return 0;
 }
 
@@ -1669,19 +1697,19 @@
 }
 
 uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
-  return implementation( NULL, ra_, true, NULL );
+  return MachNode::size(ra_);  // forwards to MachNode's size(); implementation() no longer accepts do_size
 }
 
 //=============================================================================
 #ifndef PRODUCT
-void MachNopNode::format( PhaseRegAlloc *, outputStream *st ) const {
+void MachNopNode::format(PhaseRegAlloc *, outputStream *st) const {  // prints the pad length as 4 bytes per counted NOP
   st->print("NOP \t# %d bytes pad for loops and calls", 4 * _count);
 }
 #endif
 
-void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc * ) const {
+void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *) const {  // calls nop() _count times; the register allocator argument is unused
   MacroAssembler _masm(&cbuf);
-  for(int i = 0; i < _count; i += 1) {
+  for (int i = 0; i < _count; i += 1) {  // one nop() per iteration
     __ nop();
   }
 }
@@ -1952,9 +1980,6 @@
 // No scaling for the parameter the ClearArray node.
 const bool Matcher::init_array_count_is_in_bytes = true;
 
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
-
 // No additional cost for CMOVL.
 const int Matcher::long_cmove_cost() { return 0; }
 
@@ -5197,7 +5222,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "LDF    $src,$dst\t! stkI to regF" %}
   opcode(Assembler::ldf_op3);
   ins_encode(simple_form3_mem_reg(src, dst));
@@ -5208,7 +5232,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "LDDF   $src,$dst\t! stkL to regD" %}
   opcode(Assembler::lddf_op3);
   ins_encode(simple_form3_mem_reg(src, dst));
@@ -5219,7 +5242,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STF    $src,$dst\t! regF to stkI" %}
   opcode(Assembler::stf_op3);
   ins_encode(simple_form3_mem_reg(dst, src));
@@ -5230,7 +5252,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STDF   $src,$dst\t! regD to stkL" %}
   opcode(Assembler::stdf_op3);
   ins_encode(simple_form3_mem_reg(dst, src));
@@ -5240,7 +5261,6 @@
 instruct regI_to_stkLHi(stackSlotL dst, iRegI src) %{
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST*2);
-  size(8);
   format %{ "STW    $src,$dst.hi\t! long\n\t"
             "STW    R_G0,$dst.lo" %}
   opcode(Assembler::stw_op3);
@@ -5252,7 +5272,6 @@
   // No match rule to avoid chain rule match.
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STX    $src,$dst\t! regL to stkD" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -5266,7 +5285,6 @@
   match(Set dst src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDUW   $src,$dst\t!stk" %}
   opcode(Assembler::lduw_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -5278,7 +5296,6 @@
   match(Set dst src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$dst\t!stk" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -5290,7 +5307,6 @@
   match(Set dst src);
 
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "LDX    $src,$dst\t! long" %}
   opcode(Assembler::ldx_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -5302,7 +5318,6 @@
   match(Set dst src);
 
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STX    $src,$dst\t! long" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -5314,7 +5329,6 @@
 instruct stkP_to_regP( iRegP dst, stackSlotP src ) %{
   match(Set dst src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "LDX    $src,$dst\t!ptr" %}
   opcode(Assembler::ldx_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -5325,7 +5339,6 @@
 instruct regP_to_stkP(stackSlotP dst, iRegP src) %{
   match(Set dst src);
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STX    $src,$dst\t!ptr" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -5771,7 +5784,6 @@
   match(Set dst (LoadL_unaligned mem));
   effect(KILL tmp);
   ins_cost(MEMORY_REF_COST*2+DEFAULT_COST);
-  size(16);
   format %{ "LDUW   $mem+4,R_O7\t! misaligned long\n"
           "\tLDUW   $mem  ,$dst\n"
           "\tSLLX   #32, $dst, $dst\n"
@@ -5786,7 +5798,6 @@
   match(Set dst (LoadRange mem));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDUW   $mem,$dst\t! range" %}
   opcode(Assembler::lduw_op3);
   ins_encode(simple_form3_mem_reg( mem, dst ) );
@@ -5797,7 +5808,6 @@
 instruct loadI_freg(regF dst, memory mem) %{
   match(Set dst (LoadI mem));
   ins_cost(MEMORY_REF_COST);
-  size(4);
 
   format %{ "LDF    $mem,$dst\t! for fitos/fitod" %}
   opcode(Assembler::ldf_op3);
@@ -5876,7 +5886,6 @@
   match(Set dst (LoadD mem));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDDF   $mem,$dst" %}
   opcode(Assembler::lddf_op3);
   ins_encode(simple_form3_mem_reg( mem, dst ) );
@@ -5887,7 +5896,6 @@
 instruct loadD_unaligned(regD_low dst, memory mem ) %{
   match(Set dst (LoadD_unaligned mem));
   ins_cost(MEMORY_REF_COST*2+DEFAULT_COST);
-  size(8);
   format %{ "LDF    $mem  ,$dst.hi\t! misaligned double\n"
           "\tLDF    $mem+4,$dst.lo\t!" %}
   opcode(Assembler::ldf_op3);
@@ -5900,7 +5908,6 @@
   match(Set dst (LoadF mem));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDF    $mem,$dst" %}
   opcode(Assembler::ldf_op3);
   ins_encode(simple_form3_mem_reg( mem, dst ) );
@@ -6119,7 +6126,6 @@
   predicate(AllocatePrefetchInstr == 0);
   match( PrefetchAllocation mem );
   ins_cost(MEMORY_REF_COST);
-  size(4);
 
   format %{ "PREFETCH $mem,2\t! Prefetch allocation" %}
   opcode(Assembler::prefetch_op3);
@@ -6175,7 +6181,6 @@
   match(Set mem (StoreB mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STB    $src,$mem\t! byte" %}
   opcode(Assembler::stb_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6186,7 +6191,6 @@
   match(Set mem (StoreB mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STB    $src,$mem\t! byte" %}
   opcode(Assembler::stb_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6197,7 +6201,6 @@
   match(Set mem (StoreCM mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STB    $src,$mem\t! CMS card-mark byte 0" %}
   opcode(Assembler::stb_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6209,7 +6212,6 @@
   match(Set mem (StoreC mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STH    $src,$mem\t! short" %}
   opcode(Assembler::sth_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6220,7 +6222,6 @@
   match(Set mem (StoreC mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STH    $src,$mem\t! short" %}
   opcode(Assembler::sth_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6232,7 +6233,6 @@
   match(Set mem (StoreI mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$mem" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6243,7 +6243,6 @@
 instruct storeL(memory mem, iRegL src) %{
   match(Set mem (StoreL mem src));
   ins_cost(MEMORY_REF_COST);
-  size(4);
   format %{ "STX    $src,$mem\t! long" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6254,7 +6253,6 @@
   match(Set mem (StoreI mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$mem" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6265,7 +6263,6 @@
   match(Set mem (StoreL mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STX    $src,$mem" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6277,7 +6274,6 @@
   match(Set mem (StoreI mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STF    $src,$mem\t! after fstoi/fdtoi" %}
   opcode(Assembler::stf_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6288,7 +6284,6 @@
 instruct storeP(memory dst, sp_ptr_RegP src) %{
   match(Set dst (StoreP dst src));
   ins_cost(MEMORY_REF_COST);
-  size(4);
 
 #ifndef _LP64
   format %{ "STW    $src,$dst\t! ptr" %}
@@ -6304,7 +6299,6 @@
 instruct storeP0(memory dst, immP0 src) %{
   match(Set dst (StoreP dst src));
   ins_cost(MEMORY_REF_COST);
-  size(4);
 
 #ifndef _LP64
   format %{ "STW    $src,$dst\t! ptr" %}
@@ -6379,7 +6373,6 @@
   match(Set mem (StoreD mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STDF   $src,$mem" %}
   opcode(Assembler::stdf_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6390,7 +6383,6 @@
   match(Set mem (StoreD mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STX    $src,$mem" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -6402,7 +6394,6 @@
   match(Set mem (StoreF mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STF    $src,$mem" %}
   opcode(Assembler::stf_op3);
   ins_encode(simple_form3_mem_reg( mem, src ) );
@@ -6413,7 +6404,6 @@
   match(Set mem (StoreF mem src));
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$mem\t! storeF0" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
@@ -7068,7 +7058,6 @@
   ins_cost(MEMORY_REF_COST);
 
 #ifndef _LP64
-  size(4);
   format %{ "LDUW   $mem,$dst\t! ptr" %}
   opcode(Assembler::lduw_op3, 0, REGP_OP);
 #else
@@ -8138,7 +8127,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDUW   $src,$dst\t! MoveF2I" %}
   opcode(Assembler::lduw_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -8150,7 +8138,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDF    $src,$dst\t! MoveI2F" %}
   opcode(Assembler::ldf_op3);
   ins_encode(simple_form3_mem_reg(src, dst));
@@ -8162,7 +8149,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDX    $src,$dst\t! MoveD2L" %}
   opcode(Assembler::ldx_op3);
   ins_encode(simple_form3_mem_reg( src, dst ) );
@@ -8174,7 +8160,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "LDDF   $src,$dst\t! MoveL2D" %}
   opcode(Assembler::lddf_op3);
   ins_encode(simple_form3_mem_reg(src, dst));
@@ -8186,7 +8171,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STF   $src,$dst\t! MoveF2I" %}
   opcode(Assembler::stf_op3);
   ins_encode(simple_form3_mem_reg(dst, src));
@@ -8198,7 +8182,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STW    $src,$dst\t! MoveI2F" %}
   opcode(Assembler::stw_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -8210,7 +8193,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STDF   $src,$dst\t! MoveD2L" %}
   opcode(Assembler::stdf_op3);
   ins_encode(simple_form3_mem_reg(dst, src));
@@ -8222,7 +8204,6 @@
   effect(DEF dst, USE src);
   ins_cost(MEMORY_REF_COST);
 
-  size(4);
   format %{ "STX    $src,$dst\t! MoveL2D" %}
   opcode(Assembler::stx_op3);
   ins_encode(simple_form3_mem_reg( dst, src ) );
@@ -8427,7 +8408,6 @@
 instruct convI2D_mem(regD_low dst, memory mem) %{
   match(Set dst (ConvI2D (LoadI mem)));
   ins_cost(DEFAULT_COST + MEMORY_REF_COST);
-  size(8);
   format %{ "LDF    $mem,$dst\n\t"
             "FITOD  $dst,$dst" %}
   opcode(Assembler::ldf_op3, Assembler::fitod_opf);
@@ -8468,7 +8448,6 @@
 instruct convI2F_mem( regF dst, memory mem ) %{
   match(Set dst (ConvI2F (LoadI mem)));
   ins_cost(DEFAULT_COST + MEMORY_REF_COST);
-  size(8);
   format %{ "LDF    $mem,$dst\n\t"
             "FITOS  $dst,$dst" %}
   opcode(Assembler::ldf_op3, Assembler::fitos_opf);
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -463,3 +463,37 @@
   }
   return result;
 }
+
+// Maps a CPU implementation name to feature bits; the match is case-insensitive.
+int VM_Version::parse_features(const char* implementation) {
+  int features = unknown_m;
+  // Work on a private upper-cased copy so the caller's string is left untouched.
+  char* impl = os::strdup_check_oom(implementation);
+
+  for (int i = 0; impl[i] != 0; i++) {
+    // toupper() requires an unsigned char value (or EOF); a negative char is undefined.
+    impl[i] = (char)toupper((unsigned char)impl[i]);
+  }
+
+  if (strstr(impl, "SPARC64") != NULL) {
+    features |= sparc64_family_m;
+  } else if (strstr(impl, "SPARC-M") != NULL) {
+    // M-series SPARC is based on T-series.
+    features |= (M_family_m | T_family_m);
+  } else if (strstr(impl, "SPARC-T") != NULL) {
+    features |= T_family_m;
+    if (strstr(impl, "SPARC-T1") != NULL) {
+      features |= T1_model_m;
+    }
+  } else if (strstr(impl, "SPARC") == NULL) {
+#ifndef PRODUCT
+    // kstat on Solaris 8 virtual machines (branded zones)
+    // returns "(unsupported)" implementation. Solaris 8 is not
+    // supported anymore, but include this check to be on the
+    // safe side.
+    warning("Can't parse CPU implementation = '%s', assume generic SPARC", impl);
+#endif
+  }
+  os::free((void*)impl);
+  return features;
+}
--- a/src/cpu/sparc/vm/vm_version_sparc.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/sparc/vm/vm_version_sparc.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -121,7 +121,7 @@
   static bool is_T1_model(int features) { return is_T_family(features) && ((features & T1_model_m) != 0); }
 
   static int maximum_niagara1_processor_count() { return 32; }
-
+  static int parse_features(const char* implementation);  // CPU implementation name -> feature bit mask
 public:
   // Initialization
   static void initialize();
--- a/src/cpu/x86/vm/assembler_x86.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -777,6 +777,7 @@
     case 0x6E: // movd
     case 0x7E: // movd
     case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
+    case 0xFE: // paddd
       debug_only(has_disp32 = true);
       break;
 
@@ -926,6 +927,7 @@
     ip++; // skip P2, move to opcode
     // To find the end of instruction (which == end_pc_operand).
     switch (0xFF & *ip) {
+    case 0x22: // pinsrd r, r/a, #8
     case 0x61: // pcmpestri r, r/a, #8
     case 0x70: // pshufd r, r/a, #8
     case 0x73: // psrldq r, #8
@@ -3953,6 +3955,83 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::palignr(XMMRegister dst, XMMRegister src, int imm8) {  // register-register PALIGNR with an immediate
+  assert(VM_Version::supports_ssse3(), "");  // only valid when the CPU reports SSSE3
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);  // dst is passed as both destination and nds
+  emit_int8((unsigned char)0x0F);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the top two bits: register operand form
+  emit_int8(imm8);  // immediate is the last byte
+}
+
+void Assembler::pblendw(XMMRegister dst, XMMRegister src, int imm8) {  // register-register PBLENDW with an immediate
+  assert(VM_Version::supports_sse4_1(), "");  // only valid when the CPU reports SSE4.1
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);  // NOTE(review): palignr passes dst as nds, this passes xnoreg - confirm intended
+  emit_int8((unsigned char)0x0E);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the top two bits: register operand form
+  emit_int8(imm8);  // immediate is the last byte
+}
+
+void Assembler::sha1rnds4(XMMRegister dst, XMMRegister src, int imm8) {  // register-register SHA1RNDS4 with an immediate
+  assert(VM_Version::supports_sha(), "");  // only valid when the CPU reports SHA support
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_3A, &attributes);  // no SIMD prefix, 0F 3A opcode map
+  emit_int8((unsigned char)0xCC);  // opcode byte; sha256msg1 uses the same byte with the 0F 38 map
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the top two bits: register operand form
+  emit_int8((unsigned char)imm8);  // immediate is the last byte
+}
+
+void Assembler::sha1nexte(XMMRegister dst, XMMRegister src) {  // register-register SHA1NEXTE
+  assert(VM_Version::supports_sha(), "");  // only valid when the CPU reports SHA support
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);  // no SIMD prefix, 0F 38 opcode map
+  emit_int8((unsigned char)0xC8);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the top two bits: register operand form
+}
+
+void Assembler::sha1msg1(XMMRegister dst, XMMRegister src) {  // register-register SHA1MSG1
+  assert(VM_Version::supports_sha(), "");  // only valid when the CPU reports SHA support
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);  // no SIMD prefix, 0F 38 opcode map
+  emit_int8((unsigned char)0xC9);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the top two bits: register operand form
+}
+
+void Assembler::sha1msg2(XMMRegister dst, XMMRegister src) {  // register-register SHA1MSG2
+  assert(VM_Version::supports_sha(), "");  // only valid when the CPU reports SHA support
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);  // no SIMD prefix, 0F 38 opcode map
+  emit_int8((unsigned char)0xCA);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the top two bits: register operand form
+}
+
+// xmm0 is implicit additional source to this instruction.
+void Assembler::sha256rnds2(XMMRegister dst, XMMRegister src) {  // register-register SHA256RNDS2; xmm0 is not passed explicitly
+  assert(VM_Version::supports_sha(), "");  // only valid when the CPU reports SHA support
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);  // no SIMD prefix, 0F 38 opcode map
+  emit_int8((unsigned char)0xCB);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the top two bits: register operand form
+}
+
+void Assembler::sha256msg1(XMMRegister dst, XMMRegister src) {  // register-register SHA256MSG1
+  assert(VM_Version::supports_sha(), "");  // only valid when the CPU reports SHA support
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);  // no SIMD prefix, 0F 38 opcode map
+  emit_int8((unsigned char)0xCC);  // opcode byte; sha1rnds4 uses the same byte with the 0F 3A map
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the top two bits: register operand form
+}
+
+void Assembler::sha256msg2(XMMRegister dst, XMMRegister src) {  // register-register SHA256MSG2
+  assert(VM_Version::supports_sha(), "");  // only valid when the CPU reports SHA support
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);  // no SIMD prefix, 0F 38 opcode map
+  emit_int8((unsigned char)0xCD);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the top two bits: register operand form
+}
+
+
 void Assembler::shll(Register dst, int imm8) {
   assert(isShiftCount(imm8), "illegal shift count");
   int encode = prefix_and_encode(dst->encoding());
@@ -4931,6 +5010,15 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::paddd(XMMRegister dst, Address src) {  // PADDD with a memory source operand
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // the SSE2 check is wrapped in NOT_LP64
+  InstructionMark im(this);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);  // 66 prefix, 0F opcode map
+  emit_int8((unsigned char)0xFE);  // opcode byte; the operand locator gains a matching 0xFE case in this change
+  emit_operand(dst, src);
+}
+
 void Assembler::paddq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -5611,8 +5699,9 @@
 }
 
 
-void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
@@ -5621,11 +5710,12 @@
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 128 bits
   // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -5633,26 +5723,29 @@
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
   // 0x01 - insert into upper 256 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vinsertf64x4h(XMMRegister dst, Address src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionMark im(this);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1A);
   emit_operand(dst, src);
   // 0x00 - insert into lower 256 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) {
+  // 0x01 - insert into upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -5662,57 +5755,64 @@
   // 0x01 - insert into q1 128 bits (128..255)
   // 0x02 - insert into q2 128 bits (256..383)
   // 0x03 - insert into q3 128 bits (384..511)
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vinsertf32x4h(XMMRegister dst, Address src, int value) {
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {  // memory-source form; imm8 selects the 128-bit lane (q0..q3) that is written
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);  // NOTE(review): the register overload and vextractf32x4(Address, ...) assert supports_evex(); here only AVX is required and vector_len can be AVX_256bit although imm8 values 2 and 3 name bits above 255 - confirm
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;  // an invalid nds is encoded as register 0
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);  // nds is now passed separately instead of reusing dst
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x00 - insert into q0 128 bits (0..127)
   // 0x01 - insert into q1 128 bits (128..255)
   // 0x02 - insert into q2 128 bits (256..383)
   // 0x03 - insert into q3 128 bits (384..511)
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vinsertf128h(XMMRegister dst, Address src) {
+  emit_int8(imm8 & 0x03);  // only the two lane-select bits are emitted
+}
+
+void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_operand(dst, src);
+  // 0x00 - insert into lower 128 bits
   // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextractf128h(Address dst, XMMRegister src) {
+  // 0x00 - extract from lower 128 bits
+  // 0x01 - extract from upper 128 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf128(Address dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
@@ -5720,12 +5820,14 @@
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_operand(src, dst);
+  // 0x00 - extract from lower 128 bits
   // 0x01 - extract from upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx2(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
@@ -5734,11 +5836,12 @@
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 128 bits
   // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -5746,39 +5849,44 @@
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
   // 0x01 - insert into upper 256 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vinserti128h(XMMRegister dst, Address src) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
   assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x38);
   emit_operand(dst, src);
+  // 0x00 - insert into lower 128 bits
   // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextracti128h(XMMRegister dst, XMMRegister src) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {  // register form; imm8 selects which 128-bit half of src is extracted
   assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);  // NOTE(review): the assert above requires only AVX, while vextracti128(Address, ...) and both vinserti128 overloads assert supports_avx2() - confirm whether supports_avx2() is intended here
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextracti128h(Address dst, XMMRegister src) {
+  // 0x00 - extract from lower 128 bits
+  // 0x01 - extract from upper 128 bits
+  emit_int8(imm8 & 0x01);  // only the half-select bit is emitted
+}
+
+void Assembler::vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx2(), "");
   assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
@@ -5786,47 +5894,53 @@
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_operand(src, dst);
+  // 0x00 - extract from lower 128 bits
   // 0x01 - extract from upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextracti64x4h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x3B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - extract from lower 256 bits
   // 0x01 - extract from upper 256 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vextracti64x2h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
   // 0x01 - extract from bits 255:128
   // 0x02 - extract from bits 383:256
   // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vextractf64x4h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - extract from lower 256 bits
   // 0x01 - extract from upper 256 bits
-  emit_int8(value & 0x1);
-}
-
-void Assembler::vextractf64x4h(Address dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf64x4(Address dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionMark im(this);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4,/* input_size_in_bits */  EVEX_64bit);
@@ -5835,11 +5949,12 @@
   emit_operand(src, dst);
   // 0x00 - extract from lower 256 bits
   // 0x01 - extract from upper 256 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vextractf32x4h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -5849,12 +5964,13 @@
   // 0x01 - extract from bits 255:128
   // 0x02 - extract from bits 383:256
   // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vextractf32x4h(Address dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextractf32x4(Address dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionMark im(this);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
@@ -5865,19 +5981,21 @@
   // 0x01 - extract from bits 255:128
   // 0x02 - extract from bits 383:256
   // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
   // 0x01 - extract from bits 255:128
   // 0x02 - extract from bits 383:256
   // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
+  emit_int8(imm8 & 0x03);
 }
 
 // duplicate 4-bytes integer data from src into 8 locations in dest
--- a/src/cpu/x86/vm/assembler_x86.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1672,6 +1672,18 @@
 
   void setb(Condition cc, Register dst);
 
+  void palignr(XMMRegister dst, XMMRegister src, int imm8);
+  void pblendw(XMMRegister dst, XMMRegister src, int imm8);
+
+  void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8);
+  void sha1nexte(XMMRegister dst, XMMRegister src);
+  void sha1msg1(XMMRegister dst, XMMRegister src);
+  void sha1msg2(XMMRegister dst, XMMRegister src);
+  // xmm0 is implicit additional source to the following instruction.
+  void sha256rnds2(XMMRegister dst, XMMRegister src);
+  void sha256msg1(XMMRegister dst, XMMRegister src);
+  void sha256msg2(XMMRegister dst, XMMRegister src);
+
   void shldl(Register dst, Register src);
   void shldl(Register dst, Register src, int8_t imm8);
 
@@ -1868,6 +1880,7 @@
   void paddb(XMMRegister dst, XMMRegister src);
   void paddw(XMMRegister dst, XMMRegister src);
   void paddd(XMMRegister dst, XMMRegister src);
+  void paddd(XMMRegister dst, Address src);
   void paddq(XMMRegister dst, XMMRegister src);
   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -1958,33 +1971,31 @@
   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
-  // Copy low 128bit into high 128bit of YMM registers.
-  void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
-  void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
-  void vextractf128h(XMMRegister dst, XMMRegister src);
-  void vextracti128h(XMMRegister dst, XMMRegister src);
-
-  // Load/store high 128bit of YMM registers which does not destroy other half.
-  void vinsertf128h(XMMRegister dst, Address src);
-  void vinserti128h(XMMRegister dst, Address src);
-  void vextractf128h(Address dst, XMMRegister src);
-  void vextracti128h(Address dst, XMMRegister src);
-
-  // Copy low 256bit into high 256bit of ZMM registers.
-  void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
-  void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
-  void vextracti64x4h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf64x4h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf64x4h(Address dst, XMMRegister src, int value);
-  void vinsertf64x4h(XMMRegister dst, Address src, int value);
-
-  // Copy targeted 128bit segments of the ZMM registers
-  void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf32x4h(Address dst, XMMRegister src, int value);
-  void vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
-  void vinsertf32x4h(XMMRegister dst, Address src, int value);
+  // 128bit copy from/to 256bit (YMM) vector registers
+  void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vextractf128(Address dst, XMMRegister src, uint8_t imm8);
+  void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
+
+  // 256bit copy from/to 512bit (ZMM) vector registers
+  void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);
+  void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+
+  // 128bit copy from/to 256bit (YMM) or 512bit (ZMM) vector registers
+  void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
+  void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
 
   // duplicate 4-bytes integer data from src into 8 locations in dest
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
--- a/src/cpu/x86/vm/globals_x86.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/globals_x86.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -97,6 +97,8 @@
 
 define_pd_global(bool, PreserveFramePointer, false);
 
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
+
 #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                             \
   develop(bool, IEEEPrecision, true,                                        \
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -3445,7 +3445,7 @@
 
 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
-    Assembler::vextractf32x4h(dst, src, 0);
+    Assembler::vextractf32x4(dst, src, 0);
   } else {
     Assembler::movdqu(dst, src);
   }
@@ -3453,7 +3453,7 @@
 
 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
-    Assembler::vinsertf32x4h(dst, src, 0);
+    Assembler::vinsertf32x4(dst, dst, src, 0);
   } else {
     Assembler::movdqu(dst, src);
   }
@@ -3478,7 +3478,7 @@
 
 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
-    Assembler::vextractf64x4h(dst, src, 0);
+    vextractf64x4_low(dst, src);
   } else {
     Assembler::vmovdqu(dst, src);
   }
@@ -3486,7 +3486,7 @@
 
 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
-    Assembler::vinsertf64x4h(dst, src, 0);
+    vinsertf64x4_low(dst, src);
   } else {
     Assembler::vmovdqu(dst, src);
   }
@@ -5649,14 +5649,14 @@
         // Save upper half of ZMM registers
         subptr(rsp, 32*num_xmm_regs);
         for (int n = 0; n < num_xmm_regs; n++) {
-          vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
+          vextractf64x4_high(Address(rsp, n*32), as_XMMRegister(n));
         }
       }
       assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
       // Save upper half of YMM registers
       subptr(rsp, 16*num_xmm_regs);
       for (int n = 0; n < num_xmm_regs; n++) {
-        vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
+        vextractf128_high(Address(rsp, n*16), as_XMMRegister(n));
       }
     }
 #endif
@@ -5665,7 +5665,7 @@
 #ifdef _LP64
     if (VM_Version::supports_evex()) {
       for (int n = 0; n < num_xmm_regs; n++) {
-        vextractf32x4h(Address(rsp, n*16), as_XMMRegister(n), 0);
+        vextractf32x4(Address(rsp, n*16), as_XMMRegister(n), 0);
       }
     } else {
       for (int n = 0; n < num_xmm_regs; n++) {
@@ -5753,7 +5753,7 @@
 #ifdef _LP64
   if (VM_Version::supports_evex()) {
     for (int n = 0; n < num_xmm_regs; n++) {
-      vinsertf32x4h(as_XMMRegister(n), Address(rsp, n*16), 0);
+      vinsertf32x4(as_XMMRegister(n), as_XMMRegister(n), Address(rsp, n*16), 0);
     }
   } else {
     for (int n = 0; n < num_xmm_regs; n++) {
@@ -5771,12 +5771,12 @@
     if (MaxVectorSize > 16) {
       // Restore upper half of YMM registers.
       for (int n = 0; n < num_xmm_regs; n++) {
-        vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
+        vinsertf128_high(as_XMMRegister(n), Address(rsp, n*16));
       }
       addptr(rsp, 16*num_xmm_regs);
       if(UseAVX > 2) {
         for (int n = 0; n < num_xmm_regs; n++) {
-          vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
+          vinsertf64x4_high(as_XMMRegister(n), Address(rsp, n*32));
         }
         addptr(rsp, 32*num_xmm_regs);
       }
@@ -7198,21 +7198,50 @@
 
 }
 
-void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
+void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) {  // emits code that zeroes cnt qwords starting at base
   // cnt - number of qwords (8-byte words).
   // base - start address, qword aligned.
+  // is_large - true if optimizers know the size exceeds InitArrayShortSize bytes; the short-count check and store loop are then not emitted
   assert(base==rdi, "base register must be edi for rep stos");
   assert(tmp==rax,   "tmp register must be eax for rep stos");
   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
+  assert(InitArrayShortSize % BytesPerLong == 0,
+    "InitArrayShortSize should be the multiple of BytesPerLong");
+
+  Label DONE;  // bound at the very end; the short path jumps here once it has finished
 
   xorptr(tmp, tmp);
+
+  if (!is_large) {
+    Label LOOP, LONG;
+    cmpptr(cnt, InitArrayShortSize/BytesPerLong);  // cnt is in qwords, so the byte threshold is scaled to qwords
+    jccb(Assembler::greater, LONG);  // above the threshold: use the rep-prefixed path below
+
+    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
+
+    decrement(cnt);  // cnt now indexes the last pointer-sized word to clear
+    jccb(Assembler::negative, DONE); // Zero length
+
+    // Use individual pointer-sized stores for small counts:
+    BIND(LOOP);
+    movptr(Address(base, cnt, Address::times_ptr), tmp);  // tmp was zeroed above; writes base[cnt]
+    decrement(cnt);
+    jccb(Assembler::greaterEqual, LOOP);  // keep going while the index is still >= 0
+    jmpb(DONE);
+
+    BIND(LONG);
+  }
+
+  // Use longer rep-prefixed ops for non-small counts:
   if (UseFastStosb) {
-    shlptr(cnt,3); // convert to number of bytes
+    shlptr(cnt, 3); // convert to number of bytes
     rep_stosb();
   } else {
-    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
+    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
     rep_stos();
   }
+
+  BIND(DONE);
 }
 
 #ifdef COMPILER2
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -48,7 +48,6 @@
   // This is the base routine called by the different versions of call_VM_leaf. The interpreter
   // may customize this version by overriding it for its purposes (e.g., to save/restore
   // additional registers when doing a VM call).
-#define COMMA ,
 
   virtual void call_VM_leaf_base(
     address entry_point,               // the entry point
@@ -903,35 +902,66 @@
   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
   void ldmxcsr(AddressLiteral src);
 
+  void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
+                 XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
+                 Register buf, Register state, Register ofs, Register limit, Register rsp,
+                 bool multi_block);
+
+#ifdef _LP64
+  void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+                   Register buf, Register state, Register ofs, Register limit, Register rsp,
+                   bool multi_block, XMMRegister shuf_mask);
+#else
+  void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+                   Register buf, Register state, Register ofs, Register limit, Register rsp,
+                   bool multi_block);
+#endif
+
   void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                 Register rax, Register rcx, Register rdx, Register tmp);
 
+#ifdef _LP64
   void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
-                Register rax, Register rcx, Register rdx, Register tmp1 LP64_ONLY(COMMA Register tmp2));
+                Register rax, Register rcx, Register rdx, Register tmp1, Register tmp2);
 
   void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                 XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
-                Register rdx NOT_LP64(COMMA  Register tmp) LP64_ONLY(COMMA  Register tmp1)
-                LP64_ONLY(COMMA  Register tmp2) LP64_ONLY(COMMA  Register tmp3) LP64_ONLY(COMMA  Register tmp4));
+                Register rdx, Register tmp1, Register tmp2, Register tmp3, Register tmp4);
 
   void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
-                Register rax, Register rbx LP64_ONLY(COMMA  Register rcx), Register rdx
-                LP64_ONLY(COMMA Register tmp1) LP64_ONLY(COMMA Register tmp2)
-                LP64_ONLY(COMMA Register tmp3) LP64_ONLY(COMMA Register tmp4));
+                Register rax, Register rbx, Register rcx, Register rdx, Register tmp1, Register tmp2,
+                Register tmp3, Register tmp4);
 
   void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                 XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
-                Register rax, Register rcx, Register rdx NOT_LP64(COMMA Register tmp)
-                LP64_ONLY(COMMA Register r8) LP64_ONLY(COMMA Register r9)
-                LP64_ONLY(COMMA Register r10) LP64_ONLY(COMMA Register r11));
+                Register rax, Register rcx, Register rdx, Register tmp1,
+                Register tmp2, Register tmp3, Register tmp4);
+#else
+  void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                Register rax, Register rcx, Register rdx, Register tmp1);
 
-#ifndef _LP64
+  void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
+                XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
+                Register rdx, Register tmp);
+
+  void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                Register rax, Register rbx, Register rdx);
+
+  void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+                Register rax, Register rcx, Register rdx, Register tmp);
+
   void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
                         Register edx, Register ebx, Register esi, Register edi,
                         Register ebp, Register esp);
+
   void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
                          Register esi, Register edi, Register ebp, Register esp);
 #endif
@@ -1185,14 +1215,131 @@
   void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); }
   void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); }
 
-  // Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector.
-  void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-    if (UseAVX > 1) // vinserti128h is available only in AVX2
-      Assembler::vinserti128h(dst, nds, src);
-    else
-      Assembler::vinsertf128h(dst, nds, src);
+  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+    if (UseAVX > 1) { // vinserti128 is available only in AVX2
+      Assembler::vinserti128(dst, nds, src, imm8);
+    } else {
+      Assembler::vinsertf128(dst, nds, src, imm8);
+    }
   }
 
+  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+    if (UseAVX > 1) { // vinserti128 is available only in AVX2
+      Assembler::vinserti128(dst, nds, src, imm8);
+    } else {
+      Assembler::vinsertf128(dst, nds, src, imm8);
+    }
+  }
+
+  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+    if (UseAVX > 1) { // vextracti128 is available only in AVX2
+      Assembler::vextracti128(dst, src, imm8);
+    } else {
+      Assembler::vextractf128(dst, src, imm8);
+    }
+  }
+
+  void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
+    if (UseAVX > 1) { // vextracti128 is available only in AVX2
+      Assembler::vextracti128(dst, src, imm8);
+    } else {
+      Assembler::vextractf128(dst, src, imm8);
+    }
+  }
+
+  // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
+  void vinserti128_high(XMMRegister dst, XMMRegister src) {
+    vinserti128(dst, dst, src, 1);
+  }
+  void vinserti128_high(XMMRegister dst, Address src) {
+    vinserti128(dst, dst, src, 1);
+  }
+  void vextracti128_high(XMMRegister dst, XMMRegister src) {
+    vextracti128(dst, src, 1);
+  }
+  void vextracti128_high(Address dst, XMMRegister src) {
+    vextracti128(dst, src, 1);
+  }
+  void vinsertf128_high(XMMRegister dst, XMMRegister src) {
+    vinsertf128(dst, dst, src, 1);
+  }
+  void vinsertf128_high(XMMRegister dst, Address src) {
+    vinsertf128(dst, dst, src, 1);
+  }
+  void vextractf128_high(XMMRegister dst, XMMRegister src) {
+    vextractf128(dst, src, 1);
+  }
+  void vextractf128_high(Address dst, XMMRegister src) {
+    vextractf128(dst, src, 1);
+  }
+
+  // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
+  void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
+    vinserti64x4(dst, dst, src, 1);
+  }
+  void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
+    vinsertf64x4(dst, dst, src, 1);
+  }
+  void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
+    vextracti64x4(dst, src, 1);
+  }
+  void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
+    vextractf64x4(dst, src, 1);
+  }
+  void vextractf64x4_high(Address dst, XMMRegister src) {
+    vextractf64x4(dst, src, 1);
+  }
+  void vinsertf64x4_high(XMMRegister dst, Address src) {
+    vinsertf64x4(dst, dst, src, 1);
+  }
+
+  // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
+  void vinserti128_low(XMMRegister dst, XMMRegister src) {
+    vinserti128(dst, dst, src, 0);
+  }
+  void vinserti128_low(XMMRegister dst, Address src) {
+    vinserti128(dst, dst, src, 0);
+  }
+  void vextracti128_low(XMMRegister dst, XMMRegister src) {
+    vextracti128(dst, src, 0);
+  }
+  void vextracti128_low(Address dst, XMMRegister src) {
+    vextracti128(dst, src, 0);
+  }
+  void vinsertf128_low(XMMRegister dst, XMMRegister src) {
+    vinsertf128(dst, dst, src, 0);
+  }
+  void vinsertf128_low(XMMRegister dst, Address src) {
+    vinsertf128(dst, dst, src, 0);
+  }
+  void vextractf128_low(XMMRegister dst, XMMRegister src) {
+    vextractf128(dst, src, 0);
+  }
+  void vextractf128_low(Address dst, XMMRegister src) {
+    vextractf128(dst, src, 0);
+  }
+
+  // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
+  void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
+    vinserti64x4(dst, dst, src, 0);
+  }
+  void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
+    vinsertf64x4(dst, dst, src, 0);
+  }
+  void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
+    vextracti64x4(dst, src, 0);
+  }
+  void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
+    vextractf64x4(dst, src, 0);
+  }
+  void vextractf64x4_low(Address dst, XMMRegister src) {
+    vextractf64x4(dst, src, 0);
+  }
+  void vinsertf64x4_low(XMMRegister dst, Address src) {
+    vinsertf64x4(dst, dst, src, 0);
+  }
+
+
   // Carry-Less Multiplication Quadword
   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
     // 0x00 - multiply lower 64 bits [0:63]
@@ -1284,8 +1431,9 @@
   // C2 compiled method's prolog code.
   void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b);
 
-  // clear memory of size 'cnt' qwords, starting at 'base'.
-  void clear_mem(Register base, Register cnt, Register rtmp);
+  // clear memory of size 'cnt' qwords, starting at 'base';
+  // if 'is_large' is set, do not try to produce short loop
+  void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
 
 #ifdef COMPILER2
   void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cpu/x86/vm/macroAssembler_x86_sha.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -0,0 +1,495 @@
+/*
+* Copyright (c) 2016, Intel Corporation.
+*
+* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+*
+* This code is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License version 2 only, as
+* published by the Free Software Foundation.
+*
+* This code is distributed in the hope that it will be useful, but WITHOUT
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+* version 2 for more details (a copy is included in the LICENSE file that
+* accompanied this code).
+*
+* You should have received a copy of the GNU General Public License version
+* 2 along with this work; if not, write to the Free Software Foundation,
+* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+*
+* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+* or visit www.oracle.com if you need additional information or have any
+* questions.
+*
+*/
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "macroAssembler_x86.hpp"
+
+// ofs and limit are used for multi-block byte array.
+// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
+  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
+  Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {
+
+  Label start, done_hash, loop0;
+
+  address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
+  address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();
+
+  bind(start);
+  movdqu(abcd, Address(state, 0));
+  pinsrd(e0, Address(state, 16), 3);
+  movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
+  pand(e0, shuf_mask);
+  pshufd(abcd, abcd, 0x1B);
+  movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f
+
+  bind(loop0);
+  // Save hash values for addition after rounds
+  movdqu(Address(rsp, 0), e0);
+  movdqu(Address(rsp, 16), abcd);
+
+
+  // Rounds 0 - 3
+  movdqu(msg0, Address(buf, 0));
+  pshufb(msg0, shuf_mask);
+  paddd(e0, msg0);
+  movdqa(e1, abcd);
+  sha1rnds4(abcd, e0, 0);
+
+  // Rounds 4 - 7
+  movdqu(msg1, Address(buf, 16));
+  pshufb(msg1, shuf_mask);
+  sha1nexte(e1, msg1);
+  movdqa(e0, abcd);
+  sha1rnds4(abcd, e1, 0);
+  sha1msg1(msg0, msg1);
+
+  // Rounds 8 - 11
+  movdqu(msg2, Address(buf, 32));
+  pshufb(msg2, shuf_mask);
+  sha1nexte(e0, msg2);
+  movdqa(e1, abcd);
+  sha1rnds4(abcd, e0, 0);
+  sha1msg1(msg1, msg2);
+  pxor(msg0, msg2);
+
+  // Rounds 12 - 15
+  movdqu(msg3, Address(buf, 48));
+  pshufb(msg3, shuf_mask);
+  sha1nexte(e1, msg3);
+  movdqa(e0, abcd);
+  sha1msg2(msg0, msg3);
+  sha1rnds4(abcd, e1, 0);
+  sha1msg1(msg2, msg3);
+  pxor(msg1, msg3);
+
+  // Rounds 16 - 19
+  sha1nexte(e0, msg0);
+  movdqa(e1, abcd);
+  sha1msg2(msg1, msg0);
+  sha1rnds4(abcd, e0, 0);
+  sha1msg1(msg3, msg0);
+  pxor(msg2, msg0);
+
+  // Rounds 20 - 23
+  sha1nexte(e1, msg1);
+  movdqa(e0, abcd);
+  sha1msg2(msg2, msg1);
+  sha1rnds4(abcd, e1, 1);
+  sha1msg1(msg0, msg1);
+  pxor(msg3, msg1);
+
+  // Rounds 24 - 27
+  sha1nexte(e0, msg2);
+  movdqa(e1, abcd);
+  sha1msg2(msg3, msg2);
+  sha1rnds4(abcd, e0, 1);
+  sha1msg1(msg1, msg2);
+  pxor(msg0, msg2);
+
+  // Rounds 28 - 31
+  sha1nexte(e1, msg3);
+  movdqa(e0, abcd);
+  sha1msg2(msg0, msg3);
+  sha1rnds4(abcd, e1, 1);
+  sha1msg1(msg2, msg3);
+  pxor(msg1, msg3);
+
+  // Rounds 32 - 35
+  sha1nexte(e0, msg0);
+  movdqa(e1, abcd);
+  sha1msg2(msg1, msg0);
+  sha1rnds4(abcd, e0, 1);
+  sha1msg1(msg3, msg0);
+  pxor(msg2, msg0);
+
+  // Rounds 36 - 39
+  sha1nexte(e1, msg1);
+  movdqa(e0, abcd);
+  sha1msg2(msg2, msg1);
+  sha1rnds4(abcd, e1, 1);
+  sha1msg1(msg0, msg1);
+  pxor(msg3, msg1);
+
+  // Rounds 40 - 43
+  sha1nexte(e0, msg2);
+  movdqa(e1, abcd);
+  sha1msg2(msg3, msg2);
+  sha1rnds4(abcd, e0, 2);
+  sha1msg1(msg1, msg2);
+  pxor(msg0, msg2);
+
+  // Rounds 44 - 47
+  sha1nexte(e1, msg3);
+  movdqa(e0, abcd);
+  sha1msg2(msg0, msg3);
+  sha1rnds4(abcd, e1, 2);
+  sha1msg1(msg2, msg3);
+  pxor(msg1, msg3);
+
+  // Rounds 48 - 51
+  sha1nexte(e0, msg0);
+  movdqa(e1, abcd);
+  sha1msg2(msg1, msg0);
+  sha1rnds4(abcd, e0, 2);
+  sha1msg1(msg3, msg0);
+  pxor(msg2, msg0);
+
+  // Rounds 52 - 55
+  sha1nexte(e1, msg1);
+  movdqa(e0, abcd);
+  sha1msg2(msg2, msg1);
+  sha1rnds4(abcd, e1, 2);
+  sha1msg1(msg0, msg1);
+  pxor(msg3, msg1);
+
+  // Rounds 56 - 59
+  sha1nexte(e0, msg2);
+  movdqa(e1, abcd);
+  sha1msg2(msg3, msg2);
+  sha1rnds4(abcd, e0, 2);
+  sha1msg1(msg1, msg2);
+  pxor(msg0, msg2);
+
+  // Rounds 60 - 63
+  sha1nexte(e1, msg3);
+  movdqa(e0, abcd);
+  sha1msg2(msg0, msg3);
+  sha1rnds4(abcd, e1, 3);
+  sha1msg1(msg2, msg3);
+  pxor(msg1, msg3);
+
+  // Rounds 64 - 67
+  sha1nexte(e0, msg0);
+  movdqa(e1, abcd);
+  sha1msg2(msg1, msg0);
+  sha1rnds4(abcd, e0, 3);
+  sha1msg1(msg3, msg0);
+  pxor(msg2, msg0);
+
+  // Rounds 68 - 71
+  sha1nexte(e1, msg1);
+  movdqa(e0, abcd);
+  sha1msg2(msg2, msg1);
+  sha1rnds4(abcd, e1, 3);
+  pxor(msg3, msg1);
+
+  // Rounds 72 - 75
+  sha1nexte(e0, msg2);
+  movdqa(e1, abcd);
+  sha1msg2(msg3, msg2);
+  sha1rnds4(abcd, e0, 3);
+
+  // Rounds 76 - 79
+  sha1nexte(e1, msg3);
+  movdqa(e0, abcd);
+  sha1rnds4(abcd, e1, 3);
+
+  // add current hash values with previously saved
+  movdqu(msg0, Address(rsp, 0));
+  sha1nexte(e0, msg0);
+  movdqu(msg0, Address(rsp, 16));
+  paddd(abcd, msg0);
+
+  if (multi_block) {
+    // increment data pointer and loop if more to process
+    addptr(buf, 64);
+    addptr(ofs, 64);
+    cmpptr(ofs, limit);
+    jcc(Assembler::belowEqual, loop0);
+    movptr(rax, ofs); //return ofs
+  }
+  // write hash values back in the correct order
+  pshufd(abcd, abcd, 0x1b);
+  movdqu(Address(state, 0), abcd);
+  pextrd(Address(state, 16), e0, 3);
+
+  bind(done_hash);
+
+}
+
+// xmm0 (msg) is used as an implicit argument to sha256rnds2
+// and state0 and state1 can never use xmm0 register.
+// ofs and limit are used for multi-block byte array.
+// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+#ifdef _LP64
+void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+  Register buf, Register state, Register ofs, Register limit, Register rsp,
+  bool multi_block, XMMRegister shuf_mask) {
+#else
+void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+  Register buf, Register state, Register ofs, Register limit, Register rsp,
+  bool multi_block) {
+#endif
+  Label start, done_hash, loop0;
+
+  address K256 = StubRoutines::x86::k256_addr();
+  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
+
+  bind(start);
+  movdqu(state0, Address(state, 0));
+  movdqu(state1, Address(state, 16));
+
+  pshufd(state0, state0, 0xB1);
+  pshufd(state1, state1, 0x1B);
+  movdqa(msgtmp4, state0);
+  palignr(state0, state1, 8);
+  pblendw(state1, msgtmp4, 0xF0);
+
+#ifdef _LP64
+  movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
+#endif
+  lea(rax, ExternalAddress(K256));
+
+  bind(loop0);
+  movdqu(Address(rsp, 0), state0);
+  movdqu(Address(rsp, 16), state1);
+
+  // Rounds 0-3
+  movdqu(msg, Address(buf, 0));
+#ifdef _LP64
+  pshufb(msg, shuf_mask);
+#else
+  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
+#endif
+  movdqa(msgtmp0, msg);
+  paddd(msg, Address(rax, 0));
+  sha256rnds2(state1, state0);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+
+  // Rounds 4-7
+  movdqu(msg, Address(buf, 16));
+#ifdef _LP64
+  pshufb(msg, shuf_mask);
+#else
+  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
+#endif
+  movdqa(msgtmp1, msg);
+  paddd(msg, Address(rax, 16));
+  sha256rnds2(state1, state0);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp0, msgtmp1);
+
+  // Rounds 8-11
+  movdqu(msg, Address(buf, 32));
+#ifdef _LP64
+  pshufb(msg, shuf_mask);
+#else
+  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
+#endif
+  movdqa(msgtmp2, msg);
+  paddd(msg, Address(rax, 32));
+  sha256rnds2(state1, state0);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp1, msgtmp2);
+
+  // Rounds 12-15
+  movdqu(msg, Address(buf, 48));
+#ifdef _LP64
+  pshufb(msg, shuf_mask);
+#else
+  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
+#endif
+  movdqa(msgtmp3, msg);
+  paddd(msg, Address(rax, 48));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp3);
+  palignr(msgtmp4, msgtmp2, 4);
+  paddd(msgtmp0, msgtmp4);
+  sha256msg2(msgtmp0, msgtmp3);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp2, msgtmp3);
+
+  // Rounds 16-19
+  movdqa(msg, msgtmp0);
+  paddd(msg, Address(rax, 64));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp0);
+  palignr(msgtmp4, msgtmp3, 4);
+  paddd(msgtmp1, msgtmp4);
+  sha256msg2(msgtmp1, msgtmp0);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp3, msgtmp0);
+
+  // Rounds 20-23
+  movdqa(msg, msgtmp1);
+  paddd(msg, Address(rax, 80));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp1);
+  palignr(msgtmp4, msgtmp0, 4);
+  paddd(msgtmp2, msgtmp4);
+  sha256msg2(msgtmp2, msgtmp1);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp0, msgtmp1);
+
+  // Rounds 24-27
+  movdqa(msg, msgtmp2);
+  paddd(msg, Address(rax, 96));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp2);
+  palignr(msgtmp4, msgtmp1, 4);
+  paddd(msgtmp3, msgtmp4);
+  sha256msg2(msgtmp3, msgtmp2);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp1, msgtmp2);
+
+  // Rounds 28-31
+  movdqa(msg, msgtmp3);
+  paddd(msg, Address(rax, 112));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp3);
+  palignr(msgtmp4, msgtmp2, 4);
+  paddd(msgtmp0, msgtmp4);
+  sha256msg2(msgtmp0, msgtmp3);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp2, msgtmp3);
+
+  // Rounds 32-35
+  movdqa(msg, msgtmp0);
+  paddd(msg, Address(rax, 128));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp0);
+  palignr(msgtmp4, msgtmp3, 4);
+  paddd(msgtmp1, msgtmp4);
+  sha256msg2(msgtmp1, msgtmp0);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp3, msgtmp0);
+
+  // Rounds 36-39
+  movdqa(msg, msgtmp1);
+  paddd(msg, Address(rax, 144));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp1);
+  palignr(msgtmp4, msgtmp0, 4);
+  paddd(msgtmp2, msgtmp4);
+  sha256msg2(msgtmp2, msgtmp1);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp0, msgtmp1);
+
+  // Rounds 40-43
+  movdqa(msg, msgtmp2);
+  paddd(msg, Address(rax, 160));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp2);
+  palignr(msgtmp4, msgtmp1, 4);
+  paddd(msgtmp3, msgtmp4);
+  sha256msg2(msgtmp3, msgtmp2);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp1, msgtmp2);
+
+  // Rounds 44-47
+  movdqa(msg, msgtmp3);
+  paddd(msg, Address(rax, 176));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp3);
+  palignr(msgtmp4, msgtmp2, 4);
+  paddd(msgtmp0, msgtmp4);
+  sha256msg2(msgtmp0, msgtmp3);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp2, msgtmp3);
+
+  // Rounds 48-51
+  movdqa(msg, msgtmp0);
+  paddd(msg, Address(rax, 192));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp0);
+  palignr(msgtmp4, msgtmp3, 4);
+  paddd(msgtmp1, msgtmp4);
+  sha256msg2(msgtmp1, msgtmp0);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  sha256msg1(msgtmp3, msgtmp0);
+
+  // Rounds 52-55
+  movdqa(msg, msgtmp1);
+  paddd(msg, Address(rax, 208));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp1);
+  palignr(msgtmp4, msgtmp0, 4);
+  paddd(msgtmp2, msgtmp4);
+  sha256msg2(msgtmp2, msgtmp1);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+
+  // Rounds 56-59
+  movdqa(msg, msgtmp2);
+  paddd(msg, Address(rax, 224));
+  sha256rnds2(state1, state0);
+  movdqa(msgtmp4, msgtmp2);
+  palignr(msgtmp4, msgtmp1, 4);
+  paddd(msgtmp3, msgtmp4);
+  sha256msg2(msgtmp3, msgtmp2);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+
+  // Rounds 60-63
+  movdqa(msg, msgtmp3);
+  paddd(msg, Address(rax, 240));
+  sha256rnds2(state1, state0);
+  pshufd(msg, msg, 0x0E);
+  sha256rnds2(state0, state1);
+  movdqu(msg, Address(rsp, 0));
+  paddd(state0, msg);
+  movdqu(msg, Address(rsp, 16));
+  paddd(state1, msg);
+
+  if (multi_block) {
+    // increment data pointer and loop if more to process
+    addptr(buf, 64);
+    addptr(ofs, 64);
+    cmpptr(ofs, limit);
+    jcc(Assembler::belowEqual, loop0);
+    movptr(rax, ofs); //return ofs
+  }
+
+  pshufd(state0, state0, 0x1B);
+  pshufd(state1, state1, 0xB1);
+  movdqa(msgtmp4, state0);
+  pblendw(state0, state1, 0xF0);
+  palignr(state1, msgtmp4, 8);
+
+  movdqu(Address(state, 0), state0);
+  movdqu(Address(state, 16), state1);
+
+  bind(done_hash);
+
+}
--- a/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -208,13 +208,13 @@
     __ subptr(rsp, ymm_bytes);
     // Save upper half of YMM registers
     for (int n = 0; n < num_xmm_regs; n++) {
-      __ vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
+      __ vextractf128_high(Address(rsp, n*16), as_XMMRegister(n));
     }
     if (UseAVX > 2) {
       __ subptr(rsp, zmm_bytes);
       // Save upper half of ZMM registers
       for (int n = 0; n < num_xmm_regs; n++) {
-        __ vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
+        __ vextractf64x4_high(Address(rsp, n*32), as_XMMRegister(n));
       }
     }
   }
@@ -304,13 +304,13 @@
     if (UseAVX > 2) {
       // Restore upper half of ZMM registers.
       for (int n = 0; n < num_xmm_regs; n++) {
-        __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
+        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, n*32));
       }
       __ addptr(rsp, zmm_bytes);
     }
     // Restore upper half of YMM registers.
     for (int n = 0; n < num_xmm_regs; n++) {
-      __ vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
+      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, n*16));
     }
     __ addptr(rsp, ymm_bytes);
   }
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -179,13 +179,13 @@
     // Save upper half of YMM registers(0..15)
     int base_addr = XSAVE_AREA_YMM_BEGIN;
     for (int n = 0; n < 16; n++) {
-      __ vextractf128h(Address(rsp, base_addr+n*16), as_XMMRegister(n));
+      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
     }
     if (VM_Version::supports_evex()) {
       // Save upper half of ZMM registers(0..15)
       base_addr = XSAVE_AREA_ZMM_BEGIN;
       for (int n = 0; n < 16; n++) {
-        __ vextractf64x4h(Address(rsp, base_addr+n*32), as_XMMRegister(n), 1);
+        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
       }
       // Save full ZMM registers(16..num_xmm_regs)
       base_addr = XSAVE_AREA_UPPERBANK;
@@ -333,13 +333,13 @@
     // Restore upper half of YMM registers (0..15)
     int base_addr = XSAVE_AREA_YMM_BEGIN;
     for (int n = 0; n < 16; n++) {
-      __ vinsertf128h(as_XMMRegister(n), Address(rsp,  base_addr+n*16));
+      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
     }
     if (VM_Version::supports_evex()) {
       // Restore upper half of ZMM registers (0..15)
       base_addr = XSAVE_AREA_ZMM_BEGIN;
       for (int n = 0; n < 16; n++) {
-        __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, base_addr+n*32), 1);
+        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
       }
       // Restore full ZMM registers(16..num_xmm_regs)
       base_addr = XSAVE_AREA_UPPERBANK;
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -3068,6 +3068,136 @@
     return start;
   }
 
+  address generate_upper_word_mask() {
+    __ align(64);
+    StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
+    address start = __ pc();
+    __ emit_data(0x00000000, relocInfo::none, 0);
+    __ emit_data(0x00000000, relocInfo::none, 0);
+    __ emit_data(0x00000000, relocInfo::none, 0);
+    __ emit_data(0xFFFFFFFF, relocInfo::none, 0);
+    return start;
+  }
+
+  address generate_shuffle_byte_flip_mask() {
+    __ align(64);
+    StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
+    address start = __ pc();
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
+    __ emit_data(0x08090a0b, relocInfo::none, 0);
+    __ emit_data(0x04050607, relocInfo::none, 0);
+    __ emit_data(0x00010203, relocInfo::none, 0);
+    return start;
+  }
+
+  // ofs and limit are used for multi-block byte array.
+  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+  address generate_sha1_implCompress(bool multi_block, const char *name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+    // General-purpose registers that receive the stack-passed arguments.
+    Register buf   = rax;
+    Register state = rdx;
+    Register ofs   = rcx;
+    Register limit = rdi;
+    // Incoming arguments, addressed relative to rbp once enter() has run.
+    const Address  buf_param(rbp, 8 + 0);
+    const Address  state_param(rbp, 8 + 4);
+    const Address  ofs_param(rbp, 8 + 8);
+    const Address  limit_param(rbp, 8 + 12);
+    // XMM working set handed to fast_sha1.
+    const XMMRegister abcd = xmm0;
+    const XMMRegister e0 = xmm1;
+    const XMMRegister e1 = xmm2;
+    const XMMRegister msg0 = xmm3;
+
+    const XMMRegister msg1 = xmm4;
+    const XMMRegister msg2 = xmm5;
+    const XMMRegister msg3 = xmm6;
+    const XMMRegister shuf_mask = xmm7;
+    // Save 'limit' (rdi) ABOVE the scratch area: fast_sha1 stores to [rsp+0] and [rsp+16].
+    __ enter();
+    if (multi_block) {
+      __ push(limit);
+    }
+    __ subptr(rsp, 8 * wordSize);
+    __ movptr(buf, buf_param);
+    __ movptr(state, state_param);
+    if (multi_block) {
+      __ movptr(ofs, ofs_param);
+      __ movptr(limit, limit_param);
+    }
+    // rsp is passed as the base of the 32-byte scratch area reserved above.
+    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
+      buf, state, ofs, limit, rsp, multi_block);
+    // Release the scratch area first so that rsp points at the saved 'limit' again.
+    __ addptr(rsp, 8 * wordSize);
+    if (multi_block) {
+      __ pop(limit);
+    }
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
+  address generate_pshuffle_byte_flip_mask() {
+    __ align(64);
+    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
+    address start = __ pc();
+    __ emit_data(0x00010203, relocInfo::none, 0);
+    __ emit_data(0x04050607, relocInfo::none, 0);
+    __ emit_data(0x08090a0b, relocInfo::none, 0);
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
+    return start;
+  }
+
+  // ofs and limit are used for multi-block byte array.
+  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+ address generate_sha256_implCompress(bool multi_block, const char *name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    Register buf = rbx;
+    Register state = rsi;
+    Register ofs = rdx;
+    Register limit = rcx;
+
+    const Address  buf_param(rbp, 8 + 0);
+    const Address  state_param(rbp, 8 + 4);
+    const Address  ofs_param(rbp, 8 + 8);
+    const Address  limit_param(rbp, 8 + 12);
+
+    const XMMRegister msg = xmm0;
+    const XMMRegister state0 = xmm1;
+    const XMMRegister state1 = xmm2;
+    const XMMRegister msgtmp0 = xmm3;
+
+    const XMMRegister msgtmp1 = xmm4;
+    const XMMRegister msgtmp2 = xmm5;
+    const XMMRegister msgtmp3 = xmm6;
+    const XMMRegister msgtmp4 = xmm7;
+
+    __ enter();
+    __ subptr(rsp, 8 * wordSize);
+    handleSOERegisters(true /*saving*/);
+    __ movptr(buf, buf_param);
+    __ movptr(state, state_param);
+    if (multi_block) {
+     __ movptr(ofs, ofs_param);
+     __ movptr(limit, limit_param);
+    }
+
+    __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
+      buf, state, ofs, limit, rsp, multi_block);
+
+    handleSOERegisters(false);
+    __ addptr(rsp, 8 * wordSize);
+    __ leave();
+    __ ret(0);
+    return start;
+  }
 
   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
@@ -3772,6 +3902,19 @@
       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
     }
 
+    if (UseSHA1Intrinsics) {
+      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
+      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
+      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
+      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
+    }
+    if (UseSHA256Intrinsics) {
+      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
+      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
+      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
+      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
+    }
+
     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -275,7 +275,7 @@
     }
     if (VM_Version::supports_evex()) {
       for (int i = xmm_save_first; i <= last_reg; i++) {
-        __ vextractf32x4h(xmm_save(i), as_XMMRegister(i), 0);
+        __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
       }
     } else {
       for (int i = xmm_save_first; i <= last_reg; i++) {
@@ -393,7 +393,7 @@
     // emit the restores for xmm regs
     if (VM_Version::supports_evex()) {
       for (int i = xmm_save_first; i <= last_reg; i++) {
-        __ vinsertf32x4h(as_XMMRegister(i), xmm_save(i), 0);
+        __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
       }
     } else {
       for (int i = xmm_save_first; i <= last_reg; i++) {
@@ -3695,6 +3695,133 @@
     return start;
   }
 
+  address generate_upper_word_mask() {
+    __ align(64);
+    StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
+    address start = __ pc();
+    __ emit_data64(0x0000000000000000, relocInfo::none);
+    __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
+    return start;
+  }
+
+  address generate_shuffle_byte_flip_mask() {
+    __ align(64);
+    StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
+    __ emit_data64(0x0001020304050607, relocInfo::none);
+    return start;
+  }
+
+  // ofs and limit are used for multi-block byte array.
+  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+  address generate_sha1_implCompress(bool multi_block, const char *name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    Register buf = c_rarg0;
+    Register state = c_rarg1;
+    Register ofs = c_rarg2;
+    Register limit = c_rarg3;
+
+    const XMMRegister abcd = xmm0;
+    const XMMRegister e0 = xmm1;
+    const XMMRegister e1 = xmm2;
+    const XMMRegister msg0 = xmm3;
+
+    const XMMRegister msg1 = xmm4;
+    const XMMRegister msg2 = xmm5;
+    const XMMRegister msg3 = xmm6;
+    const XMMRegister shuf_mask = xmm7;
+
+    __ enter();
+
+#ifdef _WIN64
+    // save the xmm registers which must be preserved 6-7
+    __ subptr(rsp, 4 * wordSize);
+    __ movdqu(Address(rsp, 0), xmm6);
+    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
+#endif
+
+    __ subptr(rsp, 4 * wordSize);
+
+    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
+      buf, state, ofs, limit, rsp, multi_block);
+
+    __ addptr(rsp, 4 * wordSize);
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    __ movdqu(xmm6, Address(rsp, 0));
+    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
+    __ addptr(rsp, 4 * wordSize);
+#endif
+
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
+  address generate_pshuffle_byte_flip_mask() {
+    __ align(64);
+    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
+    address start = __ pc();
+    __ emit_data64(0x0405060700010203, relocInfo::none);
+    __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
+    return start;
+  }
+
+// ofs and limit are used for multi-block byte array.
+// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+  address generate_sha256_implCompress(bool multi_block, const char *name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    Register buf = c_rarg0;
+    Register state = c_rarg1;
+    Register ofs = c_rarg2;
+    Register limit = c_rarg3;
+
+    const XMMRegister msg = xmm0;
+    const XMMRegister state0 = xmm1;
+    const XMMRegister state1 = xmm2;
+    const XMMRegister msgtmp0 = xmm3;
+
+    const XMMRegister msgtmp1 = xmm4;
+    const XMMRegister msgtmp2 = xmm5;
+    const XMMRegister msgtmp3 = xmm6;
+    const XMMRegister msgtmp4 = xmm7;
+
+    const XMMRegister shuf_mask = xmm8;
+
+    __ enter();
+#ifdef _WIN64
+    // save the xmm registers which must be preserved 6-8
+    __ subptr(rsp, 6 * wordSize);
+    __ movdqu(Address(rsp, 0), xmm6);
+    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
+    __ movdqu(Address(rsp, 4 * wordSize), xmm8);
+#endif
+
+    __ subptr(rsp, 4 * wordSize);
+
+    __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
+      buf, state, ofs, limit, rsp, multi_block, shuf_mask);
+
+    __ addptr(rsp, 4 * wordSize);
+#ifdef _WIN64
+    // restore xmm regs belonging to calling function
+    __ movdqu(xmm6, Address(rsp, 0));
+    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
+    __ movdqu(xmm8, Address(rsp, 4 * wordSize));
+    __ addptr(rsp, 6 * wordSize);
+#endif
+    __ leave();
+    __ ret(0);
+    return start;
+  }
+
   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
   // to hide instruction latency
   //
@@ -4974,6 +5101,19 @@
       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
     }
 
+    if (UseSHA1Intrinsics) {
+      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
+      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
+      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
+      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
+    }
+    if (UseSHA256Intrinsics) {
+      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
+      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
+      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
+      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
+    }
+
     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
--- a/src/cpu/x86/vm/stubRoutines_x86.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/stubRoutines_x86.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -29,6 +29,12 @@
 #include "runtime/thread.inline.hpp"
 #include "crc32c.h"
 
+#ifdef _MSC_VER  // ALIGNED_(x): request x-byte alignment for the declaration that follows
+#define ALIGNED_(x) __declspec(align(x))
+#else  // not MSVC: use the GCC-style attribute spelling
+#define ALIGNED_(x) __attribute__ ((aligned(x)))
+#endif  // _MSC_VER
+
 // Implementation of the platform-specific part of StubRoutines - for
 // a description of how to extend it, see the stubRoutines.hpp file.
 
@@ -37,6 +43,10 @@
 address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
 address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
 address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
+address StubRoutines::x86::_upper_word_mask_addr = NULL;
+address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
+address StubRoutines::x86::_k256_adr = NULL;
+address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL;
 
 uint64_t StubRoutines::x86::_crc_by128_masks[] =
 {
@@ -236,3 +246,23 @@
     _crc32c_table = (juint*)pclmulqdq_table;
   }
 }
+
+ALIGNED_(64) juint StubRoutines::x86::_k256[] =  // 64 SHA-256 round constants, 64-byte aligned
+{
+    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
+    0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
+    0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
+    0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
+    0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
+    0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
+    0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
+    0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
+    0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
+    0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
+    0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
+    0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
+    0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
+    0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
+    0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
+    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
+};
--- a/src/cpu/x86/vm/stubRoutines_x86.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/stubRoutines_x86.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -46,6 +46,17 @@
   static address _ghash_long_swap_mask_addr;
   static address _ghash_byte_swap_mask_addr;
 
+  // upper word mask for sha1
+  static address _upper_word_mask_addr;
+  // byte flip mask for sha1
+  static address _shuffle_byte_flip_mask_addr;
+
+  // k256 round-constant table for sha256
+  static juint _k256[];
+  static address _k256_adr;
+  // byte flip mask for sha256
+  static address _pshuffle_byte_flip_mask_addr;
+
  public:
   static address verify_mxcsr_entry()    { return _verify_mxcsr_entry; }
   static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
@@ -53,5 +64,9 @@
   static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
   static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
   static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
+  static address upper_word_mask_addr() { return _upper_word_mask_addr; }
+  static address shuffle_byte_flip_mask_addr() { return _shuffle_byte_flip_mask_addr; }
+  static address k256_addr()      { return _k256_adr; }
+  static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
   static void generate_CRC32C_table(bool is_pclmulqdq_supported);
 #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- a/src/cpu/x86/vm/vmStructs_x86.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/vmStructs_x86.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -68,10 +68,11 @@
   declare_constant(VM_Version::CPU_AVX512DQ)                        \
   declare_constant(VM_Version::CPU_AVX512PF)                        \
   declare_constant(VM_Version::CPU_AVX512ER)                        \
-  declare_constant(VM_Version::CPU_AVX512CD)                        \
-  declare_constant(VM_Version::CPU_AVX512BW)
+  declare_constant(VM_Version::CPU_AVX512CD)
 
 #define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
-  declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL)
+  declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
+  declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
+  declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
 
 #endif // CPU_X86_VM_VMSTRUCTS_X86_HPP
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -385,7 +385,7 @@
 
     __ movdl(xmm0, rcx);
     __ pshufd(xmm0, xmm0, 0x00);
-    __ vinsertf128h(xmm0, xmm0, xmm0);
+    __ vinsertf128_high(xmm0, xmm0);
     __ vmovdqu(xmm7, xmm0);
 #ifdef _LP64
     __ vmovdqu(xmm8, xmm0);
@@ -577,7 +577,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -608,7 +608,8 @@
                (supports_bmi1() ? ", bmi1" : ""),
                (supports_bmi2() ? ", bmi2" : ""),
                (supports_adx() ? ", adx" : ""),
-               (supports_evex() ? ", evex" : ""));
+               (supports_evex() ? ", evex" : ""),
+               (supports_sha() ? ", sha" : ""));
   _features_string = os::strdup(buf);
 
   // UseSSE is set to the smaller of what hardware supports and what
@@ -730,17 +731,29 @@
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
-  if (UseSHA) {
+  if (supports_sha()) {
+    if (FLAG_IS_DEFAULT(UseSHA)) {
+      UseSHA = true;
+    }
+  } else if (UseSHA) {
     warning("SHA instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseSHA, false);
   }
 
-  if (UseSHA1Intrinsics) {
+  if (UseSHA) {
+    if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) {
+      FLAG_SET_DEFAULT(UseSHA1Intrinsics, true);
+    }
+  } else if (UseSHA1Intrinsics) {
     warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
     FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
   }
 
-  if (UseSHA256Intrinsics) {
+  if (UseSHA) {
+    if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
+      FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
+    }
+  } else if (UseSHA256Intrinsics) {
     warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU.");
     FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
   }
@@ -750,6 +763,10 @@
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
+  if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) {
+    FLAG_SET_DEFAULT(UseSHA, false);
+  }
+
   if (UseAdler32Intrinsics) {
     warning("Adler32Intrinsics not available on this CPU.");
     FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
--- a/src/cpu/x86/vm/vm_version_x86.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/vm_version_x86.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -221,7 +221,7 @@
                avx512pf : 1,
                avx512er : 1,
                avx512cd : 1,
-                        : 1,
+                    sha : 1,
                avx512bw : 1,
                avx512vl : 1;
     } bits;
@@ -282,11 +282,13 @@
     CPU_AVX512DQ = (1 << 27),
     CPU_AVX512PF = (1 << 28),
     CPU_AVX512ER = (1 << 29),
-    CPU_AVX512CD = (1 << 30),
-    CPU_AVX512BW = (1 << 31)
+    CPU_AVX512CD = (1 << 30)
+    // Keeping sign bit 31 unassigned.
   };
 
-#define CPU_AVX512VL UCONST64(0x100000000) // EVEX instructions with smaller vector length : enums are limited to 32bit
+#define CPU_AVX512BW ((uint64_t)UCONST64(0x100000000)) // enums are limited to 31 bit
+#define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
+#define CPU_SHA ((uint64_t)UCONST64(0x400000000))      // SHA instructions
 
   enum Extended_Family {
     // AMD
@@ -516,6 +518,8 @@
          result |= CPU_ADX;
       if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
         result |= CPU_BMI2;
+      if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
+        result |= CPU_SHA;
       if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
         result |= CPU_LZCNT;
       // for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
@@ -721,6 +725,7 @@
   static bool supports_avx512nobw() { return (supports_evex() && !supports_avx512bw()); }
   static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
   static bool supports_avxonly()    { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
+  static bool supports_sha()        { return (_features & CPU_SHA) != 0; }
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/src/cpu/x86/vm/x86.ad	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/x86.ad	Thu Mar 17 17:03:20 2016 +0000
@@ -3179,13 +3179,13 @@
             "punpcklbw $dst,$dst\n\t"
             "pshuflw $dst,$dst,0x00\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+            "vinserti128_high $dst,$dst\t! replicate32B" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3196,12 +3196,12 @@
   format %{ "punpcklbw $dst,$mem\n\t"
             "pshuflw $dst,$dst,0x00\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+            "vinserti128_high $dst,$dst\t! replicate32B" %}
   ins_encode %{
     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3223,11 +3223,11 @@
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
+            "vinserti128_high $dst,$dst\t! lreplicate32B($con)" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3298,12 +3298,12 @@
   format %{ "movd    $dst,$src\n\t"
             "pshuflw $dst,$dst,0x00\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+            "vinserti128_high $dst,$dst\t! replicate16S" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3313,11 +3313,11 @@
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "pshuflw $dst,$mem,0x00\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+            "vinserti128_high $dst,$dst\t! replicate16S" %}
   ins_encode %{
     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3327,11 +3327,11 @@
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
+            "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3363,11 +3363,11 @@
   match(Set dst (ReplicateI src));
   format %{ "movd    $dst,$src\n\t"
             "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+            "vinserti128_high $dst,$dst\t! replicate8I" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3376,10 +3376,10 @@
   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI (LoadI mem)));
   format %{ "pshufd  $dst,$mem,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+            "vinserti128_high $dst,$dst\t! replicate8I" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3401,11 +3401,11 @@
   match(Set dst (ReplicateI con));
   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst" %}
+            "vinserti128_high $dst,$dst" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3430,11 +3430,11 @@
   match(Set dst (ReplicateL src));
   format %{ "movdq   $dst,$src\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+            "vinserti128_high $dst,$dst\t! replicate4L" %}
   ins_encode %{
     __ movdq($dst$$XMMRegister, $src$$Register);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3447,13 +3447,13 @@
             "movdl   $tmp,$src.hi\n\t"
             "punpckldq $dst,$tmp\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+            "vinserti128_high $dst,$dst\t! replicate4L" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3464,11 +3464,11 @@
   match(Set dst (ReplicateL con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
+            "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $constantaddress($con));
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3478,11 +3478,11 @@
   match(Set dst (ReplicateL (LoadL mem)));
   format %{ "movq    $dst,$mem\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+            "vinserti128_high $dst,$dst\t! replicate4L" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $mem$$Address);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3511,10 +3511,10 @@
   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF src));
   format %{ "pshufd  $dst,$src,0x00\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+            "vinsertf128_high $dst,$dst\t! replicate8F" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3523,10 +3523,10 @@
   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF (LoadF mem)));
   format %{ "pshufd  $dst,$mem,0x00\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+            "vinsertf128_high $dst,$dst\t! replicate8F" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3576,10 +3576,10 @@
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD src));
   format %{ "pshufd  $dst,$src,0x44\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+            "vinsertf128_high $dst,$dst\t! replicate4D" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3588,10 +3588,10 @@
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD (LoadD mem)));
   format %{ "pshufd  $dst,$mem,0x44\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+            "vinsertf128_high $dst,$dst\t! replicate4D" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4791,7 +4791,7 @@
   effect(TEMP tmp, TEMP tmp2);
   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
             "vphaddd  $tmp,$tmp,$tmp2\n\t"
-            "vextracti128  $tmp2,$tmp\n\t"
+            "vextracti128_high  $tmp2,$tmp\n\t"
             "vpaddd   $tmp,$tmp,$tmp2\n\t"
             "movd     $tmp2,$src1\n\t"
             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
@@ -4800,7 +4800,7 @@
     int vector_len = 1;
     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
-    __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
     __ movdl($tmp2$$XMMRegister, $src1$$Register);
     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
@@ -4813,7 +4813,7 @@
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti128  $tmp,$src2\n\t"
+  format %{ "vextracti128_high  $tmp,$src2\n\t"
             "vpaddd  $tmp,$tmp,$src2\n\t"
             "pshufd  $tmp2,$tmp,0xE\n\t"
             "vpaddd  $tmp,$tmp,$tmp2\n\t"
@@ -4824,7 +4824,7 @@
             "movd    $dst,$tmp2\t! add reduction8I" %}
   ins_encode %{
     int vector_len = 0;
-    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
@@ -4841,9 +4841,9 @@
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
-  format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
+  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
             "vpaddd  $tmp3,$tmp3,$src2\n\t"
-            "vextracti128   $tmp,$tmp3\n\t"
+            "vextracti128_high  $tmp,$tmp3\n\t"
             "vpaddd  $tmp,$tmp,$tmp3\n\t"
             "pshufd  $tmp2,$tmp,0xE\n\t"
             "vpaddd  $tmp,$tmp,$tmp2\n\t"
@@ -4853,9 +4853,9 @@
             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
             "movd    $dst,$tmp2\t! mul reduction16I" %}
   ins_encode %{
-    __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
-    __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
@@ -4892,7 +4892,7 @@
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti128  $tmp,$src2\n\t"
+  format %{ "vextracti128_high  $tmp,$src2\n\t"
             "vpaddq  $tmp2,$tmp,$src2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
@@ -4900,7 +4900,7 @@
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
             "movdq   $dst,$tmp2\t! add reduction4L" %}
   ins_encode %{
-    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
@@ -4915,9 +4915,9 @@
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
+  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
             "vpaddq  $tmp2,$tmp2,$src2\n\t"
-            "vextracti128   $tmp,$tmp2\n\t"
+            "vextracti128_high  $tmp,$tmp2\n\t"
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
@@ -4925,9 +4925,9 @@
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
             "movdq   $dst,$tmp2\t! add reduction8L" %}
   ins_encode %{
-    __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
-    __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
@@ -5026,7 +5026,7 @@
             "vaddss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$src2,0x03\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
-            "vextractf128  $tmp2,$src2\n\t"
+            "vextractf128_high  $tmp2,$src2\n\t"
             "vaddss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
@@ -5042,7 +5042,7 @@
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5065,7 +5065,7 @@
             "vaddss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$src2,0x03\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vaddss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
@@ -5073,7 +5073,7 @@
             "vaddss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0x03\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x2\n\t"
+            "vextractf32x4  $tmp2,$src2,0x2\n\t"
             "vaddss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
@@ -5081,7 +5081,7 @@
             "vaddss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0x03\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x3\n\t"
+            "vextractf32x4  $tmp2,$src2,0x3\n\t"
             "vaddss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
@@ -5097,7 +5097,7 @@
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5105,7 +5105,7 @@
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5113,7 +5113,7 @@
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5162,7 +5162,7 @@
   format %{ "vaddsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4h  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
@@ -5170,7 +5170,7 @@
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5185,15 +5185,15 @@
   format %{ "vaddsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x2\n\t"
+            "vextractf32x4  $tmp2,$src2,0x2\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x3\n\t"
+            "vextractf32x4  $tmp2,$src2,0x3\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
@@ -5201,15 +5201,15 @@
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5307,7 +5307,7 @@
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti128  $tmp,$src2\n\t"
+  format %{ "vextracti128_high  $tmp,$src2\n\t"
             "vpmulld  $tmp,$tmp,$src2\n\t"
             "pshufd   $tmp2,$tmp,0xE\n\t"
             "vpmulld  $tmp,$tmp,$tmp2\n\t"
@@ -5318,7 +5318,7 @@
             "movd     $dst,$tmp2\t! mul reduction8I" %}
   ins_encode %{
     int vector_len = 0;
-    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
@@ -5335,9 +5335,9 @@
   predicate(UseAVX > 2);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
-  format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
+  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
             "vpmulld  $tmp3,$tmp3,$src2\n\t"
-            "vextracti128   $tmp,$tmp3\n\t"
+            "vextracti128_high  $tmp,$tmp3\n\t"
             "vpmulld  $tmp,$tmp,$src2\n\t"
             "pshufd   $tmp2,$tmp,0xE\n\t"
             "vpmulld  $tmp,$tmp,$tmp2\n\t"
@@ -5347,9 +5347,9 @@
             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
             "movd     $dst,$tmp2\t! mul reduction16I" %}
   ins_encode %{
-    __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
-    __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
@@ -5386,7 +5386,7 @@
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti128  $tmp,$src2\n\t"
+  format %{ "vextracti128_high  $tmp,$src2\n\t"
             "vpmullq  $tmp2,$tmp,$src2\n\t"
             "pshufd   $tmp,$tmp2,0xE\n\t"
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
@@ -5394,7 +5394,7 @@
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
             "movdq    $dst,$tmp2\t! mul reduction4L" %}
   ins_encode %{
-    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
@@ -5409,9 +5409,9 @@
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
+  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
             "vpmullq  $tmp2,$tmp2,$src2\n\t"
-            "vextracti128   $tmp,$tmp2\n\t"
+            "vextracti128_high  $tmp,$tmp2\n\t"
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
             "pshufd   $tmp,$tmp2,0xE\n\t"
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
@@ -5419,9 +5419,9 @@
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
             "movdq    $dst,$tmp2\t! mul reduction8L" %}
   ins_encode %{
-    __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
-    __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
@@ -5520,7 +5520,7 @@
             "vmulss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$src2,0x03\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
-            "vextractf128  $tmp2,$src2\n\t"
+            "vextractf128_high  $tmp2,$src2\n\t"
             "vmulss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
@@ -5536,7 +5536,7 @@
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5559,7 +5559,7 @@
             "vmulss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$src2,0x03\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vmulss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
@@ -5567,7 +5567,7 @@
             "vmulss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0x03\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x2\n\t"
+            "vextractf32x4  $tmp2,$src2,0x2\n\t"
             "vmulss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
@@ -5575,7 +5575,7 @@
             "vmulss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0x03\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x3\n\t"
+            "vextractf32x4  $tmp2,$src2,0x3\n\t"
             "vmulss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
@@ -5591,7 +5591,7 @@
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5599,7 +5599,7 @@
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5607,7 +5607,7 @@
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5656,7 +5656,7 @@
   format %{ "vmulsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\n\t"
-            "vextractf128  $tmp2,$src2\n\t"
+            "vextractf128_high  $tmp2,$src2\n\t"
             "vmulsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
@@ -5664,7 +5664,7 @@
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5679,15 +5679,15 @@
   format %{ "vmulsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vmulsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x2\n\t"
+            "vextractf32x4  $tmp2,$src2,0x2\n\t"
             "vmulsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x3\n\t"
+            "vextractf32x4  $tmp2,$src2,0x3\n\t"
             "vmulsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
@@ -5695,15 +5695,15 @@
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
--- a/src/cpu/x86/vm/x86_32.ad	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/x86_32.ad	Thu Mar 17 17:03:20 2016 +0000
@@ -1420,9 +1420,6 @@
 // The ecx parameter to rep stos for the ClearArray node is in dwords.
 const bool Matcher::init_array_count_is_in_bytes = false;
 
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
-
 // Needs 2 CMOV's for longs.
 const int Matcher::long_cmove_cost() { return 1; }
 
@@ -11369,27 +11366,54 @@
 // =======================================================================
 // fast clearing of an array
 instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
-  predicate(!UseFastStosb);
+  predicate(!((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
-  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
-            "SHL    ECX,1\t# Convert doublewords to words\n\t"
-            "REP STOS\t# store EAX into [EDI++] while ECX--" %}
-  ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
-  predicate(UseFastStosb);
+
+  format %{ $$template
+    $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
+    $$emit$$"CMP    InitArrayShortSize,rcx\n\t"
+    $$emit$$"JG     LARGE\n\t"
+    $$emit$$"SHL    ECX, 1\n\t"
+    $$emit$$"DEC    ECX\n\t"
+    $$emit$$"JS     DONE\t# Zero length\n\t"
+    $$emit$$"MOV    EAX,(EDI,ECX,4)\t# LOOP\n\t"
+    $$emit$$"DEC    ECX\n\t"
+    $$emit$$"JGE    LOOP\n\t"
+    $$emit$$"JMP    DONE\n\t"
+    $$emit$$"# LARGE:\n\t"
+    if (UseFastStosb) {
+       $$emit$$"SHL    ECX,3\t# Convert doublewords to bytes\n\t"
+       $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+    } else {
+       $$emit$$"SHL    ECX,1\t# Convert doublewords to words\n\t"
+       $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
+    }
+    $$emit$$"# DONE"
+  %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+  predicate(((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
-  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
-            "SHL    ECX,3\t# Convert doublewords to bytes\n\t"
-            "REP STOSB\t# store EAX into [EDI++] while ECX--" %}
-  ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  format %{ $$template
+    $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
+    if (UseFastStosb) {
+       $$emit$$"SHL    ECX,3\t# Convert doublewords to bytes\n\t"
+       $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+    } else {
+       $$emit$$"SHL    ECX,1\t# Convert doublewords to words\n\t"
+       $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
+    }
+    $$emit$$"# DONE"
+  %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
   %}
   ins_pipe( pipe_slow );
 %}
--- a/src/cpu/x86/vm/x86_64.ad	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/x86/vm/x86_64.ad	Thu Mar 17 17:03:20 2016 +0000
@@ -1637,9 +1637,6 @@
 // The ecx parameter to rep stosq for the ClearArray node is in words.
 const bool Matcher::init_array_count_is_in_bytes = false;
 
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
-
 // No additional cost for CMOVL.
 const int Matcher::long_cmove_cost() { return 0; }
 
@@ -10460,31 +10457,55 @@
 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
                   rFlagsReg cr)
 %{
-  predicate(!UseFastStosb);
+  predicate(!((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
 
-  format %{ "xorq    rax, rax\t# ClearArray:\n\t"
-            "rep     stosq\t# Store rax to *rdi++ while rcx--" %}
-  ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  format %{ $$template
+    $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
+    $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
+    $$emit$$"jg      LARGE\n\t"
+    $$emit$$"dec     rcx\n\t"
+    $$emit$$"js      DONE\t# Zero length\n\t"
+    $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
+    $$emit$$"dec     rcx\n\t"
+    $$emit$$"jge     LOOP\n\t"
+    $$emit$$"jmp     DONE\n\t"
+    $$emit$$"# LARGE:\n\t"
+    if (UseFastStosb) {
+       $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
+       $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
+    } else {
+       $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
+    }
+    $$emit$$"# DONE"
+  %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
-                        rFlagsReg cr)
-%{
-  predicate(UseFastStosb);
+instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
+                  rFlagsReg cr)
+%{
+  predicate(((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
-  format %{ "xorq    rax, rax\t# ClearArray:\n\t"
-            "shlq    rcx,3\t# Convert doublewords to bytes\n\t"
-            "rep     stosb\t# Store rax to *rdi++ while rcx--" %}
-  ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
-  %}
-  ins_pipe( pipe_slow );
+
+  format %{ $$template
+    $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
+    if (UseFastStosb) {
+       $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
+       $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
+    } else {
+       $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
+    }
+  %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+  %}
+  ins_pipe(pipe_slow);
 %}
 
 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
--- a/src/cpu/zero/vm/cppInterpreter_zero.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/zero/vm/cppInterpreter_zero.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -773,7 +773,7 @@
 }
 
 BasicType CppInterpreter::result_type_of(Method* method) {
-  BasicType t;
+  BasicType t = T_ILLEGAL; // silence compiler warnings
   switch (method->result_index()) {
     case 0 : t = T_BOOLEAN; break;
     case 1 : t = T_CHAR;    break;
--- a/src/cpu/zero/vm/interpreterRT_zero.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/cpu/zero/vm/interpreterRT_zero.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2007, 2008, 2010 Red Hat, Inc.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -62,7 +62,7 @@
 }
 
 void InterpreterRuntime::SignatureHandlerGeneratorBase::push(BasicType type) {
-  ffi_type *ftype;
+  ffi_type *ftype = NULL;
   switch (type) {
   case T_VOID:
     ftype = &ffi_type_void;
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/development/Server16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/development/Server24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/About16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/About24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Delete16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Delete24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Find16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Help16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Help24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/History16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/History24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Information16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Information24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/New16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/New24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Open16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Open24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Save24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/SaveAs16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/SaveAs24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/Zoom16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/ZoomIn16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/general/ZoomIn24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/navigation/Down16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/navigation/Up16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignCenter16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignCenter24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignLeft16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignLeft24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignRight16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/images/toolbarButtonGraphics/text/AlignRight24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/development/Server16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/development/Server24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/About16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/About24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Delete16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Delete24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Find16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Help16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Help24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/History16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/History24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Information16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Information24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/New16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/New24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Open16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Open24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Save24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/SaveAs16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/SaveAs24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/Zoom16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/ZoomIn16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/general/ZoomIn24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/navigation/Down16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/navigation/Up16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignCenter16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignCenter24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignLeft16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignLeft24.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignRight16.gif has changed
Binary file src/jdk.hotspot.agent/share/classes/toolbarButtonGraphics/text/AlignRight24.gif has changed
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java	Thu Mar 17 17:03:20 2016 +0000
@@ -22,6 +22,7 @@
  */
 package jdk.vm.ci.amd64;
 
+import static jdk.vm.ci.code.MemoryBarriers.LOAD_LOAD;
 import static jdk.vm.ci.code.MemoryBarriers.LOAD_STORE;
 import static jdk.vm.ci.code.MemoryBarriers.STORE_STORE;
 import static jdk.vm.ci.code.Register.SPECIAL;
@@ -202,7 +203,8 @@
         AVX512ER,
         AVX512CD,
         AVX512BW,
-        AVX512VL
+        AVX512VL,
+        SHA
     }
 
     private final EnumSet<CPUFeature> features;
@@ -220,7 +222,7 @@
     private final AMD64Kind largestKind;
 
     public AMD64(EnumSet<CPUFeature> features, EnumSet<Flag> flags) {
-        super("AMD64", AMD64Kind.QWORD, ByteOrder.LITTLE_ENDIAN, true, allRegisters, LOAD_STORE | STORE_STORE, 1, 8);
+        super("AMD64", AMD64Kind.QWORD, ByteOrder.LITTLE_ENDIAN, true, allRegisters, LOAD_LOAD | LOAD_STORE | STORE_STORE, 1, 8);
         this.features = features;
         this.flags = flags;
         assert features.contains(CPUFeature.SSE2) : "minimum config for x64";
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java	Thu Mar 17 17:03:20 2016 +0000
@@ -122,6 +122,9 @@
         if ((config.vmVersionFeatures & config.amd64AVX512VL) != 0) {
             features.add(AMD64.CPUFeature.AVX512VL);
         }
+        if ((config.vmVersionFeatures & config.amd64SHA) != 0) {
+            features.add(AMD64.CPUFeature.SHA);
+        }
         return features;
     }
 
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotConstantReflectionProvider.java	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotConstantReflectionProvider.java	Thu Mar 17 17:03:20 2016 +0000
@@ -339,7 +339,7 @@
 
     public JavaConstant readStableFieldValue(ResolvedJavaField field, JavaConstant receiver, boolean isDefaultStable) {
         JavaConstant fieldValue = readNonStableFieldValue(field, receiver);
-        if (fieldValue.isNonNull()) {
+        if (fieldValue != null && fieldValue.isNonNull()) {
             JavaType declaredType = field.getType();
             if (declaredType.getComponentType() != null) {
                 int stableDimension = getArrayDimension(declaredType);
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotJVMCICompilerConfig.java	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotJVMCICompilerConfig.java	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,6 +25,7 @@
 import jdk.vm.ci.code.CompilationRequest;
 import jdk.vm.ci.code.CompilationRequestResult;
 import jdk.vm.ci.common.JVMCIError;
+import jdk.vm.ci.hotspot.HotSpotJVMCIRuntime.Option;
 import jdk.vm.ci.runtime.JVMCICompiler;
 import jdk.vm.ci.runtime.JVMCICompilerFactory;
 import jdk.vm.ci.runtime.JVMCIRuntime;
@@ -47,29 +48,33 @@
         }
     }
 
+    /**
+     * Factory of the selected system compiler.
+     */
     private static JVMCICompilerFactory compilerFactory;
 
     /**
-     * Selects the system compiler.
+     * Gets the selected system compiler factory.
      *
-     * Called from VM. This method has an object return type to allow it to be called with a VM
-     * utility function used to call other static initialization methods.
+     * @return the selected system compiler factory
      */
-    static Boolean selectCompiler(String compilerName) {
-        assert compilerFactory == null;
-        for (JVMCICompilerFactory factory : Services.load(JVMCICompilerFactory.class)) {
-            if (factory.getCompilerName().equals(compilerName)) {
-                compilerFactory = factory;
-                return Boolean.TRUE;
-            }
-        }
-
-        throw new JVMCIError("JVMCI compiler '%s' not found", compilerName);
-    }
-
     static JVMCICompilerFactory getCompilerFactory() {
         if (compilerFactory == null) {
-            compilerFactory = new DummyCompilerFactory();
+            JVMCICompilerFactory factory = null;
+            String compilerName = Option.Compiler.getString();
+            if (compilerName != null) {
+                for (JVMCICompilerFactory f : Services.load(JVMCICompilerFactory.class)) {
+                    if (f.getCompilerName().equals(compilerName)) {
+                        factory = f;
+                    }
+                }
+                if (factory == null) {
+                    throw new JVMCIError("JVMCI compiler '%s' not found", compilerName);
+                }
+            } else {
+                factory = new DummyCompilerFactory();
+            }
+            compilerFactory = factory;
         }
         return compilerFactory;
     }
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotJVMCIRuntime.java	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotJVMCIRuntime.java	Thu Mar 17 17:03:20 2016 +0000
@@ -91,6 +91,7 @@
      * A list of all supported JVMCI options.
      */
     public enum Option {
+        Compiler(String.class, null, "Selects the system compiler."),
         ImplicitStableValues(boolean.class, true, "Mark well-known stable fields as such."),
         // Note: The following one is not used (see InitTimer.ENABLED).
         InitTimer(boolean.class, false, "Specifies if initialization timing is enabled."),
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotMetaAccessProvider.java	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotMetaAccessProvider.java	Thu Mar 17 17:03:20 2016 +0000
@@ -41,7 +41,6 @@
 import jdk.vm.ci.meta.DeoptimizationReason;
 import jdk.vm.ci.meta.JavaConstant;
 import jdk.vm.ci.meta.JavaKind;
-import jdk.vm.ci.meta.JavaType;
 import jdk.vm.ci.meta.MetaAccessProvider;
 import jdk.vm.ci.meta.ResolvedJavaField;
 import jdk.vm.ci.meta.ResolvedJavaMethod;
@@ -111,23 +110,26 @@
     }
 
     public ResolvedJavaField lookupJavaField(Field reflectionField) {
-        String name = reflectionField.getName();
         Class<?> fieldHolder = reflectionField.getDeclaringClass();
-        Class<?> fieldType = reflectionField.getType();
-        // java.lang.reflect.Field's modifiers should be enough here since VM internal modifier bits
-        // are not used (yet).
-        final int modifiers = reflectionField.getModifiers();
-        final long offset = Modifier.isStatic(modifiers) ? UNSAFE.staticFieldOffset(reflectionField) : UNSAFE.objectFieldOffset(reflectionField);
 
         HotSpotResolvedObjectType holder = fromObjectClass(fieldHolder);
-        JavaType type = runtime.fromClass(fieldType);
+        if (Modifier.isStatic(reflectionField.getModifiers())) { // static and instance fields are searched separately
+            final long offset = UNSAFE.staticFieldOffset(reflectionField);
+            for (ResolvedJavaField field : holder.getStaticFields()) {
+                if (offset == ((HotSpotResolvedJavaField) field).offset()) { // identify the holder's field by its offset
+                    return field;
+                }
+            }
+        } else {
+            final long offset = UNSAFE.objectFieldOffset(reflectionField);
+            for (ResolvedJavaField field : holder.getInstanceFields(false)) { // NOTE(review): 'false' presumably excludes superclass fields - confirm
+                if (offset == ((HotSpotResolvedJavaField) field).offset()) {
+                    return field;
+                }
+            }
+        }
 
-        if (offset != -1) {
-            HotSpotResolvedObjectType resolved = holder;
-            return resolved.createField(name, type, offset, modifiers);
-        } else {
-            throw new JVMCIError("unresolved field %s", reflectionField);
-        }
+        throw new JVMCIError("unresolved field %s", reflectionField); // no field of the holder has the reflected field's offset
     }
 
     private static int intMaskRight(int n) {
--- a/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotVMConfig.java	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotVMConfig.java	Thu Mar 17 17:03:20 2016 +0000
@@ -945,6 +945,7 @@
     @HotSpotVMConstant(name = "VM_Version::CPU_AVX512CD", archs = {"amd64"}) @Stable public long amd64AVX512CD;
     @HotSpotVMConstant(name = "VM_Version::CPU_AVX512BW", archs = {"amd64"}) @Stable public long amd64AVX512BW;
     @HotSpotVMConstant(name = "VM_Version::CPU_AVX512VL", archs = {"amd64"}) @Stable public long amd64AVX512VL;
+    @HotSpotVMConstant(name = "VM_Version::CPU_SHA", archs = {"amd64"}) @Stable public long amd64SHA;
 
     // SPARC specific values
     @HotSpotVMConstant(name = "VM_Version::vis3_instructions_m", archs = {"sparc"}) @Stable public int sparcVis3Instructions;
@@ -1141,7 +1142,7 @@
 
     @HotSpotVMField(name = "JavaFrameAnchor::_last_Java_sp", type = "intptr_t*", get = HotSpotVMField.Type.OFFSET) @Stable private int javaFrameAnchorLastJavaSpOffset;
     @HotSpotVMField(name = "JavaFrameAnchor::_last_Java_pc", type = "address", get = HotSpotVMField.Type.OFFSET) @Stable private int javaFrameAnchorLastJavaPcOffset;
-    @HotSpotVMField(name = "JavaFrameAnchor::_last_Java_fp", type = "intptr_t*", get = HotSpotVMField.Type.OFFSET, archs = {"amd64"}) @Stable private int javaFrameAnchorLastJavaFpOffset;
+    @HotSpotVMField(name = "JavaFrameAnchor::_last_Java_fp", type = "intptr_t*", get = HotSpotVMField.Type.OFFSET, archs = {"aarch64", "amd64"}) @Stable private int javaFrameAnchorLastJavaFpOffset;
     @HotSpotVMField(name = "JavaFrameAnchor::_flags", type = "int", get = HotSpotVMField.Type.OFFSET, archs = {"sparc"}) @Stable private int javaFrameAnchorFlagsOffset;
 
     public int threadLastJavaSpOffset() {
@@ -1152,11 +1153,8 @@
         return javaThreadAnchorOffset + javaFrameAnchorLastJavaPcOffset;
     }
 
-    /**
-     * This value is only valid on AMD64.
-     */
     public int threadLastJavaFpOffset() {
-        // TODO add an assert for AMD64
+        assert getHostArchitectureName().equals("aarch64") || getHostArchitectureName().equals("amd64");
         return javaThreadAnchorOffset + javaFrameAnchorLastJavaFpOffset;
     }
 
--- a/src/os/aix/vm/attachListener_aix.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/aix/vm/attachListener_aix.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
+ * Copyright (c) 2005, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -225,7 +225,7 @@
   // We must call bind with the actual socketaddr length. This is obligatory for AS400.
   int res = ::bind(listener, (struct sockaddr*)&addr, SUN_LEN(&addr));
   if (res == -1) {
-    RESTARTABLE(::close(listener), res);
+    ::close(listener);
     return -1;
   }
 
@@ -238,7 +238,7 @@
       }
   }
   if (res == -1) {
-    RESTARTABLE(::close(listener), res);
+    ::close(listener);
     ::unlink(initial_path);
     return -1;
   }
@@ -400,7 +400,7 @@
     AixAttachOperation* op = read_request(s);
     if (op == NULL) {
       int res;
-      RESTARTABLE(::close(s), res);
+      ::close(s);
       continue;
     } else {
       return op;
@@ -452,7 +452,7 @@
   }
 
   // done
-  RESTARTABLE(::close(this->socket()), rc);
+  ::close(this->socket());
 
   // were we externally suspended while we were waiting?
   thread->check_and_wait_while_suspended();
--- a/src/os/aix/vm/os_aix.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/aix/vm/os_aix.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
+ * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,6 +36,7 @@
 #include "compiler/compileBroker.hpp"
 #include "interpreter/interpreter.hpp"
 #include "jvm_aix.h"
+#include "logging/log.hpp"
 #include "libo4.hpp"
 #include "libperfstat_aix.hpp"
 #include "libodm_aix.hpp"
@@ -791,13 +792,8 @@
   const pthread_t pthread_id = ::pthread_self();
   const tid_t kernel_thread_id = ::thread_self();
 
-  trcVerbose("newborn Thread : pthread-id %u, ktid " UINT64_FORMAT
-    ", stack %p ... %p, stacksize 0x%IX (%IB)",
-    pthread_id, kernel_thread_id,
-    thread->stack_end(),
-    thread->stack_base(),
-    thread->stack_size(),
-    thread->stack_size());
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", kernel thread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) kernel_thread_id);
 
   // Normally, pthread stacks on AIX live in the data segment (are allocated with malloc()
   // by the pthread library). In rare cases, this may not be the case, e.g. when third-party
@@ -805,7 +801,7 @@
   // guard pages on those stacks, because the stacks may reside in memory which is not
   // protectable (shmated).
   if (thread->stack_base() > ::sbrk(0)) {
-    trcVerbose("Thread " UINT64_FORMAT ": stack not in data segment.", (uint64_t) pthread_id);
+    log_warning(os, thread)("Thread stack not in data segment.");
   }
 
   // Try to randomize the cache line index of hot stack frames.
@@ -839,8 +835,8 @@
   // Call one more level start routine.
   thread->run();
 
-  trcVerbose("Thread finished : pthread-id %u, ktid " UINT64_FORMAT ".",
-    pthread_id, kernel_thread_id);
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", kernel thread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) kernel_thread_id);
 
   return 0;
 }
@@ -908,20 +904,19 @@
   pthread_t tid;
   int ret = pthread_create(&tid, &attr, (void* (*)(void*)) java_start, thread);
 
+
+  char buf[64];
+  if (ret == 0) {
+    log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
+      (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+  } else {
+    log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
+      strerror(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+  }
+
   pthread_attr_destroy(&attr);
 
-  if (ret == 0) {
-    trcVerbose("Created New Thread : pthread-id %u", tid);
-  } else {
-    if (os::Aix::on_pase()) {
-      // QIBM_MULTI_THREADED=Y is needed when the launcher is started on iSeries
-      // using QSH. Otherwise pthread_create fails with errno=11.
-      trcVerbose("(Please make sure you set the environment variable "
-              "QIBM_MULTI_THREADED=Y before running this program.)");
-    }
-    if (PrintMiscellaneous && (Verbose || WizardMode)) {
-      perror("pthread_create()");
-    }
+  if (ret != 0) {
     // Need to clean up stuff we've allocated so far
     thread->set_osthread(NULL);
     delete osthread;
@@ -958,13 +953,6 @@
   const pthread_t pthread_id = ::pthread_self();
   const tid_t kernel_thread_id = ::thread_self();
 
-  trcVerbose("attaching Thread : pthread-id %u, ktid " UINT64_FORMAT ", stack %p ... %p, stacksize 0x%IX (%IB)",
-    pthread_id, kernel_thread_id,
-    thread->stack_end(),
-    thread->stack_base(),
-    thread->stack_size(),
-    thread->stack_size());
-
   // OSThread::thread_id is the pthread id.
   osthread->set_thread_id(pthread_id);
 
@@ -990,6 +978,9 @@
   // and save the caller's signal mask
   os::Aix::hotspot_sigmask(thread);
 
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", kernel thread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) kernel_thread_id);
+
   return true;
 }
 
--- a/src/os/aix/vm/perfMemory_aix.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/aix/vm/perfMemory_aix.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2013 SAP SE. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -121,7 +121,7 @@
       addr += result;
     }
 
-    RESTARTABLE(::close(fd), result);
+    result = ::close(fd);
     if (PrintMiscellaneous && Verbose) {
       if (result == OS_ERR) {
         warning("Could not close %s: %s\n", destfile, strerror(errno));
@@ -299,10 +299,13 @@
   bool create;
   int error;
   int fd;
+  int result;
 
   create = false;
 
-  if (lstat(path, &orig_st) != 0) {
+  RESTARTABLE(::lstat(path, &orig_st), result);
+
+  if (result == OS_ERR) {
     if (errno == ENOENT && (oflag & O_CREAT) != 0) {
       // File doesn't exist, but_we want to create it, add O_EXCL flag
       // to make sure no-one creates it (or a symlink) before us
@@ -316,7 +319,7 @@
       return OS_ERR;
     }
   } else {
-    // Lstat success, check if existing file is a link.
+    // lstat success, check if existing file is a link.
     if ((orig_st.st_mode & S_IFMT) == S_IFLNK)  {
       // File is a symlink.
       errno = ELOOP;
@@ -325,9 +328,9 @@
   }
 
   if (use_mode == true) {
-    fd = open(path, oflag, mode);
+    RESTARTABLE(::open(path, oflag, mode), fd);
   } else {
-    fd = open(path, oflag);
+    RESTARTABLE(::open(path, oflag), fd);
   }
 
   if (fd == OS_ERR) {
@@ -336,7 +339,8 @@
 
   // Can't do inode checks on before/after if we created the file.
   if (create == false) {
-    if (fstat(fd, &new_st) != 0) {
+    RESTARTABLE(::fstat(fd, &new_st), result);
+    if (result == OS_ERR) {
       // Keep errno from fstat, in case close also fails.
       error = errno;
       ::close(fd);
@@ -384,7 +388,7 @@
   RESTARTABLE(::open(dirname, O_RDONLY|O_NOFOLLOW), result);
 #else
   // workaround (jdk6 coding)
-  RESTARTABLE(::open_o_nofollow(dirname, O_RDONLY), result);
+  result = open_o_nofollow(dirname, O_RDONLY);
 #endif
 
   if (result == OS_ERR) {
@@ -888,7 +892,7 @@
   RESTARTABLE(::open(filename, O_RDWR|O_CREAT|O_NOFOLLOW, S_IREAD|S_IWRITE), result);
 #else
   // workaround function (jdk6 code)
-  RESTARTABLE(::open_o_nofollow(filename, O_RDWR|O_CREAT, S_IREAD|S_IWRITE), result);
+  result = open_o_nofollow(filename, O_RDWR|O_CREAT, S_IREAD|S_IWRITE);
 #endif
 
   if (result == OS_ERR) {
@@ -931,7 +935,7 @@
     if (PrintMiscellaneous && Verbose) {
       warning("could not set shared memory file size: %s\n", strerror(errno));
     }
-    RESTARTABLE(::close(fd), result);
+    ::close(fd);
     return -1;
   }
 
@@ -951,7 +955,7 @@
 #ifdef O_NOFOLLOW
   RESTARTABLE(::open(filename, oflags), result);
 #else
-  RESTARTABLE(::open_o_nofollow(filename, oflags), result);
+  result = open_o_nofollow(filename, oflags);
 #endif
 
   if (result == OS_ERR) {
@@ -1006,8 +1010,7 @@
 
   char* dirname = get_user_tmp_dir(user_name);
   char* filename = get_sharedmem_filename(dirname, vmid);
-
-  // Get the short filename.
+  // get the short filename.
   char* short_filename = strrchr(filename, '/');
   if (short_filename == NULL) {
     short_filename = filename;
@@ -1033,9 +1036,7 @@
 
   mapAddress = (char*)::mmap((char*)0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
 
-  // attempt to close the file - restart it if it was interrupted,
-  // but ignore other failures
-  RESTARTABLE(::close(fd), result);
+  result = ::close(fd);
   assert(result != OS_ERR, "could not close file");
 
   if (mapAddress == MAP_FAILED) {
@@ -1142,7 +1143,6 @@
   // constructs for the file and the shared memory mapping.
   if (mode == PerfMemory::PERF_MODE_RO) {
     mmap_prot = PROT_READ;
-
   // No O_NOFOLLOW defined at buildtime, and it is not documented for open.
 #ifdef O_NOFOLLOW
     file_flags = O_RDONLY | O_NOFOLLOW;
@@ -1205,21 +1205,28 @@
   FREE_C_HEAP_ARRAY(char, filename);
 
   // open the shared memory file for the give vmid
-  fd = open_sharedmem_file(rfilename, file_flags, CHECK);
-  assert(fd != OS_ERR, "unexpected value");
+  fd = open_sharedmem_file(rfilename, file_flags, THREAD);
+
+  if (fd == OS_ERR) {
+    return;
+  }
+
+  if (HAS_PENDING_EXCEPTION) {
+    ::close(fd);
+    return;
+  }
 
   if (*sizep == 0) {
     size = sharedmem_filesize(fd, CHECK);
-    assert(size != 0, "unexpected size");
   } else {
     size = *sizep;
   }
 
+  assert(size > 0, "unexpected size <= 0");
+
   mapAddress = (char*)::mmap((char*)0, size, mmap_prot, MAP_SHARED, fd, 0);
 
-  // attempt to close the file - restart if it gets interrupted,
-  // but ignore other failures
-  RESTARTABLE(::close(fd), result);
+  result = ::close(fd);
   assert(result != OS_ERR, "could not close file");
 
   if (mapAddress == MAP_FAILED) {
@@ -1230,7 +1237,7 @@
               "Could not map PerfMemory");
   }
 
-  // It does not go through os api, the operation has to record from here.
+  // it does not go through os api, the operation has to record from here.
   MemTracker::record_virtual_memory_reserve((address)mapAddress, size, CURRENT_PC, mtInternal);
 
   *addr = mapAddress;
@@ -1238,7 +1245,7 @@
 
   if (PerfTraceMemOps) {
     tty->print("mapped " SIZE_FORMAT " bytes for vmid %d at "
-               INTPTR_FORMAT "\n", size, vmid, (void*)mapAddress);
+               INTPTR_FORMAT "\n", size, vmid, p2i((void*)mapAddress));
   }
 }
 
--- a/src/os/bsd/vm/os_bsd.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/bsd/vm/os_bsd.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -32,6 +32,7 @@
 #include "compiler/disassembler.hpp"
 #include "interpreter/interpreter.hpp"
 #include "jvm_bsd.h"
+#include "logging/log.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/filemap.hpp"
 #include "mutex_bsd.inline.hpp"
@@ -681,6 +682,9 @@
 
   osthread->set_thread_id(os::Bsd::gettid());
 
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
 #ifdef __APPLE__
   uint64_t unique_thread_id = locate_unique_thread_id(osthread->thread_id());
   guarantee(unique_thread_id != 0, "unique thread id was not found");
@@ -716,6 +720,9 @@
   // call one more level start routine
   thread->run();
 
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   return 0;
 }
 
@@ -776,12 +783,18 @@
     pthread_t tid;
     int ret = pthread_create(&tid, &attr, (void* (*)(void*)) java_start, thread);
 
+    char buf[64];
+    if (ret == 0) {
+      log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
+        (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+    } else {
+      log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
+        strerror(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+    }
+
     pthread_attr_destroy(&attr);
 
     if (ret != 0) {
-      if (PrintMiscellaneous && (Verbose || WizardMode)) {
-        perror("pthread_create()");
-      }
       // Need to clean up stuff we've allocated so far
       thread->set_osthread(NULL);
       delete osthread;
@@ -858,6 +871,9 @@
   // and save the caller's signal mask
   os::Bsd::hotspot_sigmask(thread);
 
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   return true;
 }
 
--- a/src/os/linux/vm/os_linux.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/linux/vm/os_linux.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -144,6 +144,7 @@
 int os::Linux::_page_size = -1;
 const int os::Linux::_vm_default_page_size = (8 * K);
 bool os::Linux::_supports_fast_thread_cpu_time = false;
+uint32_t os::Linux::_os_version = 0;
 const char * os::Linux::_glibc_version = NULL;
 const char * os::Linux::_libpthread_version = NULL;
 pthread_condattr_t os::Linux::_condattr[1];
@@ -662,6 +663,9 @@
 
   osthread->set_thread_id(os::current_thread_id());
 
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   if (UseNUMA) {
     int lgrp_id = os::numa_get_group_id();
     if (lgrp_id != -1) {
@@ -691,6 +695,9 @@
   // call one more level start routine
   thread->run();
 
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   return 0;
 }
 
@@ -756,12 +763,18 @@
     pthread_t tid;
     int ret = pthread_create(&tid, &attr, (void* (*)(void*)) java_start, thread);
 
+    char buf[64];
+    if (ret == 0) {
+      log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
+        (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+    } else {
+      log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
+        strerror(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
+    }
+
     pthread_attr_destroy(&attr);
 
     if (ret != 0) {
-      if (PrintMiscellaneous && (Verbose || WizardMode)) {
-        perror("pthread_create()");
-      }
       // Need to clean up stuff we've allocated so far
       thread->set_osthread(NULL);
       delete osthread;
@@ -858,6 +871,9 @@
   // and save the caller's signal mask
   os::Linux::hotspot_sigmask(thread);
 
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
+    os::current_thread_id(), (uintx) pthread_self());
+
   return true;
 }
 
@@ -4341,6 +4357,48 @@
   return (tp.tv_sec * NANOSECS_PER_SEC) + tp.tv_nsec;
 }
 
+// Reads the kernel release from uname(2) and caches it in _os_version, packed
+// as 0x00AABBCC (AA = major, BB = minor, CC = fix). _os_version is left at the
+// "unknown" marker 0x01000000 when uname() fails, when the release string does
+// not start with three dot-separated numbers, or when a component is 256 or
+// larger. Must be called only once (asserted below).
+void os::Linux::initialize_os_info() {
+  assert(_os_version == 0, "OS info already initialized");
+
+  // Start out as "unknown"; only overwritten after the release string has
+  // been parsed and range-checked successfully.
+  _os_version = 0x01000000;
+
+  struct utsname _uname;
+  if (uname(&_uname) == -1) {
+    return;
+  }
+
+  // The components are unsigned, so scan them with %u: %d expects int* and
+  // does not match the uint32_t* arguments.
+  uint32_t major = 0;
+  uint32_t minor = 0;
+  uint32_t fix = 0;
+  if (sscanf(_uname.release, "%u.%u.%u", &major, &minor, &fix) != 3) {
+    return;
+  }
+
+  // Each component has to fit into one byte of the packed encoding.
+  if (major < 256 && minor < 256 && fix < 256) {
+    _os_version = (major << 16) | (minor << 8) | fix;
+  }
+}
+
+uint32_t os::Linux::os_version() {
+  assert(_os_version != 0, "not initialized");
+  return _os_version & 0x00FFFFFF;  // masks off the 0x01000000 "unknown" marker, so an unknown version reads as 0
+}
+
+bool os::Linux::os_version_is_known() {
+  assert(_os_version != 0, "not initialized");
+  return (_os_version & 0x01000000) == 0;  // marker bit set => kernel version unknown
+}
+
 /////
 // glibc on Linux platform uses non-documented flag
 // to indicate, that some special sort of signal
@@ -4563,6 +4621,8 @@
 
   Linux::initialize_system_info();
 
+  Linux::initialize_os_info();
+
   // main_thread points to the aboriginal thread
   Linux::_main_thread = pthread_self();
 
--- a/src/os/linux/vm/os_linux.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/linux/vm/os_linux.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -56,6 +56,15 @@
 
   static GrowableArray<int>* _cpu_to_node;
 
+  // 0x00000000 = uninitialized,
+  // 0x01000000 = kernel version unknown,
+  // otherwise a 32-bit number:
+  // 0x00AABBCC
+  // AA, Major Version
+  // BB, Minor Version
+  // CC, Fix   Version
+  static uint32_t _os_version;
+
  protected:
 
   static julong _physical_memory;
@@ -198,6 +207,10 @@
 
   static jlong fast_thread_cpu_time(clockid_t clockid);
 
+  static void initialize_os_info();
+  static bool os_version_is_known();
+  static uint32_t os_version();
+
   // pthread_cond clock suppport
  private:
   static pthread_condattr_t _condattr[1];
--- a/src/os/posix/vm/os_posix.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/posix/vm/os_posix.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -238,14 +238,12 @@
   st->cr();
 }
 
-#ifndef PRODUCT
 bool os::get_host_name(char* buf, size_t buflen) {
   struct utsname name;
   uname(&name);
   jio_snprintf(buf, buflen, "%s", name.nodename);
   return true;
 }
-#endif // PRODUCT
 
 bool os::has_allocatable_memory_limit(julong* limit) {
   struct rlimit rlim;
@@ -1073,6 +1071,19 @@
 #endif
 }
 
+// Formats attr's stack size and guard size (in KB) and its detach state into buf; returns buf.
+char* os::Posix::describe_pthread_attr(char* buf, size_t buflen, const pthread_attr_t* attr) {
+  size_t stack_size = 0, guard_size = 0;  // getter results are not checked; values stay 0 if not written
+  int detachstate = 0;
+  pthread_attr_getstacksize(attr, &stack_size);
+  pthread_attr_getguardsize(attr, &guard_size);
+  pthread_attr_getdetachstate(attr, &detachstate);
+  const char* state = (detachstate == PTHREAD_CREATE_DETACHED) ? "detached" : "joinable";
+  jio_snprintf(buf, buflen, "stacksize: " SIZE_FORMAT "k, guardsize: " SIZE_FORMAT "k, %s",
+    stack_size / 1024, guard_size / 1024, state);
+  return buf;
+}
+
 
 os::WatcherThreadCrashProtection::WatcherThreadCrashProtection() {
   assert(Thread::current()->is_Watcher_thread(), "Must be WatcherThread");
--- a/src/os/posix/vm/os_posix.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/posix/vm/os_posix.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -76,6 +76,11 @@
   static address ucontext_get_pc(const ucontext_t* ctx);
   // Set PC into context. Needed for continuation after signal.
   static void ucontext_set_pc(ucontext_t* ctx, address pc);
+
+  // Helper function; describes pthread attributes as short string. String is written
+  // to buf with len buflen; buf is returned.
+  static char* describe_pthread_attr(char* buf, size_t buflen, const pthread_attr_t* attr);
+
 };
 
 /*
--- a/src/os/solaris/vm/os_solaris.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/solaris/vm/os_solaris.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -32,6 +32,7 @@
 #include "compiler/disassembler.hpp"
 #include "interpreter/interpreter.hpp"
 #include "jvm_solaris.h"
+#include "logging/log.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/filemap.hpp"
 #include "mutex_solaris.inline.hpp"
@@ -68,6 +69,7 @@
 #include "utilities/defaultStream.hpp"
 #include "utilities/events.hpp"
 #include "utilities/growableArray.hpp"
+#include "utilities/macros.hpp"
 #include "utilities/vmError.hpp"
 
 // put OS-includes here
@@ -736,6 +738,9 @@
   osthr->set_lwp_id(_lwp_self());  // Store lwp in case we are bound
   thread->_schedctl = (void *) schedctl_init();
 
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ").",
+    os::current_thread_id());
+
   if (UseNUMA) {
     int lgrp_id = os::numa_get_group_id();
     if (lgrp_id != -1) {
@@ -781,6 +786,8 @@
     Atomic::dec(&os::Solaris::_os_thread_count);
   }
 
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ").", os::current_thread_id());
+
   if (UseDetachedThreads) {
     thr_exit(NULL);
     ShouldNotReachHere();
@@ -853,6 +860,9 @@
   // and save the caller's signal mask
   os::Solaris::hotspot_sigmask(thread);
 
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ").",
+    os::current_thread_id());
+
   return true;
 }
 
@@ -879,6 +889,24 @@
   return true;
 }
 
+// Tracing helper, similar to os::Posix::describe_pthread_attr(): prints the stack size in KB and the names of the set THR_* flags into buf; returns buf.
+static char* describe_thr_create_attributes(char* buf, size_t buflen,
+                                            size_t stacksize, long flags) {
+  stringStream ss(buf, buflen);  // stream constructed over the caller-supplied buffer
+  ss.print("stacksize: " SIZE_FORMAT "k, ", stacksize / 1024);
+  ss.print("flags: ");
+  #define PRINT_FLAG(f) if (flags & f) ss.print( #f " ");
+  #define ALL(X) \
+    X(THR_SUSPENDED) \
+    X(THR_DETACHED) \
+    X(THR_BOUND) \
+    X(THR_NEW_LWP) \
+    X(THR_DAEMON)
+  ALL(PRINT_FLAG)  // expands to one "if (flags & F) print F's name" statement per listed flag
+  #undef ALL
+  #undef PRINT_FLAG
+  return buf;
+}
 
 bool os::create_thread(Thread* thread, ThreadType thr_type,
                        size_t stack_size) {
@@ -974,10 +1002,17 @@
   osthread->set_thread_id(-1);
 
   status = thr_create(NULL, stack_size, java_start, thread, flags, &tid);
+
+  char buf[64];
+  if (status == 0) {
+    log_info(os, thread)("Thread started (tid: " UINTX_FORMAT ", attributes: %s). ",
+      (uintx) tid, describe_thr_create_attributes(buf, sizeof(buf), stack_size, flags));
+  } else {
+    log_warning(os, thread)("Failed to start thread - thr_create failed (%s) for attributes: %s.",
+      strerror(status), describe_thr_create_attributes(buf, sizeof(buf), stack_size, flags));
+  }
+
   if (status != 0) {
-    if (PrintMiscellaneous && (Verbose || WizardMode)) {
-      perror("os::create_thread");
-    }
     thread->set_osthread(NULL);
     // Need to clean up stuff we've allocated so far
     delete osthread;
--- a/src/os/windows/vm/os_windows.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/windows/vm/os_windows.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -35,6 +35,7 @@
 #include "compiler/disassembler.hpp"
 #include "interpreter/interpreter.hpp"
 #include "jvm_windows.h"
+#include "logging/log.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/filemap.hpp"
 #include "mutex_windows.inline.hpp"
@@ -71,6 +72,7 @@
 #include "utilities/defaultStream.hpp"
 #include "utilities/events.hpp"
 #include "utilities/growableArray.hpp"
+#include "utilities/macros.hpp"
 #include "utilities/vmError.hpp"
 
 #ifdef _DEBUG
@@ -436,6 +438,8 @@
     res = 20115;    // java thread
   }
 
+  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ").", os::current_thread_id());
+
   // Install a win32 structured exception handler around every thread created
   // by VM, so VM can generate error dump when an exception occurred in non-
   // Java thread (e.g. VM thread).
@@ -446,6 +450,8 @@
     // Nothing to do.
   }
 
+  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ").", os::current_thread_id());
+
   // One less thread is executing
   // When the VMThread gets here, the main thread may have already exited
   // which frees the CodeHeap containing the Atomic::add code
@@ -509,6 +515,10 @@
   osthread->set_state(RUNNABLE);
 
   thread->set_osthread(osthread);
+
+  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ").",
+    os::current_thread_id());
+
   return true;
 }
 
@@ -530,6 +540,27 @@
   return true;
 }
 
+// Tracing helper, similar to os::Posix::describe_pthread_attr(): prints the stack size in KB
+//  ("default" when stacksize is 0) and the names of the set init flags into buf; returns buf.
+static char* describe_beginthreadex_attributes(char* buf, size_t buflen,
+                                               size_t stacksize, unsigned initflag) {
+  stringStream ss(buf, buflen);  // stream constructed over the caller-supplied buffer
+  if (stacksize == 0) {
+    ss.print("stacksize: default, ");
+  } else {
+    ss.print("stacksize: " SIZE_FORMAT "k, ", stacksize / 1024);
+  }
+  ss.print("flags: ");
+  #define PRINT_FLAG(f) if (initflag & f) ss.print( #f " ");
+  #define ALL(X) \
+    X(CREATE_SUSPENDED) \
+    X(STACK_SIZE_PARAM_IS_A_RESERVATION)
+  ALL(PRINT_FLAG)  // expands to one "if (initflag & F) print F's name" statement per listed flag
+  #undef ALL
+  #undef PRINT_FLAG
+  return buf;
+}
+
 // Allocate and initialize a new OSThread
 bool os::create_thread(Thread* thread, ThreadType thr_type,
                        size_t stack_size) {
@@ -596,14 +627,24 @@
   // document because JVM uses C runtime library. The good news is that the
   // flag appears to work with _beginthredex() as well.
 
+  const unsigned initflag = CREATE_SUSPENDED | STACK_SIZE_PARAM_IS_A_RESERVATION;
   HANDLE thread_handle =
     (HANDLE)_beginthreadex(NULL,
                            (unsigned)stack_size,
                            (unsigned (__stdcall *)(void*)) java_start,
                            thread,
-                           CREATE_SUSPENDED | STACK_SIZE_PARAM_IS_A_RESERVATION,
+                           initflag,
                            &thread_id);
 
+  char buf[64];
+  if (thread_handle != NULL) {
+    log_info(os, thread)("Thread started (tid: %u, attributes: %s)",
+      thread_id, describe_beginthreadex_attributes(buf, sizeof(buf), stack_size, initflag));
+  } else {
+    log_warning(os, thread)("Failed to start thread - _beginthreadex failed (%s) for attributes: %s.",
+      strerror(errno), describe_beginthreadex_attributes(buf, sizeof(buf), stack_size, initflag));
+  }
+
   if (thread_handle == NULL) {
     // Need to clean up stuff we've allocated so far
     CloseHandle(osthread->interrupt_event());
@@ -1531,12 +1572,10 @@
   return result;
 }
 
-#ifndef PRODUCT
 bool os::get_host_name(char* buf, size_t buflen) {
   DWORD size = (DWORD)buflen;
   return (GetComputerNameEx(ComputerNameDnsHostname, buf, &size) == TRUE);
 }
-#endif // PRODUCT
 
 void os::get_summary_os_info(char* buf, size_t buflen) {
   stringStream sst(buf, buflen);
@@ -1670,8 +1709,7 @@
     if (is_workstation) {
       st->print("10");
     } else {
-      // The server version name of Windows 10 is not known at this time
-      st->print("%d.%d", major_version, minor_version);
+      st->print("Server 2016");
     }
     break;
 
--- a/src/os/windows/vm/perfMemory_windows.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os/windows/vm/perfMemory_windows.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -628,6 +628,7 @@
 
   if (!is_directory_secure(dirname)) {
     // the directory is not secure, don't attempt any cleanup
+    os::closedir(dirp);
     return;
   }
 
@@ -1445,6 +1446,8 @@
 
   // check that the file system is secure - i.e. it supports ACLs.
   if (!is_filesystem_secure(dirname)) {
+    FREE_C_HEAP_ARRAY(char, dirname);
+    FREE_C_HEAP_ARRAY(char, user);
     return NULL;
   }
 
@@ -1624,6 +1627,7 @@
   //
   if (!is_directory_secure(dirname)) {
     FREE_C_HEAP_ARRAY(char, dirname);
+    if (luser != user) FREE_C_HEAP_ARRAY(char, luser);
     THROW_MSG(vmSymbols::java_lang_IllegalArgumentException(),
               "Process not found");
   }
--- a/src/os_cpu/linux_aarch64/vm/copy_linux_aarch64.inline.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os_cpu/linux_aarch64/vm/copy_linux_aarch64.inline.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -26,44 +26,108 @@
 #ifndef OS_CPU_LINUX_AARCH64_VM_COPY_LINUX_AARCH64_INLINE_HPP
 #define OS_CPU_LINUX_AARCH64_VM_COPY_LINUX_AARCH64_INLINE_HPP
 
+#define COPY_SMALL(from, to, count)                                     \
+{ /* Copy count words (8 bytes each) by branching to 32-byte slot number count; slots exist for 0..8 only. */ \
+        long tmp0, tmp1, tmp2, tmp3;                                    \
+        long tmp4, tmp5, tmp6, tmp7;                                    \
+  __asm volatile(                                                       \
+"       adr     %[t0], 0f;"                     /* t0 = address of slot 0 */    \
+"       add     %[t0], %[t0], %[cnt], lsl #5;"  /* + count * 32 bytes */        \
+"       br      %[t0];"                                                 \
+"       .align  5;"                                                     \
+"0:"                                            /* slot 0: nothing to copy */   \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 1: 1 word */            \
+"       ldr     %[t0], [%[s], #0];"                                     \
+"       str     %[t0], [%[d], #0];"                                     \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 2: 2 words */           \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 3: 3 words */           \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldr     %[t2], [%[s], #16];"                                    \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       str     %[t2], [%[d], #16];"                                    \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 4: 4 words */           \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldp     %[t2], %[t3], [%[s], #16];"                             \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       stp     %[t2], %[t3], [%[d], #16];"                             \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 5: 5 words */           \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldp     %[t2], %[t3], [%[s], #16];"                             \
+"       ldr     %[t4], [%[s], #32];"                                    \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       stp     %[t2], %[t3], [%[d], #16];"                             \
+"       str     %[t4], [%[d], #32];"                                    \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 6: 6 words */           \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldp     %[t2], %[t3], [%[s], #16];"                             \
+"       ldp     %[t4], %[t5], [%[s], #32];"                             \
+"2:"                                            /* shared 6-word store tail, also used by slot 7 */ \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       stp     %[t2], %[t3], [%[d], #16];"                             \
+"       stp     %[t4], %[t5], [%[d], #32];"                             \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 7: 7 words */           \
+"       ldr     %[t6], [%[s], #0];"                                     \
+"       ldp     %[t0], %[t1], [%[s], #8];"                              \
+"       ldp     %[t2], %[t3], [%[s], #24];"                             \
+"       ldp     %[t4], %[t5], [%[s], #40];"                             \
+"       str     %[t6], [%[d]], #8;"             /* stores word 0 and advances d by 8 */ \
+"       b       2b;"                            /* remaining 6 words via the tail at 2: */ \
+"       .align  5;"                             /* slot 8: 8 words, falls through to 1: */ \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldp     %[t2], %[t3], [%[s], #16];"                             \
+"       ldp     %[t4], %[t5], [%[s], #32];"                             \
+"       ldp     %[t6], %[t7], [%[s], #48];"                             \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       stp     %[t2], %[t3], [%[d], #16];"                             \
+"       stp     %[t4], %[t5], [%[d], #32];"                             \
+"       stp     %[t6], %[t7], [%[d], #48];"                             \
+"1:"                                            /* common exit */               \
+                                                                        \
+  : [s]"+r"(from), [d]"+r"(to), [cnt]"+r"(count), /* read-write operands: 'to' is modified by slot 7 */ \
+    [t0]"=&r"(tmp0), [t1]"=&r"(tmp1), [t2]"=&r"(tmp2), [t3]"=&r"(tmp3), \
+    [t4]"=&r"(tmp4), [t5]"=&r"(tmp5), [t6]"=&r"(tmp6), [t7]"=&r"(tmp7)  \
+  :                                                                     \
+  : "memory", "cc");                                                    \
+}
+
 static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
-  (void)memmove(to, from, count * HeapWordSize);
+  __asm volatile( "prfm pldl1strm, [%[s], #0];" :: [s]"r"(from) : "memory");  // prefetch hint for the start of the source
+  if (__builtin_expect(count <= 8, 1)) {  // counts of at most 8 words are hinted as the likely case
+    COPY_SMALL(from, to, count);  // inline copy; every case loads all its words before storing any
+    return;
+  }
+  _Copy_conjoint_words(from, to, count);  // count > 8: out-of-line routine from copy_linux_aarch64.s
 }
 
 static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
-  switch (count) {
-  case 8:  to[7] = from[7];
-  case 7:  to[6] = from[6];
-  case 6:  to[5] = from[5];
-  case 5:  to[4] = from[4];
-  case 4:  to[3] = from[3];
-  case 3:  to[2] = from[2];
-  case 2:  to[1] = from[1];
-  case 1:  to[0] = from[0];
-  case 0:  break;
-  default:
-    (void)memcpy(to, from, count * HeapWordSize);
-    break;
+  if (__builtin_constant_p(count)) {  // count known at compile time
+    memcpy(to, from, count * sizeof(HeapWord));  // presumably lets the compiler expand the fixed-size copy itself
+    return;
   }
+  __asm volatile( "prfm pldl1strm, [%[s], #0];" :: [s]"r"(from) : "memory");  // prefetch hint for the start of the source
+  if (__builtin_expect(count <= 8, 1)) {  // counts of at most 8 words are hinted as the likely case
+    COPY_SMALL(from, to, count);  // inline computed-branch copy for 0..8 words
+    return;
+  }
+  _Copy_disjoint_words(from, to, count);  // count > 8: out-of-line routine from copy_linux_aarch64.s
 }
 
 static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
-  switch (count) {
-  case 8:  to[7] = from[7];
-  case 7:  to[6] = from[6];
-  case 6:  to[5] = from[5];
-  case 5:  to[4] = from[4];
-  case 4:  to[3] = from[3];
-  case 3:  to[2] = from[2];
-  case 2:  to[1] = from[1];
-  case 1:  to[0] = from[0];
-  case 0:  break;
-  default:
-    while (count-- > 0) {
-      *to++ = *from++;
-    }
-    break;
+  __asm volatile( "prfm pldl1strm, [%[s], #0];" :: [s]"r"(from) : "memory");  // prefetch hint for the start of the source
+  if (__builtin_expect(count <= 8, 1)) {  // counts of at most 8 words are hinted as the likely case
+    COPY_SMALL(from, to, count);  // moves data with 64-bit ldr/ldp and str/stp only
+    return;
   }
+  _Copy_disjoint_words(from, to, count);  // count > 8: same out-of-line routine as pd_disjoint_words
 }
 
 static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/os_cpu/linux_aarch64/vm/copy_linux_aarch64.s	Thu Mar 17 17:03:20 2016 +0000
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2016, Linaro Ltd. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+        .global _Copy_conjoint_words
+        .global _Copy_disjoint_words
+
+s       .req    x0
+d       .req    x1
+count   .req    x2
+t0      .req    x3
+t1      .req    x4
+t2      .req    x5
+t3      .req    x6
+t4      .req    x7
+t5      .req    x8
+t6      .req    x9
+t7      .req    x10
+
+        .align  6
+_Copy_disjoint_words:                   // x0 = s (source), x1 = d (destination), x2 = count in 8-byte words
+        // Ensure s is 2-word (16-byte) aligned: if bit 3 of s is set, copy one word first
+        tbz     s, #3, fwd_copy_aligned
+        ldr     t0, [s], #8
+        str     t0, [d], #8
+        sub     count, count, #1
+
+fwd_copy_aligned:
+        // Bias s & d so we only pre index on the last copy
+        sub     s, s, #16
+        sub     d, d, #16
+
+        ldp     t0, t1, [s, #16]        // 8 words are preloaded unconditionally, so at least 8
+        ldp     t2, t3, [s, #32]        // words must remain at this point; the C++ callers in
+        ldp     t4, t5, [s, #48]        // copy_linux_aarch64.inline.hpp only call with count > 8
+        ldp     t6, t7, [s, #64]!
+
+        subs    count, count, #16       // fewer than 8 further words left?
+        blo     fwd_copy_drain          // then store the preloaded 8 and finish in the tail
+
+fwd_copy_again:                         // main loop: store the previous 8 words while loading the next 8
+        prfm    pldl1keep, [s, #256]
+        stp     t0, t1, [d, #16]
+        ldp     t0, t1, [s, #16]
+        stp     t2, t3, [d, #32]
+        ldp     t2, t3, [s, #32]
+        stp     t4, t5, [d, #48]
+        ldp     t4, t5, [s, #48]
+        stp     t6, t7, [d, #64]!
+        ldp     t6, t7, [s, #64]!
+        subs    count, count, #8
+        bhs     fwd_copy_again
+
+fwd_copy_drain:                         // store the 8 words that are still held in t0..t7
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        stp     t4, t5, [d, #48]
+        stp     t6, t7, [d, #64]!
+
+        // count is now -8..-1 for 0..7 words to copy
+        adr     t0, 0f                  // tail cases are 32-byte slots laid out before 0:
+        add     t0, t0, count, lsl #5   // count is negative, so this indexes backwards from 0:
+        br      t0
+
+        .align  5
+        ret                             // -8 == 0 words
+        .align  5
+        ldr     t0, [s, #16]            // -7 == 1 word
+        str     t0, [d, #16]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // -6 = 2 words
+        stp     t0, t1, [d, #16]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // -5 = 3 words
+        ldr     t2, [s, #32]
+        stp     t0, t1, [d, #16]
+        str     t2, [d, #32]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // -4 = 4 words
+        ldp     t2, t3, [s, #32]
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // -3 = 5 words
+        ldp     t2, t3, [s, #32]
+        ldr     t4, [s, #48]
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        str     t4, [d, #48]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // -2 = 6 words
+        ldp     t2, t3, [s, #32]
+        ldp     t4, t5, [s, #48]
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        stp     t4, t5, [d, #48]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // -1 = 7 words
+        ldp     t2, t3, [s, #32]
+        ldp     t4, t5, [s, #48]
+        ldr     t6, [s, #64]
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        stp     t4, t5, [d, #48]
+        str     t6, [d, #64]
+        // Is always aligned here, code for 7 words is one instruction
+        // too large so it just falls through.
+        .align  5
+0:
+        ret
+
+        .align  6
+_Copy_conjoint_words:                   // x0 = s (source), x1 = d (destination), x2 = count in 8-byte words
+        sub     t0, d, s                // unsigned distance from s to d
+        cmp     t0, count, lsl #3       // compared with the length in bytes
+        bhs     _Copy_disjoint_words    // d is not inside [s, s + bytes): use the forward copy
+
+        add     s, s, count, lsl #3     // otherwise copy backwards, starting one past the last word
+        add     d, d, count, lsl #3
+
+        // Ensure the end of s is 2-word (16-byte) aligned: if bit 3 is set, copy one word first
+        tbz     s, #3, bwd_copy_aligned
+        ldr     t0, [s, #-8]!
+        str     t0, [d, #-8]!
+        sub     count, count, #1
+
+bwd_copy_aligned:
+        ldp     t0, t1, [s, #-16]       // 8 words are preloaded unconditionally, so at least 8
+        ldp     t2, t3, [s, #-32]       // words must remain at this point; the C++ callers in
+        ldp     t4, t5, [s, #-48]       // copy_linux_aarch64.inline.hpp only call with count > 8
+        ldp     t6, t7, [s, #-64]!
+
+        subs    count, count, #16       // fewer than 8 further words left?
+        blo     bwd_copy_drain          // then store the preloaded 8 and finish in the tail
+
+bwd_copy_again:                         // main loop: store the previous 8 words while loading the next 8
+        prfm    pldl1keep, [s, #-256]
+        stp     t0, t1, [d, #-16]
+        ldp     t0, t1, [s, #-16]
+        stp     t2, t3, [d, #-32]
+        ldp     t2, t3, [s, #-32]
+        stp     t4, t5, [d, #-48]
+        ldp     t4, t5, [s, #-48]
+        stp     t6, t7, [d, #-64]!
+        ldp     t6, t7, [s, #-64]!
+        subs    count, count, #8
+        bhs     bwd_copy_again
+
+bwd_copy_drain:                         // store the 8 words that are still held in t0..t7
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        stp     t4, t5, [d, #-48]
+        stp     t6, t7, [d, #-64]!
+
+        // count is now -8..-1 for 0..7 words to copy
+        adr     t0, 0f                  // tail cases are 32-byte slots laid out before 0:
+        add     t0, t0, count, lsl #5   // count is negative, so this indexes backwards from 0:
+        br      t0
+
+        .align  5
+        ret                             // -8 == 0 words
+        .align  5
+        ldr     t0, [s, #-8]            // -7 == 1 word
+        str     t0, [d, #-8]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // -6 = 2 words
+        stp     t0, t1, [d, #-16]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // -5 = 3 words
+        ldr     t2, [s, #-24]
+        stp     t0, t1, [d, #-16]
+        str     t2, [d, #-24]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // -4 = 4 words
+        ldp     t2, t3, [s, #-32]
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // -3 = 5 words
+        ldp     t2, t3, [s, #-32]
+        ldr     t4, [s, #-40]
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        str     t4, [d, #-40]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // -2 = 6 words
+        ldp     t2, t3, [s, #-32]
+        ldp     t4, t5, [s, #-48]
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        stp     t4, t5, [d, #-48]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // -1 = 7 words
+        ldp     t2, t3, [s, #-32]
+        ldp     t4, t5, [s, #-48]
+        ldr     t6, [s, #-56]
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        stp     t4, t5, [d, #-48]
+        str     t6, [d, #-56]
+        // Is always aligned here, code for 7 words is one instruction
+        // too large so it just falls through.
+        .align  5
+0:
+        ret
--- a/src/os_cpu/linux_zero/vm/os_linux_zero.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os_cpu/linux_zero/vm/os_linux_zero.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2007, 2008, 2009, 2010 Red Hat, Inc.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -65,6 +65,7 @@
 
 frame os::get_sender_for_C_frame(frame* fr) {
   ShouldNotCallThis();
+  return frame(NULL, NULL); // silence compile warning.
 }
 
 frame os::current_frame() {
@@ -102,6 +103,7 @@
 
 address os::Linux::ucontext_get_pc(const ucontext_t* uc) {
   ShouldNotCallThis();
+  return NULL; // silence compile warnings
 }
 
 void os::Linux::ucontext_set_pc(ucontext_t * uc, address pc) {
@@ -112,10 +114,12 @@
                                         intptr_t** ret_sp,
                                         intptr_t** ret_fp) {
   ShouldNotCallThis();
+  return NULL; // silence compile warnings
 }
 
 frame os::fetch_frame_from_context(const void* ucVoid) {
   ShouldNotCallThis();
+  return frame(NULL, NULL); // silence compile warnings
 }
 
 extern "C" JNIEXPORT int
@@ -262,11 +266,16 @@
   }
 #endif // !PRODUCT
 
-  const char *fmt = "caught unhandled signal %d";
   char buf[64];
 
-  sprintf(buf, fmt, sig);
+  sprintf(buf, "caught unhandled signal %d", sig);
+
+// Silence -Wformat-security warning for fatal()
+PRAGMA_DIAG_PUSH
+PRAGMA_FORMAT_NONLITERAL_IGNORED
   fatal(buf);
+PRAGMA_DIAG_POP
+  return true; // silence compiler warnings
 }
 
 void os::Linux::init_thread_fpu_state(void) {
@@ -275,6 +284,7 @@
 
 int os::Linux::get_fpu_control_word() {
   ShouldNotCallThis();
+  return -1; // silence compile warnings
 }
 
 void os::Linux::set_fpu_control_word(int fpu) {
@@ -419,6 +429,7 @@
 
 extern "C" {
   int SpinPause() {
+      return -1; // silence compile warnings
   }
 
 
--- a/src/os_cpu/linux_zero/vm/thread_linux_zero.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os_cpu/linux_zero/vm/thread_linux_zero.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2007, 2008, 2009, 2010 Red Hat, Inc.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -110,6 +110,7 @@
                                            void* ucontext,
                                            bool isInJava) {
     ShouldNotCallThis();
+    return false; // silence compile warning
   }
 
   // These routines are only used on cpu architectures that
--- a/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -264,6 +264,7 @@
 
 // We need to keep these here as long as we have to build on Solaris
 // versions before 10.
+
 #ifndef SI_ARCHITECTURE_32
 #define SI_ARCHITECTURE_32      516     /* basic 32-bit SI_ARCHITECTURE */
 #endif
@@ -272,36 +273,87 @@
 #define SI_ARCHITECTURE_64      517     /* basic 64-bit SI_ARCHITECTURE */
 #endif
 
-static void do_sysinfo(int si, const char* string, int* features, int mask) {
-  char   tmp;
-  size_t bufsize = sysinfo(si, &tmp, 1);
+#ifndef SI_CPUBRAND
+#define SI_CPUBRAND             523     /* return cpu brand string */
+#endif
 
-  // All SI defines used below must be supported.
-  guarantee(bufsize != -1, "must be supported");
+class Sysinfo {  // Fetches and owns the string for one sysinfo() command; stays invalid if the query fails
+  char* _string;  // os::malloc'ed result, or NULL when nothing could be read
+public:
+  Sysinfo(int si) : _string(NULL) {
+    char   tmp;
+    size_t bufsize = sysinfo(si, &tmp, 1);  // probe with a 1-byte buffer; the result is used as the size to allocate
 
-  char* buf = (char*) os::malloc(bufsize, mtInternal);
-
-  if (buf == NULL)
-    return;
-
-  if (sysinfo(si, buf, bufsize) == bufsize) {
-    // Compare the string.
-    if (strcmp(buf, string) == 0) {
-      *features |= mask;
+    if (bufsize != -1) {  // -1 is treated as "command not supported": leave _string NULL
+      char* buf = (char*) os::malloc(bufsize, mtInternal);
+      if (buf != NULL) {
+        if (sysinfo(si, buf, bufsize) == bufsize) {  // keep the buffer only if the second call reports the same size
+          _string = buf;
+        } else {
+          os::free(buf);
+        }
+      }
     }
   }
 
-  os::free(buf);
-}
+  ~Sysinfo() {  // releases the buffer allocated by the constructor, if any
+    if (_string != NULL) {
+      os::free(_string);
+    }
+  }
+
+  const char* value() const {  // NULL when !valid()
+    return _string;
+  }
+
+  bool valid() const {
+    return _string != NULL;
+  }
+
+  bool match(const char* s) const {  // exact comparison; false when invalid
+    return valid() ? strcmp(_string, s) == 0 : false;
+  }
+
+  bool match_substring(const char* s) const {  // true if s occurs anywhere in the value; false when invalid
+    return valid() ? strstr(_string, s) != NULL : false;
+  }
+};
+
+class Sysconf {  // Holds the result of one sysconf() query
+  int _value;  // NOTE(review): sysconf() returns long; the result is narrowed to int here
+public:
+  Sysconf(int sc) : _value(-1) {
+    _value = sysconf(sc);
+  }
+  bool valid() const {  // -1 is treated as "no value available"
+    return _value != -1;
+  }
+  int value() const {  // only meaningful when valid()
+    return _value;
+  }
+};
+
+
+#ifndef _SC_DCACHE_LINESZ
+#define _SC_DCACHE_LINESZ       508     /* Data cache line size */
+#endif
+
+#ifndef _SC_L2CACHE_LINESZ
+#define _SC_L2CACHE_LINESZ      527     /* Size of L2 cache line */
+#endif
 
 int VM_Version::platform_features(int features) {
   assert(os::Solaris::supports_getisax(), "getisax() must be available");
 
   // Check 32-bit architecture.
-  do_sysinfo(SI_ARCHITECTURE_32, "sparc", &features, v8_instructions_m);
+  if (Sysinfo(SI_ARCHITECTURE_32).match("sparc")) {
+    features |= v8_instructions_m;
+  }
 
   // Check 64-bit architecture.
-  do_sysinfo(SI_ARCHITECTURE_64, "sparcv9", &features, generic_v9_m);
+  if (Sysinfo(SI_ARCHITECTURE_64).match("sparcv9")) {
+    features |= generic_v9_m;
+  }
 
   // Extract valid instruction set extensions.
   uint_t avs[2];
@@ -388,67 +440,63 @@
   if (av & AV_SPARC_SHA512)       features |= sha512_instruction_m;
 
   // Determine the machine type.
-  do_sysinfo(SI_MACHINE, "sun4v", &features, sun4v_m);
+  if (Sysinfo(SI_MACHINE).match("sun4v")) {
+    features |= sun4v_m;
+  }
 
-  {
-    // Using kstat to determine the machine type.
+  bool use_solaris_12_api = false;
+  Sysinfo impl(SI_CPUBRAND);
+  if (impl.valid()) {
+    // If SI_CPUBRAND works, the Solaris 12 API for querying the cache line sizes
+    // is available to us as well
+    use_solaris_12_api = true;
+    features |= parse_features(impl.value());
+  } else {
+    // Otherwise use kstat to determine the machine type.
     kstat_ctl_t* kc = kstat_open();
     kstat_t* ksp = kstat_lookup(kc, (char*)"cpu_info", -1, NULL);
-    const char* implementation = "UNKNOWN";
+    const char* implementation;
+    bool has_implementation = false;
     if (ksp != NULL) {
       if (kstat_read(kc, ksp, NULL) != -1 && ksp->ks_data != NULL) {
         kstat_named_t* knm = (kstat_named_t *)ksp->ks_data;
         for (int i = 0; i < ksp->ks_ndata; i++) {
           if (strcmp((const char*)&(knm[i].name),"implementation") == 0) {
             implementation = KSTAT_NAMED_STR_PTR(&knm[i]);
+            has_implementation = true;
 #ifndef PRODUCT
             if (PrintMiscellaneous && Verbose) {
               tty->print_cr("cpu_info.implementation: %s", implementation);
             }
 #endif
-            // Convert to UPPER case before compare.
-            char* impl = os::strdup_check_oom(implementation);
-
-            for (int i = 0; impl[i] != 0; i++)
-              impl[i] = (char)toupper((uint)impl[i]);
-
-            if (strstr(impl, "SPARC64") != NULL) {
-              features |= sparc64_family_m;
-            } else if (strstr(impl, "SPARC-M") != NULL) {
-              // M-series SPARC is based on T-series.
-              features |= (M_family_m | T_family_m);
-            } else if (strstr(impl, "SPARC-T") != NULL) {
-              features |= T_family_m;
-              if (strstr(impl, "SPARC-T1") != NULL) {
-                features |= T1_model_m;
-              }
-            } else {
-              if (strstr(impl, "SPARC") == NULL) {
-#ifndef PRODUCT
-                // kstat on Solaris 8 virtual machines (branded zones)
-                // returns "(unsupported)" implementation. Solaris 8 is not
-                // supported anymore, but include this check to be on the
-                // safe side.
-                warning("kstat cpu_info implementation = '%s', assume generic SPARC", impl);
-#endif
-                implementation = "SPARC";
-              }
-            }
-            os::free((void*)impl);
+            features |= parse_features(implementation);
             break;
           }
         } // for(
       }
     }
-    assert(strcmp(implementation, "UNKNOWN") != 0,
-           "unknown cpu info (changed kstat interface?)");
+    assert(has_implementation, "unknown cpu info (changed kstat interface?)");
     kstat_close(kc);
   }
 
-  // Figure out cache line sizes using PICL
-  PICL picl((features & sparc64_family_m) != 0, (features & sun4v_m) != 0);
-  _L1_data_cache_line_size = picl.L1_data_cache_line_size();
-  _L2_data_cache_line_size = picl.L2_data_cache_line_size();
+  bool is_sun4v = (features & sun4v_m) != 0;
+  if (use_solaris_12_api && is_sun4v) {
+    // If Solaris 12 API is supported and it's sun4v use sysconf() to get the cache line sizes
+    Sysconf l1_dcache_line_size(_SC_DCACHE_LINESZ);
+    if (l1_dcache_line_size.valid()) {
+      _L1_data_cache_line_size =  l1_dcache_line_size.value();
+    }
 
+    Sysconf l2_dcache_line_size(_SC_L2CACHE_LINESZ);
+    if (l2_dcache_line_size.valid()) {
+      _L2_data_cache_line_size = l2_dcache_line_size.value();
+    }
+  } else {
+    // Otherwise figure out the cache line sizes using PICL
+    bool is_fujitsu = (features & sparc64_family_m) != 0;
+    PICL picl(is_fujitsu, is_sun4v);
+    _L1_data_cache_line_size = picl.L1_data_cache_line_size();
+    _L2_data_cache_line_size = picl.L2_data_cache_line_size();
+  }
   return features;
 }
--- a/src/share/vm/c1/c1_Canonicalizer.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/c1/c1_Canonicalizer.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -257,7 +257,38 @@
   }
 }
 
-void Canonicalizer::do_LoadIndexed    (LoadIndexed*     x) {}
+void Canonicalizer::do_LoadIndexed    (LoadIndexed*     x) {  // folds a constant-index load from a stable array constant
+  StableArrayConstant* array = x->array()->type()->as_StableArrayConstant();
+  IntConstant* index = x->index()->type()->as_IntConstant();
+
+  assert(array == NULL || FoldStableValues, "not enabled");
+
+  // Constant fold loads from stable arrays (both the array and the index must be constants).
+  if (array != NULL && index != NULL) {
+    jint idx = index->value();
+    if (idx < 0 || idx >= array->value()->length()) {
+      // Leave the load as is. The range check will handle it.
+      return;
+    }
+
+    ciConstant field_val = array->value()->element_value(idx);
+    if (!field_val.is_null_or_zero()) {  // an element still holding null/zero is not folded; the load stays
+      jint dimension = array->dimension();
+      assert(dimension <= array->value()->array_type()->dimension(), "inconsistent info");
+      ValueType* value = NULL;
+      if (dimension > 1) {
+        // Preserve information about the dimension for the element.
+        assert(field_val.as_object()->is_array(), "not an array");
+        value = new StableArrayConstant(field_val.as_object()->as_array(), dimension - 1);
+      } else {
+        assert(dimension == 1, "sanity");
+        value = as_ValueType(field_val);  // innermost dimension: plain constant of the element's type
+      }
+      set_canonical(new Constant(value));  // replace the load with the constant element
+    }
+  }
+}
+
 void Canonicalizer::do_StoreIndexed   (StoreIndexed*    x) {
   // If a value is going to be stored into a field or array some of
   // the conversions emitted by javac are unneeded because the fields
@@ -471,7 +502,7 @@
     InstanceConstant* c = x->argument_at(0)->type()->as_InstanceConstant();
     if (c != NULL && !c->value()->is_null_object()) {
       // ciInstance::java_mirror_type() returns non-NULL only for Java mirrors
-      ciType* t = c->value()->as_instance()->java_mirror_type();
+      ciType* t = c->value()->java_mirror_type();
       if (t->is_klass()) {
         // substitute cls.isInstance(obj) of a constant Class into
         // an InstantOf instruction
@@ -487,6 +518,17 @@
     }
     break;
   }
+  case vmIntrinsics::_isPrimitive        : {
+    assert(x->number_of_arguments() == 1, "wrong type");
+
+    // Class.isPrimitive is known on constant classes:
+    InstanceConstant* c = x->argument_at(0)->type()->as_InstanceConstant();
+    if (c != NULL && !c->value()->is_null_object()) {
+      ciType* t = c->value()->java_mirror_type();
+      set_constant(t->is_primitive_type());
+    }
+    break;
+  }
   }
 }
 
--- a/src/share/vm/c1/c1_Compiler.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/c1/c1_Compiler.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -148,6 +148,7 @@
   case vmIntrinsics::_longBitsToDouble:
   case vmIntrinsics::_getClass:
   case vmIntrinsics::_isInstance:
+  case vmIntrinsics::_isPrimitive:
   case vmIntrinsics::_currentThread:
   case vmIntrinsics::_dabs:
   case vmIntrinsics::_dsqrt:
@@ -228,8 +229,6 @@
   case vmIntrinsics::_getCharStringU:
   case vmIntrinsics::_putCharStringU:
 #ifdef TRACE_HAVE_INTRINSICS
-  case vmIntrinsics::_classID:
-  case vmIntrinsics::_threadID:
   case vmIntrinsics::_counterTime:
 #endif
     break;
--- a/src/share/vm/c1/c1_GraphBuilder.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/c1/c1_GraphBuilder.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1519,6 +1519,29 @@
   append(new Return(x));
 }
 
+Value GraphBuilder::make_constant(ciConstant field_value, ciField* field) {  // Constant for field_value, or NULL if it must not be folded
+  BasicType field_type = field_value.basic_type();
+  ValueType* value = as_ValueType(field_value);
+
+  // Attach dimension info to stable arrays.
+  if (FoldStableValues &&
+      field->is_stable() && field_type == T_ARRAY && !field_value.is_null_or_zero()) {
+    ciArray* array = field_value.as_object()->as_array();
+    jint dimension = field->type()->as_array_klass()->dimension();  // dimension comes from the field's declared array type
+    value = new StableArrayConstant(array, dimension);
+  }
+
+  switch (field_type) {
+    case T_ARRAY:
+    case T_OBJECT:
+      if (field_value.as_object()->should_be_constant()) {  // object/array values fold only if the ciObject agrees
+        return new Constant(value);
+      }
+      return NULL; // Not a constant.
+    default:
+      return new Constant(value);  // every other basic type is always folded
+  }
+}
 
 void GraphBuilder::access_field(Bytecodes::Code code) {
   bool will_link;
@@ -1563,22 +1586,13 @@
   switch (code) {
     case Bytecodes::_getstatic: {
       // check for compile-time constants, i.e., initialized static final fields
-      Instruction* constant = NULL;
+      Value constant = NULL;
       if (field->is_constant() && !PatchALot) {
-        ciConstant field_val = field->constant_value();
-        BasicType field_type = field_val.basic_type();
-        switch (field_type) {
-        case T_ARRAY:
-        case T_OBJECT:
-          if (field_val.as_object()->should_be_constant()) {
-            constant = new Constant(as_ValueType(field_val));
-          }
-          break;
-
-        default:
-          constant = new Constant(as_ValueType(field_val));
-        }
+        ciConstant field_value = field->constant_value();
         // Stable static fields are checked for non-default values in ciField::initialize_from().
+        assert(!field->is_stable() || !field_value.is_null_or_zero(),
+               "stable static w/ default value shouldn't be a constant");
+        constant = make_constant(field_value, field);
       }
       if (constant != NULL) {
         push(type, append(constant));
@@ -1591,38 +1605,29 @@
       }
       break;
     }
-    case Bytecodes::_putstatic:
-      { Value val = pop(type);
-        if (state_before == NULL) {
-          state_before = copy_state_for_exception();
-        }
-        append(new StoreField(append(obj), offset, field, val, true, state_before, needs_patching));
+    case Bytecodes::_putstatic: {
+      Value val = pop(type);
+      if (state_before == NULL) {
+        state_before = copy_state_for_exception();
       }
+      append(new StoreField(append(obj), offset, field, val, true, state_before, needs_patching));
       break;
+    }
     case Bytecodes::_getfield: {
       // Check for compile-time constants, i.e., trusted final non-static fields.
-      Instruction* constant = NULL;
+      Value constant = NULL;
       obj = apop();
       ObjectType* obj_type = obj->type()->as_ObjectType();
       if (obj_type->is_constant() && !PatchALot) {
         ciObject* const_oop = obj_type->constant_value();
         if (!const_oop->is_null_object() && const_oop->is_loaded()) {
           if (field->is_constant()) {
-            ciConstant field_val = field->constant_value_of(const_oop);
-            BasicType field_type = field_val.basic_type();
-            switch (field_type) {
-            case T_ARRAY:
-            case T_OBJECT:
-              if (field_val.as_object()->should_be_constant()) {
-                constant = new Constant(as_ValueType(field_val));
-              }
-              break;
-            default:
-              constant = new Constant(as_ValueType(field_val));
-            }
-            if (FoldStableValues && field->is_stable() && field_val.is_null_or_zero()) {
+            ciConstant field_value = field->constant_value_of(const_oop);
+            if (FoldStableValues && field->is_stable() && field_value.is_null_or_zero()) {
               // Stable field with default value can't be constant.
               constant = NULL;
+            } else {
+              constant = make_constant(field_value, field);
             }
           } else {
             // For CallSite objects treat the target field as a compile time constant.
@@ -3942,7 +3947,7 @@
 
 
 bool GraphBuilder::try_method_handle_inline(ciMethod* callee) {
-  ValueStack* state_before = state()->copy_for_parsing();
+  ValueStack* state_before = copy_state_before();
   vmIntrinsics::ID iid = callee->intrinsic_id();
   switch (iid) {
   case vmIntrinsics::_invokeBasic:
@@ -4032,7 +4037,7 @@
     fatal("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid));
     break;
   }
-  set_state(state_before);
+  set_state(state_before->copy_for_parsing());
   return false;
 }
 
--- a/src/share/vm/c1/c1_GraphBuilder.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/c1/c1_GraphBuilder.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -276,6 +276,7 @@
   void iterate_all_blocks(bool start_in_current_block_for_inlining = false);
   Dependencies* dependency_recorder() const; // = compilation()->dependencies()
   bool direct_compare(ciKlass* k);
+  Value make_constant(ciConstant value, ciField* field);
 
   void kill_all();
 
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -43,6 +43,9 @@
 #if INCLUDE_ALL_GCS
 #include "gc/g1/heapRegion.hpp"
 #endif // INCLUDE_ALL_GCS
+#ifdef TRACE_HAVE_INTRINSICS
+#include "trace/traceMacros.hpp"
+#endif
 
 #ifdef ASSERT
 #define __ gen()->lir(__FILE__, __LINE__)->
@@ -1293,6 +1296,25 @@
   __ move_wide(new LIR_Address(temp, in_bytes(Klass::java_mirror_offset()), T_OBJECT), result);
 }
 
+// java.lang.Class::isPrimitive(): yields 1 when the mirror's Klass* slot is NULL, otherwise 0.
+void LIRGenerator::do_isPrimitive(Intrinsic* x) {
+  assert(x->number_of_arguments() == 1, "wrong type");
+
+  LIRItem rcvr(x->argument_at(0), this);
+  rcvr.load_item();
+  LIR_Opr temp = new_register(T_METADATA);
+  LIR_Opr result = rlock_result(x);
+
+  CodeEmitInfo* info = NULL; // debug info is attached to the load only when a null check is needed
+  if (x->needs_null_check()) {
+    info = state_for(x);
+  }
+
+  __ move(new LIR_Address(rcvr.result(), java_lang_Class::klass_offset_in_bytes(), T_ADDRESS), temp, info);
+  __ cmp(lir_cond_notEqual, temp, LIR_OprFact::metadataConst(0)); // pointer-width compare; an int constant could test only the low 32 bits
+  __ cmove(lir_cond_notEqual, LIR_OprFact::intConst(0), LIR_OprFact::intConst(1), result, T_BOOLEAN);
+}
+
 
 // Example: Thread.currentThread()
 void LIRGenerator::do_currentThread(Intrinsic* x) {
@@ -3067,42 +3089,7 @@
   __ move(reg, result);
 }
 
-#ifdef TRACE_HAVE_INTRINSICS
-void LIRGenerator::do_ThreadIDIntrinsic(Intrinsic* x) {
-    LIR_Opr thread = getThreadPointer();
-    LIR_Opr osthread = new_pointer_register();
-    __ move(new LIR_Address(thread, in_bytes(JavaThread::osthread_offset()), osthread->type()), osthread);
-    size_t thread_id_size = OSThread::thread_id_size();
-    if (thread_id_size == (size_t) BytesPerLong) {
-      LIR_Opr id = new_register(T_LONG);
-      __ move(new LIR_Address(osthread, in_bytes(OSThread::thread_id_offset()), T_LONG), id);
-      __ convert(Bytecodes::_l2i, id, rlock_result(x));
-    } else if (thread_id_size == (size_t) BytesPerInt) {
-      __ move(new LIR_Address(osthread, in_bytes(OSThread::thread_id_offset()), T_INT), rlock_result(x));
-    } else {
-      ShouldNotReachHere();
-    }
-}
-
-void LIRGenerator::do_ClassIDIntrinsic(Intrinsic* x) {
-    CodeEmitInfo* info = state_for(x);
-    CodeEmitInfo* info2 = new CodeEmitInfo(info); // Clone for the second null check
-    BasicType klass_pointer_type = NOT_LP64(T_INT) LP64_ONLY(T_LONG);
-    assert(info != NULL, "must have info");
-    LIRItem arg(x->argument_at(1), this);
-    arg.load_item();
-    LIR_Opr klass = new_pointer_register();
-    __ move(new LIR_Address(arg.result(), java_lang_Class::klass_offset_in_bytes(), klass_pointer_type), klass, info);
-    LIR_Opr id = new_register(T_LONG);
-    ByteSize offset = TRACE_ID_OFFSET;
-    LIR_Address* trace_id_addr = new LIR_Address(klass, in_bytes(offset), T_LONG);
-    __ move(trace_id_addr, id);
-    __ logical_or(id, LIR_OprFact::longConst(0x01l), id);
-    __ store(id, trace_id_addr);
-    __ logical_and(id, LIR_OprFact::longConst(~0x3l), id);
-    __ move(id, rlock_result(x));
-}
-#endif
+
 
 void LIRGenerator::do_Intrinsic(Intrinsic* x) {
   switch (x->id()) {
@@ -3115,8 +3102,6 @@
   }
 
 #ifdef TRACE_HAVE_INTRINSICS
-  case vmIntrinsics::_threadID: do_ThreadIDIntrinsic(x); break;
-  case vmIntrinsics::_classID: do_ClassIDIntrinsic(x); break;
   case vmIntrinsics::_counterTime:
     do_RuntimeCall(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), x);
     break;
@@ -3132,6 +3117,7 @@
 
   case vmIntrinsics::_Object_init:    do_RegisterFinalizer(x); break;
   case vmIntrinsics::_isInstance:     do_isInstance(x);    break;
+  case vmIntrinsics::_isPrimitive:    do_isPrimitive(x);   break;
   case vmIntrinsics::_getClass:       do_getClass(x);      break;
   case vmIntrinsics::_currentThread:  do_currentThread(x); break;
 
--- a/src/share/vm/c1/c1_LIRGenerator.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/c1/c1_LIRGenerator.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -246,6 +246,7 @@
 
   void do_RegisterFinalizer(Intrinsic* x);
   void do_isInstance(Intrinsic* x);
+  void do_isPrimitive(Intrinsic* x);
   void do_getClass(Intrinsic* x);
   void do_currentThread(Intrinsic* x);
   void do_MathIntrinsic(Intrinsic* x);
@@ -440,10 +441,7 @@
   void do_SwitchRanges(SwitchRangeArray* x, LIR_Opr value, BlockBegin* default_sux);
 
   void do_RuntimeCall(address routine, Intrinsic* x);
-#ifdef TRACE_HAVE_INTRINSICS
-  void do_ThreadIDIntrinsic(Intrinsic* x);
-  void do_ClassIDIntrinsic(Intrinsic* x);
-#endif
+
   ciKlass* profile_type(ciMethodData* md, int md_first_offset, int md_offset, intptr_t profiled_k,
                         Value arg, LIR_Opr& mdp, bool not_null, ciKlass* signature_at_call_k,
                         ciKlass* callee_signature_k);
--- a/src/share/vm/c1/c1_Runtime1.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/c1/c1_Runtime1.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -335,6 +335,7 @@
   NOT_PRODUCT(_new_instance_slowcase_cnt++;)
 
   assert(klass->is_klass(), "not a class");
+  Handle holder(THREAD, klass->klass_holder()); // keep the klass alive
   instanceKlassHandle h(thread, klass);
   h->check_valid_for_instantiation(true, CHECK);
   // make sure klass is initialized
@@ -370,6 +371,7 @@
   //       anymore after new_objArray() and no GC can happen before.
   //       (This may have to change if this code changes!)
   assert(array_klass->is_klass(), "not a class");
+  Handle holder(THREAD, array_klass->klass_holder()); // keep the klass alive
   Klass* elem_klass = ObjArrayKlass::cast(array_klass)->element_klass();
   objArrayOop obj = oopFactory::new_objArray(elem_klass, length, CHECK);
   thread->set_vm_result(obj);
@@ -386,6 +388,7 @@
 
   assert(klass->is_klass(), "not a class");
   assert(rank >= 1, "rank must be nonzero");
+  Handle holder(THREAD, klass->klass_holder()); // keep the klass alive
   oop obj = ArrayKlass::cast(klass)->multi_allocate(rank, dims, CHECK);
   thread->set_vm_result(obj);
 JRT_END
--- a/src/share/vm/c1/c1_ValueType.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/c1/c1_ValueType.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -45,6 +45,7 @@
 class     ObjectConstant;
 class     ArrayType;
 class       ArrayConstant;
+class         StableArrayConstant;
 class     InstanceType;
 class       InstanceConstant;
 class   MetadataType;
@@ -168,6 +169,7 @@
   virtual MethodConstant*   as_MethodConstant()  { return NULL; }
   virtual MethodDataConstant* as_MethodDataConstant() { return NULL; }
   virtual ArrayConstant*    as_ArrayConstant()   { return NULL; }
+  virtual StableArrayConstant* as_StableArrayConstant()   { return NULL; }
   virtual AddressConstant*  as_AddressConstant() { return NULL; }
 
   // type operations
@@ -355,6 +357,20 @@
   virtual ciType* exact_type() const;
 };
 
+class StableArrayConstant: public ArrayConstant { // ArrayConstant that additionally records a dimension count
+ private:
+  jint _dimension; // dimension value supplied at construction; asserted to be > 0
+
+ public:
+  StableArrayConstant(ciArray* value, jint dimension) : ArrayConstant(value) {
+    assert(dimension > 0, "not a stable array");
+    _dimension = dimension;
+  }
+
+  jint dimension() const                              { return _dimension; }
+
+  virtual StableArrayConstant* as_StableArrayConstant() { return this; } // overrides the default declared earlier in this header, which returns NULL
+};
 
 class InstanceType: public ObjectType {
  public:
--- a/src/share/vm/ci/ciArray.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/ci/ciArray.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -107,8 +107,9 @@
   intptr_t header = arrayOopDesc::base_offset_in_bytes(elembt);
   intptr_t index = (element_offset - header) >> shift;
   intptr_t offset = header + ((intptr_t)index << shift);
-  if (offset != element_offset || index != (jint)index)
+  if (offset != element_offset || index != (jint)index || index < 0 || index >= length()) {
     return ciConstant();
+  }
   return element_value((jint) index);
 }
 
--- a/src/share/vm/ci/ciMethodData.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/ci/ciMethodData.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -81,7 +81,7 @@
 void ciMethodData::load_extra_data() {
   MethodData* mdo = get_MethodData();
 
-  MutexLocker(mdo->extra_data_lock());
+  MutexLocker ml(mdo->extra_data_lock());
 
   // speculative trap entries also hold a pointer to a Method so need to be translated
   DataLayout* dp_src  = mdo->extra_data_base();
@@ -103,16 +103,13 @@
 
     switch(tag) {
     case DataLayout::speculative_trap_data_tag: {
-      ciSpeculativeTrapData* data_dst = new ciSpeculativeTrapData(dp_dst);
-      SpeculativeTrapData* data_src = new SpeculativeTrapData(dp_src);
+      ciSpeculativeTrapData data_dst(dp_dst);
+      SpeculativeTrapData   data_src(dp_src);
 
-      data_dst->translate_from(data_src);
-
-#ifdef ASSERT
-      SpeculativeTrapData* data_src2 = new SpeculativeTrapData(dp_src);
-      assert(data_src2->method() == data_src->method() && data_src2->bci() == data_src->bci(), "entries changed while translating");
-#endif
-
+      { // During translation a safepoint can happen or VM lock can be taken (e.g., Compile_lock).
+        MutexUnlocker ml(mdo->extra_data_lock());
+        data_dst.translate_from(&data_src);
+      }
       break;
     }
     case DataLayout::bit_data_tag:
@@ -120,9 +117,11 @@
     case DataLayout::no_tag:
     case DataLayout::arg_info_data_tag:
       // An empty slot or ArgInfoData entry marks the end of the trap data
-      return;
+      {
+        return; // Need a block to avoid SS compiler bug
+      }
     default:
-      fatal("bad tag = %d", dp_dst->tag());
+      fatal("bad tag = %d", tag);
     }
   }
 }
--- a/src/share/vm/classfile/classFileParser.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/classFileParser.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -5380,7 +5380,7 @@
     }
   }
 
-  TRACE_INIT_ID(ik);
+  TRACE_INIT_KLASS_ID(ik);
 
   // If we reach here, all is well.
   // Now remove the InstanceKlass* from the _klass_to_deallocate field
--- a/src/share/vm/classfile/classLoader.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/classLoader.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -37,6 +37,7 @@
 #include "gc/shared/generation.hpp"
 #include "interpreter/bytecodeStream.hpp"
 #include "interpreter/oopMapCache.hpp"
+#include "logging/logTag.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/filemap.hpp"
 #include "memory/oopFactory.hpp"
@@ -417,34 +418,30 @@
 #if INCLUDE_CDS
 void ClassLoader::exit_with_path_failure(const char* error, const char* message) {
   assert(DumpSharedSpaces, "only called at dump time");
-  tty->print_cr("Hint: enable -XX:+TraceClassPaths to diagnose the failure");
+  tty->print_cr("Hint: enable -Xlog:classpath=info to diagnose the failure");
   vm_exit_during_initialization(error, message);
 }
 #endif
 
-void ClassLoader::trace_class_path(outputStream* out, const char* msg, const char* name) {
-  if (!TraceClassPaths) {
-    return;
-  }
-
-  if (msg) {
-    out->print("%s", msg);
-  }
-  if (name) {
-    if (strlen(name) < 256) {
-      out->print("%s", name);
-    } else {
-      // For very long paths, we need to print each character separately,
-      // as print_cr() has a length limit
-      while (name[0] != '\0') {
-        out->print("%c", name[0]);
-        name++;
+void ClassLoader::trace_class_path(const char* msg, const char* name) {
+  if (log_is_enabled(Info, classpath)) {
+    ResourceMark rm;
+    outputStream* out = LogHandle(classpath)::info_stream();
+    if (msg) {
+      out->print("%s", msg);
+    }
+    if (name) {
+      if (strlen(name) < 256) {
+        out->print("%s", name);
+      } else {
+        // For very long paths, we need to print each character separately,
+        // as print_cr() has a length limit
+        while (name[0] != '\0') {
+          out->print("%c", name[0]);
+          name++;
+        }
       }
     }
-  }
-  if (msg && msg[0] == '[') {
-    out->print_cr("]");
-  } else {
     out->cr();
   }
 }
@@ -470,11 +467,13 @@
 void ClassLoader::setup_bootstrap_search_path() {
   assert(_first_entry == NULL, "should not setup bootstrap class search path twice");
   const char* sys_class_path = Arguments::get_sysclasspath();
+  const char* java_class_path = Arguments::get_appclasspath();
   if (PrintSharedArchiveAndExit) {
     // Don't print sys_class_path - this is the bootcp of this current VM process, not necessarily
     // the same as the bootcp of the shared archive.
   } else {
-    trace_class_path(tty, "[Bootstrap loader class path=", sys_class_path);
+    trace_class_path("bootstrap loader class path=", sys_class_path);
+    trace_class_path("classpath: ", java_class_path);
   }
 #if INCLUDE_CDS
   if (DumpSharedSpaces) {
@@ -578,9 +577,7 @@
         }
       }
     }
-    if (TraceClassPaths) {
-      tty->print_cr("[Opened %s]", path);
-    }
+    log_info(classpath)("opened: %s", path);
     log_info(classload)("opened: %s", path);
   } else {
     // Directory
--- a/src/share/vm/classfile/classLoader.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/classLoader.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -331,7 +331,7 @@
   static void  exit_with_path_failure(const char* error, const char* message);
 #endif
 
-  static void  trace_class_path(outputStream* out, const char* msg, const char* name = NULL);
+  static void  trace_class_path(const char* msg, const char* name = NULL);
 
   // VM monitoring and management support
   static jlong classloader_time_ms();
--- a/src/share/vm/classfile/dictionary.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/dictionary.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -135,8 +135,10 @@
     //          via a store to _pd_set.
     OrderAccess::release_store_ptr(&_pd_set, new_head);
   }
-  if (TraceProtectionDomainVerification && WizardMode) {
-    print();
+  if (log_is_enabled(Trace, protectiondomain)) {
+    ResourceMark rm;
+    outputStream* log = LogHandle(protectiondomain)::trace_stream();
+    print_count(log);
   }
 }
 
--- a/src/share/vm/classfile/dictionary.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/dictionary.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -29,6 +29,7 @@
 #include "oops/instanceKlass.hpp"
 #include "oops/oop.hpp"
 #include "utilities/hashtable.hpp"
+#include "utilities/ostream.hpp"
 
 class DictionaryEntry;
 class PSPromotionManager;
@@ -323,14 +324,14 @@
     return (klass->name() == class_name && _loader_data == loader_data);
   }
 
-  void print() {
+  void print_count(outputStream *st) {
     int count = 0;
     for (ProtectionDomainEntry* current = _pd_set;
                                 current != NULL;
                                 current = current->_next) {
       count++;
     }
-    tty->print_cr("pd set = #%d", count);
+    st->print_cr("pd set count = #%d", count);
   }
 };
 
--- a/src/share/vm/classfile/klassFactory.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/klassFactory.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
-* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 2015, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -29,6 +29,7 @@
 #include "classfile/klassFactory.hpp"
 #include "memory/resourceArea.hpp"
 #include "prims/jvmtiEnvBase.hpp"
+#include "trace/traceMacros.hpp"
 
 static ClassFileStream* prologue(ClassFileStream* stream,
                                  Symbol* name,
@@ -136,5 +137,7 @@
     result->set_cached_class_file(cached_class_file);
   }
 
+  TRACE_KLASS_CREATION(result, parser, THREAD);
+
   return result;
 }
--- a/src/share/vm/classfile/sharedPathsMiscInfo.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/sharedPathsMiscInfo.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -26,15 +26,15 @@
 #include "classfile/classLoader.hpp"
 #include "classfile/classLoaderData.inline.hpp"
 #include "classfile/sharedPathsMiscInfo.hpp"
+#include "logging/log.hpp"
 #include "memory/allocation.inline.hpp"
 #include "memory/metaspaceShared.hpp"
 #include "runtime/arguments.hpp"
+#include "utilities/ostream.hpp"
 
 void SharedPathsMiscInfo::add_path(const char* path, int type) {
-  if (TraceClassPaths) {
-    tty->print("[type=%s] ", type_name(type));
-    trace_class_path("[Add misc shared path ", path);
-  }
+  log_info(classpath)("type=%s ", type_name(type));
+  ClassLoader::trace_class_path("add misc shared path ", path);
   write(path, strlen(path) + 1);
   write_jint(jint(type));
 }
@@ -67,11 +67,29 @@
 }
 
 bool SharedPathsMiscInfo::fail(const char* msg, const char* name) {
-  ClassLoader::trace_class_path(tty, msg, name);
+  ClassLoader::trace_class_path(msg, name);
   MetaspaceShared::set_archive_loading_failed();
   return false;
 }
 
+void SharedPathsMiscInfo::print_path(int type, const char* path) { // Logs, on the classpath info stream, what is expected of path for this entry type.
+  ResourceMark rm;
+  outputStream* out = LogHandle(classpath)::info_stream();
+  switch (type) {
+  case BOOT:
+    out->print_cr("Expecting -Dsun.boot.class.path=%s", path); // print_cr: no caller terminates the line any more
+    break;
+  case NON_EXIST:
+    out->print_cr("Expecting that %s does not exist", path);
+    break;
+  case REQUIRED:
+    out->print_cr("Expecting that file %s must exist and is not altered", path);
+    break;
+  default:
+    ShouldNotReachHere(); // only BOOT, NON_EXIST and REQUIRED are handled here
+  }
+}
+
 bool SharedPathsMiscInfo::check() {
   // The whole buffer must be 0 terminated so that we can use strlen and strcmp
   // without fear.
@@ -90,17 +108,14 @@
     if (!read_jint(&type)) {
       return fail("Corrupted archive file header");
     }
-    if (TraceClassPaths) {
-      tty->print("[type=%s ", type_name(type));
-      print_path(tty, type, path);
-      tty->print_cr("]");
-    }
+    log_info(classpath)("type=%s ", type_name(type));
+    print_path(type, path);
     if (!check(type, path)) {
       if (!PrintSharedArchiveAndExit) {
         return false;
       }
     } else {
-      trace_class_path("[ok");
+      ClassLoader::trace_class_path("ok");
     }
   }
 
--- a/src/share/vm/classfile/sharedPathsMiscInfo.hpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/sharedPathsMiscInfo.hpp	Thu Mar 17 17:03:20 2016 +0000
@@ -64,9 +64,6 @@
   void write(const void* ptr, size_t size);
   bool read(void* ptr, size_t size);
 
-  static void trace_class_path(const char* msg, const char* name = NULL) {
-    ClassLoader::trace_class_path(tty, msg, name);
-  }
 protected:
   static bool fail(const char* msg, const char* name = NULL);
   virtual bool check(jint type, const char* path);
@@ -144,21 +141,7 @@
     }
   }
 
-  virtual void print_path(outputStream* out, int type, const char* path) {
-    switch (type) {
-    case BOOT:
-      out->print("Expecting -Dsun.boot.class.path=%s", path);
-      break;
-    case NON_EXIST:
-      out->print("Expecting that %s does not exist", path);
-      break;
-    case REQUIRED:
-      out->print("Expecting that file %s must exist and is not altered", path);
-      break;
-    default:
-      ShouldNotReachHere();
-    }
-  }
+  virtual void print_path(int type, const char* path);
 
   bool check();
   bool read_jint(jint *ptr) {
--- a/src/share/vm/classfile/stringTable.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/stringTable.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -200,7 +200,6 @@
   return string;
 }
 
-
 oop StringTable::intern(Handle string_or_null, jchar* name,
                         int len, TRAPS) {
   oop found_string = lookup_shared(name, len);
@@ -214,7 +213,9 @@
 
   // Found
   if (found_string != NULL) {
-    ensure_string_alive(found_string);
+    if (found_string != string_or_null()) {
+      ensure_string_alive(found_string);
+    }
     return found_string;
   }
 
@@ -249,7 +250,9 @@
                                   hashValue, CHECK_NULL);
   }
 
-  ensure_string_alive(added_or_found);
+  if (added_or_found != string()) {
+    ensure_string_alive(added_or_found);
+  }
 
   return added_or_found;
 }
--- a/src/share/vm/classfile/systemDictionary.cpp	Thu Feb 25 14:59:44 2016 +0000
+++ b/src/share/vm/classfile/systemDictionary.cpp	Thu Mar 17 17:03:20 2016 +0000
@@ -430,12 +430,15 @@
 
   // Now we have to call back to java to check if the initating class has access
   JavaValue result(T_VOID);
-  if (TraceProtectionDomainVerification) {
+  if (log_is_enabled(Debug, protectiondomain)) {
+    ResourceMark rm;
     // Print out trace information
-    tty->print_cr("Checking package access");
-    tty->print(" - class loader:      "); class_loader()->print_value_on(tty);      tty->cr();
-    tty->print(" - protection domain: "); protection_domain()->print_value_on(tty); tty->cr();
-    tty->print(" - loading:           "); klass()->print_value_on(tty);             tty->cr();
+    outputStream* log = LogHandle(protectiondomain)::debug_stream();
+    log->print_cr("Checking package access");
+    log->print("class loader: "); class_loader()->print_value_on(log);
+    log->print(" protection domain: "); protection_domain()->print_value_on(log);
+    log->print(" loading: "); klass()->print_value_on(log);
+    log->cr();
   }
 
   KlassHandle system_loader(THREAD, SystemDictionary::ClassLoader_klass());
@@ -448,13 +451,10 @@
                          protection_domain,
                          THREAD);
 
-  if (TraceProtectionDomainVerification) {
-    if (HAS_PENDING_EXCEPTION) {
-      tty->print_cr(" -> DENIED !!!!!!!!!!!!!!!!!!!!!");
-    } else {
-