changeset 50839:8e326bd343bb

Merge
author prr
date Mon, 25 Jun 2018 10:21:50 -0700
parents 732a3b600098 5637aca18f1d
children 1a9ebf66fd9f
files src/hotspot/share/gc/cms/cmsOopClosures.cpp src/hotspot/share/gc/cms/cms_specialized_oop_closures.hpp src/hotspot/share/gc/cms/parOopClosures.cpp src/hotspot/share/gc/g1/g1_specialized_oop_closures.hpp src/hotspot/share/gc/serial/serial_specialized_oop_closures.hpp src/hotspot/share/gc/shared/genOopClosures.cpp src/hotspot/share/gc/shared/specialized_oop_closures.hpp src/hotspot/share/gc/z/z_specialized_oop_closures.hpp test/jdk/ProblemList.txt
diffstat 636 files changed, 32957 insertions(+), 5148 deletions(-)
--- a/.hgtags	Mon Jun 25 14:32:46 2018 +0530
+++ b/.hgtags	Mon Jun 25 10:21:50 2018 -0700
@@ -491,3 +491,5 @@
 64e4b1686141e57a681936a8283983341484676e jdk-11+17
 e1b3def126240d5433902f3cb0e91a4c27f6db50 jdk-11+18
 fb8b3f4672774e15654958295558a1af1b576919 jdk-11+19
+fb8b3f4672774e15654958295558a1af1b576919 jdk-11+19
+36ca515343e00b021dcfc902e986d26ec994a2e5 jdk-11+19
--- a/bin/nashorn/runopt.sh	Mon Jun 25 14:32:46 2018 +0530
+++ b/bin/nashorn/runopt.sh	Mon Jun 25 10:21:50 2018 -0700
@@ -1,6 +1,6 @@
 #!/bin/sh
 #
-# Copyright (c) 2010, 2014, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2010, 2018, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 # 
 # This code is free software; you can redistribute it and/or modify it
@@ -75,7 +75,6 @@
 #
 # see above - already in place, copy the flags down here to disable
 ENABLE_FLIGHT_RECORDER_FLAGS="\
-    -XX:+UnlockCommercialFeatures \
     -XX:+FlightRecorder \
     -XX:FlightRecorderOptions=defaultrecording=true,disk=true,dumponexit=true,dumponexitpath=$JFR_FILENAME,stackdepth=1024"
 
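Aside on the hunk above: JDK 11 open-sourced Flight Recorder, so -XX:+UnlockCommercialFeatures is no longer required to enable it. For illustration only -- the class name JfrSketch is invented -- an equivalent recording can be produced programmatically with the jdk.jfr API:

    import java.nio.file.Path;
    import jdk.jfr.Recording;

    public class JfrSketch {
        public static void main(String[] args) throws Exception {
            // equivalent of disk=true,dumponexit=true,dumponexitpath=... above
            try (Recording r = new Recording()) {
                r.setToDisk(true);
                r.setDumpOnExit(true);
                r.start();
                // ... run the workload being profiled ...
                r.dump(Path.of("test_suite.jfr"));
            }
        }
    }
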
--- a/make/autoconf/flags-ldflags.m4	Mon Jun 25 14:32:46 2018 +0530
+++ b/make/autoconf/flags-ldflags.m4	Mon Jun 25 10:21:50 2018 -0700
@@ -81,7 +81,7 @@
 
   elif test "x$TOOLCHAIN_TYPE" = xclang; then
     BASIC_LDFLAGS_JVM_ONLY="-mno-omit-leaf-frame-pointer -mstack-alignment=16 \
-        -stdlib=libc++ -fPIC"
+        -fPIC"
 
   elif test "x$TOOLCHAIN_TYPE" = xsolstudio; then
     BASIC_LDFLAGS="-Wl,-z,defs"
--- a/make/conf/jib-profiles.js	Mon Jun 25 14:32:46 2018 +0530
+++ b/make/conf/jib-profiles.js	Mon Jun 25 10:21:50 2018 -0700
@@ -832,7 +832,7 @@
 
     var devkit_platform_revisions = {
         linux_x64: "gcc7.3.0-OEL6.4+1.0",
-        macosx_x64: "Xcode6.3-MacOSX10.9+1.0",
+        macosx_x64: "Xcode9.4-MacOSX10.13+1.0",
         solaris_x64: "SS12u4-Solaris11u1+1.0",
         solaris_sparcv9: "SS12u4-Solaris11u1+1.1",
         windows_x64: "VS2017-15.5.5+1.0",
--- a/make/data/jdwp/jdwp.spec	Mon Jun 25 14:32:46 2018 +0530
+++ b/make/data/jdwp/jdwp.spec	Mon Jun 25 10:21:50 2018 -0700
@@ -395,8 +395,8 @@
                      "Can the VM add methods when redefining "
                      "classes?")
             (boolean canUnrestrictedlyRedefineClasses
-                     "Can the VM redefine classes"
-                     "in arbitrary ways?")
+                     "Can the VM redefine classes "
+                     "in ways that are normally restricted?")
             (boolean canPopFrames
                      "Can the VM pop stack frames?")
             (boolean canUseInstanceFilters
@@ -460,12 +460,23 @@
         "<a href=\"#JDWP_StackFrame_PopFrames\">PopFrames</a> command can be used "
         "to pop frames with obsolete methods."
         "<p>"
+        "Unless the canUnrestrictedlyRedefineClasses capability is present the following "
+        "redefinitions are restricted: "
+        "<ul>"
+        "<li>changing the schema (the fields)</li>"
+        "<li>changing the hierarchy (superclasses, interfaces)</li>"
+        "<li>deleting a method</li>"
+        "<li>changing class modifiers</li>"
+        "<li>changing method modifiers</li>"
+        "<li>changing the <code>NestHost</code> or <code>NestMembers</code> class attributes</li>"
+        "</ul>"
+        "<p>"
         "Requires canRedefineClasses capability - see "
         "<a href=\"#JDWP_VirtualMachine_CapabilitiesNew\">CapabilitiesNew</a>. "
         "In addition to the canRedefineClasses capability, the target VM must "
         "have the canAddMethod capability to add methods when redefining classes, "
-        "or the canUnrestrictedlyRedefineClasses to redefine classes in arbitrary "
-        "ways."
+        "or the canUnrestrictedlyRedefineClasses capability to redefine classes in ways "
+        "that are normally restricted."
         (Out
             (Repeat classes "Number of reference types that follow."
                 (Group ClassDef
@@ -496,6 +507,7 @@
             (Error DELETE_METHOD_NOT_IMPLEMENTED)
             (Error CLASS_MODIFIERS_CHANGE_NOT_IMPLEMENTED)
             (Error METHOD_MODIFIERS_CHANGE_NOT_IMPLEMENTED)
+            (Error CLASS_ATTRIBUTE_CHANGE_NOT_IMPLEMENTED)
             (Error VM_DEAD)
         )
     )
@@ -3148,12 +3160,16 @@
                                           "different from the name in the old class object.")
     (Constant CLASS_MODIFIERS_CHANGE_NOT_IMPLEMENTED
                                      =70  "The new class version has different modifiers and "
-                                          "and canUnrestrictedlyRedefineClasses is false.")
+                                          "canUnrestrictedlyRedefineClasses is false.")
     (Constant METHOD_MODIFIERS_CHANGE_NOT_IMPLEMENTED
                                      =71  "A method in the new class version has "
                                           "different modifiers "
                                           "than its counterpart in the old class version and "
-                                          "and canUnrestrictedlyRedefineClasses is false.")
+                                          "canUnrestrictedlyRedefineClasses is false.")
+    (Constant CLASS_ATTRIBUTE_CHANGE_NOT_IMPLEMENTED
+                                     =72  "The new class version has a different NestHost or "
+                                          "NestMembers class attribute and "
+                                          "canUnrestrictedlyRedefineClasses is false.")
     (Constant NOT_IMPLEMENTED        =99  "The functionality is not implemented in "
                                           "this virtual machine.")
     (Constant NULL_POINTER           =100 "Invalid pointer.")
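
Illustration of the capability split documented above, seen from JDI (the Java-side mirror of these JDWP capabilities). A minimal sketch; RedefineSketch and its helper method are invented names:

    import java.util.Map;
    import com.sun.jdi.ReferenceType;
    import com.sun.jdi.VirtualMachine;

    class RedefineSketch {
        static void redefine(VirtualMachine vm, ReferenceType type, byte[] newBytes) {
            if (!vm.canRedefineClasses()) {
                throw new UnsupportedOperationException("target VM cannot redefine classes");
            }
            if (!vm.canUnrestrictedlyRedefineClasses()) {
                // restricted mode: schema, hierarchy, modifier and
                // NestHost/NestMembers attribute changes will fail, e.g. with
                // error 72 (CLASS_ATTRIBUTE_CHANGE_NOT_IMPLEMENTED)
            }
            vm.redefineClasses(Map.of(type, newBytes));
        }
    }
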
--- a/make/data/symbols/jdk.scripting.nashorn-7.sym.txt	Mon Jun 25 14:32:46 2018 +0530
+++ b/make/data/symbols/jdk.scripting.nashorn-7.sym.txt	Mon Jun 25 10:21:50 2018 -0700
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -28,6 +28,8 @@
 #
 -class name jdk/nashorn/api/scripting/AbstractJSObject
 
+-class name jdk/nashorn/api/scripting/ClassFilter
+
 -class name jdk/nashorn/api/scripting/JSObject
 
 -class name jdk/nashorn/api/scripting/NashornException
--- a/make/data/symbols/jdk.scripting.nashorn-8.sym.txt	Mon Jun 25 14:32:46 2018 +0530
+++ b/make/data/symbols/jdk.scripting.nashorn-8.sym.txt	Mon Jun 25 10:21:50 2018 -0700
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -27,7 +27,7 @@
 # ##########################################################
 #
 class name jdk/nashorn/api/scripting/AbstractJSObject
-header extends java/lang/Object implements jdk/nashorn/api/scripting/JSObject flags 421
+header extends java/lang/Object implements jdk/nashorn/api/scripting/JSObject flags 421 runtimeAnnotations @Ljdk/Exported;
 method name <init> descriptor ()V flags 1
 method name call descriptor (Ljava/lang/Object;[Ljava/lang/Object;)Ljava/lang/Object; flags 81
 method name newObject descriptor ([Ljava/lang/Object;)Ljava/lang/Object; flags 81
@@ -47,10 +47,16 @@
 method name isFunction descriptor ()Z flags 1
 method name isStrictFunction descriptor ()Z flags 1
 method name isArray descriptor ()Z flags 1
-method name toNumber descriptor ()D flags 1
+method name toNumber descriptor ()D flags 1 deprecated true runtimeAnnotations @Ljava/lang/Deprecated;
+method name getDefaultValue descriptor (Ljava/lang/Class;)Ljava/lang/Object; flags 1 signature (Ljava/lang/Class<*>;)Ljava/lang/Object;
+method name getDefaultValue descriptor (Ljdk/nashorn/api/scripting/JSObject;Ljava/lang/Class;)Ljava/lang/Object; flags 9 signature (Ljdk/nashorn/api/scripting/JSObject;Ljava/lang/Class<*>;)Ljava/lang/Object;
+
+class name jdk/nashorn/api/scripting/ClassFilter
+header extends java/lang/Object flags 601 runtimeAnnotations @Ljdk/Exported;
+method name exposeToScripts descriptor (Ljava/lang/String;)Z flags 401
 
 class name jdk/nashorn/api/scripting/JSObject
-header extends java/lang/Object flags 601
+header extends java/lang/Object flags 601 runtimeAnnotations @Ljdk/Exported;
 method name call descriptor (Ljava/lang/Object;[Ljava/lang/Object;)Ljava/lang/Object; flags 481
 method name newObject descriptor ([Ljava/lang/Object;)Ljava/lang/Object; flags 481
 method name eval descriptor (Ljava/lang/String;)Ljava/lang/Object; flags 401
@@ -69,22 +75,28 @@
 method name isFunction descriptor ()Z flags 401
 method name isStrictFunction descriptor ()Z flags 401
 method name isArray descriptor ()Z flags 401
-method name toNumber descriptor ()D flags 401
+method name toNumber descriptor ()D flags 401 deprecated true runtimeAnnotations @Ljava/lang/Deprecated;
 
 class name jdk/nashorn/api/scripting/NashornException
-header extends java/lang/RuntimeException flags 421
-field name ENGINE_SCRIPT_SOURCE_NAME descriptor Ljava/lang/String; constantValue nashorn:engine/resources/engine.js flags 19
+header extends java/lang/RuntimeException flags 421 runtimeAnnotations @Ljdk/Exported;
 method name <init> descriptor (Ljava/lang/String;Ljava/lang/String;II)V flags 4
 method name <init> descriptor (Ljava/lang/String;Ljava/lang/Throwable;Ljava/lang/String;II)V flags 4
 method name <init> descriptor (Ljava/lang/String;Ljava/lang/Throwable;)V flags 4
 method name getFileName descriptor ()Ljava/lang/String; flags 11
+method name setFileName descriptor (Ljava/lang/String;)V flags 11
 method name getLineNumber descriptor ()I flags 11
+method name setLineNumber descriptor (I)V flags 11
 method name getColumnNumber descriptor ()I flags 11
+method name setColumnNumber descriptor (I)V flags 11
 method name getScriptFrames descriptor (Ljava/lang/Throwable;)[Ljava/lang/StackTraceElement; flags 9
 method name getScriptStackString descriptor (Ljava/lang/Throwable;)Ljava/lang/String; flags 9
+method name getThrown descriptor ()Ljava/lang/Object; flags 4
+method name initEcmaError descriptor (Ljdk/nashorn/internal/runtime/ScriptObject;)Ljdk/nashorn/api/scripting/NashornException; flags 4
+method name getEcmaError descriptor ()Ljava/lang/Object; flags 1
+method name setEcmaError descriptor (Ljava/lang/Object;)V flags 1
 
 class name jdk/nashorn/api/scripting/NashornScriptEngine
-header extends javax/script/AbstractScriptEngine implements javax/script/Compilable,javax/script/Invocable flags 31
+header extends javax/script/AbstractScriptEngine implements javax/script/Compilable,javax/script/Invocable flags 31 runtimeAnnotations @Ljdk/Exported;
 innerclass innerClass java/lang/invoke/MethodHandles$Lookup outerClass java/lang/invoke/MethodHandles innerClassName Lookup flags 19
 field name NASHORN_GLOBAL descriptor Ljava/lang/String; constantValue nashorn.global flags 19
 method name eval descriptor (Ljava/io/Reader;Ljavax/script/ScriptContext;)Ljava/lang/Object; thrownTypes javax/script/ScriptException flags 1
@@ -97,10 +109,9 @@
 method name invokeMethod descriptor (Ljava/lang/Object;Ljava/lang/String;[Ljava/lang/Object;)Ljava/lang/Object; thrownTypes javax/script/ScriptException,java/lang/NoSuchMethodException flags 81
 method name getInterface descriptor (Ljava/lang/Class;)Ljava/lang/Object; flags 1 signature <T:Ljava/lang/Object;>(Ljava/lang/Class<TT;>;)TT;
 method name getInterface descriptor (Ljava/lang/Object;Ljava/lang/Class;)Ljava/lang/Object; flags 1 signature <T:Ljava/lang/Object;>(Ljava/lang/Object;Ljava/lang/Class<TT;>;)TT;
-method name __noSuchProperty__ descriptor (Ljava/lang/Object;Ljavax/script/ScriptContext;Ljava/lang/String;)Ljava/lang/Object; flags 1
 
 class name jdk/nashorn/api/scripting/NashornScriptEngineFactory
-header extends java/lang/Object implements javax/script/ScriptEngineFactory flags 31
+header extends java/lang/Object implements javax/script/ScriptEngineFactory flags 31 runtimeAnnotations @Ljdk/Exported;
 method name <init> descriptor ()V flags 1
 method name getEngineName descriptor ()Ljava/lang/String; flags 1
 method name getEngineVersion descriptor ()Ljava/lang/String; flags 1
@@ -115,11 +126,13 @@
 method name getProgram descriptor ([Ljava/lang/String;)Ljava/lang/String; flags 81
 method name getScriptEngine descriptor ()Ljavax/script/ScriptEngine; flags 1
 method name getScriptEngine descriptor (Ljava/lang/ClassLoader;)Ljavax/script/ScriptEngine; flags 1
-method name getScriptEngine descriptor ([Ljava/lang/String;)Ljavax/script/ScriptEngine; flags 1
+method name getScriptEngine descriptor (Ljdk/nashorn/api/scripting/ClassFilter;)Ljavax/script/ScriptEngine; flags 1
+method name getScriptEngine descriptor ([Ljava/lang/String;)Ljavax/script/ScriptEngine; flags 81
 method name getScriptEngine descriptor ([Ljava/lang/String;Ljava/lang/ClassLoader;)Ljavax/script/ScriptEngine; flags 1
+method name getScriptEngine descriptor ([Ljava/lang/String;Ljava/lang/ClassLoader;Ljdk/nashorn/api/scripting/ClassFilter;)Ljavax/script/ScriptEngine; flags 1
 
 class name jdk/nashorn/api/scripting/ScriptObjectMirror
-header extends jdk/nashorn/api/scripting/AbstractJSObject implements javax/script/Bindings flags 31
+header extends jdk/nashorn/api/scripting/AbstractJSObject implements javax/script/Bindings flags 31 runtimeAnnotations @Ljdk/Exported;
 innerclass innerClass java/util/Map$Entry outerClass java/util/Map innerClassName Entry flags 609
 method name equals descriptor (Ljava/lang/Object;)Z flags 1
 method name hashCode descriptor ()I flags 1
@@ -135,6 +148,7 @@
 method name removeMember descriptor (Ljava/lang/String;)V flags 1
 method name setMember descriptor (Ljava/lang/String;Ljava/lang/Object;)V flags 1
 method name setSlot descriptor (ILjava/lang/Object;)V flags 1
+method name setIndexedPropertiesToExternalArrayData descriptor (Ljava/nio/ByteBuffer;)V flags 1
 method name isInstance descriptor (Ljava/lang/Object;)Z flags 1
 method name getClassName descriptor ()Ljava/lang/String; flags 1
 method name isFunction descriptor ()Z flags 1
@@ -166,25 +180,28 @@
 method name isUndefined descriptor (Ljava/lang/Object;)Z flags 9
 method name to descriptor (Ljava/lang/Class;)Ljava/lang/Object; flags 1 signature <T:Ljava/lang/Object;>(Ljava/lang/Class<TT;>;)TT;
 method name wrap descriptor (Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; flags 9
+method name wrapAsJSONCompatible descriptor (Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; flags 9
 method name unwrap descriptor (Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; flags 9
 method name wrapArray descriptor ([Ljava/lang/Object;Ljava/lang/Object;)[Ljava/lang/Object; flags 9
 method name unwrapArray descriptor ([Ljava/lang/Object;Ljava/lang/Object;)[Ljava/lang/Object; flags 9
-method name toNumber descriptor ()D flags 1
+method name identical descriptor (Ljava/lang/Object;Ljava/lang/Object;)Z flags 9
+method name toNumber descriptor ()D flags 1 deprecated true runtimeAnnotations @Ljava/lang/Deprecated;
+method name getDefaultValue descriptor (Ljava/lang/Class;)Ljava/lang/Object; flags 1 signature (Ljava/lang/Class<*>;)Ljava/lang/Object;
 method name put descriptor (Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; flags 1041
 
 class name jdk/nashorn/api/scripting/ScriptUtils
-header extends java/lang/Object flags 31
+header extends java/lang/Object flags 31 runtimeAnnotations @Ljdk/Exported;
 method name parse descriptor (Ljava/lang/String;Ljava/lang/String;Z)Ljava/lang/String; flags 9
 method name format descriptor (Ljava/lang/String;[Ljava/lang/Object;)Ljava/lang/String; flags 9
-method name makeSynchronizedFunction descriptor (Ljdk/nashorn/internal/runtime/ScriptFunction;Ljava/lang/Object;)Ljava/lang/Object; flags 9
-method name wrap descriptor (Ljava/lang/Object;)Ljava/lang/Object; flags 9
+method name makeSynchronizedFunction descriptor (Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; flags 9
+method name wrap descriptor (Ljava/lang/Object;)Ljdk/nashorn/api/scripting/ScriptObjectMirror; flags 9
 method name unwrap descriptor (Ljava/lang/Object;)Ljava/lang/Object; flags 9
 method name wrapArray descriptor ([Ljava/lang/Object;)[Ljava/lang/Object; flags 9
 method name unwrapArray descriptor ([Ljava/lang/Object;)[Ljava/lang/Object; flags 9
 method name convert descriptor (Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; flags 9
 
 class name jdk/nashorn/api/scripting/URLReader
-header extends java/io/Reader flags 31
+header extends java/io/Reader flags 31 runtimeAnnotations @Ljdk/Exported;
 method name <init> descriptor (Ljava/net/URL;)V flags 1
 method name <init> descriptor (Ljava/net/URL;Ljava/lang/String;)V flags 1
 method name <init> descriptor (Ljava/net/URL;Ljava/nio/charset/Charset;)V flags 1
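
The ClassFilter interface and the getScriptEngine(ClassFilter) overload recorded above can be exercised as follows. A minimal sketch, assuming a JDK that ships Nashorn; FilterSketch is an invented name:

    import javax.script.ScriptEngine;
    import jdk.nashorn.api.scripting.ClassFilter;
    import jdk.nashorn.api.scripting.NashornScriptEngineFactory;

    public class FilterSketch {
        public static void main(String[] args) throws Exception {
            // exposeToScripts is ClassFilter's single abstract method, so a
            // lambda works: only java.lang.String is visible to scripts
            ClassFilter filter = name -> name.equals("java.lang.String");
            ScriptEngine engine = new NashornScriptEngineFactory().getScriptEngine(filter);
            engine.eval("print(new java.lang.String('ok'))");
            // engine.eval("java.lang.System.exit(0)") would be rejected,
            // because System is filtered out
        }
    }
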
--- a/make/data/symbols/jdk.scripting.nashorn-9.sym.txt	Mon Jun 25 14:32:46 2018 +0530
+++ b/make/data/symbols/jdk.scripting.nashorn-9.sym.txt	Mon Jun 25 10:21:50 2018 -0700
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 #
 # This code is free software; you can redistribute it and/or modify it
@@ -30,15 +30,16 @@
 header exports jdk/nashorn/api/scripting,jdk/nashorn/api/tree requires name\u0020;jdk.dynalink\u0020;flags\u0020;0,name\u0020;java.logging\u0020;flags\u0020;0,name\u0020;java.base\u0020;flags\u0020;8000,name\u0020;java.scripting\u0020;flags\u0020;20 provides interface\u0020;javax/script/ScriptEngineFactory\u0020;impls\u0020;jdk/nashorn/api/scripting/NashornScriptEngineFactory,interface\u0020;jdk/dynalink/linker/GuardingDynamicLinkerExporter\u0020;impls\u0020;jdk/nashorn/api/linker/NashornLinkerExporter flags 8000
 
 class name jdk/nashorn/api/scripting/AbstractJSObject
--method name toNumber descriptor ()D
-method name toNumber descriptor ()D flags 1 deprecated true runtimeAnnotations @Ljava/lang/Deprecated;
+header extends java/lang/Object implements jdk/nashorn/api/scripting/JSObject flags 421
+-method name getDefaultValue descriptor (Ljava/lang/Class;)Ljava/lang/Object;
+-method name getDefaultValue descriptor (Ljdk/nashorn/api/scripting/JSObject;Ljava/lang/Class;)Ljava/lang/Object;
 method name getDefaultValue descriptor (Ljdk/nashorn/api/scripting/JSObject;Ljava/lang/Class;)Ljava/lang/Object; flags 9 deprecated true signature (Ljdk/nashorn/api/scripting/JSObject;Ljava/lang/Class<*>;)Ljava/lang/Object; runtimeAnnotations @Ljava/lang/Deprecated;
 
 class name jdk/nashorn/api/scripting/ClassFilter
 header extends java/lang/Object flags 601
-method name exposeToScripts descriptor (Ljava/lang/String;)Z flags 401
 
 class name jdk/nashorn/api/scripting/JSObject
+header extends java/lang/Object flags 601
 -method name toNumber descriptor ()D
 method name toNumber descriptor ()D flags 1 deprecated true runtimeAnnotations @Ljava/lang/Deprecated;
 method name getDefaultValue descriptor (Ljava/lang/Class;)Ljava/lang/Object; thrownTypes java/lang/UnsupportedOperationException flags 1 signature (Ljava/lang/Class<*>;)Ljava/lang/Object;
@@ -46,41 +47,26 @@
 class name jdk/nashorn/api/scripting/NashornException
 header extends java/lang/RuntimeException flags 421
 innerclass innerClass java/lang/invoke/MethodHandles$Lookup outerClass java/lang/invoke/MethodHandles innerClassName Lookup flags 19
--field name ENGINE_SCRIPT_SOURCE_NAME descriptor Ljava/lang/String;
-method name setFileName descriptor (Ljava/lang/String;)V flags 11
-method name setLineNumber descriptor (I)V flags 11
-method name setColumnNumber descriptor (I)V flags 11
-method name getThrown descriptor ()Ljava/lang/Object; flags 4
-method name getEcmaError descriptor ()Ljava/lang/Object; flags 1
-method name setEcmaError descriptor (Ljava/lang/Object;)V flags 1
+-method name initEcmaError descriptor (Ljdk/nashorn/internal/runtime/ScriptObject;)Ljdk/nashorn/api/scripting/NashornException;
 
 class name jdk/nashorn/api/scripting/NashornScriptEngine
--method name __noSuchProperty__ descriptor (Ljava/lang/Object;Ljavax/script/ScriptContext;Ljava/lang/String;)Ljava/lang/Object;
+header extends javax/script/AbstractScriptEngine implements javax/script/Compilable,javax/script/Invocable flags 31
+innerclass innerClass java/lang/invoke/MethodHandles$Lookup outerClass java/lang/invoke/MethodHandles innerClassName Lookup flags 19
 
 class name jdk/nashorn/api/scripting/NashornScriptEngineFactory
 header extends java/lang/Object implements javax/script/ScriptEngineFactory flags 31
 innerclass innerClass java/lang/invoke/MethodHandles$Lookup outerClass java/lang/invoke/MethodHandles innerClassName Lookup flags 19
--method name getScriptEngine descriptor ([Ljava/lang/String;)Ljavax/script/ScriptEngine;
-method name getScriptEngine descriptor (Ljdk/nashorn/api/scripting/ClassFilter;)Ljavax/script/ScriptEngine; flags 1
-method name getScriptEngine descriptor ([Ljava/lang/String;)Ljavax/script/ScriptEngine; flags 81
-method name getScriptEngine descriptor ([Ljava/lang/String;Ljava/lang/ClassLoader;Ljdk/nashorn/api/scripting/ClassFilter;)Ljavax/script/ScriptEngine; flags 1
 
 class name jdk/nashorn/api/scripting/ScriptObjectMirror
 header extends jdk/nashorn/api/scripting/AbstractJSObject implements javax/script/Bindings flags 31
 innerclass innerClass java/util/Map$Entry outerClass java/util/Map innerClassName Entry flags 609
 innerclass innerClass java/lang/invoke/MethodHandles$Lookup outerClass java/lang/invoke/MethodHandles innerClassName Lookup flags 19
--method name toNumber descriptor ()D
-method name setIndexedPropertiesToExternalArrayData descriptor (Ljava/nio/ByteBuffer;)V flags 1
-method name wrapAsJSONCompatible descriptor (Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; flags 9
-method name identical descriptor (Ljava/lang/Object;Ljava/lang/Object;)Z flags 9
-method name toNumber descriptor ()D flags 1 deprecated true runtimeAnnotations @Ljava/lang/Deprecated;
-method name getDefaultValue descriptor (Ljava/lang/Class;)Ljava/lang/Object; flags 1 signature (Ljava/lang/Class<*>;)Ljava/lang/Object;
 
 class name jdk/nashorn/api/scripting/ScriptUtils
--method name makeSynchronizedFunction descriptor (Ljdk/nashorn/internal/runtime/ScriptFunction;Ljava/lang/Object;)Ljava/lang/Object;
--method name wrap descriptor (Ljava/lang/Object;)Ljava/lang/Object;
-method name makeSynchronizedFunction descriptor (Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; flags 9
-method name wrap descriptor (Ljava/lang/Object;)Ljdk/nashorn/api/scripting/ScriptObjectMirror; flags 9
+header extends java/lang/Object flags 31
+
+class name jdk/nashorn/api/scripting/URLReader
+header extends java/io/Reader flags 31
 
 class name jdk/nashorn/api/tree/ArrayAccessTree
 header extends java/lang/Object implements jdk/nashorn/api/tree/ExpressionTree flags 601
--- a/make/hotspot/symbols/symbols-unix	Mon Jun 25 14:32:46 2018 +0530
+++ b/make/hotspot/symbols/symbols-unix	Mon Jun 25 10:21:50 2018 -0700
@@ -22,6 +22,7 @@
 #
 
 JVM_ActiveProcessorCount
+JVM_AreNestMates
 JVM_ArrayCopy
 JVM_AssertionStatusDirectives
 JVM_BeforeHalt
@@ -118,6 +119,8 @@
 JVM_GetMethodParameters
 JVM_GetMethodTypeAnnotations
 JVM_GetNanoTimeAdjustment
+JVM_GetNestHost
+JVM_GetNestMembers
 JVM_GetPrimitiveArrayElement
 JVM_GetProtectionDomain
 JVM_GetSimpleBinaryName
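
JVM_AreNestMates, JVM_GetNestHost and JVM_GetNestMembers are the native entry points behind the JDK 11 nest-based access control reflection API. A small illustration via the corresponding java.lang.Class methods; NestHostSketch is an invented name:

    public class NestHostSketch {
        private int secret = 42;

        class Inner {
            int read() { return secret; }  // nestmate access, no synthetic accessor needed
        }

        public static void main(String[] args) {
            System.out.println(Inner.class.getNestHost());                      // class NestHostSketch
            System.out.println(Inner.class.isNestmateOf(NestHostSketch.class)); // true
            System.out.println(NestHostSketch.class.getNestMembers().length);   // 2
        }
    }
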
--- a/make/lib/Lib-jdk.crypto.mscapi.gmk	Mon Jun 25 14:32:46 2018 +0530
+++ b/make/lib/Lib-jdk.crypto.mscapi.gmk	Mon Jun 25 10:21:50 2018 -0700
@@ -35,7 +35,7 @@
       CFLAGS := $(CFLAGS_JDKLIB), \
       LDFLAGS := $(LDFLAGS_JDKLIB) $(LDFLAGS_CXX_JDK) \
           $(call SET_SHARED_LIBRARY_ORIGIN), \
-      LIBS := crypt32.lib advapi32.lib, \
+      LIBS := crypt32.lib advapi32.lib ncrypt.lib, \
   ))
 
   TARGETS += $(BUILD_LIBSUNMSCAPI)
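
ncrypt.lib pulls the Windows CNG API into the SunMSCAPI native library. The Java-side surface is unchanged; for reference, a minimal sketch of loading the keystore this provider backs (MscapiSketch is an invented name; Windows only):

    import java.security.KeyStore;

    public class MscapiSketch {
        public static void main(String[] args) throws Exception {
            // "Windows-MY" is served by the SunMSCAPI provider built above
            KeyStore ks = KeyStore.getInstance("Windows-MY");
            ks.load(null, null);  // backed by the OS certificate store; no file or password
            ks.aliases().asIterator().forEachRemaining(System.out::println);
        }
    }
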
--- a/make/nashorn/project.properties	Mon Jun 25 14:32:46 2018 +0530
+++ b/make/nashorn/project.properties	Mon Jun 25 10:21:50 2018 -0700
@@ -351,7 +351,7 @@
 run.test.xmx=2G
 run.test.xms=2G
 
-# uncomment this jfr.args to enable light recordings. the stack needs to be cranked up to 1024 frames,
+# uncomment this jfr.args to enable flight recordings. the stack needs to be cranked up to 1024 frames,
 # or everything will as of the now drown in lambda forms and be cut off.
 #
 #jfr.args=-XX:StartFlightRecording=disk=true,dumponexit=true,dumponexitpath="test_suite.jfr",stackdepth=1024
--- a/src/hotspot/cpu/aarch64/aarch64.ad	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/aarch64.ad	Mon Jun 25 10:21:50 2018 -0700
@@ -1193,21 +1193,28 @@
   //   MemBarRelease
   //   MemBarCPUOrder
   //   StoreX[mo_release] {CardMark}-optional
+  //   MemBarCPUOrder
   //   MemBarVolatile
   //
-  // n.b. as an aside, the cpuorder membar is not itself subject to
+  // n.b. as an aside, a cpuorder membar is not itself subject to
   // matching and translation by adlc rules.  However, the rule
   // predicates need to detect its presence in order to correctly
   // select the desired adlc rules.
   //
-  // Inlined unsafe volatile gets manifest as a somewhat different
-  // node sequence to a normal volatile get
+  // Inlined unsafe volatile gets manifest as a slightly different
+  // node sequence to a normal volatile get because of the
+  // introduction of some CPUOrder memory barriers to bracket the
+  // Load. However, the same basic skeleton of a LoadX feeding a
+  // MemBarAcquire, possibly through an optional DecodeN, is still
+  // present
   //
   //   MemBarCPUOrder
   //        ||       \\
-  //   MemBarAcquire LoadX[mo_acquire]
-  //        ||
-  //   MemBarCPUOrder
+  //   MemBarCPUOrder LoadX[mo_acquire]
+  //        ||            |
+  //        ||       {DecodeN} optional
+  //        ||       /
+  //     MemBarAcquire
   //
   // In this case the acquire membar does not directly depend on the
   // load. However, we can be sure that the load is generated from an
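
For orientation between hunks: the "inlined unsafe volatile get" shape described above is what C2 emits for an acquiring load, such as a VarHandle volatile read. A hedged Java-level sketch of an access producing that shape (VolatileGetSketch is an invented name):

    import java.lang.invoke.MethodHandles;
    import java.lang.invoke.VarHandle;

    class VolatileGetSketch {
        int value;

        static final VarHandle VALUE;
        static {
            try {
                VALUE = MethodHandles.lookup()
                        .findVarHandle(VolatileGetSketch.class, "value", int.class);
            } catch (ReflectiveOperationException e) {
                throw new ExceptionInInitializerError(e);
            }
        }

        int readVolatile() {
            // when C2 inlines this it emits the shape sketched above:
            // MemBarCPUOrder, LoadX[mo_acquire] (plus an optional DecodeN
            // for oop loads), then MemBarAcquire
            return (int) VALUE.getVolatile(this);
        }
    }
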
@@ -1314,8 +1321,8 @@
 
   MemBarNode *child_membar(const MemBarNode *n)
   {
-    ProjNode *ctl = n->proj_out(TypeFunc::Control);
-    ProjNode *mem = n->proj_out(TypeFunc::Memory);
+    ProjNode *ctl = n->proj_out_or_null(TypeFunc::Control);
+    ProjNode *mem = n->proj_out_or_null(TypeFunc::Memory);
 
     // MemBar needs to have both a Ctl and Mem projection
     if (! ctl || ! mem)
@@ -1432,6 +1439,8 @@
   //         | \     /
   //         | MergeMem
   //         | /
+  //  {MemBarCPUOrder} -- optional
+  //  {      ||      }
   //   MemBarVolatile
   //
   // where
@@ -1453,6 +1462,8 @@
   //         | MergeMem
   //         | /
   //         ||      /
+  //  {MemBarCPUOrder} -- optional
+  //  {      ||      }
   //   MemBarVolatile
   //
   // i.e. the leading membar feeds Ctl to a CastP2X (which converts
@@ -1505,6 +1516,7 @@
   //          |   /
   //         MergeMem
   //          |
+  //  {MemBarCPUOrder}
   //   MemBarVolatile
   //
   // This is referred to as a *normal* subgraph. It can easily be
@@ -1567,7 +1579,7 @@
   // object put and the corresponding conditional card mark. CMS
   // employs a post-write GC barrier while G1 employs both a pre- and
   // post-write GC barrier. Of course the extra nodes may be absent --
-  // they are only inserted for object puts. This significantly
+  // they are only inserted for object puts/swaps. This significantly
   // complicates the task of identifying whether a MemBarRelease,
   // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
   // when using these GC configurations (see below). It adds similar
@@ -1575,8 +1587,8 @@
   // CompareAndSwapX or MemBarAcquire forms part of a CAS.
   //
   // In both cases the post-write subtree includes an auxiliary
-  // MemBarVolatile (StoreLoad barrier) separating the object put and
-  // the read of the corresponding card. This poses two additional
+  // MemBarVolatile (StoreLoad barrier) separating the object put/swap
+  // and the read of the corresponding card. This poses two additional
   // problems.
   //
   // Firstly, a card mark MemBarVolatile needs to be distinguished
@@ -1638,6 +1650,7 @@
   //          |  . . .  \  / Bot
   //          |       MergeMem
   //          |          |
+  //       {MemBarCPUOrder}
   //        MemBarVolatile (trailing)
   //
   // The first MergeMem merges the AliasIdxBot Mem slice from the
@@ -1647,53 +1660,39 @@
   // from the StoreCM into the trailing membar (n.b. the latter
   // proceeds via a Phi associated with the If region).
   //
-  // The graph for a CAS varies slightly, the obvious difference being
+  // The graph for a CAS varies slightly, the difference being
   // that the StoreN/P node is replaced by a CompareAndSwapP/N node
   // and the trailing MemBarVolatile by a MemBarCPUOrder +
-  // MemBarAcquire pair. The other important difference is that the
-  // CompareAndSwap node's SCMemProj is not merged into the card mark
-  // membar - it still feeds the trailing MergeMem. This also means
-  // that the card mark membar receives its Mem feed directly from the
-  // leading membar rather than via a MergeMem.
+  // MemBarAcquire pair.
   //
   //   MemBarRelease
-  //   MemBarCPUOrder__(leading)_________________________
-  //       ||                       \\                 C \
-  //   MemBarVolatile (card mark)  CompareAndSwapN/P  CastP2X
-  //     C |  ||    M |              |
-  //       | LoadB    |       ______/|
-  //       |   |      |      /       |
-  //       | Cmp      |     /      SCMemProj
-  //       | /        |    /         |
-  //       If         |   /         /
-  //       | \        |  /         /
-  // IfFalse  IfTrue  | /         /
-  //       \     / \  |/ prec    /
-  //        \   / StoreCM       /
-  //         \ /      |        /
-  //        Region   . . .    /
-  //          | \            /
-  //          |  . . .  \   / Bot
+  //   MemBarCPUOrder_(leading)_______________
+  //     C |    M \       \\                C \
+  //       |       \    CompareAndSwapN/P  CastP2X
+  //       |        \      |
+  //       |         \   SCMemProj
+  //       |      Bot \   /
+  //       |        MergeMem
+  //       |         /
+  //      MemBarVolatile (card mark)
+  //     C |  ||    M |
+  //       | LoadB    |
+  //       |   |      |
+  //       | Cmp      |\
+  //       | /        | \
+  //       If         |  \
+  //       | \        |   \
+  // IfFalse  IfTrue  |    \
+  //       \     / \  |     \
+  //        \   / StoreCM    |
+  //         \ /      |      |
+  //        Region   . . .   |
+  //          | \           /
+  //          |  . . .  \  / Bot
   //          |       MergeMem
   //          |          |
-  //        MemBarCPUOrder
-  //        MemBarAcquire (trailing)
-  //
-  // This has a slightly different memory subgraph to the one seen
-  // previously but the core of it is the same as for the CAS normal
-  // sungraph
-  //
-  //   MemBarRelease
-  //   MemBarCPUOrder____
-  //      ||             \      . . .
-  //   MemBarVolatile  CompareAndSwapX  . . .
-  //      |  \            |
-  //        . . .   SCMemProj
-  //          |     /  . . .
-  //         MergeMem
-  //          |
-  //   MemBarCPUOrder
-  //   MemBarAcquire
+  //       {MemBarCPUOrder}
+  //        MemBarVolatile (trailing)
   //
   //
   // G1 is quite a lot more complicated. The nodes inserted on behalf
@@ -1742,15 +1741,13 @@
   //          (post write subtree elided)
   //                    . . .
   //             C \         M /
-  //         MemBarVolatile (trailing)
+  //                \         /
+  //             {MemBarCPUOrder}
+  //              MemBarVolatile (trailing)
   //
   // n.b. the LoadB in this subgraph is not the card read -- it's a
   // read of the SATB queue active flag.
   //
-  // Once again the CAS graph is a minor variant on the above with the
-  // expected substitutions of CompareAndSawpX for StoreN/P and
-  // MemBarCPUOrder + MemBarAcquire for trailing MemBarVolatile.
-  //
   // The G1 post-write subtree is also optional, this time when the
   // new value being written is either null or can be identified as a
   // newly allocated (young gen) object with no intervening control
@@ -1773,7 +1770,8 @@
   // checking if card_val != young).  n.b. although this test requires
   // a pre-read of the card it can safely be done before the StoreLoad
   // barrier. However that does not bypass the need to reread the card
-  // after the barrier.
+  // after the barrier. A final, 4th If tests if the card is already
+  // marked.
   //
   //                (pre-write subtree elided)
   //        . . .                  . . .    . . .  . . .
@@ -1826,6 +1824,7 @@
   //   |              |  |  / Bot
   //    \            MergeMem
   //     \            /
+  //    {MemBarCPUOrder}
   //     MemBarVolatile
   //
   // As with CMS the initial MergeMem merges the AliasIdxBot Mem slice
@@ -1845,26 +1844,29 @@
   // otherwise it is 3.
   //
   // The CAS graph when using G1GC also includes a pre-write subgraph
-  // and an optional post-write subgraph. Teh sam evarioations are
+  // and an optional post-write subgraph. The same variations are
   // introduced as for CMS with conditional card marking i.e. the
-  // StoreP/N is swapped for a CompareAndSwapP/N, the tariling
-  // MemBarVolatile for a MemBarCPUOrder + MemBarAcquire pair and the
-  // Mem feed from the CompareAndSwapP/N includes a precedence
-  // dependency feed to the StoreCM and a feed via an SCMemProj to the
-  // trailing membar. So, as before the configuration includes the
-  // normal CAS graph as a subgraph of the memory flow.
-  //
-  // So, the upshot is that in all cases the volatile put graph will
-  // include a *normal* memory subgraph betwen the leading membar and
-  // its child membar, either a volatile put graph (including a
-  // releasing StoreX) or a CAS graph (including a CompareAndSwapX).
-  // When that child is not a card mark membar then it marks the end
-  // of the volatile put or CAS subgraph. If the child is a card mark
-  // membar then the normal subgraph will form part of a volatile put
-  // subgraph if and only if the child feeds an AliasIdxBot Mem feed
-  // to a trailing barrier via a MergeMem. That feed is either direct
-  // (for CMS) or via 2 or 3 Phi nodes merging the leading barrier
-  // memory flow (for G1).
+  // StoreP/N is swapped for a CompareAndSwapP/N with a following
+  // SCMemProj, the trailing MemBarVolatile for a MemBarCPUOrder +
+  // MemBarAcquire pair. There may be an extra If test introduced in
+  // the CAS case, when the boolean result of the CAS is tested by the
+  // caller. In that case an extra Region and AliasIdxBot Phi may be
+  // introduced before the MergeMem
+  //
+  // So, the upshot is that in all cases the subgraph will include a
+  // *normal* memory subgraph between the leading membar and its child
+  // membar: either a normal volatile put graph including a releasing
+  // StoreX and terminating with a trailing volatile membar or card
+  // mark volatile membar; or a normal CAS graph including a
+  // CompareAndSwapX + SCMemProj pair and terminating with a card mark
+  // volatile membar or a trailing cpu order and acquire membar
+  // pair. If the child membar is not a (volatile) card mark membar
+  // then it marks the end of the volatile put or CAS subgraph. If the
+  // child is a card mark membar then the normal subgraph will form
+  // part of a larger volatile put or CAS subgraph if and only if the
+  // child feeds an AliasIdxBot Mem feed to a trailing barrier via a
+  // MergeMem. That feed is either direct (for CMS) or via 2, 3 or 4
+  // Phi nodes merging the leading barrier memory flow (for G1).
   //
   // The predicates controlling generation of instructions for store
   // and barrier nodes employ a few simple helper functions (described
@@ -1907,13 +1909,27 @@
     }
   }
 
+  // helper to determine the maximum number of Phi nodes we may need to
+  // traverse when searching from a card mark membar for the merge mem
+  // feeding a trailing membar or vice versa
+
+  int max_phis()
+  {
+    if (UseG1GC) {
+      return 4;
+    } else if (UseConcMarkSweepGC && UseCondCardMark) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
 
   // leading_to_normal
   //
-  //graph traversal helper which detects the normal case Mem feed from
-  // a release membar (or, optionally, its cpuorder child) to a
-  // dependent volatile membar i.e. it ensures that one or other of
-  // the following Mem flow subgraph is present.
+  // graph traversal helper which detects the normal case Mem feed
+  // from a release membar (or, optionally, its cpuorder child) to a
+  // dependent volatile or acquire membar i.e. it ensures that one of
+  // the following 3 Mem flow subgraphs is present.
   //
   //   MemBarRelease
   //   MemBarCPUOrder {leading}
@@ -1922,19 +1938,27 @@
   //          |   /
   //         MergeMem
   //          |
+  //  {MemBarCPUOrder}
   //   MemBarVolatile {trailing or card mark}
   //
   //   MemBarRelease
   //   MemBarCPUOrder {leading}
-  //      |       \      . . .
-  //      |     CompareAndSwapX  . . .
-  //               |
-  //     . . .    SCMemProj
-  //           \   |
-  //      |    MergeMem
-  //      |       /
-  //    MemBarCPUOrder
-  //    MemBarAcquire {trailing}
+  //          |  \      . . .
+  //          |  CompareAndSwapX  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile {card mark}
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //          |  \      . . .
+  //          |  CompareAndSwapX  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarCPUOrder
+  //   MemBarAcquire {trailing}
   //
   // if the correct configuration is present returns the trailing
   // membar otherwise NULL.
@@ -1991,45 +2015,36 @@
       return NULL;
     }
 
-    // must have a merge if we also have st
-    if (st && !mm) {
+    // must have a merge
+    if (!mm) {
       return NULL;
     }
 
-    Node *y = NULL;
+    Node *feed = NULL;
     if (cas) {
       // look for an SCMemProj
       for (DUIterator_Fast imax, i = cas->fast_outs(imax); i < imax; i++) {
 	x = cas->fast_out(i);
-	if (x->is_Proj()) {
-	  y = x;
+        if (x->Opcode() == Op_SCMemProj) {
+	  feed = x;
 	  break;
 	}
       }
-      if (y == NULL) {
+      if (feed == NULL) {
 	return NULL;
       }
-      // the proj must feed a MergeMem
-      for (DUIterator_Fast imax, i = y->fast_outs(imax); i < imax; i++) {
-	x = y->fast_out(i);
-	if (x->is_MergeMem()) {
-	  mm = x->as_MergeMem();
-	  break;
-	}
+    } else {
+      feed = st;
+    }
+    // ensure the feed node feeds the existing mergemem;
+    for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
+      x = feed->fast_out(i);
+      if (x == mm) {
+        break;
       }
-      if (mm == NULL)
-	return NULL;
-    } else {
-      // ensure the store feeds the existing mergemem;
-      for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
-	if (st->fast_out(i) == mm) {
-	  y = st;
-	  break;
-	}
-      }
-      if (y == NULL) {
-	return NULL;
-      }
+    }
+    if (x != mm) {
+      return NULL;
     }
 
     MemBarNode *mbar = NULL;
@@ -2037,15 +2052,28 @@
     for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
       x = mm->fast_out(i);
       if (x->is_MemBar()) {
-	int opcode = x->Opcode();
-	if (opcode == Op_MemBarVolatile && st) {
-	  mbar = x->as_MemBar();
-	} else if (cas && opcode == Op_MemBarCPUOrder) {
+        if (x->Opcode() == Op_MemBarCPUOrder) {
+          // with a store any cpu order membar should precede a
+          // trailing volatile membar. with a cas it should precede a
+          // trailing acquire membar. in either case try to skip to
+          // that next membar
 	  MemBarNode *y =  x->as_MemBar();
 	  y = child_membar(y);
-	  if (y != NULL && y->Opcode() == Op_MemBarAcquire) {
-	    mbar = y;
+	  if (y != NULL) {
+            // skip to this new membar to do the check
+	    x = y;
 	  }
+          
+        }
+	if (x->Opcode() == Op_MemBarVolatile) {
+	  mbar = x->as_MemBar();
+          // for a volatile store this can be either a trailing membar
+          // or a card mark membar. for a cas it must be a card mark
+          // membar
+          assert(cas == NULL || is_card_mark_membar(mbar),
+                 "in CAS graph volatile membar must be a card mark");
+	} else if (cas != NULL && x->Opcode() == Op_MemBarAcquire) {
+	  mbar = x->as_MemBar();
 	}
 	break;
       }
@@ -2059,28 +2087,36 @@
   // graph traversal helper which detects the normal case Mem feed
   // from either a card mark or a trailing membar to a preceding
   // release membar (optionally its cpuorder child) i.e. it ensures
-  // that one or other of the following Mem flow subgraphs is present.
+  // that one of the following 3 Mem flow subgraphs is present.
   //
   //   MemBarRelease
-  //   MemBarCPUOrder {leading}
+  //  {MemBarCPUOrder} {leading}
   //          |  \      . . .
   //          |  StoreN/P[mo_release]  . . .
   //          |   /
   //         MergeMem
   //          |
-  //   MemBarVolatile {card mark or trailing}
+  //  {MemBarCPUOrder}
+  //   MemBarVolatile {trailing or card mark}
   //
   //   MemBarRelease
   //   MemBarCPUOrder {leading}
-  //      |       \      . . .
-  //      |     CompareAndSwapX  . . .
-  //               |
-  //     . . .    SCMemProj
-  //           \   |
-  //      |    MergeMem
-  //      |        /
-  //    MemBarCPUOrder
-  //    MemBarAcquire {trailing}
+  //          |  \      . . .
+  //          |  CompareAndSwapX  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile {card mark}
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //          |  \      . . .
+  //          |  CompareAndSwapX  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarCPUOrder
+  //   MemBarAcquire {trailing}
   //
   // this predicate checks for the same flow as the previous predicate
   // but starting from the bottom rather than the top.
@@ -2097,20 +2133,19 @@
     assert((barrier->Opcode() == Op_MemBarVolatile ||
 	    barrier->Opcode() == Op_MemBarAcquire),
 	   "expecting a volatile or an acquire membar");
-    Node *x;
-    bool is_cas = barrier->Opcode() == Op_MemBarAcquire;
-
-    // if we have an acquire membar then it must be fed via a CPUOrder
-    // membar
-
-    if (is_cas) {
-      // skip to parent barrier which must be a cpuorder
-      x = parent_membar(barrier);
-      if (x->Opcode() != Op_MemBarCPUOrder)
-	return NULL;
-    } else {
-      // start from the supplied barrier
+    bool barrier_is_acquire = barrier->Opcode() == Op_MemBarAcquire;
+
+    // if we have an intervening cpu order membar then start the
+    // search from it
+    
+    Node *x = parent_membar(barrier);
+
+    if (x == NULL) {
+      // stick with the original barrier
       x = (Node *)barrier;
+    } else if (x->Opcode() != Op_MemBarCPUOrder) {
+      // any other barrier means this is not the graph we want
+      return NULL;
     }
 
     // the Mem feed to the membar should be a merge
@@ -2120,30 +2155,8 @@
 
     MergeMemNode *mm = x->as_MergeMem();
 
-    if (is_cas) {
-      // the merge should be fed from the CAS via an SCMemProj node
-      x = NULL;
-      for (uint idx = 1; idx < mm->req(); idx++) {
-	if (mm->in(idx)->Opcode() == Op_SCMemProj) {
-	  x = mm->in(idx);
-	  break;
-	}
-      }
-      if (x == NULL) {
-	return NULL;
-      }
-      // check for a CAS feeding this proj
-      x = x->in(0);
-      int opcode = x->Opcode();
-      if (!is_CAS(opcode)) {
-	return NULL;
-      }
-      // the CAS should get its mem feed from the leading membar
-      x = x->in(MemNode::Memory);
-    } else {
-      // the merge should get its Bottom mem feed from the leading membar
-      x = mm->in(Compile::AliasIdxBot);
-    }
+    // the merge should get its Bottom mem feed from the leading membar
+    x = mm->in(Compile::AliasIdxBot);
 
     // ensure this is a non control projection
     if (!x->is_Proj() || x->is_CFG()) {
@@ -2188,15 +2201,34 @@
     if (st == NULL & cas == NULL) {
       return NULL;
     }
-
     if (st == NULL) {
-      // nothing more to check
-      return leading;
+      // if we started from a volatile membar and found a CAS then the
+      // original membar ought to be for a card mark
+      assert((barrier_is_acquire || is_card_mark_membar(barrier)),
+             "unexpected volatile barrier (i.e. not card mark) in CAS graph");
+      // check that the CAS feeds the merge we used to get here via an
+      // intermediary SCMemProj
+      Node *scmemproj = NULL;
+      for (DUIterator_Fast imax, i = cas->fast_outs(imax); i < imax; i++) {
+        x = cas->fast_out(i);
+        if (x->Opcode() == Op_SCMemProj) {
+          scmemproj = x;
+          break;
+        }
+      }
+      if (scmemproj == NULL) {
+        return NULL;
+      }
+      for (DUIterator_Fast imax, i = scmemproj->fast_outs(imax); i < imax; i++) {
+        x = scmemproj->fast_out(i);
+        if (x == mm) {
+          return leading;
+        }
+      }
     } else {
-      // we should not have a store if we started from an acquire
-      if (is_cas) {
-	return NULL;
-      }
+      // we should not have found a store if we started from an acquire
+      assert(!barrier_is_acquire,
+             "unexpected trailing acquire barrier in volatile store graph");
 
       // the store should feed the merge we used to get here
       for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
@@ -2227,8 +2259,9 @@
   //  Bot |  /
   //   MergeMem
   //      |
-  //      |
-  //    MemBarVolatile {trailing}
+  //   {MemBarCPUOrder}            OR  MemBarCPUOrder
+  //    MemBarVolatile {trailing}      MemBarAcquire {trailing}
+  //                                 
   //
   // 2)
   //   MemBarRelease/CPUOrder (leading)
@@ -2246,8 +2279,8 @@
   //     Bot |   /
   //       MergeMem
   //         |
-  //    MemBarVolatile {trailing}
-  //
+  //   {MemBarCPUOrder}            OR  MemBarCPUOrder
+  //    MemBarVolatile {trailing}      MemBarAcquire {trailing}
   //
   // 3)
   //   MemBarRelease/CPUOrder (leading)
@@ -2269,12 +2302,44 @@
   //       MergeMem
   //         |
   //         |
-  //    MemBarVolatile {trailing}
+  //   {MemBarCPUOrder}            OR  MemBarCPUOrder
+  //    MemBarVolatile {trailing}      MemBarAcquire {trailing}
+  //
+  // 4)
+  //   MemBarRelease/CPUOrder (leading)
+  //    |
+  //    |\
+  //    | \
+  //    |  \
+  //    |   \
+  //    |\   \
+  //    | \   \
+  //    |  \   \        . . .
+  //    |   \   \         |
+  //    |\   \   \   MemBarVolatile (card mark)
+  //    | \   \   \   /   |
+  //    |  \   \   \ /  StoreCM    . . .
+  //    |   \   \  Phi
+  //     \   \   \ /
+  //      \   \  Phi
+  //       \   \ /
+  //        \  Phi
+  //         \ /
+  //         Phi  . . .
+  //      Bot |   /
+  //       MergeMem
+  //          |
+  //          |
+  //    MemBarCPUOrder
+  //    MemBarAcquire {trailing}
   //
   // configuration 1 is only valid if UseConcMarkSweepGC &&
   // UseCondCardMark
   //
-  // configurations 2 and 3 are only valid if UseG1GC.
+  // configuration 2 is only valid if UseConcMarkSweepGC &&
+  // UseCondCardMark, or if UseG1GC.
+  //
+  // configurations 3 and 4 are only valid if UseG1GC.
   //
   // if a valid configuration is present returns the trailing membar
   // otherwise NULL.
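
For orientation: the CAS configurations above, terminating in the trailing MemBarCPUOrder + MemBarAcquire pair of configuration 4, correspond to an inlined compareAndSet. A minimal Java sketch (CasSketch is an invented name); branching on the boolean result is what introduces the extra If/Region/Phi mentioned earlier in this comment:

    import java.util.concurrent.atomic.AtomicInteger;

    class CasSketch {
        private final AtomicInteger count = new AtomicInteger();

        boolean tryBump(int expected) {
            // inlines to a CompareAndSwapI feeding an SCMemProj, with a
            // trailing MemBarCPUOrder + MemBarAcquire pair; testing the
            // boolean result adds the extra If/Region/Phi before the MergeMem
            return count.compareAndSet(expected, expected + 1);
        }
    }
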
@@ -2292,8 +2357,8 @@
     Node *x;
     MergeMemNode *mm = NULL;
 
-    const int MAX_PHIS = 3;	// max phis we will search through
-    int phicount = 0; 		// current search count
+    const int MAX_PHIS = max_phis(); // max phis we will search through
+    int phicount = 0;                // current search count
 
     bool retry_feed = true;
     while (retry_feed) {
@@ -2308,7 +2373,7 @@
       }
       if (mm) {
 	retry_feed = false;
-      } else if (UseG1GC & phicount++ < MAX_PHIS) {
+      } else if (phicount++ < MAX_PHIS) {
 	// the barrier may feed indirectly via one or two Phi nodes
 	PhiNode *phi = NULL;
 	for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
@@ -2334,12 +2399,24 @@
     assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
 
     MemBarNode *trailing = NULL;
-    // be sure we have a trailing membar the merge
+    // be sure we have a trailing membar fed by the merge
     for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
       x = mm->fast_out(i);
-      if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-	trailing = x->as_MemBar();
-	break;
+      if (x->is_MemBar()) {
+        // if this is an intervening cpu order membar skip to the
+        // following membar
+        if (x->Opcode() == Op_MemBarCPUOrder) {
+          MemBarNode *y =  x->as_MemBar();
+          y = child_membar(y);
+          if (y != NULL) {
+            x = y;
+          }
+        }
+        if (x->Opcode() == Op_MemBarVolatile ||
+            x->Opcode() == Op_MemBarAcquire) {
+          trailing = x->as_MemBar();
+        }
+        break;
       }
     }
 
@@ -2360,18 +2437,33 @@
   // otherwise NULL
   //
   // n.b. the supplied membar is expected to be a trailing
-  // MemBarVolatile i.e. the caller must ensure the input node has the
-  // correct opcode
+  // MemBarVolatile or MemBarAcquire i.e. the caller must ensure the
+  // input node has the correct opcode
 
   MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
   {
-    assert(trailing->Opcode() == Op_MemBarVolatile,
-	   "expecting a volatile membar");
+    assert(trailing->Opcode() == Op_MemBarVolatile ||
+           trailing->Opcode() == Op_MemBarAcquire,
+	   "expecting a volatile or acquire membar");
     assert(!is_card_mark_membar(trailing),
 	   "not expecting a card mark membar");
 
+    Node *x = (Node *)trailing;
+
+    // look for a preceding cpu order membar
+    MemBarNode *y = parent_membar(x->as_MemBar());
+    if (y != NULL) {
+      // make sure it is a cpu order membar
+      if (y->Opcode() != Op_MemBarCPUOrder) {
+        // this is not the graph we were looking for
+        return NULL;
+      }
+      // start the search from here
+      x = y;
+    }
+
     // the Mem feed to the membar should be a merge
-    Node *x = trailing->in(TypeFunc::Memory);
+    x = x->in(TypeFunc::Memory);
     if (!x->is_MergeMem()) {
       return NULL;
     }
@@ -2382,20 +2474,20 @@
     // with G1 we may possibly see a Phi or two before we see a Memory
     // Proj from the card mark membar
 
-    const int MAX_PHIS = 3;	// max phis we will search through
-    int phicount = 0; 		// current search count
+    const int MAX_PHIS = max_phis(); // max phis we will search through
+    int phicount = 0;                // current search count
 
     bool retry_feed = !x->is_Proj();
 
     while (retry_feed) {
-      if (UseG1GC && x->is_Phi() && phicount++ < MAX_PHIS) {
+      if (x->is_Phi() && phicount++ < MAX_PHIS) {
 	PhiNode *phi = x->as_Phi();
 	ProjNode *proj = NULL;
 	PhiNode *nextphi = NULL;
 	bool found_leading = false;
 	for (uint i = 1; i < phi->req(); i++) {
 	  x = phi->in(i);
-	  if (x->is_Phi()) {
+	  if (x->is_Phi() && x->adr_type() == TypePtr::BOTTOM) {
 	    nextphi = x->as_Phi();
 	  } else if (x->is_Proj()) {
 	    int opcode = x->in(0)->Opcode();
@@ -2475,10 +2567,8 @@
       return leading;
     }
 
-    // nothing more to do if this is an acquire
-    if (trailing->Opcode() == Op_MemBarAcquire) {
-      return NULL;
-    }
+    // there is no normal path from trailing to leading membar. see if
+    // we can arrive via a card mark membar
 
     MemBarNode *card_mark_membar = trailing_to_card_mark(trailing);
 
@@ -2506,15 +2596,6 @@
   // with a bogus read dependency on it's preceding load. so in those
   // cases we will find the load node at the PARMS offset of the
   // acquire membar.  n.b. there may be an intervening DecodeN node.
-  //
-  // a volatile load derived from an inlined unsafe field access
-  // manifests as a cpuorder membar with Ctl and Mem projections
-  // feeding both an acquire membar and a LoadX[mo_acquire]. The
-  // acquire then feeds another cpuorder membar via Ctl and Mem
-  // projections. The load has no output dependency on these trailing
-  // membars because subsequent nodes inserted into the graph take
-  // their control feed from the final membar cpuorder meaning they
-  // are all ordered after the load.
 
   Node *x = barrier->lookup(TypeFunc::Parms);
   if (x) {
@@ -2537,61 +2618,7 @@
     return (x->is_Load() && x->as_Load()->is_acquire());
   }
 
-  // now check for an unsafe volatile get
-
-  // need to check for
-  //
-  //   MemBarCPUOrder
-  //        ||       \\
-  //   MemBarAcquire* LoadX[mo_acquire]
-  //        ||
-  //   MemBarCPUOrder
-  //
-  // where * tags node we were passed
-  // and || or \\ are Ctl+Mem feeds via intermediate Proj Nodes
-
-  // check for a parent MemBarCPUOrder
-  ProjNode *ctl;
-  ProjNode *mem;
-  MemBarNode *parent = parent_membar(barrier);
-  if (!parent || parent->Opcode() != Op_MemBarCPUOrder)
-    return false;
-  ctl = parent->proj_out(TypeFunc::Control);
-  mem = parent->proj_out(TypeFunc::Memory);
-  if (!ctl || !mem) {
-    return false;
-  }
-  // ensure the proj nodes both feed a LoadX[mo_acquire]
-  LoadNode *ld = NULL;
-  for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
-    x = ctl->fast_out(i);
-    // if we see a load we keep hold of it and stop searching
-    if (x->is_Load()) {
-      ld = x->as_Load();
-      break;
-    }
-  }
-  // it must be an acquiring load
-  if (ld && ld->is_acquire()) {
-
-    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
-      x = mem->fast_out(i);
-      // if we see the same load we drop it and stop searching
-      if (x == ld) {
-	ld = NULL;
-	break;
-      }
-    }
-    // we must have dropped the load
-    if (ld == NULL) {
-      // check for a child cpuorder membar
-      MemBarNode *child  = child_membar(barrier->as_MemBar());
-      if (child && child->Opcode() == Op_MemBarCPUOrder)
-	return true;
-    }
-  }
-
-  // final option for unnecessary mebar is that it is a trailing node
+  // other option for unnecessary membar is that it is a trailing node
   // belonging to a CAS
 
   MemBarNode *leading = trailing_to_leading(barrier->as_MemBar());
@@ -2647,39 +2674,7 @@
     return true;
   }
 
-  // now check for an unsafe volatile get
-
-  // check if Ctl and Proj feed comes from a MemBarCPUOrder
-  //
-  //     MemBarCPUOrder
-  //        ||       \\
-  //   MemBarAcquire* LoadX[mo_acquire]
-  //        ||
-  //   MemBarCPUOrder
-
-  MemBarNode *membar;
-
-  membar = parent_membar(ld);
-
-  if (!membar || !membar->Opcode() == Op_MemBarCPUOrder) {
-    return false;
-  }
-
-  // ensure that there is a CPUOrder->Acquire->CPUOrder membar chain
-
-  membar = child_membar(membar);
-
-  if (!membar || !membar->Opcode() == Op_MemBarAcquire) {
-    return false;
-  }
-
-  membar = child_membar(membar);
-
-  if (!membar || !membar->Opcode() == Op_MemBarCPUOrder) {
-    return false;
-  }
-
-  return true;
+  return false;
 }
 
 bool unnecessary_release(const Node *n)
@@ -2739,7 +2734,7 @@
   }
 
   // ok, if it's not a card mark then we still need to check if it is
-  // a trailing membar of a volatile put hgraph.
+  // a trailing membar of a volatile put graph.
 
   return (trailing_to_leading(mbvol) != NULL);
 }
@@ -2848,6 +2843,14 @@
 
   assert(mbar != NULL, "CAS not embedded in normal graph!");
 
+  // if this is a card mark membar check that we have a trailing acquire
+
+  if (is_card_mark_membar(mbar)) {
+    mbar = card_mark_to_trailing(mbar);
+  }
+
+  assert(mbar != NULL, "card mark membar for CAS not embedded in normal graph!");
+
   assert(mbar->Opcode() == Op_MemBarAcquire, "trailing membar should be an acquire");
 #endif // ASSERT
   // so we can just return true here
@@ -15849,81 +15852,88 @@
 %}
 
 instruct string_compareU(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 cnt2,
-                        iRegI_R0 result, iRegP_R10 tmp1, rFlagsReg cr)
+                        iRegI_R0 result, iRegP_R10 tmp1, iRegL_R11 tmp2, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(KILL tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
+  effect(KILL tmp1, KILL tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
 
   format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1" %}
   ins_encode %{
     // Count is in 8-bit bytes; non-Compact chars are 16 bits.
     __ string_compare($str1$$Register, $str2$$Register,
                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
-                      $tmp1$$Register,
-                      fnoreg, fnoreg, StrIntrinsicNode::UU);
+                      $tmp1$$Register, $tmp2$$Register,
+                      fnoreg, fnoreg, fnoreg, StrIntrinsicNode::UU);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_compareL(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 cnt2,
-                        iRegI_R0 result, iRegP_R10 tmp1, rFlagsReg cr)
+                        iRegI_R0 result, iRegP_R10 tmp1, iRegL_R11 tmp2, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(KILL tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
+  effect(KILL tmp1, KILL tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
 
   format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1" %}
   ins_encode %{
     __ string_compare($str1$$Register, $str2$$Register,
                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
-                      $tmp1$$Register,
-                      fnoreg, fnoreg, StrIntrinsicNode::LL);
+                      $tmp1$$Register, $tmp2$$Register,
+                      fnoreg, fnoreg, fnoreg, StrIntrinsicNode::LL);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_compareUL(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 cnt2,
-                        iRegI_R0 result, vRegD vtmp1, vRegD vtmp2, iRegP_R10 tmp1, rFlagsReg cr)
+                        iRegI_R0 result, iRegP_R10 tmp1, iRegL_R11 tmp2,
+                        vRegD_V0 vtmp1, vRegD_V1 vtmp2, vRegD_V2 vtmp3, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(KILL tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, TEMP vtmp1, TEMP vtmp2, KILL cr);
-
-  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1" %}
+  effect(KILL tmp1, KILL tmp2, KILL vtmp1, KILL vtmp2, KILL vtmp3,
+         USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
+
+  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1, $tmp2, $vtmp1, $vtmp2, $vtmp3" %}
   ins_encode %{
     __ string_compare($str1$$Register, $str2$$Register,
                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
-                      $tmp1$$Register,
-                      $vtmp1$$FloatRegister, $vtmp2$$FloatRegister, StrIntrinsicNode::UL);
+                      $tmp1$$Register, $tmp2$$Register,
+                      $vtmp1$$FloatRegister, $vtmp2$$FloatRegister,
+                      $vtmp3$$FloatRegister, StrIntrinsicNode::UL);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_compareLU(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 cnt2,
-                        iRegI_R0 result, vRegD vtmp1, vRegD vtmp2, iRegP_R10 tmp1, rFlagsReg cr)
+                        iRegI_R0 result, iRegP_R10 tmp1, iRegL_R11 tmp2,
+                        vRegD_V0 vtmp1, vRegD_V1 vtmp2, vRegD_V2 vtmp3, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(KILL tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, TEMP vtmp1, TEMP vtmp2, KILL cr);
-
-  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1" %}
+  effect(KILL tmp1, KILL tmp2, KILL vtmp1, KILL vtmp2, KILL vtmp3,
+         USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
+
+  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1, $tmp2, $vtmp1, $vtmp2, $vtmp3" %}
   ins_encode %{
     __ string_compare($str1$$Register, $str2$$Register,
                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
-                      $tmp1$$Register,
-                      $vtmp1$$FloatRegister, $vtmp2$$FloatRegister, StrIntrinsicNode::LU);
+                      $tmp1$$Register, $tmp2$$Register,
+                      $vtmp1$$FloatRegister, $vtmp2$$FloatRegister,
+                      $vtmp3$$FloatRegister, StrIntrinsicNode::LU);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
+       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3,
+       iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr)
 %{
   predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
   effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr);
   format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU)" %}
 
   ins_encode %{
@@ -15931,18 +15941,20 @@
                       $cnt1$$Register, $cnt2$$Register,
                       $tmp1$$Register, $tmp2$$Register,
                       $tmp3$$Register, $tmp4$$Register,
+                      $tmp5$$Register, $tmp6$$Register,
                       -1, $result$$Register, StrIntrinsicNode::UU);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
+       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3,
+       iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr)
 %{
   predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
   effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr);
   format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL)" %}
 
   ins_encode %{
@@ -15950,18 +15962,20 @@
                       $cnt1$$Register, $cnt2$$Register,
                       $tmp1$$Register, $tmp2$$Register,
                       $tmp3$$Register, $tmp4$$Register,
+                      $tmp5$$Register, $tmp6$$Register,
                       -1, $result$$Register, StrIntrinsicNode::LL);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
+       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3,
+       iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr)
 %{
   predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL);
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
   effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr);
   format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL)" %}
 
   ins_encode %{
@@ -15969,30 +15983,12 @@
                       $cnt1$$Register, $cnt2$$Register,
                       $tmp1$$Register, $tmp2$$Register,
                       $tmp3$$Register, $tmp4$$Register,
+                      $tmp5$$Register, $tmp6$$Register,
                       -1, $result$$Register, StrIntrinsicNode::UL);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
-instruct string_indexofLU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-       iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
-%{
-  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LU);
-  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
-  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LU)" %}
-
-  ins_encode %{
-    __ string_indexof($str1$$Register, $str2$$Register,
-                      $cnt1$$Register, $cnt2$$Register,
-                      $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
-                      -1, $result$$Register, StrIntrinsicNode::LU);
-  %}
-  ins_pipe(pipe_class_memory);
-%}
-
 instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
                  immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2,
                  iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
@@ -16008,7 +16004,7 @@
     __ string_indexof($str1$$Register, $str2$$Register,
                       $cnt1$$Register, zr,
                       $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
+                      $tmp3$$Register, $tmp4$$Register, zr, zr,
                       icnt2, $result$$Register, StrIntrinsicNode::UU);
   %}
   ins_pipe(pipe_class_memory);
@@ -16029,7 +16025,7 @@
     __ string_indexof($str1$$Register, $str2$$Register,
                       $cnt1$$Register, zr,
                       $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
+                      $tmp3$$Register, $tmp4$$Register, zr, zr,
                       icnt2, $result$$Register, StrIntrinsicNode::LL);
   %}
   ins_pipe(pipe_class_memory);
@@ -16050,33 +16046,12 @@
     __ string_indexof($str1$$Register, $str2$$Register,
                       $cnt1$$Register, zr,
                       $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
+                      $tmp3$$Register, $tmp4$$Register, zr, zr,
                       icnt2, $result$$Register, StrIntrinsicNode::UL);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
-instruct string_indexof_conLU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
-                 immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2,
-                 iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr)
-%{
-  predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LU);
-  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
-  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
-  format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LU)" %}
-
-  ins_encode %{
-    int icnt2 = (int)$int_cnt2$$constant;
-    __ string_indexof($str1$$Register, $str2$$Register,
-                      $cnt1$$Register, zr,
-                      $tmp1$$Register, $tmp2$$Register,
-                      $tmp3$$Register, $tmp4$$Register,
-                      icnt2, $result$$Register, StrIntrinsicNode::LU);
-  %}
-  ins_pipe(pipe_class_memory);
-%}
-
 instruct string_indexofU_char(iRegP_R1 str1, iRegI_R2 cnt1, iRegI_R3 ch,
                               iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2,
                               iRegINoSp tmp3, rFlagsReg cr)
@@ -16193,7 +16168,7 @@
 
 // fast byte[] to char[] inflation
 instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len,
-                        vRegD tmp1, vRegD tmp2, vRegD tmp3, iRegP_R3 tmp4, rFlagsReg cr)
+                        vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr)
 %{
   match(Set dummy (StrInflatedCopy src (Binary dst len)));
   effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);
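[Editor's note] For reading convenience, the helper these instructs now emit acquired extra temps in this changeset. Its new shape, reconstructed from the call sites above and from the macroAssembler_aarch64.cpp hunk later in this diff (not an addition to the patch):

    // MacroAssembler::string_compare after this change. tmp2 and vtmp3 are
    // new; the LL/UU instructs pass fnoreg for all three vector temps.
    void string_compare(Register str1, Register str2,
                        Register cnt1, Register cnt2, Register result,
                        Register tmp1, Register tmp2,
                        FloatRegister vtmp1, FloatRegister vtmp2,
                        FloatRegister vtmp3, int ae);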
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp	Mon Jun 25 10:21:50 2018 -0700
@@ -1638,12 +1638,14 @@
 #undef INSN
 
   // Conditional compare (both kinds)
-  void conditional_compare(unsigned op, int o2, int o3,
+  void conditional_compare(unsigned op, int o1, int o2, int o3,
                            Register Rn, unsigned imm5, unsigned nzcv,
                            unsigned cond) {
+    starti;
     f(op, 31, 29);
     f(0b11010010, 28, 21);
     f(cond, 15, 12);
+    f(o1, 11);
     f(o2, 10);
     f(o3, 4);
     f(nzcv, 3, 0);
@@ -1652,15 +1654,12 @@
 
 #define INSN(NAME, op)                                                  \
   void NAME(Register Rn, Register Rm, int imm, Condition cond) {        \
-    starti;                                                             \
-    f(0, 11);                                                           \
-    conditional_compare(op, 0, 0, Rn, (uintptr_t)Rm, imm, cond);        \
+    int regNumber = (Rm == zr ? 31 : (uintptr_t)Rm);                    \
+    conditional_compare(op, 0, 0, 0, Rn, regNumber, imm, cond);         \
   }                                                                     \
                                                                         \
-  void NAME(Register Rn, int imm5, int imm, Condition cond) {   \
-    starti;                                                             \
-    f(1, 11);                                                           \
-    conditional_compare(op, 0, 0, Rn, imm5, imm, cond);                 \
+  void NAME(Register Rn, int imm5, int imm, Condition cond) {           \
+    conditional_compare(op, 1, 0, 0, Rn, imm5, imm, cond);              \
   }
 
   INSN(ccmnw, 0b001);
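[Editor's note] A small point on the register form above: in the CCMP/CCMN Rm field the zero register is encoded as 31, which is why the macro maps zr explicitly instead of casting it. Standalone restatement (illustrative only):

    // Mirrors the regNumber computation in the INSN body above.
    unsigned ccmp_rm_field(bool rm_is_zr, unsigned rm_encoding) {
      return rm_is_zr ? 31u : rm_encoding;  // zr occupies encoding 31 here
    }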
@@ -2025,6 +2024,57 @@
       fmovd(Vn, zr);
   }
 
+   // Floating-point rounding
+   // type: half-precision = 11
+   //       single         = 00
+   //       double         = 01
+   // rmode: A = Away     = 100
+   //        I = current  = 111
+   //        M = MinusInf = 010
+   //        N = eveN     = 000
+   //        P = PlusInf  = 001
+   //        X = eXact    = 110
+   //        Z = Zero     = 011
+  void float_round(unsigned type, unsigned rmode, FloatRegister Rd, FloatRegister Rn) {
+    starti;
+    f(0b00011110, 31, 24);
+    f(type, 23, 22);
+    f(0b1001, 21, 18);
+    f(rmode, 17, 15);
+    f(0b10000, 14, 10);
+    rf(Rn, 5), rf(Rd, 0);
+  }
+#define INSN(NAME, type, rmode)                   \
+  void NAME(FloatRegister Vd, FloatRegister Vn) { \
+    float_round(type, rmode, Vd, Vn);             \
+  }
+
+public:
+  INSN(frintah, 0b11, 0b100);
+  INSN(frintih, 0b11, 0b111);
+  INSN(frintmh, 0b11, 0b010);
+  INSN(frintnh, 0b11, 0b000);
+  INSN(frintph, 0b11, 0b001);
+  INSN(frintxh, 0b11, 0b110);
+  INSN(frintzh, 0b11, 0b011);
+
+  INSN(frintas, 0b00, 0b100);
+  INSN(frintis, 0b00, 0b111);
+  INSN(frintms, 0b00, 0b010);
+  INSN(frintns, 0b00, 0b000);
+  INSN(frintps, 0b00, 0b001);
+  INSN(frintxs, 0b00, 0b110);
+  INSN(frintzs, 0b00, 0b011);
+
+  INSN(frintad, 0b01, 0b100);
+  INSN(frintid, 0b01, 0b111);
+  INSN(frintmd, 0b01, 0b010);
+  INSN(frintnd, 0b01, 0b000);
+  INSN(frintpd, 0b01, 0b001);
+  INSN(frintxd, 0b01, 0b110);
+  INSN(frintzd, 0b01, 0b011);
+#undef INSN
+
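[Editor's note] A hedged usage sketch for the new rounding INSNs (register choices are arbitrary, a MacroAssembler context like the surrounding code is assumed, and the half-precision forms need FP16 hardware support):

    void round_examples(MacroAssembler* masm) {
      masm->frintnd(v0, v1);  // double in v1, round to nearest even -> v0
      masm->frintzs(v2, v3);  // single in v3, round toward zero     -> v2
      masm->frintph(v4, v5);  // half in v5, round toward +inf       -> v4
    }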
 /* SIMD extensions
  *
  * We just use FloatRegister in the following. They are exactly the same
@@ -2294,6 +2344,42 @@
 
 #undef INSN
 
+#define INSN(NAME, op1, op2) \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index = 0) { \
+    starti;                                                                                            \
+    assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");                                   \
+    assert(index >= 0 && ((T == T2D && index <= 1) || (T != T2D && index <= 3)), "invalid index");     \
+    f(0, 31), f((int)T & 1, 30), f(op1, 29); f(0b011111, 28, 23);                                      \
+    f(T == T2D ? 1 : 0, 22), f(T == T2D ? 0 : index & 1, 21), rf(Vm, 16);                              \
+    f(op2, 15, 12), f(T == T2D ? index : (index >> 1), 11), f(0, 10);                                  \
+    rf(Vn, 5), rf(Vd, 0);                                                                              \
+  }
+
+  // FMLA/FMLS - Vector - Scalar
+  INSN(fmlavs, 0, 0b0001);
+  INSN(fmlsvs, 0, 0b0101);
+  // FMULX - Vector - Scalar
+  INSN(fmulxvs, 1, 0b1001);
+
+#undef INSN
+
+  // Floating-point Reciprocal Estimate
+  void frecpe(FloatRegister Vd, FloatRegister Vn, SIMD_RegVariant type) {
+    assert(type == D || type == S, "Wrong type for frecpe");
+    starti;
+    f(0b010111101, 31, 23);
+    f(type == D ? 1 : 0, 22);
+    f(0b100001110110, 21, 10);
+    rf(Vn, 5), rf(Vd, 0);
+  }
+
+  // (double) {a, b} -> (a + b)
+  void faddpd(FloatRegister Vd, FloatRegister Vn) {
+    starti;
+    f(0b0111111001110000110110, 31, 10);
+    rf(Vn, 5), rf(Vd, 0);
+  }
+
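[Editor's note] Scalar models of the two new helpers (illustrative; frecpe is a low-precision estimate normally refined with Newton steps, which the model ignores):

    // faddpd: (double) {a, b} -> a + b, as the comment above says.
    double faddpd_model(double lane0, double lane1) { return lane0 + lane1; }

    // frecpe: per-lane reciprocal estimate; special cases and the limited
    // mantissa precision of the real instruction are not modeled here.
    double frecpe_model(double x) { return 1.0 / x; }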
   void ins(FloatRegister Vd, SIMD_RegVariant T, FloatRegister Vn, int didx, int sidx) {
     starti;
     assert(T != Q, "invalid register variant");
--- a/src/hotspot/cpu/aarch64/c1_LIRGenerator_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/c1_LIRGenerator_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -745,6 +745,14 @@
 }
 
 void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
+  assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
+  if (x->id() == vmIntrinsics::_dexp || x->id() == vmIntrinsics::_dlog ||
+      x->id() == vmIntrinsics::_dpow || x->id() == vmIntrinsics::_dcos ||
+      x->id() == vmIntrinsics::_dsin || x->id() == vmIntrinsics::_dtan ||
+      x->id() == vmIntrinsics::_dlog10) {
+    do_LibmIntrinsic(x);
+    return;
+  }
   switch (x->id()) {
     case vmIntrinsics::_dabs:
     case vmIntrinsics::_dsqrt: {
@@ -754,61 +762,100 @@
       LIR_Opr dst = rlock_result(x);
 
       switch (x->id()) {
-      case vmIntrinsics::_dsqrt: {
-        __ sqrt(value.result(), dst, LIR_OprFact::illegalOpr);
-        break;
-      }
-      case vmIntrinsics::_dabs: {
-        __ abs(value.result(), dst, LIR_OprFact::illegalOpr);
-        break;
-      }
+        case vmIntrinsics::_dsqrt: {
+          __ sqrt(value.result(), dst, LIR_OprFact::illegalOpr);
+          break;
+        }
+        case vmIntrinsics::_dabs: {
+          __ abs(value.result(), dst, LIR_OprFact::illegalOpr);
+          break;
+        }
       }
       break;
     }
-    case vmIntrinsics::_dlog10: // fall through
-    case vmIntrinsics::_dlog: // fall through
-    case vmIntrinsics::_dsin: // fall through
-    case vmIntrinsics::_dtan: // fall through
-    case vmIntrinsics::_dcos: // fall through
-    case vmIntrinsics::_dexp: {
-      assert(x->number_of_arguments() == 1, "wrong type");
+  }
+}
 
-      address runtime_entry = NULL;
-      switch (x->id()) {
-      case vmIntrinsics::_dsin:
-        runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dsin);
-        break;
-      case vmIntrinsics::_dcos:
-        runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dcos);
-        break;
-      case vmIntrinsics::_dtan:
-        runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dtan);
-        break;
-      case vmIntrinsics::_dlog:
-        runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog);
-        break;
-      case vmIntrinsics::_dlog10:
-        runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10);
-        break;
-      case vmIntrinsics::_dexp:
-        runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dexp);
-        break;
-      default:
-        ShouldNotReachHere();
+void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
+  LIRItem value(x->argument_at(0), this);
+  value.set_destroys_register();
+
+  LIR_Opr calc_result = rlock_result(x);
+  LIR_Opr result_reg = result_register_for(x->type());
+
+  CallingConvention* cc = NULL;
+
+  if (x->id() == vmIntrinsics::_dpow) {
+    LIRItem value1(x->argument_at(1), this);
+
+    value1.set_destroys_register();
+
+    BasicTypeList signature(2);
+    signature.append(T_DOUBLE);
+    signature.append(T_DOUBLE);
+    cc = frame_map()->c_calling_convention(&signature);
+    value.load_item_force(cc->at(0));
+    value1.load_item_force(cc->at(1));
+  } else {
+    BasicTypeList signature(1);
+    signature.append(T_DOUBLE);
+    cc = frame_map()->c_calling_convention(&signature);
+    value.load_item_force(cc->at(0));
+  }
+
+  switch (x->id()) {
+    case vmIntrinsics::_dexp:
+      if (StubRoutines::dexp() != NULL) {
+        __ call_runtime_leaf(StubRoutines::dexp(), getThreadTemp(), result_reg, cc->args());
+      } else {
+        __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dexp), getThreadTemp(), result_reg, cc->args());
       }
-
-      LIR_Opr result = call_runtime(x->argument_at(0), runtime_entry, x->type(), NULL);
-      set_result(x, result);
       break;
-    }
-    case vmIntrinsics::_dpow: {
-      assert(x->number_of_arguments() == 2, "wrong type");
-      address runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dpow);
-      LIR_Opr result = call_runtime(x->argument_at(0), x->argument_at(1), runtime_entry, x->type(), NULL);
-      set_result(x, result);
+    case vmIntrinsics::_dlog:
+      if (StubRoutines::dlog() != NULL) {
+        __ call_runtime_leaf(StubRoutines::dlog(), getThreadTemp(), result_reg, cc->args());
+      } else {
+        __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dlog), getThreadTemp(), result_reg, cc->args());
+      }
       break;
-    }
+    case vmIntrinsics::_dlog10:
+      if (StubRoutines::dlog10() != NULL) {
+        __ call_runtime_leaf(StubRoutines::dlog10(), getThreadTemp(), result_reg, cc->args());
+      } else {
+        __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dlog10), getThreadTemp(), result_reg, cc->args());
+      }
+      break;
+    case vmIntrinsics::_dpow:
+      if (StubRoutines::dpow() != NULL) {
+        __ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
+      } else {
+        __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), getThreadTemp(), result_reg, cc->args());
+      }
+      break;
+    case vmIntrinsics::_dsin:
+      if (StubRoutines::dsin() != NULL) {
+        __ call_runtime_leaf(StubRoutines::dsin(), getThreadTemp(), result_reg, cc->args());
+      } else {
+        __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), getThreadTemp(), result_reg, cc->args());
+      }
+      break;
+    case vmIntrinsics::_dcos:
+      if (StubRoutines::dcos() != NULL) {
+        __ call_runtime_leaf(StubRoutines::dcos(), getThreadTemp(), result_reg, cc->args());
+      } else {
+        __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), getThreadTemp(), result_reg, cc->args());
+      }
+      break;
+    case vmIntrinsics::_dtan:
+      if (StubRoutines::dtan() != NULL) {
+        __ call_runtime_leaf(StubRoutines::dtan(), getThreadTemp(), result_reg, cc->args());
+      } else {
+        __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), getThreadTemp(), result_reg, cc->args());
+      }
+      break;
+    default:  ShouldNotReachHere();
   }
+  __ move(result_reg, calc_result);
 }
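[Editor's note] Each case of the switch above makes the same decision: prefer the generated AArch64 stub, else fall back to the shared C implementation. Condensed restatement (illustrative; the real code is expanded per intrinsic because each StubRoutines accessor is a distinct function):

    static address libm_entry(address stub, address shared_runtime_fallback) {
      return stub != NULL ? stub : shared_runtime_fallback;
    }
    // e.g. libm_entry(StubRoutines::dexp(),
    //                 CAST_FROM_FN_PTR(address, SharedRuntime::dexp))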
 
 
--- a/src/hotspot/cpu/aarch64/frame_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/frame_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -71,9 +71,20 @@
     return false;
   }
 
-  // unextended sp must be within the stack and above or equal sp
-  bool unextended_sp_safe = (unextended_sp < thread->stack_base()) &&
-                            (unextended_sp >= sp);
+  // When we are running interpreted code the machine stack pointer, SP, is
+  // set low enough so that the Java expression stack can grow and shrink
+  // without ever exceeding the machine stack bounds.  So, ESP >= SP.
+
+  // When we call out of an interpreted method, SP is incremented so that
+  // the space between SP and ESP is removed.  The SP saved in the callee's
+  // frame is the SP *before* this increment.  So, when we walk a stack of
+  // interpreter frames the sender's SP saved in a frame might be less than
+  // the SP at the point of call.
+
+  // So the unextended sp must be within the stack, but we need not check
+  // that unextended sp >= sp.
+
+  bool unextended_sp_safe = (unextended_sp < thread->stack_base());
 
   if (!unextended_sp_safe) {
     return false;
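[Editor's note] Net effect of this hunk on the predicate, restated (types simplified; assumes the usual downward-growing HotSpot stack):

    // Before: unextended_sp < stack_base && unextended_sp >= sp
    // After:  unextended_sp < stack_base
    // The '>= sp' half is dropped because, per the comment above, a sender
    // SP saved in an interpreter frame may be below the SP at the call.
    bool unextended_sp_plausible(const void* unextended_sp, const void* stack_base) {
      return unextended_sp < stack_base;
    }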
--- a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -43,7 +43,7 @@
 
 void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators,
                                                             Register addr, Register count, RegSet saved_regs) {
-  bool dest_uninitialized = (decorators & AS_DEST_NOT_INITIALIZED) != 0;
+  bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0;
   if (!dest_uninitialized) {
     __ push(saved_regs, sp);
     if (count == c_rarg0) {
--- a/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -37,14 +37,14 @@
 
   bool in_heap = (decorators & IN_HEAP) != 0;
   bool in_native = (decorators & IN_NATIVE) != 0;
-  bool oop_not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool is_not_null = (decorators & IS_NOT_NULL) != 0;
   switch (type) {
   case T_OBJECT:
   case T_ARRAY: {
     if (in_heap) {
       if (UseCompressedOops) {
         __ ldrw(dst, src);
-        if (oop_not_null) {
+        if (is_not_null) {
           __ decode_heap_oop_not_null(dst);
         } else {
           __ decode_heap_oop(dst);
--- a/src/hotspot/cpu/aarch64/gc/shared/cardTableBarrierSetAssembler_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/gc/shared/cardTableBarrierSetAssembler_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -91,9 +91,9 @@
 void CardTableBarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
                                                 Address dst, Register val, Register tmp1, Register tmp2) {
   bool in_heap = (decorators & IN_HEAP) != 0;
-  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool is_array = (decorators & IS_ARRAY) != 0;
   bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
-  bool precise = on_array || on_anonymous;
+  bool precise = is_array || on_anonymous;
 
   bool needs_post_barrier = val != noreg && in_heap;
   BarrierSetAssembler::store_at(masm, decorators, type, dst, val, noreg, noreg);
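[Editor's note] This and the two preceding barrier hunks are mechanical DecoratorSet renames; the bit tests themselves are unchanged. Sketch of the mapping (bit values here are arbitrary placeholders, not HotSpot's):

    enum DecoratorRename : unsigned {
      IS_DEST_UNINITIALIZED = 1u << 0,  // was AS_DEST_NOT_INITIALIZED
      IS_NOT_NULL           = 1u << 1,  // was OOP_NOT_NULL
      IS_ARRAY              = 1u << 2   // was IN_HEAP_ARRAY
    };
    inline bool has_decorator(unsigned decorators, DecoratorRename d) {
      return (decorators & d) != 0;     // same test shape before and after
    }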
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -3991,7 +3991,7 @@
 
 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                             Register thread_tmp, DecoratorSet decorators) {
-  access_load_at(T_OBJECT, IN_HEAP | OOP_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
+  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
 }
 
 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
@@ -4316,8 +4316,10 @@
                                     Register cnt2, Register cnt1,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
+                                    Register tmp5, Register tmp6,
                                     int icnt1, Register result, int ae) {
-  Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
+  // NOTE: tmp5, tmp6 can be zr depending on the specific method version
+  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 
   Register ch1 = rscratch1;
   Register ch2 = rscratch2;
@@ -4346,18 +4348,21 @@
   // if (substr.count > string.count) return -1;
   // if (substr.count == 0) return 0;
 
-// We have two strings, a source string in str2, cnt2 and a pattern string
-// in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
-
-// For larger pattern and source we use a simplified Boyer Moore algorithm.
-// With a small pattern and source we use linear scan.
+  // We have two strings, a source string in str2, cnt2 and a pattern string
+  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
+
+  // For larger pattern and source we use a simplified Boyer Moore algorithm.
+  // With a small pattern and source we use linear scan.
 
   if (icnt1 == -1) {
-    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
-    ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
-    br(LO, LINEARSEARCH);       // a byte array.
-    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
-    br(HS, LINEARSEARCH);
+    sub(result_tmp, cnt2, cnt1);
+    cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
+    br(LT, LINEARSEARCH);
+    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
+    cmp(cnt1, 256);
+    lsr(tmp1, cnt2, 2);
+    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
+    br(GE, LINEARSTUB);
   }
 
 // The Boyer Moore algorithm is based on the description here:-
@@ -4377,7 +4382,9 @@
 //
 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 //
-// #define ASIZE 128
+// This particular implementation has a few Java-specific optimizations.
+//
+// #define ASIZE 256
 //
 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 //       int i, j;
@@ -4386,11 +4393,16 @@
 //
 //       /* Preprocessing */
 //       for (i = 0; i < ASIZE; ++i)
-//          bc[i] = 0;
+//          bc[i] = m;
 //       for (i = 0; i < m - 1; ) {
 //          c = x[i];
 //          ++i;
-//          if (c < ASIZE) bc[c] = i;
+//          // c < 256 for Latin1 string, so, no need for branch
+//          #ifdef PATTERN_STRING_IS_LATIN1
+//          bc[c] = m - i;
+//          #else
+//          if (c < ASIZE) bc[c] = m - i;
+//          #endif
 //       }
 //
 //       /* Searching */
@@ -4400,84 +4412,160 @@
 //          if (x[m-1] == c)
 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 //          if (i < 0) return j;
+//          // c < 256 for Latin1 string, so, no need for branch
+//          #ifdef SOURCE_STRING_IS_LATIN1
+//          // LL case: (c < 256) always true. Remove branch
+//          j += bc[y[j+m-1]];
+//          #endif
+//          #ifndef PATTERN_STRING_IS_UTF
+//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 //          if (c < ASIZE)
-//            j = j - bc[y[j+m-1]] + m;
+//            j += bc[y[j+m-1]];
 //          else
-//            j += 1; // Advance by 1 only if char >= ASIZE
+//            j += 1;
+//          #endif
+//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
+//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
+//          if (c < ASIZE)
+//            j += bc[y[j+m-1]];
+//          else
+//            j += m;
+//          #endif
 //       }
 //    }
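[Editor's note] A compilable rendering of the pseudocode above (illustrative; follows the updated table semantics where entries hold the shift distance and default to the pattern length, and assumes 1 <= m < 256 as the intrinsic's byte table does):

    #include <string.h>

    static int bmh_indexof(const unsigned char* x, int m,   // pattern
                           const unsigned char* y, int n) { // source
      enum { ASIZE = 256 };
      unsigned char bc[ASIZE];
      memset(bc, m, ASIZE);             // default shift: whole pattern length
      for (int i = 0; i < m - 1; i++) {
        bc[x[i]] = m - 1 - i;           // distance from x[i] to the last char
      }
      for (int j = 0; j + m <= n; ) {
        unsigned char c = y[j + m - 1];
        if (x[m - 1] == c) {
          int i = m - 2;
          while (i >= 0 && x[i] == y[i + j]) i--;
          if (i < 0) return j;          // full match at offset j
        }
        j += bc[c];                     // bad-character shift (never zero)
      }
      return -1;
    }

With unsigned char input (the LL case) c < 256 always holds, so the c < ASIZE guard from the UU/UL variants disappears, exactly as the #ifdef comments above describe.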
 
   if (icnt1 == -1) {
-    BIND(BM);
-
-    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
-    Label BMADV, BMMATCH, BMCHECKEND;
-
+    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
+        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
     Register cnt1end = tmp2;
     Register str2end = cnt2;
     Register skipch = tmp2;
 
-    // Restrict ASIZE to 128 to reduce stack space/initialisation.
-    // The presence of chars >= ASIZE in the target string does not affect
-    // performance, but we must be careful not to initialise them in the stack
-    // array.
-    // The presence of chars >= ASIZE in the source string may adversely affect
-    // performance since we can only advance by one when we encounter one.
-
-      stp(zr, zr, pre(sp, -128));
-      for (int i = 1; i < 8; i++)
-          stp(zr, zr, Address(sp, i*16));
-
-      mov(cnt1tmp, 0);
-      sub(cnt1end, cnt1, 1);
+    // str1 length is >= 8, so we can read at least 1 register for the cases
+    // where Latin1->UTF conversion is not needed (8 chars for LL or 4 for UU)
+    // and half a register for the UL case. We'll re-read the last character
+    // in the inner pre-loop code so that the outer pre-loop needs one load.
+    const int firstStep = isL ? 7 : 3;
+
+    const int ASIZE = 256;
+    const int STORED_BYTES = 32; // number of bytes stored per instruction
+    sub(sp, sp, ASIZE);
+    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
+    mov(ch1, sp);
+    BIND(BM_INIT_LOOP);
+      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
+      subs(tmp5, tmp5, 1);
+      br(GT, BM_INIT_LOOP);
+
+      sub(cnt1tmp, cnt1, 1);
+      mov(tmp5, str2);
+      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
+      sub(ch2, cnt1, 1);
+      mov(tmp3, str1);
     BIND(BCLOOP);
-      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
-      cmp(ch1, 128);
-      add(cnt1tmp, cnt1tmp, 1);
-      br(HS, BCSKIP);
-      strb(cnt1tmp, Address(sp, ch1));
+      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
+      if (!str1_isL) {
+        cmp(ch1, ASIZE);
+        br(HS, BCSKIP);
+      }
+      strb(ch2, Address(sp, ch1));
     BIND(BCSKIP);
-      cmp(cnt1tmp, cnt1end);
-      br(LT, BCLOOP);
-
-      mov(result_tmp, str2);
-
-      sub(cnt2, cnt2, cnt1);
-      add(str2end, str2, cnt2, LSL, str2_chr_shift);
+      subs(ch2, ch2, 1);
+      br(GT, BCLOOP);
+
+      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
+      if (str1_isL == str2_isL) {
+        // load last 8 bytes (8LL/4UU symbols)
+        ldr(tmp6, Address(tmp6, -wordSize));
+      } else {
+        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
+        // convert Latin1 to UTF. We'll have to wait until the load completes,
+        // but it's still faster than per-character loads and checks
+        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
+        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
+        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
+        andr(tmp6, tmp6, 0xFF); // str1[N-4]
+        orr(ch2, ch1, ch2, LSL, 16);
+        orr(tmp6, tmp6, tmp3, LSL, 48);
+        orr(tmp6, tmp6, ch2, LSL, 16);
+      }
     BIND(BMLOOPSTR2);
-      sub(cnt1tmp, cnt1, 1);
-      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
-      cmp(ch1, skipch);
+      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
+      if (str1_isL == str2_isL) {
+        // re-init tmp3. It's free because it executes in parallel with the
+        // load above. The alternative is to initialize it before the loop,
+        // but that would hurt performance on in-order systems with 2 or more
+        // ld/st pipelines.
+        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
+      }
+      if (!isL) { // UU/UL case
+        lsl(ch2, cnt1tmp, 1); // offset in bytes
+      }
+      cmp(tmp3, skipch);
       br(NE, BMSKIP);
-      subs(cnt1tmp, cnt1tmp, 1);
-      br(LT, BMMATCH);
+      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
+      mov(ch1, tmp6);
+      if (isL) {
+        b(BMLOOPSTR1_AFTER_LOAD);
+      } else {
+        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
+        b(BMLOOPSTR1_CMP);
+      }
     BIND(BMLOOPSTR1);
       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
+    BIND(BMLOOPSTR1_AFTER_LOAD);
+      subs(cnt1tmp, cnt1tmp, 1);
+      br(LT, BMLOOPSTR1_LASTCMP);
+    BIND(BMLOOPSTR1_CMP);
+      cmp(ch1, ch2);
+      br(EQ, BMLOOPSTR1);
+    BIND(BMSKIP);
+      if (!isL) {
+        // if we've met a UTF symbol while searching for a Latin1 pattern,
+        // then we can skip cnt1 symbols
+        if (str1_isL != str2_isL) {
+          mov(result_tmp, cnt1);
+        } else {
+          mov(result_tmp, 1);
+        }
+        cmp(skipch, ASIZE);
+        br(HS, BMADV);
+      }
+      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
+    BIND(BMADV);
+      sub(cnt1tmp, cnt1, 1);
+      add(str2, str2, result_tmp, LSL, str2_chr_shift);
+      cmp(str2, str2end);
+      br(LE, BMLOOPSTR2);
+      add(sp, sp, ASIZE);
+      b(NOMATCH);
+    BIND(BMLOOPSTR1_LASTCMP);
       cmp(ch1, ch2);
       br(NE, BMSKIP);
-      subs(cnt1tmp, cnt1tmp, 1);
-      br(GE, BMLOOPSTR1);
     BIND(BMMATCH);
-      sub(result, str2, result_tmp);
+      sub(result, str2, tmp5);
       if (!str2_isL) lsr(result, result, 1);
-      add(sp, sp, 128);
+      add(sp, sp, ASIZE);
       b(DONE);
-    BIND(BMADV);
-      add(str2, str2, str2_chr_size);
-      b(BMCHECKEND);
-    BIND(BMSKIP);
-      cmp(skipch, 128);
-      br(HS, BMADV);
-      ldrb(ch2, Address(sp, skipch));
-      add(str2, str2, cnt1, LSL, str2_chr_shift);
-      sub(str2, str2, ch2, LSL, str2_chr_shift);
-    BIND(BMCHECKEND);
-      cmp(str2, str2end);
-      br(LE, BMLOOPSTR2);
-      add(sp, sp, 128);
-      b(NOMATCH);
+
+    BIND(LINEARSTUB);
+    cmp(cnt1, 16); // small patterns should still be handled by the simple algorithm
+    br(LT, LINEAR_MEDIUM);
+    mov(result, zr);
+    RuntimeAddress stub = NULL;
+    if (isL) {
+      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
+      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
+    } else if (str1_isL) {
+      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
+      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
+    } else {
+      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
+      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
+    }
+    trampoline_call(stub);
+    b(DONE);
   }
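[Editor's note] The stub selection above, restated (isL means both strings are Latin1; str1_isL means only the pattern is):

    // Which linear-search stub serves patterns of 16+ characters.
    address pick_indexof_stub(bool both_latin1, bool pattern_latin1) {
      if (both_latin1)    return StubRoutines::aarch64::string_indexof_linear_ll();
      if (pattern_latin1) return StubRoutines::aarch64::string_indexof_linear_ul();
      return StubRoutines::aarch64::string_indexof_linear_uu();
    }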
 
   BIND(LINEARSEARCH);
@@ -4493,15 +4581,12 @@
 
         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
         br(LT, DOSHORT);
-
-        sub(cnt2, cnt2, cnt1);
-        mov(result_tmp, cnt2);
-
+      BIND(LINEAR_MEDIUM);
+        (this->*str1_load_1chr)(first, Address(str1));
         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
-        (this->*str1_load_1chr)(first, Address(str1, cnt1_neg));
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 
       BIND(FIRST_LOOP);
         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
@@ -4539,10 +4624,9 @@
       Label CH1_LOOP;
 
         (this->*load_4chr)(ch1, str1);
-        sub(cnt2, cnt2, 4);
-        mov(result_tmp, cnt2);
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
+        sub(result_tmp, cnt2, 4);
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 
       BIND(CH1_LOOP);
         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
@@ -4551,18 +4635,18 @@
         adds(cnt2_neg, cnt2_neg, str2_chr_size);
         br(LE, CH1_LOOP);
         b(NOMATCH);
-    }
+      }
 
     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
       Label CH1_LOOP;
 
       BIND(DO2);
         (this->*load_2chr)(ch1, str1);
-        sub(cnt2, cnt2, 2);
-        mov(result_tmp, cnt2);
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
-
+        if (icnt1 == 2) {
+          sub(result_tmp, cnt2, 2);
+        }
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
       BIND(CH1_LOOP);
         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
         cmp(ch1, ch2);
@@ -4578,12 +4662,11 @@
       BIND(DO3);
         (this->*load_2chr)(first, str1);
         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
-
-        sub(cnt2, cnt2, 3);
-        mov(result_tmp, cnt2);
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
-
+        if (icnt1 == 3) {
+          sub(result_tmp, cnt2, 3);
+        }
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
       BIND(FIRST_LOOP);
         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
         cmpw(first, ch2);
@@ -4602,30 +4685,23 @@
     }
 
     if (icnt1 == -1 || icnt1 == 1) {
-      Label CH1_LOOP, HAS_ZERO;
-      Label DO1_SHORT, DO1_LOOP;
+      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 
       BIND(DO1);
         (this->*str1_load_1chr)(ch1, str1);
         cmp(cnt2, 8);
         br(LT, DO1_SHORT);
 
+        sub(result_tmp, cnt2, 8/str2_chr_size);
+        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
+        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
+        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
+
         if (str2_isL) {
-          if (!str1_isL) {
-            tst(ch1, 0xff00);
-            br(NE, NOMATCH);
-          }
           orr(ch1, ch1, ch1, LSL, 8);
         }
         orr(ch1, ch1, ch1, LSL, 16);
         orr(ch1, ch1, ch1, LSL, 32);
-
-        sub(cnt2, cnt2, 8/str2_chr_size);
-        mov(result_tmp, cnt2);
-        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
-        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
-
-        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
       BIND(CH1_LOOP);
         ldr(ch2, Address(str2, cnt2_neg));
         eor(ch2, ch1, ch2);
@@ -4733,12 +4809,13 @@
 
 // Compare strings.
 void MacroAssembler::string_compare(Register str1, Register str2,
-                                    Register cnt1, Register cnt2, Register result,
-                                    Register tmp1,
-                                    FloatRegister vtmp, FloatRegister vtmpZ, int ae) {
-  Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
-    NEXT_WORD, DIFFERENCE;
-
+    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
+    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
+  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
+      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
+      SHORT_LOOP_START, TAIL_CHECK;
+
+  const int STUB_THRESHOLD = 64 + 8;
   bool isLL = ae == StrIntrinsicNode::LL;
   bool isLU = ae == StrIntrinsicNode::LU;
   bool isUL = ae == StrIntrinsicNode::UL;
@@ -4750,7 +4827,9 @@
   int str2_chr_shift = str2_isL ? 0 : 1;
   int str1_chr_size = str1_isL ? 1 : 2;
   int str2_chr_size = str2_isL ? 1 : 2;
-
+  int minCharsInWord = isLL ? wordSize : wordSize/2;
+
+  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
@@ -4766,73 +4845,116 @@
   if (!str2_isL) asrw(cnt2, cnt2, 1);
 
   // Compute the minimum of the string lengths and save the difference.
-  subsw(tmp1, cnt1, cnt2);
+  subsw(result, cnt1, cnt2);
   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
 
   // A very short string
-  cmpw(cnt2, isLL ? 8:4);
+  cmpw(cnt2, minCharsInWord);
   br(Assembler::LT, SHORT_STRING);
 
-  // Check if the strings start at the same location.
-  cmp(str1, str2);
-  br(Assembler::EQ, LENGTH_DIFF);
-
   // Compare longwords
+  // load first parts of strings and finish initialization while loading
   {
-    subw(cnt2, cnt2, isLL ? 8:4); // The last longword is a special case
-
-    // Move both string pointers to the last longword of their
-    // strings, negate the remaining count, and convert it to bytes.
-    lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
-    lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
-    if (isLU || isUL) {
+    if (str1_isL == str2_isL) { // LL or UU
+      ldr(tmp1, Address(str1));
+      cmp(str1, str2);
+      br(Assembler::EQ, DONE);
+      ldr(tmp2, Address(str2));
+      cmp(cnt2, STUB_THRESHOLD);
+      br(GE, STUB);
+      subsw(cnt2, cnt2, minCharsInWord);
+      br(EQ, TAIL_CHECK);
+      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
+      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
+      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
+    } else if (isLU) {
+      ldrs(vtmp, Address(str1));
+      cmp(str1, str2);
+      br(Assembler::EQ, DONE);
+      ldr(tmp2, Address(str2));
+      cmp(cnt2, STUB_THRESHOLD);
+      br(GE, STUB);
+      subsw(cnt2, cnt2, 4);
+      br(EQ, TAIL_CHECK);
+      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
+      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
+      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
+      zip1(vtmp, T8B, vtmp, vtmpZ);
       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
+      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
+      add(cnt1, cnt1, 4);
+      fmovd(tmp1, vtmp);
+    } else { // UL case
+      ldr(tmp1, Address(str1));
+      cmp(str1, str2);
+      br(Assembler::EQ, DONE);
+      ldrs(vtmp, Address(str2));
+      cmp(cnt2, STUB_THRESHOLD);
+      br(GE, STUB);
+      subsw(cnt2, cnt2, 4);
+      br(EQ, TAIL_CHECK);
+      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
+      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
+      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
+      zip1(vtmp, T8B, vtmp, vtmpZ);
+      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
+      add(cnt1, cnt1, 8);
+      fmovd(tmp2, vtmp);
     }
-    sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
-
-    // Loop, loading longwords and comparing them into rscratch2.
+    adds(cnt2, cnt2, isUL ? 4 : 8);
+    br(GE, TAIL);
+    eor(rscratch2, tmp1, tmp2);
+    cbnz(rscratch2, DIFFERENCE);
+    // main loop
     bind(NEXT_WORD);
-    if (isLU) {
+    if (str1_isL == str2_isL) {
+      ldr(tmp1, Address(str1, cnt2));
+      ldr(tmp2, Address(str2, cnt2));
+      adds(cnt2, cnt2, 8);
+    } else if (isLU) {
       ldrs(vtmp, Address(str1, cnt1));
+      ldr(tmp2, Address(str2, cnt2));
+      add(cnt1, cnt1, 4);
       zip1(vtmp, T8B, vtmp, vtmpZ);
-      umov(result, vtmp, D, 0);
-    } else {
-      ldr(result, Address(str1, isUL ? cnt1:cnt2));
+      fmovd(tmp1, vtmp);
+      adds(cnt2, cnt2, 8);
+    } else { // UL
+      ldrs(vtmp, Address(str2, cnt2));
+      ldr(tmp1, Address(str1, cnt1));
+      zip1(vtmp, T8B, vtmp, vtmpZ);
+      add(cnt1, cnt1, 8);
+      fmovd(tmp2, vtmp);
+      adds(cnt2, cnt2, 4);
     }
-    if (isUL) {
-      ldrs(vtmp, Address(str2, cnt2));
-      zip1(vtmp, T8B, vtmp, vtmpZ);
-      umov(rscratch1, vtmp, D, 0);
-    } else {
-      ldr(rscratch1, Address(str2, cnt2));
-    }
-    adds(cnt2, cnt2, isUL ? 4:8);
-    if (isLU || isUL) add(cnt1, cnt1, isLU ? 4:8);
-    eor(rscratch2, result, rscratch1);
+    br(GE, TAIL);
+
+    eor(rscratch2, tmp1, tmp2);
+    cbz(rscratch2, NEXT_WORD);
+    b(DIFFERENCE);
+    bind(TAIL);
+    eor(rscratch2, tmp1, tmp2);
     cbnz(rscratch2, DIFFERENCE);
-    br(Assembler::LT, NEXT_WORD);
-
     // Last longword.  In the case where length == 4 we compare the
     // same longword twice, but that's still faster than another
     // conditional branch.
-
-    if (isLU) {
+    if (str1_isL == str2_isL) {
+      ldr(tmp1, Address(str1));
+      ldr(tmp2, Address(str2));
+    } else if (isLU) {
       ldrs(vtmp, Address(str1));
+      ldr(tmp2, Address(str2));
       zip1(vtmp, T8B, vtmp, vtmpZ);
-      umov(result, vtmp, D, 0);
-    } else {
-      ldr(result, Address(str1));
+      fmovd(tmp1, vtmp);
+    } else { // UL
+      ldrs(vtmp, Address(str2));
+      ldr(tmp1, Address(str1));
+      zip1(vtmp, T8B, vtmp, vtmpZ);
+      fmovd(tmp2, vtmp);
     }
-    if (isUL) {
-      ldrs(vtmp, Address(str2));
-      zip1(vtmp, T8B, vtmp, vtmpZ);
-      umov(rscratch1, vtmp, D, 0);
-    } else {
-      ldr(rscratch1, Address(str2));
-    }
-    eor(rscratch2, result, rscratch1);
-    cbz(rscratch2, LENGTH_DIFF);
+    bind(TAIL_CHECK);
+    eor(rscratch2, tmp1, tmp2);
+    cbz(rscratch2, DONE);
 
     // Find the first different characters in the longwords and
     // compute their difference.
@@ -4840,31 +4962,78 @@
     rev(rscratch2, rscratch2);
     clz(rscratch2, rscratch2);
     andr(rscratch2, rscratch2, isLL ? -8 : -16);
-    lsrv(result, result, rscratch2);
-    (this->*ext_chr)(result, result);
-    lsrv(rscratch1, rscratch1, rscratch2);
-    (this->*ext_chr)(rscratch1, rscratch1);
-    subw(result, result, rscratch1);
+    lsrv(tmp1, tmp1, rscratch2);
+    (this->*ext_chr)(tmp1, tmp1);
+    lsrv(tmp2, tmp2, rscratch2);
+    (this->*ext_chr)(tmp2, tmp2);
+    subw(result, tmp1, tmp2);
     b(DONE);
   }
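[Editor's note] What the rev/clz/andr/lsrv sequence above computes, as scalar C++ for the LL case (illustrative; rev+clz on a little-endian load is equivalent to counting trailing zeros, modeled here with a GCC/Clang builtin):

    // Precondition: a != b (the code only reaches DIFFERENCE when they differ).
    int first_difference_ll(unsigned long long a, unsigned long long b) {
      unsigned long long x = a ^ b;      // some byte differs
      int bit = __builtin_ctzll(x) & ~7; // round down to a byte boundary
      return (int)((a >> bit) & 0xff) - (int)((b >> bit) & 0xff);
    }

For UU the mask is ~15 (16-bit characters), matching the isLL ? -8 : -16 operand above.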
 
+  bind(STUB);
+    RuntimeAddress stub = NULL;
+    switch(ae) {
+      case StrIntrinsicNode::LL:
+        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
+        break;
+      case StrIntrinsicNode::UU:
+        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
+        break;
+      case StrIntrinsicNode::LU:
+        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
+        break;
+      case StrIntrinsicNode::UL:
+        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
+        break;
+      default:
+        ShouldNotReachHere();
+    }
+    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
+    trampoline_call(stub);
+    b(DONE);
+
   bind(SHORT_STRING);
   // Is the minimum length zero?
-  cbz(cnt2, LENGTH_DIFF);
-
+  cbz(cnt2, DONE);
+  // arrange the code to do most branches while loading, and to load the
+  // next characters while comparing the previous ones
+  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
+  subs(cnt2, cnt2, 1);
+  br(EQ, SHORT_LAST_INIT);
+  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
+  b(SHORT_LOOP_START);
   bind(SHORT_LOOP);
-  (this->*str1_load_chr)(result, Address(post(str1, str1_chr_size)));
+  subs(cnt2, cnt2, 1);
+  br(EQ, SHORT_LAST);
+  bind(SHORT_LOOP_START);
+  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
+  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
+  cmp(tmp1, cnt1);
+  br(NE, SHORT_LOOP_TAIL);
+  subs(cnt2, cnt2, 1);
+  br(EQ, SHORT_LAST2);
+  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
-  subw(result, result, cnt1);
-  cbnz(result, DONE);
-  sub(cnt2, cnt2, 1);
-  cbnz(cnt2, SHORT_LOOP);
-
-  // Strings are equal up to min length.  Return the length difference.
-  bind(LENGTH_DIFF);
-  mov(result, tmp1);
-
-  // That's it
+  cmp(tmp2, rscratch1);
+  br(EQ, SHORT_LOOP);
+  sub(result, tmp2, rscratch1);
+  b(DONE);
+  bind(SHORT_LOOP_TAIL);
+  sub(result, tmp1, cnt1);
+  b(DONE);
+  bind(SHORT_LAST2);
+  cmp(tmp2, rscratch1);
+  br(EQ, DONE);
+  sub(result, tmp2, rscratch1);
+
+  b(DONE);
+  bind(SHORT_LAST_INIT);
+  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
+  bind(SHORT_LAST);
+  cmp(tmp1, cnt1);
+  br(EQ, DONE);
+  sub(result, tmp1, cnt1);
+
   bind(DONE);
 
   BLOCK_COMMENT("} string_compare");
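[Editor's note] The rewritten SHORT_STRING path is software-pipelined: each iteration loads the next character pair while comparing the previous one, hence the extra SHORT_LAST/SHORT_LAST2/SHORT_LOOP_TAIL exits. Functionally it is still a plain prefix compare over the minimum length (scalar model, characters pre-widened to int):

    int compare_prefix(const int* s1, const int* s2, int n) {
      for (int i = 0; i < n; i++) {
        if (s1[i] != s2[i]) return s1[i] - s2[i];  // first difference decides
      }
      return 0;  // equal prefix: 'result' already holds cnt1 - cnt2
    }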
@@ -4928,9 +5097,8 @@
 
 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                    Register tmp4, Register tmp5, Register result,
-                                   Register cnt1, int elem_size)
-{
-  Label DONE;
+                                   Register cnt1, int elem_size) {
+  Label DONE, SAME;
   Register tmp1 = rscratch1;
   Register tmp2 = rscratch2;
   Register cnt2 = tmp2;  // cnt2 only used in array length compare
@@ -4952,21 +5120,21 @@
     BLOCK_COMMENT(comment);
   }
 #endif
+
+  // if (a1 == a2)
+  //     return true;
+  cmpoop(a1, a2); // May have read barriers for a1 and a2.
+  br(EQ, SAME);
+
   if (UseSimpleArrayEquals) {
-    Label NEXT_WORD, SHORT, SAME, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
-    // if (a1==a2)
-    //     return true;
-    // if (a==null || a2==null)
+    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
+    // if (a1 == null || a2 == null)
     //     return false;
     // a1 & a2 == 0 means (some-pointer is null) or
     // (very-rare-or-even-probably-impossible-pointer-values)
     // so, we can save one branch in most cases
-    cmpoop(a1, a2);
-    br(EQ, SAME);
-    eor(rscratch1, a1, a2);
     tst(a1, a2);
     mov(result, false);
-    cbz(rscratch1, SAME);
     br(EQ, A_MIGHT_BE_NULL);
     // if (a1.length != a2.length)
     //      return false;
@@ -5032,22 +5200,18 @@
         cbnzw(tmp5, DONE);
       }
     }
-    bind(SAME);
-    mov(result, true);
   } else {
-    Label NEXT_DWORD, A_IS_NULL, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
-        CSET_EQ, LAST_CHECK, LEN_IS_ZERO, SAME;
-    cbz(a1, A_IS_NULL);
+    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
+        CSET_EQ, LAST_CHECK;
+    mov(result, false);
+    cbz(a1, DONE);
     ldrw(cnt1, Address(a1, length_offset));
-    cbz(a2, A_IS_NULL);
+    cbz(a2, DONE);
     ldrw(cnt2, Address(a2, length_offset));
-    mov(result, false);
     // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
     // faster to perform another branch before comparing a1 and a2
     cmp(cnt1, elem_per_word);
     br(LE, SHORT); // short or same
-    cmpoop(a1, a2);
-    br(EQ, SAME);
     ldr(tmp3, Address(pre(a1, base_offset)));
     cmp(cnt1, stubBytesThreshold);
     br(GE, STUB);
@@ -5099,23 +5263,15 @@
     trampoline_call(stub);
     b(DONE);
 
-    bind(SAME);
-    mov(result, true);
-    b(DONE);
-    bind(A_IS_NULL);
-    // a1 or a2 is null. if a2 == a2 then return true. else return false
-    cmp(a1, a2);
-    b(CSET_EQ);
     bind(EARLY_OUT);
     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
     // so, if a2 == null => return false(0), else return true, so we can return a2
     mov(result, a2);
     b(DONE);
-    bind(LEN_IS_ZERO);
-    cmp(cnt2, zr);
-    b(CSET_EQ);
     bind(SHORT);
-    cbz(cnt1, LEN_IS_ZERO);
+    cmp(cnt2, cnt1);
+    br(NE, DONE);
+    cbz(cnt1, SAME);
     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
     ldr(tmp3, Address(a1, base_offset));
     ldr(tmp4, Address(a2, base_offset));
@@ -5125,8 +5281,11 @@
     cmp(tmp5, zr);
     bind(CSET_EQ);
     cset(result, EQ);
+    b(DONE);
   }
 
+  bind(SAME);
+  mov(result, true);
   // That's it.
   bind(DONE);
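[Editor's note] With the identity and null checks hoisted above the UseSimpleArrayEquals split, both code paths now share the same prologue. End-to-end contract, restated as a scalar model (the generated code compares a word at a time and may tail-call a stub for long arrays):

    #include <string.h>

    bool arrays_equals_model(const unsigned char* a1, int len1,
                             const unsigned char* a2, int len2) {
      if (a1 == a2) return true;                   // same ref (also both null)
      if (a1 == NULL || a2 == NULL) return false;  // exactly one is null
      if (len1 != len2) return false;
      return memcmp(a1, a2, (size_t)len1) == 0;
    }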
 
@@ -5418,65 +5577,103 @@
                       FloatRegister Vtmp1, FloatRegister Vtmp2,
                       FloatRegister Vtmp3, FloatRegister Vtmp4)
 {
-    Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
-    Register tmp1 = rscratch1;
+    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
+        NEXT_32_START, NEXT_32_PRFM_START;
+    Register tmp1 = rscratch1, tmp2 = rscratch2;
 
       mov(result, len); // Save initial len
 
 #ifndef BUILTIN_SIM
-      subs(len, len, 32);
-      br(LT, LOOP_8);
-
-// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
-// to convert chars to bytes. These set the 'QC' bit in the FPSR if
-// any char could not fit in a byte, so clear the FPSR so we can test it.
-      clear_fpsr();
-
-    BIND(NEXT_32);
-      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
-      uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
-      uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
-      uqxtn(Vtmp2, T8B, Vtmp3, T8H);
-      uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
-      get_fpsr(tmp1);
-      cbnzw(tmp1, LOOP_8);
-      st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
-      subs(len, len, 32);
+      cmp(len, 8); // handle shortest strings first
+      br(LT, LOOP_1);
+      cmp(len, 32);
+      br(LT, NEXT_8);
+      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
+      // to convert chars to bytes
+      if (SoftwarePrefetchHintDistance >= 0) {
+        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
+        cmp(len, SoftwarePrefetchHintDistance/2 + 16);
+        br(LE, NEXT_32_START);
+        b(NEXT_32_PRFM_START);
+        BIND(NEXT_32_PRFM);
+          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
+        BIND(NEXT_32_PRFM_START);
+          prfm(Address(src, SoftwarePrefetchHintDistance));
+          orr(v4, T16B, Vtmp1, Vtmp2);
+          orr(v5, T16B, Vtmp3, Vtmp4);
+          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
+          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
+          stpq(Vtmp1, Vtmp3, dst);
+          uzp2(v5, T16B, v4, v5); // high bytes
+          umov(tmp2, v5, D, 1);
+          fmovd(tmp1, v5);
+          orr(tmp1, tmp1, tmp2);
+          cbnz(tmp1, LOOP_8);
+          sub(len, len, 32);
+          add(dst, dst, 32);
+          add(src, src, 64);
+          cmp(len, SoftwarePrefetchHintDistance/2 + 16);
+          br(GE, NEXT_32_PRFM);
+          cmp(len, 32);
+          br(LT, LOOP_8);
+        BIND(NEXT_32);
+          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
+        BIND(NEXT_32_START);
+      } else {
+        BIND(NEXT_32);
+          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
+      }
+      prfm(Address(src, SoftwarePrefetchHintDistance));
+      uzp1(v4, T16B, Vtmp1, Vtmp2);
+      uzp1(v5, T16B, Vtmp3, Vtmp4);
+      stpq(v4, v5, dst);
+      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
+      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
+      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
+      umov(tmp2, Vtmp1, D, 1);
+      fmovd(tmp1, Vtmp1);
+      orr(tmp1, tmp1, tmp2);
+      cbnz(tmp1, LOOP_8);
+      sub(len, len, 32);
+      add(dst, dst, 32);
       add(src, src, 64);
+      cmp(len, 32);
       br(GE, NEXT_32);
+      cbz(len, DONE);
 
     BIND(LOOP_8);
-      adds(len, len, 32-8);
+      cmp(len, 8);
       br(LT, LOOP_1);
-      clear_fpsr(); // QC may be set from loop above, clear again
     BIND(NEXT_8);
       ld1(Vtmp1, T8H, src);
-      uqxtn(Vtmp1, T8B, Vtmp1, T8H);
-      get_fpsr(tmp1);
-      cbnzw(tmp1, LOOP_1);
-      st1(Vtmp1, T8B, post(dst, 8));
-      subs(len, len, 8);
+      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
+      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
+      strd(Vtmp2, dst);
+      fmovd(tmp1, Vtmp3);
+      cbnz(tmp1, NEXT_1);
+
+      sub(len, len, 8);
+      add(dst, dst, 8);
       add(src, src, 16);
+      cmp(len, 8);
       br(GE, NEXT_8);
 
     BIND(LOOP_1);
-      adds(len, len, 8);
-      br(LE, DONE);
-#else
-      cbz(len, DONE);
 #endif
+    cbz(len, DONE);
     BIND(NEXT_1);
       ldrh(tmp1, Address(post(src, 2)));
+      strb(tmp1, Address(post(dst, 1)));
       tst(tmp1, 0xff00);
-      br(NE, DONE);
-      strb(tmp1, Address(post(dst, 1)));
+      br(NE, SET_RESULT);
       subs(len, len, 1);
       br(GT, NEXT_1);
 
-    BIND(DONE);
+    BIND(SET_RESULT);
       sub(result, result, len); // Return index where we stopped
                                 // Return len == 0 if we processed all
                                 // characters
+    BIND(DONE);
 }
 
 
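// The scalar tail above pins down the compress contract; a hedged C model
// (names are illustrative, not HotSpot API):
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   // Compress UTF-16 chars to Latin-1 bytes, stopping at the first char
//   // that does not fit in a byte. Returns the number of chars processed;
//   // a return value equal to len means the whole array was compressed.
//   size_t char_array_compress_model(const uint16_t *src, uint8_t *dst,
//                                    size_t len) {
//     size_t i;
//     for (i = 0; i < len; i++) {
//       dst[i] = (uint8_t)src[i];    // the stub also stores before testing
//       if (src[i] & 0xff00) break;  // non-Latin-1 char: stop here
//     }
//     return i;                      // result = index where we stopped
//   }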
@@ -5484,26 +5681,24 @@
 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                         Register tmp4) {
-  Label big, done;
+  Label big, done, after_init, to_stub;
 
   assert_different_registers(src, dst, len, tmp4, rscratch1);
 
-  fmovd(vtmp1 , zr);
-  lsrw(rscratch1, len, 3);
-
-  cbnzw(rscratch1, big);
-
+  fmovd(vtmp1, zr);
+  lsrw(tmp4, len, 3);
+  bind(after_init);
+  cbnzw(tmp4, big);
   // Short string: less than 8 bytes.
   {
-    Label loop, around, tiny;
-
-    subsw(len, len, 4);
-    andw(len, len, 3);
-    br(LO, tiny);
-
+    Label loop, tiny;
+
+    cmpw(len, 4);
+    br(LT, tiny);
     // Use SIMD to do 4 bytes.
     ldrs(vtmp2, post(src, 4));
     zip1(vtmp3, T8B, vtmp2, vtmp1);
+    subw(len, len, 4);
     strd(vtmp3, post(dst, 8));
 
     cbzw(len, done);
@@ -5517,35 +5712,65 @@
     bind(tiny);
     cbnz(len, loop);
 
-    bind(around);
     b(done);
   }
 
+  if (SoftwarePrefetchHintDistance >= 0) {
+    bind(to_stub);
+      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());

+      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
+      trampoline_call(stub);
+      b(after_init);
+  }
+
   // Unpack the bytes 8 at a time.
   bind(big);
-  andw(len, len, 7);
-
   {
-    Label loop, around;
-
-    bind(loop);
-    ldrd(vtmp2, post(src, 8));
-    sub(rscratch1, rscratch1, 1);
-    zip1(vtmp3, T16B, vtmp2, vtmp1);
-    st1(vtmp3, T8H, post(dst, 16));
-    cbnz(rscratch1, loop);
-
-    bind(around);
+    Label loop, around, loop_last, loop_start;
+
+    if (SoftwarePrefetchHintDistance >= 0) {
+      const int large_loop_threshold = (64 + 16)/8;
+      ldrd(vtmp2, post(src, 8));
+      andw(len, len, 7);
+      cmp(tmp4, large_loop_threshold);
+      br(GE, to_stub);
+      b(loop_start);
+
+      bind(loop);
+      ldrd(vtmp2, post(src, 8));
+      bind(loop_start);
+      subs(tmp4, tmp4, 1);
+      br(EQ, loop_last);
+      zip1(vtmp2, T16B, vtmp2, vtmp1);
+      ldrd(vtmp3, post(src, 8));
+      st1(vtmp2, T8H, post(dst, 16));
+      subs(tmp4, tmp4, 1);
+      zip1(vtmp3, T16B, vtmp3, vtmp1);
+      st1(vtmp3, T8H, post(dst, 16));
+      br(NE, loop);
+      b(around);
+      bind(loop_last);
+      zip1(vtmp2, T16B, vtmp2, vtmp1);
+      st1(vtmp2, T8H, post(dst, 16));
+      bind(around);
+      cbz(len, done);
+    } else {
+      andw(len, len, 7);
+      bind(loop);
+      ldrd(vtmp2, post(src, 8));
+      sub(tmp4, tmp4, 1);
+      zip1(vtmp3, T16B, vtmp2, vtmp1);
+      st1(vtmp3, T8H, post(dst, 16));
+      cbnz(tmp4, loop);
+    }
   }
 
   // Do the tail of up to 8 bytes.
-  sub(src, src, 8);
-  add(src, src, len, ext::uxtw, 0);
-  ldrd(vtmp2, Address(src));
-  sub(dst, dst, 16);
+  add(src, src, len);
+  ldrd(vtmp3, Address(src, -8));
   add(dst, dst, len, ext::uxtw, 1);
-  zip1(vtmp3, T16B, vtmp2, vtmp1);
-  st1(vtmp3, T8H, Address(dst));
+  zip1(vtmp3, T16B, vtmp3, vtmp1);
+  strq(vtmp3, Address(dst, -16));
 
   bind(done);
 }
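// byte_array_inflate is the inverse direction; a hedged scalar model of what
// the zip1-based code computes (vtmp1 holds zero, so interleaving a byte
// vector with it is exactly a zero-extension of each byte):
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   void byte_array_inflate_model(const uint8_t *src, uint16_t *dst,
//                                 size_t len) {
//     for (size_t i = 0; i < len; i++) {
//       dst[i] = src[i];  // zip1(vtmp3, T16B, vtmp2, vtmp1) does 8 at once
//     }
//   }
//
// Note the new tail handling above: instead of a scalar loop, the last eight
// input bytes are re-read and the last sixteen output bytes re-written,
// overlapping output that was already stored.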
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Jun 25 10:21:50 2018 -0700
@@ -1212,8 +1212,8 @@
 
   void string_compare(Register str1, Register str2,
                       Register cnt1, Register cnt2, Register result,
-                      Register tmp1,
-                      FloatRegister vtmp, FloatRegister vtmpZ, int ae);
+                      Register tmp1, Register tmp2, FloatRegister vtmp1,
+                      FloatRegister vtmp2, FloatRegister vtmp3, int ae);
 
   void has_negatives(Register ary1, Register len, Register result);
 
@@ -1247,11 +1247,25 @@
                       Register cnt1, Register cnt2,
                       Register tmp1, Register tmp2,
                       Register tmp3, Register tmp4,
+                      Register tmp5, Register tmp6,
                       int int_cnt1, Register result, int ae);
   void string_indexof_char(Register str1, Register cnt1,
                            Register ch, Register result,
                            Register tmp1, Register tmp2, Register tmp3);
-private:
+  void fast_log(FloatRegister vtmp0, FloatRegister vtmp1, FloatRegister vtmp2,
+                FloatRegister vtmp3, FloatRegister vtmp4, FloatRegister vtmp5,
+                FloatRegister tmpC1, FloatRegister tmpC2, FloatRegister tmpC3,
+                FloatRegister tmpC4, Register tmp1, Register tmp2,
+                Register tmp3, Register tmp4, Register tmp5);
+  void generate_dsin_dcos(bool isCos, address npio2_hw, address two_over_pi,
+      address pio2, address dsin_coef, address dcos_coef);
+ private:
+  // begin trigonometric functions support block
+  void generate__ieee754_rem_pio2(address npio2_hw, address two_over_pi, address pio2);
+  void generate__kernel_rem_pio2(address two_over_pi, address pio2);
+  void generate_kernel_sin(FloatRegister x, bool iyIsOne, address dsin_coef);
+  void generate_kernel_cos(FloatRegister x, address dcos_coef);
+  // end trigonometric functions support block
   void add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                        Register src1, Register src2);
   void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_log.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -0,0 +1,365 @@
+/* Copyright (c) 2018, Cavium. All rights reserved. (By BELLSOFT)
+ * Copyright (c) 2016, Intel Corporation.
+ * Intel Math Library (LIBM) Source Code
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "macroAssembler_aarch64.hpp"
+
+// The algorithm idea is taken from the x86 HotSpot intrinsic and adapted for AARCH64.
+//
+// For mathematical background please refer to the following literature:
+//
+// Tang, Ping-Tak Peter.
+// Table-driven implementation of the logarithm function
+// in IEEE floating-point arithmetic.
+// ACM Transactions on Mathematical Software (TOMS) 16, no. 4, 1990: 378-400.
+
+/******************************************************************************/
+//                     ALGORITHM DESCRIPTION - LOG()
+//                     ---------------------
+//
+//    x=2^k * mx, mx in [1,2)
+//
+//    Get B~1/mx based on the output of frecpe instruction (B0)
+//    B = int((B0*2^7+0.5))/2^7
+//
+//    Reduced argument: r=B*mx-1.0 (computed accurately in high and low parts)
+//
+//    Result:  k*log(2) - log(B) + p(r) if |x-1| >= small value (2^-6), where
+//             p(r) is a degree 7 polynomial and
+//             -log(B) is read from the data table (high, low parts);
+//             the result is formed from high and low parts
+//
+// Special cases:
+// 1. log(NaN) = quiet NaN
+// 2. log(+INF) = that INF
+// 3. log(0) = -INF
+// 4. log(1) = +0
+// 5. log(x) = NaN if x < -0, including -INF
+//
+/******************************************************************************/
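+// Worked example (a hedged illustration of the scheme above): x = 10.0.
+//    10.0 = 2^3 * 1.25, so k = 3 and mx = 1.25
+//    B0 ~= 1/1.25 = 0.8, so B = int(0.8*2^7 + 0.5)/2^7 = 102/128 = 0.796875
+//    r = B*mx - 1.0 = -0.00390625
+//    log(10) ~= k*log(2) - log(B) + p(r)
+//            ~= 3*0.69314718 + 0.22705744 - 0.00391389 = 2.30258509
+// which agrees with log(10) = 2.30258509... to the digits shown.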
+
+// Table with p(r) polynomial coefficients
+// and table representation of logarithm values (hi and low parts)
+__attribute__ ((aligned(64))) juint _L_tbl[] =
+{
+    // coefficients of p(r) polynomial:
+    // _coeff[]
+    0x00000000UL, 0xbfd00000UL, // C1_0 = -0.25
+    0x92492492UL, 0x3fc24924UL, // C1_1 = 0.14285714285714285
+    0x55555555UL, 0x3fd55555UL, // C2_0 = 0.3333333333333333
+    0x3d6fb175UL, 0xbfc5555eUL, // C2_1 = -0.16666772842235003
+    0x00000000UL, 0xbfe00000UL, // C3_0 = -0.5
+    0x9999999aUL, 0x3fc99999UL, // C3_1 = 0.2
+    // _log2[]
+    0xfefa3800UL, 0x3fa62e42UL, // C4_0 = 0.043321698784993146
+    0x93c76730UL, 0x3ceef357UL, // C4_1 = 3.436201886692732e-15
+    // _L_tbl[] with logarithm values (hi and low parts)
+    0xfefa3800UL, 0x3fe62e42UL, 0x93c76730UL, 0x3d2ef357UL, 0xaa241800UL,
+    0x3fe5ee82UL, 0x0cda46beUL, 0x3d220238UL, 0x5c364800UL, 0x3fe5af40UL,
+    0xac10c9fbUL, 0x3d2dfa63UL, 0x26bb8c00UL, 0x3fe5707aUL, 0xff3303ddUL,
+    0x3d09980bUL, 0x26867800UL, 0x3fe5322eUL, 0x5d257531UL, 0x3d05ccc4UL,
+    0x835a5000UL, 0x3fe4f45aUL, 0x6d93b8fbUL, 0xbd2e6c51UL, 0x6f970c00UL,
+    0x3fe4b6fdUL, 0xed4c541cUL, 0x3cef7115UL, 0x27e8a400UL, 0x3fe47a15UL,
+    0xf94d60aaUL, 0xbd22cb6aUL, 0xf2f92400UL, 0x3fe43d9fUL, 0x481051f7UL,
+    0xbcfd984fUL, 0x2125cc00UL, 0x3fe4019cUL, 0x30f0c74cUL, 0xbd26ce79UL,
+    0x0c36c000UL, 0x3fe3c608UL, 0x7cfe13c2UL, 0xbd02b736UL, 0x17197800UL,
+    0x3fe38ae2UL, 0xbb5569a4UL, 0xbd218b7aUL, 0xad9d8c00UL, 0x3fe35028UL,
+    0x9527e6acUL, 0x3d10b83fUL, 0x44340800UL, 0x3fe315daUL, 0xc5a0ed9cUL,
+    0xbd274e93UL, 0x57b0e000UL, 0x3fe2dbf5UL, 0x07b9dc11UL, 0xbd17a6e5UL,
+    0x6d0ec000UL, 0x3fe2a278UL, 0xe797882dUL, 0x3d206d2bUL, 0x1134dc00UL,
+    0x3fe26962UL, 0x05226250UL, 0xbd0b61f1UL, 0xd8bebc00UL, 0x3fe230b0UL,
+    0x6e48667bUL, 0x3d12fc06UL, 0x5fc61800UL, 0x3fe1f863UL, 0xc9fe81d3UL,
+    0xbd2a7242UL, 0x49ae6000UL, 0x3fe1c078UL, 0xed70e667UL, 0x3cccacdeUL,
+    0x40f23c00UL, 0x3fe188eeUL, 0xf8ab4650UL, 0x3d14cc4eUL, 0xf6f29800UL,
+    0x3fe151c3UL, 0xa293ae49UL, 0xbd2edd97UL, 0x23c75c00UL, 0x3fe11af8UL,
+    0xbb9ddcb2UL, 0xbd258647UL, 0x8611cc00UL, 0x3fe0e489UL, 0x07801742UL,
+    0x3d1c2998UL, 0xe2d05400UL, 0x3fe0ae76UL, 0x887e7e27UL, 0x3d1f486bUL,
+    0x0533c400UL, 0x3fe078bfUL, 0x41edf5fdUL, 0x3d268122UL, 0xbe760400UL,
+    0x3fe04360UL, 0xe79539e0UL, 0xbd04c45fUL, 0xe5b20800UL, 0x3fe00e5aUL,
+    0xb1727b1cUL, 0xbd053ba3UL, 0xaf7a4800UL, 0x3fdfb358UL, 0x3c164935UL,
+    0x3d0085faUL, 0xee031800UL, 0x3fdf4aa7UL, 0x6f014a8bUL, 0x3d12cde5UL,
+    0x56b41000UL, 0x3fdee2a1UL, 0x5a470251UL, 0x3d2f27f4UL, 0xc3ddb000UL,
+    0x3fde7b42UL, 0x5372bd08UL, 0xbd246550UL, 0x1a272800UL, 0x3fde148aUL,
+    0x07322938UL, 0xbd1326b2UL, 0x484c9800UL, 0x3fddae75UL, 0x60dc616aUL,
+    0xbd1ea42dUL, 0x46def800UL, 0x3fdd4902UL, 0xe9a767a8UL, 0x3d235bafUL,
+    0x18064800UL, 0x3fdce42fUL, 0x3ec7a6b0UL, 0xbd0797c3UL, 0xc7455800UL,
+    0x3fdc7ff9UL, 0xc15249aeUL, 0xbd29b6ddUL, 0x693fa000UL, 0x3fdc1c60UL,
+    0x7fe8e180UL, 0x3d2cec80UL, 0x1b80e000UL, 0x3fdbb961UL, 0xf40a666dUL,
+    0x3d27d85bUL, 0x04462800UL, 0x3fdb56faUL, 0x2d841995UL, 0x3d109525UL,
+    0x5248d000UL, 0x3fdaf529UL, 0x52774458UL, 0xbd217cc5UL, 0x3c8ad800UL,
+    0x3fda93edUL, 0xbea77a5dUL, 0x3d1e36f2UL, 0x0224f800UL, 0x3fda3344UL,
+    0x7f9d79f5UL, 0x3d23c645UL, 0xea15f000UL, 0x3fd9d32bUL, 0x10d0c0b0UL,
+    0xbd26279eUL, 0x43135800UL, 0x3fd973a3UL, 0xa502d9f0UL, 0xbd152313UL,
+    0x635bf800UL, 0x3fd914a8UL, 0x2ee6307dUL, 0xbd1766b5UL, 0xa88b3000UL,
+    0x3fd8b639UL, 0xe5e70470UL, 0xbd205ae1UL, 0x776dc800UL, 0x3fd85855UL,
+    0x3333778aUL, 0x3d2fd56fUL, 0x3bd81800UL, 0x3fd7fafaUL, 0xc812566aUL,
+    0xbd272090UL, 0x687cf800UL, 0x3fd79e26UL, 0x2efd1778UL, 0x3d29ec7dUL,
+    0x76c67800UL, 0x3fd741d8UL, 0x49dc60b3UL, 0x3d2d8b09UL, 0xe6af1800UL,
+    0x3fd6e60eUL, 0x7c222d87UL, 0x3d172165UL, 0x3e9c6800UL, 0x3fd68ac8UL,
+    0x2756eba0UL, 0x3d20a0d3UL, 0x0b3ab000UL, 0x3fd63003UL, 0xe731ae00UL,
+    0xbd2db623UL, 0xdf596000UL, 0x3fd5d5bdUL, 0x08a465dcUL, 0xbd0a0b2aUL,
+    0x53c8d000UL, 0x3fd57bf7UL, 0xee5d40efUL, 0x3d1fadedUL, 0x0738a000UL,
+    0x3fd522aeUL, 0x8164c759UL, 0x3d2ebe70UL, 0x9e173000UL, 0x3fd4c9e0UL,
+    0x1b0ad8a4UL, 0xbd2e2089UL, 0xc271c800UL, 0x3fd4718dUL, 0x0967d675UL,
+    0xbd2f27ceUL, 0x23d5e800UL, 0x3fd419b4UL, 0xec90e09dUL, 0x3d08e436UL,
+    0x77333000UL, 0x3fd3c252UL, 0xb606bd5cUL, 0x3d183b54UL, 0x76be1000UL,
+    0x3fd36b67UL, 0xb0f177c8UL, 0x3d116ecdUL, 0xe1d36000UL, 0x3fd314f1UL,
+    0xd3213cb8UL, 0xbd28e27aUL, 0x7cdc9000UL, 0x3fd2bef0UL, 0x4a5004f4UL,
+    0x3d2a9cfaUL, 0x1134d800UL, 0x3fd26962UL, 0xdf5bb3b6UL, 0x3d2c93c1UL,
+    0x6d0eb800UL, 0x3fd21445UL, 0xba46baeaUL, 0x3d0a87deUL, 0x635a6800UL,
+    0x3fd1bf99UL, 0x5147bdb7UL, 0x3d2ca6edUL, 0xcbacf800UL, 0x3fd16b5cUL,
+    0xf7a51681UL, 0x3d2b9acdUL, 0x8227e800UL, 0x3fd1178eUL, 0x63a5f01cUL,
+    0xbd2c210eUL, 0x67616000UL, 0x3fd0c42dUL, 0x163ceae9UL, 0x3d27188bUL,
+    0x604d5800UL, 0x3fd07138UL, 0x16ed4e91UL, 0x3cf89cdbUL, 0x5626c800UL,
+    0x3fd01eaeUL, 0x1485e94aUL, 0xbd16f08cUL, 0x6cb3b000UL, 0x3fcf991cUL,
+    0xca0cdf30UL, 0x3d1bcbecUL, 0xe4dd0000UL, 0x3fcef5adUL, 0x65bb8e11UL,
+    0xbcca2115UL, 0xffe71000UL, 0x3fce530eUL, 0x6041f430UL, 0x3cc21227UL,
+    0xb0d49000UL, 0x3fcdb13dUL, 0xf715b035UL, 0xbd2aff2aUL, 0xf2656000UL,
+    0x3fcd1037UL, 0x75b6f6e4UL, 0xbd084a7eUL, 0xc6f01000UL, 0x3fcc6ffbUL,
+    0xc5962bd2UL, 0xbcf1ec72UL, 0x383be000UL, 0x3fcbd087UL, 0x595412b6UL,
+    0xbd2d4bc4UL, 0x575bd000UL, 0x3fcb31d8UL, 0x4eace1aaUL, 0xbd0c358dUL,
+    0x3c8ae000UL, 0x3fca93edUL, 0x50562169UL, 0xbd287243UL, 0x07089000UL,
+    0x3fc9f6c4UL, 0x6865817aUL, 0x3d29904dUL, 0xdcf70000UL, 0x3fc95a5aUL,
+    0x58a0ff6fUL, 0x3d07f228UL, 0xeb390000UL, 0x3fc8beafUL, 0xaae92cd1UL,
+    0xbd073d54UL, 0x6551a000UL, 0x3fc823c1UL, 0x9a631e83UL, 0x3d1e0ddbUL,
+    0x85445000UL, 0x3fc7898dUL, 0x70914305UL, 0xbd1c6610UL, 0x8b757000UL,
+    0x3fc6f012UL, 0xe59c21e1UL, 0xbd25118dUL, 0xbe8c1000UL, 0x3fc6574eUL,
+    0x2c3c2e78UL, 0x3d19cf8bUL, 0x6b544000UL, 0x3fc5bf40UL, 0xeb68981cUL,
+    0xbd127023UL, 0xe4a1b000UL, 0x3fc527e5UL, 0xe5697dc7UL, 0x3d2633e8UL,
+    0x8333b000UL, 0x3fc4913dUL, 0x54fdb678UL, 0x3d258379UL, 0xa5993000UL,
+    0x3fc3fb45UL, 0x7e6a354dUL, 0xbd2cd1d8UL, 0xb0159000UL, 0x3fc365fcUL,
+    0x234b7289UL, 0x3cc62fa8UL, 0x0c868000UL, 0x3fc2d161UL, 0xcb81b4a1UL,
+    0x3d039d6cUL, 0x2a49c000UL, 0x3fc23d71UL, 0x8fd3df5cUL, 0x3d100d23UL,
+    0x7e23f000UL, 0x3fc1aa2bUL, 0x44389934UL, 0x3d2ca78eUL, 0x8227e000UL,
+    0x3fc1178eUL, 0xce2d07f2UL, 0x3d21ef78UL, 0xb59e4000UL, 0x3fc08598UL,
+    0x7009902cUL, 0xbd27e5ddUL, 0x39dbe000UL, 0x3fbfe891UL, 0x4fa10afdUL,
+    0xbd2534d6UL, 0x830a2000UL, 0x3fbec739UL, 0xafe645e0UL, 0xbd2dc068UL,
+    0x63844000UL, 0x3fbda727UL, 0x1fa71733UL, 0x3d1a8940UL, 0x01bc4000UL,
+    0x3fbc8858UL, 0xc65aacd3UL, 0x3d2646d1UL, 0x8dad6000UL, 0x3fbb6ac8UL,
+    0x2bf768e5UL, 0xbd139080UL, 0x40b1c000UL, 0x3fba4e76UL, 0xb94407c8UL,
+    0xbd0e42b6UL, 0x5d594000UL, 0x3fb9335eUL, 0x3abd47daUL, 0x3d23115cUL,
+    0x2f40e000UL, 0x3fb8197eUL, 0xf96ffdf7UL, 0x3d0f80dcUL, 0x0aeac000UL,
+    0x3fb700d3UL, 0xa99ded32UL, 0x3cec1e8dUL, 0x4d97a000UL, 0x3fb5e95aUL,
+    0x3c5d1d1eUL, 0xbd2c6906UL, 0x5d208000UL, 0x3fb4d311UL, 0x82f4e1efUL,
+    0xbcf53a25UL, 0xa7d1e000UL, 0x3fb3bdf5UL, 0xa5db4ed7UL, 0x3d2cc85eUL,
+    0xa4472000UL, 0x3fb2aa04UL, 0xae9c697dUL, 0xbd20b6e8UL, 0xd1466000UL,
+    0x3fb1973bUL, 0x560d9e9bUL, 0xbd25325dUL, 0xb59e4000UL, 0x3fb08598UL,
+    0x7009902cUL, 0xbd17e5ddUL, 0xc006c000UL, 0x3faeea31UL, 0x4fc93b7bUL,
+    0xbd0e113eUL, 0xcdddc000UL, 0x3faccb73UL, 0x47d82807UL, 0xbd1a68f2UL,
+    0xd0fb0000UL, 0x3faaaef2UL, 0x353bb42eUL, 0x3d20fc1aUL, 0x149fc000UL,
+    0x3fa894aaUL, 0xd05a267dUL, 0xbd197995UL, 0xf2d4c000UL, 0x3fa67c94UL,
+    0xec19afa2UL, 0xbd029efbUL, 0xd42e0000UL, 0x3fa466aeUL, 0x75bdfd28UL,
+    0xbd2c1673UL, 0x2f8d0000UL, 0x3fa252f3UL, 0xe021b67bUL, 0x3d283e9aUL,
+    0x89e74000UL, 0x3fa0415dUL, 0x5cf1d753UL, 0x3d0111c0UL, 0xec148000UL,
+    0x3f9c63d2UL, 0x3f9eb2f3UL, 0x3d2578c6UL, 0x28c90000UL, 0x3f984925UL,
+    0x325a0c34UL, 0xbd2aa0baUL, 0x25980000UL, 0x3f9432a9UL, 0x928637feUL,
+    0x3d098139UL, 0x58938000UL, 0x3f902056UL, 0x06e2f7d2UL, 0xbd23dc5bUL,
+    0xa3890000UL, 0x3f882448UL, 0xda74f640UL, 0xbd275577UL, 0x75890000UL,
+    0x3f801015UL, 0x999d2be8UL, 0xbd10c76bUL, 0x59580000UL, 0x3f700805UL,
+    0xcb31c67bUL, 0x3d2166afUL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
+    0x80000000UL
+};
+
+// BEGIN dlog PSEUDO CODE:
+//  double dlog(double X) {
+//    // p(r) polynomial coefficients initialized from _L_tbl table
+//    double C1_0 = _L_tbl[0];
+//    double C1_1 = _L_tbl[1];
+//    double C2_0 = _L_tbl[2];
+//    double C2_1 = _L_tbl[3];
+//    double C3_0 = _L_tbl[4];
+//    double C3_1 = _L_tbl[5];
+//    double C4_0 = _L_tbl[6];
+//    double C4_1 = _L_tbl[7];
+//    // NOTE: operations with coefficients above are mostly vectorized in assembly
+//    // Check corner cases first
+//    if (X == 1.0d || AS_LONG_BITS(X) + 0x0010000000000000 <= 0x0010000000000000) {
+//      // NOTE: AS_LONG_BITS(X) + 0x0010000000000000 <= 0x0010000000000000 means
+//      //    that X < 0 or X >= 0x7FF0000000000000 (0x7FF* is NaN or INF)
+//      if (X < 0 || X is NaN) return NaN;
+//      if (X == 1.0d) return 0.0d;
+//      if (X == 0.0d) return -INFINITY;
+//      if (X is INFINITY) return INFINITY;
+//    }
+//    // double representation is 2^exponent * mantissa
+//    // split X into two multipliers: 2^exponent and 1.0 * mantissa
+//    // pseudo function: zeroExponent(X) return value of X with exponent == 0
+//    float vtmp5 = 1/(float)(zeroExponent(X)); // reciprocal estimate
+//    // pseudo function: HI16(X) returns high 16 bits of double value
+//    int hiWord = HI16(X);
+//    double vtmp1 = (double) 0x77F0 << 48 | mantissa(X);
+//    hiWord -= 16;
+//    if (AS_LONG_BITS(hiWord) > 0x8000) {
+//      // SMALL_VALUE branch
+//      vtmp0 = vtmp1 = vtmp0 * AS_DOUBLE_BITS(0x47F0000000000000);
+//      hiWord = HI16(vtmp1);
+//      vtmp0 = AS_DOUBLE_BITS(AS_LONG_BITS(vtmp0) |= 0x3FF0000000000000);
+//      vtmp5 = (double) (1/(float)vtmp0);
+//      vtmp1 <<= 12;
+//      vtmp1 >>= 12;
+//    }
+//    // MAIN branch
+//    double vtmp3 = AS_LONG_BITS(vtmp1) & 0xffffe00000000000; // hi part
+//    int intB0 = AS_INT_BITS(vtmp5) + 0x8000;
+//    double vtmp0 = AS_DOUBLE_BITS(0xffffe00000000000 & (intB0<<29));
+//    int index = (intB0 >> 16) & 0xFF;
+//    double hiTableValue = _L_tbl[8+index]; // vtmp2[0]
+//    double lowTableValue = _L_tbl[16+index]; // vtmp2[1]
+//    vtmp5 = AS_DOUBLE_BITS((hiWord & 0x7FF0) - 0x3FE0); // 0x3FE0 == 1022 << 4
+//    vtmp1 -= vtmp3; // low part
+//    vtmp3 = vtmp3*vtmp0 - 1.0;
+//    hiTableValue += C4_0 * vtmp5;
+//    lowTableValue += C4_1 * vtmp5;
+//    double r = vtmp1 * vtmp0 + vtmp3; // r = B*mx-1.0, computed in hi and low parts
+//    vtmp0 = hiTableValue + r;
+//    hiTableValue -= vtmp0;
+//    double r2 = r*r;
+//    double r3 = r2*r;
+//    double p7 = C3_0*r2 + C2_0*r3 + C1_0*r2*r2 + C3_1*r3*r2 + C2_1*r3*r3
+//              + C1_1*r3*r2*r2; // degree 7 polynomial
+//    return p7 + (vtmp0 + ((r + hiTableValue) + lowTableValue));
+//  }
+//
+// END dlog PSEUDO CODE
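+// A runnable C model of the pseudo code above (a sketch for cross-checking,
+// not the intrinsic: frexp stands in for the exponent/mantissa bit tricks,
+// and libm's log/log1p stand in for the -log(B) table and p(r) polynomial):
+//
+//   #include <math.h>
+//   #include <stdio.h>
+//
+//   static double dlog_model(double x) {
+//     int k;
+//     double mx = 2.0 * frexp(x, &k);  // x = 2^k * mx, mx in [1,2)
+//     k -= 1;
+//     double B = (double)(int)(128.0 / mx + 0.5) / 128.0; // 1/mx rounded to 2^-7
+//     double r = B * mx - 1.0;                            // reduced argument
+//     return k * M_LN2 - log(B) + log1p(r);
+//   }
+//
+//   int main(void) {
+//     for (double x = 0.5; x < 1.0e6; x *= 9.7)
+//       printf("%-12g model=%.17g libm=%.17g\n", x, dlog_model(x), log(x));
+//     return 0;
+//   }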
+
+
+// Generate log(X). X passed in register v0. Return log(X) into v0.
+// Generator parameters: 10 temporary FPU registers and 5 temporary general
+// purpose registers
+void MacroAssembler::fast_log(FloatRegister vtmp0, FloatRegister vtmp1,
+                              FloatRegister vtmp2, FloatRegister vtmp3,
+                              FloatRegister vtmp4, FloatRegister vtmp5,
+                              FloatRegister C1, FloatRegister C2,
+                              FloatRegister C3, FloatRegister C4,
+                              Register tmp1, Register tmp2, Register tmp3,
+                              Register tmp4, Register tmp5) {
+  Label DONE, CHECK_CORNER_CASES, SMALL_VALUE, MAIN,
+      CHECKED_CORNER_CASES, RETURN_MINF_OR_NAN;
+  const long INF_OR_NAN_PREFIX = 0x7FF0;
+  const long MINF_OR_MNAN_PREFIX = 0xFFF0;
+  const long ONE_PREFIX = 0x3FF0;
+    movz(tmp2, ONE_PREFIX, 48);
+    movz(tmp4, 0x0010, 48);
+    fmovd(rscratch1, v0); // rscratch1 = AS_LONG_BITS(X)
+    lea(rscratch2, ExternalAddress((address)_L_tbl));
+    movz(tmp5, 0x7F);
+    add(tmp1, rscratch1, tmp4);
+    cmp(tmp2, rscratch1);
+    lsr(tmp3, rscratch1, 29);
+    ccmp(tmp1, tmp4, 0b1101 /* LE */, NE);
+    bfm(tmp3, tmp5, 41, 8);
+    fmovs(vtmp5, tmp3);
+    // Load coefficients from the table. The coefficients are laid out in this
+    // specific order because the load below fills vectors that are used later
+    // by vector instructions; the load proceeds in parallel while the
+    // branches below are taken. C1 will contain the vector {C1_0, C1_1}, C2 =
+    // {C2_0, C2_1}, C3 = {C3_0, C3_1}, C4 = {C4_0, C4_1}
+    ld1(C1, C2, C3, C4, T2D, post(rscratch2, 64));
+    br(LE, CHECK_CORNER_CASES);
+  bind(CHECKED_CORNER_CASES);
+    // all corner cases are handled
+    frecpe(vtmp5, vtmp5, S);                   // vtmp5 ~= 1/vtmp5
+    lsr(tmp2, rscratch1, 48);
+    movz(tmp4, 0x77f0, 48);
+    fmovd(vtmp4, 1.0d);
+    movz(tmp1, INF_OR_NAN_PREFIX, 48);
+    bfm(tmp4, rscratch1, 0, 51);               // tmp4 = 0x77F0 << 48 | mantissa(X)
+    // vtmp1 = AS_DOUBLE_BITS(0x77F0 << 48 | mantissa(X)) == mx
+    fmovd(vtmp1, tmp4);
+    subw(tmp2, tmp2, 16);
+    cmp(tmp2, 0x8000);
+    br(GE, SMALL_VALUE);
+  bind(MAIN);
+    fmovs(tmp3, vtmp5);                        // int intB0 = AS_INT_BITS(B);
+    mov(tmp5, 0x3FE0);
+    mov(rscratch1, 0xffffe00000000000);
+    andr(tmp2, tmp2, tmp1, LSR, 48);           // hiWord & 0x7FF0
+    sub(tmp2, tmp2, tmp5);                     // tmp2 = hiWord & 0x7FF0 - 0x3FE0
+    scvtfwd(vtmp5, tmp2);                      // vtmp5 = (double)tmp2;
+    addw(tmp3, tmp3, 0x8000);                  // tmp3 = B
+    andr(tmp4, tmp4, rscratch1);               // tmp4 == hi_part(mx)
+    andr(rscratch1, rscratch1, tmp3, LSL, 29); // rscratch1 = hi_part(B)
+    ubfm(tmp3, tmp3, 16, 23);                  // int index = (intB0 >> 16) & 0xFF
+    ldrq(vtmp2, Address(rscratch2, tmp3, Address::lsl(4))); // vtmp2 = _L_tbl[index]
+    // AS_LONG_BITS(vtmp1) & 0xffffe00000000000 // hi_part(mx)
+    fmovd(vtmp3, tmp4);
+    fmovd(vtmp0, rscratch1);                   // vtmp0 = hi_part(B)
+    fsubd(vtmp1, vtmp1, vtmp3);                // vtmp1 -= vtmp3; // low_part(mx)
+    fnmsub(vtmp3, vtmp3, vtmp0, vtmp4);        // vtmp3 = vtmp3*vtmp0 - vtmp4
+    fmlavs(vtmp2, T2D, C4, vtmp5, 0);          // vtmp2 += {C4} * vtmp5
+    // vtmp1 = r = vtmp1 * vtmp0 + vtmp3 == low_part(mx) * hi_part(B) + (hi_part(mx)*hi_part(B) - 1.0)
+    fmaddd(vtmp1, vtmp1, vtmp0, vtmp3);
+    ins(vtmp5, D, vtmp2, 0, 1);                // vtmp5 = vtmp2[1];
+    faddd(vtmp0, vtmp2, vtmp1);                // vtmp0 = vtmp2 + vtmp1
+    fmlavs(C3, T2D, C2, vtmp1, 0);             // {C3} += {C2}*vtmp1
+    fsubd(vtmp2, vtmp2, vtmp0);                // vtmp2 -= vtmp0
+    fmuld(vtmp3, vtmp1, vtmp1);                // vtmp3 = vtmp1*vtmp1
+    faddd(C4, vtmp1, vtmp2);                   // C4[0] = vtmp1 + vtmp2
+    fmlavs(C3, T2D, C1, vtmp3, 0);             // {C3} += {C1}*vtmp3
+    faddd(C4, C4, vtmp5);                      // C4 += vtmp5
+    fmuld(vtmp4, vtmp3, vtmp1);                // vtmp4 = vtmp3*vtmp1
+    faddd(vtmp0, vtmp0, C4);                   // vtmp0 += C4
+    fmlavs(C3, T2D, vtmp4, C3, 1);             // {C3} += {vtmp4}*C3[1]
+    fmaddd(vtmp0, C3, vtmp3, vtmp0);           // vtmp0 = C3 * vtmp3 + vtmp0
+    ret(lr);
+
+  block_comment("if (AS_LONG_BITS(hiWord) > 0x8000)"); {
+    bind(SMALL_VALUE);
+      movz(tmp2, 0x47F0, 48);
+      fmovd(vtmp1, tmp2);
+      fmuld(vtmp0, vtmp1, v0);
+      fmovd(vtmp1, vtmp0);
+      umov(tmp2, vtmp1, S, 3);
+      orr(vtmp0, T16B, vtmp0, vtmp4);
+      ushr(vtmp5, T2D, vtmp0, 27);
+      ushr(vtmp5, T4S, vtmp5, 2);
+      frecpe(vtmp5, vtmp5, S);
+      shl(vtmp1, T2D, vtmp1, 12);
+      ushr(vtmp1, T2D, vtmp1, 12);
+      b(MAIN);
+  }
+
+  block_comment("Corner cases"); {
+    bind(RETURN_MINF_OR_NAN);
+      movz(tmp1, MINF_OR_MNAN_PREFIX, 48);
+      orr(rscratch1, rscratch1, tmp1);
+      fmovd(v0, rscratch1);
+      ret(lr);
+    bind(CHECK_CORNER_CASES);
+      movz(tmp1, INF_OR_NAN_PREFIX, 48);
+      cmp(rscratch1, zr);
+      br(LE, RETURN_MINF_OR_NAN);
+      cmp(rscratch1, tmp1);
+      br(GE, DONE);
+      cmp(rscratch1, tmp2);
+      br(NE, CHECKED_CORNER_CASES);
+      fmovd(v0, 0.0d);
+  }
+  bind(DONE);
+    ret(lr);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -0,0 +1,1488 @@
+/* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, Cavium. All rights reserved. (By BELLSOFT)
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "macroAssembler_aarch64.hpp"
+
+// The following code is an optimized version of the fdlibm sin/cos
+// implementation (C code is in share/runtime/sharedRuntimeTrig.cpp) adapted
+// for AARCH64.
+
+// For sin/cos approximation via polynomials and trigonometric argument
+// reduction techniques, please refer to the following literature:
+//
+// [1] Muller, Jean-Michel, Nicolas Brisebarre, Florent De Dinechin,
+// Claude-Pierre Jeannerod, Vincent Lefevre, Guillaume Melquiond,
+// Nathalie Revol, Damien Stehlé, and Serge Torres:
+// Handbook of floating-point arithmetic.
+// Springer Science & Business Media, 2009.
+// [2] K. C. Ng
+// Argument Reduction for Huge Arguments: Good to the Last Bit
+// July 13, 1992, SunPro
+//
+// HOW TO READ THIS CODE:
+// This code consists of several functions. Each function has the following header:
+// 1) Description
+// 2) C-pseudo code with differences from fdlibm marked by comments starting
+//        with "NOTE". Check unmodified fdlibm code in
+//        share/runtime/sharedRuntimeTrig.cpp
+// 3) Brief textual description of changes between fdlibm and current
+//        implementation along with optimization notes (if applicable)
+// 4) Assumptions, input and output
+// 5) (Optional) additional notes about intrinsic implementation
+// Each function is separated into blocks that follow the pseudo-code structure
+//
+// HIGH-LEVEL ALGORITHM DESCRIPTION:
+//    - entry point: generate_dsin_dcos(...);
+//    - check corner cases: NaN, INF, tiny argument.
+//    - check if |x| < Pi/4. Then approximate sin/cos via polynomial (kernel_sin/kernel_cos)
+//    -- else proceed to argument reduction routine (__ieee754_rem_pio2) and
+//           use reduced argument to get result via kernel_sin/kernel_cos
+//
+// HIGH-LEVEL CHANGES BETWEEN INTRINSICS AND FDLIBM:
+// 1) the fdlibm representation of the two_over_pi table is int[], while the
+// intrinsic version has these int values converted to double representation,
+// so the converted double values can be loaded directly (see
+// stubRoutines_aarch64::_two_over_pi)
+// 2) Several loops are unrolled and vectorized: see comments in code after
+// labels: SKIP_F_LOAD, RECOMP_FOR1_CHECK, RECOMP_FOR2
+// 3) the fdlibm npio2_hw table now has a "prefix" with constants used in the
+// calculation. These constants are loaded from the npio2_hw table instead of
+// being constructed in code (see stubRoutines_aarch64.cpp)
+// 4) polynomial coefficients for sin and cos are moved to the sin_coef and
+// cos_coef tables to use the same optimization as in 3). This allows most of
+// the required constants to be loaded with a single instruction
+//
+//
+//
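+// A hedged C outline of the driver described above (quadrant selection as in
+// fdlibm's sin(); kernel_sin/kernel_cos stand for the polynomial kernels,
+// with signatures simplified, and ieee754_rem_pio2 for the reduction below):
+//
+//   double dsin_model(double x) {
+//     if (isnan(x) || isinf(x)) return x - x;           // NaN
+//     if (fabs(x) < M_PI_4) return kernel_sin(x, 0.0);  // tiny/small args
+//     double y[2];
+//     int n = ieee754_rem_pio2(x, y);                   // |y[0]+y[1]| < pi/4
+//     switch (n & 3) {
+//       case 0:  return  kernel_sin(y[0], y[1]);
+//       case 1:  return  kernel_cos(y[0], y[1]);
+//       case 2:  return -kernel_sin(y[0], y[1]);
+//       default: return -kernel_cos(y[0], y[1]);
+//     }
+//   }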
+///* __ieee754_rem_pio2(x,y)
+// *
+// * returns n (which is like x div pi/2) and stores the remainder of
+// * x rem pi/2 in y[0]+y[1]
+// * x is input argument, y[] is hi and low parts of reduced argument (x)
+// * uses __kernel_rem_pio2()
+// */
+// // use tables(see stubRoutines_aarch64.cpp): two_over_pi and modified npio2_hw
+//
+// BEGIN __ieee754_rem_pio2 PSEUDO CODE
+//
+//static int __ieee754_rem_pio2(double x, double *y) {
+//  double z,w,t,r,fn;
+//  double tx[3];
+//  int e0,i,j,nx,n,ix,hx,i0;
+//
+//  i0 = ((*(int*)&two24A)>>30)^1;        /* high word index */
+//  hx = *(i0+(int*)&x);          /* high word of x */
+//  ix = hx&0x7fffffff;
+//  if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
+//    if(hx>0) {
+//      z = x - pio2_1;
+//      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
+//        y[0] = z - pio2_1t;
+//        y[1] = (z-y[0])-pio2_1t;
+//      } else {                /* near pi/2, use 33+33+53 bit pi */
+//        z -= pio2_2;
+//        y[0] = z - pio2_2t;
+//        y[1] = (z-y[0])-pio2_2t;
+//      }
+//      return 1;
+//    } else {    /* negative x */
+//      z = x + pio2_1;
+//      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
+//        y[0] = z + pio2_1t;
+//        y[1] = (z-y[0])+pio2_1t;
+//      } else {                /* near pi/2, use 33+33+53 bit pi */
+//        z += pio2_2;
+//        y[0] = z + pio2_2t;
+//        y[1] = (z-y[0])+pio2_2t;
+//      }
+//      return -1;
+//    }
+//  }
+//  if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
+//    t  = fabsd(x);
+//    n  = (int) (t*invpio2+half);
+//    fn = (double)n;
+//    r  = t-fn*pio2_1;
+//    w  = fn*pio2_1t;    /* 1st round good to 85 bit */
+//    // NOTE: y[0] = r-w; is moved from if/else below to be before "if"
+//    y[0] = r-w;
+//    if(n<32&&ix!=npio2_hw[n-1]) {
+//      // y[0] = r-w;       /* quick check no cancellation */ // NOTE: moved earlier
+//    } else {
+//      j  = ix>>20;
+//      // y[0] = r-w; // NOTE: moved earlier
+//      i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
+//      if(i>16) {  /* 2nd iteration needed, good to 118 */
+//        t  = r;
+//        w  = fn*pio2_2;
+//        r  = t-w;
+//        w  = fn*pio2_2t-((t-r)-w);
+//        y[0] = r-w;
+//        i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
+//        if(i>49)  {     /* 3rd iteration need, 151 bits acc */
+//          t  = r;       /* will cover all possible cases */
+//          w  = fn*pio2_3;
+//          r  = t-w;
+//          w  = fn*pio2_3t-((t-r)-w);
+//          y[0] = r-w;
+//        }
+//      }
+//    }
+//    y[1] = (r-y[0])-w;
+//    if(hx<0)    {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+//    else         return n;
+//  }
+//  /*
+//   * all other (large) arguments
+//   */
+//  // NOTE: this check is removed, because it was checked in dsin/dcos
+//  // if(ix>=0x7ff00000) {          /* x is inf or NaN */
+//  //  y[0]=y[1]=x-x; return 0;
+//  // }
+//  /* set z = scalbn(|x|,ilogb(x)-23) */
+//  *(1-i0+(int*)&z) = *(1-i0+(int*)&x);
+//  e0    = (ix>>20)-1046;        /* e0 = ilogb(z)-23; */
+//  *(i0+(int*)&z) = ix - (e0<<20);
+//
+//  // NOTE: "for" loop below is unrolled. See comments in asm code
+//  for(i=0;i<2;i++) {
+//    tx[i] = (double)((int)(z));
+//    z     = (z-tx[i])*two24A;
+//  }
+//
+//  tx[2] = z;
+//  nx = 3;
+//
+//  // NOTE: while(tx[nx-1]==zeroA) nx--;  is unrolled. See comments in asm code
+//  while(tx[nx-1]==zeroA) nx--;  /* skip zero term */
+//
+//  n  =  __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
+//  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+//  return n;
+//}
+//
+// END __ieee754_rem_pio2 PSEUDO CODE
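+// A runnable C model of the medium-size branch (|x| ~<= 2^19*(pi/2)) above;
+// a hedged sketch using the standard fdlibm split of pi/2 (first-iteration
+// path only, without the npio2_hw cancellation recheck):
+//
+//   #include <math.h>
+//
+//   static const double invpio2 = 6.36619772367581382433e-01; // 53 bits of 2/pi
+//   static const double pio2_1  = 1.57079632673412561417e+00; // first 33 bits of pi/2
+//   static const double pio2_1t = 6.07710050650619224932e-11; // pi/2 - pio2_1
+//
+//   static int rem_pio2_medium(double x, double *y0, double *y1) {
+//     double t = fabs(x);
+//     int n = (int)(t * invpio2 + 0.5);
+//     double fn = (double)n;
+//     double r = t - fn * pio2_1;  // exact: pio2_1 has enough trailing zeros
+//     double w = fn * pio2_1t;     // 1st round good to 85 bits
+//     *y0 = r - w;
+//     *y1 = (r - *y0) - w;
+//     if (x < 0) { *y0 = -*y0; *y1 = -*y1; return -n; }
+//     return n;
+//   }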
+//
+// Changes between fdlibm and intrinsic for __ieee754_rem_pio2:
+//     1. INF/NaN check for huge argument is removed in comparison with fdlibm
+//     code, because this check is already done in dcos/dsin code
+//     2. Most constants are now loaded from a table instead of being initialized directly
+//     3. Two loops are unrolled
+// Assumptions:
+//     1. Assume |X| >= PI/4
+//     2. Assume rscratch1 = 0x3fe921fb00000000  (~ PI/4)
+//     3. Assume ix = r3
+// Input and output:
+//     1. Input: X = r0
+//     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
+// NOTE: general purpose register names match local variable names in C code
+// NOTE: fpu registers are actively reused. See comments in code about their usage
+void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw,
+    address two_over_pi, address pio2) {
+  const long PIO2_1t = 0x3DD0B4611A626331UL;
+  const long PIO2_2  = 0x3DD0B4611A600000UL;
+  const long PIO2_2t = 0x3BA3198A2E037073UL;
+  Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE,
+      REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET,
+      X_IS_NEGATIVE_LONG_PI;
+  Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6,
+      tmp3 = r7, iqBase = r10, ih = r11, i = r17;
+    // initializing constants first
+    // rscratch1 = 0x3fe921fb00000000 (see assumptions)
+    movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0
+    mov(rscratch2, 0x4002d97c); // 3*PI/4 high word
+    movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1
+    fmovd(v1, rscratch1); // v1 = PIO2_1
+    cmp(rscratch2, ix);
+    br(LE, X_IS_MEDIUM_OR_LARGE);
+
+    block_comment("if(ix<0x4002d97c) {...  /* |x| ~< 3pi/4 */ "); {
+      cmp(X, zr);
+      br(LT, X_IS_NEGATIVE);
+
+      block_comment("if(hx>0) {"); {
+        fsubd(v2, v0, v1); // v2 = z = x - pio2_1
+        cmp(ix, rscratch1, LSR, 32);
+        mov(n, 1);
+        br(EQ, X_IS_POSITIVE_LONG_PI);
+
+        block_comment("case: hx > 0 &&  ix!=0x3ff921fb {"); { /* 33+53 bit pi is good enough */
+          mov(rscratch2, PIO2_1t);
+          fmovd(v27, rscratch2);
+          fsubd(v4, v2, v27); // v4 = y[0] = z - pio2_1t;
+          fsubd(v5, v2, v4);
+          fsubd(v5, v5, v27); // v5 = y[1] = (z-y[0])-pio2_1t
+          b(REDUCTION_DONE);
+        }
+
+        block_comment("case: hx > 0 && ix==0x3ff921fb {"); { /* near pi/2, use 33+33+53 bit pi */
+          bind(X_IS_POSITIVE_LONG_PI);
+            mov(rscratch1, PIO2_2);
+            mov(rscratch2, PIO2_2t);
+            fmovd(v27, rscratch1);
+            fmovd(v6, rscratch2);
+            fsubd(v2, v2, v27); // z-= pio2_2
+            fsubd(v4, v2, v6);  // y[0] = z - pio2_2t
+            fsubd(v5, v2, v4);
+            fsubd(v5, v5, v6);  // v5 = (z - y[0]) - pio2_2t
+            b(REDUCTION_DONE);
+        }
+      }
+
+      block_comment("case: hx <= 0)"); {
+        bind(X_IS_NEGATIVE);
+          faddd(v2, v0, v1); // v2 = z = x + pio2_1
+          cmp(ix, rscratch1, LSR, 32);
+          mov(n, -1);
+          br(EQ, X_IS_NEGATIVE_LONG_PI);
+
+          block_comment("case: hx <= 0 && ix!=0x3ff921fb) {"); { /* 33+53 bit pi is good enough */
+            mov(rscratch2, PIO2_1t);
+            fmovd(v27, rscratch2);
+            faddd(v4, v2, v27); // v4 = y[0] = z + pio2_1t;
+            fsubd(v5, v2, v4);
+            faddd(v5, v5, v27); // v5 = y[1] = (z-y[0]) + pio2_1t
+            b(REDUCTION_DONE);
+          }
+
+          block_comment("case: hx <= 0 && ix==0x3ff921fb"); { /* near pi/2, use 33+33+53 bit pi */
+            bind(X_IS_NEGATIVE_LONG_PI);
+              mov(rscratch1, PIO2_2);
+              mov(rscratch2, PIO2_2t);
+              fmovd(v27, rscratch1);
+              fmovd(v6, rscratch2);
+              faddd(v2, v2, v27); // z += pio2_2
+              faddd(v4, v2, v6);  // y[0] = z + pio2_2t
+              fsubd(v5, v2, v4);
+              faddd(v5, v5, v6);  // v5 = (z - y[0]) + pio2_2t
+              b(REDUCTION_DONE);
+          }
+      }
+  }
+  bind(X_IS_MEDIUM_OR_LARGE);
+    mov(rscratch1, 0x413921fb);
+    cmp(ix, rscratch1); // ix <= 0x413921fb ?
+    br(GT, X_IS_LARGE);
+
+    block_comment("|x| ~<= 2^19*(pi/2), medium size"); {
+      lea(ih, ExternalAddress(npio2_hw));
+      ld1(v4, v5, v6, v7, T1D, ih);
+      fabsd(v31, v0);          // v31 = t = |x|
+      add(ih, ih, 64);
+      fmaddd(v2, v31, v5, v4); // v2 = t * invpio2 + half (invpio2 = 53 bits of 2/pi, half = 0.5)
+      fcvtzdw(n, v2);          // n = (int) v2
+      frintzd(v2, v2);
+      fmsubd(v3, v2, v6, v31); // v3 = r = t - fn * pio2_1
+      fmuld(v26, v2, v7);      // v26 = w = fn * pio2_1t
+      fsubd(v4, v3, v26);      // y[0] = r - w. Calculated before branch
+      cmp(n, 32);
+      br(GT, LARGE_ELSE);
+      subw(tmp5, n, 1);        // tmp5 = n - 1
+      ldrw(jv, Address(ih, tmp5, Address::lsl(2)));
+      cmp(ix, jv);
+      br(NE, X_IS_MEDIUM_BRANCH_DONE);
+
+      block_comment("else block for if(n<32&&ix!=npio2_hw[n-1])"); {
+        bind(LARGE_ELSE);
+          fmovd(jx, v4);
+          lsr(tmp5, ix, 20);                       // j = ix >> 20
+          lsl(jx, jx, 1);
+          sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1);   // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
+
+          block_comment("if(i>16)"); {
+            cmp(tmp3, 16);
+            br(LE, X_IS_MEDIUM_BRANCH_DONE);
+            // i > 16. 2nd iteration needed
+            ldpd(v6, v7, Address(ih, -32));
+            fmovd(v28, v3);                        // t = r
+            fmuld(v29, v2, v6);                    // w = v29 = fn * pio2_2
+            fsubd(v3, v28, v29);                   // r = t - w
+            fsubd(v31, v28, v3);                   // v31 = (t - r)
+            fsubd(v31, v29, v31);                  // v31 = w - (t - r) = - ((t - r) - w)
+            fmaddd(v26, v2, v7, v31);              // v26 = w = fn*pio2_2t - ((t - r) - w)
+            fsubd(v4, v3, v26);                    // y[0] = r - w
+            fmovd(jx, v4);
+            lsl(jx, jx, 1);
+            sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1); // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
+
+            block_comment("if(i>49)"); {
+              cmp(tmp3, 49);
+              br(LE, X_IS_MEDIUM_BRANCH_DONE);
+              // 3rd iteration needed, 151 bits acc
+              ldpd(v6, v7, Address(ih, -16));
+              fmovd(v28, v3);                      // save "r"
+              fmuld(v29, v2, v6);                  // v29 = fn * pio2_3
+              fsubd(v3, v28, v29);                 // r = r - w
+              fsubd(v31, v28, v3);                 // v31 = (t - r)
+              fsubd(v31, v29, v31);                // v31 = w - (t - r) = - ((t - r) - w)
+              fmaddd(v26, v2, v7, v31);            // v26 = w = fn*pio2_3t - ((t - r) - w)
+              fsubd(v4, v3, v26);                  // y[0] = r - w
+            }
+          }
+      }
+    block_comment("medium x tail"); {
+      bind(X_IS_MEDIUM_BRANCH_DONE);
+        fsubd(v5, v3, v4);                         // v5 = y[1] = (r - y[0])
+        fsubd(v5, v5, v26);                        // v5 = y[1] = (r - y[0]) - w
+        cmp(X, zr);
+        br(GT, REDUCTION_DONE);
+        fnegd(v4, v4);
+        negw(n, n);
+        fnegd(v5, v5);
+        b(REDUCTION_DONE);
+    }
+  }
+
+  block_comment("all other (large) arguments"); {
+    bind(X_IS_LARGE);
+      lsr(rscratch1, ix, 20);                      // ix >> 20
+      movz(tmp5, 0x4170, 48);
+      subw(rscratch1, rscratch1, 1046);            // e0
+      fmovd(v10, tmp5);                            // init two24A value
+      subw(jv, ix, rscratch1, LSL, 20);            // ix - (e0<<20)
+      lsl(jv, jv, 32);
+      subw(rscratch2, rscratch1, 3);
+      bfm(jv, X, 0, 31);                           // jv = z
+      movw(i, 24);
+      fmovd(v26, jv);                              // v26 = z
+
+      block_comment("unrolled for(i=0;i<2;i++) {tx[i] = (double)((int)(z));z = (z-tx[i])*two24A;}"); {
+        // tx[0,1,2] = v6,v7,v26
+        frintzd(v6, v26);                          // v6 = (double)((int)v26)
+        sdivw(jv, rscratch2, i);                   // jv = (e0 - 3)/24
+        fsubd(v26, v26, v6);
+        sub(sp, sp, 560);
+        fmuld(v26, v26, v10);
+        frintzd(v7, v26);                          // v7 = (double)((int)v26)
+        movw(jx, 2); // calculate jx as nx - 1, which is initially 2. Not a part of unrolled loop
+        fsubd(v26, v26, v7);
+      }
+
+      block_comment("nx calculation with unrolled while(tx[nx-1]==zeroA) nx--;"); {
+        fcmpd(v26, 0.0d);                          // if NE then jx == 2. else it's 1 or 0
+        add(iqBase, sp, 480);                      // base of iq[]
+        fmuld(v3, v26, v10);
+        br(NE, NX_SET);
+        fcmpd(v7, 0.0d);                           // v7 == 0 => jx = 0. Else jx = 1
+        csetw(jx, NE);
+      }
+    bind(NX_SET);
+      generate__kernel_rem_pio2(two_over_pi, pio2);
+      // now we have y[0] = v4, y[1] = v5 and n = r2
+      cmp(X, zr);
+      br(GE, REDUCTION_DONE);
+      fnegd(v4, v4);
+      fnegd(v5, v5);
+      negw(n, n);
+  }
+  bind(REDUCTION_DONE);
+}
+
+///*
+// * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
+// * double x[],y[]; int e0,nx,prec; int ipio2[];
+// *
+// * __kernel_rem_pio2 return the last three digits of N with
+// *              y = x - N*pi/2
+// * so that |y| < pi/2.
+// *
+// * The method is to compute the integer (mod 8) and fraction parts of
+// * (2/pi)*x without doing the full multiplication. In general we
+// * skip the part of the product that are known to be a huge integer (
+// * more accurately, = 0 mod 8 ). Thus the number of operations are
+// * independent of the exponent of the input.
+// *
+// * NOTE: 2/pi int representation is converted to double
+// * // (2/pi) is represented by an array of 24-bit integers in ipio2[].
+// *
+// * Input parameters:
+// *      x[]     The input value (must be positive) is broken into nx
+// *              pieces of 24-bit integers in double precision format.
+// *              x[i] will be the i-th 24 bit of x. The scaled exponent
+// *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
+// *              match x's up to 24 bits.
+// *
+// *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
+// *                      e0 = ilogb(z)-23
+// *                      z  = scalbn(z,-e0)
+// *              for i = 0,1,2
+// *                      x[i] = floor(z)
+// *                      z    = (z-x[i])*2**24
+// *
+// *
+// *      y[]     output result in an array of double precision numbers.
+// *              The dimension of y[] is:
+// *                      24-bit  precision       1
+// *                      53-bit  precision       2
+// *                      64-bit  precision       2
+// *                      113-bit precision       3
+// *              The actual value is the sum of them. Thus for 113-bit
+// *              precision, one may have to do something like:
+// *
+// *              long double t,w,r_head, r_tail;
+// *              t = (long double)y[2] + (long double)y[1];
+// *              w = (long double)y[0];
+// *              r_head = t+w;
+// *              r_tail = w - (r_head - t);
+// *
+// *      e0      The exponent of x[0]
+// *
+// *      nx      dimension of x[]
+// *
+// *      prec    an integer indicating the precision:
+// *                      0       24  bits (single)
+// *                      1       53  bits (double)
+// *                      2       64  bits (extended)
+// *                      3       113 bits (quad)
+// *
+// *      NOTE: ipio2[] array below is converted to double representation
+// *      //ipio2[]
+// *      //        integer array, contains the (24*i)-th to (24*i+23)-th
+// *      //        bit of 2/pi after binary point. The corresponding
+// *      //        floating value is
+// *
+// *                      ipio2[i] * 2^(-24(i+1)).
+// *
+// * Here is the description of some local variables:
+// *
+// *      jk      jk+1 is the initial number of terms of ipio2[] needed
+// *              in the computation. The recommended value is 2,3,4,
+// *              6 for single, double, extended,and quad.
+// *
+// *      jz      local integer variable indicating the number of
+// *              terms of ipio2[] used.
+// *
+// *      jx      nx - 1
+// *
+// *      jv      index for pointing to the suitable ipio2[] for the
+// *              computation. In general, we want
+// *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
+// *              is an integer. Thus
+// *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
+// *              Hence jv = max(0,(e0-3)/24).
+// *
+// *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
+// *
+// *      q[]     double array with integral value, representing the
+// *              24-bits chunk of the product of x and 2/pi.
+// *
+// *      q0      the corresponding exponent of q[0]. Note that the
+// *              exponent for q[i] would be q0-24*i.
+// *
+// *      PIo2[]  double precision array, obtained by cutting pi/2
+// *              into 24 bits chunks.
+// *
+// *      f[]     ipio2[] in floating point
+// *
+// *      iq[]    integer array by breaking up q[] in 24-bits chunk.
+// *
+// *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
+// *
+// *      ih      integer. If >0 it indicates q[] is >= 0.5, hence
+// *              it also indicates the *sign* of the result.
+// *
+// */
+//
+// Use PIo2 table(see stubRoutines_aarch64.cpp)
+//
+// BEGIN __kernel_rem_pio2 PSEUDO CODE
+//
+//static int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec,
+//    /* NOTE: converted to double; was const int *ipio2 */ const double *ipio2) {
+//  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
+//  double z,fw,f[20],fq[20],q[20];
+//
+//  /* initialize jk*/
+//  // jk = init_jk[prec]; // NOTE: prec==2 for double. jk is always 4.
+//  jp = jk; // NOTE: always 4
+//
+//  /* determine jx,jv,q0, note that 3>q0 */
+//  jx =  nx-1;
+//  jv = (e0-3)/24; if(jv<0) jv=0;
+//  q0 =  e0-24*(jv+1);
+//
+//  /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
+//  j = jv-jx; m = jx+jk;
+//
+//  // NOTE: split into two for-loops: one with zeroB and one with ipio2[j]. It
+//  //       allows the use of wider loads/stores
+//  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; //(double) ipio2[j];
+//
+//  // NOTE: unrolled and vectorized "for". See comments in asm code
+//  /* compute q[0],q[1],...q[jk] */
+//  for (i=0;i<=jk;i++) {
+//    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
+//  }
+//
+//  jz = jk;
+//recompute:
+//  /* distill q[] into iq[] reversingly */
+//  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
+//    fw    =  (double)((int)(twon24* z));
+//    iq[i] =  (int)(z-two24B*fw);
+//    z     =  q[j-1]+fw;
+//  }
+//
+//  /* compute n */
+//  z  = scalbnA(z,q0);           /* actual value of z */
+//  z -= 8.0*floor(z*0.125);              /* trim off integer >= 8 */
+//  n  = (int) z;
+//  z -= (double)n;
+//  ih = 0;
+//  if(q0>0) {    /* need iq[jz-1] to determine n */
+//    i  = (iq[jz-1]>>(24-q0)); n += i;
+//    iq[jz-1] -= i<<(24-q0);
+//    ih = iq[jz-1]>>(23-q0);
+//  }
+//  else if(q0==0) ih = iq[jz-1]>>23;
+//  else if(z>=0.5) ih=2;
+//
+//  if(ih>0) {    /* q > 0.5 */
+//    n += 1; carry = 0;
+//    for(i=0;i<jz ;i++) {        /* compute 1-q */
+//      j = iq[i];
+//      if(carry==0) {
+//        if(j!=0) {
+//          carry = 1; iq[i] = 0x1000000- j;
+//        }
+//      } else  iq[i] = 0xffffff - j;
+//    }
+//    if(q0>0) {          /* rare case: chance is 1 in 12 */
+//      switch(q0) {
+//      case 1:
+//        iq[jz-1] &= 0x7fffff; break;
+//      case 2:
+//        iq[jz-1] &= 0x3fffff; break;
+//      }
+//    }
+//    if(ih==2) {
+//      z = one - z;
+//      if(carry!=0) z -= scalbnA(one,q0);
+//    }
+//  }
+//
+//  /* check if recomputation is needed */
+//  if(z==zeroB) {
+//    j = 0;
+//    for (i=jz-1;i>=jk;i--) j |= iq[i];
+//    if(j==0) { /* need recomputation */
+//      for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
+//
+//      for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
+//        f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
+//        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
+//        q[i] = fw;
+//      }
+//      jz += k;
+//      goto recompute;
+//    }
+//  }
+//
+//  /* chop off zero terms */
+//  if(z==0.0) {
+//    jz -= 1; q0 -= 24;
+//    while(iq[jz]==0) { jz--; q0-=24;}
+//  } else { /* break z into 24-bit if necessary */
+//    z = scalbnA(z,-q0);
+//    if(z>=two24B) {
+//      fw = (double)((int)(twon24*z));
+//      iq[jz] = (int)(z-two24B*fw);
+//      jz += 1; q0 += 24;
+//      iq[jz] = (int) fw;
+//    } else iq[jz] = (int) z ;
+//  }
+//
+//  /* convert integer "bit" chunk to floating-point value */
+//  fw = scalbnA(one,q0);
+//  for(i=jz;i>=0;i--) {
+//    q[i] = fw*(double)iq[i]; fw*=twon24;
+//  }
+//
+//  /* compute PIo2[0,...,jp]*q[jz,...,0] */
+//  for(i=jz;i>=0;i--) {
+//    for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
+//    fq[jz-i] = fw;
+//  }
+//
+//  // NOTE: switch below is eliminated, because prec is always 2 for doubles
+//  /* compress fq[] into y[] */
+//  //switch(prec) {
+//  //case 0:
+//  //  fw = 0.0;
+//  //  for (i=jz;i>=0;i--) fw += fq[i];
+//  //  y[0] = (ih==0)? fw: -fw;
+//  //  break;
+//  //case 1:
+//  //case 2:
+//    fw = 0.0;
+//    for (i=jz;i>=0;i--) fw += fq[i];
+//    y[0] = (ih==0)? fw: -fw;
+//    fw = fq[0]-fw;
+//    for (i=1;i<=jz;i++) fw += fq[i];
+//    y[1] = (ih==0)? fw: -fw;
+//  //  break;
+//  //case 3:       /* painful */
+//  //  for (i=jz;i>0;i--) {
+//  //    fw      = fq[i-1]+fq[i];
+//  // fq[i]  += fq[i-1]-fw;
+//  //    fq[i-1] = fw;
+//  //  }
+//  //  for (i=jz;i>1;i--) {
+//  //    fw      = fq[i-1]+fq[i];
+//  //    fq[i]  += fq[i-1]-fw;
+//  //    fq[i-1] = fw;
+//  //  }
+//  //  for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
+//  //  if(ih==0) {
+//  //    y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
+//  //  } else {
+//  //    y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
+//  //  }
+//  //}
+//  return n&7;
+//}
+//
+// END __kernel_rem_pio2 PSEUDO CODE
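+// The 24-bit decomposition used throughout (both for x[] on entry and for
+// re-chunking z) can be modeled by this hedged C helper, transcribed from
+// the pseudo code above:
+//
+//   static void split24(double z, double tx[3]) {
+//     const double two24 = 16777216.0;  // 2^24, aka two24A/two24B
+//     for (int i = 0; i < 2; i++) {
+//       tx[i] = (double)(int)z;         // peel off the integer part
+//       z = (z - tx[i]) * two24;        // shift the next 24 bits up
+//     }
+//     tx[2] = z;
+//   }
+//
+// so that z == tx[0] + tx[1]*2^-24 + tx[2]*2^-48, with tx[0] and tx[1]
+// integer-valued doubles in [0, 2^24).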
+//
+// Changes between fdlibm and intrinsic:
+//     1. One loop is unrolled and vectorized (see comments in code)
+//     2. One loop is split into 2 loops (see comments in code)
+//     3. Non-double code is removed (last switch). Several variables became
+//         constants because of that (see comments in code)
+//     4. jx, which is nx - 1, is used instead of nx
+// Assumptions:
+//     1. Assume |X| >= PI/4
+// Input and output:
+//     1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
+//     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
+// NOTE: general purpose register names match local variable names in C code
+// NOTE: fpu registers are actively reused. See comments in code about their usage
+void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
+  Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
+      RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
+      INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
+      Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
+      RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
+      CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
+      IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
+      RECOMP_FOR1_CHECK;
+  Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
+      tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
+      jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
+    // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
+    // jx = nx - 1
+    lea(twoOverPiBase, ExternalAddress(two_over_pi));
+    cmpw(jv, zr);
+    addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
+    cselw(jv, jv, zr, GE);
+    fmovd(v26, 0.0d);
+    addw(tmp5, jv, 1);                    // jv+1
+    subsw(j, jv, jx);
+    add(qBase, sp, 320);                  // base of q[]
+    msubw(rscratch1, i, tmp5, rscratch1); // q0 =  e0-24*(jv+1)
+    // use double f[20], fq[20], q[20] and int iq[20] on the stack, which is
+    // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
+    // will contain f[20], fq[20], q[20], iq[20]
+    // now initialize f[20] indexes 0..m (inclusive)
+    // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
+    mov(tmp5, sp);
+
+    block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
+        eorw(i, i, i);
+        br(GE, INIT_F_COPY);
+      bind(INIT_F_ZERO);
+        stpq(v26, v26, Address(post(tmp5, 32)));
+        addw(i, i, 4);
+        addsw(j, j, 4);
+        br(LT, INIT_F_ZERO);
+        subw(i, i, j);
+        movw(j, zr);
+      bind(INIT_F_COPY);
+        add(tmp1, twoOverPiBase, j, LSL, 3); // ipio2[j] start address
+        ld1(v18, v19, v20, v21, T16B, tmp1);
+        add(tmp5, sp, i, ext::uxtx, 3);
+        st1(v18, v19, v20, v21, T16B, tmp5);
+    }
+    // v18..v21 can actually contain f[0..7]
+    cbz(i, SKIP_F_LOAD); // i == 0 => f[i] == f[0] => already loaded
+    ld1(v18, v19, v20, v21, T2D, Address(sp)); // load f[0..7]
+  bind(SKIP_F_LOAD);
+    // calculate 2^q0 and 2^-q0, which we'll need later.
+    // q0 is the exponent, so calculate the biased exponent (q0+1023)
+    negw(tmp4, rscratch1);
+    addw(tmp5, rscratch1, 1023);
+    addw(tmp4, tmp4, 1023);
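+    // Worked example: for q0 == 4 the biased exponent is 4 + 1023 == 0x403;
+    // after the "lsl #52" below the register holds 0x4030000000000000, which
+    // is exactly the IEEE-754 bit pattern of 16.0 == 2^4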
+    // Unroll the following loops depending on jx in [0,1,2]
+    // for (i=0;i<=jk;i++) {
+    //   for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
+    // }
+    // Unrolling for jx == 0 case:
+    //   q[0] = x[0] * f[0]
+    //   q[1] = x[0] * f[1]
+    //   q[2] = x[0] * f[2]
+    //   q[3] = x[0] * f[3]
+    //   q[4] = x[0] * f[4]
+    //
+    // Vectorization for unrolled jx == 0 case:
+    //   {q[0], q[1]} = {f[0], f[1]} * x[0]
+    //   {q[2], q[3]} = {f[2], f[3]} * x[0]
+    //   q[4] = f[4] * x[0]
+    //
+    // Unrolling for jx == 1 case:
+    //   q[0] = x[0] * f[1] + x[1] * f[0]
+    //   q[1] = x[0] * f[2] + x[1] * f[1]
+    //   q[2] = x[0] * f[3] + x[1] * f[2]
+    //   q[3] = x[0] * f[4] + x[1] * f[3]
+    //   q[4] = x[0] * f[5] + x[1] * f[4]
+    //
+    // Vectorization for unrolled jx == 1 case:
+    //   {q[0], q[1]} = {f[0], f[1]} * x[1]
+    //   {q[2], q[3]} = {f[2], f[3]} * x[1]
+    //   q[4] = f[4] * x[1]
+    //   {q[0], q[1]} += {f[1], f[2]} * x[0]
+    //   {q[2], q[3]} += {f[3], f[4]} * x[0]
+    //   q[4] += f[5] * x[0]
+    //
+    // Unrolling for jx == 2 case:
+    //   q[0] = x[0] * f[2] + x[1] * f[1] + x[2] * f[0]
+    //   q[1] = x[0] * f[3] + x[1] * f[2] + x[2] * f[1]
+    //   q[2] = x[0] * f[4] + x[1] * f[3] + x[2] * f[2]
+    //   q[3] = x[0] * f[5] + x[1] * f[4] + x[2] * f[3]
+    //   q[4] = x[0] * f[6] + x[1] * f[5] + x[2] * f[4]
+    //
+    // Vectorization for unrolled jx == 2 case:
+    //   {q[0], q[1]} = {f[0], f[1]} * x[2]
+    //   {q[2], q[3]} = {f[2], f[3]} * x[2]
+    //   q[4] = f[4] * x[2]
+    //   {q[0], q[1]} += {f[1], f[2]} * x[1]
+    //   {q[2], q[3]} += {f[3], f[4]} * x[1]
+    //   q[4] += f[5] * x[1]
+    //   {q[0], q[1]} += {f[2], f[3]} * x[0]
+    //   {q[2], q[3]} += {f[4], f[5]} * x[0]
+    //   q[4] += f[6] * x[0]
+  block_comment("unrolled and vectorized computation of q[0]..q[jk]"); {
+      cmpw(jx, 1);
+      lsl(tmp5, tmp5, 52);                     // now it's 2^q0 double value
+      lsl(tmp4, tmp4, 52);                     // now it's 2^-q0 double value
+      br(LT, JX_IS_0);
+      add(i, sp, 8);
+      ldpq(v26, v27, i);                       // load f[1..4]
+      br(GT, JX_IS_2);
+      // jx == 1
+      fmulxvs(v28, T2D, v18, v7);              // f[0,1] * x[1]
+      fmulxvs(v29, T2D, v19, v7);              // f[2,3] * x[1]
+      fmuld(v30, v20, v7);                     // f[4] * x[1]
+      fmlavs(v28, T2D, v26, v6, 0);
+      fmlavs(v29, T2D, v27, v6, 0);
+      fmlavs(v30, T2D, v6, v20, 1);            // v30 += f[5] * x[0]
+      b(Q_DONE);
+    bind(JX_IS_2);
+      fmulxvs(v28, T2D, v18, v3);              // f[0,1] * x[2]
+      fmulxvs(v29, T2D, v19, v3);              // f[2,3] * x[2]
+      fmuld(v30, v20, v3);                     // f[4] * x[2]
+      fmlavs(v28, T2D, v26, v7, 0);
+      fmlavs(v29, T2D, v27, v7, 0);
+      fmlavs(v30, T2D, v7, v20, 1);            // v30 += f[5] * x[1]
+      fmlavs(v28, T2D, v19, v6, 0);
+      fmlavs(v29, T2D, v20, v6, 0);
+      fmlavs(v30, T2D, v6, v21, 0);            // v30 += f[6] * x[0]
+      b(Q_DONE);
+    bind(JX_IS_0);
+      fmulxvs(v28, T2D, v18, v6);              // f[0,1] * x[0]
+      fmulxvs(v29, T2D, v19, v6);              // f[2,3] * x[0]
+      fmuld(v30, v20, v6);                     // f[4] * x[0]
+    bind(Q_DONE);
+      st1(v28, v29, v30, T2D, Address(qBase)); // save calculated q[0]...q[jk]
+  }
+  movz(i, 0x3E70, 48);
+  movw(jz, 4);
+  fmovd(v17, i);                               // v17 = twon24
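+  // 0x3E70000000000000 is the bit pattern of 2^-24 (twon24): its biased
+  // exponent field is 0x3E7 == 999 == 1023 - 24 and its mantissa is zero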
+  fmovd(v30, tmp5);                            // 2^q0
+  fmovd(v21, 0.125d);
+  fmovd(v20, 8.0d);
+  fmovd(v22, tmp4);                            // 2^-q0
+
+  block_comment("recompute loop"); {
+    bind(RECOMPUTE);
+      //  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
+      //    fw    =  (double)((int)(twon24* z));
+      //    iq[i] =  (int)(z-two24A*fw);
+      //    z     =  q[j-1]+fw;
+      //  }
+      block_comment("distill q[] into iq[] reversingly"); {
+          eorw(i, i, i);
+          movw(j, jz);
+          add(tmp2, qBase, jz, LSL, 3);                    // q[jz] address
+          ldrd(v18, post(tmp2, -8));                       // z = q[j] and moving address to q[j-1]
+        bind(RECOMP_FIRST_FOR);
+          ldrd(v27, post(tmp2, -8));
+          fmuld(v29, v17, v18);                            // twon24*z
+          frintzd(v29, v29);                               // (double)(int)
+          fmsubd(v28, v10, v29, v18);                      // v28 = z-two24A*fw
+          fcvtzdw(tmp1, v28);                              // (int)(z-two24A*fw)
+          strw(tmp1, Address(iqBase, i, Address::lsl(2)));
+          faddd(v18, v27, v29);
+          add(i, i, 1);
+          subs(j, j, 1);
+          br(GT, RECOMP_FIRST_FOR);
+      }
+      // compute n
+      fmuld(v18, v18, v30);
+      fmuld(v2, v18, v21);
+      frintmd(v2, v2);                                     // v2 = floor(v2) == rounding towards -inf
+      fmsubd(v18, v2, v20, v18);                           // z -= 8.0*floor(z*0.125);
+      movw(ih, 2);
+      frintzd(v2, v18);                                    // v2 = (double)((int)z)
+      fcvtzdw(n, v18);                                     // n  = (int) z;
+      fsubd(v18, v18, v2);                                 // z -= (double)n;
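+      // Worked example: z == 21.7 gives floor(21.7*0.125) == 2.0, so
+      // z -= 8.0*2.0 leaves z == 5.7; then n == 5 and z -= 5.0 leaves 0.7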
+
+      block_comment("q0-dependent initialization"); {
+          cmpw(rscratch1, 0);                              // if (q0 > 0)
+          br(LT, Q0_ZERO_CMP_LT);
+          subw(j, jz, 1);                                  // j = jz - 1
+          ldrw(tmp2, Address(iqBase, j, Address::lsl(2))); // tmp2 = iq[jz-1]
+          br(EQ, Q0_ZERO_CMP_EQ);
+          movw(tmp4, 24);
+          subw(tmp4, tmp4, rscratch1);                     // == 24 - q0
+          lsrvw(i, tmp2, tmp4);                            // i = iq[jz-1] >> (24-q0)
+          lslvw(tmp5, i, tmp4);
+          subw(tmp2, tmp2, tmp5);                          // iq[jz-1] -= i<<(24-q0);
+          strw(tmp2, Address(iqBase, j, Address::lsl(2))); // store iq[jz-1]
+          subw(rscratch2, tmp4, 1);                        // == 23 - q0
+          addw(n, n, i);                                   // n+=i
+          lsrvw(ih, tmp2, rscratch2);                      // ih = iq[jz-1] >> (23-q0)
+          b(Q0_ZERO_CMP_DONE);
+        bind(Q0_ZERO_CMP_EQ);
+          lsr(ih, tmp2, 23);                               // ih = iq[jz-1] >> 23
+          b(Q0_ZERO_CMP_DONE);
+        bind(Q0_ZERO_CMP_LT);
+          fmovd(v4, 0.5d);
+          fcmpd(v18, v4);
+          cselw(ih, zr, ih, LT);                           // if (z<0.5) ih = 0
+      }
+    bind(Q0_ZERO_CMP_DONE);
+      cmpw(ih, zr);
+      br(LE, IH_HANDLED);
+
+    block_comment("if(ih>) {"); {
+      // use rscratch2 as carry
+
+      block_comment("for(i=0;i<jz ;i++) {...}"); {
+          addw(n, n, 1);
+          eorw(i, i, i);
+          eorw(rscratch2, rscratch2, rscratch2);
+        bind(IH_FOR);
+          ldrw(j, Address(iqBase, i, Address::lsl(2)));    // j = iq[i]
+          movw(tmp3, 0x1000000);
+          subw(tmp3, tmp3, rscratch2);
+          cbnzw(rscratch2, IH_FOR_STORE);
+          cbzw(j, IH_FOR_INCREMENT);
+          movw(rscratch2, 1);
+        bind(IH_FOR_STORE);
+          subw(tmp3, tmp3, j);
+          strw(tmp3, Address(iqBase, i, Address::lsl(2))); // iq[i] = 0xffffff - j
+        bind(IH_FOR_INCREMENT);
+          addw(i, i, 1);
+          cmpw(i, jz);
+          br(LT, IH_FOR);
+      }
+
+      block_comment("if(q0>0) {"); {
+        cmpw(rscratch1, zr);
+        br(LE, IH_AFTER_SWITCH);
+        // tmp3 still has iq[jz-1] value. no need to reload
+        // now, zero high tmp3 bits (rscratch1 number of bits)
+        movw(j, -1);
+        subw(i, jz, 1);                                    // set i to jz-1
+        lsrv(j, j, rscratch1);
+        andw(tmp3, tmp3, j, LSR, 8);                       // we have 24-bit-based constants
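+          // e.g. for q0 == 2: j == 0xFFFFFFFF >> 2 == 0x3FFFFFFF and
+          // j >> 8 == 0x003FFFFF, i.e. the fdlibm "iq[jz-1] &= 0x3fffff" mask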
+        strw(tmp3, Address(iqBase, i, Address::lsl(2)));   // save iq[jz-1]
+      }
+      bind(IH_AFTER_SWITCH);
+        cmpw(ih, 2);
+        br(NE, IH_HANDLED);
+
+        block_comment("if(ih==2) {"); {
+          fmovd(v25, 1.0d);
+          fsubd(v18, v25, v18);                            // z = one - z;
+          cbzw(rscratch2, IH_HANDLED);
+          fsubd(v18, v18, v30);                            // z -= scalbnA(one,q0);
+        }
+    }
+    bind(IH_HANDLED);
+      // check if recomputation is needed
+      fcmpd(v18, 0.0d);
+      br(NE, RECOMP_CHECK_DONE_NOT_ZERO);
+
+      block_comment("if(z==zeroB) {"); {
+
+        block_comment("for (i=jz-1;i>=jk;i--) j |= iq[i];"); {
+            subw(i, jz, 1);
+            eorw(j, j, j);
+            b(RECOMP_FOR1_CHECK);
+          bind(RECOMP_FOR1);
+            ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
+            orrw(j, j, tmp1);
+            subw(i, i, 1);
+          bind(RECOMP_FOR1_CHECK);
+            cmpw(i, 4);
+            br(GE, RECOMP_FOR1);
+        }
+        cbnzw(j, RECOMP_CHECK_DONE);
+
+        block_comment("if(j==0) {"); {
+            // for(k=1;iq[jk-k]==0;k++); // let's unroll it. jk == 4. So, read
+            // iq[3], iq[2], iq[1], iq[0] until non-zero value
+            ldp(tmp1, tmp3, iqBase);               // iq[0..3]
+            movw(j, 2);
+            cmp(tmp3, zr);
+            csel(tmp1, tmp1, tmp3, EQ);            // set register for further consideration
+            cselw(j, j, zr, EQ);                   // set initial k. Use j as k
+            cmp(zr, tmp1, LSR, 32);
+            addw(i, jz, 1);
+            csincw(j, j, j, NE);
+
+          block_comment("for(i=jz+1;i<=jz+k;i++) {...}"); {
+              addw(jz, i, j); // i = jz+1, j = k-1. j+i = jz+k (which is a new jz)
+            bind(RECOMP_FOR2);
+              addw(tmp1, jv, i);
+              ldrd(v29, Address(twoOverPiBase, tmp1, Address::lsl(3)));
+              addw(tmp2, jx, i);
+              strd(v29, Address(sp, tmp2, Address::lsl(3)));
+              // f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
+              // since jx = 0, 1 or 2 we can unroll it:
+              // for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
+              // f[jx+i-j] == (for first iteration) f[jx+i], which is already v29
+              add(tmp2, sp, tmp2, ext::uxtx, 3); // address of f[jx+i]
+              ldpd(v4, v5, Address(tmp2, -16)); // load f[jx+i-2] and f[jx+i-1]
+              fmuld(v26, v6, v29); // initial fw
+              cbzw(jx, RECOMP_FW_UPDATED);
+              fmaddd(v26, v7, v5, v26);
+              cmpw(jx, 1);
+              br(EQ, RECOMP_FW_UPDATED);
+              fmaddd(v26, v3, v4, v26);
+            bind(RECOMP_FW_UPDATED);
+              strd(v26, Address(qBase, i, Address::lsl(3))); // q[i] = fw;
+              addw(i, i, 1);
+              cmpw(i, jz);                                   // jz here is "old jz" + k
+              br(LE, RECOMP_FOR2);
+          }
+            b(RECOMPUTE);
+        }
+      }
+    }
+    bind(RECOMP_CHECK_DONE);
+      // chop off zero terms
+      fcmpd(v18, 0.0d);
+      br(EQ, Z_IS_ZERO);
+
+      block_comment("else block of if(z==0.0) {"); {
+        bind(RECOMP_CHECK_DONE_NOT_ZERO);
+          fmuld(v18, v18, v22);
+          fcmpd(v18, v10);                                   // v10 is still two24A
+          br(LT, Z_IS_LESS_THAN_TWO24B);
+          fmuld(v1, v18, v17);                               // twon24*z
+          frintzd(v1, v1);                                   // v1 = (double)(int)(v1)
+          fmaddd(v2, v10, v1, v18);
+          fcvtzdw(tmp3, v1);                                 // (int)fw
+          fcvtzdw(tmp2, v2);                                 // double to int
+          strw(tmp2, Address(iqBase, jz, Address::lsl(2)));
+          addw(rscratch1, rscratch1, 24);
+          addw(jz, jz, 1);
+          strw(tmp3, Address(iqBase, jz, Address::lsl(2)));  // iq[jz] = (int) fw
+          b(Z_ZERO_CHECK_DONE);
+        bind(Z_IS_LESS_THAN_TWO24B);
+          fcvtzdw(tmp3, v18);                                // (int)z
+          strw(tmp3, Address(iqBase, jz, Address::lsl(2)));  // iq[jz] = (int) z
+          b(Z_ZERO_CHECK_DONE);
+      }
+
+      block_comment("if(z==0.0) {"); {
+        bind(Z_IS_ZERO);
+          subw(jz, jz, 1);
+          ldrw(tmp1, Address(iqBase, jz, Address::lsl(2)));
+          subw(rscratch1, rscratch1, 24);
+          cbz(tmp1, Z_IS_ZERO);
+      }
+      bind(Z_ZERO_CHECK_DONE);
+        // convert integer "bit" chunk to floating-point value
+        // v17 = twon24
+        // update v30, which was scalbnA(1.0, <old q0>);
+        addw(tmp2, rscratch1, 1023); // biased exponent
+        lsl(tmp2, tmp2, 52); // put at correct position
+        mov(i, jz);
+        fmovd(v30, tmp2);
+
+        block_comment("for(i=jz;i>=0;i--) {q[i] = fw*(double)iq[i]; fw*=twon24;}"); {
+          bind(CONVERTION_FOR);
+            ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
+            scvtfwd(v31, tmp1);
+            fmuld(v31, v31, v30);
+            strd(v31, Address(qBase, i, Address::lsl(3)));
+            fmuld(v30, v30, v17);
+            subsw(i, i, 1);
+            br(GE, CONVERTION_FOR);
+        }
+        add(rscratch2, sp, 160); // base for fq
+        // reusing twoOverPiBase
+        lea(twoOverPiBase, ExternalAddress(pio2));
+
+      block_comment("compute PIo2[0,...,jp]*q[jz,...,0]. for(i=jz;i>=0;i--) {...}"); {
+          movw(i, jz);
+          movw(tmp2, zr); // tmp2 will keep jz - i == 0 at start
+        bind(COMP_FOR);
+          // for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
+          fmovd(v30, 0.0d);
+          add(tmp5, qBase, i, LSL, 3); // address of q[i+k] for k==0
+          movw(tmp3, 4);
+          movw(tmp4, zr);              // used as k
+          cmpw(tmp2, 4);
+          add(tmp1, qBase, i, LSL, 3); // used as q[i] address
+          cselw(tmp3, tmp2, tmp3, LE); // min(jz - i, jp)
+
+          block_comment("for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];"); {
+            bind(COMP_INNER_LOOP);
+              ldrd(v18, Address(tmp1, tmp4, Address::lsl(3)));          // q[i+k]
+              ldrd(v19, Address(twoOverPiBase, tmp4, Address::lsl(3))); // PIo2[k]
+              fmaddd(v30, v18, v19, v30);                               // fw += PIo2[k]*q[i+k];
+              addw(tmp4, tmp4, 1);                                      // k++
+              cmpw(tmp4, tmp3);
+              br(LE, COMP_INNER_LOOP);
+          }
+          strd(v30, Address(rscratch2, tmp2, Address::lsl(3)));         // fq[jz-i]
+          add(tmp2, tmp2, 1);
+          subsw(i, i, 1);
+          br(GE, COMP_FOR);
+      }
+
+      block_comment("switch(prec) {...}. case 2:"); {
+        // compress fq into y[]
+        // remember prec == 2
+
+        block_comment("for (i=jz;i>=0;i--) fw += fq[i];"); {
+            fmovd(v4, 0.0d);
+            mov(i, jz);
+          bind(FW_FOR1);
+            ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
+            subsw(i, i, 1);
+            faddd(v4, v4, v1);
+            br(GE, FW_FOR1);
+        }
+        bind(FW_FOR1_DONE);
+          // v1 contains fq[0]. so, keep it so far
+          fsubd(v5, v1, v4); // fw = fq[0] - fw
+          cbzw(ih, FW_Y0_NO_NEGATION);
+          fnegd(v4, v4);
+        bind(FW_Y0_NO_NEGATION);
+
+        block_comment("for (i=1;i<=jz;i++) fw += fq[i];"); {
+            movw(i, 1);
+            cmpw(jz, 1);
+            br(LT, FW_FOR2_DONE);
+          bind(FW_FOR2);
+            ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
+            addw(i, i, 1);
+            cmp(i, jz);
+            faddd(v5, v5, v1);
+            br(LE, FW_FOR2);
+        }
+        bind(FW_FOR2_DONE);
+          cbz(ih, FW_Y1_NO_NEGATION);
+          fnegd(v5, v5);
+        bind(FW_Y1_NO_NEGATION);
+          add(sp, sp, 560);
+      }
+}
+
+///* __kernel_sin( x, y, iy)
+// * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
+// * Input x is assumed to be bounded by ~pi/4 in magnitude.
+// * Input y is the tail of x.
+// * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
+// *
+// * Algorithm
+// *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
+// *      2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
+// *      3. sin(x) is approximated by a polynomial of degree 13 on
+// *         [0,pi/4]
+// *                               3            13
+// *              sin(x) ~ x + S1*x + ... + S6*x
+// *         where
+// *
+// *      |sin(x)         2     4     6     8     10     12  |     -58
+// *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
+// *      |  x                                               |
+// *
+// *      4. sin(x+y) = sin(x) + sin'(x')*y
+// *                  ~ sin(x) + (1-x*x/2)*y
+// *         For better accuracy, let
+// *                   3      2      2      2      2
+// *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
+// *         then                   3    2
+// *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
+// */
+//static const double
+//S1  = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
+//S2  =  8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
+//S3  = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
+//S4  =  2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
+//S5  = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
+//S6  =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
+//
+// NOTE: S1..S6 were moved into a table: StubRoutines::aarch64::_dsin_coef
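+// (judging by the loads in generate_kernel_sin below, the table is laid out
+// as six consecutive doubles {S1,...,S6}: S1..S4 are fetched from offset 0
+// and S5,S6 via ldpd from offset 32)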
+//
+// BEGIN __kernel_sin PSEUDO CODE
+//
+//static double __kernel_sin(double x, double y, bool iy)
+//{
+//        double z,r,v;
+//
+//        // NOTE: not needed. moved to dsin/dcos
+//        //int ix;
+//        //ix = high(x)&0x7fffffff;                /* high word of x */
+//
+//        // NOTE: moved to dsin/dcos
+//        //if(ix<0x3e400000)                       /* |x| < 2**-27 */
+//        //   {if((int)x==0) return x;}            /* generate inexact */
+//
+//        z       =  x*x;
+//        v       =  z*x;
+//        r       =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
+//        if(iy==0) return x+v*(S1+z*r);
+//        else      return x-((z*(half*y-v*r)-y)-v*S1);
+//}
+//
+// END __kernel_sin PSEUDO CODE
+//
+// Changes between fdlibm and intrinsic:
+//     1. Removed |x| < 2**-27 check, because it was done earlier in dsin/dcos
+//     2. Constants are now loaded from table dsin_coef
+//     3. C code parameter "int iy" was modified to "bool iyIsOne", because
+//         iy is always 0 or 1. Also, the iyIsOne branch is resolved during
+//         code generation instead of being taken during code execution
+// Input and output:
+//     1. Input for generated function: X argument = x
+//     2. Input for generator: x = register to read argument from, iyIsOne
+//         = flag controlling whether the argument's low part is used, dsin_coef = coefficients
+//         table address
+//     3. Return sin(x) value in v0
+void MacroAssembler::generate_kernel_sin(FloatRegister x, bool iyIsOne,
+    address dsin_coef) {
+  FloatRegister y = v5, z = v6, v = v7, r = v16, S1 = v17, S2 = v18,
+      S3 = v19, S4 = v20, S5 = v21, S6 = v22, half = v23;
+  lea(rscratch2, ExternalAddress(dsin_coef));
+  ldpd(S5, S6, Address(rscratch2, 32));
+  fmuld(z, x, x); // z =  x*x;
+  ld1(S1, S2, S3, S4, T1D, Address(rscratch2));
+  fmuld(v, z, x); // v =  z*x;
+
+  block_comment("calculate r =  S2+z*(S3+z*(S4+z*(S5+z*S6)))"); {
+    fmaddd(r, z, S6, S5);
+    // initialize "half" in current block to utilize 2nd FPU. However, it's
+    // not a part of this block
+    fmovd(half, 0.5);
+    fmaddd(r, z, r, S4);
+    fmaddd(r, z, r, S3);
+    fmaddd(r, z, r, S2);
+  }
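+  // The chain above is Horner's scheme via fused multiply-adds: each
+  // fmaddd(r, z, r, Sk) computes r = Sk + z*r, so after the four steps
+  // r == S2 + z*(S3 + z*(S4 + z*(S5 + z*S6)))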
+
+  if (!iyIsOne) {
+    // return x+v*(S1+z*r);
+    fmaddd(S1, z, r, S1);
+    fmaddd(v0, v, S1, x);
+  } else {
+    // return x-((z*(half*y-v*r)-y)-v*S1);
+    fmuld(S6, half, y);    // half*y
+    fmsubd(S6, v, r, S6);  // half*y-v*r
+    fmsubd(S6, z, S6, y);  // y - z*(half*y-v*r) = - (z*(half*y-v*r)-y)
+    fmaddd(S6, v, S1, S6); // - (z*(half*y-v*r)-y) + v*S1 == -((z*(half*y-v*r)-y)-v*S1)
+    faddd(v0, x, S6);
+  }
+}
+
+///*
+// * __kernel_cos( x,  y )
+// * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
+// * Input x is assumed to be bounded by ~pi/4 in magnitude.
+// * Input y is the tail of x.
+// *
+// * Algorithm
+// *      1. Since cos(-x) = cos(x), we need only to consider positive x.
+// *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
+// *      3. cos(x) is approximated by a polynomial of degree 14 on
+// *         [0,pi/4]
+// *                                       4            14
+// *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
+// *         where the remez error is
+// *
+// *      |              2     4     6     8     10    12     14 |     -58
+// *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
+// *      |                                                      |
+// *
+// *                     4     6     8     10    12     14
+// *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
+// *             cos(x) = 1 - x*x/2 + r
+// *         since cos(x+y) ~ cos(x) - sin(x)*y
+// *                        ~ cos(x) - x*y,
+// *         a correction term is necessary in cos(x) and hence
+// *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
+// *         For better accuracy when x > 0.3, let qx = |x|/4 with
+// *         the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
+// *         Then
+// *              cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
+// *         Note that 1-qx and (x*x/2-qx) is EXACT here, and the
+// *         magnitude of the latter is at least a quarter of x*x/2,
+// *         thus, reducing the rounding error in the subtraction.
+// */
+//
+//static const double
+//C1  =  4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
+//C2  = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
+//C3  =  2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
+//C4  = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
+//C5  =  2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
+//C6  = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
+//
+// NOTE: C1..C6 were moved into a table: StubRoutines::aarch64::_dcos_coef
+//
+// BEGIN __kernel_cos PSEUDO CODE
+//
+//static double __kernel_cos(double x, double y)
+//{
+//  double a,h,z,r,qx=0;
+//
+//  // NOTE: ix is already initialized in dsin/dcos. Reuse value from register
+//  //int ix;
+//  //ix = high(x)&0x7fffffff;              /* ix = |x|'s high word*/
+//
+//  // NOTE: moved to dsin/dcos
+//  //if(ix<0x3e400000) {                   /* if x < 2**27 */
+//  //  if(((int)x)==0) return one;         /* generate inexact */
+//  //}
+//
+//  z  = x*x;
+//  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+//  if(ix < 0x3FD33333)                   /* if |x| < 0.3 */
+//    return one - (0.5*z - (z*r - x*y));
+//  else {
+//    if(ix > 0x3fe90000) {               /* x > 0.78125 */
+//      qx = 0.28125;
+//    } else {
+//      set_high(&qx, ix-0x00200000); /* x/4 */
+//      set_low(&qx, 0);
+//    }
+//    h = 0.5*z-qx;
+//    a = one-qx;
+//    return a - (h - (z*r-x*y));
+//  }
+//}
+//
+// END __kernel_cos PSEUDO CODE
+//
+// Changes between fdlibm and intrinsic:
+//     1. Removed |x| < 2**-27 check, because it was done earlier in dsin/dcos
+//     2. Constants are now loaded from table dcos_coef
+// Input and output:
+//     1. Input for generated function: X argument = x
+//     2. Input for generator: x = register to read argument from, dcos_coef
+//        = coefficients table address
+//     3. Return cos(x) value in v0
+void MacroAssembler::generate_kernel_cos(FloatRegister x, address dcos_coef) {
+  Register ix = r3;
+  FloatRegister qx = v1, h = v2, a = v3, y = v5, z = v6, r = v7, C1 = v18,
+      C2 = v19, C3 = v20, C4 = v21, C5 = v22, C6 = v23, one = v25, half = v26;
+  Label IX_IS_LARGE, SET_QX_CONST, DONE, QX_SET;
+    lea(rscratch2, ExternalAddress(dcos_coef));
+    ldpd(C5, C6, Address(rscratch2, 32));         // load C5, C6
+    fmuld(z, x, x);                               // z=x^2
+    ld1(C1, C2, C3, C4, T1D, Address(rscratch2)); // load C1..C4
+    block_comment("calculate r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))))"); {
+      fmaddd(r, z, C6, C5);
+      fmovd(half, 0.5d);
+      fmaddd(r, z, r, C4);
+      fmuld(y, x, y);
+      fmaddd(r, z, r, C3);
+      mov(rscratch1, 0x3FD33333);
+      fmaddd(r, z, r, C2);
+      fmuld(x, z, z);                             // x = z^2
+      fmaddd(r, z, r, C1);                        // r = C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6))))
+    }
+    // need to multiply r by z to have "final" r value
+    fmovd(one, 1.0d);
+    cmp(ix, rscratch1);
+    br(GT, IX_IS_LARGE);
+    block_comment("if(ix < 0x3FD33333) return one - (0.5*z - (z*r - x*y))"); {
+      // return 1.0 - (0.5*z - (z*r - x*y)) = 1.0 - (0.5*z + (x*y - z*r))
+      fmsubd(v0, x, r, y);
+      fmaddd(v0, half, z, v0);
+      fsubd(v0, one, v0);
+      b(DONE);
+    }
+  block_comment("if(ix >= 0x3FD33333)"); {
+    bind(IX_IS_LARGE);
+      movz(rscratch2, 0x3FE9, 16);
+      cmp(ix, rscratch2);
+      br(GT, SET_QX_CONST);
+      block_comment("set_high(&qx, ix-0x00200000); set_low(&qx, 0);"); {
+        subw(rscratch2, ix, 0x00200000);
+        lsl(rscratch2, rscratch2, 32);
+        fmovd(qx, rscratch2);
+      }
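+      // subtracting 0x00200000 from the high word decrements the biased
+      // exponent field by 2, i.e. divides |x| by 4: e.g. x == 0.6 has high
+      // word 0x3FE33333; 0x3FE33333 - 0x00200000 == 0x3FC33333, the high
+      // word of 0.15 == x/4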
+      b(QX_SET);
+    bind(SET_QX_CONST);
+      block_comment("if(ix > 0x3fe90000) qx = 0.28125;"); {
+        fmovd(qx, 0.28125d);
+      }
+    bind(QX_SET);
+      fnmsub(C6, x, r, y);    // z*r - xy
+      fnmsub(h, half, z, qx); // h = 0.5*z - qx
+      fsubd(a, one, qx);      // a = 1-qx
+      fsubd(C6, h, C6);       // = h - (z*r - x*y)
+      fsubd(v0, a, C6);
+  }
+  bind(DONE);
+}
+
+// generate_dsin_dcos creates stub for dsin and dcos
+// Generation is done via a single call because the dsin and dcos code is
+// almost the same (see C code below). These functions work as follows:
+// 1) handle corner cases: |x| ~< pi/4, x is NaN or INF, |x| < 2**-27
+// 2) perform argument reduction if required
+// 3) call kernel_sin or kernel_cos which approximate sin/cos via polynomial
+//
+// BEGIN dsin/dcos PSEUDO CODE
+//
+//dsin_dcos(jdouble x, bool isCos) {
+//  double y[2],z=0.0;
+//  int n, ix;
+//
+//  /* High word of x. */
+//  ix = high(x);
+//
+//  /* |x| ~< pi/4 */
+//  ix &= 0x7fffffff;
+//  if(ix <= 0x3fe921fb) return isCos ? __kernel_cos : __kernel_sin(x,z,0);
+//
+//  /* sin/cos(Inf or NaN) is NaN */
+//  else if (ix>=0x7ff00000) return x-x;
+//  else if (ix<0x3e400000) {                   /* if ix < 2**27 */
+//    if(((int)x)==0) return isCos ? one : x;         /* generate inexact */
+//  }
+//  /* argument reduction needed */
+//  else {
+//    n = __ieee754_rem_pio2(x,y);
+//    switch(n&3) {
+//    case 0: return isCos ?  __kernel_cos(y[0],y[1])      :  __kernel_sin(y[0],y[1], true);
+//    case 1: return isCos ? -__kernel_sin(y[0],y[1],true) :  __kernel_cos(y[0],y[1]);
+//    case 2: return isCos ? -__kernel_cos(y[0],y[1])      : -__kernel_sin(y[0],y[1], true);
+//    default:
+//      return isCos ? __kernel_sin(y[0],y[1],true) : -__kernel_cos(y[0],y[1]);
+//    }
+//  }
+//}
+// END dsin/dcos PSEUDO CODE
+//
+// Changes between fdlibm and intrinsic:
+//     1. Moved ix < 2**27 from kernel_sin/kernel_cos into dsin/dcos
+//     2. Final switch use equivalent bit checks(tbz/tbnz)
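+//        (the n&3 switch maps to bit tests as follows: for sin, bit 0 of n
+//        selects the kernel (clear -> __kernel_sin) and bit 1 of n selects
+//        negation; for cos, bit 0 of n selects the kernel (set -> __kernel_sin)
+//        and bit 0 of n^(n>>1) selects negation, reproducing
+//        n==0 -> +cos, n==1 -> -sin, n==2 -> -cos, n==3 -> +sin)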
+// Input and output:
+//     1. Input for generated function: X = r0
+//     2. Input for generator: isCos = generate sin or cos, npio2_hw = address
+//         of npio2_hw table, two_over_pi = address of two_over_pi table,
+//         pio2 = address of pio2 table, dsin_coef = address of dsin_coef table,
+//         dcos_coef = address of dcos_coef table
+//     3. Return result in v0
+// NOTE: general purpose register names match local variable names in C code
+void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
+    address two_over_pi, address pio2, address dsin_coef, address dcos_coef) {
+  const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0;
+
+  Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
+  Register X = r0, absX = r1, n = r2, ix = r3;
+  FloatRegister y0 = v4, y1 = v5;
+    block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
+      fmovd(X, v0);
+      mov(rscratch2, 0x3e400000);
+      mov(rscratch1, 0x3fe921fb00000000);            // pi/4. shifted to reuse later
+      ubfm(absX, X, 0, 62);                          // absX
+      movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48);
+      cmp(rscratch2, absX, LSR, 32);
+      lsr(ix, absX, 32);                             // set ix
+      br(GT, TINY_X);                                // handle tiny x (|x| < 2^-27)
+      cmp(ix, rscratch1, LSR, 32);
+      br(LE, EARLY_CASE);                            // if(ix <= 0x3fe921fb) return
+      cmp(absX, r10);
+      br(LT, ARG_REDUCTION);
+      // X is NaN or INF (i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0).
+      // Set last bit unconditionally to make it NaN
+      orr(r10, r10, 1);
+      fmovd(v0, r10);
+      ret(lr);
+    }
+  block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
+    bind(TINY_X);
+      if (isCos) {
+        fmovd(v0, 1.0d);
+      }
+      ret(lr);
+  }
+  bind(ARG_REDUCTION); /* argument reduction needed */
+    block_comment("n = __ieee754_rem_pio2(x,y);"); {
+      generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2);
+    }
+    block_comment("switch(n&3) {case ... }"); {
+      if (isCos) {
+        eorw(absX, n, n, LSR, 1);
+        tbnz(n, 0, RETURN_SIN);
+      } else {
+        tbz(n, 0, RETURN_SIN);
+      }
+      generate_kernel_cos(y0, dcos_coef);
+      if (isCos) {
+        tbz(absX, 0, DONE);
+      } else {
+        tbz(n, 1, DONE);
+      }
+      fnegd(v0, v0);
+      ret(lr);
+    bind(RETURN_SIN);
+      generate_kernel_sin(y0, true, dsin_coef);
+      if (isCos) {
+        tbz(absX, 0, DONE);
+      } else {
+        tbz(n, 1, DONE);
+      }
+      fnegd(v0, v0);
+      ret(lr);
+    }
+  bind(EARLY_CASE);
+    eor(y1, T8B, y1, y1);
+    if (isCos) {
+      generate_kernel_cos(v0, dcos_coef);
+    } else {
+      generate_kernel_sin(v0, false, dsin_coef);
+    }
+  bind(DONE);
+    ret(lr);
+}
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -1351,9 +1351,9 @@
       BLOCK_COMMENT("Entry:");
     }
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -1425,9 +1425,9 @@
     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
     __ br(Assembler::HS, nooverlap_target);
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -1789,10 +1789,10 @@
     }
 #endif //ASSERT
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_CHECKCAST;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
     bool is_oop = true;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
 
     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
@@ -3990,6 +3990,701 @@
     return entry;
   }
 
+  address generate_dsin_dcos(bool isCos) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
+    address start = __ pc();
+    __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
+        (address)StubRoutines::aarch64::_two_over_pi,
+        (address)StubRoutines::aarch64::_pio2,
+        (address)StubRoutines::aarch64::_dsin_coef,
+        (address)StubRoutines::aarch64::_dcos_coef);
+    return start;
+  }
+
+  address generate_dlog() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "dlog");
+    address entry = __ pc();
+    FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
+        vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
+    Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
+    __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
+        tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
+    return entry;
+  }
+
+  // code for comparing 16 bytes of strings with same encoding
+  void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
+    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
+    __ ldr(rscratch1, Address(__ post(str1, 8)));
+    __ eor(rscratch2, tmp1, tmp2);
+    __ ldr(cnt1, Address(__ post(str2, 8)));
+    __ cbnz(rscratch2, DIFF1);
+    __ ldr(tmp1, Address(__ post(str1, 8)));
+    __ eor(rscratch2, rscratch1, cnt1);
+    __ ldr(tmp2, Address(__ post(str2, 8)));
+    __ cbnz(rscratch2, DIFF2);
+  }
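+  // (each 8-byte pair is compared while the next pair is being loaded, i.e.
+  // the eor/cbnz of the previously loaded registers overlaps the ldr latency;
+  // cnt1 is re-used here as a plain scratch register)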
+
+  // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
+  void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
+      Label &DIFF2) {
+    Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
+    FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
+
+    __ ldrq(vtmp, Address(__ post(tmp2, 16)));
+    __ ldr(tmpU, Address(__ post(cnt1, 8)));
+    __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
+    // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
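+    // (zip1 interleaves the low 8 Latin1 bytes of vtmp with zero bytes from
+    // vtmpZ, widening them to little-endian UTF-16 chars; the matching zip2
+    // below widens the high 8 bytes)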
+
+    __ fmovd(tmpL, vtmp3);
+    __ eor(rscratch2, tmp3, tmpL);
+    __ cbnz(rscratch2, DIFF2);
+
+    __ ldr(tmp3, Address(__ post(cnt1, 8)));
+    __ umov(tmpL, vtmp3, __ D, 1);
+    __ eor(rscratch2, tmpU, tmpL);
+    __ cbnz(rscratch2, DIFF1);
+
+    __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
+    __ ldr(tmpU, Address(__ post(cnt1, 8)));
+    __ fmovd(tmpL, vtmp);
+    __ eor(rscratch2, tmp3, tmpL);
+    __ cbnz(rscratch2, DIFF2);
+
+    __ ldr(tmp3, Address(__ post(cnt1, 8)));
+    __ umov(tmpL, vtmp, __ D, 1);
+    __ eor(rscratch2, tmpU, tmpL);
+    __ cbnz(rscratch2, DIFF1);
+  }
+
+  // r0  = result
+  // r1  = str1
+  // r2  = cnt1
+  // r3  = str2
+  // r4  = cnt2
+  // r10 = tmp1
+  // r11 = tmp2
+  address generate_compare_long_string_different_encoding(bool isLU) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", isLU
+        ? "compare_long_string_different_encoding LU"
+        : "compare_long_string_different_encoding UL");
+    address entry = __ pc();
+    Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
+        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
+        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
+    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
+        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
+    FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
+    RegSet spilled_regs = RegSet::of(tmp3, tmp4);
+
+    int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
+
+    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
+    // cnt2 == amount of characters left to compare
+    // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
+    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
+    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
+    __ add(str2, str2, isLU ? wordSize : wordSize/2);
+    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
+    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
+    __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
+    __ eor(rscratch2, tmp1, tmp2);
+    __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
+    __ mov(rscratch1, tmp2);
+    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
+    Register strU = isLU ? str2 : str1,
+             strL = isLU ? str1 : str2,
+             tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
+             tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
+    __ push(spilled_regs, sp);
+    __ sub(tmp2, strL, cnt2); // strL pointer to load from
+    __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
+
+    __ ldr(tmp3, Address(__ post(cnt1, 8)));
+
+    if (SoftwarePrefetchHintDistance >= 0) {
+      __ cmp(cnt2, prefetchLoopExitCondition);
+      __ br(__ LT, SMALL_LOOP);
+      __ bind(LARGE_LOOP_PREFETCH);
+        __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
+        __ mov(tmp4, 2);
+        __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
+        __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
+          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
+          __ subs(tmp4, tmp4, 1);
+          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
+          __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
+          __ mov(tmp4, 2);
+        __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
+          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
+          __ subs(tmp4, tmp4, 1);
+          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
+          __ sub(cnt2, cnt2, 64);
+          __ cmp(cnt2, prefetchLoopExitCondition);
+          __ br(__ GE, LARGE_LOOP_PREFETCH);
+    }
+    __ cbz(cnt2, LOAD_LAST); // no characters left except last load
+    __ subs(cnt2, cnt2, 16);
+    __ br(__ LT, TAIL);
+    __ b(SMALL_LOOP_ENTER);
+    __ bind(SMALL_LOOP); // smaller loop
+      __ subs(cnt2, cnt2, 16);
+    __ bind(SMALL_LOOP_ENTER);
+      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
+      __ br(__ GE, SMALL_LOOP);
+      __ cbz(cnt2, LOAD_LAST);
+    __ bind(TAIL); // 1..15 characters left
+      __ cmp(cnt2, -8);
+      __ br(__ GT, TAIL_LOAD_16);
+      __ ldrd(vtmp, Address(tmp2));
+      __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
+
+      __ ldr(tmpU, Address(__ post(cnt1, 8)));
+      __ fmovd(tmpL, vtmp3);
+      __ eor(rscratch2, tmp3, tmpL);
+      __ cbnz(rscratch2, DIFF2);
+      __ umov(tmpL, vtmp3, __ D, 1);
+      __ eor(rscratch2, tmpU, tmpL);
+      __ cbnz(rscratch2, DIFF1);
+      __ b(LOAD_LAST);
+    __ bind(TAIL_LOAD_16);
+      __ ldrq(vtmp, Address(tmp2));
+      __ ldr(tmpU, Address(__ post(cnt1, 8)));
+      __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
+      __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
+      __ fmovd(tmpL, vtmp3);
+      __ eor(rscratch2, tmp3, tmpL);
+      __ cbnz(rscratch2, DIFF2);
+
+      __ ldr(tmp3, Address(__ post(cnt1, 8)));
+      __ umov(tmpL, vtmp3, __ D, 1);
+      __ eor(rscratch2, tmpU, tmpL);
+      __ cbnz(rscratch2, DIFF1);
+
+      __ ldr(tmpU, Address(__ post(cnt1, 8)));
+      __ fmovd(tmpL, vtmp);
+      __ eor(rscratch2, tmp3, tmpL);
+      __ cbnz(rscratch2, DIFF2);
+
+      __ umov(tmpL, vtmp, __ D, 1);
+      __ eor(rscratch2, tmpU, tmpL);
+      __ cbnz(rscratch2, DIFF1);
+      __ b(LOAD_LAST);
+    __ bind(DIFF2);
+      __ mov(tmpU, tmp3);
+    __ bind(DIFF1);
+      __ pop(spilled_regs, sp);
+      __ b(CALCULATE_DIFFERENCE);
+    __ bind(LOAD_LAST);
+      __ pop(spilled_regs, sp);
+
+      __ ldrs(vtmp, Address(strL));
+      __ ldr(tmpU, Address(strU));
+      __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
+      __ fmovd(tmpL, vtmp);
+
+      __ eor(rscratch2, tmpU, tmpL);
+      __ cbz(rscratch2, DONE);
+
+    // Find the first different characters in the longwords and
+    // compute their difference.
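+    // (rscratch2 holds the xor of the two loaded words; byte-reversing it and
+    // counting leading zeros gives the bit offset of the first, i.e. lowest-
+    // addressed, differing byte, and masking with -16 aligns that offset to a
+    // 16-bit char boundary before both words are shifted down and their low
+    // chars subtracted)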
+    __ bind(CALCULATE_DIFFERENCE);
+      __ rev(rscratch2, rscratch2);
+      __ clz(rscratch2, rscratch2);
+      __ andr(rscratch2, rscratch2, -16);
+      __ lsrv(tmp1, tmp1, rscratch2);
+      __ uxthw(tmp1, tmp1);
+      __ lsrv(rscratch1, rscratch1, rscratch2);
+      __ uxthw(rscratch1, rscratch1);
+      __ subw(result, tmp1, rscratch1);
+    __ bind(DONE);
+      __ ret(lr);
+    return entry;
+  }
+
+  // r0  = result
+  // r1  = str1
+  // r2  = cnt1
+  // r3  = str2
+  // r4  = cnt2
+  // r10 = tmp1
+  // r11 = tmp2
+  address generate_compare_long_string_same_encoding(bool isLL) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", isLL
+        ? "compare_long_string_same_encoding LL"
+        : "compare_long_string_same_encoding UU");
+    address entry = __ pc();
+    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
+        tmp1 = r10, tmp2 = r11;
+    Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
+        LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
+        DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
+    // exit from the large loop when less than 64 bytes are left to read or
+    // we're about to prefetch memory past the array border
+    int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
+    // cnt1/cnt2 contain the number of characters to compare. cnt1 can be re-used.
+    // update the cnt2 counter for the already loaded 8 bytes
+    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
+    // update pointers, because of previous read
+    __ add(str1, str1, wordSize);
+    __ add(str2, str2, wordSize);
+    if (SoftwarePrefetchHintDistance >= 0) {
+      __ bind(LARGE_LOOP_PREFETCH);
+        __ prfm(Address(str1, SoftwarePrefetchHintDistance));
+        __ prfm(Address(str2, SoftwarePrefetchHintDistance));
+        compare_string_16_bytes_same(DIFF, DIFF2);
+        compare_string_16_bytes_same(DIFF, DIFF2);
+        __ sub(cnt2, cnt2, isLL ? 64 : 32);
+        compare_string_16_bytes_same(DIFF, DIFF2);
+        __ cmp(cnt2, largeLoopExitCondition);
+        compare_string_16_bytes_same(DIFF, DIFF2);
+        __ br(__ GT, LARGE_LOOP_PREFETCH);
+        __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
+        // less than 16 bytes left?
+        __ subs(cnt2, cnt2, isLL ? 16 : 8);
+        __ br(__ LT, TAIL);
+    }
+    __ bind(SMALL_LOOP);
+      compare_string_16_bytes_same(DIFF, DIFF2);
+      __ subs(cnt2, cnt2, isLL ? 16 : 8);
+      __ br(__ GE, SMALL_LOOP);
+    __ bind(TAIL);
+      __ adds(cnt2, cnt2, isLL ? 16 : 8);
+      __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
+      __ subs(cnt2, cnt2, isLL ? 8 : 4);
+      __ br(__ LE, CHECK_LAST);
+      __ eor(rscratch2, tmp1, tmp2);
+      __ cbnz(rscratch2, DIFF);
+      __ ldr(tmp1, Address(__ post(str1, 8)));
+      __ ldr(tmp2, Address(__ post(str2, 8)));
+      __ sub(cnt2, cnt2, isLL ? 8 : 4);
+    __ bind(CHECK_LAST);
+      if (!isLL) {
+        __ add(cnt2, cnt2, cnt2); // now in bytes
+      }
+      __ eor(rscratch2, tmp1, tmp2);
+      __ cbnz(rscratch2, DIFF);
+      __ ldr(rscratch1, Address(str1, cnt2));
+      __ ldr(cnt1, Address(str2, cnt2));
+      __ eor(rscratch2, rscratch1, cnt1);
+      __ cbz(rscratch2, LENGTH_DIFF);
+      // Find the first different characters in the longwords and
+      // compute their difference.
+    __ bind(DIFF2);
+      __ rev(rscratch2, rscratch2);
+      __ clz(rscratch2, rscratch2);
+      __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
+      __ lsrv(rscratch1, rscratch1, rscratch2);
+      if (isLL) {
+        __ lsrv(cnt1, cnt1, rscratch2);
+        __ uxtbw(rscratch1, rscratch1);
+        __ uxtbw(cnt1, cnt1);
+      } else {
+        __ lsrv(cnt1, cnt1, rscratch2);
+        __ uxthw(rscratch1, rscratch1);
+        __ uxthw(cnt1, cnt1);
+      }
+      __ subw(result, rscratch1, cnt1);
+      __ b(LENGTH_DIFF);
+    __ bind(DIFF);
+      __ rev(rscratch2, rscratch2);
+      __ clz(rscratch2, rscratch2);
+      __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
+      __ lsrv(tmp1, tmp1, rscratch2);
+      if (isLL) {
+        __ lsrv(tmp2, tmp2, rscratch2);
+        __ uxtbw(tmp1, tmp1);
+        __ uxtbw(tmp2, tmp2);
+      } else {
+        __ lsrv(tmp2, tmp2, rscratch2);
+        __ uxthw(tmp1, tmp1);
+        __ uxthw(tmp2, tmp2);
+      }
+      __ subw(result, tmp1, tmp2);
+      __ b(LENGTH_DIFF);
+    __ bind(LAST_CHECK_AND_LENGTH_DIFF);
+      __ eor(rscratch2, tmp1, tmp2);
+      __ cbnz(rscratch2, DIFF);
+    __ bind(LENGTH_DIFF);
+      __ ret(lr);
+    return entry;
+  }
+
+  void generate_compare_long_strings() {
+      StubRoutines::aarch64::_compare_long_string_LL
+          = generate_compare_long_string_same_encoding(true);
+      StubRoutines::aarch64::_compare_long_string_UU
+          = generate_compare_long_string_same_encoding(false);
+      StubRoutines::aarch64::_compare_long_string_LU
+          = generate_compare_long_string_different_encoding(true);
+      StubRoutines::aarch64::_compare_long_string_UL
+          = generate_compare_long_string_different_encoding(false);
+  }
+
+  // R0 = result
+  // R1 = str2
+  // R2 = cnt1
+  // R3 = str1
+  // R4 = cnt2
+  // This generic linear code uses a few additional ideas, which make it faster:
+  // 1) we can safely keep at least the 1st register of the pattern (since
+  // length >= 8) in order to skip initial loading (helps on systems with
+  // 1 ld pipeline)
+  // 2) we can use the "fast" algorithm of finding a single character to search
+  // for the first symbol with fewer branches (1 branch per loaded register
+  // instead of a branch per symbol); this is where constants like
+  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
+  // (see the sketch below)
+  // 3) after loading and analyzing the 1st register of the source string, it
+  // can be used to search for every 1st-character entry, saving a few loads
+  // compared with a "simpler-but-slower" implementation
+  // 4) in order to avoid lots of push/pop operations, the code below heavily
+  // re-uses/re-initializes/compresses register values, which makes the code
+  // larger and a bit less readable; however, most of the extra operations are
+  // issued during loads or branches, so the penalty is minimal
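+  // A C sketch of the single-character search in 2) (illustrative only): with
+  // the first pattern character replicated into every byte/char slot
+  // (first * 0x0101010101010101 for the byte case),
+  //   w       = chunk ^ replicated_first;      // a matching slot becomes zero
+  //   matches = (w - 0x0101010101010101) & ~w & 0x8080808080808080;
+  // "matches" has the high bit set in every byte whose source byte matched;
+  // the sub/orr/bics sequence below computes exactly this, since
+  // ~(w | 0x7f7f7f7f7f7f7f7f) == ~w & 0x8080808080808080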
+  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
+    const char* stubName = str1_isL
+        ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
+        : "indexof_linear_uu";
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", stubName);
+    address entry = __ pc();
+
+    int str1_chr_size = str1_isL ? 1 : 2;
+    int str2_chr_size = str2_isL ? 1 : 2;
+    int str1_chr_shift = str1_isL ? 0 : 1;
+    int str2_chr_shift = str2_isL ? 0 : 1;
+    bool isL = str1_isL && str2_isL;
+    // parameters
+    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
+    // temporary registers
+    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
+    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
+    // redefinitions
+    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
+
+    __ push(spilled_regs, sp);
+    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP,
+        L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
+        L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
+        L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
+        L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
+        L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
+    // Read whole register from str1. It is safe, because length >=8 here
+    __ ldr(ch1, Address(str1));
+    // Read whole register from str2. It is safe, because length >=8 here
+    __ ldr(ch2, Address(str2));
+    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
+    if (str1_isL != str2_isL) {
+      __ eor(v0, __ T16B, v0, v0);
+    }
+    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
+    __ mul(first, first, tmp1);
+    // check if we have less than 1 register to check
+    __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
+    if (str1_isL != str2_isL) {
+      __ fmovd(v1, ch1);
+    }
+    __ br(__ LE, L_SMALL);
+    __ eor(ch2, first, ch2);
+    if (str1_isL != str2_isL) {
+      __ zip1(v1, __ T16B, v1, v0);
+    }
+    __ sub(tmp2, ch2, tmp1);
+    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+    __ bics(tmp2, tmp2, ch2);
+    if (str1_isL != str2_isL) {
+      __ fmovd(ch1, v1);
+    }
+    __ br(__ NE, L_HAS_ZERO);
+    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
+    __ add(result, result, wordSize/str2_chr_size);
+    __ add(str2, str2, wordSize);
+    __ br(__ LT, L_POST_LOOP);
+    __ BIND(L_LOOP);
+      __ ldr(ch2, Address(str2));
+      __ eor(ch2, first, ch2);
+      __ sub(tmp2, ch2, tmp1);
+      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+      __ bics(tmp2, tmp2, ch2);
+      __ br(__ NE, L_HAS_ZERO);
+    __ BIND(L_LOOP_PROCEED);
+      __ subs(cnt2, cnt2, wordSize/str2_chr_size);
+      __ add(str2, str2, wordSize);
+      __ add(result, result, wordSize/str2_chr_size);
+      __ br(__ GE, L_LOOP);
+    __ BIND(L_POST_LOOP);
+      __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check
+      __ br(__ LE, NOMATCH);
+      __ ldr(ch2, Address(str2));
+      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
+      __ eor(ch2, first, ch2);
+      __ sub(tmp2, ch2, tmp1);
+      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+      __ mov(tmp4, -1); // all bits set
+      __ b(L_SMALL_PROCEED);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_SMALL);
+      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
+      __ eor(ch2, first, ch2);
+      if (str1_isL != str2_isL) {
+        __ zip1(v1, __ T16B, v1, v0);
+      }
+      __ sub(tmp2, ch2, tmp1);
+      __ mov(tmp4, -1); // all bits set
+      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
+      if (str1_isL != str2_isL) {
+        __ fmovd(ch1, v1); // move converted 4 symbols
+      }
+    __ BIND(L_SMALL_PROCEED);
+      __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
+      __ bic(tmp2, tmp2, ch2);
+      __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
+      __ rbit(tmp2, tmp2);
+      __ br(__ EQ, NOMATCH);
+    __ BIND(L_SMALL_HAS_ZERO_LOOP);
+      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
+      __ cmp(cnt1, wordSize/str2_chr_size);
+      __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
+      if (str2_isL) { // LL
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
+        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+      } else {
+        __ mov(ch2, 0xE); // all bits in byte set except last one
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+      }
+      __ cmp(ch1, ch2);
+      __ mov(tmp4, wordSize/str2_chr_size);
+      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
+    __ BIND(L_SMALL_CMP_LOOP);
+      str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
+               : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
+      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
+               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
+      __ add(tmp4, tmp4, 1);
+      __ cmp(tmp4, cnt1);
+      __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
+      __ cmp(first, ch2);
+      __ br(__ EQ, L_SMALL_CMP_LOOP);
+    __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
+      __ cbz(tmp2, NOMATCH); // no more matches. exit
+      __ clz(tmp4, tmp2);
+      __ add(result, result, 1); // advance index
+      __ add(str2, str2, str2_chr_size); // advance pointer
+      __ b(L_SMALL_HAS_ZERO_LOOP);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
+      __ cmp(first, ch2);
+      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
+      if (str2_isL) { // LL
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
+        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+      } else {
+        __ mov(ch2, 0xE); // all bits in byte set except last one
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+      }
+      __ cmp(ch1, ch2);
+      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_HAS_ZERO);
+      __ rbit(tmp2, tmp2);
+      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
+      // Now, perform compression of the counters (cnt2 and cnt1) into one
+      // register. It's fine because both counters are 32 bit and are not
+      // changed in this loop. Just restore them on exit. So, cnt1 can be
+      // re-used in this loop.
+      __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
+      __ sub(result, result, 1);
+    __ BIND(L_HAS_ZERO_LOOP);
+      __ mov(cnt1, wordSize/str2_chr_size);
+      __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
+      __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
+      if (str2_isL) {
+        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1);
+        __ mov(tmp4, wordSize/str2_chr_size);
+      } else {
+        __ mov(ch2, 0xE);
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1);
+        __ mov(tmp4, wordSize/str2_chr_size);
+        __ sub(str2, str2, str2_chr_size);
+      }
+      __ cmp(ch1, ch2);
+      __ mov(tmp4, wordSize/str2_chr_size);
+      __ br(__ NE, L_CMP_LOOP_NOMATCH);
+    __ BIND(L_CMP_LOOP);
+      str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
+               : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
+      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
+               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
+      __ add(tmp4, tmp4, 1);
+      __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
+      __ br(__ GE, L_CMP_LOOP_LAST_CMP);
+      __ cmp(cnt1, ch2);
+      __ br(__ EQ, L_CMP_LOOP);
+    __ BIND(L_CMP_LOOP_NOMATCH);
+      // No match at this position.
+      __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
+      __ clz(tmp4, tmp2);
+      __ add(str2, str2, str2_chr_size); // advance pointer
+      __ b(L_HAS_ZERO_LOOP);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_CMP_LOOP_LAST_CMP);
+      __ cmp(cnt1, ch2);
+      __ br(__ NE, L_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_CMP_LOOP_LAST_CMP2);
+      if (str2_isL) {
+        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ lsl(tmp2, tmp2, 1);
+      } else {
+        __ mov(ch2, 0xE);
+        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
+        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
+        __ lslv(tmp2, tmp2, tmp4);
+        __ add(tmp4, tmp4, 1);
+        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
+        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
+        __ lsl(tmp2, tmp2, 1);
+        __ sub(str2, str2, str2_chr_size);
+      }
+      __ cmp(ch1, ch2);
+      __ br(__ NE, L_CMP_LOOP_NOMATCH);
+      __ b(DONE);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
+      // 1) Restore the "result" index. The index was a multiple of
+      // wordSize/str2_chr_size until the L_HAS_ZERO block. One byte octet was
+      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
+      // wordSize/str2_chr_size - 1 and the respective high bits are unchanged.
+      // L_LOOP_PROCEED will increase result by the number of analyzed
+      // characters, so we can simply reset the lower bits of result here:
+      // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
+      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
+      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
+      // index of the last analyzed substring inside the current octet, so str2
+      // is at the respective start address; advance it to the next octet.
+      __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
+      __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
+      __ bfm(result, zr, 0, 2 - str2_chr_shift);
+      __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
+      __ movw(cnt2, cnt2);
+      __ b(L_LOOP_PROCEED);
+    __ align(OptoLoopAlignment);
+    __ BIND(NOMATCH);
+      __ mov(result, -1);
+    __ BIND(DONE);
+      __ pop(spilled_regs, sp);
+      __ ret(lr);
+    return entry;
+  }
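
A note on the counter trick above: on AArch64, BitsPerByte * wordSize / 2 is 32, so the orr at L_HAS_ZERO parks cnt1 in the upper half of cnt2, and L_HAS_ZERO_LOOP_NOMATCH recovers both with lsr and a 32-bit move. A minimal C++ sketch of that encoding (illustrative only, not part of the patch):

    #include <cstdint>

    // orr cnt2, cnt2, cnt1, LSL #32 -- pack both 32-bit counters into one register
    static inline uint64_t pack_counters(uint32_t cnt1, uint32_t cnt2) {
      return (uint64_t)cnt2 | ((uint64_t)cnt1 << 32);
    }

    // lsr cnt1, cnt2, #32 and movw cnt2, cnt2 -- recover the original values
    static inline void unpack_counters(uint64_t packed, uint32_t& cnt1, uint32_t& cnt2) {
      cnt1 = (uint32_t)(packed >> 32);
      cnt2 = (uint32_t)packed;  // a 32-bit register write zeroes the upper half
    }
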
+
+  void generate_string_indexof_stubs() {
+    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
+    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
+    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
+  }
+
+  void inflate_and_store_2_fp_registers(bool generatePrfm,
+      FloatRegister src1, FloatRegister src2) {
+    Register dst = r1;
+    __ zip1(v1, __ T16B, src1, v0);
+    __ zip2(v2, __ T16B, src1, v0);
+    if (generatePrfm) {
+      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
+    }
+    __ zip1(v3, __ T16B, src2, v0);
+    __ zip2(v4, __ T16B, src2, v0);
+    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
+  }
+
+  // R0 = src
+  // R1 = dst
+  // R2 = len
+  // R3 = len >> 3
+  // v0 = 0
+  // v1 = loaded 8 bytes
+  address generate_large_byte_array_inflate() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
+    address entry = __ pc();
+    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
+    Register src = r0, dst = r1, len = r2, octetCounter = r3;
+    const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
+
+    // Do one more 8-byte read so the address is 16-byte aligned in most cases,
+    // and use a single store instruction.
+    __ ldrd(v2, __ post(src, 8));
+    __ sub(octetCounter, octetCounter, 2);
+    __ zip1(v1, __ T16B, v1, v0);
+    __ zip1(v2, __ T16B, v2, v0);
+    __ st1(v1, v2, __ T16B, __ post(dst, 32));
+    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
+    __ cmp(octetCounter, large_loop_threshold);
+    __ br(__ LE, LOOP_START);
+    __ b(LOOP_PRFM_START);
+    __ bind(LOOP_PRFM);
+      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
+    __ bind(LOOP_PRFM_START);
+      __ prfm(Address(src, SoftwarePrefetchHintDistance));
+      __ sub(octetCounter, octetCounter, 8);
+      __ cmp(octetCounter, large_loop_threshold);
+      inflate_and_store_2_fp_registers(true, v3, v4);
+      inflate_and_store_2_fp_registers(true, v5, v6);
+      __ br(__ GT, LOOP_PRFM);
+      __ cmp(octetCounter, 8);
+      __ br(__ LT, DONE);
+    __ bind(LOOP);
+      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
+      __ bind(LOOP_START);
+      __ sub(octetCounter, octetCounter, 8);
+      __ cmp(octetCounter, 8);
+      inflate_and_store_2_fp_registers(false, v3, v4);
+      inflate_and_store_2_fp_registers(false, v5, v6);
+      __ br(__ GE, LOOP);
+    __ bind(DONE);
+      __ ret(lr);
+    return entry;
+  }
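
The zip1/zip2 pairs above interleave each Latin-1 source byte with a zero byte from v0, widening bytes to UTF-16 code units before the wide stores. A scalar C++ sketch of the same transformation, assuming little-endian layout (the stub itself processes 32 or 64 bytes per iteration, with optional software prefetch):

    #include <cstdint>
    #include <cstddef>

    // Scalar equivalent of byte-array inflation: every byte becomes a 16-bit
    // char whose high byte is zero, matching zip1/zip2 against a zero vector.
    static void inflate_bytes(const uint8_t* src, uint16_t* dst, size_t len) {
      for (size_t i = 0; i < len; i++) {
        dst[i] = (uint16_t)src[i];
      }
    }
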
 
   /**
    *  Arguments:
@@ -5044,6 +5739,18 @@
     if (UseCRC32CIntrinsics) {
       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
     }
+
+    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
+      StubRoutines::_dlog = generate_dlog();
+    }
+
+    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
+      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
+    }
+
+    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
+      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
+    }
   }
 
   void generate_all() {
@@ -5078,6 +5785,13 @@
       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
     }
 
+    generate_compare_long_strings();
+
+    generate_string_indexof_stubs();
+
+    // byte_array_inflate stub for large arrays.
+    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
+
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -48,6 +48,14 @@
 address StubRoutines::aarch64::_has_negatives = NULL;
 address StubRoutines::aarch64::_has_negatives_long = NULL;
 address StubRoutines::aarch64::_large_array_equals = NULL;
+address StubRoutines::aarch64::_compare_long_string_LL = NULL;
+address StubRoutines::aarch64::_compare_long_string_UU = NULL;
+address StubRoutines::aarch64::_compare_long_string_LU = NULL;
+address StubRoutines::aarch64::_compare_long_string_UL = NULL;
+address StubRoutines::aarch64::_string_indexof_linear_ll = NULL;
+address StubRoutines::aarch64::_string_indexof_linear_uu = NULL;
+address StubRoutines::aarch64::_string_indexof_linear_ul = NULL;
+address StubRoutines::aarch64::_large_byte_array_inflate = NULL;
 bool StubRoutines::aarch64::_completed = false;
 
 /**
@@ -278,3 +286,87 @@
     0x02D578EDUL, 0x7DAEED62UL,         // word swap
     0xD502ED78UL, 0xAE7D62EDUL,         // byte swap of word swap
 };
+
+juint StubRoutines::aarch64::_npio2_hw[] __attribute__ ((aligned(64))) = {
+    // first, various coefficient values: 0.5, invpio2, pio2_1, pio2_1t, pio2_2,
+    // pio2_2t, pio2_3, pio2_3t
+    // This is a small optimization which keeps the double[8] values in the
+    // int[] table to save address-calculation instructions.
+    //
+    // invpio2:  53 bits of 2/pi (enough when the trigonometric argument is small)
+    // pio2_1:   first  33 bit of pi/2
+    // pio2_1t:  pi/2 - pio2_1
+    // pio2_2:   second 33 bit of pi/2
+    // pio2_2t:  pi/2 - (pio2_1+pio2_2)
+    // pio2_3:   third  33 bit of pi/2
+    // pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
+    0x00000000, 0x3fe00000, // 0.5
+    0x6DC9C883, 0x3FE45F30, // invpio2 = 6.36619772367581382433e-01
+    0x54400000, 0x3FF921FB, // pio2_1 = 1.57079632673412561417e+00
+    0x1A626331, 0x3DD0B461, // pio2_1t = 6.07710050650619224932e-11
+    0x1A600000, 0x3DD0B461, // pio2_2 = 6.07710050630396597660e-11
+    0x2E037073, 0x3BA3198A, // pio2_2t = 2.02226624879595063154e-21
+    0x2E000000, 0x3BA3198A, // pio2_3 = 2.02226624871116645580e-21
+    0x252049C1, 0x397B839A, // pio2_3t = 8.47842766036889956997e-32
+    // now, npio2_hw itself
+    0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
+    0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
+    0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
+    0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
+    0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
+    0x404858EB, 0x404921FB
+};
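
The split pi/2 constants in the header of this table support Cody-Waite style argument reduction: the head (pio2_1) has few significand bits, so x - n*pio2_1 is exact for moderate n, and the tails (pio2_1t, pio2_2/pio2_2t, pio2_3/pio2_3t) remove the remaining error. A hedged sketch of the two-term step, with names taken from the comments above (the generated stub also handles huge arguments via _two_over_pi):

    #include <cmath>

    // Reduce x toward [-pi/4, pi/4]; returns r ~= x - n*(pi/2).
    static double reduce_pio2(double x, int& n) {
      const double invpio2 = 6.36619772367581382433e-01;  // 53 bits of 2/pi
      const double pio2_1  = 1.57079632673412561417e+00;  // first 33 bits of pi/2
      const double pio2_1t = 6.07710050650619224932e-11;  // pi/2 - pio2_1
      n = (int)std::round(x * invpio2);
      double r = x - n * pio2_1;   // exact while n*pio2_1 needs few significand bits
      return r - n * pio2_1t;      // the pio2_2/pio2_3 terms refine this when needed
    }
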
+
+// Coefficients for sin(x) polynomial approximation: S1..S6.
+// See kernel_sin comments in macroAssembler_aarch64_trig.cpp for details
+jdouble StubRoutines::aarch64::_dsin_coef[] __attribute__ ((aligned(64))) = {
+    -1.66666666666666324348e-01, // 0xBFC5555555555549
+     8.33333333332248946124e-03, // 0x3F8111111110F8A6
+    -1.98412698298579493134e-04, // 0xBF2A01A019C161D5
+     2.75573137070700676789e-06, // 0x3EC71DE357B1FE7D
+    -2.50507602534068634195e-08, // 0xBE5AE5E68A2B9CEB
+     1.58969099521155010221e-10  // 0x3DE5D93A5ACFD57C
+};
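
kernel_sin consumes these S1..S6 terms with a Horner scheme in z = x*x. A simplified sketch for a reduced argument |x| <= ~pi/4 (the real kernel also carries a correction term for the tail of the reduced argument):

    // sin(x) ~= x + x^3*(S1 + z*(S2 + z*(S3 + z*(S4 + z*(S5 + z*S6))))), z = x*x
    static double kernel_sin_approx(double x) {
      const double S1 = -1.66666666666666324348e-01;
      const double S2 =  8.33333333332248946124e-03;
      const double S3 = -1.98412698298579493134e-04;
      const double S4 =  2.75573137070700676789e-06;
      const double S5 = -2.50507602534068634195e-08;
      const double S6 =  1.58969099521155010221e-10;
      double z = x * x;
      double r = S2 + z * (S3 + z * (S4 + z * (S5 + z * S6)));
      return x + x * z * (S1 + z * r);
    }
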
+
+// Coefficients for cos(x) polynomial approximation: C1..C6.
+// See kernel_cos comments in macroAssembler_aarch64_trig.cpp for details
+jdouble StubRoutines::aarch64::_dcos_coef[] __attribute__ ((aligned(64))) = {
+     4.16666666666666019037e-02, // 0x3FA555555555554C
+    -1.38888888888741095749e-03, // 0xBF56C16C16C15177
+     2.48015872894767294178e-05, // 0x3EFA01A019CB1590
+    -2.75573143513906633035e-07, // 0xBE927E4F809C52AD
+     2.08757232129817482790e-09, // 0x3E21EE9EBDB4B1C4
+    -1.13596475577881948265e-11  // 0xBDA8FAE9BE8838D4
+};
+
+// Table of constants for 2/pi: 396 hex digits (476 decimal digits) of 2/pi.
+// Used for very large arguments; 396 hex digits are enough for the required
+// precision.
+// Stored as doubles to avoid unnecessary conversions in the generated code.
+// NOTE: the table matches the original int table {0xA2F983, 0x6E4E44, ...}
+//       with only a (double) conversion added.
+jdouble StubRoutines::aarch64::_two_over_pi[] __attribute__ ((aligned(64))) = {
+  (double)0xA2F983, (double)0x6E4E44, (double)0x1529FC, (double)0x2757D1, (double)0xF534DD, (double)0xC0DB62,
+  (double)0x95993C, (double)0x439041, (double)0xFE5163, (double)0xABDEBB, (double)0xC561B7, (double)0x246E3A,
+  (double)0x424DD2, (double)0xE00649, (double)0x2EEA09, (double)0xD1921C, (double)0xFE1DEB, (double)0x1CB129,
+  (double)0xA73EE8, (double)0x8235F5, (double)0x2EBB44, (double)0x84E99C, (double)0x7026B4, (double)0x5F7E41,
+  (double)0x3991D6, (double)0x398353, (double)0x39F49C, (double)0x845F8B, (double)0xBDF928, (double)0x3B1FF8,
+  (double)0x97FFDE, (double)0x05980F, (double)0xEF2F11, (double)0x8B5A0A, (double)0x6D1F6D, (double)0x367ECF,
+  (double)0x27CB09, (double)0xB74F46, (double)0x3F669E, (double)0x5FEA2D, (double)0x7527BA, (double)0xC7EBE5,
+  (double)0xF17B3D, (double)0x0739F7, (double)0x8A5292, (double)0xEA6BFB, (double)0x5FB11F, (double)0x8D5D08,
+  (double)0x560330, (double)0x46FC7B, (double)0x6BABF0, (double)0xCFBC20, (double)0x9AF436, (double)0x1DA9E3,
+  (double)0x91615E, (double)0xE61B08, (double)0x659985, (double)0x5F14A0, (double)0x68408D, (double)0xFFD880,
+  (double)0x4D7327, (double)0x310606, (double)0x1556CA, (double)0x73A8C9, (double)0x60E27B, (double)0xC08C6B,
+};
+
+// Pi over 2 value
+jdouble StubRoutines::aarch64::_pio2[] __attribute__ ((aligned(64))) = {
+  1.57079625129699707031e+00, // 0x3FF921FB40000000
+  7.54978941586159635335e-08, // 0x3E74442D00000000
+  5.39030252995776476554e-15, // 0x3CF8469880000000
+  3.28200341580791294123e-22, // 0x3B78CC5160000000
+  1.27065575308067607349e-29, // 0x39F01B8380000000
+  1.22933308981111328932e-36, // 0x387A252040000000
+  2.73370053816464559624e-44, // 0x36E3822280000000
+  2.16741683877804819444e-51, // 0x3569F31D00000000
+};
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 10:21:50 2018 -0700
@@ -41,7 +41,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 28000           // simply increase if too small (assembler will crash if too small)
 };
 
 class aarch64 {
@@ -66,6 +66,14 @@
   static address _has_negatives;
   static address _has_negatives_long;
   static address _large_array_equals;
+  static address _compare_long_string_LL;
+  static address _compare_long_string_LU;
+  static address _compare_long_string_UL;
+  static address _compare_long_string_UU;
+  static address _string_indexof_linear_ll;
+  static address _string_indexof_linear_uu;
+  static address _string_indexof_linear_ul;
+  static address _large_byte_array_inflate;
   static bool _completed;
 
  public:
@@ -136,6 +144,38 @@
       return _large_array_equals;
   }
 
+  static address compare_long_string_LL() {
+      return _compare_long_string_LL;
+  }
+
+  static address compare_long_string_LU() {
+      return _compare_long_string_LU;
+  }
+
+  static address compare_long_string_UL() {
+      return _compare_long_string_UL;
+  }
+
+  static address compare_long_string_UU() {
+      return _compare_long_string_UU;
+  }
+
+  static address string_indexof_linear_ul() {
+      return _string_indexof_linear_ul;
+  }
+
+  static address string_indexof_linear_ll() {
+      return _string_indexof_linear_ll;
+  }
+
+  static address string_indexof_linear_uu() {
+      return _string_indexof_linear_uu;
+  }
+
+  static address large_byte_array_inflate() {
+      return _large_byte_array_inflate;
+  }
+
   static bool complete() {
     return _completed;
   }
@@ -146,7 +186,13 @@
 
 private:
   static juint    _crc_table[];
-
+  // begin trigonometric tables block. See comments in .cpp file
+  static juint    _npio2_hw[];
+  static jdouble  _two_over_pi[];
+  static jdouble  _pio2[];
+  static jdouble  _dsin_coef[];
+  static jdouble  _dcos_coef[];
+  // end trigonometric tables block
 };
 
 #endif // CPU_AARCH64_VM_STUBROUTINES_AARCH64_HPP
--- a/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -247,26 +247,54 @@
   address fn;
   switch (kind) {
   case Interpreter::java_lang_math_sin :
-    fn = CAST_FROM_FN_PTR(address, SharedRuntime::dsin);
+    if (StubRoutines::dsin() == NULL) {
+      fn = CAST_FROM_FN_PTR(address, SharedRuntime::dsin);
+    } else {
+      fn = CAST_FROM_FN_PTR(address, StubRoutines::dsin());
+    }
     break;
   case Interpreter::java_lang_math_cos :
-    fn = CAST_FROM_FN_PTR(address, SharedRuntime::dcos);
+    if (StubRoutines::dcos() == NULL) {
+      fn = CAST_FROM_FN_PTR(address, SharedRuntime::dcos);
+    } else {
+      fn = CAST_FROM_FN_PTR(address, StubRoutines::dcos());
+    }
     break;
   case Interpreter::java_lang_math_tan :
-    fn = CAST_FROM_FN_PTR(address, SharedRuntime::dtan);
+    if (StubRoutines::dtan() == NULL) {
+      fn = CAST_FROM_FN_PTR(address, SharedRuntime::dtan);
+    } else {
+      fn = CAST_FROM_FN_PTR(address, StubRoutines::dtan());
+    }
     break;
   case Interpreter::java_lang_math_log :
-    fn = CAST_FROM_FN_PTR(address, SharedRuntime::dlog);
+    if (StubRoutines::dlog() == NULL) {
+      fn = CAST_FROM_FN_PTR(address, SharedRuntime::dlog);
+    } else {
+      fn = CAST_FROM_FN_PTR(address, StubRoutines::dlog());
+    }
     break;
   case Interpreter::java_lang_math_log10 :
-    fn = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10);
+    if (StubRoutines::dlog10() == NULL) {
+      fn = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10);
+    } else {
+      fn = CAST_FROM_FN_PTR(address, StubRoutines::dlog10());
+    }
     break;
   case Interpreter::java_lang_math_exp :
-    fn = CAST_FROM_FN_PTR(address, SharedRuntime::dexp);
+    if (StubRoutines::dexp() == NULL) {
+      fn = CAST_FROM_FN_PTR(address, SharedRuntime::dexp);
+    } else {
+      fn = CAST_FROM_FN_PTR(address, StubRoutines::dexp());
+    }
     break;
   case Interpreter::java_lang_math_pow :
     fpargs = 2;
-    fn = CAST_FROM_FN_PTR(address, SharedRuntime::dpow);
+    if (StubRoutines::dpow() == NULL) {
+      fn = CAST_FROM_FN_PTR(address, SharedRuntime::dpow);
+    } else {
+      fn = CAST_FROM_FN_PTR(address, StubRoutines::dpow());
+    }
     break;
   default:
     ShouldNotReachHere();
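
Every case in this switch applies the same rule: prefer the generated stub when one was installed, otherwise fall back to the shared C implementation. A hypothetical helper capturing that choice (the helper name is illustrative; HotSpot inlines the check per case, as shown above):

    // Prefer a generated math stub; fall back to the SharedRuntime entry when
    // no stub exists for this platform/flag combination.
    static address pick_math_entry(address stub, address shared_runtime_fn) {
      return (stub == NULL) ? shared_runtime_fn : stub;
    }
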
--- a/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -761,7 +761,7 @@
   // r1: index
   index_check(r0, r1); // leaves index in r1, kills rscratch1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_INT) >> 2);
-  __ access_load_at(T_INT, IN_HEAP | IN_HEAP_ARRAY, r0, Address(r0, r1, Address::uxtw(2)), noreg, noreg);
+  __ access_load_at(T_INT, IN_HEAP | IS_ARRAY, r0, Address(r0, r1, Address::uxtw(2)), noreg, noreg);
 }
 
 void TemplateTable::laload()
@@ -773,7 +773,7 @@
   // r1: index
   index_check(r0, r1); // leaves index in r1, kills rscratch1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_LONG) >> 3);
-  __ access_load_at(T_LONG, IN_HEAP | IN_HEAP_ARRAY, r0, Address(r0, r1, Address::uxtw(3)), noreg, noreg);
+  __ access_load_at(T_LONG, IN_HEAP | IS_ARRAY, r0, Address(r0, r1, Address::uxtw(3)), noreg, noreg);
 }
 
 void TemplateTable::faload()
@@ -785,7 +785,7 @@
   // r1: index
   index_check(r0, r1); // leaves index in r1, kills rscratch1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_FLOAT) >> 2);
-  __ access_load_at(T_FLOAT, IN_HEAP | IN_HEAP_ARRAY, r0, Address(r0, r1, Address::uxtw(2)), noreg, noreg);
+  __ access_load_at(T_FLOAT, IN_HEAP | IS_ARRAY, r0, Address(r0, r1, Address::uxtw(2)), noreg, noreg);
 }
 
 void TemplateTable::daload()
@@ -797,7 +797,7 @@
   // r1: index
   index_check(r0, r1); // leaves index in r1, kills rscratch1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_DOUBLE) >> 3);
-  __ access_load_at(T_DOUBLE, IN_HEAP | IN_HEAP_ARRAY, r0, Address(r0, r1, Address::uxtw(3)), noreg, noreg);
+  __ access_load_at(T_DOUBLE, IN_HEAP | IS_ARRAY, r0, Address(r0, r1, Address::uxtw(3)), noreg, noreg);
 }
 
 void TemplateTable::aaload()
@@ -812,7 +812,7 @@
   do_oop_load(_masm,
               Address(r0, r1, Address::uxtw(LogBytesPerHeapOop)),
               r0,
-              IN_HEAP_ARRAY);
+              IS_ARRAY);
 }
 
 void TemplateTable::baload()
@@ -824,7 +824,7 @@
   // r1: index
   index_check(r0, r1); // leaves index in r1, kills rscratch1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_BYTE) >> 0);
-  __ access_load_at(T_BYTE, IN_HEAP | IN_HEAP_ARRAY, r0, Address(r0, r1, Address::uxtw(0)), noreg, noreg);
+  __ access_load_at(T_BYTE, IN_HEAP | IS_ARRAY, r0, Address(r0, r1, Address::uxtw(0)), noreg, noreg);
 }
 
 void TemplateTable::caload()
@@ -836,7 +836,7 @@
   // r1: index
   index_check(r0, r1); // leaves index in r1, kills rscratch1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_CHAR) >> 1);
-  __ access_load_at(T_CHAR, IN_HEAP | IN_HEAP_ARRAY, r0, Address(r0, r1, Address::uxtw(1)), noreg, noreg);
+  __ access_load_at(T_CHAR, IN_HEAP | IS_ARRAY, r0, Address(r0, r1, Address::uxtw(1)), noreg, noreg);
 }
 
 // iload followed by caload frequent pair
@@ -853,7 +853,7 @@
   // r1: index
   index_check(r0, r1); // leaves index in r1, kills rscratch1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_CHAR) >> 1);
-  __ access_load_at(T_CHAR, IN_HEAP | IN_HEAP_ARRAY, r0, Address(r0, r1, Address::uxtw(1)), noreg, noreg);
+  __ access_load_at(T_CHAR, IN_HEAP | IS_ARRAY, r0, Address(r0, r1, Address::uxtw(1)), noreg, noreg);
 }
 
 void TemplateTable::saload()
@@ -865,7 +865,7 @@
   // r1: index
   index_check(r0, r1); // leaves index in r1, kills rscratch1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_SHORT) >> 1);
-  __ access_load_at(T_SHORT, IN_HEAP | IN_HEAP_ARRAY, r0, Address(r0, r1, Address::uxtw(1)), noreg, noreg);
+  __ access_load_at(T_SHORT, IN_HEAP | IS_ARRAY, r0, Address(r0, r1, Address::uxtw(1)), noreg, noreg);
 }
 
 void TemplateTable::iload(int n)
@@ -1059,7 +1059,7 @@
   // r3: array
   index_check(r3, r1); // prefer index in r1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_INT) >> 2);
-  __ access_store_at(T_INT, IN_HEAP | IN_HEAP_ARRAY, Address(r3, r1, Address::uxtw(2)), r0, noreg, noreg);
+  __ access_store_at(T_INT, IN_HEAP | IS_ARRAY, Address(r3, r1, Address::uxtw(2)), r0, noreg, noreg);
 }
 
 void TemplateTable::lastore() {
@@ -1071,7 +1071,7 @@
   // r3: array
   index_check(r3, r1); // prefer index in r1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_LONG) >> 3);
-  __ access_store_at(T_LONG, IN_HEAP | IN_HEAP_ARRAY, Address(r3, r1, Address::uxtw(3)), r0, noreg, noreg);
+  __ access_store_at(T_LONG, IN_HEAP | IS_ARRAY, Address(r3, r1, Address::uxtw(3)), r0, noreg, noreg);
 }
 
 void TemplateTable::fastore() {
@@ -1083,7 +1083,7 @@
   // r3:  array
   index_check(r3, r1); // prefer index in r1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_FLOAT) >> 2);
-  __ access_store_at(T_FLOAT, IN_HEAP | IN_HEAP_ARRAY, Address(r3, r1, Address::uxtw(2)), noreg /* ftos */, noreg, noreg);
+  __ access_store_at(T_FLOAT, IN_HEAP | IS_ARRAY, Address(r3, r1, Address::uxtw(2)), noreg /* ftos */, noreg, noreg);
 }
 
 void TemplateTable::dastore() {
@@ -1095,7 +1095,7 @@
   // r3:  array
   index_check(r3, r1); // prefer index in r1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_DOUBLE) >> 3);
-  __ access_store_at(T_DOUBLE, IN_HEAP | IN_HEAP_ARRAY, Address(r3, r1, Address::uxtw(3)), noreg /* dtos */, noreg, noreg);
+  __ access_store_at(T_DOUBLE, IN_HEAP | IS_ARRAY, Address(r3, r1, Address::uxtw(3)), noreg /* dtos */, noreg, noreg);
 }
 
 void TemplateTable::aastore() {
@@ -1136,7 +1136,7 @@
   // Get the value we will store
   __ ldr(r0, at_tos());
   // Now store using the appropriate barrier
-  do_oop_store(_masm, element_address, r0, IN_HEAP_ARRAY);
+  do_oop_store(_masm, element_address, r0, IS_ARRAY);
   __ b(done);
 
   // Have a NULL in r0, r3=array, r2=index.  Store NULL at ary[idx]
@@ -1144,7 +1144,7 @@
   __ profile_null_seen(r2);
 
   // Store a NULL
-  do_oop_store(_masm, element_address, noreg, IN_HEAP_ARRAY);
+  do_oop_store(_masm, element_address, noreg, IS_ARRAY);
 
   // Pop stack arguments
   __ bind(done);
@@ -1172,7 +1172,7 @@
   __ bind(L_skip);
 
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_BYTE) >> 0);
-  __ access_store_at(T_BYTE, IN_HEAP | IN_HEAP_ARRAY, Address(r3, r1, Address::uxtw(0)), r0, noreg, noreg);
+  __ access_store_at(T_BYTE, IN_HEAP | IS_ARRAY, Address(r3, r1, Address::uxtw(0)), r0, noreg, noreg);
 }
 
 void TemplateTable::castore()
@@ -1185,7 +1185,7 @@
   // r3: array
   index_check(r3, r1); // prefer index in r1
   __ add(r1, r1, arrayOopDesc::base_offset_in_bytes(T_CHAR) >> 1);
-  __ access_store_at(T_CHAR, IN_HEAP | IN_HEAP_ARRAY, Address(r3, r1, Address::uxtw(1)), r0, noreg, noreg);
+  __ access_store_at(T_CHAR, IN_HEAP | IS_ARRAY, Address(r3, r1, Address::uxtw(1)), r0, noreg, noreg);
 }
 
 void TemplateTable::sastore()
@@ -3362,22 +3362,45 @@
   // r2: receiver
   // r3: flags
 
+  // First check for Object case, then private interface method,
+  // then regular interface method.
+
   // Special case of invokeinterface called for virtual method of
-  // java.lang.Object.  See cpCacheOop.cpp for details.
-  // This code isn't produced by javac, but could be produced by
-  // another compliant java compiler.
-  Label notMethod;
-  __ tbz(r3, ConstantPoolCacheEntry::is_forced_virtual_shift, notMethod);
+  // java.lang.Object.  See cpCache.cpp for details.
+  Label notObjectMethod;
+  __ tbz(r3, ConstantPoolCacheEntry::is_forced_virtual_shift, notObjectMethod);
 
   invokevirtual_helper(rmethod, r2, r3);
-  __ bind(notMethod);
+  __ bind(notObjectMethod);
+
+  Label no_such_interface;
+
+  // Check for private method invocation - indicated by vfinal
+  Label notVFinal;
+  __ tbz(r3, ConstantPoolCacheEntry::is_vfinal_shift, notVFinal);
+
+  // Get receiver klass into r3 - also a null check
+  __ null_check(r2, oopDesc::klass_offset_in_bytes());
+  __ load_klass(r3, r2);
+
+  Label subtype;
+  __ check_klass_subtype(r3, r0, r4, subtype);
+  // If we get here the typecheck failed
+  __ b(no_such_interface);
+  __ bind(subtype);
+
+  __ profile_final_call(r0);
+  __ profile_arguments_type(r0, rmethod, r4, true);
+  __ jump_from_interpreted(rmethod, r0);
+
+  __ bind(notVFinal);
 
   // Get receiver klass into r3 - also a null check
   __ restore_locals();
   __ null_check(r2, oopDesc::klass_offset_in_bytes());
   __ load_klass(r3, r2);
 
-  Label no_such_interface, no_such_method;
+  Label no_such_method;
 
   // Preserve method for throw_AbstractMethodErrorVerbose.
   __ mov(r16, rmethod);
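
The rewritten template implements a three-way dispatch for invokeinterface. In rough pseudocode (a sketch of the control flow the assembly emits, not literal HotSpot source):

    if (flags & is_forced_virtual) {          // java.lang.Object method
      invoke_virtual(method, receiver);
    } else if (flags & is_vfinal) {           // private interface method
      if (!receiver_klass->is_subtype_of(interface_klass))
        throw_incompatible_class_change();    // typecheck failed
      invoke_direct(method);                  // profiled as a final call
    } else {                                  // regular interface method
      // itable lookup; raises ICCE/AME on failure as in the code that follows
      invoke(lookup_interface_method(receiver_klass, interface_klass));
    }
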
--- a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -53,7 +53,7 @@
 
 void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators,
                                                             Register addr, Register count, int callee_saved_regs) {
-  bool dest_uninitialized = (decorators & AS_DEST_NOT_INITIALIZED) != 0;
+  bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0;
   if (!dest_uninitialized) {
     assert( addr->encoding() < callee_saved_regs, "addr must be saved");
     assert(count->encoding() < callee_saved_regs, "count must be saved");
--- a/src/hotspot/cpu/arm/gc/shared/cardTableBarrierSetAssembler_arm.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/arm/gc/shared/cardTableBarrierSetAssembler_arm.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -75,9 +75,9 @@
 
 void CardTableBarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
                                              Address obj, Register new_val, Register tmp1, Register tmp2, Register tmp3, bool is_null) {
-  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool is_array = (decorators & IS_ARRAY) != 0;
   bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
-  bool precise = on_array || on_anonymous;
+  bool precise = is_array || on_anonymous;
 
   if (is_null) {
     BarrierSetAssembler::store_at(masm, decorators, type, obj, new_val, tmp1, tmp2, tmp3, true);
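
The renames in this patch (IN_HEAP_ARRAY -> IS_ARRAY, OOP_NOT_NULL -> IS_NOT_NULL, AS_DEST_NOT_INITIALIZED -> IS_DEST_UNINITIALIZED) keep the same bitmask discipline: a DecoratorSet is an OR of property bits that each barrier assembler tests with &. A minimal sketch of the pattern (bit positions are illustrative, not HotSpot's actual assignments):

    #include <cstdint>

    typedef uint64_t DecoratorSet;
    const DecoratorSet IN_HEAP     = 1ULL << 0;  // illustrative values only
    const DecoratorSet IS_ARRAY    = 1ULL << 1;
    const DecoratorSet IS_NOT_NULL = 1ULL << 2;

    void example_store(DecoratorSet decorators) {
      // Array stores need a "precise" card mark, as in the change above.
      bool is_array = (decorators & IS_ARRAY) != 0;
      (void)is_array;
    }

    // Callers compose flags: example_store(IN_HEAP | IS_ARRAY);
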
--- a/src/hotspot/cpu/arm/stubGenerator_arm.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/arm/stubGenerator_arm.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -2945,7 +2945,7 @@
     __ push(LR);
 #endif // AARCH64
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
     if (disjoint) {
       decorators |= ARRAYCOPY_DISJOINT;
     }
@@ -3217,7 +3217,7 @@
     pushed+=1;
 #endif // AARCH64
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_CHECKCAST;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
 
     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
     bs->arraycopy_prologue(_masm, decorators, true, to, count, callee_saved_regs);
--- a/src/hotspot/cpu/arm/templateTable_arm.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/arm/templateTable_arm.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -943,7 +943,7 @@
   const Register Rindex = R0_tos;
 
   index_check(Rarray, Rindex);
-  do_oop_load(_masm, R0_tos, get_array_elem_addr(T_OBJECT, Rarray, Rindex, Rtemp), IN_HEAP_ARRAY);
+  do_oop_load(_masm, R0_tos, get_array_elem_addr(T_OBJECT, Rarray, Rindex, Rtemp), IS_ARRAY);
 }
 
 
@@ -1328,7 +1328,7 @@
   __ add(Raddr_1, Raddr_1, AsmOperand(Rindex_4, lsl, LogBytesPerHeapOop));
 
   // Now store using the appropriate barrier
-  do_oop_store(_masm, Raddr_1, Rvalue_2, Rtemp, R0_tmp, R3_tmp, false, IN_HEAP_ARRAY);
+  do_oop_store(_masm, Raddr_1, Rvalue_2, Rtemp, R0_tmp, R3_tmp, false, IS_ARRAY);
   __ b(done);
 
   __ bind(throw_array_store);
@@ -1344,7 +1344,7 @@
   __ profile_null_seen(R0_tmp);
 
   // Store a NULL
-  do_oop_store(_masm, Address::indexed_oop(Raddr_1, Rindex_4), Rvalue_2, Rtemp, R0_tmp, R3_tmp, true, IN_HEAP_ARRAY);
+  do_oop_store(_masm, Address::indexed_oop(Raddr_1, Rindex_4), Rvalue_2, Rtemp, R0_tmp, R3_tmp, true, IS_ARRAY);
 
   // Pop stack arguments
   __ bind(done);
@@ -4276,25 +4276,41 @@
   const Register Rinterf = R5_tmp;
   const Register Rindex  = R4_tmp;
   const Register Rflags  = R3_tmp;
-  const Register Rklass  = R3_tmp;
+  const Register Rklass  = R2_tmp; // Note! Same register with Rrecv
 
   prepare_invoke(byte_no, Rinterf, Rmethod, Rrecv, Rflags);
 
+  // First check for Object case, then private interface method,
+  // then regular interface method.
+
   // Special case of invokeinterface called for virtual method of
-  // java.lang.Object.  See cpCacheOop.cpp for details.
-  // This code isn't produced by javac, but could be produced by
-  // another compliant java compiler.
-  Label notMethod;
-  __ tbz(Rflags, ConstantPoolCacheEntry::is_forced_virtual_shift, notMethod);
-
+  // java.lang.Object.  See cpCache.cpp for details.
+  Label notObjectMethod;
+  __ tbz(Rflags, ConstantPoolCacheEntry::is_forced_virtual_shift, notObjectMethod);
   invokevirtual_helper(Rmethod, Rrecv, Rflags);
-  __ bind(notMethod);
+  __ bind(notObjectMethod);
 
   // Get receiver klass into Rklass - also a null check
   __ load_klass(Rklass, Rrecv);
 
+  // Check for private method invocation - indicated by vfinal
   Label no_such_interface;
 
+  Label notVFinal;
+  __ tbz(Rflags, ConstantPoolCacheEntry::is_vfinal_shift, notVFinal);
+
+  Label subtype;
+  __ check_klass_subtype(Rklass, Rinterf, R1_tmp, R3_tmp, noreg, subtype);
+  // If we get here the typecheck failed
+  __ b(no_such_interface);
+  __ bind(subtype);
+
+  // do the call
+  __ profile_final_call(R0_tmp);
+  __ jump_from_interpreted(Rmethod);
+
+  __ bind(notVFinal);
+
   // Receiver subtype check against REFC.
   __ lookup_interface_method(// inputs: rec. class, interface
                              Rklass, Rinterf, noreg,
--- a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -44,7 +44,7 @@
 void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators,
                                                             Register from, Register to, Register count,
                                                             Register preserve1, Register preserve2) {
-  bool dest_uninitialized = (decorators & AS_DEST_NOT_INITIALIZED) != 0;
+  bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0;
   // With G1, don't generate the call if we statically know that the target is uninitialized
   if (!dest_uninitialized) {
     int spill_slots = 3;
@@ -107,7 +107,7 @@
 
 void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, DecoratorSet decorators, Register obj, RegisterOrConstant ind_or_offs, Register pre_val,
                                                  Register tmp1, Register tmp2, bool needs_frame) {
-  bool not_null  = (decorators & OOP_NOT_NULL) != 0,
+  bool not_null  = (decorators & IS_NOT_NULL) != 0,
        preloaded = obj == noreg;
   Register nv_save = noreg;
 
@@ -205,7 +205,7 @@
 
 void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators, Register store_addr, Register new_val,
                                                   Register tmp1, Register tmp2, Register tmp3) {
-  bool not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool not_null = (decorators & IS_NOT_NULL) != 0;
 
   Label runtime, filtered;
   assert_different_registers(store_addr, new_val, tmp1, tmp2);
@@ -279,9 +279,9 @@
 void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
                                        Register base, RegisterOrConstant ind_or_offs, Register val,
                                        Register tmp1, Register tmp2, Register tmp3, bool needs_frame) {
-  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool is_array = (decorators & IS_ARRAY) != 0;
   bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
-  bool precise = on_array || on_anonymous;
+  bool precise = is_array || on_anonymous;
   // Load and record the previous value.
   g1_write_barrier_pre(masm, decorators, base, ind_or_offs,
                        tmp1, tmp2, tmp3, needs_frame);
@@ -318,7 +318,7 @@
     // these parameters the pre-barrier does not generate
     // the load of the previous value
     // We only reach here if value is not null.
-    g1_write_barrier_pre(masm, decorators | OOP_NOT_NULL, noreg /* obj */, (intptr_t)0, dst /* pre_val */,
+    g1_write_barrier_pre(masm, decorators | IS_NOT_NULL, noreg /* obj */, (intptr_t)0, dst /* pre_val */,
                          tmp1, tmp2, needs_frame);
   }
   __ bind(done);
--- a/src/hotspot/cpu/ppc/gc/shared/barrierSetAssembler_ppc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/ppc/gc/shared/barrierSetAssembler_ppc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -35,7 +35,7 @@
                                    Register tmp1, Register tmp2, Register tmp3, bool needs_frame) {
   bool in_heap = (decorators & IN_HEAP) != 0;
   bool in_native = (decorators & IN_NATIVE) != 0;
-  bool not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool not_null = (decorators & IS_NOT_NULL) != 0;
   assert(in_heap || in_native, "where?");
   assert_different_registers(base, val, tmp1, tmp2, R0);
 
@@ -68,7 +68,7 @@
                                   Register tmp1, Register tmp2, bool needs_frame, Label *L_handle_null) {
   bool in_heap = (decorators & IN_HEAP) != 0;
   bool in_native = (decorators & IN_NATIVE) != 0;
-  bool not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool not_null = (decorators & IS_NOT_NULL) != 0;
   assert(in_heap || in_native, "where?");
   assert_different_registers(ind_or_offs.register_or_noreg(), dst, R0);
 
--- a/src/hotspot/cpu/ppc/gc/shared/cardTableBarrierSetAssembler_ppc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/ppc/gc/shared/cardTableBarrierSetAssembler_ppc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -93,9 +93,9 @@
 void CardTableBarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
                                                 Register base, RegisterOrConstant ind_or_offs, Register val,
                                                 Register tmp1, Register tmp2, Register tmp3, bool needs_frame) {
-  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool is_array = (decorators & IS_ARRAY) != 0;
   bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
-  bool precise = on_array || on_anonymous;
+  bool precise = is_array || on_anonymous;
 
   BarrierSetAssembler::store_at(masm, decorators, type, base, ind_or_offs, val, tmp1, tmp2, tmp3, needs_frame);
 
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -2046,7 +2046,7 @@
   assert_different_registers(mtype_reg, mh_reg, temp_reg);
   // Compare method type against that of the receiver.
   load_heap_oop(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg,
-                noreg, noreg, false, OOP_NOT_NULL);
+                noreg, noreg, false, IS_NOT_NULL);
   cmpd(CCR0, temp_reg, mtype_reg);
   bne(CCR0, wrong_method_type);
 }
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.inline.hpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.inline.hpp	Mon Jun 25 10:21:50 2018 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -329,7 +329,7 @@
 inline void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                             Register base, RegisterOrConstant ind_or_offs, Register val,
                                             Register tmp1, Register tmp2, Register tmp3, bool needs_frame) {
-  assert((decorators & ~(AS_RAW | IN_HEAP | IN_HEAP_ARRAY | IN_NATIVE | OOP_NOT_NULL |
+  assert((decorators & ~(AS_RAW | IN_HEAP | IN_NATIVE | IS_ARRAY | IS_NOT_NULL |
                          ON_UNKNOWN_OOP_REF)) == 0, "unsupported decorator");
   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
   bool as_raw = (decorators & AS_RAW) != 0;
@@ -348,7 +348,7 @@
 inline void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                            Register base, RegisterOrConstant ind_or_offs, Register dst,
                                            Register tmp1, Register tmp2, bool needs_frame, Label *L_handle_null) {
-  assert((decorators & ~(AS_RAW | IN_HEAP | IN_HEAP_ARRAY | IN_NATIVE | OOP_NOT_NULL |
+  assert((decorators & ~(AS_RAW | IN_HEAP | IN_NATIVE | IS_ARRAY | IS_NOT_NULL |
                          ON_PHANTOM_OOP_REF | ON_WEAK_OOP_REF)) == 0, "unsupported decorator");
   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
   decorators = AccessInternal::decorator_fixup(decorators);
--- a/src/hotspot/cpu/ppc/methodHandles_ppc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/ppc/methodHandles_ppc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -174,13 +174,13 @@
   // Load the invoker, as MH -> MH.form -> LF.vmentry
   __ verify_oop(recv);
   __ load_heap_oop(method_temp, NONZERO(java_lang_invoke_MethodHandle::form_offset_in_bytes()), recv,
-                   temp2, noreg, false, OOP_NOT_NULL);
+                   temp2, noreg, false, IS_NOT_NULL);
   __ verify_oop(method_temp);
   __ load_heap_oop(method_temp, NONZERO(java_lang_invoke_LambdaForm::vmentry_offset_in_bytes()), method_temp,
-                   temp2, noreg, false, OOP_NOT_NULL);
+                   temp2, noreg, false, IS_NOT_NULL);
   __ verify_oop(method_temp);
   __ load_heap_oop(method_temp, NONZERO(java_lang_invoke_MemberName::method_offset_in_bytes()), method_temp,
-                   temp2, noreg, false, OOP_NOT_NULL);
+                   temp2, noreg, false, IS_NOT_NULL);
   __ verify_oop(method_temp);
   __ ld(method_temp, NONZERO(java_lang_invoke_ResolvedMethodName::vmtarget_offset_in_bytes()), method_temp);
 
@@ -342,7 +342,7 @@
         Label L_ok;
         Register temp2_defc = temp2;
         __ load_heap_oop(temp2_defc, NONZERO(java_lang_invoke_MemberName::clazz_offset_in_bytes()), member_reg,
-                         temp3, noreg, false, OOP_NOT_NULL);
+                         temp3, noreg, false, IS_NOT_NULL);
         load_klass_from_Class(_masm, temp2_defc, temp3, temp4);
         __ verify_klass_ptr(temp2_defc);
         __ check_klass_subtype(temp1_recv_klass, temp2_defc, temp3, temp4, L_ok);
@@ -370,7 +370,7 @@
         verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp2);
       }
       __ load_heap_oop(R19_method, NONZERO(java_lang_invoke_MemberName::method_offset_in_bytes()), member_reg,
-                       temp3, noreg, false, OOP_NOT_NULL);
+                       temp3, noreg, false, IS_NOT_NULL);
       __ ld(R19_method, NONZERO(java_lang_invoke_ResolvedMethodName::vmtarget_offset_in_bytes()), R19_method);
       break;
 
@@ -379,7 +379,7 @@
         verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp2);
       }
       __ load_heap_oop(R19_method, NONZERO(java_lang_invoke_MemberName::method_offset_in_bytes()), member_reg,
-                       temp3, noreg, false, OOP_NOT_NULL);
+                       temp3, noreg, false, IS_NOT_NULL);
       __ ld(R19_method, NONZERO(java_lang_invoke_ResolvedMethodName::vmtarget_offset_in_bytes()), R19_method);
       break;
 
@@ -422,7 +422,7 @@
 
       Register temp2_intf = temp2;
       __ load_heap_oop(temp2_intf, NONZERO(java_lang_invoke_MemberName::clazz_offset_in_bytes()), member_reg,
-                       temp3, noreg, false, OOP_NOT_NULL);
+                       temp3, noreg, false, IS_NOT_NULL);
       load_klass_from_Class(_masm, temp2_intf, temp3, temp4);
       __ verify_klass_ptr(temp2_intf);
 
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -2024,9 +2024,9 @@
       STUB_ENTRY(arrayof_oop_disjoint_arraycopy) :
       STUB_ENTRY(oop_disjoint_arraycopy);
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -2063,9 +2063,9 @@
     address start = __ function_entry();
     assert_positive_int(R5_ARG3);
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -2159,9 +2159,9 @@
     }
 #endif
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_CHECKCAST;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
 
     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
--- a/src/hotspot/cpu/ppc/templateTable_ppc_64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/ppc/templateTable_ppc_64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -688,7 +688,7 @@
                  Rtemp2     = R31;
   __ index_check(Rarray, R17_tos /* index */, UseCompressedOops ? 2 : LogBytesPerWord, Rtemp, Rload_addr);
   do_oop_load(_masm, Rload_addr, arrayOopDesc::base_offset_in_bytes(T_OBJECT), R17_tos, Rtemp, Rtemp2,
-              IN_HEAP_ARRAY);
+              IS_ARRAY);
   __ verify_oop(R17_tos);
   //__ dcbt(R17_tos); // prefetch
 }
@@ -1015,14 +1015,14 @@
 
   __ bind(Lis_null);
   do_oop_store(_masm, Rstore_addr, arrayOopDesc::base_offset_in_bytes(T_OBJECT), noreg /* 0 */,
-               Rscratch, Rscratch2, Rscratch3, IN_HEAP_ARRAY);
+               Rscratch, Rscratch2, Rscratch3, IS_ARRAY);
   __ profile_null_seen(Rscratch, Rscratch2);
   __ b(Ldone);
 
   // Store is OK.
   __ bind(Lstore_ok);
   do_oop_store(_masm, Rstore_addr, arrayOopDesc::base_offset_in_bytes(T_OBJECT), R17_tos /* value */,
-               Rscratch, Rscratch2, Rscratch3, IN_HEAP_ARRAY | OOP_NOT_NULL);
+               Rscratch, Rscratch2, Rscratch3, IS_ARRAY | IS_NOT_NULL);
 
   __ bind(Ldone);
   // Adjust sp (pops array, index and value).
@@ -3583,14 +3583,46 @@
 
   prepare_invoke(byte_no, Rinterface_klass, Rret_addr, Rmethod, Rreceiver, Rflags, Rscratch1);
 
-  // Get receiver klass.
+  // First check for Object case, then private interface method,
+  // then regular interface method.
+
+  // Get receiver klass - this is also a null check
   __ null_check_throw(Rreceiver, oopDesc::klass_offset_in_bytes(), Rscratch2);
   __ load_klass(Rrecv_klass, Rreceiver);
 
   // Check corner case object method.
-  Label LobjectMethod, L_no_such_interface, Lthrow_ame;
+  // Special case of invokeinterface called for virtual method of
+  // java.lang.Object. See ConstantPoolCacheEntry::set_method() for details:
+  // The invokeinterface was rewritten to an invokevirtual, hence we have
+  // to handle this corner case.
+
+  Label LnotObjectMethod, Lthrow_ame;
   __ testbitdi(CCR0, R0, Rflags, ConstantPoolCacheEntry::is_forced_virtual_shift);
-  __ btrue(CCR0, LobjectMethod);
+  __ bfalse(CCR0, LnotObjectMethod);
+  invokeinterface_object_method(Rrecv_klass, Rret_addr, Rflags, Rmethod, Rscratch1, Rscratch2);
+  __ bind(LnotObjectMethod);
+
+  // Check for private method invocation - indicated by vfinal
+  Label LnotVFinal, L_no_such_interface, L_subtype;
+
+  __ testbitdi(CCR0, R0, Rflags, ConstantPoolCacheEntry::is_vfinal_shift);
+  __ bfalse(CCR0, LnotVFinal);
+
+  __ check_klass_subtype(Rrecv_klass, Rinterface_klass, Rscratch1, Rscratch2, L_subtype);
+  // If we get here the typecheck failed
+  __ b(L_no_such_interface);
+  __ bind(L_subtype);
+
+  // do the call
+
+  Register Rscratch = Rflags; // Rflags is dead now.
+
+  __ profile_final_call(Rscratch1, Rscratch);
+  __ profile_arguments_type(Rindex, Rscratch, Rrecv_klass /* scratch */, true);
+
+  __ call_from_interpreter(Rindex, Rret_addr, Rscratch, Rrecv_klass /* scratch */);
+
+  __ bind(LnotVFinal);
 
   __ lookup_interface_method(Rrecv_klass, Rinterface_klass, noreg, noreg, Rscratch1, Rscratch2,
                              L_no_such_interface, /*return_method=*/false);
@@ -3631,14 +3663,6 @@
   call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_IncompatibleClassChangeErrorVerbose),
           Rrecv_klass, Rinterface_klass);
   DEBUG_ONLY( __ should_not_reach_here(); )
-
-  // Special case of invokeinterface called for virtual method of
-  // java.lang.Object. See ConstantPoolCacheEntry::set_method() for details:
-  // The invokeinterface was rewritten to a invokevirtual, hence we have
-  // to handle this corner case. This code isn't produced by javac, but could
-  // be produced by another compliant java compiler.
-  __ bind(LobjectMethod);
-  invokeinterface_object_method(Rrecv_klass, Rret_addr, Rflags, Rmethod, Rscratch1, Rscratch2);
 }
 
 void TemplateTable::invokedynamic(int byte_no) {
--- a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -46,7 +46,7 @@
 
 void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators,
                                                             Register addr, Register count) {
-  bool dest_uninitialized = (decorators & AS_DEST_NOT_INITIALIZED) != 0;
+  bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0;
 
   // With G1, don't generate the call if we statically know that the target is uninitialized.
   if (!dest_uninitialized) {
@@ -108,7 +108,7 @@
   if (on_oop && on_reference) {
     // Generate the G1 pre-barrier code to log the value of
     // the referent field in an SATB buffer.
-    g1_write_barrier_pre(masm, decorators | OOP_NOT_NULL,
+    g1_write_barrier_pre(masm, decorators | IS_NOT_NULL,
                          NULL /* obj */,
                          dst  /* pre_val */,
                          noreg/* preserve */ ,
@@ -127,7 +127,7 @@
                                                  bool            pre_val_needed // Save Rpre_val across runtime call, caller uses it.
                                                  ) {
 
-  bool not_null  = (decorators & OOP_NOT_NULL) != 0,
+  bool not_null  = (decorators & IS_NOT_NULL) != 0,
        preloaded = obj == NULL;
 
   const Register Robj = obj ? obj->base() : noreg,
@@ -260,7 +260,7 @@
 
 void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators, Register Rstore_addr, Register Rnew_val,
                                                   Register Rtmp1, Register Rtmp2, Register Rtmp3) {
-  bool not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool not_null = (decorators & IS_NOT_NULL) != 0;
 
   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); // Most probably, Rnew_val == Rtmp3.
 
@@ -372,9 +372,9 @@
 
 void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
                                          const Address& dst, Register val, Register tmp1, Register tmp2, Register tmp3) {
-  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool is_array = (decorators & IS_ARRAY) != 0;
   bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
-  bool precise = on_array || on_anonymous;
+  bool precise = is_array || on_anonymous;
   // Load and record the previous value.
   g1_write_barrier_pre(masm, decorators, &dst, tmp3, val, tmp1, tmp2, false);
 
--- a/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -39,7 +39,7 @@
                                   const Address& addr, Register dst, Register tmp1, Register tmp2, Label *L_handle_null) {
   bool in_heap = (decorators & IN_HEAP) != 0;
   bool in_native = (decorators & IN_NATIVE) != 0;
-  bool not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool not_null = (decorators & IS_NOT_NULL) != 0;
   assert(in_heap || in_native, "where?");
 
   switch (type) {
@@ -69,7 +69,7 @@
                                    const Address& addr, Register val, Register tmp1, Register tmp2, Register tmp3) {
   bool in_heap = (decorators & IN_HEAP) != 0;
   bool in_native = (decorators & IN_NATIVE) != 0;
-  bool not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool not_null = (decorators & IS_NOT_NULL) != 0;
   assert(in_heap || in_native, "where?");
   assert_different_registers(val, tmp1, tmp2);
 
--- a/src/hotspot/cpu/s390/gc/shared/cardTableBarrierSetAssembler_s390.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/s390/gc/shared/cardTableBarrierSetAssembler_s390.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -156,9 +156,9 @@
 
 void CardTableBarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
                                                 const Address& dst, Register val, Register tmp1, Register tmp2, Register tmp3) {
-  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool is_array = (decorators & IS_ARRAY) != 0;
   bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
-  bool precise = on_array || on_anonymous;
+  bool precise = is_array || on_anonymous;
 
   BarrierSetAssembler::store_at(masm, decorators, type, dst, val, tmp1, tmp2, tmp3);
 
--- a/src/hotspot/cpu/s390/macroAssembler_s390.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/s390/macroAssembler_s390.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -4051,7 +4051,7 @@
 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                      const Address& addr, Register val,
                                      Register tmp1, Register tmp2, Register tmp3) {
-  assert((decorators & ~(AS_RAW | IN_HEAP | IN_HEAP_ARRAY | IN_NATIVE | OOP_NOT_NULL |
+  assert((decorators & ~(AS_RAW | IN_HEAP | IN_NATIVE | IS_ARRAY | IS_NOT_NULL |
                          ON_UNKNOWN_OOP_REF)) == 0, "unsupported decorator");
   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
   decorators = AccessInternal::decorator_fixup(decorators);
@@ -4070,7 +4070,7 @@
 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                     const Address& addr, Register dst,
                                     Register tmp1, Register tmp2, Label *is_null) {
-  assert((decorators & ~(AS_RAW | IN_HEAP | IN_HEAP_ARRAY | IN_NATIVE | OOP_NOT_NULL |
+  assert((decorators & ~(AS_RAW | IN_HEAP | IN_NATIVE | IS_ARRAY | IS_NOT_NULL |
                          ON_PHANTOM_OOP_REF | ON_WEAK_OOP_REF)) == 0, "unsupported decorator");
   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
   decorators = AccessInternal::decorator_fixup(decorators);
--- a/src/hotspot/cpu/s390/methodHandles_s390.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/s390/methodHandles_s390.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -198,17 +198,17 @@
   __ load_heap_oop(method_temp,
                    Address(recv,
                            NONZERO(java_lang_invoke_MethodHandle::form_offset_in_bytes())),
-                   noreg, noreg, OOP_NOT_NULL);
+                   noreg, noreg, IS_NOT_NULL);
   __ verify_oop(method_temp);
   __ load_heap_oop(method_temp,
                    Address(method_temp,
                            NONZERO(java_lang_invoke_LambdaForm::vmentry_offset_in_bytes())),
-                   noreg, noreg, OOP_NOT_NULL);
+                   noreg, noreg, IS_NOT_NULL);
   __ verify_oop(method_temp);
   __ load_heap_oop(method_temp,
                    Address(method_temp,
                            NONZERO(java_lang_invoke_MemberName::method_offset_in_bytes())),
-                   noreg, noreg, OOP_NOT_NULL);
+                   noreg, noreg, IS_NOT_NULL);
   __ verify_oop(method_temp);
   __ z_lg(method_temp,
           Address(method_temp,
@@ -409,7 +409,7 @@
       Register temp2_defc = temp2;
 
       __ load_heap_oop(temp2_defc, member_clazz,
-                       noreg, noreg, OOP_NOT_NULL);
+                       noreg, noreg, IS_NOT_NULL);
       load_klass_from_Class(_masm, temp2_defc, temp3, temp4);
       __ verify_klass_ptr(temp2_defc);
       __ check_klass_subtype(temp1_recv_klass, temp2_defc, temp3, temp4, L_ok);
@@ -436,7 +436,7 @@
         verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp3);
       }
       __ load_heap_oop(Z_method, member_vmtarget,
-                       noreg, noreg, OOP_NOT_NULL);
+                       noreg, noreg, IS_NOT_NULL);
       __ z_lg(Z_method, vmtarget_method);
       method_is_live = true;
       break;
@@ -446,7 +446,7 @@
         verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp3);
       }
       __ load_heap_oop(Z_method, member_vmtarget,
-                       noreg, noreg, OOP_NOT_NULL);
+                       noreg, noreg, IS_NOT_NULL);
       __ z_lg(Z_method, vmtarget_method);
       method_is_live = true;
       break;
@@ -488,7 +488,7 @@
       Register temp3_intf = temp3;
 
       __ load_heap_oop(temp3_intf, member_clazz,
-                       noreg, noreg, OOP_NOT_NULL);
+                       noreg, noreg, IS_NOT_NULL);
       load_klass_from_Class(_masm, temp3_intf, temp2, temp4);
 
       Register Z_index = Z_method;
--- a/src/hotspot/cpu/s390/stubGenerator_s390.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/s390/stubGenerator_s390.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -1300,9 +1300,9 @@
     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
     unsigned int size      = UseCompressedOops ? 4 : 8;
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -1392,9 +1392,9 @@
     // Branch to disjoint_copy (if applicable) before pre_barrier to avoid double pre_barrier.
     array_overlap_test(nooverlap_target, shift);  // Branch away to nooverlap_target if disjoint.
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
--- a/src/hotspot/cpu/s390/templateTable_s390.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/s390/templateTable_s390.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -853,7 +853,7 @@
   index_check(Z_tmp_1, index, shift);
   // Now load array element.
   do_oop_load(_masm, Address(Z_tmp_1, index, arrayOopDesc::base_offset_in_bytes(T_OBJECT)), Z_tos,
-              Z_tmp_2, Z_tmp_3, IN_HEAP_ARRAY);
+              Z_tmp_2, Z_tmp_3, IS_ARRAY);
   __ verify_oop(Z_tos);
 }
 
@@ -1197,7 +1197,7 @@
 
   // Store a NULL.
   do_oop_store(_masm, Address(Rstore_addr, (intptr_t)0), noreg,
-               tmp3, tmp2, tmp1, IN_HEAP_ARRAY);
+               tmp3, tmp2, tmp1, IS_ARRAY);
   __ z_bru(done);
 
   // Come here on success.
@@ -1205,7 +1205,7 @@
 
   // Now store using the appropriate barrier.
   do_oop_store(_masm, Address(Rstore_addr, (intptr_t)0), Rvalue,
-               tmp3, tmp2, tmp1, IN_HEAP_ARRAY | OOP_NOT_NULL);
+               tmp3, tmp2, tmp1, IS_ARRAY | IS_NOT_NULL);
 
   // Pop stack arguments.
   __ bind(done);
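
The two do_oop_store calls above split aastore into a null path and a checked path: a null store needs no subtype check (and carries no IS_NOT_NULL hint), while the non-null store is only reached after the element-type check succeeds. A rough stand-alone model of that control flow; is_subtype_of and the exception path are stand-ins, not VM code, and the GC barrier around the store is elided:

    bool is_subtype_of(const void* value_klass, const void* element_klass);  // assumed

    void aastore(void** array, int index, void* value,
                 const void* value_klass, const void* element_klass) {
      if (value == nullptr) {
        array[index] = nullptr;            // no type check needed for null
        return;
      }
      if (!is_subtype_of(value_klass, element_klass)) {
        throw "ArrayStoreException";       // stand-in for the VM exception path
      }
      array[index] = value;                // store with IS_ARRAY | IS_NOT_NULL
    }
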
@@ -3610,20 +3610,43 @@
 
   BLOCK_COMMENT("invokeinterface {");
 
-  prepare_invoke(byte_no, interface, method,  // Get f1 klassOop, f2 itable index.
+  prepare_invoke(byte_no, interface, method,  // Get f1 klassOop, f2 Method*.
                  receiver, flags);
 
   // Z_R14 (== Z_bytecode) : return entry
 
+  // First check for Object case, then private interface method,
+  // then regular interface method.
+
   // Special case of invokeinterface called for virtual method of
-  // java.lang.Object. See cpCacheOop.cpp for details.
-  // This code isn't produced by javac, but could be produced by
-  // another compliant java compiler.
-  NearLabel notMethod, no_such_interface, no_such_method;
+  // java.lang.Object. See cpCache.cpp for details.
+  NearLabel notObjectMethod, no_such_method;
   __ testbit(flags, ConstantPoolCacheEntry::is_forced_virtual_shift);
-  __ z_brz(notMethod);
+  __ z_brz(notObjectMethod);
   invokevirtual_helper(method, receiver, flags);
-  __ bind(notMethod);
+  __ bind(notObjectMethod);
+
+  // Check for private method invocation - indicated by vfinal
+  NearLabel notVFinal;
+  __ testbit(flags, ConstantPoolCacheEntry::is_vfinal_shift);
+  __ z_brz(notVFinal);
+
+  // Get receiver klass into klass - also a null check.
+  __ load_klass(klass, receiver);
+
+  NearLabel subtype, no_such_interface;
+
+  __ check_klass_subtype(klass, interface, Z_tmp_2, Z_tmp_3, subtype);
+  // If we get here, the typecheck failed.
+  __ z_bru(no_such_interface);
+  __ bind(subtype);
+
+  // do the call
+  __ profile_final_call(Z_tmp_2);
+  __ profile_arguments_type(Z_tmp_2, method, Z_ARG5, true);
+  __ jump_from_interpreted(method, Z_tmp_2);
+
+  __ bind(notVFinal);
 
   // Get receiver klass into klass - also a null check.
   __ restore_locals();
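
The rewritten invokeinterface above inserts a vfinal (private interface method) case between the java.lang.Object fast path and the regular itable dispatch: a private method resolves to a single Method*, so after the receiver subtype check it can be called directly. A self-contained toy of the new dispatch order, with illustrative flag bits:

    #include <cstdio>

    enum { FORCED_VIRTUAL = 1 << 0, VFINAL = 1 << 1 };

    void invokeinterface_dispatch(int flags) {
      if (flags & FORCED_VIRTUAL) {
        std::puts("virtual dispatch: java.lang.Object method");
      } else if (flags & VFINAL) {
        std::puts("private interface method: subtype check, then direct call");
      } else {
        std::puts("regular path: REFC subtype check, then itable lookup");
      }
    }

    int main() { invokeinterface_dispatch(VFINAL); }
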
--- a/src/hotspot/cpu/sparc/gc/g1/g1BarrierSetAssembler_sparc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/sparc/gc/g1/g1BarrierSetAssembler_sparc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -43,7 +43,7 @@
 
 void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators,
                                                             Register addr, Register count) {
-  bool dest_uninitialized = (decorators & AS_DEST_NOT_INITIALIZED) != 0;
+  bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0;
  // With G1, don't generate the call if we statically know that the target is uninitialized
   if (!dest_uninitialized) {
     Register tmp = O5;
@@ -406,9 +406,9 @@
   // No need for post barrier if storing NULL
   bool needs_post_barrier = val != G0 && in_heap;
 
-  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool is_array = (decorators & IS_ARRAY) != 0;
   bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
-  bool precise = on_array || on_anonymous;
+  bool precise = is_array || on_anonymous;
 
   Register index = dst.has_index() ? dst.index() : noreg;
   int disp = dst.has_disp() ? dst.disp() : 0;
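
Two static decisions are being renamed here, not changed: the G1 SATB pre-barrier can be skipped when the destination is uninitialized (there are no old values to snapshot), and array or unknown-ref stores need a precise card mark at the element address rather than at the object start. A self-contained restatement, with illustrative bit values:

    typedef unsigned long long DecoratorSet;
    const DecoratorSet IS_DEST_UNINITIALIZED = 1 << 0;  // was AS_DEST_NOT_INITIALIZED
    const DecoratorSet IS_ARRAY              = 1 << 1;  // was IN_HEAP_ARRAY
    const DecoratorSet ON_UNKNOWN_OOP_REF    = 1 << 2;

    bool needs_pre_barrier(DecoratorSet d) {
      // Nothing to snapshot if the destination holds no prior oops.
      return (d & IS_DEST_UNINITIALIZED) == 0;
    }

    bool precise_card_mark(DecoratorSet d) {
      // Element stores must dirty the card of the slot actually written.
      return (d & (IS_ARRAY | ON_UNKNOWN_OOP_REF)) != 0;
    }
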
--- a/src/hotspot/cpu/sparc/gc/shared/barrierSetAssembler_sparc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/sparc/gc/shared/barrierSetAssembler_sparc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -34,7 +34,7 @@
                                    Register val, Address dst, Register tmp) {
   bool in_heap = (decorators & IN_HEAP) != 0;
   bool in_native = (decorators & IN_NATIVE) != 0;
-  bool oop_not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool is_not_null = (decorators & IS_NOT_NULL) != 0;
 
   switch (type) {
   case T_ARRAY:
@@ -47,7 +47,7 @@
       }
       if (UseCompressedOops) {
         assert(dst.base() != val, "not enough registers");
-        if (oop_not_null) {
+        if (is_not_null) {
           __ encode_heap_oop_not_null(val);
         } else {
           __ encode_heap_oop(val);
@@ -70,7 +70,7 @@
                                   Address src, Register dst, Register tmp) {
   bool in_heap = (decorators & IN_HEAP) != 0;
   bool in_native = (decorators & IN_NATIVE) != 0;
-  bool oop_not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool is_not_null = (decorators & IS_NOT_NULL) != 0;
 
   switch (type) {
   case T_ARRAY:
@@ -83,7 +83,7 @@
       }
       if (UseCompressedOops) {
         __ lduw(src, dst);
-        if (oop_not_null) {
+        if (is_not_null) {
           __ decode_heap_oop_not_null(dst);
         } else {
           __ decode_heap_oop(dst);
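
The is_not_null flag feeds the choice between encode/decode_heap_oop and their _not_null variants: compressed oops must map null to zero, so the generic forms carry a null test that the _not_null forms drop. A standalone sketch of the arithmetic; the base and shift values are assumptions, not what the VM actually chooses:

    #include <cstdint>

    const uintptr_t heap_base = 0x800000000ULL;  // assumed
    const int       oop_shift = 3;               // assumed 8-byte alignment

    uint32_t encode_heap_oop(uintptr_t oop) {
      if (oop == 0) return 0;                    // null must stay null
      return (uint32_t)((oop - heap_base) >> oop_shift);
    }

    uint32_t encode_heap_oop_not_null(uintptr_t oop) {
      return (uint32_t)((oop - heap_base) >> oop_shift);  // null test elided
    }
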
--- a/src/hotspot/cpu/sparc/gc/shared/cardTableBarrierSetAssembler_sparc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/sparc/gc/shared/cardTableBarrierSetAssembler_sparc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -1,4 +1,3 @@
-
 /*
  * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -90,9 +89,9 @@
                                                 Register val, Address dst, Register tmp) {
   bool in_heap = (decorators & IN_HEAP) != 0;
 
-  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool is_array = (decorators & IS_ARRAY) != 0;
   bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
-  bool precise = on_array || on_anonymous;
+  bool precise = is_array || on_anonymous;
 
   // No need for post barrier if storing NULL
   bool needs_post_barrier = val != G0 && in_heap;
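
For context, the post barrier whose precision is being decided above is a card mark: the store address is scaled down to a card index and that byte is set to dirty. A toy version; the 512-byte card size matches HotSpot's default, and the table pointer is a stand-in:

    #include <cstdint>

    const int card_shift = 9;            // 2^9 = 512-byte cards
    extern uint8_t* card_table_base;     // assumed biased base, as in the VM

    void post_barrier(uintptr_t store_addr) {
      card_table_base[store_addr >> card_shift] = 0;  // 0 == dirty card
    }
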
--- a/src/hotspot/cpu/sparc/stubGenerator_sparc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/sparc/stubGenerator_sparc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -2269,9 +2269,9 @@
       BLOCK_COMMENT("Entry:");
     }
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -2326,9 +2326,9 @@
 
     array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -2446,9 +2446,9 @@
       BLOCK_COMMENT("Entry:");
     }
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_CHECKCAST;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
 
     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
--- a/src/hotspot/cpu/sparc/templateTable_sparc.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/sparc/templateTable_sparc.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -697,7 +697,7 @@
               arrayOopDesc::base_offset_in_bytes(T_OBJECT),
               Otos_i,
               G3_scratch,
-              IN_HEAP_ARRAY);
+              IS_ARRAY);
   __ verify_oop(Otos_i);
 }
 
@@ -997,13 +997,13 @@
 
   // Store is OK.
   __ bind(store_ok);
-  do_oop_store(_masm, O1, noreg, arrayOopDesc::base_offset_in_bytes(T_OBJECT), Otos_i, G3_scratch, IN_HEAP_ARRAY);
+  do_oop_store(_masm, O1, noreg, arrayOopDesc::base_offset_in_bytes(T_OBJECT), Otos_i, G3_scratch, IS_ARRAY);
 
   __ ba(done);
   __ delayed()->inc(Lesp, 3* Interpreter::stackElementSize); // adj sp (pops array, index and value)
 
   __ bind(is_null);
-  do_oop_store(_masm, O1, noreg, arrayOopDesc::base_offset_in_bytes(T_OBJECT), G0, G4_scratch, IN_HEAP_ARRAY);
+  do_oop_store(_masm, O1, noreg, arrayOopDesc::base_offset_in_bytes(T_OBJECT), G0, G4_scratch, IS_ARRAY);
 
   __ profile_null_seen(G3_scratch);
   __ inc(Lesp, 3* Interpreter::stackElementSize);     // adj sp (pops array, index and value)
@@ -3202,28 +3202,56 @@
 
   prepare_invoke(byte_no, Rinterface, Rret, Rmethod, O0_recv, O1_flags);
 
-  // get receiver klass
+  // First check for Object case, then private interface method,
+  // then regular interface method.
+
+  // get receiver klass - this is also a null check
   __ null_check(O0_recv, oopDesc::klass_offset_in_bytes());
   __ load_klass(O0_recv, O2_Klass);
 
   // Special case of invokeinterface called for virtual method of
-  // java.lang.Object.  See cpCacheOop.cpp for details.
-  // This code isn't produced by javac, but could be produced by
-  // another compliant java compiler.
-  Label notMethod;
+  // java.lang.Object.  See cpCache.cpp for details.
+  Label notObjectMethod;
   __ set((1 << ConstantPoolCacheEntry::is_forced_virtual_shift), Rscratch);
   __ btst(O1_flags, Rscratch);
-  __ br(Assembler::zero, false, Assembler::pt, notMethod);
+  __ br(Assembler::zero, false, Assembler::pt, notObjectMethod);
   __ delayed()->nop();
 
   invokeinterface_object_method(O2_Klass, Rinterface, Rret, O1_flags);
 
-  __ bind(notMethod);
+  __ bind(notObjectMethod);
+
+  Label L_no_such_interface;
+
+  // Check for private method invocation - indicated by vfinal
+  Label notVFinal;
+  {
+    __ set((1 << ConstantPoolCacheEntry::is_vfinal_shift), Rscratch);
+    __ btst(O1_flags, Rscratch);
+    __ br(Assembler::zero, false, Assembler::pt, notVFinal);
+    __ delayed()->nop();
+
+    Label subtype;
+    Register Rtemp = O1_flags;
+    __ check_klass_subtype(O2_Klass, Rinterface, Rscratch, Rtemp, subtype);
+    // If we get here, the typecheck failed.
+    __ ba(L_no_such_interface);
+    __ delayed()->nop();
+    __ bind(subtype);
+
+    // do the call
+    Register Rcall = Rinterface;
+    __ mov(Rmethod, G5_method);
+    assert_different_registers(Rcall, G5_method, Gargs, Rret);
+
+    __ profile_arguments_type(G5_method, Rcall, Gargs, true);
+    __ profile_final_call(Rscratch);
+    __ call_from_interpreter(Rcall, Gargs, Rret);
+  }
+  __ bind(notVFinal);
 
   Register Rtemp = O1_flags;
 
-  Label L_no_such_interface;
-
   // Receiver subtype check against REFC.
   __ lookup_interface_method(// inputs: rec. class, interface, itable index
                              O2_Klass, Rinterface, noreg,
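
lookup_interface_method, used above both for the REFC receiver check and for the actual dispatch, scans the receiver's itable for the declaring interface and then indexes into that interface's method block. A simplified stand-alone model; the real itable uses offset entries relative to the Klass, not direct pointers:

    struct ItableEntry { const void* interface_klass; void** methods; };

    void* lookup_interface_method(const ItableEntry* itable, int length,
                                  const void* interface, int itable_index) {
      for (int i = 0; i < length; i++) {
        if (itable[i].interface_klass == interface) {
          return itable[i].methods[itable_index];
        }
      }
      return nullptr;  // caller raises IncompatibleClassChangeError
    }
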
--- a/src/hotspot/cpu/x86/assembler_x86.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -1303,6 +1303,15 @@
   emit_int8(0xC0 | encode);
 }
 
+void Assembler::vaesdec(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_vaes(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0xDE);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::aesdeclast(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
@@ -1320,6 +1330,15 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::vaesdeclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_vaes(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0xDF);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::aesenc(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
@@ -4391,6 +4410,15 @@
   emit_int8(imm8);
 }
 
+void Assembler::evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x3);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(imm8);
+}
+
 void Assembler::pblendw(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -6708,7 +6736,29 @@
   emit_int8(0x59);
   emit_operand(dst, src);
 }
-
+void Assembler::evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(vector_len != Assembler::AVX_128bit, "");
+  assert(VM_Version::supports_avx512dq(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x5A);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evbroadcasti64x2(XMMRegister dst, Address src, int vector_len) {
+  assert(vector_len != Assembler::AVX_128bit, "");
+  assert(VM_Version::supports_avx512dq(), "");
+  assert(dst != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T2, /* input_size_in_bits */ EVEX_64bit);
+  // swap src<->dst for encoding
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x5A);
+  emit_operand(dst, src);
+}
 
 // scalar single/double precision replicate
 
--- a/src/hotspot/cpu/x86/assembler_x86.hpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp	Mon Jun 25 10:21:50 2018 -0700
@@ -926,7 +926,8 @@
   void aesenc(XMMRegister dst, XMMRegister src);
   void aesenclast(XMMRegister dst, Address src);
   void aesenclast(XMMRegister dst, XMMRegister src);
-
+  void vaesdec(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vaesdeclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
   void andl(Address  dst, int32_t imm32);
   void andl(Register dst, int32_t imm32);
@@ -1739,6 +1740,7 @@
 
   void palignr(XMMRegister dst, XMMRegister src, int imm8);
   void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
+  void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
 
   void pblendw(XMMRegister dst, XMMRegister src, int imm8);
 
@@ -2102,6 +2104,9 @@
   void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
 
+  void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
+  void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
+
   // scalar single/double precision replicate
   void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
   void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
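
evalignq is the EVEX VALIGNQ: it concatenates two 512-bit sources as sixteen 64-bit lanes and keeps eight lanes starting at the immediate. In the CBC stub below it is used with imm 6, so the result is the previous vector's last 128-bit block followed by the first three blocks of the next one. A scalar model of the lane selection:

    #include <cstdint>

    // dst[i] = {hi:lo}[i + shift], lanes counted from the low end.
    void valignq512(uint64_t dst[8], const uint64_t hi[8],
                    const uint64_t lo[8], int shift) {
      uint64_t cat[16];
      for (int i = 0; i < 8; i++) { cat[i] = lo[i]; cat[i + 8] = hi[i]; }
      for (int i = 0; i < 8; i++) { dst[i] = cat[i + shift]; }
    }
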
--- a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -43,7 +43,7 @@
 
 void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators,
                                                             Register addr, Register count) {
-  bool dest_uninitialized = (decorators & AS_DEST_NOT_INITIALIZED) != 0;
+  bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0;
 
   if (!dest_uninitialized) {
     Register thread = NOT_LP64(rax) LP64_ONLY(r15_thread);
--- a/src/hotspot/cpu/x86/gc/shared/barrierSetAssembler_x86.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/x86/gc/shared/barrierSetAssembler_x86.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -35,7 +35,7 @@
                                   Register dst, Address src, Register tmp1, Register tmp_thread) {
   bool in_heap = (decorators & IN_HEAP) != 0;
   bool in_native = (decorators & IN_NATIVE) != 0;
-  bool oop_not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool is_not_null = (decorators & IS_NOT_NULL) != 0;
   bool atomic = (decorators & MO_RELAXED) != 0;
 
   switch (type) {
@@ -45,7 +45,7 @@
 #ifdef _LP64
       if (UseCompressedOops) {
         __ movl(dst, src);
-        if (oop_not_null) {
+        if (is_not_null) {
           __ decode_heap_oop_not_null(dst);
         } else {
           __ decode_heap_oop(dst);
@@ -100,7 +100,7 @@
                                    Address dst, Register val, Register tmp1, Register tmp2) {
   bool in_heap = (decorators & IN_HEAP) != 0;
   bool in_native = (decorators & IN_NATIVE) != 0;
-  bool oop_not_null = (decorators & OOP_NOT_NULL) != 0;
+  bool is_not_null = (decorators & IS_NOT_NULL) != 0;
   bool atomic = (decorators & MO_RELAXED) != 0;
 
   switch (type) {
@@ -108,7 +108,7 @@
   case T_ARRAY: {
     if (in_heap) {
       if (val == noreg) {
-        assert(!oop_not_null, "inconsistent access");
+        assert(!is_not_null, "inconsistent access");
 #ifdef _LP64
         if (UseCompressedOops) {
           __ movl(dst, (int32_t)NULL_WORD);
@@ -122,7 +122,7 @@
 #ifdef _LP64
         if (UseCompressedOops) {
           assert(!dst.uses(val), "not enough registers");
-          if (oop_not_null) {
+          if (is_not_null) {
             __ encode_heap_oop_not_null(val);
           } else {
             __ encode_heap_oop(val);
--- a/src/hotspot/cpu/x86/gc/shared/cardTableBarrierSetAssembler_x86.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/x86/gc/shared/cardTableBarrierSetAssembler_x86.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -135,9 +135,9 @@
                                                 Address dst, Register val, Register tmp1, Register tmp2) {
   bool in_heap = (decorators & IN_HEAP) != 0;
 
-  bool on_array = (decorators & IN_HEAP_ARRAY) != 0;
+  bool is_array = (decorators & IS_ARRAY) != 0;
   bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0;
-  bool precise = on_array || on_anonymous;
+  bool precise = is_array || on_anonymous;
 
   bool needs_post_barrier = val != noreg && in_heap;
 
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -6287,7 +6287,7 @@
 // Doesn't do verification, generates fixed-size code
 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                             Register thread_tmp, DecoratorSet decorators) {
-  access_load_at(T_OBJECT, IN_HEAP | OOP_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
+  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
 }
 
 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
--- a/src/hotspot/cpu/x86/stubGenerator_x86_32.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_32.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -837,9 +837,9 @@
       __ jcc(Assembler::zero, L_0_count);
     }
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -1026,9 +1026,9 @@
       __ jcc(Assembler::zero, L_0_count);
     }
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -1383,9 +1383,9 @@
     Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
     Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_CHECKCAST;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
 
     BasicType type = T_OBJECT;
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -1832,9 +1832,9 @@
     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                       // r9 and r10 may be used to save non-volatile registers
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -1926,9 +1926,9 @@
     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                       // r9 and r10 may be used to save non-volatile registers
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -2030,9 +2030,9 @@
                       // r9 and r10 may be used to save non-volatile registers
     // 'from', 'to' and 'qword_count' are now valid
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -2123,9 +2123,9 @@
                       // r9 and r10 may be used to save non-volatile registers
     // 'from', 'to' and 'qword_count' are now valid
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
     if (aligned) {
       decorators |= ARRAYCOPY_ALIGNED;
@@ -2306,9 +2306,9 @@
     Address from_element_addr(end_from, count, TIMES_OOP, 0);
     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
 
-    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_CHECKCAST;
+    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
     if (dest_uninitialized) {
-      decorators |= AS_DEST_NOT_INITIALIZED;
+      decorators |= IS_DEST_UNINITIALIZED;
     }
 
     BasicType type = T_OBJECT;
@@ -4084,6 +4084,311 @@
     return start;
   }
 
+  void roundDec(XMMRegister xmm_reg) {
+    __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
+  }
+
+  void roundDeclast(XMMRegister xmm_reg) {
+    __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
+    __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
+  }
+
+  void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
+    __ movdqu(xmmdst, Address(key, offset));
+    if (xmm_shuf_mask != NULL) {
+      __ pshufb(xmmdst, xmm_shuf_mask);
+    } else {
+      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    }
+    __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
+  }
+
+  address generate_cipherBlockChaining_decryptVectorAESCrypt() {
+    assert(VM_Version::supports_vaes(), "need vector AES support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+    address start = __ pc();
+
+    const Register from = c_rarg0;  // source array address
+    const Register to = c_rarg1;  // destination array address
+    const Register key = c_rarg2;  // key array address
+    const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
+    // and left with the last ciphertext block processed
+#ifndef _WIN64
+    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
+#else
+    const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
+    const Register len_reg = r11;      // pick the volatile windows register
+#endif
+
+    Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
+          Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
+
+    __ enter();
+
+#ifdef _WIN64
+    // On Win64, fill len_reg from the stack position
+    __ movl(len_reg, len_mem);
+#else
+    __ push(len_reg); // Save
+#endif
+    __ push(rbx);
+    __ vzeroupper();
+
+    // Temporary variable declaration for swapping key bytes
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+
+    // Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
+    const Register rounds = rbx;
+    __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+    const XMMRegister IV = xmm0;
+    // Load IV and broadcast value to 512-bits
+    __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
+
+    // Temporary variables for storing round keys
+    const XMMRegister RK0 = xmm30;
+    const XMMRegister RK1 = xmm9;
+    const XMMRegister RK2 = xmm18;
+    const XMMRegister RK3 = xmm19;
+    const XMMRegister RK4 = xmm20;
+    const XMMRegister RK5 = xmm21;
+    const XMMRegister RK6 = xmm22;
+    const XMMRegister RK7 = xmm23;
+    const XMMRegister RK8 = xmm24;
+    const XMMRegister RK9 = xmm25;
+    const XMMRegister RK10 = xmm26;
+
+    // Load and shuffle key
+    // The Java expanded key ordering is rotated one position from what we want,
+    // so we start from 1*16 here and hit 0*16 last.
+    ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
+    ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
+
+    // Variables for storing source cipher text
+    const XMMRegister S0 = xmm10;
+    const XMMRegister S1 = xmm11;
+    const XMMRegister S2 = xmm12;
+    const XMMRegister S3 = xmm13;
+    const XMMRegister S4 = xmm14;
+    const XMMRegister S5 = xmm15;
+    const XMMRegister S6 = xmm16;
+    const XMMRegister S7 = xmm17;
+
+    // Variables for storing decrypted text
+    const XMMRegister B0 = xmm1;
+    const XMMRegister B1 = xmm2;
+    const XMMRegister B2 = xmm3;
+    const XMMRegister B3 = xmm4;
+    const XMMRegister B4 = xmm5;
+    const XMMRegister B5 = xmm6;
+    const XMMRegister B6 = xmm7;
+    const XMMRegister B7 = xmm8;
+
+    __ cmpl(rounds, 44);
+    __ jcc(Assembler::greater, KEY_192);
+    __ jmp(Loop);
+
+    __ BIND(KEY_192);
+    const XMMRegister RK11 = xmm27;
+    const XMMRegister RK12 = xmm28;
+    ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
+    ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
+
+    __ cmpl(rounds, 52);
+    __ jcc(Assembler::greater, KEY_256);
+    __ jmp(Loop);
+
+    __ BIND(KEY_256);
+    const XMMRegister RK13 = xmm29;
+    const XMMRegister RK14 = xmm31;
+    ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
+    ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
+
+    __ BIND(Loop);
+    __ cmpl(len_reg, 512);
+    __ jcc(Assembler::below, Lcbc_dec_rem);
+    __ BIND(Loop1);
+    __ subl(len_reg, 512);
+    __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
+    __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
+    __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
+    __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
+    __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
+    __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
+    __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
+    __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
+    __ leaq(from, Address(from, 8 * 64));
+
+    __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
+    __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
+    __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
+    __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
+    __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
+    __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
+    __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
+    __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
+
+    __ evalignq(IV, S0, IV, 0x06);
+    __ evalignq(S0, S1, S0, 0x06);
+    __ evalignq(S1, S2, S1, 0x06);
+    __ evalignq(S2, S3, S2, 0x06);
+    __ evalignq(S3, S4, S3, 0x06);
+    __ evalignq(S4, S5, S4, 0x06);
+    __ evalignq(S5, S6, S5, 0x06);
+    __ evalignq(S6, S7, S6, 0x06);
+
+    roundDec(RK2);
+    roundDec(RK3);
+    roundDec(RK4);
+    roundDec(RK5);
+    roundDec(RK6);
+    roundDec(RK7);
+    roundDec(RK8);
+    roundDec(RK9);
+    roundDec(RK10);
+
+    __ cmpl(rounds, 44);
+    __ jcc(Assembler::belowEqual, L_128);
+    roundDec(RK11);
+    roundDec(RK12);
+
+    __ cmpl(rounds, 52);
+    __ jcc(Assembler::belowEqual, L_192);
+    roundDec(RK13);
+    roundDec(RK14);
+
+    __ BIND(L_256);
+    roundDeclast(RK0);
+    __ jmp(Loop2);
+
+    __ BIND(L_128);
+    roundDeclast(RK0);
+    __ jmp(Loop2);
+
+    __ BIND(L_192);
+    roundDeclast(RK0);
+
+    __ BIND(Loop2);
+    __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
+    __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
+    __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
+    __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
+    __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
+    __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
+    __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
+    __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
+    __ evmovdquq(IV, S7, Assembler::AVX_512bit);
+
+    __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
+    __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
+    __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
+    __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
+    __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
+    __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
+    __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
+    __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
+    __ leaq(to, Address(to, 8 * 64));
+    __ jmp(Loop);
+
+    __ BIND(Lcbc_dec_rem);
+    __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
+
+    __ BIND(Lcbc_dec_rem_loop);
+    __ subl(len_reg, 16);
+    __ jcc(Assembler::carrySet, Lcbc_dec_ret);
+
+    __ movdqu(S0, Address(from, 0));
+    __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
+    __ cmpl(rounds, 44);
+    __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
+
+    __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
+    __ cmpl(rounds, 52);
+    __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
+
+    __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
+    __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
+
+    __ BIND(Lcbc_dec_rem_last);
+    __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
+
+    __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
+    __ evmovdquq(IV, S0, Assembler::AVX_512bit);
+    __ movdqu(Address(to, 0), B0);
+    __ leaq(from, Address(from, 16));
+    __ leaq(to, Address(to, 16));
+    __ jmp(Lcbc_dec_rem_loop);
+
+    __ BIND(Lcbc_dec_ret);
+    __ movdqu(Address(rvec, 0), IV);
+
+    // Zero out the round keys
+    __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
+    __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
+    __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
+    __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
+    __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
+    __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
+    __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
+    __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
+    __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
+    __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
+    __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
+    __ cmpl(rounds, 44);
+    __ jcc(Assembler::belowEqual, Lcbc_exit);
+    __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
+    __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
+    __ cmpl(rounds, 52);
+    __ jcc(Assembler::belowEqual, Lcbc_exit);
+    __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
+    __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
+
+    __ BIND(Lcbc_exit);
+    __ pop(rbx);
+#ifdef _WIN64
+    __ movl(rax, len_mem);
+#else
+    __ pop(rax); // return length
+#endif
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
+
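
The stub above exploits the fact that CBC decryption, unlike encryption, is data-parallel: P[i] = D(C[i]) xor C[i-1] with C[-1] = IV, so every D(C[i]) is independent and eight 512-bit registers can each decrypt four blocks per iteration, with evalignq supplying the shifted ciphertext stream for the final XOR. A scalar reference model of the same computation; aes_decrypt_block is a stand-in:

    #include <cstdint>
    #include <cstring>

    void aes_decrypt_block(const uint8_t in[16], uint8_t out[16],
                           const uint8_t* round_keys, int rounds);  // assumed

    void cbc_decrypt(const uint8_t* in, uint8_t* out, size_t nblocks,
                     const uint8_t* round_keys, int rounds, uint8_t iv[16]) {
      uint8_t prev[16], tmp[16];
      std::memcpy(prev, iv, 16);
      for (size_t i = 0; i < nblocks; i++) {
        aes_decrypt_block(in + 16 * i, tmp, round_keys, rounds);
        for (int b = 0; b < 16; b++) {
          out[16 * i + b] = tmp[b] ^ prev[b];   // XOR with prior ciphertext
        }
        std::memcpy(prev, in + 16 * i, 16);     // chain value is the ciphertext
      }
      std::memcpy(iv, prev, 16);                // written back, as rvec is above
    }
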
   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
     __ align(CodeEntryAlignment);
@@ -5078,7 +5384,11 @@
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
-      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
+      if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
+        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
+      } else {
+        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
+      }
     }
     if (UseAESCTRIntrinsics){
       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
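
The vector stub is installed only when VAES, AVX512VL and AVX512DQ are all present: the generator's asserts require VAES for the 512-bit vaesdec/vaesdeclast forms and AVX512DQ for evbroadcasti64x2, and the round keys live in xmm16-xmm31, whose 128-bit forms (movdqu/pshufb in ev_load_key) presumably motivate the AVX512VL requirement. A standalone restatement of the gate; the bools stand in for the VM_Version::supports_*() queries:

    bool can_use_vector_cbc_decrypt(bool vaes, bool avx512vl, bool avx512dq) {
      return vaes       // 512-bit vaesdec / vaesdeclast
          && avx512vl   // sub-512-bit EVEX ops on xmm16-xmm31
          && avx512dq;  // required by evbroadcasti64x2
    }
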
--- a/src/hotspot/cpu/x86/templateTable_x86.cpp	Mon Jun 25 14:32:46 2018 +0530
+++ b/src/hotspot/cpu/x86/templateTable_x86.cpp	Mon Jun 25 10:21:50 2018 -0700
@@ -770,7 +770,7 @@
   // rax: index
   // rdx: array
   index_check(rdx, rax); // kills rbx
-  __ access_load_at(T_INT, IN_HEAP | IN_HEAP_ARRAY, rax,
+  __ access_load_at(T_INT, IN_HEAP | IS_ARRAY, rax,
                     Address(rdx, rax, Address::times_4,
                             arrayOopDesc::base_offset_in_bytes(T_INT)),
                     noreg, noreg);
@@ -783,7 +783,7 @@
   index_check(rdx, rax); // kills rbx
   NOT_LP64(__ mov(rbx, rax));
   // rbx,: index
-  __ access_load_at(T_LONG, IN_HEAP | IN_HEAP_ARRAY, noreg /* ltos */,
+  __ access_load_at(T_LONG, IN_HEAP | IS_ARRAY, noreg /* ltos */,
                     Address(rdx, rbx, Address::times_8,
                             arrayOopDesc::base_offset_in_bytes(T_LONG)),
                     noreg, noreg);
@@ -796,7 +796,7 @@
   // rax: index
   // rdx: array
   index_check(rdx, rax); // kills rbx